{ "best_global_step": 30783, "best_metric": 0.01108468, "best_model_checkpoint": "/workspace/output/v1-20250506-233651/checkpoint-30783", "epoch": 1.0, "eval_steps": 500, "global_step": 30783, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.248546275541695e-05, "grad_norm": 6.3699445724487305, "learning_rate": 6.493506493506494e-09, "loss": 2.2578258514404297, "memory(GiB)": 10.82, "step": 1, "token_acc": 0.5493562231759657, "train_speed(iter/s)": 0.052366 }, { "epoch": 6.49709255108339e-05, "grad_norm": 6.273163318634033, "learning_rate": 1.2987012987012988e-08, "loss": 1.9255356788635254, "memory(GiB)": 12.02, "step": 2, "token_acc": 0.5746606334841629, "train_speed(iter/s)": 0.099664 }, { "epoch": 9.745638826625085e-05, "grad_norm": 6.607340335845947, "learning_rate": 1.948051948051948e-08, "loss": 1.9188368320465088, "memory(GiB)": 13.92, "step": 3, "token_acc": 0.6428571428571429, "train_speed(iter/s)": 0.14321 }, { "epoch": 0.0001299418510216678, "grad_norm": 5.225632190704346, "learning_rate": 2.5974025974025976e-08, "loss": 2.160536527633667, "memory(GiB)": 13.92, "step": 4, "token_acc": 0.5540540540540541, "train_speed(iter/s)": 0.182947 }, { "epoch": 0.00016242731377708475, "grad_norm": 17.14189910888672, "learning_rate": 3.2467532467532474e-08, "loss": 2.0171685218811035, "memory(GiB)": 15.83, "step": 5, "token_acc": 0.6491228070175439, "train_speed(iter/s)": 0.219871 }, { "epoch": 0.0001949127765325017, "grad_norm": 4.4432830810546875, "learning_rate": 3.896103896103896e-08, "loss": 1.944705605506897, "memory(GiB)": 15.83, "step": 6, "token_acc": 0.6083650190114068, "train_speed(iter/s)": 0.254102 }, { "epoch": 0.00022739823928791866, "grad_norm": 5.868148326873779, "learning_rate": 4.545454545454546e-08, "loss": 1.9300328493118286, "memory(GiB)": 17.76, "step": 7, "token_acc": 0.6433566433566433, "train_speed(iter/s)": 0.28569 }, { "epoch": 0.0002598837020433356, "grad_norm": 6.955442905426025, "learning_rate": 5.194805194805195e-08, "loss": 2.1119701862335205, "memory(GiB)": 17.76, "step": 8, "token_acc": 0.5891472868217055, "train_speed(iter/s)": 0.315381 }, { "epoch": 0.00029236916479875256, "grad_norm": 8.284260749816895, "learning_rate": 5.844155844155845e-08, "loss": 2.0897021293640137, "memory(GiB)": 17.76, "step": 9, "token_acc": 0.5901639344262295, "train_speed(iter/s)": 0.343249 }, { "epoch": 0.0003248546275541695, "grad_norm": 4.6784186363220215, "learning_rate": 6.493506493506495e-08, "loss": 1.9932334423065186, "memory(GiB)": 17.76, "step": 10, "token_acc": 0.5787234042553191, "train_speed(iter/s)": 0.369149 }, { "epoch": 0.00035734009030958646, "grad_norm": 8.989471435546875, "learning_rate": 7.142857142857144e-08, "loss": 2.130850315093994, "memory(GiB)": 17.76, "step": 11, "token_acc": 0.5617021276595745, "train_speed(iter/s)": 0.393568 }, { "epoch": 0.0003898255530650034, "grad_norm": 9.899321556091309, "learning_rate": 7.792207792207792e-08, "loss": 2.018601655960083, "memory(GiB)": 17.76, "step": 12, "token_acc": 0.5672268907563025, "train_speed(iter/s)": 0.416629 }, { "epoch": 0.00042231101582042036, "grad_norm": 6.531896591186523, "learning_rate": 8.441558441558441e-08, "loss": 2.0590715408325195, "memory(GiB)": 17.76, "step": 13, "token_acc": 0.5656108597285068, "train_speed(iter/s)": 0.438343 }, { "epoch": 0.0004547964785758373, "grad_norm": 4.9614577293396, "learning_rate": 9.090909090909091e-08, "loss": 2.098557710647583, "memory(GiB)": 17.76, "step": 14, "token_acc": 0.5441860465116279, "train_speed(iter/s)": 0.458395 }, { "epoch": 0.00048728194133125426, "grad_norm": 7.553228855133057, "learning_rate": 9.74025974025974e-08, "loss": 2.131424903869629, "memory(GiB)": 17.76, "step": 15, "token_acc": 0.5163043478260869, "train_speed(iter/s)": 0.477552 }, { "epoch": 0.0005197674040866712, "grad_norm": 6.596163272857666, "learning_rate": 1.038961038961039e-07, "loss": 2.0502073764801025, "memory(GiB)": 17.76, "step": 16, "token_acc": 0.5280373831775701, "train_speed(iter/s)": 0.495974 }, { "epoch": 0.0005522528668420882, "grad_norm": 6.312471389770508, "learning_rate": 1.103896103896104e-07, "loss": 1.9949541091918945, "memory(GiB)": 17.76, "step": 17, "token_acc": 0.5767634854771784, "train_speed(iter/s)": 0.513404 }, { "epoch": 0.0005847383295975051, "grad_norm": 8.322571754455566, "learning_rate": 1.168831168831169e-07, "loss": 2.0088253021240234, "memory(GiB)": 17.76, "step": 18, "token_acc": 0.5746606334841629, "train_speed(iter/s)": 0.529645 }, { "epoch": 0.0006172237923529221, "grad_norm": 21.194807052612305, "learning_rate": 1.233766233766234e-07, "loss": 2.11826753616333, "memory(GiB)": 17.76, "step": 19, "token_acc": 0.5081967213114754, "train_speed(iter/s)": 0.544511 }, { "epoch": 0.000649709255108339, "grad_norm": 6.468367099761963, "learning_rate": 1.298701298701299e-07, "loss": 1.9703415632247925, "memory(GiB)": 17.76, "step": 20, "token_acc": 0.5943775100401606, "train_speed(iter/s)": 0.559075 }, { "epoch": 0.000682194717863756, "grad_norm": 7.034184455871582, "learning_rate": 1.3636363636363637e-07, "loss": 2.3511176109313965, "memory(GiB)": 17.76, "step": 21, "token_acc": 0.5574468085106383, "train_speed(iter/s)": 0.573274 }, { "epoch": 0.0007146801806191729, "grad_norm": 6.368590354919434, "learning_rate": 1.4285714285714287e-07, "loss": 2.2827646732330322, "memory(GiB)": 17.76, "step": 22, "token_acc": 0.5726141078838174, "train_speed(iter/s)": 0.587095 }, { "epoch": 0.0007471656433745899, "grad_norm": 6.215671539306641, "learning_rate": 1.4935064935064935e-07, "loss": 1.9707576036453247, "memory(GiB)": 17.76, "step": 23, "token_acc": 0.5938697318007663, "train_speed(iter/s)": 0.600234 }, { "epoch": 0.0007796511061300068, "grad_norm": 7.412448406219482, "learning_rate": 1.5584415584415585e-07, "loss": 2.0658528804779053, "memory(GiB)": 17.76, "step": 24, "token_acc": 0.6096654275092936, "train_speed(iter/s)": 0.612825 }, { "epoch": 0.0008121365688854238, "grad_norm": 5.810245037078857, "learning_rate": 1.6233766233766232e-07, "loss": 2.2270357608795166, "memory(GiB)": 17.76, "step": 25, "token_acc": 0.5606060606060606, "train_speed(iter/s)": 0.624644 }, { "epoch": 0.0008446220316408407, "grad_norm": 6.8377766609191895, "learning_rate": 1.6883116883116883e-07, "loss": 2.037055015563965, "memory(GiB)": 17.76, "step": 26, "token_acc": 0.55, "train_speed(iter/s)": 0.63622 }, { "epoch": 0.0008771074943962577, "grad_norm": 5.827251434326172, "learning_rate": 1.7532467532467533e-07, "loss": 2.0435030460357666, "memory(GiB)": 17.76, "step": 27, "token_acc": 0.6153846153846154, "train_speed(iter/s)": 0.646966 }, { "epoch": 0.0009095929571516746, "grad_norm": 13.760417938232422, "learning_rate": 1.8181818181818183e-07, "loss": 2.00734281539917, "memory(GiB)": 17.76, "step": 28, "token_acc": 0.5326633165829145, "train_speed(iter/s)": 0.657181 }, { "epoch": 0.0009420784199070916, "grad_norm": 6.295630931854248, "learning_rate": 1.8831168831168833e-07, "loss": 2.1308681964874268, "memory(GiB)": 17.76, "step": 29, "token_acc": 0.5299539170506913, "train_speed(iter/s)": 0.666622 }, { "epoch": 0.0009745638826625085, "grad_norm": 18.409849166870117, "learning_rate": 1.948051948051948e-07, "loss": 2.0435526371002197, "memory(GiB)": 17.76, "step": 30, "token_acc": 0.5871559633027523, "train_speed(iter/s)": 0.675602 }, { "epoch": 0.0010070493454179255, "grad_norm": 6.147815227508545, "learning_rate": 2.012987012987013e-07, "loss": 2.221672773361206, "memory(GiB)": 17.76, "step": 31, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.684094 }, { "epoch": 0.0010395348081733424, "grad_norm": 11.379590034484863, "learning_rate": 2.077922077922078e-07, "loss": 1.9542949199676514, "memory(GiB)": 17.76, "step": 32, "token_acc": 0.6366906474820144, "train_speed(iter/s)": 0.693059 }, { "epoch": 0.0010720202709287594, "grad_norm": 5.595478057861328, "learning_rate": 2.142857142857143e-07, "loss": 1.9817943572998047, "memory(GiB)": 17.76, "step": 33, "token_acc": 0.5594713656387665, "train_speed(iter/s)": 0.701761 }, { "epoch": 0.0011045057336841763, "grad_norm": 7.813901901245117, "learning_rate": 2.207792207792208e-07, "loss": 1.9649081230163574, "memory(GiB)": 17.76, "step": 34, "token_acc": 0.5365853658536586, "train_speed(iter/s)": 0.710312 }, { "epoch": 0.0011369911964395933, "grad_norm": 5.881162643432617, "learning_rate": 2.2727272727272729e-07, "loss": 2.1655149459838867, "memory(GiB)": 17.76, "step": 35, "token_acc": 0.5840336134453782, "train_speed(iter/s)": 0.718341 }, { "epoch": 0.0011694766591950102, "grad_norm": 8.839664459228516, "learning_rate": 2.337662337662338e-07, "loss": 1.9682817459106445, "memory(GiB)": 17.76, "step": 36, "token_acc": 0.5535714285714286, "train_speed(iter/s)": 0.726439 }, { "epoch": 0.0012019621219504272, "grad_norm": 6.741703510284424, "learning_rate": 2.402597402597403e-07, "loss": 2.059689998626709, "memory(GiB)": 17.76, "step": 37, "token_acc": 0.5388349514563107, "train_speed(iter/s)": 0.734096 }, { "epoch": 0.0012344475847058441, "grad_norm": 5.287846565246582, "learning_rate": 2.467532467532468e-07, "loss": 1.9030508995056152, "memory(GiB)": 17.76, "step": 38, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 0.741344 }, { "epoch": 0.001266933047461261, "grad_norm": 5.392193794250488, "learning_rate": 2.532467532467533e-07, "loss": 1.9017255306243896, "memory(GiB)": 17.76, "step": 39, "token_acc": 0.59375, "train_speed(iter/s)": 0.748588 }, { "epoch": 0.001299418510216678, "grad_norm": 10.487321853637695, "learning_rate": 2.597402597402598e-07, "loss": 1.9271247386932373, "memory(GiB)": 17.76, "step": 40, "token_acc": 0.647887323943662, "train_speed(iter/s)": 0.755671 }, { "epoch": 0.001331903972972095, "grad_norm": 8.630084991455078, "learning_rate": 2.6623376623376624e-07, "loss": 1.9895939826965332, "memory(GiB)": 17.76, "step": 41, "token_acc": 0.596244131455399, "train_speed(iter/s)": 0.762475 }, { "epoch": 0.001364389435727512, "grad_norm": 6.010883808135986, "learning_rate": 2.7272727272727274e-07, "loss": 1.8967952728271484, "memory(GiB)": 17.76, "step": 42, "token_acc": 0.592, "train_speed(iter/s)": 0.769087 }, { "epoch": 0.0013968748984829289, "grad_norm": 5.191703796386719, "learning_rate": 2.7922077922077925e-07, "loss": 2.068856954574585, "memory(GiB)": 17.76, "step": 43, "token_acc": 0.5672268907563025, "train_speed(iter/s)": 0.775488 }, { "epoch": 0.0014293603612383458, "grad_norm": 25.364669799804688, "learning_rate": 2.8571428571428575e-07, "loss": 2.05696964263916, "memory(GiB)": 17.76, "step": 44, "token_acc": 0.5355191256830601, "train_speed(iter/s)": 0.781591 }, { "epoch": 0.0014618458239937628, "grad_norm": 5.782987117767334, "learning_rate": 2.9220779220779225e-07, "loss": 1.819027066230774, "memory(GiB)": 17.76, "step": 45, "token_acc": 0.5609756097560976, "train_speed(iter/s)": 0.787461 }, { "epoch": 0.0014943312867491797, "grad_norm": 6.220740795135498, "learning_rate": 2.987012987012987e-07, "loss": 1.845192790031433, "memory(GiB)": 17.76, "step": 46, "token_acc": 0.5309278350515464, "train_speed(iter/s)": 0.792763 }, { "epoch": 0.0015268167495045967, "grad_norm": 20.472579956054688, "learning_rate": 3.051948051948052e-07, "loss": 1.960672378540039, "memory(GiB)": 17.76, "step": 47, "token_acc": 0.5736434108527132, "train_speed(iter/s)": 0.797769 }, { "epoch": 0.0015593022122600136, "grad_norm": 5.12841796875, "learning_rate": 3.116883116883117e-07, "loss": 1.9372442960739136, "memory(GiB)": 17.76, "step": 48, "token_acc": 0.5650224215246636, "train_speed(iter/s)": 0.802822 }, { "epoch": 0.0015917876750154306, "grad_norm": 8.060571670532227, "learning_rate": 3.181818181818182e-07, "loss": 1.9207570552825928, "memory(GiB)": 17.76, "step": 49, "token_acc": 0.5486725663716814, "train_speed(iter/s)": 0.807717 }, { "epoch": 0.0016242731377708475, "grad_norm": 5.269157409667969, "learning_rate": 3.2467532467532465e-07, "loss": 1.9933130741119385, "memory(GiB)": 17.76, "step": 50, "token_acc": 0.5610859728506787, "train_speed(iter/s)": 0.813025 }, { "epoch": 0.0016567586005262645, "grad_norm": 5.217515468597412, "learning_rate": 3.3116883116883115e-07, "loss": 1.924784541130066, "memory(GiB)": 17.76, "step": 51, "token_acc": 0.5485436893203883, "train_speed(iter/s)": 0.818031 }, { "epoch": 0.0016892440632816814, "grad_norm": 15.14864730834961, "learning_rate": 3.3766233766233765e-07, "loss": 1.9595508575439453, "memory(GiB)": 17.76, "step": 52, "token_acc": 0.5656108597285068, "train_speed(iter/s)": 0.823084 }, { "epoch": 0.0017217295260370984, "grad_norm": 4.537651538848877, "learning_rate": 3.4415584415584415e-07, "loss": 1.8866500854492188, "memory(GiB)": 17.76, "step": 53, "token_acc": 0.60546875, "train_speed(iter/s)": 0.827948 }, { "epoch": 0.0017542149887925153, "grad_norm": 12.753533363342285, "learning_rate": 3.5064935064935066e-07, "loss": 1.9790043830871582, "memory(GiB)": 17.76, "step": 54, "token_acc": 0.6069868995633187, "train_speed(iter/s)": 0.832639 }, { "epoch": 0.0017867004515479323, "grad_norm": 4.0716753005981445, "learning_rate": 3.5714285714285716e-07, "loss": 1.6501327753067017, "memory(GiB)": 17.76, "step": 55, "token_acc": 0.625, "train_speed(iter/s)": 0.837129 }, { "epoch": 0.0018191859143033492, "grad_norm": 15.1567964553833, "learning_rate": 3.6363636363636366e-07, "loss": 1.8156453371047974, "memory(GiB)": 17.76, "step": 56, "token_acc": 0.5914893617021276, "train_speed(iter/s)": 0.841412 }, { "epoch": 0.0018516713770587662, "grad_norm": 6.801799774169922, "learning_rate": 3.7012987012987016e-07, "loss": 1.8616132736206055, "memory(GiB)": 17.76, "step": 57, "token_acc": 0.6, "train_speed(iter/s)": 0.845306 }, { "epoch": 0.0018841568398141832, "grad_norm": 5.113922595977783, "learning_rate": 3.7662337662337666e-07, "loss": 1.847752332687378, "memory(GiB)": 17.76, "step": 58, "token_acc": 0.5297029702970297, "train_speed(iter/s)": 0.849051 }, { "epoch": 0.0019166423025696, "grad_norm": 43.736202239990234, "learning_rate": 3.8311688311688316e-07, "loss": 1.7647939920425415, "memory(GiB)": 17.76, "step": 59, "token_acc": 0.5882352941176471, "train_speed(iter/s)": 0.852882 }, { "epoch": 0.001949127765325017, "grad_norm": 7.572408676147461, "learning_rate": 3.896103896103896e-07, "loss": 1.7359580993652344, "memory(GiB)": 17.76, "step": 60, "token_acc": 0.5825688073394495, "train_speed(iter/s)": 0.856384 }, { "epoch": 0.001981613228080434, "grad_norm": 5.160432815551758, "learning_rate": 3.961038961038961e-07, "loss": 1.7255544662475586, "memory(GiB)": 17.76, "step": 61, "token_acc": 0.6033057851239669, "train_speed(iter/s)": 0.860209 }, { "epoch": 0.002014098690835851, "grad_norm": 4.367397785186768, "learning_rate": 4.025974025974026e-07, "loss": 1.793118953704834, "memory(GiB)": 17.76, "step": 62, "token_acc": 0.6046511627906976, "train_speed(iter/s)": 0.864009 }, { "epoch": 0.002046584153591268, "grad_norm": 6.801490783691406, "learning_rate": 4.090909090909091e-07, "loss": 1.6581332683563232, "memory(GiB)": 17.76, "step": 63, "token_acc": 0.6370370370370371, "train_speed(iter/s)": 0.867837 }, { "epoch": 0.002079069616346685, "grad_norm": 5.546975135803223, "learning_rate": 4.155844155844156e-07, "loss": 1.6096937656402588, "memory(GiB)": 17.76, "step": 64, "token_acc": 0.6160337552742616, "train_speed(iter/s)": 0.871648 }, { "epoch": 0.002111555079102102, "grad_norm": 5.7651047706604, "learning_rate": 4.220779220779221e-07, "loss": 1.6526615619659424, "memory(GiB)": 17.76, "step": 65, "token_acc": 0.6388888888888888, "train_speed(iter/s)": 0.875358 }, { "epoch": 0.0021440405418575188, "grad_norm": 4.113655090332031, "learning_rate": 4.285714285714286e-07, "loss": 1.7223107814788818, "memory(GiB)": 17.76, "step": 66, "token_acc": 0.6118143459915611, "train_speed(iter/s)": 0.878931 }, { "epoch": 0.0021765260046129357, "grad_norm": 12.548155784606934, "learning_rate": 4.350649350649351e-07, "loss": 1.7727237939834595, "memory(GiB)": 17.76, "step": 67, "token_acc": 0.6392156862745098, "train_speed(iter/s)": 0.8824 }, { "epoch": 0.0022090114673683527, "grad_norm": 22.845407485961914, "learning_rate": 4.415584415584416e-07, "loss": 1.7295849323272705, "memory(GiB)": 17.76, "step": 68, "token_acc": 0.5953488372093023, "train_speed(iter/s)": 0.885786 }, { "epoch": 0.0022414969301237696, "grad_norm": 4.626049995422363, "learning_rate": 4.480519480519481e-07, "loss": 1.6784396171569824, "memory(GiB)": 17.76, "step": 69, "token_acc": 0.5885416666666666, "train_speed(iter/s)": 0.889012 }, { "epoch": 0.0022739823928791866, "grad_norm": 8.001653671264648, "learning_rate": 4.5454545454545457e-07, "loss": 1.6178001165390015, "memory(GiB)": 17.76, "step": 70, "token_acc": 0.5844748858447488, "train_speed(iter/s)": 0.892291 }, { "epoch": 0.0023064678556346035, "grad_norm": 5.170897006988525, "learning_rate": 4.610389610389611e-07, "loss": 1.6663768291473389, "memory(GiB)": 17.76, "step": 71, "token_acc": 0.6188340807174888, "train_speed(iter/s)": 0.895427 }, { "epoch": 0.0023389533183900205, "grad_norm": 4.019551753997803, "learning_rate": 4.675324675324676e-07, "loss": 1.561343789100647, "memory(GiB)": 17.76, "step": 72, "token_acc": 0.6306306306306306, "train_speed(iter/s)": 0.898422 }, { "epoch": 0.0023714387811454374, "grad_norm": 4.091439723968506, "learning_rate": 4.740259740259741e-07, "loss": 1.6988348960876465, "memory(GiB)": 17.76, "step": 73, "token_acc": 0.6545454545454545, "train_speed(iter/s)": 0.901513 }, { "epoch": 0.0024039242439008544, "grad_norm": 9.46254825592041, "learning_rate": 4.805194805194806e-07, "loss": 1.6291099786758423, "memory(GiB)": 17.76, "step": 74, "token_acc": 0.6126482213438735, "train_speed(iter/s)": 0.904425 }, { "epoch": 0.0024364097066562713, "grad_norm": 6.064194679260254, "learning_rate": 4.870129870129871e-07, "loss": 1.554851770401001, "memory(GiB)": 17.76, "step": 75, "token_acc": 0.6444444444444445, "train_speed(iter/s)": 0.907303 }, { "epoch": 0.0024688951694116883, "grad_norm": 3.2921836376190186, "learning_rate": 4.935064935064936e-07, "loss": 1.415891170501709, "memory(GiB)": 17.76, "step": 76, "token_acc": 0.6611295681063123, "train_speed(iter/s)": 0.910153 }, { "epoch": 0.0025013806321671052, "grad_norm": 2.715853214263916, "learning_rate": 5.000000000000001e-07, "loss": 1.467050552368164, "memory(GiB)": 17.76, "step": 77, "token_acc": 0.6267281105990783, "train_speed(iter/s)": 0.912883 }, { "epoch": 0.002533866094922522, "grad_norm": 10.485873222351074, "learning_rate": 5.064935064935066e-07, "loss": 1.5355273485183716, "memory(GiB)": 17.76, "step": 78, "token_acc": 0.5572916666666666, "train_speed(iter/s)": 0.915477 }, { "epoch": 0.002566351557677939, "grad_norm": 9.00793743133545, "learning_rate": 5.129870129870131e-07, "loss": 1.4763696193695068, "memory(GiB)": 17.76, "step": 79, "token_acc": 0.5953488372093023, "train_speed(iter/s)": 0.918189 }, { "epoch": 0.002598837020433356, "grad_norm": 3.0307397842407227, "learning_rate": 5.194805194805196e-07, "loss": 1.5207101106643677, "memory(GiB)": 17.76, "step": 80, "token_acc": 0.5774647887323944, "train_speed(iter/s)": 0.920669 }, { "epoch": 0.002631322483188773, "grad_norm": 6.469976425170898, "learning_rate": 5.25974025974026e-07, "loss": 1.417733907699585, "memory(GiB)": 17.76, "step": 81, "token_acc": 0.6287128712871287, "train_speed(iter/s)": 0.923177 }, { "epoch": 0.00266380794594419, "grad_norm": 4.897367477416992, "learning_rate": 5.324675324675325e-07, "loss": 1.5185246467590332, "memory(GiB)": 17.76, "step": 82, "token_acc": 0.6045454545454545, "train_speed(iter/s)": 0.925801 }, { "epoch": 0.002696293408699607, "grad_norm": 3.2687363624572754, "learning_rate": 5.38961038961039e-07, "loss": 1.5348047018051147, "memory(GiB)": 17.76, "step": 83, "token_acc": 0.5548780487804879, "train_speed(iter/s)": 0.928322 }, { "epoch": 0.002728778871455024, "grad_norm": 9.758577346801758, "learning_rate": 5.454545454545455e-07, "loss": 1.5167959928512573, "memory(GiB)": 17.76, "step": 84, "token_acc": 0.6405529953917051, "train_speed(iter/s)": 0.930712 }, { "epoch": 0.002761264334210441, "grad_norm": 3.4235281944274902, "learning_rate": 5.51948051948052e-07, "loss": 1.2954318523406982, "memory(GiB)": 17.76, "step": 85, "token_acc": 0.6567796610169492, "train_speed(iter/s)": 0.933095 }, { "epoch": 0.0027937497969658578, "grad_norm": 3.522008180618286, "learning_rate": 5.584415584415585e-07, "loss": 1.4012722969055176, "memory(GiB)": 17.76, "step": 86, "token_acc": 0.6178010471204188, "train_speed(iter/s)": 0.935234 }, { "epoch": 0.0028262352597212747, "grad_norm": 3.11869478225708, "learning_rate": 5.64935064935065e-07, "loss": 1.3748557567596436, "memory(GiB)": 17.76, "step": 87, "token_acc": 0.5871559633027523, "train_speed(iter/s)": 0.93752 }, { "epoch": 0.0028587207224766917, "grad_norm": 5.486433506011963, "learning_rate": 5.714285714285715e-07, "loss": 1.3675282001495361, "memory(GiB)": 17.76, "step": 88, "token_acc": 0.6694214876033058, "train_speed(iter/s)": 0.939383 }, { "epoch": 0.0028912061852321086, "grad_norm": 3.99532151222229, "learning_rate": 5.77922077922078e-07, "loss": 1.3585894107818604, "memory(GiB)": 17.76, "step": 89, "token_acc": 0.652542372881356, "train_speed(iter/s)": 0.941132 }, { "epoch": 0.0029236916479875256, "grad_norm": 2.493602752685547, "learning_rate": 5.844155844155845e-07, "loss": 1.3077243566513062, "memory(GiB)": 17.76, "step": 90, "token_acc": 0.6556016597510373, "train_speed(iter/s)": 0.943003 }, { "epoch": 0.0029561771107429425, "grad_norm": 3.072343349456787, "learning_rate": 5.90909090909091e-07, "loss": 1.411389708518982, "memory(GiB)": 17.76, "step": 91, "token_acc": 0.6363636363636364, "train_speed(iter/s)": 0.945173 }, { "epoch": 0.0029886625734983595, "grad_norm": 2.641064405441284, "learning_rate": 5.974025974025974e-07, "loss": 1.3965007066726685, "memory(GiB)": 17.76, "step": 92, "token_acc": 0.6358695652173914, "train_speed(iter/s)": 0.947221 }, { "epoch": 0.0030211480362537764, "grad_norm": 2.588033676147461, "learning_rate": 6.038961038961039e-07, "loss": 1.3423349857330322, "memory(GiB)": 17.76, "step": 93, "token_acc": 0.6388888888888888, "train_speed(iter/s)": 0.949344 }, { "epoch": 0.0030536334990091934, "grad_norm": 3.5711429119110107, "learning_rate": 6.103896103896104e-07, "loss": 1.274167537689209, "memory(GiB)": 17.76, "step": 94, "token_acc": 0.6543778801843319, "train_speed(iter/s)": 0.951356 }, { "epoch": 0.0030861189617646103, "grad_norm": 5.257731914520264, "learning_rate": 6.168831168831169e-07, "loss": 1.2861063480377197, "memory(GiB)": 17.76, "step": 95, "token_acc": 0.6791666666666667, "train_speed(iter/s)": 0.953381 }, { "epoch": 0.0031186044245200273, "grad_norm": 4.065532207489014, "learning_rate": 6.233766233766234e-07, "loss": 1.2407722473144531, "memory(GiB)": 17.76, "step": 96, "token_acc": 0.6824817518248175, "train_speed(iter/s)": 0.955334 }, { "epoch": 0.0031510898872754442, "grad_norm": 4.430940628051758, "learning_rate": 6.298701298701299e-07, "loss": 1.2813987731933594, "memory(GiB)": 17.76, "step": 97, "token_acc": 0.6741071428571429, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.003183575350030861, "grad_norm": 2.779845952987671, "learning_rate": 6.363636363636364e-07, "loss": 1.3235514163970947, "memory(GiB)": 17.76, "step": 98, "token_acc": 0.6651785714285714, "train_speed(iter/s)": 0.959129 }, { "epoch": 0.003216060812786278, "grad_norm": 6.602380275726318, "learning_rate": 6.428571428571428e-07, "loss": 1.2377984523773193, "memory(GiB)": 17.76, "step": 99, "token_acc": 0.6292134831460674, "train_speed(iter/s)": 0.961008 }, { "epoch": 0.003248546275541695, "grad_norm": 2.5316238403320312, "learning_rate": 6.493506493506493e-07, "loss": 1.1828628778457642, "memory(GiB)": 17.76, "step": 100, "token_acc": 0.6732283464566929, "train_speed(iter/s)": 0.962676 }, { "epoch": 0.003281031738297112, "grad_norm": 3.6374406814575195, "learning_rate": 6.558441558441558e-07, "loss": 1.1872124671936035, "memory(GiB)": 17.76, "step": 101, "token_acc": 0.6457399103139013, "train_speed(iter/s)": 0.964102 }, { "epoch": 0.003313517201052529, "grad_norm": 4.202330112457275, "learning_rate": 6.623376623376623e-07, "loss": 1.1959435939788818, "memory(GiB)": 17.76, "step": 102, "token_acc": 0.5953488372093023, "train_speed(iter/s)": 0.965515 }, { "epoch": 0.003346002663807946, "grad_norm": 2.483125686645508, "learning_rate": 6.688311688311688e-07, "loss": 1.179594874382019, "memory(GiB)": 17.76, "step": 103, "token_acc": 0.6666666666666666, "train_speed(iter/s)": 0.96712 }, { "epoch": 0.003378488126563363, "grad_norm": 3.2964746952056885, "learning_rate": 6.753246753246753e-07, "loss": 1.181896448135376, "memory(GiB)": 17.76, "step": 104, "token_acc": 0.6696428571428571, "train_speed(iter/s)": 0.968783 }, { "epoch": 0.00341097358931878, "grad_norm": 4.546812057495117, "learning_rate": 6.818181818181818e-07, "loss": 1.121227741241455, "memory(GiB)": 17.76, "step": 105, "token_acc": 0.6798418972332015, "train_speed(iter/s)": 0.970509 }, { "epoch": 0.003443459052074197, "grad_norm": 47.57494354248047, "learning_rate": 6.883116883116883e-07, "loss": 1.1733388900756836, "memory(GiB)": 17.76, "step": 106, "token_acc": 0.6533333333333333, "train_speed(iter/s)": 0.972236 }, { "epoch": 0.0034759445148296137, "grad_norm": 32.52287673950195, "learning_rate": 6.948051948051948e-07, "loss": 1.0933902263641357, "memory(GiB)": 17.76, "step": 107, "token_acc": 0.6954887218045113, "train_speed(iter/s)": 0.973929 }, { "epoch": 0.0035084299775850307, "grad_norm": 5.128788948059082, "learning_rate": 7.012987012987013e-07, "loss": 1.1078615188598633, "memory(GiB)": 17.76, "step": 108, "token_acc": 0.6590909090909091, "train_speed(iter/s)": 0.975519 }, { "epoch": 0.0035409154403404476, "grad_norm": 2.9209811687469482, "learning_rate": 7.077922077922078e-07, "loss": 1.0732967853546143, "memory(GiB)": 17.76, "step": 109, "token_acc": 0.7436974789915967, "train_speed(iter/s)": 0.977123 }, { "epoch": 0.0035734009030958646, "grad_norm": 6.302421569824219, "learning_rate": 7.142857142857143e-07, "loss": 1.0385569334030151, "memory(GiB)": 17.76, "step": 110, "token_acc": 0.6771300448430493, "train_speed(iter/s)": 0.978535 }, { "epoch": 0.0036058863658512815, "grad_norm": 3.087005853652954, "learning_rate": 7.207792207792208e-07, "loss": 1.03572678565979, "memory(GiB)": 17.76, "step": 111, "token_acc": 0.688034188034188, "train_speed(iter/s)": 0.980164 }, { "epoch": 0.0036383718286066985, "grad_norm": 3.241546630859375, "learning_rate": 7.272727272727273e-07, "loss": 1.008386492729187, "memory(GiB)": 17.76, "step": 112, "token_acc": 0.7012448132780082, "train_speed(iter/s)": 0.981637 }, { "epoch": 0.0036708572913621155, "grad_norm": 3.2861921787261963, "learning_rate": 7.337662337662338e-07, "loss": 1.0246024131774902, "memory(GiB)": 17.76, "step": 113, "token_acc": 0.7042801556420234, "train_speed(iter/s)": 0.982939 }, { "epoch": 0.0037033427541175324, "grad_norm": 5.945087432861328, "learning_rate": 7.402597402597403e-07, "loss": 1.0436748266220093, "memory(GiB)": 17.76, "step": 114, "token_acc": 0.6779661016949152, "train_speed(iter/s)": 0.984556 }, { "epoch": 0.0037358282168729494, "grad_norm": 2.9983158111572266, "learning_rate": 7.467532467532468e-07, "loss": 0.9882689714431763, "memory(GiB)": 17.76, "step": 115, "token_acc": 0.6612244897959184, "train_speed(iter/s)": 0.986067 }, { "epoch": 0.0037683136796283663, "grad_norm": 2.5800163745880127, "learning_rate": 7.532467532467533e-07, "loss": 0.9402341842651367, "memory(GiB)": 17.76, "step": 116, "token_acc": 0.7238805970149254, "train_speed(iter/s)": 0.987581 }, { "epoch": 0.0038007991423837833, "grad_norm": 7.973137855529785, "learning_rate": 7.597402597402598e-07, "loss": 1.0395736694335938, "memory(GiB)": 17.76, "step": 117, "token_acc": 0.6971153846153846, "train_speed(iter/s)": 0.989174 }, { "epoch": 0.0038332846051392, "grad_norm": 5.260778427124023, "learning_rate": 7.662337662337663e-07, "loss": 1.0317330360412598, "memory(GiB)": 17.76, "step": 118, "token_acc": 0.6944444444444444, "train_speed(iter/s)": 0.99067 }, { "epoch": 0.003865770067894617, "grad_norm": 1.8395005464553833, "learning_rate": 7.727272727272727e-07, "loss": 0.9742636680603027, "memory(GiB)": 17.76, "step": 119, "token_acc": 0.736, "train_speed(iter/s)": 0.992046 }, { "epoch": 0.003898255530650034, "grad_norm": 2.3460822105407715, "learning_rate": 7.792207792207792e-07, "loss": 0.9320727586746216, "memory(GiB)": 17.76, "step": 120, "token_acc": 0.6636363636363637, "train_speed(iter/s)": 0.993545 }, { "epoch": 0.003930740993405451, "grad_norm": 3.220935821533203, "learning_rate": 7.857142857142857e-07, "loss": 0.9042680263519287, "memory(GiB)": 17.76, "step": 121, "token_acc": 0.6839622641509434, "train_speed(iter/s)": 0.9948 }, { "epoch": 0.003963226456160868, "grad_norm": 1.8865607976913452, "learning_rate": 7.922077922077922e-07, "loss": 0.9167282581329346, "memory(GiB)": 17.76, "step": 122, "token_acc": 0.7361111111111112, "train_speed(iter/s)": 0.996136 }, { "epoch": 0.003995711918916285, "grad_norm": 2.797013759613037, "learning_rate": 7.987012987012987e-07, "loss": 0.9177172183990479, "memory(GiB)": 17.76, "step": 123, "token_acc": 0.72, "train_speed(iter/s)": 0.997577 }, { "epoch": 0.004028197381671702, "grad_norm": 2.711953639984131, "learning_rate": 8.051948051948052e-07, "loss": 0.8555880784988403, "memory(GiB)": 17.76, "step": 124, "token_acc": 0.7725490196078432, "train_speed(iter/s)": 0.998931 }, { "epoch": 0.004060682844427119, "grad_norm": 2.217308759689331, "learning_rate": 8.116883116883117e-07, "loss": 0.8265160918235779, "memory(GiB)": 17.76, "step": 125, "token_acc": 0.7229437229437229, "train_speed(iter/s)": 1.000298 }, { "epoch": 0.004093168307182536, "grad_norm": 2.163361072540283, "learning_rate": 8.181818181818182e-07, "loss": 0.8209342360496521, "memory(GiB)": 17.76, "step": 126, "token_acc": 0.75, "train_speed(iter/s)": 1.001723 }, { "epoch": 0.004125653769937953, "grad_norm": 9.47025203704834, "learning_rate": 8.246753246753247e-07, "loss": 0.8831616044044495, "memory(GiB)": 17.76, "step": 127, "token_acc": 0.6451612903225806, "train_speed(iter/s)": 1.002995 }, { "epoch": 0.00415813923269337, "grad_norm": 3.689645528793335, "learning_rate": 8.311688311688312e-07, "loss": 0.9255027770996094, "memory(GiB)": 17.76, "step": 128, "token_acc": 0.6607142857142857, "train_speed(iter/s)": 1.004156 }, { "epoch": 0.004190624695448787, "grad_norm": 4.4254302978515625, "learning_rate": 8.376623376623377e-07, "loss": 0.8434635996818542, "memory(GiB)": 17.76, "step": 129, "token_acc": 0.7208333333333333, "train_speed(iter/s)": 1.005106 }, { "epoch": 0.004223110158204204, "grad_norm": 2.574427366256714, "learning_rate": 8.441558441558442e-07, "loss": 0.8511883020401001, "memory(GiB)": 17.76, "step": 130, "token_acc": 0.6813186813186813, "train_speed(iter/s)": 1.005956 }, { "epoch": 0.0042555956209596206, "grad_norm": 6.929450035095215, "learning_rate": 8.506493506493507e-07, "loss": 0.8527588844299316, "memory(GiB)": 17.76, "step": 131, "token_acc": 0.7489539748953975, "train_speed(iter/s)": 1.006603 }, { "epoch": 0.0042880810837150375, "grad_norm": 1.7266536951065063, "learning_rate": 8.571428571428572e-07, "loss": 0.7799583673477173, "memory(GiB)": 17.76, "step": 132, "token_acc": 0.7222222222222222, "train_speed(iter/s)": 1.007917 }, { "epoch": 0.0043205665464704545, "grad_norm": 8.34188175201416, "learning_rate": 8.636363636363637e-07, "loss": 0.7494421005249023, "memory(GiB)": 17.76, "step": 133, "token_acc": 0.7831858407079646, "train_speed(iter/s)": 1.009135 }, { "epoch": 0.004353052009225871, "grad_norm": 2.1084036827087402, "learning_rate": 8.701298701298702e-07, "loss": 0.7496717572212219, "memory(GiB)": 17.76, "step": 134, "token_acc": 0.7459677419354839, "train_speed(iter/s)": 1.010375 }, { "epoch": 0.004385537471981288, "grad_norm": 2.472133159637451, "learning_rate": 8.766233766233767e-07, "loss": 0.7508082985877991, "memory(GiB)": 17.76, "step": 135, "token_acc": 0.7591240875912408, "train_speed(iter/s)": 1.011729 }, { "epoch": 0.004418022934736705, "grad_norm": 1.6212021112442017, "learning_rate": 8.831168831168832e-07, "loss": 0.7314716577529907, "memory(GiB)": 17.76, "step": 136, "token_acc": 0.775, "train_speed(iter/s)": 1.012892 }, { "epoch": 0.004450508397492122, "grad_norm": 1.963639259338379, "learning_rate": 8.896103896103897e-07, "loss": 0.7102397680282593, "memory(GiB)": 17.76, "step": 137, "token_acc": 0.7963800904977375, "train_speed(iter/s)": 1.013936 }, { "epoch": 0.004482993860247539, "grad_norm": 41.47132873535156, "learning_rate": 8.961038961038962e-07, "loss": 0.7520434260368347, "memory(GiB)": 17.76, "step": 138, "token_acc": 0.7549407114624506, "train_speed(iter/s)": 1.014976 }, { "epoch": 0.004515479323002956, "grad_norm": 1.9760887622833252, "learning_rate": 9.025974025974026e-07, "loss": 0.7006360292434692, "memory(GiB)": 17.76, "step": 139, "token_acc": 0.7740585774058577, "train_speed(iter/s)": 1.015842 }, { "epoch": 0.004547964785758373, "grad_norm": 1.993683099746704, "learning_rate": 9.090909090909091e-07, "loss": 0.6759151220321655, "memory(GiB)": 17.76, "step": 140, "token_acc": 0.7605042016806722, "train_speed(iter/s)": 1.016885 }, { "epoch": 0.00458045024851379, "grad_norm": 2.5373446941375732, "learning_rate": 9.155844155844156e-07, "loss": 0.6616238355636597, "memory(GiB)": 17.76, "step": 141, "token_acc": 0.7407407407407407, "train_speed(iter/s)": 1.017706 }, { "epoch": 0.004612935711269207, "grad_norm": 2.6133460998535156, "learning_rate": 9.220779220779221e-07, "loss": 0.7093609571456909, "memory(GiB)": 17.76, "step": 142, "token_acc": 0.7832512315270936, "train_speed(iter/s)": 1.018753 }, { "epoch": 0.004645421174024624, "grad_norm": 1.9645118713378906, "learning_rate": 9.285714285714287e-07, "loss": 0.616824746131897, "memory(GiB)": 17.76, "step": 143, "token_acc": 0.7722007722007722, "train_speed(iter/s)": 1.019639 }, { "epoch": 0.004677906636780041, "grad_norm": 3.515613317489624, "learning_rate": 9.350649350649352e-07, "loss": 0.6787400245666504, "memory(GiB)": 17.76, "step": 144, "token_acc": 0.7805907172995781, "train_speed(iter/s)": 1.020616 }, { "epoch": 0.004710392099535458, "grad_norm": 3.5163369178771973, "learning_rate": 9.415584415584417e-07, "loss": 0.6673519611358643, "memory(GiB)": 17.76, "step": 145, "token_acc": 0.746606334841629, "train_speed(iter/s)": 1.02152 }, { "epoch": 0.004742877562290875, "grad_norm": 2.254845380783081, "learning_rate": 9.480519480519482e-07, "loss": 0.620577335357666, "memory(GiB)": 17.76, "step": 146, "token_acc": 0.7637130801687764, "train_speed(iter/s)": 1.022531 }, { "epoch": 0.004775363025046292, "grad_norm": 1.9471118450164795, "learning_rate": 9.545454545454548e-07, "loss": 0.6681690216064453, "memory(GiB)": 17.76, "step": 147, "token_acc": 0.7832512315270936, "train_speed(iter/s)": 1.023659 }, { "epoch": 0.004807848487801709, "grad_norm": 2.0605499744415283, "learning_rate": 9.610389610389612e-07, "loss": 0.6241434812545776, "memory(GiB)": 17.76, "step": 148, "token_acc": 0.7918552036199095, "train_speed(iter/s)": 1.024735 }, { "epoch": 0.004840333950557126, "grad_norm": 2.674926519393921, "learning_rate": 9.675324675324676e-07, "loss": 0.6130306720733643, "memory(GiB)": 17.76, "step": 149, "token_acc": 0.786096256684492, "train_speed(iter/s)": 1.0257 }, { "epoch": 0.004872819413312543, "grad_norm": 7.385551929473877, "learning_rate": 9.740259740259742e-07, "loss": 0.6205039024353027, "memory(GiB)": 17.76, "step": 150, "token_acc": 0.7680608365019012, "train_speed(iter/s)": 1.026681 }, { "epoch": 0.00490530487606796, "grad_norm": 5.163754940032959, "learning_rate": 9.805194805194806e-07, "loss": 0.6025283932685852, "memory(GiB)": 17.76, "step": 151, "token_acc": 0.8067632850241546, "train_speed(iter/s)": 1.02765 }, { "epoch": 0.0049377903388233765, "grad_norm": 1.4703954458236694, "learning_rate": 9.870129870129872e-07, "loss": 0.5769739747047424, "memory(GiB)": 17.76, "step": 152, "token_acc": 0.7751937984496124, "train_speed(iter/s)": 1.028632 }, { "epoch": 0.0049702758015787935, "grad_norm": 1.842751383781433, "learning_rate": 9.935064935064936e-07, "loss": 0.6001898050308228, "memory(GiB)": 17.76, "step": 153, "token_acc": 0.7796610169491526, "train_speed(iter/s)": 1.029573 }, { "epoch": 0.0050027612643342104, "grad_norm": 1.592110276222229, "learning_rate": 1.0000000000000002e-06, "loss": 0.5471850633621216, "memory(GiB)": 17.76, "step": 154, "token_acc": 0.8169642857142857, "train_speed(iter/s)": 1.030492 }, { "epoch": 0.005035246727089627, "grad_norm": 2.906395435333252, "learning_rate": 1.0064935064935066e-06, "loss": 0.5678552389144897, "memory(GiB)": 17.76, "step": 155, "token_acc": 0.8036529680365296, "train_speed(iter/s)": 1.03108 }, { "epoch": 0.005067732189845044, "grad_norm": 3.0354697704315186, "learning_rate": 1.0129870129870132e-06, "loss": 0.5738771557807922, "memory(GiB)": 17.76, "step": 156, "token_acc": 0.7667844522968198, "train_speed(iter/s)": 1.031756 }, { "epoch": 0.005100217652600461, "grad_norm": 1.6863384246826172, "learning_rate": 1.0194805194805196e-06, "loss": 0.5651767253875732, "memory(GiB)": 17.76, "step": 157, "token_acc": 0.784, "train_speed(iter/s)": 1.03261 }, { "epoch": 0.005132703115355878, "grad_norm": 1.3551757335662842, "learning_rate": 1.0259740259740262e-06, "loss": 0.5195839405059814, "memory(GiB)": 17.76, "step": 158, "token_acc": 0.7936507936507936, "train_speed(iter/s)": 1.033508 }, { "epoch": 0.005165188578111295, "grad_norm": 4.454302787780762, "learning_rate": 1.0324675324675326e-06, "loss": 0.5628905892372131, "memory(GiB)": 17.76, "step": 159, "token_acc": 0.8194444444444444, "train_speed(iter/s)": 1.034425 }, { "epoch": 0.005197674040866712, "grad_norm": 1.3632216453552246, "learning_rate": 1.0389610389610392e-06, "loss": 0.5849941968917847, "memory(GiB)": 17.76, "step": 160, "token_acc": 0.7939914163090128, "train_speed(iter/s)": 1.03531 }, { "epoch": 0.005230159503622129, "grad_norm": 1.8875478506088257, "learning_rate": 1.0454545454545456e-06, "loss": 0.5269724726676941, "memory(GiB)": 17.76, "step": 161, "token_acc": 0.8196078431372549, "train_speed(iter/s)": 1.036125 }, { "epoch": 0.005262644966377546, "grad_norm": 2.2476449012756348, "learning_rate": 1.051948051948052e-06, "loss": 0.5709660053253174, "memory(GiB)": 17.76, "step": 162, "token_acc": 0.7878787878787878, "train_speed(iter/s)": 1.037101 }, { "epoch": 0.005295130429132963, "grad_norm": 1.7770518064498901, "learning_rate": 1.0584415584415584e-06, "loss": 0.5637598633766174, "memory(GiB)": 17.76, "step": 163, "token_acc": 0.7902097902097902, "train_speed(iter/s)": 1.037819 }, { "epoch": 0.00532761589188838, "grad_norm": 6.297031879425049, "learning_rate": 1.064935064935065e-06, "loss": 0.5520730018615723, "memory(GiB)": 17.76, "step": 164, "token_acc": 0.7630522088353414, "train_speed(iter/s)": 1.038621 }, { "epoch": 0.005360101354643797, "grad_norm": 5.162660121917725, "learning_rate": 1.0714285714285714e-06, "loss": 0.5392230749130249, "memory(GiB)": 17.76, "step": 165, "token_acc": 0.8007968127490039, "train_speed(iter/s)": 1.039451 }, { "epoch": 0.005392586817399214, "grad_norm": 2.2927019596099854, "learning_rate": 1.077922077922078e-06, "loss": 0.524531364440918, "memory(GiB)": 17.76, "step": 166, "token_acc": 0.793859649122807, "train_speed(iter/s)": 1.040296 }, { "epoch": 0.005425072280154631, "grad_norm": 1.9196659326553345, "learning_rate": 1.0844155844155844e-06, "loss": 0.5078117847442627, "memory(GiB)": 17.76, "step": 167, "token_acc": 0.8280542986425339, "train_speed(iter/s)": 1.041024 }, { "epoch": 0.005457557742910048, "grad_norm": 3.4301795959472656, "learning_rate": 1.090909090909091e-06, "loss": 0.536963939666748, "memory(GiB)": 17.76, "step": 168, "token_acc": 0.7815126050420168, "train_speed(iter/s)": 1.041838 }, { "epoch": 0.005490043205665465, "grad_norm": 2.775611400604248, "learning_rate": 1.0974025974025974e-06, "loss": 0.5384215116500854, "memory(GiB)": 17.76, "step": 169, "token_acc": 0.8190045248868778, "train_speed(iter/s)": 1.042507 }, { "epoch": 0.005522528668420882, "grad_norm": 1.9190115928649902, "learning_rate": 1.103896103896104e-06, "loss": 0.5228817462921143, "memory(GiB)": 17.76, "step": 170, "token_acc": 0.8095238095238095, "train_speed(iter/s)": 1.042978 }, { "epoch": 0.005555014131176299, "grad_norm": 4.480566024780273, "learning_rate": 1.1103896103896104e-06, "loss": 0.4889138638973236, "memory(GiB)": 17.76, "step": 171, "token_acc": 0.8185654008438819, "train_speed(iter/s)": 1.043457 }, { "epoch": 0.0055874995939317156, "grad_norm": 2.200014114379883, "learning_rate": 1.116883116883117e-06, "loss": 0.5094919204711914, "memory(GiB)": 17.76, "step": 172, "token_acc": 0.8305084745762712, "train_speed(iter/s)": 1.043667 }, { "epoch": 0.0056199850566871325, "grad_norm": 1.7071954011917114, "learning_rate": 1.1233766233766234e-06, "loss": 0.5249509811401367, "memory(GiB)": 17.76, "step": 173, "token_acc": 0.7816901408450704, "train_speed(iter/s)": 1.04429 }, { "epoch": 0.0056524705194425495, "grad_norm": 1.3096760511398315, "learning_rate": 1.12987012987013e-06, "loss": 0.5032217502593994, "memory(GiB)": 17.76, "step": 174, "token_acc": 0.7904411764705882, "train_speed(iter/s)": 1.044925 }, { "epoch": 0.005684955982197966, "grad_norm": 1.936140775680542, "learning_rate": 1.1363636363636364e-06, "loss": 0.49176985025405884, "memory(GiB)": 17.76, "step": 175, "token_acc": 0.828125, "train_speed(iter/s)": 1.045589 }, { "epoch": 0.005717441444953383, "grad_norm": 1.409097671508789, "learning_rate": 1.142857142857143e-06, "loss": 0.4961215853691101, "memory(GiB)": 17.76, "step": 176, "token_acc": 0.8078602620087336, "train_speed(iter/s)": 1.046332 }, { "epoch": 0.0057499269077088, "grad_norm": 27.561532974243164, "learning_rate": 1.1493506493506494e-06, "loss": 0.5065730214118958, "memory(GiB)": 17.76, "step": 177, "token_acc": 0.7788018433179723, "train_speed(iter/s)": 1.047109 }, { "epoch": 0.005782412370464217, "grad_norm": 1.79648756980896, "learning_rate": 1.155844155844156e-06, "loss": 0.5053510665893555, "memory(GiB)": 17.76, "step": 178, "token_acc": 0.8211009174311926, "train_speed(iter/s)": 1.047804 }, { "epoch": 0.005814897833219634, "grad_norm": 2.203596591949463, "learning_rate": 1.1623376623376624e-06, "loss": 0.5245300531387329, "memory(GiB)": 17.76, "step": 179, "token_acc": 0.8080808080808081, "train_speed(iter/s)": 1.048451 }, { "epoch": 0.005847383295975051, "grad_norm": 3.1434764862060547, "learning_rate": 1.168831168831169e-06, "loss": 0.48547446727752686, "memory(GiB)": 17.76, "step": 180, "token_acc": 0.8327137546468402, "train_speed(iter/s)": 1.049084 }, { "epoch": 0.005879868758730468, "grad_norm": 1.4957739114761353, "learning_rate": 1.1753246753246754e-06, "loss": 0.4851435422897339, "memory(GiB)": 17.76, "step": 181, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 1.049713 }, { "epoch": 0.005912354221485885, "grad_norm": 2.8647241592407227, "learning_rate": 1.181818181818182e-06, "loss": 0.500892162322998, "memory(GiB)": 17.76, "step": 182, "token_acc": 0.8284313725490197, "train_speed(iter/s)": 1.050447 }, { "epoch": 0.005944839684241302, "grad_norm": 2.1355390548706055, "learning_rate": 1.1883116883116884e-06, "loss": 0.4779704809188843, "memory(GiB)": 17.76, "step": 183, "token_acc": 0.8260869565217391, "train_speed(iter/s)": 1.051228 }, { "epoch": 0.005977325146996719, "grad_norm": 2.0558760166168213, "learning_rate": 1.1948051948051948e-06, "loss": 0.4808422029018402, "memory(GiB)": 17.76, "step": 184, "token_acc": 0.7811320754716982, "train_speed(iter/s)": 1.051734 }, { "epoch": 0.006009810609752136, "grad_norm": 1.7866040468215942, "learning_rate": 1.2012987012987014e-06, "loss": 0.47075897455215454, "memory(GiB)": 17.76, "step": 185, "token_acc": 0.8036529680365296, "train_speed(iter/s)": 1.052419 }, { "epoch": 0.006042296072507553, "grad_norm": 4.960415363311768, "learning_rate": 1.2077922077922078e-06, "loss": 0.4819193184375763, "memory(GiB)": 17.76, "step": 186, "token_acc": 0.8430493273542601, "train_speed(iter/s)": 1.053055 }, { "epoch": 0.00607478153526297, "grad_norm": 1.9266282320022583, "learning_rate": 1.2142857142857144e-06, "loss": 0.48864829540252686, "memory(GiB)": 17.76, "step": 187, "token_acc": 0.82421875, "train_speed(iter/s)": 1.053621 }, { "epoch": 0.006107266998018387, "grad_norm": 1.692337155342102, "learning_rate": 1.2207792207792208e-06, "loss": 0.4532298445701599, "memory(GiB)": 17.76, "step": 188, "token_acc": 0.8447488584474886, "train_speed(iter/s)": 1.054265 }, { "epoch": 0.006139752460773804, "grad_norm": 2.316192150115967, "learning_rate": 1.2272727272727274e-06, "loss": 0.45455503463745117, "memory(GiB)": 17.76, "step": 189, "token_acc": 0.825, "train_speed(iter/s)": 1.054972 }, { "epoch": 0.006172237923529221, "grad_norm": 1.78115975856781, "learning_rate": 1.2337662337662338e-06, "loss": 0.46239131689071655, "memory(GiB)": 17.76, "step": 190, "token_acc": 0.812206572769953, "train_speed(iter/s)": 1.055599 }, { "epoch": 0.006204723386284638, "grad_norm": 1.8972193002700806, "learning_rate": 1.2402597402597404e-06, "loss": 0.43407344818115234, "memory(GiB)": 17.76, "step": 191, "token_acc": 0.8205128205128205, "train_speed(iter/s)": 1.056305 }, { "epoch": 0.006237208849040055, "grad_norm": 1.8080517053604126, "learning_rate": 1.2467532467532468e-06, "loss": 0.45763280987739563, "memory(GiB)": 17.76, "step": 192, "token_acc": 0.8401486988847584, "train_speed(iter/s)": 1.05692 }, { "epoch": 0.0062696943117954715, "grad_norm": 3.078810930252075, "learning_rate": 1.2532467532467532e-06, "loss": 0.4207853674888611, "memory(GiB)": 17.76, "step": 193, "token_acc": 0.8770053475935828, "train_speed(iter/s)": 1.057522 }, { "epoch": 0.0063021797745508885, "grad_norm": 2.497946262359619, "learning_rate": 1.2597402597402598e-06, "loss": 0.4340690076351166, "memory(GiB)": 17.76, "step": 194, "token_acc": 0.8359788359788359, "train_speed(iter/s)": 1.058164 }, { "epoch": 0.006334665237306305, "grad_norm": 2.135392189025879, "learning_rate": 1.2662337662337662e-06, "loss": 0.4370834231376648, "memory(GiB)": 17.76, "step": 195, "token_acc": 0.8207547169811321, "train_speed(iter/s)": 1.058785 }, { "epoch": 0.006367150700061722, "grad_norm": 1.883213996887207, "learning_rate": 1.2727272727272728e-06, "loss": 0.4556792676448822, "memory(GiB)": 17.76, "step": 196, "token_acc": 0.7979274611398963, "train_speed(iter/s)": 1.059351 }, { "epoch": 0.006399636162817139, "grad_norm": 2.6498942375183105, "learning_rate": 1.2792207792207792e-06, "loss": 0.4303506314754486, "memory(GiB)": 17.76, "step": 197, "token_acc": 0.8014981273408239, "train_speed(iter/s)": 1.060024 }, { "epoch": 0.006432121625572556, "grad_norm": 2.03049635887146, "learning_rate": 1.2857142857142856e-06, "loss": 0.44616734981536865, "memory(GiB)": 17.76, "step": 198, "token_acc": 0.8174603174603174, "train_speed(iter/s)": 1.060576 }, { "epoch": 0.006464607088327973, "grad_norm": 1.6429883241653442, "learning_rate": 1.2922077922077922e-06, "loss": 0.41276252269744873, "memory(GiB)": 17.76, "step": 199, "token_acc": 0.8311111111111111, "train_speed(iter/s)": 1.061168 }, { "epoch": 0.00649709255108339, "grad_norm": 2.4720053672790527, "learning_rate": 1.2987012987012986e-06, "loss": 0.4523230791091919, "memory(GiB)": 17.76, "step": 200, "token_acc": 0.8235294117647058, "train_speed(iter/s)": 1.061763 }, { "epoch": 0.006529578013838807, "grad_norm": 2.4778192043304443, "learning_rate": 1.3051948051948052e-06, "loss": 0.42740100622177124, "memory(GiB)": 17.76, "step": 201, "token_acc": 0.8290598290598291, "train_speed(iter/s)": 1.062335 }, { "epoch": 0.006562063476594224, "grad_norm": 3.1783607006073, "learning_rate": 1.3116883116883116e-06, "loss": 0.418241024017334, "memory(GiB)": 17.76, "step": 202, "token_acc": 0.848, "train_speed(iter/s)": 1.06288 }, { "epoch": 0.006594548939349641, "grad_norm": 2.6287856101989746, "learning_rate": 1.3181818181818182e-06, "loss": 0.42416125535964966, "memory(GiB)": 17.76, "step": 203, "token_acc": 0.8663594470046083, "train_speed(iter/s)": 1.063513 }, { "epoch": 0.006627034402105058, "grad_norm": 3.2066495418548584, "learning_rate": 1.3246753246753246e-06, "loss": 0.41297879815101624, "memory(GiB)": 17.76, "step": 204, "token_acc": 0.8540772532188842, "train_speed(iter/s)": 1.063975 }, { "epoch": 0.006659519864860475, "grad_norm": 2.4445083141326904, "learning_rate": 1.3311688311688312e-06, "loss": 0.4228321313858032, "memory(GiB)": 17.76, "step": 205, "token_acc": 0.8558139534883721, "train_speed(iter/s)": 1.064516 }, { "epoch": 0.006692005327615892, "grad_norm": 2.249892473220825, "learning_rate": 1.3376623376623376e-06, "loss": 0.43462973833084106, "memory(GiB)": 17.76, "step": 206, "token_acc": 0.8359375, "train_speed(iter/s)": 1.065077 }, { "epoch": 0.006724490790371309, "grad_norm": 2.369210720062256, "learning_rate": 1.3441558441558442e-06, "loss": 0.410362184047699, "memory(GiB)": 17.76, "step": 207, "token_acc": 0.8774509803921569, "train_speed(iter/s)": 1.065649 }, { "epoch": 0.006756976253126726, "grad_norm": 2.30485200881958, "learning_rate": 1.3506493506493506e-06, "loss": 0.3854529559612274, "memory(GiB)": 17.76, "step": 208, "token_acc": 0.8632478632478633, "train_speed(iter/s)": 1.066158 }, { "epoch": 0.006789461715882143, "grad_norm": 2.8437111377716064, "learning_rate": 1.3571428571428572e-06, "loss": 0.41888588666915894, "memory(GiB)": 17.76, "step": 209, "token_acc": 0.8298969072164949, "train_speed(iter/s)": 1.066677 }, { "epoch": 0.00682194717863756, "grad_norm": 9.058516502380371, "learning_rate": 1.3636363636363636e-06, "loss": 0.38564828038215637, "memory(GiB)": 17.76, "step": 210, "token_acc": 0.824, "train_speed(iter/s)": 1.067149 }, { "epoch": 0.006854432641392977, "grad_norm": 2.5991263389587402, "learning_rate": 1.3701298701298702e-06, "loss": 0.37738925218582153, "memory(GiB)": 17.76, "step": 211, "token_acc": 0.8497854077253219, "train_speed(iter/s)": 1.067427 }, { "epoch": 0.006886918104148394, "grad_norm": 3.4502205848693848, "learning_rate": 1.3766233766233766e-06, "loss": 0.38916730880737305, "memory(GiB)": 17.76, "step": 212, "token_acc": 0.8674033149171271, "train_speed(iter/s)": 1.067749 }, { "epoch": 0.0069194035669038105, "grad_norm": 2.127495050430298, "learning_rate": 1.3831168831168832e-06, "loss": 0.4063950181007385, "memory(GiB)": 17.76, "step": 213, "token_acc": 0.8604651162790697, "train_speed(iter/s)": 1.068017 }, { "epoch": 0.0069518890296592275, "grad_norm": 2.521939754486084, "learning_rate": 1.3896103896103896e-06, "loss": 0.3800434470176697, "memory(GiB)": 17.76, "step": 214, "token_acc": 0.8762376237623762, "train_speed(iter/s)": 1.068512 }, { "epoch": 0.0069843744924146444, "grad_norm": 2.945974349975586, "learning_rate": 1.3961038961038962e-06, "loss": 0.3911517262458801, "memory(GiB)": 17.76, "step": 215, "token_acc": 0.8565737051792829, "train_speed(iter/s)": 1.069008 }, { "epoch": 0.007016859955170061, "grad_norm": 1.8718355894088745, "learning_rate": 1.4025974025974026e-06, "loss": 0.366415798664093, "memory(GiB)": 17.76, "step": 216, "token_acc": 0.8450184501845018, "train_speed(iter/s)": 1.069486 }, { "epoch": 0.007049345417925478, "grad_norm": 2.794412136077881, "learning_rate": 1.409090909090909e-06, "loss": 0.3570905029773712, "memory(GiB)": 17.76, "step": 217, "token_acc": 0.8487084870848709, "train_speed(iter/s)": 1.070066 }, { "epoch": 0.007081830880680895, "grad_norm": 2.1991491317749023, "learning_rate": 1.4155844155844156e-06, "loss": 0.36062413454055786, "memory(GiB)": 17.76, "step": 218, "token_acc": 0.8669724770642202, "train_speed(iter/s)": 1.070592 }, { "epoch": 0.007114316343436312, "grad_norm": 3.247448205947876, "learning_rate": 1.422077922077922e-06, "loss": 0.3642061650753021, "memory(GiB)": 17.76, "step": 219, "token_acc": 0.9052631578947369, "train_speed(iter/s)": 1.071075 }, { "epoch": 0.007146801806191729, "grad_norm": 5.591080665588379, "learning_rate": 1.4285714285714286e-06, "loss": 0.3606182932853699, "memory(GiB)": 17.76, "step": 220, "token_acc": 0.8847926267281107, "train_speed(iter/s)": 1.071528 }, { "epoch": 0.007179287268947146, "grad_norm": 3.557053565979004, "learning_rate": 1.435064935064935e-06, "loss": 0.36216408014297485, "memory(GiB)": 17.76, "step": 221, "token_acc": 0.8660714285714286, "train_speed(iter/s)": 1.072025 }, { "epoch": 0.007211772731702563, "grad_norm": 2.7301082611083984, "learning_rate": 1.4415584415584416e-06, "loss": 0.3587989807128906, "memory(GiB)": 17.76, "step": 222, "token_acc": 0.8975609756097561, "train_speed(iter/s)": 1.072532 }, { "epoch": 0.00724425819445798, "grad_norm": 4.960174083709717, "learning_rate": 1.448051948051948e-06, "loss": 0.3363909125328064, "memory(GiB)": 17.76, "step": 223, "token_acc": 0.8508064516129032, "train_speed(iter/s)": 1.073056 }, { "epoch": 0.007276743657213397, "grad_norm": 2.3565704822540283, "learning_rate": 1.4545454545454546e-06, "loss": 0.35089319944381714, "memory(GiB)": 17.76, "step": 224, "token_acc": 0.8783269961977186, "train_speed(iter/s)": 1.073592 }, { "epoch": 0.007309229119968814, "grad_norm": 2.667978286743164, "learning_rate": 1.461038961038961e-06, "loss": 0.342672735452652, "memory(GiB)": 17.76, "step": 225, "token_acc": 0.8688524590163934, "train_speed(iter/s)": 1.074093 }, { "epoch": 0.007341714582724231, "grad_norm": 5.803235054016113, "learning_rate": 1.4675324675324676e-06, "loss": 0.3464215397834778, "memory(GiB)": 17.76, "step": 226, "token_acc": 0.8487084870848709, "train_speed(iter/s)": 1.074513 }, { "epoch": 0.007374200045479648, "grad_norm": 2.669328212738037, "learning_rate": 1.474025974025974e-06, "loss": 0.32119035720825195, "memory(GiB)": 17.76, "step": 227, "token_acc": 0.8873873873873874, "train_speed(iter/s)": 1.074978 }, { "epoch": 0.007406685508235065, "grad_norm": 3.203291654586792, "learning_rate": 1.4805194805194806e-06, "loss": 0.34284865856170654, "memory(GiB)": 17.76, "step": 228, "token_acc": 0.868421052631579, "train_speed(iter/s)": 1.075509 }, { "epoch": 0.007439170970990482, "grad_norm": 4.560166835784912, "learning_rate": 1.487012987012987e-06, "loss": 0.32827940583229065, "memory(GiB)": 17.76, "step": 229, "token_acc": 0.8821138211382114, "train_speed(iter/s)": 1.075991 }, { "epoch": 0.007471656433745899, "grad_norm": 1.7807403802871704, "learning_rate": 1.4935064935064936e-06, "loss": 0.31533902883529663, "memory(GiB)": 17.76, "step": 230, "token_acc": 0.8725099601593626, "train_speed(iter/s)": 1.076495 }, { "epoch": 0.007504141896501316, "grad_norm": 3.9388327598571777, "learning_rate": 1.5e-06, "loss": 0.3126888871192932, "memory(GiB)": 17.76, "step": 231, "token_acc": 0.8986784140969163, "train_speed(iter/s)": 1.077004 }, { "epoch": 0.007536627359256733, "grad_norm": 4.64258337020874, "learning_rate": 1.5064935064935066e-06, "loss": 0.31242823600769043, "memory(GiB)": 17.76, "step": 232, "token_acc": 0.907563025210084, "train_speed(iter/s)": 1.077456 }, { "epoch": 0.0075691128220121496, "grad_norm": 4.018054962158203, "learning_rate": 1.512987012987013e-06, "loss": 0.3072398900985718, "memory(GiB)": 17.76, "step": 233, "token_acc": 0.8779527559055118, "train_speed(iter/s)": 1.07796 }, { "epoch": 0.0076015982847675665, "grad_norm": 2.6741249561309814, "learning_rate": 1.5194805194805196e-06, "loss": 0.3187340497970581, "memory(GiB)": 17.76, "step": 234, "token_acc": 0.8679245283018868, "train_speed(iter/s)": 1.078363 }, { "epoch": 0.0076340837475229835, "grad_norm": 2.9106836318969727, "learning_rate": 1.525974025974026e-06, "loss": 0.29086849093437195, "memory(GiB)": 17.76, "step": 235, "token_acc": 0.91324200913242, "train_speed(iter/s)": 1.078808 }, { "epoch": 0.0076665692102784, "grad_norm": 8.543121337890625, "learning_rate": 1.5324675324675327e-06, "loss": 0.31234121322631836, "memory(GiB)": 17.76, "step": 236, "token_acc": 0.87890625, "train_speed(iter/s)": 1.079217 }, { "epoch": 0.007699054673033817, "grad_norm": 3.3250954151153564, "learning_rate": 1.538961038961039e-06, "loss": 0.2919905185699463, "memory(GiB)": 17.76, "step": 237, "token_acc": 0.8812785388127854, "train_speed(iter/s)": 1.079707 }, { "epoch": 0.007731540135789234, "grad_norm": 2.0951950550079346, "learning_rate": 1.5454545454545454e-06, "loss": 0.2860032320022583, "memory(GiB)": 17.76, "step": 238, "token_acc": 0.8899082568807339, "train_speed(iter/s)": 1.080127 }, { "epoch": 0.007764025598544651, "grad_norm": 2.867831230163574, "learning_rate": 1.551948051948052e-06, "loss": 0.29967188835144043, "memory(GiB)": 17.76, "step": 239, "token_acc": 0.8782287822878229, "train_speed(iter/s)": 1.080591 }, { "epoch": 0.007796511061300068, "grad_norm": 2.2042198181152344, "learning_rate": 1.5584415584415584e-06, "loss": 0.28688374161720276, "memory(GiB)": 17.76, "step": 240, "token_acc": 0.8974358974358975, "train_speed(iter/s)": 1.08085 }, { "epoch": 0.007828996524055486, "grad_norm": 2.692352056503296, "learning_rate": 1.564935064935065e-06, "loss": 0.27546656131744385, "memory(GiB)": 17.76, "step": 241, "token_acc": 0.9170731707317074, "train_speed(iter/s)": 1.081224 }, { "epoch": 0.007861481986810902, "grad_norm": 3.208808422088623, "learning_rate": 1.5714285714285714e-06, "loss": 0.282926470041275, "memory(GiB)": 17.76, "step": 242, "token_acc": 0.9308510638297872, "train_speed(iter/s)": 1.081728 }, { "epoch": 0.00789396744956632, "grad_norm": 2.1800501346588135, "learning_rate": 1.577922077922078e-06, "loss": 0.28255754709243774, "memory(GiB)": 17.76, "step": 243, "token_acc": 0.8754578754578755, "train_speed(iter/s)": 1.082163 }, { "epoch": 0.007926452912321736, "grad_norm": 2.9791882038116455, "learning_rate": 1.5844155844155845e-06, "loss": 0.2752877473831177, "memory(GiB)": 17.76, "step": 244, "token_acc": 0.9071729957805907, "train_speed(iter/s)": 1.082499 }, { "epoch": 0.007958938375077154, "grad_norm": 2.2999939918518066, "learning_rate": 1.590909090909091e-06, "loss": 0.2713886499404907, "memory(GiB)": 17.76, "step": 245, "token_acc": 0.9113924050632911, "train_speed(iter/s)": 1.082958 }, { "epoch": 0.00799142383783257, "grad_norm": 2.5618646144866943, "learning_rate": 1.5974025974025975e-06, "loss": 0.262769877910614, "memory(GiB)": 17.76, "step": 246, "token_acc": 0.891566265060241, "train_speed(iter/s)": 1.08338 }, { "epoch": 0.008023909300587988, "grad_norm": 2.096426248550415, "learning_rate": 1.603896103896104e-06, "loss": 0.2860155403614044, "memory(GiB)": 17.76, "step": 247, "token_acc": 0.8598130841121495, "train_speed(iter/s)": 1.083814 }, { "epoch": 0.008056394763343404, "grad_norm": 2.4622292518615723, "learning_rate": 1.6103896103896105e-06, "loss": 0.27429401874542236, "memory(GiB)": 17.76, "step": 248, "token_acc": 0.908675799086758, "train_speed(iter/s)": 1.084273 }, { "epoch": 0.008088880226098822, "grad_norm": 8.62856388092041, "learning_rate": 1.616883116883117e-06, "loss": 0.24758927524089813, "memory(GiB)": 17.76, "step": 249, "token_acc": 0.9398907103825137, "train_speed(iter/s)": 1.084722 }, { "epoch": 0.008121365688854238, "grad_norm": 3.617845058441162, "learning_rate": 1.6233766233766235e-06, "loss": 0.2666161060333252, "memory(GiB)": 17.76, "step": 250, "token_acc": 0.8943396226415095, "train_speed(iter/s)": 1.085136 }, { "epoch": 0.008153851151609656, "grad_norm": 3.0547826290130615, "learning_rate": 1.62987012987013e-06, "loss": 0.26345735788345337, "memory(GiB)": 17.76, "step": 251, "token_acc": 0.9024390243902439, "train_speed(iter/s)": 1.085567 }, { "epoch": 0.008186336614365072, "grad_norm": 4.414337158203125, "learning_rate": 1.6363636363636365e-06, "loss": 0.27367103099823, "memory(GiB)": 17.76, "step": 252, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 1.0859 }, { "epoch": 0.00821882207712049, "grad_norm": 2.1922852993011475, "learning_rate": 1.642857142857143e-06, "loss": 0.25431758165359497, "memory(GiB)": 17.76, "step": 253, "token_acc": 0.9174757281553398, "train_speed(iter/s)": 1.086117 }, { "epoch": 0.008251307539875906, "grad_norm": 3.104590654373169, "learning_rate": 1.6493506493506495e-06, "loss": 0.25748661160469055, "memory(GiB)": 17.76, "step": 254, "token_acc": 0.9013452914798207, "train_speed(iter/s)": 1.086293 }, { "epoch": 0.008283793002631323, "grad_norm": 8.906190872192383, "learning_rate": 1.655844155844156e-06, "loss": 0.2493753284215927, "memory(GiB)": 17.76, "step": 255, "token_acc": 0.8663793103448276, "train_speed(iter/s)": 1.086495 }, { "epoch": 0.00831627846538674, "grad_norm": 2.4844446182250977, "learning_rate": 1.6623376623376625e-06, "loss": 0.2675834000110626, "memory(GiB)": 17.76, "step": 256, "token_acc": 0.8932038834951457, "train_speed(iter/s)": 1.086898 }, { "epoch": 0.008348763928142157, "grad_norm": 2.7126340866088867, "learning_rate": 1.6688311688311689e-06, "loss": 0.2613212466239929, "memory(GiB)": 17.76, "step": 257, "token_acc": 0.9140271493212669, "train_speed(iter/s)": 1.087286 }, { "epoch": 0.008381249390897573, "grad_norm": 2.171654224395752, "learning_rate": 1.6753246753246755e-06, "loss": 0.27273231744766235, "memory(GiB)": 17.76, "step": 258, "token_acc": 0.9082969432314411, "train_speed(iter/s)": 1.087626 }, { "epoch": 0.008413734853652991, "grad_norm": 2.1351895332336426, "learning_rate": 1.6818181818181819e-06, "loss": 0.2533820867538452, "memory(GiB)": 17.76, "step": 259, "token_acc": 0.907258064516129, "train_speed(iter/s)": 1.088031 }, { "epoch": 0.008446220316408407, "grad_norm": 2.6301188468933105, "learning_rate": 1.6883116883116885e-06, "loss": 0.2599472105503082, "memory(GiB)": 17.76, "step": 260, "token_acc": 0.8826086956521739, "train_speed(iter/s)": 1.088437 }, { "epoch": 0.008478705779163825, "grad_norm": 2.322700023651123, "learning_rate": 1.6948051948051949e-06, "loss": 0.26497215032577515, "memory(GiB)": 17.76, "step": 261, "token_acc": 0.9102564102564102, "train_speed(iter/s)": 1.088819 }, { "epoch": 0.008511191241919241, "grad_norm": 2.027330160140991, "learning_rate": 1.7012987012987015e-06, "loss": 0.2234863042831421, "memory(GiB)": 17.76, "step": 262, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 1.089189 }, { "epoch": 0.008543676704674659, "grad_norm": 1.805286169052124, "learning_rate": 1.7077922077922079e-06, "loss": 0.25050851702690125, "memory(GiB)": 17.76, "step": 263, "token_acc": 0.9201680672268907, "train_speed(iter/s)": 1.089584 }, { "epoch": 0.008576162167430075, "grad_norm": 4.464593410491943, "learning_rate": 1.7142857142857145e-06, "loss": 0.23526903986930847, "memory(GiB)": 17.76, "step": 264, "token_acc": 0.9036697247706422, "train_speed(iter/s)": 1.089909 }, { "epoch": 0.008608647630185493, "grad_norm": 2.1269617080688477, "learning_rate": 1.7207792207792209e-06, "loss": 0.2532927989959717, "memory(GiB)": 17.76, "step": 265, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.090351 }, { "epoch": 0.008641133092940909, "grad_norm": 2.140357732772827, "learning_rate": 1.7272727272727275e-06, "loss": 0.23924073576927185, "memory(GiB)": 17.76, "step": 266, "token_acc": 0.90234375, "train_speed(iter/s)": 1.090728 }, { "epoch": 0.008673618555696327, "grad_norm": 1.670506238937378, "learning_rate": 1.7337662337662339e-06, "loss": 0.23304694890975952, "memory(GiB)": 17.76, "step": 267, "token_acc": 0.9053030303030303, "train_speed(iter/s)": 1.091087 }, { "epoch": 0.008706104018451743, "grad_norm": 2.133282423019409, "learning_rate": 1.7402597402597405e-06, "loss": 0.24890165030956268, "memory(GiB)": 17.76, "step": 268, "token_acc": 0.9009009009009009, "train_speed(iter/s)": 1.091477 }, { "epoch": 0.00873858948120716, "grad_norm": 2.0949745178222656, "learning_rate": 1.7467532467532469e-06, "loss": 0.24569569528102875, "memory(GiB)": 17.76, "step": 269, "token_acc": 0.9149797570850202, "train_speed(iter/s)": 1.091842 }, { "epoch": 0.008771074943962577, "grad_norm": 2.188336133956909, "learning_rate": 1.7532467532467535e-06, "loss": 0.2649422287940979, "memory(GiB)": 17.76, "step": 270, "token_acc": 0.9045226130653267, "train_speed(iter/s)": 1.092185 }, { "epoch": 0.008803560406717995, "grad_norm": 3.332728862762451, "learning_rate": 1.7597402597402599e-06, "loss": 0.2448117583990097, "memory(GiB)": 17.76, "step": 271, "token_acc": 0.9186602870813397, "train_speed(iter/s)": 1.092593 }, { "epoch": 0.00883604586947341, "grad_norm": 2.8581035137176514, "learning_rate": 1.7662337662337665e-06, "loss": 0.2482290267944336, "memory(GiB)": 17.76, "step": 272, "token_acc": 0.9208333333333333, "train_speed(iter/s)": 1.092896 }, { "epoch": 0.008868531332228828, "grad_norm": 2.1674742698669434, "learning_rate": 1.7727272727272729e-06, "loss": 0.23742541670799255, "memory(GiB)": 17.76, "step": 273, "token_acc": 0.9096989966555183, "train_speed(iter/s)": 1.093197 }, { "epoch": 0.008901016794984245, "grad_norm": 3.3183844089508057, "learning_rate": 1.7792207792207795e-06, "loss": 0.24549990892410278, "memory(GiB)": 17.76, "step": 274, "token_acc": 0.94, "train_speed(iter/s)": 1.093492 }, { "epoch": 0.008933502257739662, "grad_norm": 4.827714920043945, "learning_rate": 1.7857142857142859e-06, "loss": 0.22188282012939453, "memory(GiB)": 17.76, "step": 275, "token_acc": 0.9083969465648855, "train_speed(iter/s)": 1.09384 }, { "epoch": 0.008965987720495078, "grad_norm": 1.4940968751907349, "learning_rate": 1.7922077922077925e-06, "loss": 0.23090456426143646, "memory(GiB)": 17.76, "step": 276, "token_acc": 0.9128440366972477, "train_speed(iter/s)": 1.094153 }, { "epoch": 0.008998473183250496, "grad_norm": 3.434025526046753, "learning_rate": 1.7987012987012989e-06, "loss": 0.22425629198551178, "memory(GiB)": 17.76, "step": 277, "token_acc": 0.9163346613545816, "train_speed(iter/s)": 1.09458 }, { "epoch": 0.009030958646005912, "grad_norm": 2.2285726070404053, "learning_rate": 1.8051948051948053e-06, "loss": 0.2251444160938263, "memory(GiB)": 17.76, "step": 278, "token_acc": 0.8832684824902723, "train_speed(iter/s)": 1.094995 }, { "epoch": 0.00906344410876133, "grad_norm": 8.086201667785645, "learning_rate": 1.811688311688312e-06, "loss": 0.23137789964675903, "memory(GiB)": 17.76, "step": 279, "token_acc": 0.9137254901960784, "train_speed(iter/s)": 1.095338 }, { "epoch": 0.009095929571516746, "grad_norm": 1.8410725593566895, "learning_rate": 1.8181818181818183e-06, "loss": 0.23009173572063446, "memory(GiB)": 17.76, "step": 280, "token_acc": 0.9142857142857143, "train_speed(iter/s)": 1.095608 }, { "epoch": 0.009128415034272164, "grad_norm": 1.953693151473999, "learning_rate": 1.824675324675325e-06, "loss": 0.23149731755256653, "memory(GiB)": 17.76, "step": 281, "token_acc": 0.9196787148594378, "train_speed(iter/s)": 1.095947 }, { "epoch": 0.00916090049702758, "grad_norm": 2.6407203674316406, "learning_rate": 1.8311688311688313e-06, "loss": 0.22839923202991486, "memory(GiB)": 17.76, "step": 282, "token_acc": 0.9049773755656109, "train_speed(iter/s)": 1.096263 }, { "epoch": 0.009193385959782998, "grad_norm": 1.9158045053482056, "learning_rate": 1.837662337662338e-06, "loss": 0.23544493317604065, "memory(GiB)": 17.76, "step": 283, "token_acc": 0.9130434782608695, "train_speed(iter/s)": 1.096621 }, { "epoch": 0.009225871422538414, "grad_norm": 2.0960917472839355, "learning_rate": 1.8441558441558443e-06, "loss": 0.22589623928070068, "memory(GiB)": 17.76, "step": 284, "token_acc": 0.905940594059406, "train_speed(iter/s)": 1.096987 }, { "epoch": 0.009258356885293832, "grad_norm": 1.794584035873413, "learning_rate": 1.850649350649351e-06, "loss": 0.22444988787174225, "memory(GiB)": 17.76, "step": 285, "token_acc": 0.9222614840989399, "train_speed(iter/s)": 1.097293 }, { "epoch": 0.009290842348049248, "grad_norm": 2.315452814102173, "learning_rate": 1.8571428571428573e-06, "loss": 0.2368224561214447, "memory(GiB)": 17.76, "step": 286, "token_acc": 0.9193548387096774, "train_speed(iter/s)": 1.097625 }, { "epoch": 0.009323327810804666, "grad_norm": 1.6850122213363647, "learning_rate": 1.863636363636364e-06, "loss": 0.23392444849014282, "memory(GiB)": 17.76, "step": 287, "token_acc": 0.9173228346456693, "train_speed(iter/s)": 1.097971 }, { "epoch": 0.009355813273560082, "grad_norm": 2.6962060928344727, "learning_rate": 1.8701298701298703e-06, "loss": 0.22139303386211395, "memory(GiB)": 17.76, "step": 288, "token_acc": 0.9276595744680851, "train_speed(iter/s)": 1.098239 }, { "epoch": 0.0093882987363155, "grad_norm": 1.8676990270614624, "learning_rate": 1.876623376623377e-06, "loss": 0.22106003761291504, "memory(GiB)": 17.76, "step": 289, "token_acc": 0.908675799086758, "train_speed(iter/s)": 1.098568 }, { "epoch": 0.009420784199070916, "grad_norm": 1.9524506330490112, "learning_rate": 1.8831168831168833e-06, "loss": 0.20950046181678772, "memory(GiB)": 17.76, "step": 290, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 1.098934 }, { "epoch": 0.009453269661826334, "grad_norm": 4.106838226318359, "learning_rate": 1.88961038961039e-06, "loss": 0.20626536011695862, "memory(GiB)": 17.76, "step": 291, "token_acc": 0.9311926605504587, "train_speed(iter/s)": 1.09919 }, { "epoch": 0.00948575512458175, "grad_norm": 2.6110012531280518, "learning_rate": 1.8961038961038963e-06, "loss": 0.22181126475334167, "memory(GiB)": 17.76, "step": 292, "token_acc": 0.9140271493212669, "train_speed(iter/s)": 1.099508 }, { "epoch": 0.009518240587337167, "grad_norm": 2.10369610786438, "learning_rate": 1.902597402597403e-06, "loss": 0.2163085639476776, "memory(GiB)": 17.76, "step": 293, "token_acc": 0.91796875, "train_speed(iter/s)": 1.099804 }, { "epoch": 0.009550726050092584, "grad_norm": 1.7516082525253296, "learning_rate": 1.9090909090909095e-06, "loss": 0.220220148563385, "memory(GiB)": 17.76, "step": 294, "token_acc": 0.9177489177489178, "train_speed(iter/s)": 1.099898 }, { "epoch": 0.009583211512848001, "grad_norm": 1.8215724229812622, "learning_rate": 1.9155844155844157e-06, "loss": 0.23060446977615356, "memory(GiB)": 17.76, "step": 295, "token_acc": 0.9045643153526971, "train_speed(iter/s)": 1.100047 }, { "epoch": 0.009615696975603417, "grad_norm": 2.185941457748413, "learning_rate": 1.9220779220779223e-06, "loss": 0.2228807508945465, "memory(GiB)": 17.76, "step": 296, "token_acc": 0.9291666666666667, "train_speed(iter/s)": 1.100211 }, { "epoch": 0.009648182438358835, "grad_norm": 1.9747037887573242, "learning_rate": 1.928571428571429e-06, "loss": 0.21959185600280762, "memory(GiB)": 17.76, "step": 297, "token_acc": 0.9196787148594378, "train_speed(iter/s)": 1.100455 }, { "epoch": 0.009680667901114251, "grad_norm": 2.0497610569000244, "learning_rate": 1.935064935064935e-06, "loss": 0.22557589411735535, "memory(GiB)": 17.76, "step": 298, "token_acc": 0.8983050847457628, "train_speed(iter/s)": 1.100713 }, { "epoch": 0.00971315336386967, "grad_norm": 1.866763710975647, "learning_rate": 1.9415584415584417e-06, "loss": 0.2020251303911209, "memory(GiB)": 17.76, "step": 299, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 1.100931 }, { "epoch": 0.009745638826625085, "grad_norm": 2.9735846519470215, "learning_rate": 1.9480519480519483e-06, "loss": 0.21826258301734924, "memory(GiB)": 17.76, "step": 300, "token_acc": 0.8838951310861424, "train_speed(iter/s)": 1.101114 }, { "epoch": 0.009778124289380503, "grad_norm": 1.9028233289718628, "learning_rate": 1.954545454545455e-06, "loss": 0.206323504447937, "memory(GiB)": 17.76, "step": 301, "token_acc": 0.9186991869918699, "train_speed(iter/s)": 1.101402 }, { "epoch": 0.00981060975213592, "grad_norm": 1.5965393781661987, "learning_rate": 1.961038961038961e-06, "loss": 0.20087477564811707, "memory(GiB)": 17.76, "step": 302, "token_acc": 0.9247787610619469, "train_speed(iter/s)": 1.101677 }, { "epoch": 0.009843095214891337, "grad_norm": 2.0676796436309814, "learning_rate": 1.9675324675324677e-06, "loss": 0.21018972992897034, "memory(GiB)": 17.76, "step": 303, "token_acc": 0.9467213114754098, "train_speed(iter/s)": 1.101906 }, { "epoch": 0.009875580677646753, "grad_norm": 1.9720770120620728, "learning_rate": 1.9740259740259743e-06, "loss": 0.20931628346443176, "memory(GiB)": 17.76, "step": 304, "token_acc": 0.9299065420560748, "train_speed(iter/s)": 1.102205 }, { "epoch": 0.009908066140402171, "grad_norm": 2.3084850311279297, "learning_rate": 1.980519480519481e-06, "loss": 0.22901032865047455, "memory(GiB)": 17.76, "step": 305, "token_acc": 0.9390243902439024, "train_speed(iter/s)": 1.102405 }, { "epoch": 0.009940551603157587, "grad_norm": 3.3454949855804443, "learning_rate": 1.987012987012987e-06, "loss": 0.21359461545944214, "memory(GiB)": 17.76, "step": 306, "token_acc": 0.8995983935742972, "train_speed(iter/s)": 1.10264 }, { "epoch": 0.009973037065913005, "grad_norm": 2.1773288249969482, "learning_rate": 1.9935064935064937e-06, "loss": 0.20060959458351135, "memory(GiB)": 17.76, "step": 307, "token_acc": 0.9308755760368663, "train_speed(iter/s)": 1.102877 }, { "epoch": 0.010005522528668421, "grad_norm": 2.131591320037842, "learning_rate": 2.0000000000000003e-06, "loss": 0.22035664319992065, "memory(GiB)": 17.76, "step": 308, "token_acc": 0.9067164179104478, "train_speed(iter/s)": 1.103155 }, { "epoch": 0.010038007991423839, "grad_norm": 2.2218739986419678, "learning_rate": 2.006493506493507e-06, "loss": 0.19844108819961548, "memory(GiB)": 17.76, "step": 309, "token_acc": 0.9154929577464789, "train_speed(iter/s)": 1.103376 }, { "epoch": 0.010070493454179255, "grad_norm": 1.5576611757278442, "learning_rate": 2.012987012987013e-06, "loss": 0.2153954803943634, "memory(GiB)": 17.76, "step": 310, "token_acc": 0.8927335640138409, "train_speed(iter/s)": 1.103562 }, { "epoch": 0.010102978916934673, "grad_norm": 2.0083847045898438, "learning_rate": 2.0194805194805197e-06, "loss": 0.204144686460495, "memory(GiB)": 17.76, "step": 311, "token_acc": 0.9181818181818182, "train_speed(iter/s)": 1.103806 }, { "epoch": 0.010135464379690089, "grad_norm": 1.7373933792114258, "learning_rate": 2.0259740259740263e-06, "loss": 0.2154865860939026, "memory(GiB)": 17.76, "step": 312, "token_acc": 0.9262295081967213, "train_speed(iter/s)": 1.104026 }, { "epoch": 0.010167949842445506, "grad_norm": 1.631613850593567, "learning_rate": 2.032467532467533e-06, "loss": 0.19066515564918518, "memory(GiB)": 17.76, "step": 313, "token_acc": 0.9083665338645418, "train_speed(iter/s)": 1.10433 }, { "epoch": 0.010200435305200923, "grad_norm": 3.1717798709869385, "learning_rate": 2.038961038961039e-06, "loss": 0.19599412381649017, "memory(GiB)": 17.76, "step": 314, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 1.104589 }, { "epoch": 0.01023292076795634, "grad_norm": 2.2208268642425537, "learning_rate": 2.0454545454545457e-06, "loss": 0.20127421617507935, "memory(GiB)": 17.76, "step": 315, "token_acc": 0.9173228346456693, "train_speed(iter/s)": 1.104789 }, { "epoch": 0.010265406230711756, "grad_norm": 5.500349521636963, "learning_rate": 2.0519480519480523e-06, "loss": 0.1916043609380722, "memory(GiB)": 17.76, "step": 316, "token_acc": 0.9253112033195021, "train_speed(iter/s)": 1.105054 }, { "epoch": 0.010297891693467174, "grad_norm": 7.8817596435546875, "learning_rate": 2.058441558441559e-06, "loss": 0.20441707968711853, "memory(GiB)": 17.76, "step": 317, "token_acc": 0.9156118143459916, "train_speed(iter/s)": 1.105267 }, { "epoch": 0.01033037715622259, "grad_norm": 1.9358091354370117, "learning_rate": 2.064935064935065e-06, "loss": 0.20554855465888977, "memory(GiB)": 17.76, "step": 318, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 1.105478 }, { "epoch": 0.010362862618978008, "grad_norm": 2.762500762939453, "learning_rate": 2.0714285714285717e-06, "loss": 0.202653706073761, "memory(GiB)": 17.76, "step": 319, "token_acc": 0.9237668161434978, "train_speed(iter/s)": 1.105756 }, { "epoch": 0.010395348081733424, "grad_norm": 3.6477067470550537, "learning_rate": 2.0779220779220784e-06, "loss": 0.19609811902046204, "memory(GiB)": 17.76, "step": 320, "token_acc": 0.9081272084805654, "train_speed(iter/s)": 1.106084 }, { "epoch": 0.010427833544488842, "grad_norm": 1.9916952848434448, "learning_rate": 2.0844155844155845e-06, "loss": 0.20645099878311157, "memory(GiB)": 17.76, "step": 321, "token_acc": 0.9175824175824175, "train_speed(iter/s)": 1.106317 }, { "epoch": 0.010460319007244258, "grad_norm": 2.521425247192383, "learning_rate": 2.090909090909091e-06, "loss": 0.1920478194952011, "memory(GiB)": 17.76, "step": 322, "token_acc": 0.9391304347826087, "train_speed(iter/s)": 1.106659 }, { "epoch": 0.010492804469999676, "grad_norm": 2.470276117324829, "learning_rate": 2.0974025974025973e-06, "loss": 0.20967544615268707, "memory(GiB)": 17.76, "step": 323, "token_acc": 0.9288389513108615, "train_speed(iter/s)": 1.106873 }, { "epoch": 0.010525289932755092, "grad_norm": 1.8756656646728516, "learning_rate": 2.103896103896104e-06, "loss": 0.19045154750347137, "memory(GiB)": 17.76, "step": 324, "token_acc": 0.9130434782608695, "train_speed(iter/s)": 1.107094 }, { "epoch": 0.01055777539551051, "grad_norm": 2.2585535049438477, "learning_rate": 2.1103896103896105e-06, "loss": 0.19577190279960632, "memory(GiB)": 17.76, "step": 325, "token_acc": 0.9377593360995851, "train_speed(iter/s)": 1.107397 }, { "epoch": 0.010590260858265926, "grad_norm": 2.401195526123047, "learning_rate": 2.1168831168831167e-06, "loss": 0.21032115817070007, "memory(GiB)": 17.76, "step": 326, "token_acc": 0.9060150375939849, "train_speed(iter/s)": 1.107686 }, { "epoch": 0.010622746321021344, "grad_norm": 3.5579991340637207, "learning_rate": 2.1233766233766233e-06, "loss": 0.202593594789505, "memory(GiB)": 17.76, "step": 327, "token_acc": 0.919831223628692, "train_speed(iter/s)": 1.107914 }, { "epoch": 0.01065523178377676, "grad_norm": 7.9599528312683105, "learning_rate": 2.12987012987013e-06, "loss": 0.19729387760162354, "memory(GiB)": 17.76, "step": 328, "token_acc": 0.898876404494382, "train_speed(iter/s)": 1.108243 }, { "epoch": 0.010687717246532178, "grad_norm": 2.3752377033233643, "learning_rate": 2.1363636363636365e-06, "loss": 0.2170649766921997, "memory(GiB)": 17.76, "step": 329, "token_acc": 0.9317269076305221, "train_speed(iter/s)": 1.108451 }, { "epoch": 0.010720202709287594, "grad_norm": 4.508569240570068, "learning_rate": 2.1428571428571427e-06, "loss": 0.19888150691986084, "memory(GiB)": 17.76, "step": 330, "token_acc": 0.9256198347107438, "train_speed(iter/s)": 1.108676 }, { "epoch": 0.010752688172043012, "grad_norm": 2.6767196655273438, "learning_rate": 2.1493506493506493e-06, "loss": 0.18314272165298462, "memory(GiB)": 17.76, "step": 331, "token_acc": 0.9409282700421941, "train_speed(iter/s)": 1.1088 }, { "epoch": 0.010785173634798428, "grad_norm": 1.6017730236053467, "learning_rate": 2.155844155844156e-06, "loss": 0.17872416973114014, "memory(GiB)": 17.76, "step": 332, "token_acc": 0.9437229437229437, "train_speed(iter/s)": 1.108915 }, { "epoch": 0.010817659097553846, "grad_norm": 2.3051412105560303, "learning_rate": 2.1623376623376626e-06, "loss": 0.21721959114074707, "memory(GiB)": 17.76, "step": 333, "token_acc": 0.9306930693069307, "train_speed(iter/s)": 1.109097 }, { "epoch": 0.010850144560309262, "grad_norm": 1.8193105459213257, "learning_rate": 2.1688311688311687e-06, "loss": 0.19273121654987335, "memory(GiB)": 17.76, "step": 334, "token_acc": 0.912, "train_speed(iter/s)": 1.109188 }, { "epoch": 0.01088263002306468, "grad_norm": 2.923901319503784, "learning_rate": 2.1753246753246753e-06, "loss": 0.19622543454170227, "memory(GiB)": 17.76, "step": 335, "token_acc": 0.9461883408071748, "train_speed(iter/s)": 1.109158 }, { "epoch": 0.010915115485820095, "grad_norm": 2.267158031463623, "learning_rate": 2.181818181818182e-06, "loss": 0.2136404812335968, "memory(GiB)": 17.76, "step": 336, "token_acc": 0.8907563025210085, "train_speed(iter/s)": 1.109193 }, { "epoch": 0.010947600948575513, "grad_norm": 2.0766279697418213, "learning_rate": 2.1883116883116886e-06, "loss": 0.20962931215763092, "memory(GiB)": 17.76, "step": 337, "token_acc": 0.9306930693069307, "train_speed(iter/s)": 1.109262 }, { "epoch": 0.01098008641133093, "grad_norm": 2.7377703189849854, "learning_rate": 2.1948051948051947e-06, "loss": 0.18995757400989532, "memory(GiB)": 17.76, "step": 338, "token_acc": 0.9291666666666667, "train_speed(iter/s)": 1.109438 }, { "epoch": 0.011012571874086347, "grad_norm": 5.809311389923096, "learning_rate": 2.2012987012987013e-06, "loss": 0.21837925910949707, "memory(GiB)": 17.76, "step": 339, "token_acc": 0.9053030303030303, "train_speed(iter/s)": 1.109675 }, { "epoch": 0.011045057336841763, "grad_norm": 1.501251459121704, "learning_rate": 2.207792207792208e-06, "loss": 0.18480496108531952, "memory(GiB)": 17.76, "step": 340, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 1.109902 }, { "epoch": 0.011077542799597181, "grad_norm": 2.3485605716705322, "learning_rate": 2.2142857142857146e-06, "loss": 0.18811644613742828, "memory(GiB)": 17.76, "step": 341, "token_acc": 0.9051383399209486, "train_speed(iter/s)": 1.109913 }, { "epoch": 0.011110028262352597, "grad_norm": 1.9586107730865479, "learning_rate": 2.2207792207792207e-06, "loss": 0.19884668290615082, "memory(GiB)": 17.76, "step": 342, "token_acc": 0.9351851851851852, "train_speed(iter/s)": 1.110128 }, { "epoch": 0.011142513725108015, "grad_norm": 1.944966435432434, "learning_rate": 2.2272727272727274e-06, "loss": 0.1950923204421997, "memory(GiB)": 17.76, "step": 343, "token_acc": 0.924812030075188, "train_speed(iter/s)": 1.110179 }, { "epoch": 0.011174999187863431, "grad_norm": 2.4233014583587646, "learning_rate": 2.233766233766234e-06, "loss": 0.1867096722126007, "memory(GiB)": 17.76, "step": 344, "token_acc": 0.923728813559322, "train_speed(iter/s)": 1.110341 }, { "epoch": 0.011207484650618849, "grad_norm": 2.300983190536499, "learning_rate": 2.24025974025974e-06, "loss": 0.18400920927524567, "memory(GiB)": 17.76, "step": 345, "token_acc": 0.9305019305019305, "train_speed(iter/s)": 1.110543 }, { "epoch": 0.011239970113374265, "grad_norm": 1.9931364059448242, "learning_rate": 2.2467532467532468e-06, "loss": 0.19007745385169983, "memory(GiB)": 17.76, "step": 346, "token_acc": 0.9322709163346613, "train_speed(iter/s)": 1.110586 }, { "epoch": 0.011272455576129683, "grad_norm": 2.072690725326538, "learning_rate": 2.2532467532467534e-06, "loss": 0.18308201432228088, "memory(GiB)": 17.76, "step": 347, "token_acc": 0.9240506329113924, "train_speed(iter/s)": 1.110755 }, { "epoch": 0.011304941038885099, "grad_norm": 2.0367729663848877, "learning_rate": 2.25974025974026e-06, "loss": 0.17796942591667175, "memory(GiB)": 17.76, "step": 348, "token_acc": 0.9325153374233128, "train_speed(iter/s)": 1.110837 }, { "epoch": 0.011337426501640517, "grad_norm": 1.5884819030761719, "learning_rate": 2.266233766233766e-06, "loss": 0.17889735102653503, "memory(GiB)": 17.76, "step": 349, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 1.110864 }, { "epoch": 0.011369911964395933, "grad_norm": 2.243424654006958, "learning_rate": 2.2727272727272728e-06, "loss": 0.17344287037849426, "memory(GiB)": 17.76, "step": 350, "token_acc": 0.933579335793358, "train_speed(iter/s)": 1.111018 }, { "epoch": 0.01140239742715135, "grad_norm": 2.08217716217041, "learning_rate": 2.2792207792207794e-06, "loss": 0.1887897551059723, "memory(GiB)": 17.76, "step": 351, "token_acc": 0.9237668161434978, "train_speed(iter/s)": 1.111203 }, { "epoch": 0.011434882889906767, "grad_norm": 2.149714231491089, "learning_rate": 2.285714285714286e-06, "loss": 0.1672963947057724, "memory(GiB)": 17.76, "step": 352, "token_acc": 0.9539748953974896, "train_speed(iter/s)": 1.111222 }, { "epoch": 0.011467368352662185, "grad_norm": 2.0367867946624756, "learning_rate": 2.292207792207792e-06, "loss": 0.19988979399204254, "memory(GiB)": 17.76, "step": 353, "token_acc": 0.9012875536480687, "train_speed(iter/s)": 1.111209 }, { "epoch": 0.0114998538154176, "grad_norm": 2.554029941558838, "learning_rate": 2.2987012987012988e-06, "loss": 0.1909279078245163, "memory(GiB)": 17.76, "step": 354, "token_acc": 0.9148148148148149, "train_speed(iter/s)": 1.111218 }, { "epoch": 0.011532339278173018, "grad_norm": 2.371983289718628, "learning_rate": 2.3051948051948054e-06, "loss": 0.19508472084999084, "memory(GiB)": 17.76, "step": 355, "token_acc": 0.9357798165137615, "train_speed(iter/s)": 1.111426 }, { "epoch": 0.011564824740928435, "grad_norm": 2.845789670944214, "learning_rate": 2.311688311688312e-06, "loss": 0.18996933102607727, "memory(GiB)": 17.76, "step": 356, "token_acc": 0.9392523364485982, "train_speed(iter/s)": 1.111613 }, { "epoch": 0.011597310203683852, "grad_norm": 1.865924596786499, "learning_rate": 2.318181818181818e-06, "loss": 0.17704692482948303, "memory(GiB)": 17.76, "step": 357, "token_acc": 0.9083969465648855, "train_speed(iter/s)": 1.111686 }, { "epoch": 0.011629795666439268, "grad_norm": 2.4898743629455566, "learning_rate": 2.3246753246753248e-06, "loss": 0.19802573323249817, "memory(GiB)": 17.76, "step": 358, "token_acc": 0.9297520661157025, "train_speed(iter/s)": 1.111646 }, { "epoch": 0.011662281129194686, "grad_norm": 6.111899375915527, "learning_rate": 2.3311688311688314e-06, "loss": 0.1765836477279663, "memory(GiB)": 17.76, "step": 359, "token_acc": 0.9471153846153846, "train_speed(iter/s)": 1.111696 }, { "epoch": 0.011694766591950102, "grad_norm": 2.609333038330078, "learning_rate": 2.337662337662338e-06, "loss": 0.19861634075641632, "memory(GiB)": 17.76, "step": 360, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 1.111916 }, { "epoch": 0.01172725205470552, "grad_norm": 3.4269943237304688, "learning_rate": 2.344155844155844e-06, "loss": 0.21138890087604523, "memory(GiB)": 17.76, "step": 361, "token_acc": 0.9027777777777778, "train_speed(iter/s)": 1.112141 }, { "epoch": 0.011759737517460936, "grad_norm": 3.266749382019043, "learning_rate": 2.3506493506493508e-06, "loss": 0.20731352269649506, "memory(GiB)": 17.76, "step": 362, "token_acc": 0.9282511210762332, "train_speed(iter/s)": 1.112377 }, { "epoch": 0.011792222980216354, "grad_norm": 2.254676103591919, "learning_rate": 2.3571428571428574e-06, "loss": 0.17803853750228882, "memory(GiB)": 17.76, "step": 363, "token_acc": 0.9372549019607843, "train_speed(iter/s)": 1.112481 }, { "epoch": 0.01182470844297177, "grad_norm": 2.0891668796539307, "learning_rate": 2.363636363636364e-06, "loss": 0.1915147751569748, "memory(GiB)": 17.76, "step": 364, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 1.112487 }, { "epoch": 0.011857193905727188, "grad_norm": 2.1177995204925537, "learning_rate": 2.37012987012987e-06, "loss": 0.1936085820198059, "memory(GiB)": 17.76, "step": 365, "token_acc": 0.8928571428571429, "train_speed(iter/s)": 1.112522 }, { "epoch": 0.011889679368482604, "grad_norm": 5.126674652099609, "learning_rate": 2.3766233766233768e-06, "loss": 0.19037166237831116, "memory(GiB)": 17.76, "step": 366, "token_acc": 0.9343629343629344, "train_speed(iter/s)": 1.112719 }, { "epoch": 0.011922164831238022, "grad_norm": 2.2597484588623047, "learning_rate": 2.3831168831168834e-06, "loss": 0.20108121633529663, "memory(GiB)": 17.76, "step": 367, "token_acc": 0.9227799227799228, "train_speed(iter/s)": 1.112908 }, { "epoch": 0.011954650293993438, "grad_norm": 1.63346529006958, "learning_rate": 2.3896103896103896e-06, "loss": 0.17416301369667053, "memory(GiB)": 17.76, "step": 368, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 1.113096 }, { "epoch": 0.011987135756748856, "grad_norm": 2.2177460193634033, "learning_rate": 2.396103896103896e-06, "loss": 0.2004874348640442, "memory(GiB)": 17.76, "step": 369, "token_acc": 0.9181034482758621, "train_speed(iter/s)": 1.113293 }, { "epoch": 0.012019621219504272, "grad_norm": 21.009794235229492, "learning_rate": 2.402597402597403e-06, "loss": 0.17345844209194183, "memory(GiB)": 17.76, "step": 370, "token_acc": 0.9125874125874126, "train_speed(iter/s)": 1.113455 }, { "epoch": 0.01205210668225969, "grad_norm": 2.1541008949279785, "learning_rate": 2.4090909090909094e-06, "loss": 0.21606089174747467, "memory(GiB)": 17.76, "step": 371, "token_acc": 0.9067796610169492, "train_speed(iter/s)": 1.113641 }, { "epoch": 0.012084592145015106, "grad_norm": 2.577587842941284, "learning_rate": 2.4155844155844156e-06, "loss": 0.178885817527771, "memory(GiB)": 17.76, "step": 372, "token_acc": 0.9511111111111111, "train_speed(iter/s)": 1.113862 }, { "epoch": 0.012117077607770524, "grad_norm": 3.7417213916778564, "learning_rate": 2.422077922077922e-06, "loss": 0.16127115488052368, "memory(GiB)": 17.76, "step": 373, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 1.114052 }, { "epoch": 0.01214956307052594, "grad_norm": 2.0095202922821045, "learning_rate": 2.428571428571429e-06, "loss": 0.20507395267486572, "memory(GiB)": 17.76, "step": 374, "token_acc": 0.9020618556701031, "train_speed(iter/s)": 1.114312 }, { "epoch": 0.012182048533281357, "grad_norm": 1.6667331457138062, "learning_rate": 2.4350649350649354e-06, "loss": 0.17894700169563293, "memory(GiB)": 17.76, "step": 375, "token_acc": 0.9133858267716536, "train_speed(iter/s)": 1.114524 }, { "epoch": 0.012214533996036774, "grad_norm": 2.2144851684570312, "learning_rate": 2.4415584415584416e-06, "loss": 0.17575718462467194, "memory(GiB)": 17.76, "step": 376, "token_acc": 0.9362549800796812, "train_speed(iter/s)": 1.114717 }, { "epoch": 0.012247019458792191, "grad_norm": 2.0116629600524902, "learning_rate": 2.448051948051948e-06, "loss": 0.18809126317501068, "memory(GiB)": 17.76, "step": 377, "token_acc": 0.9321266968325792, "train_speed(iter/s)": 1.114914 }, { "epoch": 0.012279504921547607, "grad_norm": 2.346343755722046, "learning_rate": 2.454545454545455e-06, "loss": 0.17766964435577393, "memory(GiB)": 17.76, "step": 378, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 1.115175 }, { "epoch": 0.012311990384303025, "grad_norm": 2.2499477863311768, "learning_rate": 2.4610389610389614e-06, "loss": 0.1762181520462036, "memory(GiB)": 17.76, "step": 379, "token_acc": 0.9120370370370371, "train_speed(iter/s)": 1.115416 }, { "epoch": 0.012344475847058441, "grad_norm": 1.9582891464233398, "learning_rate": 2.4675324675324676e-06, "loss": 0.17975346744060516, "memory(GiB)": 17.76, "step": 380, "token_acc": 0.9414414414414415, "train_speed(iter/s)": 1.115586 }, { "epoch": 0.01237696130981386, "grad_norm": 2.053661346435547, "learning_rate": 2.474025974025974e-06, "loss": 0.1670929193496704, "memory(GiB)": 17.76, "step": 381, "token_acc": 0.9234042553191489, "train_speed(iter/s)": 1.115747 }, { "epoch": 0.012409446772569275, "grad_norm": 2.879014730453491, "learning_rate": 2.480519480519481e-06, "loss": 0.1722167283296585, "memory(GiB)": 17.76, "step": 382, "token_acc": 0.9058823529411765, "train_speed(iter/s)": 1.115928 }, { "epoch": 0.012441932235324693, "grad_norm": 1.7110508680343628, "learning_rate": 2.4870129870129874e-06, "loss": 0.18559707701206207, "memory(GiB)": 17.76, "step": 383, "token_acc": 0.91015625, "train_speed(iter/s)": 1.116083 }, { "epoch": 0.01247441769808011, "grad_norm": 2.241011142730713, "learning_rate": 2.4935064935064936e-06, "loss": 0.16613087058067322, "memory(GiB)": 17.76, "step": 384, "token_acc": 0.9447236180904522, "train_speed(iter/s)": 1.116332 }, { "epoch": 0.012506903160835527, "grad_norm": 2.6058642864227295, "learning_rate": 2.5e-06, "loss": 0.17598038911819458, "memory(GiB)": 17.76, "step": 385, "token_acc": 0.9025641025641026, "train_speed(iter/s)": 1.116532 }, { "epoch": 0.012539388623590943, "grad_norm": 1.8218919038772583, "learning_rate": 2.5064935064935064e-06, "loss": 0.169888436794281, "memory(GiB)": 17.76, "step": 386, "token_acc": 0.9456066945606695, "train_speed(iter/s)": 1.116678 }, { "epoch": 0.01257187408634636, "grad_norm": 2.332644462585449, "learning_rate": 2.512987012987013e-06, "loss": 0.16840192675590515, "memory(GiB)": 17.76, "step": 387, "token_acc": 0.9254901960784314, "train_speed(iter/s)": 1.116876 }, { "epoch": 0.012604359549101777, "grad_norm": 1.7293576002120972, "learning_rate": 2.5194805194805196e-06, "loss": 0.19909998774528503, "memory(GiB)": 17.76, "step": 388, "token_acc": 0.900398406374502, "train_speed(iter/s)": 1.117012 }, { "epoch": 0.012636845011857195, "grad_norm": 1.8615790605545044, "learning_rate": 2.525974025974026e-06, "loss": 0.16474707424640656, "memory(GiB)": 17.76, "step": 389, "token_acc": 0.9346153846153846, "train_speed(iter/s)": 1.117182 }, { "epoch": 0.01266933047461261, "grad_norm": 1.6560373306274414, "learning_rate": 2.5324675324675324e-06, "loss": 0.16696667671203613, "memory(GiB)": 17.76, "step": 390, "token_acc": 0.956140350877193, "train_speed(iter/s)": 1.117348 }, { "epoch": 0.012701815937368029, "grad_norm": 2.243812084197998, "learning_rate": 2.538961038961039e-06, "loss": 0.17734766006469727, "memory(GiB)": 17.76, "step": 391, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 1.117501 }, { "epoch": 0.012734301400123445, "grad_norm": 2.1841890811920166, "learning_rate": 2.5454545454545456e-06, "loss": 0.18736201524734497, "memory(GiB)": 17.76, "step": 392, "token_acc": 0.9291338582677166, "train_speed(iter/s)": 1.117669 }, { "epoch": 0.012766786862878863, "grad_norm": 1.9858248233795166, "learning_rate": 2.5519480519480522e-06, "loss": 0.1742454171180725, "memory(GiB)": 17.76, "step": 393, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 1.117819 }, { "epoch": 0.012799272325634279, "grad_norm": 1.901384949684143, "learning_rate": 2.5584415584415584e-06, "loss": 0.17415234446525574, "memory(GiB)": 17.76, "step": 394, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 1.117987 }, { "epoch": 0.012831757788389696, "grad_norm": 1.869033694267273, "learning_rate": 2.564935064935065e-06, "loss": 0.18014761805534363, "memory(GiB)": 17.76, "step": 395, "token_acc": 0.9338235294117647, "train_speed(iter/s)": 1.118197 }, { "epoch": 0.012864243251145113, "grad_norm": 1.8633644580841064, "learning_rate": 2.571428571428571e-06, "loss": 0.17139819264411926, "memory(GiB)": 17.76, "step": 396, "token_acc": 0.9408866995073891, "train_speed(iter/s)": 1.118411 }, { "epoch": 0.01289672871390053, "grad_norm": 3.3123676776885986, "learning_rate": 2.5779220779220782e-06, "loss": 0.1653464138507843, "memory(GiB)": 17.76, "step": 397, "token_acc": 0.9404255319148936, "train_speed(iter/s)": 1.118587 }, { "epoch": 0.012929214176655946, "grad_norm": 3.8975255489349365, "learning_rate": 2.5844155844155844e-06, "loss": 0.17008709907531738, "memory(GiB)": 17.76, "step": 398, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 1.118793 }, { "epoch": 0.012961699639411364, "grad_norm": 3.8915607929229736, "learning_rate": 2.590909090909091e-06, "loss": 0.18256573379039764, "memory(GiB)": 17.76, "step": 399, "token_acc": 0.917098445595855, "train_speed(iter/s)": 1.118821 }, { "epoch": 0.01299418510216678, "grad_norm": 1.2797859907150269, "learning_rate": 2.597402597402597e-06, "loss": 0.16365236043930054, "memory(GiB)": 17.76, "step": 400, "token_acc": 0.9221789883268483, "train_speed(iter/s)": 1.118803 }, { "epoch": 0.013026670564922198, "grad_norm": 6.399578094482422, "learning_rate": 2.6038961038961042e-06, "loss": 0.1839086413383484, "memory(GiB)": 17.76, "step": 401, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 1.118814 }, { "epoch": 0.013059156027677614, "grad_norm": 1.4817609786987305, "learning_rate": 2.6103896103896104e-06, "loss": 0.15291626751422882, "memory(GiB)": 17.76, "step": 402, "token_acc": 0.9398496240601504, "train_speed(iter/s)": 1.119016 }, { "epoch": 0.013091641490433032, "grad_norm": 5.428002834320068, "learning_rate": 2.616883116883117e-06, "loss": 0.16441798210144043, "memory(GiB)": 17.76, "step": 403, "token_acc": 0.9409448818897638, "train_speed(iter/s)": 1.119172 }, { "epoch": 0.013124126953188448, "grad_norm": 1.9200255870819092, "learning_rate": 2.623376623376623e-06, "loss": 0.17774249613285065, "memory(GiB)": 17.76, "step": 404, "token_acc": 0.925764192139738, "train_speed(iter/s)": 1.119366 }, { "epoch": 0.013156612415943866, "grad_norm": 1.9575886726379395, "learning_rate": 2.6298701298701302e-06, "loss": 0.16820669174194336, "memory(GiB)": 17.76, "step": 405, "token_acc": 0.9463414634146341, "train_speed(iter/s)": 1.119517 }, { "epoch": 0.013189097878699282, "grad_norm": 1.8838493824005127, "learning_rate": 2.6363636363636364e-06, "loss": 0.1761823296546936, "memory(GiB)": 17.76, "step": 406, "token_acc": 0.9255813953488372, "train_speed(iter/s)": 1.119692 }, { "epoch": 0.0132215833414547, "grad_norm": 6.961893081665039, "learning_rate": 2.642857142857143e-06, "loss": 0.17365548014640808, "memory(GiB)": 17.76, "step": 407, "token_acc": 0.9157509157509157, "train_speed(iter/s)": 1.119874 }, { "epoch": 0.013254068804210116, "grad_norm": 1.514471173286438, "learning_rate": 2.649350649350649e-06, "loss": 0.17034466564655304, "memory(GiB)": 17.76, "step": 408, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 1.120057 }, { "epoch": 0.013286554266965534, "grad_norm": 2.7849409580230713, "learning_rate": 2.6558441558441562e-06, "loss": 0.18027426302433014, "memory(GiB)": 17.76, "step": 409, "token_acc": 0.9513513513513514, "train_speed(iter/s)": 1.120223 }, { "epoch": 0.01331903972972095, "grad_norm": 2.4586639404296875, "learning_rate": 2.6623376623376624e-06, "loss": 0.15828381478786469, "memory(GiB)": 17.76, "step": 410, "token_acc": 0.948, "train_speed(iter/s)": 1.120396 }, { "epoch": 0.013351525192476368, "grad_norm": 1.6140201091766357, "learning_rate": 2.668831168831169e-06, "loss": 0.18246042728424072, "memory(GiB)": 17.76, "step": 411, "token_acc": 0.9365079365079365, "train_speed(iter/s)": 1.120572 }, { "epoch": 0.013384010655231784, "grad_norm": 1.7637321949005127, "learning_rate": 2.6753246753246752e-06, "loss": 0.1627160608768463, "memory(GiB)": 17.76, "step": 412, "token_acc": 0.9261992619926199, "train_speed(iter/s)": 1.120745 }, { "epoch": 0.013416496117987202, "grad_norm": 1.737134575843811, "learning_rate": 2.6818181818181822e-06, "loss": 0.16547086834907532, "memory(GiB)": 17.76, "step": 413, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 1.120931 }, { "epoch": 0.013448981580742618, "grad_norm": 1.9448559284210205, "learning_rate": 2.6883116883116884e-06, "loss": 0.15652650594711304, "memory(GiB)": 17.76, "step": 414, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 1.121073 }, { "epoch": 0.013481467043498035, "grad_norm": 1.5097970962524414, "learning_rate": 2.694805194805195e-06, "loss": 0.1685875654220581, "memory(GiB)": 17.76, "step": 415, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 1.121219 }, { "epoch": 0.013513952506253452, "grad_norm": 1.416047215461731, "learning_rate": 2.7012987012987012e-06, "loss": 0.16346323490142822, "memory(GiB)": 17.76, "step": 416, "token_acc": 0.9247787610619469, "train_speed(iter/s)": 1.121399 }, { "epoch": 0.01354643796900887, "grad_norm": 2.0218560695648193, "learning_rate": 2.7077922077922083e-06, "loss": 0.18605023622512817, "memory(GiB)": 17.76, "step": 417, "token_acc": 0.9306930693069307, "train_speed(iter/s)": 1.121554 }, { "epoch": 0.013578923431764285, "grad_norm": 1.818739414215088, "learning_rate": 2.7142857142857144e-06, "loss": 0.16512839496135712, "memory(GiB)": 17.76, "step": 418, "token_acc": 0.9303135888501742, "train_speed(iter/s)": 1.121737 }, { "epoch": 0.013611408894519703, "grad_norm": 1.5126614570617676, "learning_rate": 2.720779220779221e-06, "loss": 0.16223198175430298, "memory(GiB)": 17.76, "step": 419, "token_acc": 0.9504132231404959, "train_speed(iter/s)": 1.121911 }, { "epoch": 0.01364389435727512, "grad_norm": 1.6917961835861206, "learning_rate": 2.7272727272727272e-06, "loss": 0.16306816041469574, "memory(GiB)": 17.76, "step": 420, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 1.122076 }, { "epoch": 0.013676379820030537, "grad_norm": 2.17999529838562, "learning_rate": 2.7337662337662343e-06, "loss": 0.17241081595420837, "memory(GiB)": 17.76, "step": 421, "token_acc": 0.9243027888446215, "train_speed(iter/s)": 1.122212 }, { "epoch": 0.013708865282785953, "grad_norm": 1.6892659664154053, "learning_rate": 2.7402597402597404e-06, "loss": 0.16963890194892883, "memory(GiB)": 17.76, "step": 422, "token_acc": 0.9147982062780269, "train_speed(iter/s)": 1.122303 }, { "epoch": 0.013741350745541371, "grad_norm": 1.416189432144165, "learning_rate": 2.746753246753247e-06, "loss": 0.16863223910331726, "memory(GiB)": 17.76, "step": 423, "token_acc": 0.9126637554585153, "train_speed(iter/s)": 1.122461 }, { "epoch": 0.013773836208296787, "grad_norm": 1.7973390817642212, "learning_rate": 2.7532467532467532e-06, "loss": 0.16926229000091553, "memory(GiB)": 17.76, "step": 424, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 1.122557 }, { "epoch": 0.013806321671052205, "grad_norm": 1.56753408908844, "learning_rate": 2.7597402597402603e-06, "loss": 0.15436841547489166, "memory(GiB)": 17.76, "step": 425, "token_acc": 0.9414634146341463, "train_speed(iter/s)": 1.12271 }, { "epoch": 0.013838807133807621, "grad_norm": 2.2688772678375244, "learning_rate": 2.7662337662337664e-06, "loss": 0.17470517754554749, "memory(GiB)": 17.76, "step": 426, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 1.12286 }, { "epoch": 0.013871292596563039, "grad_norm": 2.0073635578155518, "learning_rate": 2.772727272727273e-06, "loss": 0.1814616620540619, "memory(GiB)": 17.76, "step": 427, "token_acc": 0.9070631970260223, "train_speed(iter/s)": 1.12303 }, { "epoch": 0.013903778059318455, "grad_norm": 1.6304641962051392, "learning_rate": 2.7792207792207792e-06, "loss": 0.17161825299263, "memory(GiB)": 17.76, "step": 428, "token_acc": 0.9357429718875502, "train_speed(iter/s)": 1.12321 }, { "epoch": 0.013936263522073873, "grad_norm": 2.2504048347473145, "learning_rate": 2.785714285714286e-06, "loss": 0.17298874258995056, "memory(GiB)": 17.76, "step": 429, "token_acc": 0.9173228346456693, "train_speed(iter/s)": 1.123316 }, { "epoch": 0.013968748984829289, "grad_norm": 2.140850782394409, "learning_rate": 2.7922077922077925e-06, "loss": 0.16156861186027527, "memory(GiB)": 17.76, "step": 430, "token_acc": 0.916, "train_speed(iter/s)": 1.123466 }, { "epoch": 0.014001234447584707, "grad_norm": 1.4671255350112915, "learning_rate": 2.798701298701299e-06, "loss": 0.16285529732704163, "memory(GiB)": 17.76, "step": 431, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 1.123636 }, { "epoch": 0.014033719910340123, "grad_norm": 1.697697639465332, "learning_rate": 2.8051948051948052e-06, "loss": 0.15726619958877563, "memory(GiB)": 17.76, "step": 432, "token_acc": 0.9227941176470589, "train_speed(iter/s)": 1.123845 }, { "epoch": 0.01406620537309554, "grad_norm": 1.7181055545806885, "learning_rate": 2.811688311688312e-06, "loss": 0.17184048891067505, "memory(GiB)": 17.76, "step": 433, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.123991 }, { "epoch": 0.014098690835850957, "grad_norm": 2.3061394691467285, "learning_rate": 2.818181818181818e-06, "loss": 0.17110030353069305, "memory(GiB)": 17.76, "step": 434, "token_acc": 0.923728813559322, "train_speed(iter/s)": 1.12415 }, { "epoch": 0.014131176298606375, "grad_norm": 1.5805301666259766, "learning_rate": 2.824675324675325e-06, "loss": 0.14658993482589722, "memory(GiB)": 17.76, "step": 435, "token_acc": 0.9315068493150684, "train_speed(iter/s)": 1.12427 }, { "epoch": 0.01416366176136179, "grad_norm": 1.6958074569702148, "learning_rate": 2.8311688311688312e-06, "loss": 0.16797438263893127, "memory(GiB)": 17.76, "step": 436, "token_acc": 0.9401709401709402, "train_speed(iter/s)": 1.124401 }, { "epoch": 0.014196147224117208, "grad_norm": 7.95184326171875, "learning_rate": 2.837662337662338e-06, "loss": 0.17328724265098572, "memory(GiB)": 17.76, "step": 437, "token_acc": 0.9330543933054394, "train_speed(iter/s)": 1.124549 }, { "epoch": 0.014228632686872624, "grad_norm": 1.4944252967834473, "learning_rate": 2.844155844155844e-06, "loss": 0.166367769241333, "memory(GiB)": 17.76, "step": 438, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 1.124698 }, { "epoch": 0.014261118149628042, "grad_norm": 1.801062822341919, "learning_rate": 2.850649350649351e-06, "loss": 0.17221593856811523, "memory(GiB)": 17.76, "step": 439, "token_acc": 0.9226519337016574, "train_speed(iter/s)": 1.124811 }, { "epoch": 0.014293603612383458, "grad_norm": 1.482890009880066, "learning_rate": 2.8571428571428573e-06, "loss": 0.163909912109375, "memory(GiB)": 17.76, "step": 440, "token_acc": 0.9299065420560748, "train_speed(iter/s)": 1.124926 }, { "epoch": 0.014326089075138876, "grad_norm": 1.8707274198532104, "learning_rate": 2.863636363636364e-06, "loss": 0.18719306588172913, "memory(GiB)": 17.76, "step": 441, "token_acc": 0.9339622641509434, "train_speed(iter/s)": 1.124997 }, { "epoch": 0.014358574537894292, "grad_norm": 1.8735429048538208, "learning_rate": 2.87012987012987e-06, "loss": 0.16207173466682434, "memory(GiB)": 17.76, "step": 442, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 1.125025 }, { "epoch": 0.01439106000064971, "grad_norm": 1.6690118312835693, "learning_rate": 2.876623376623377e-06, "loss": 0.1573551893234253, "memory(GiB)": 17.76, "step": 443, "token_acc": 0.94, "train_speed(iter/s)": 1.125002 }, { "epoch": 0.014423545463405126, "grad_norm": 2.3063859939575195, "learning_rate": 2.8831168831168833e-06, "loss": 0.14805494248867035, "memory(GiB)": 17.76, "step": 444, "token_acc": 0.927710843373494, "train_speed(iter/s)": 1.12502 }, { "epoch": 0.014456030926160544, "grad_norm": 1.8738716840744019, "learning_rate": 2.88961038961039e-06, "loss": 0.1653895378112793, "memory(GiB)": 17.76, "step": 445, "token_acc": 0.9359605911330049, "train_speed(iter/s)": 1.125191 }, { "epoch": 0.01448851638891596, "grad_norm": 2.188034772872925, "learning_rate": 2.896103896103896e-06, "loss": 0.16739484667778015, "memory(GiB)": 17.76, "step": 446, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 1.125329 }, { "epoch": 0.014521001851671378, "grad_norm": 2.2164080142974854, "learning_rate": 2.902597402597403e-06, "loss": 0.18801867961883545, "memory(GiB)": 17.76, "step": 447, "token_acc": 0.9396551724137931, "train_speed(iter/s)": 1.125446 }, { "epoch": 0.014553487314426794, "grad_norm": 2.739856243133545, "learning_rate": 2.9090909090909093e-06, "loss": 0.1695852428674698, "memory(GiB)": 17.76, "step": 448, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 1.125634 }, { "epoch": 0.014585972777182212, "grad_norm": 1.7457855939865112, "learning_rate": 2.915584415584416e-06, "loss": 0.1637188196182251, "memory(GiB)": 17.76, "step": 449, "token_acc": 0.9288537549407114, "train_speed(iter/s)": 1.125806 }, { "epoch": 0.014618458239937628, "grad_norm": 1.8449320793151855, "learning_rate": 2.922077922077922e-06, "loss": 0.17466586828231812, "memory(GiB)": 17.76, "step": 450, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 1.125954 }, { "epoch": 0.014650943702693046, "grad_norm": 1.7514182329177856, "learning_rate": 2.928571428571429e-06, "loss": 0.18209277093410492, "memory(GiB)": 17.76, "step": 451, "token_acc": 0.9357429718875502, "train_speed(iter/s)": 1.126107 }, { "epoch": 0.014683429165448462, "grad_norm": 1.5374462604522705, "learning_rate": 2.9350649350649353e-06, "loss": 0.15694399178028107, "memory(GiB)": 17.76, "step": 452, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 1.126216 }, { "epoch": 0.01471591462820388, "grad_norm": 2.129551410675049, "learning_rate": 2.941558441558442e-06, "loss": 0.16749632358551025, "memory(GiB)": 17.76, "step": 453, "token_acc": 0.926605504587156, "train_speed(iter/s)": 1.126333 }, { "epoch": 0.014748400090959296, "grad_norm": 2.1297240257263184, "learning_rate": 2.948051948051948e-06, "loss": 0.16209475696086884, "memory(GiB)": 17.76, "step": 454, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 1.126485 }, { "epoch": 0.014780885553714714, "grad_norm": 45.78993606567383, "learning_rate": 2.954545454545455e-06, "loss": 0.16119441390037537, "memory(GiB)": 17.76, "step": 455, "token_acc": 0.9182156133828996, "train_speed(iter/s)": 1.126662 }, { "epoch": 0.01481337101647013, "grad_norm": 6.006158351898193, "learning_rate": 2.9610389610389613e-06, "loss": 0.15156933665275574, "memory(GiB)": 17.76, "step": 456, "token_acc": 0.9421487603305785, "train_speed(iter/s)": 1.126829 }, { "epoch": 0.014845856479225547, "grad_norm": 1.6593040227890015, "learning_rate": 2.967532467532468e-06, "loss": 0.16163867712020874, "memory(GiB)": 17.76, "step": 457, "token_acc": 0.9093959731543624, "train_speed(iter/s)": 1.126912 }, { "epoch": 0.014878341941980964, "grad_norm": 2.0760581493377686, "learning_rate": 2.974025974025974e-06, "loss": 0.16229459643363953, "memory(GiB)": 17.76, "step": 458, "token_acc": 0.9315068493150684, "train_speed(iter/s)": 1.127072 }, { "epoch": 0.014910827404736381, "grad_norm": 1.537350058555603, "learning_rate": 2.980519480519481e-06, "loss": 0.16837480664253235, "memory(GiB)": 17.76, "step": 459, "token_acc": 0.9033613445378151, "train_speed(iter/s)": 1.127162 }, { "epoch": 0.014943312867491797, "grad_norm": 1.612406611442566, "learning_rate": 2.9870129870129873e-06, "loss": 0.16720497608184814, "memory(GiB)": 17.76, "step": 460, "token_acc": 0.9178743961352657, "train_speed(iter/s)": 1.127324 }, { "epoch": 0.014975798330247215, "grad_norm": 1.7163636684417725, "learning_rate": 2.993506493506494e-06, "loss": 0.148023784160614, "memory(GiB)": 17.76, "step": 461, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.127398 }, { "epoch": 0.015008283793002631, "grad_norm": 1.5020605325698853, "learning_rate": 3e-06, "loss": 0.15074226260185242, "memory(GiB)": 17.76, "step": 462, "token_acc": 0.9507575757575758, "train_speed(iter/s)": 1.127501 }, { "epoch": 0.015040769255758049, "grad_norm": 1.9388141632080078, "learning_rate": 3.006493506493507e-06, "loss": 0.16288655996322632, "memory(GiB)": 17.76, "step": 463, "token_acc": 0.944078947368421, "train_speed(iter/s)": 1.127658 }, { "epoch": 0.015073254718513465, "grad_norm": 1.9224883317947388, "learning_rate": 3.0129870129870133e-06, "loss": 0.15711770951747894, "memory(GiB)": 17.76, "step": 464, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 1.127777 }, { "epoch": 0.015105740181268883, "grad_norm": 17.89164161682129, "learning_rate": 3.01948051948052e-06, "loss": 0.16215166449546814, "memory(GiB)": 17.76, "step": 465, "token_acc": 0.9182879377431906, "train_speed(iter/s)": 1.127893 }, { "epoch": 0.015138225644024299, "grad_norm": 2.454139232635498, "learning_rate": 3.025974025974026e-06, "loss": 0.17735609412193298, "memory(GiB)": 17.76, "step": 466, "token_acc": 0.94, "train_speed(iter/s)": 1.128056 }, { "epoch": 0.015170711106779717, "grad_norm": 1.6644890308380127, "learning_rate": 3.0324675324675327e-06, "loss": 0.14822307229042053, "memory(GiB)": 17.76, "step": 467, "token_acc": 0.9375, "train_speed(iter/s)": 1.128197 }, { "epoch": 0.015203196569535133, "grad_norm": 3.502030849456787, "learning_rate": 3.0389610389610393e-06, "loss": 0.17032961547374725, "memory(GiB)": 17.76, "step": 468, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 1.128324 }, { "epoch": 0.01523568203229055, "grad_norm": 2.544832706451416, "learning_rate": 3.045454545454546e-06, "loss": 0.16064518690109253, "memory(GiB)": 17.76, "step": 469, "token_acc": 0.8977777777777778, "train_speed(iter/s)": 1.12848 }, { "epoch": 0.015268167495045967, "grad_norm": 4.540980815887451, "learning_rate": 3.051948051948052e-06, "loss": 0.14299246668815613, "memory(GiB)": 17.76, "step": 470, "token_acc": 0.9382716049382716, "train_speed(iter/s)": 1.128629 }, { "epoch": 0.015300652957801385, "grad_norm": 1.9749482870101929, "learning_rate": 3.0584415584415587e-06, "loss": 0.15112538635730743, "memory(GiB)": 17.76, "step": 471, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 1.128758 }, { "epoch": 0.0153331384205568, "grad_norm": 1.8783942461013794, "learning_rate": 3.0649350649350653e-06, "loss": 0.14846211671829224, "memory(GiB)": 17.76, "step": 472, "token_acc": 0.9458333333333333, "train_speed(iter/s)": 1.12889 }, { "epoch": 0.015365623883312219, "grad_norm": 1.739694595336914, "learning_rate": 3.071428571428572e-06, "loss": 0.14530059695243835, "memory(GiB)": 17.76, "step": 473, "token_acc": 0.945054945054945, "train_speed(iter/s)": 1.129081 }, { "epoch": 0.015398109346067635, "grad_norm": 1.827345371246338, "learning_rate": 3.077922077922078e-06, "loss": 0.15723784267902374, "memory(GiB)": 17.76, "step": 474, "token_acc": 0.9313725490196079, "train_speed(iter/s)": 1.129228 }, { "epoch": 0.015430594808823053, "grad_norm": 2.0877954959869385, "learning_rate": 3.0844155844155847e-06, "loss": 0.15459483861923218, "memory(GiB)": 17.76, "step": 475, "token_acc": 0.9530516431924883, "train_speed(iter/s)": 1.129358 }, { "epoch": 0.015463080271578469, "grad_norm": 3.742835521697998, "learning_rate": 3.090909090909091e-06, "loss": 0.16956765949726105, "memory(GiB)": 17.76, "step": 476, "token_acc": 0.9288888888888889, "train_speed(iter/s)": 1.129507 }, { "epoch": 0.015495565734333886, "grad_norm": 2.087679624557495, "learning_rate": 3.097402597402598e-06, "loss": 0.13997195661067963, "memory(GiB)": 17.76, "step": 477, "token_acc": 0.9224806201550387, "train_speed(iter/s)": 1.129619 }, { "epoch": 0.015528051197089303, "grad_norm": 1.5810765027999878, "learning_rate": 3.103896103896104e-06, "loss": 0.14595627784729004, "memory(GiB)": 17.76, "step": 478, "token_acc": 0.9321428571428572, "train_speed(iter/s)": 1.129792 }, { "epoch": 0.01556053665984472, "grad_norm": 2.3638405799865723, "learning_rate": 3.1103896103896107e-06, "loss": 0.1581854224205017, "memory(GiB)": 17.76, "step": 479, "token_acc": 0.9326923076923077, "train_speed(iter/s)": 1.129937 }, { "epoch": 0.015593022122600136, "grad_norm": 2.271650552749634, "learning_rate": 3.116883116883117e-06, "loss": 0.173322856426239, "memory(GiB)": 17.76, "step": 480, "token_acc": 0.9172932330827067, "train_speed(iter/s)": 1.130073 }, { "epoch": 0.015625507585355553, "grad_norm": 1.465518593788147, "learning_rate": 3.123376623376624e-06, "loss": 0.14465448260307312, "memory(GiB)": 17.76, "step": 481, "token_acc": 0.9227642276422764, "train_speed(iter/s)": 1.130212 }, { "epoch": 0.015657993048110972, "grad_norm": 3.4526450634002686, "learning_rate": 3.12987012987013e-06, "loss": 0.15664911270141602, "memory(GiB)": 17.76, "step": 482, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 1.13032 }, { "epoch": 0.015690478510866388, "grad_norm": 2.0488810539245605, "learning_rate": 3.1363636363636367e-06, "loss": 0.1522769331932068, "memory(GiB)": 17.76, "step": 483, "token_acc": 0.9385245901639344, "train_speed(iter/s)": 1.130442 }, { "epoch": 0.015722963973621804, "grad_norm": 1.8206533193588257, "learning_rate": 3.142857142857143e-06, "loss": 0.16142228245735168, "memory(GiB)": 17.76, "step": 484, "token_acc": 0.9372549019607843, "train_speed(iter/s)": 1.130527 }, { "epoch": 0.01575544943637722, "grad_norm": 1.6962453126907349, "learning_rate": 3.14935064935065e-06, "loss": 0.15359210968017578, "memory(GiB)": 17.76, "step": 485, "token_acc": 0.961864406779661, "train_speed(iter/s)": 1.130599 }, { "epoch": 0.01578793489913264, "grad_norm": 1.5459703207015991, "learning_rate": 3.155844155844156e-06, "loss": 0.15350033342838287, "memory(GiB)": 17.76, "step": 486, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.130526 }, { "epoch": 0.015820420361888056, "grad_norm": 1.7528023719787598, "learning_rate": 3.1623376623376627e-06, "loss": 0.16857977211475372, "memory(GiB)": 17.76, "step": 487, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.130508 }, { "epoch": 0.015852905824643472, "grad_norm": 1.5798600912094116, "learning_rate": 3.168831168831169e-06, "loss": 0.1430438756942749, "memory(GiB)": 17.76, "step": 488, "token_acc": 0.922077922077922, "train_speed(iter/s)": 1.130562 }, { "epoch": 0.015885391287398888, "grad_norm": 2.1282949447631836, "learning_rate": 3.175324675324676e-06, "loss": 0.1425718069076538, "memory(GiB)": 17.76, "step": 489, "token_acc": 0.954954954954955, "train_speed(iter/s)": 1.130693 }, { "epoch": 0.015917876750154308, "grad_norm": 1.7360306978225708, "learning_rate": 3.181818181818182e-06, "loss": 0.15112319588661194, "memory(GiB)": 17.76, "step": 490, "token_acc": 0.9563318777292577, "train_speed(iter/s)": 1.130854 }, { "epoch": 0.015950362212909724, "grad_norm": 1.903724193572998, "learning_rate": 3.1883116883116887e-06, "loss": 0.14777350425720215, "memory(GiB)": 17.76, "step": 491, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 1.130994 }, { "epoch": 0.01598284767566514, "grad_norm": 1.9359873533248901, "learning_rate": 3.194805194805195e-06, "loss": 0.15071001648902893, "memory(GiB)": 17.76, "step": 492, "token_acc": 0.9409282700421941, "train_speed(iter/s)": 1.131116 }, { "epoch": 0.016015333138420556, "grad_norm": 1.9050734043121338, "learning_rate": 3.201298701298702e-06, "loss": 0.16212081909179688, "memory(GiB)": 17.76, "step": 493, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 1.131237 }, { "epoch": 0.016047818601175975, "grad_norm": 1.777029037475586, "learning_rate": 3.207792207792208e-06, "loss": 0.1447298675775528, "memory(GiB)": 17.76, "step": 494, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 1.131341 }, { "epoch": 0.01608030406393139, "grad_norm": 2.0014452934265137, "learning_rate": 3.2142857142857147e-06, "loss": 0.177791565656662, "memory(GiB)": 17.76, "step": 495, "token_acc": 0.9175824175824175, "train_speed(iter/s)": 1.131447 }, { "epoch": 0.016112789526686808, "grad_norm": 2.356952667236328, "learning_rate": 3.220779220779221e-06, "loss": 0.14387929439544678, "memory(GiB)": 17.76, "step": 496, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 1.13155 }, { "epoch": 0.016145274989442224, "grad_norm": 1.5957980155944824, "learning_rate": 3.227272727272728e-06, "loss": 0.1537850797176361, "memory(GiB)": 17.76, "step": 497, "token_acc": 0.9365079365079365, "train_speed(iter/s)": 1.131694 }, { "epoch": 0.016177760452197643, "grad_norm": 1.5009795427322388, "learning_rate": 3.233766233766234e-06, "loss": 0.16846175491809845, "memory(GiB)": 17.76, "step": 498, "token_acc": 0.9426229508196722, "train_speed(iter/s)": 1.131787 }, { "epoch": 0.01621024591495306, "grad_norm": 1.6735920906066895, "learning_rate": 3.2402597402597407e-06, "loss": 0.1457977294921875, "memory(GiB)": 17.76, "step": 499, "token_acc": 0.9470899470899471, "train_speed(iter/s)": 1.131894 }, { "epoch": 0.016242731377708475, "grad_norm": 2.958961248397827, "learning_rate": 3.246753246753247e-06, "loss": 0.15005481243133545, "memory(GiB)": 17.76, "step": 500, "token_acc": 0.9361702127659575, "train_speed(iter/s)": 1.131951 }, { "epoch": 0.016242731377708475, "eval_loss": 0.1545547991991043, "eval_runtime": 85.0003, "eval_samples_per_second": 117.058, "eval_steps_per_second": 3.659, "eval_token_acc": 0.9364305486196558, "step": 500 }, { "epoch": 0.01627521684046389, "grad_norm": 1.6032423973083496, "learning_rate": 3.253246753246754e-06, "loss": 0.1474534422159195, "memory(GiB)": 18.4, "step": 501, "token_acc": 0.9379444528177795, "train_speed(iter/s)": 0.926183 }, { "epoch": 0.01630770230321931, "grad_norm": 1.472800374031067, "learning_rate": 3.25974025974026e-06, "loss": 0.1497318148612976, "memory(GiB)": 18.4, "step": 502, "token_acc": 0.936, "train_speed(iter/s)": 0.926492 }, { "epoch": 0.016340187765974727, "grad_norm": 1.578385591506958, "learning_rate": 3.2662337662337667e-06, "loss": 0.16733360290527344, "memory(GiB)": 18.4, "step": 503, "token_acc": 0.9201680672268907, "train_speed(iter/s)": 0.926826 }, { "epoch": 0.016372673228730143, "grad_norm": 2.6597824096679688, "learning_rate": 3.272727272727273e-06, "loss": 0.15241798758506775, "memory(GiB)": 18.4, "step": 504, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.927103 }, { "epoch": 0.01640515869148556, "grad_norm": 1.8027479648590088, "learning_rate": 3.27922077922078e-06, "loss": 0.1541179120540619, "memory(GiB)": 18.4, "step": 505, "token_acc": 0.9321266968325792, "train_speed(iter/s)": 0.92745 }, { "epoch": 0.01643764415424098, "grad_norm": 2.412358522415161, "learning_rate": 3.285714285714286e-06, "loss": 0.1432965099811554, "memory(GiB)": 18.4, "step": 506, "token_acc": 0.9644128113879004, "train_speed(iter/s)": 0.927801 }, { "epoch": 0.016470129616996395, "grad_norm": 4.929311275482178, "learning_rate": 3.2922077922077927e-06, "loss": 0.1601349264383316, "memory(GiB)": 18.4, "step": 507, "token_acc": 0.9461883408071748, "train_speed(iter/s)": 0.928159 }, { "epoch": 0.01650261507975181, "grad_norm": 2.2727408409118652, "learning_rate": 3.298701298701299e-06, "loss": 0.1567658632993698, "memory(GiB)": 18.4, "step": 508, "token_acc": 0.924901185770751, "train_speed(iter/s)": 0.928499 }, { "epoch": 0.016535100542507227, "grad_norm": 1.9564906358718872, "learning_rate": 3.3051948051948055e-06, "loss": 0.15264783799648285, "memory(GiB)": 18.4, "step": 509, "token_acc": 0.9421487603305785, "train_speed(iter/s)": 0.928853 }, { "epoch": 0.016567586005262647, "grad_norm": 2.280125141143799, "learning_rate": 3.311688311688312e-06, "loss": 0.1520507037639618, "memory(GiB)": 18.4, "step": 510, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.929142 }, { "epoch": 0.016600071468018063, "grad_norm": 5.214507102966309, "learning_rate": 3.3181818181818188e-06, "loss": 0.1628376841545105, "memory(GiB)": 18.4, "step": 511, "token_acc": 0.9359605911330049, "train_speed(iter/s)": 0.929481 }, { "epoch": 0.01663255693077348, "grad_norm": 1.8653351068496704, "learning_rate": 3.324675324675325e-06, "loss": 0.17548240721225739, "memory(GiB)": 18.4, "step": 512, "token_acc": 0.9375, "train_speed(iter/s)": 0.929826 }, { "epoch": 0.016665042393528895, "grad_norm": 2.2627334594726562, "learning_rate": 3.3311688311688315e-06, "loss": 0.15627971291542053, "memory(GiB)": 18.4, "step": 513, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.930219 }, { "epoch": 0.016697527856284314, "grad_norm": 1.6783487796783447, "learning_rate": 3.3376623376623377e-06, "loss": 0.14106273651123047, "memory(GiB)": 18.4, "step": 514, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.930598 }, { "epoch": 0.01673001331903973, "grad_norm": 2.233722448348999, "learning_rate": 3.3441558441558443e-06, "loss": 0.16341781616210938, "memory(GiB)": 18.4, "step": 515, "token_acc": 0.9330543933054394, "train_speed(iter/s)": 0.930999 }, { "epoch": 0.016762498781795147, "grad_norm": 1.5600274801254272, "learning_rate": 3.350649350649351e-06, "loss": 0.15480369329452515, "memory(GiB)": 18.4, "step": 516, "token_acc": 0.9516908212560387, "train_speed(iter/s)": 0.931371 }, { "epoch": 0.016794984244550563, "grad_norm": 1.4268831014633179, "learning_rate": 3.357142857142857e-06, "loss": 0.14514587819576263, "memory(GiB)": 18.4, "step": 517, "token_acc": 0.9357798165137615, "train_speed(iter/s)": 0.931762 }, { "epoch": 0.016827469707305982, "grad_norm": 2.0524332523345947, "learning_rate": 3.3636363636363637e-06, "loss": 0.15550200641155243, "memory(GiB)": 18.4, "step": 518, "token_acc": 0.9327731092436975, "train_speed(iter/s)": 0.93212 }, { "epoch": 0.0168599551700614, "grad_norm": 1.8527673482894897, "learning_rate": 3.3701298701298703e-06, "loss": 0.14396095275878906, "memory(GiB)": 18.4, "step": 519, "token_acc": 0.9264069264069265, "train_speed(iter/s)": 0.932414 }, { "epoch": 0.016892440632816814, "grad_norm": 3.9173827171325684, "learning_rate": 3.376623376623377e-06, "loss": 0.17111355066299438, "memory(GiB)": 18.4, "step": 520, "token_acc": 0.9325396825396826, "train_speed(iter/s)": 0.932711 }, { "epoch": 0.01692492609557223, "grad_norm": 4.219675540924072, "learning_rate": 3.383116883116883e-06, "loss": 0.1585940569639206, "memory(GiB)": 18.4, "step": 521, "token_acc": 0.9318181818181818, "train_speed(iter/s)": 0.93298 }, { "epoch": 0.01695741155832765, "grad_norm": 1.9976084232330322, "learning_rate": 3.3896103896103897e-06, "loss": 0.14513102173805237, "memory(GiB)": 18.4, "step": 522, "token_acc": 0.9253731343283582, "train_speed(iter/s)": 0.933258 }, { "epoch": 0.016989897021083066, "grad_norm": 2.051177740097046, "learning_rate": 3.396103896103896e-06, "loss": 0.136642187833786, "memory(GiB)": 18.4, "step": 523, "token_acc": 0.9063829787234042, "train_speed(iter/s)": 0.933525 }, { "epoch": 0.017022382483838482, "grad_norm": 1.7901877164840698, "learning_rate": 3.402597402597403e-06, "loss": 0.13297390937805176, "memory(GiB)": 18.4, "step": 524, "token_acc": 0.9453125, "train_speed(iter/s)": 0.933812 }, { "epoch": 0.0170548679465939, "grad_norm": 2.38137149810791, "learning_rate": 3.409090909090909e-06, "loss": 0.1688091903924942, "memory(GiB)": 18.4, "step": 525, "token_acc": 0.924, "train_speed(iter/s)": 0.934085 }, { "epoch": 0.017087353409349318, "grad_norm": 1.8857359886169434, "learning_rate": 3.4155844155844157e-06, "loss": 0.14338907599449158, "memory(GiB)": 18.4, "step": 526, "token_acc": 0.9423076923076923, "train_speed(iter/s)": 0.934339 }, { "epoch": 0.017119838872104734, "grad_norm": 2.541412830352783, "learning_rate": 3.422077922077922e-06, "loss": 0.1441720873117447, "memory(GiB)": 18.4, "step": 527, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.934592 }, { "epoch": 0.01715232433486015, "grad_norm": 1.84250009059906, "learning_rate": 3.428571428571429e-06, "loss": 0.1406252235174179, "memory(GiB)": 18.4, "step": 528, "token_acc": 0.946058091286307, "train_speed(iter/s)": 0.934871 }, { "epoch": 0.017184809797615566, "grad_norm": 1.8872097730636597, "learning_rate": 3.435064935064935e-06, "loss": 0.14963659644126892, "memory(GiB)": 18.4, "step": 529, "token_acc": 0.9407114624505929, "train_speed(iter/s)": 0.935142 }, { "epoch": 0.017217295260370986, "grad_norm": 1.9952499866485596, "learning_rate": 3.4415584415584418e-06, "loss": 0.15084026753902435, "memory(GiB)": 18.4, "step": 530, "token_acc": 0.9399141630901288, "train_speed(iter/s)": 0.935388 }, { "epoch": 0.017249780723126402, "grad_norm": 1.2998453378677368, "learning_rate": 3.448051948051948e-06, "loss": 0.13720369338989258, "memory(GiB)": 18.4, "step": 531, "token_acc": 0.937007874015748, "train_speed(iter/s)": 0.935708 }, { "epoch": 0.017282266185881818, "grad_norm": 1.8121187686920166, "learning_rate": 3.454545454545455e-06, "loss": 0.14916160702705383, "memory(GiB)": 18.4, "step": 532, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 0.936008 }, { "epoch": 0.017314751648637234, "grad_norm": 1.9251233339309692, "learning_rate": 3.461038961038961e-06, "loss": 0.15279993414878845, "memory(GiB)": 18.4, "step": 533, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.936291 }, { "epoch": 0.017347237111392654, "grad_norm": 1.2857500314712524, "learning_rate": 3.4675324675324678e-06, "loss": 0.14446493983268738, "memory(GiB)": 18.4, "step": 534, "token_acc": 0.9475982532751092, "train_speed(iter/s)": 0.93661 }, { "epoch": 0.01737972257414807, "grad_norm": 1.2855591773986816, "learning_rate": 3.474025974025974e-06, "loss": 0.1375606805086136, "memory(GiB)": 18.4, "step": 535, "token_acc": 0.9548611111111112, "train_speed(iter/s)": 0.936941 }, { "epoch": 0.017412208036903486, "grad_norm": 1.460610032081604, "learning_rate": 3.480519480519481e-06, "loss": 0.13757196068763733, "memory(GiB)": 18.4, "step": 536, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.937231 }, { "epoch": 0.017444693499658902, "grad_norm": 1.4960049390792847, "learning_rate": 3.487012987012987e-06, "loss": 0.14387811720371246, "memory(GiB)": 18.4, "step": 537, "token_acc": 0.9065420560747663, "train_speed(iter/s)": 0.937547 }, { "epoch": 0.01747717896241432, "grad_norm": 2.1223366260528564, "learning_rate": 3.4935064935064938e-06, "loss": 0.1604171246290207, "memory(GiB)": 18.4, "step": 538, "token_acc": 0.9414893617021277, "train_speed(iter/s)": 0.937818 }, { "epoch": 0.017509664425169737, "grad_norm": 5.462150573730469, "learning_rate": 3.5e-06, "loss": 0.13750788569450378, "memory(GiB)": 18.4, "step": 539, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.938118 }, { "epoch": 0.017542149887925153, "grad_norm": 1.6488590240478516, "learning_rate": 3.506493506493507e-06, "loss": 0.14574632048606873, "memory(GiB)": 18.4, "step": 540, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 0.938421 }, { "epoch": 0.01757463535068057, "grad_norm": 2.125429391860962, "learning_rate": 3.512987012987013e-06, "loss": 0.1526491940021515, "memory(GiB)": 18.4, "step": 541, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.938719 }, { "epoch": 0.01760712081343599, "grad_norm": 1.6911128759384155, "learning_rate": 3.5194805194805198e-06, "loss": 0.13365939259529114, "memory(GiB)": 18.4, "step": 542, "token_acc": 0.9253112033195021, "train_speed(iter/s)": 0.939003 }, { "epoch": 0.017639606276191405, "grad_norm": 2.6060304641723633, "learning_rate": 3.525974025974026e-06, "loss": 0.14038410782814026, "memory(GiB)": 18.4, "step": 543, "token_acc": 0.958041958041958, "train_speed(iter/s)": 0.938843 }, { "epoch": 0.01767209173894682, "grad_norm": 2.7893412113189697, "learning_rate": 3.532467532467533e-06, "loss": 0.14333771169185638, "memory(GiB)": 18.4, "step": 544, "token_acc": 0.9578313253012049, "train_speed(iter/s)": 0.939071 }, { "epoch": 0.017704577201702237, "grad_norm": 2.976898193359375, "learning_rate": 3.538961038961039e-06, "loss": 0.15193048119544983, "memory(GiB)": 18.4, "step": 545, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.939361 }, { "epoch": 0.017737062664457657, "grad_norm": 7.900144577026367, "learning_rate": 3.5454545454545458e-06, "loss": 0.14615856111049652, "memory(GiB)": 18.4, "step": 546, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.93965 }, { "epoch": 0.017769548127213073, "grad_norm": 2.46876859664917, "learning_rate": 3.551948051948052e-06, "loss": 0.16265787184238434, "memory(GiB)": 18.4, "step": 547, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.939895 }, { "epoch": 0.01780203358996849, "grad_norm": 2.4205446243286133, "learning_rate": 3.558441558441559e-06, "loss": 0.1589956134557724, "memory(GiB)": 18.4, "step": 548, "token_acc": 0.9591836734693877, "train_speed(iter/s)": 0.940186 }, { "epoch": 0.017834519052723905, "grad_norm": 2.0811848640441895, "learning_rate": 3.564935064935065e-06, "loss": 0.16758394241333008, "memory(GiB)": 18.4, "step": 549, "token_acc": 0.9273504273504274, "train_speed(iter/s)": 0.940441 }, { "epoch": 0.017867004515479325, "grad_norm": 2.030421733856201, "learning_rate": 3.5714285714285718e-06, "loss": 0.14812526106834412, "memory(GiB)": 18.4, "step": 550, "token_acc": 0.9304635761589404, "train_speed(iter/s)": 0.940746 }, { "epoch": 0.01789948997823474, "grad_norm": 1.7049440145492554, "learning_rate": 3.577922077922078e-06, "loss": 0.14434322714805603, "memory(GiB)": 18.4, "step": 551, "token_acc": 0.9362549800796812, "train_speed(iter/s)": 0.941008 }, { "epoch": 0.017931975440990157, "grad_norm": 2.1473734378814697, "learning_rate": 3.584415584415585e-06, "loss": 0.14142058789730072, "memory(GiB)": 18.4, "step": 552, "token_acc": 0.9414414414414415, "train_speed(iter/s)": 0.941272 }, { "epoch": 0.017964460903745573, "grad_norm": 3.677006483078003, "learning_rate": 3.590909090909091e-06, "loss": 0.15007229149341583, "memory(GiB)": 18.4, "step": 553, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.941484 }, { "epoch": 0.017996946366500993, "grad_norm": 1.4774932861328125, "learning_rate": 3.5974025974025978e-06, "loss": 0.14012697339057922, "memory(GiB)": 18.4, "step": 554, "token_acc": 0.9154228855721394, "train_speed(iter/s)": 0.94169 }, { "epoch": 0.01802943182925641, "grad_norm": 3.971431016921997, "learning_rate": 3.603896103896104e-06, "loss": 0.1525202840566635, "memory(GiB)": 18.4, "step": 555, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.941938 }, { "epoch": 0.018061917292011825, "grad_norm": 5.374860763549805, "learning_rate": 3.6103896103896106e-06, "loss": 0.15363335609436035, "memory(GiB)": 18.4, "step": 556, "token_acc": 0.9256198347107438, "train_speed(iter/s)": 0.942228 }, { "epoch": 0.01809440275476724, "grad_norm": 20.317415237426758, "learning_rate": 3.616883116883117e-06, "loss": 0.1669004261493683, "memory(GiB)": 18.4, "step": 557, "token_acc": 0.945, "train_speed(iter/s)": 0.942527 }, { "epoch": 0.01812688821752266, "grad_norm": 2.9638264179229736, "learning_rate": 3.623376623376624e-06, "loss": 0.1641179621219635, "memory(GiB)": 18.4, "step": 558, "token_acc": 0.9368421052631579, "train_speed(iter/s)": 0.942849 }, { "epoch": 0.018159373680278076, "grad_norm": 2.3424084186553955, "learning_rate": 3.62987012987013e-06, "loss": 0.14889860153198242, "memory(GiB)": 18.4, "step": 559, "token_acc": 0.9323671497584541, "train_speed(iter/s)": 0.943208 }, { "epoch": 0.018191859143033492, "grad_norm": 1.7877122163772583, "learning_rate": 3.6363636363636366e-06, "loss": 0.14484190940856934, "memory(GiB)": 18.4, "step": 560, "token_acc": 0.9342723004694836, "train_speed(iter/s)": 0.943539 }, { "epoch": 0.01822434460578891, "grad_norm": 1.8314391374588013, "learning_rate": 3.642857142857143e-06, "loss": 0.135546013712883, "memory(GiB)": 18.4, "step": 561, "token_acc": 0.9488372093023256, "train_speed(iter/s)": 0.943828 }, { "epoch": 0.018256830068544328, "grad_norm": 2.1012613773345947, "learning_rate": 3.64935064935065e-06, "loss": 0.15139134228229523, "memory(GiB)": 18.4, "step": 562, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.944181 }, { "epoch": 0.018289315531299744, "grad_norm": 1.821352481842041, "learning_rate": 3.655844155844156e-06, "loss": 0.16146346926689148, "memory(GiB)": 18.4, "step": 563, "token_acc": 0.9227467811158798, "train_speed(iter/s)": 0.944534 }, { "epoch": 0.01832180099405516, "grad_norm": 1.5117899179458618, "learning_rate": 3.6623376623376626e-06, "loss": 0.1358872950077057, "memory(GiB)": 18.4, "step": 564, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.944861 }, { "epoch": 0.018354286456810576, "grad_norm": 1.8084839582443237, "learning_rate": 3.6688311688311688e-06, "loss": 0.13195979595184326, "memory(GiB)": 18.4, "step": 565, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.945211 }, { "epoch": 0.018386771919565996, "grad_norm": 2.3819801807403564, "learning_rate": 3.675324675324676e-06, "loss": 0.15967363119125366, "memory(GiB)": 18.4, "step": 566, "token_acc": 0.90625, "train_speed(iter/s)": 0.945539 }, { "epoch": 0.018419257382321412, "grad_norm": 2.7689225673675537, "learning_rate": 3.681818181818182e-06, "loss": 0.134703129529953, "memory(GiB)": 18.4, "step": 567, "token_acc": 0.9455445544554455, "train_speed(iter/s)": 0.945877 }, { "epoch": 0.018451742845076828, "grad_norm": 2.051523447036743, "learning_rate": 3.6883116883116886e-06, "loss": 0.1474931538105011, "memory(GiB)": 18.4, "step": 568, "token_acc": 0.9490196078431372, "train_speed(iter/s)": 0.9462 }, { "epoch": 0.018484228307832244, "grad_norm": 2.1993513107299805, "learning_rate": 3.6948051948051948e-06, "loss": 0.1417996883392334, "memory(GiB)": 18.4, "step": 569, "token_acc": 0.9124579124579124, "train_speed(iter/s)": 0.946549 }, { "epoch": 0.018516713770587664, "grad_norm": 1.6264777183532715, "learning_rate": 3.701298701298702e-06, "loss": 0.1356722116470337, "memory(GiB)": 18.4, "step": 570, "token_acc": 0.9276595744680851, "train_speed(iter/s)": 0.946841 }, { "epoch": 0.01854919923334308, "grad_norm": 2.7576797008514404, "learning_rate": 3.707792207792208e-06, "loss": 0.12857259809970856, "memory(GiB)": 18.4, "step": 571, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.947209 }, { "epoch": 0.018581684696098496, "grad_norm": 2.1990065574645996, "learning_rate": 3.7142857142857146e-06, "loss": 0.15565404295921326, "memory(GiB)": 18.4, "step": 572, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.947533 }, { "epoch": 0.018614170158853912, "grad_norm": 3.0744848251342773, "learning_rate": 3.7207792207792208e-06, "loss": 0.14148053526878357, "memory(GiB)": 18.4, "step": 573, "token_acc": 0.9478260869565217, "train_speed(iter/s)": 0.947854 }, { "epoch": 0.01864665562160933, "grad_norm": 5.0386786460876465, "learning_rate": 3.727272727272728e-06, "loss": 0.14310985803604126, "memory(GiB)": 18.4, "step": 574, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.94819 }, { "epoch": 0.018679141084364748, "grad_norm": 6.618046760559082, "learning_rate": 3.733766233766234e-06, "loss": 0.15215766429901123, "memory(GiB)": 18.4, "step": 575, "token_acc": 0.9308943089430894, "train_speed(iter/s)": 0.94852 }, { "epoch": 0.018711626547120164, "grad_norm": 2.0624148845672607, "learning_rate": 3.7402597402597406e-06, "loss": 0.13889294862747192, "memory(GiB)": 18.4, "step": 576, "token_acc": 0.9391634980988594, "train_speed(iter/s)": 0.948847 }, { "epoch": 0.01874411200987558, "grad_norm": 2.2707645893096924, "learning_rate": 3.746753246753247e-06, "loss": 0.1438242495059967, "memory(GiB)": 18.4, "step": 577, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 0.949166 }, { "epoch": 0.018776597472631, "grad_norm": 1.6929857730865479, "learning_rate": 3.753246753246754e-06, "loss": 0.13911521434783936, "memory(GiB)": 18.4, "step": 578, "token_acc": 0.9547169811320755, "train_speed(iter/s)": 0.949183 }, { "epoch": 0.018809082935386415, "grad_norm": 3.1641478538513184, "learning_rate": 3.75974025974026e-06, "loss": 0.13769100606441498, "memory(GiB)": 18.4, "step": 579, "token_acc": 0.9181818181818182, "train_speed(iter/s)": 0.949521 }, { "epoch": 0.01884156839814183, "grad_norm": 1.671533465385437, "learning_rate": 3.7662337662337666e-06, "loss": 0.15238438546657562, "memory(GiB)": 18.4, "step": 580, "token_acc": 0.9291338582677166, "train_speed(iter/s)": 0.949852 }, { "epoch": 0.018874053860897248, "grad_norm": 1.862622618675232, "learning_rate": 3.772727272727273e-06, "loss": 0.14884766936302185, "memory(GiB)": 18.4, "step": 581, "token_acc": 0.9281767955801105, "train_speed(iter/s)": 0.950181 }, { "epoch": 0.018906539323652667, "grad_norm": 1.6392372846603394, "learning_rate": 3.77922077922078e-06, "loss": 0.1443973034620285, "memory(GiB)": 18.4, "step": 582, "token_acc": 0.9494949494949495, "train_speed(iter/s)": 0.950463 }, { "epoch": 0.018939024786408083, "grad_norm": 1.7735867500305176, "learning_rate": 3.785714285714286e-06, "loss": 0.14199373126029968, "memory(GiB)": 18.4, "step": 583, "token_acc": 0.9361702127659575, "train_speed(iter/s)": 0.950799 }, { "epoch": 0.0189715102491635, "grad_norm": 1.520870327949524, "learning_rate": 3.7922077922077926e-06, "loss": 0.1421208381652832, "memory(GiB)": 18.4, "step": 584, "token_acc": 0.9303135888501742, "train_speed(iter/s)": 0.951109 }, { "epoch": 0.019003995711918915, "grad_norm": 2.0776150226593018, "learning_rate": 3.798701298701299e-06, "loss": 0.14705735445022583, "memory(GiB)": 18.4, "step": 585, "token_acc": 0.9447513812154696, "train_speed(iter/s)": 0.951435 }, { "epoch": 0.019036481174674335, "grad_norm": 2.172029733657837, "learning_rate": 3.805194805194806e-06, "loss": 0.12714621424674988, "memory(GiB)": 18.4, "step": 586, "token_acc": 0.9462809917355371, "train_speed(iter/s)": 0.951751 }, { "epoch": 0.01906896663742975, "grad_norm": 2.0971176624298096, "learning_rate": 3.811688311688312e-06, "loss": 0.13412703573703766, "memory(GiB)": 18.4, "step": 587, "token_acc": 0.9310344827586207, "train_speed(iter/s)": 0.952076 }, { "epoch": 0.019101452100185167, "grad_norm": 2.240114212036133, "learning_rate": 3.818181818181819e-06, "loss": 0.12742692232131958, "memory(GiB)": 18.4, "step": 588, "token_acc": 0.9560439560439561, "train_speed(iter/s)": 0.952393 }, { "epoch": 0.019133937562940583, "grad_norm": 2.083829164505005, "learning_rate": 3.824675324675325e-06, "loss": 0.1410948932170868, "memory(GiB)": 18.4, "step": 589, "token_acc": 0.953125, "train_speed(iter/s)": 0.952692 }, { "epoch": 0.019166423025696003, "grad_norm": 1.4618537425994873, "learning_rate": 3.831168831168831e-06, "loss": 0.13210013508796692, "memory(GiB)": 18.4, "step": 590, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.952995 }, { "epoch": 0.01919890848845142, "grad_norm": 2.0142271518707275, "learning_rate": 3.837662337662338e-06, "loss": 0.14708834886550903, "memory(GiB)": 18.4, "step": 591, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.953331 }, { "epoch": 0.019231393951206835, "grad_norm": 2.0457677841186523, "learning_rate": 3.844155844155845e-06, "loss": 0.12926621735095978, "memory(GiB)": 18.4, "step": 592, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.953617 }, { "epoch": 0.01926387941396225, "grad_norm": 1.6005898714065552, "learning_rate": 3.850649350649351e-06, "loss": 0.1396380215883255, "memory(GiB)": 18.4, "step": 593, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.01929636487671767, "grad_norm": 1.3887066841125488, "learning_rate": 3.857142857142858e-06, "loss": 0.1261145919561386, "memory(GiB)": 18.4, "step": 594, "token_acc": 0.9597989949748744, "train_speed(iter/s)": 0.954244 }, { "epoch": 0.019328850339473087, "grad_norm": 2.275109052658081, "learning_rate": 3.863636363636364e-06, "loss": 0.13654254376888275, "memory(GiB)": 18.4, "step": 595, "token_acc": 0.94921875, "train_speed(iter/s)": 0.954517 }, { "epoch": 0.019361335802228503, "grad_norm": 1.8048731088638306, "learning_rate": 3.87012987012987e-06, "loss": 0.1350708156824112, "memory(GiB)": 18.4, "step": 596, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 0.954754 }, { "epoch": 0.01939382126498392, "grad_norm": 4.337887287139893, "learning_rate": 3.876623376623377e-06, "loss": 0.12174493074417114, "memory(GiB)": 18.4, "step": 597, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.955001 }, { "epoch": 0.01942630672773934, "grad_norm": 1.5715522766113281, "learning_rate": 3.8831168831168834e-06, "loss": 0.1437683403491974, "memory(GiB)": 18.4, "step": 598, "token_acc": 0.9361702127659575, "train_speed(iter/s)": 0.955231 }, { "epoch": 0.019458792190494754, "grad_norm": 1.905742883682251, "learning_rate": 3.88961038961039e-06, "loss": 0.12660276889801025, "memory(GiB)": 18.4, "step": 599, "token_acc": 0.935064935064935, "train_speed(iter/s)": 0.95542 }, { "epoch": 0.01949127765325017, "grad_norm": 1.90804123878479, "learning_rate": 3.896103896103897e-06, "loss": 0.15139682590961456, "memory(GiB)": 18.4, "step": 600, "token_acc": 0.9234234234234234, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.019523763116005587, "grad_norm": 1.4976352453231812, "learning_rate": 3.902597402597403e-06, "loss": 0.12391165643930435, "memory(GiB)": 18.4, "step": 601, "token_acc": 0.9447236180904522, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.019556248578761006, "grad_norm": 1.543782114982605, "learning_rate": 3.90909090909091e-06, "loss": 0.14340317249298096, "memory(GiB)": 18.4, "step": 602, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.019588734041516422, "grad_norm": 1.7347967624664307, "learning_rate": 3.915584415584416e-06, "loss": 0.1457490772008896, "memory(GiB)": 18.4, "step": 603, "token_acc": 0.9548872180451128, "train_speed(iter/s)": 0.956254 }, { "epoch": 0.01962121950427184, "grad_norm": 1.8631401062011719, "learning_rate": 3.922077922077922e-06, "loss": 0.13210515677928925, "memory(GiB)": 18.4, "step": 604, "token_acc": 0.9442231075697212, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.019653704967027254, "grad_norm": 1.7113853693008423, "learning_rate": 3.928571428571429e-06, "loss": 0.14965882897377014, "memory(GiB)": 18.4, "step": 605, "token_acc": 0.9471698113207547, "train_speed(iter/s)": 0.95669 }, { "epoch": 0.019686190429782674, "grad_norm": 1.9417842626571655, "learning_rate": 3.9350649350649354e-06, "loss": 0.1299329698085785, "memory(GiB)": 18.4, "step": 606, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.01971867589253809, "grad_norm": 2.540465831756592, "learning_rate": 3.941558441558442e-06, "loss": 0.13643334805965424, "memory(GiB)": 18.4, "step": 607, "token_acc": 0.9402985074626866, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.019751161355293506, "grad_norm": 1.4739776849746704, "learning_rate": 3.948051948051949e-06, "loss": 0.12432362884283066, "memory(GiB)": 18.4, "step": 608, "token_acc": 0.948936170212766, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.019783646818048922, "grad_norm": 1.5412722826004028, "learning_rate": 3.954545454545454e-06, "loss": 0.12934637069702148, "memory(GiB)": 18.4, "step": 609, "token_acc": 0.944, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.019816132280804342, "grad_norm": 3.005359411239624, "learning_rate": 3.961038961038962e-06, "loss": 0.151470348238945, "memory(GiB)": 18.4, "step": 610, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.957703 }, { "epoch": 0.019848617743559758, "grad_norm": 1.9315364360809326, "learning_rate": 3.967532467532468e-06, "loss": 0.13971254229545593, "memory(GiB)": 18.4, "step": 611, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.019881103206315174, "grad_norm": 2.338228464126587, "learning_rate": 3.974025974025974e-06, "loss": 0.1440768837928772, "memory(GiB)": 18.4, "step": 612, "token_acc": 0.9140271493212669, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.01991358866907059, "grad_norm": 2.6291415691375732, "learning_rate": 3.980519480519481e-06, "loss": 0.15573814511299133, "memory(GiB)": 18.4, "step": 613, "token_acc": 0.9365079365079365, "train_speed(iter/s)": 0.958376 }, { "epoch": 0.01994607413182601, "grad_norm": 2.525557518005371, "learning_rate": 3.9870129870129875e-06, "loss": 0.12407663464546204, "memory(GiB)": 18.4, "step": 614, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.958622 }, { "epoch": 0.019978559594581426, "grad_norm": 1.8120310306549072, "learning_rate": 3.993506493506494e-06, "loss": 0.12063071131706238, "memory(GiB)": 18.4, "step": 615, "token_acc": 0.9404761904761905, "train_speed(iter/s)": 0.958806 }, { "epoch": 0.020011045057336842, "grad_norm": 1.521296739578247, "learning_rate": 4.000000000000001e-06, "loss": 0.1467454731464386, "memory(GiB)": 18.4, "step": 616, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.959016 }, { "epoch": 0.020043530520092258, "grad_norm": 1.7343051433563232, "learning_rate": 4.0064935064935064e-06, "loss": 0.11442525684833527, "memory(GiB)": 18.4, "step": 617, "token_acc": 0.9532710280373832, "train_speed(iter/s)": 0.959255 }, { "epoch": 0.020076015982847677, "grad_norm": 2.57509708404541, "learning_rate": 4.012987012987014e-06, "loss": 0.13738077878952026, "memory(GiB)": 18.4, "step": 618, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 0.95945 }, { "epoch": 0.020108501445603093, "grad_norm": 2.075260639190674, "learning_rate": 4.01948051948052e-06, "loss": 0.13485680520534515, "memory(GiB)": 18.4, "step": 619, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.959747 }, { "epoch": 0.02014098690835851, "grad_norm": 1.3391059637069702, "learning_rate": 4.025974025974026e-06, "loss": 0.13637229800224304, "memory(GiB)": 18.4, "step": 620, "token_acc": 0.94, "train_speed(iter/s)": 0.960046 }, { "epoch": 0.020173472371113926, "grad_norm": 1.4913413524627686, "learning_rate": 4.032467532467533e-06, "loss": 0.13932865858078003, "memory(GiB)": 18.4, "step": 621, "token_acc": 0.928, "train_speed(iter/s)": 0.960333 }, { "epoch": 0.020205957833869345, "grad_norm": 2.112661123275757, "learning_rate": 4.0389610389610395e-06, "loss": 0.13516771793365479, "memory(GiB)": 18.4, "step": 622, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.96062 }, { "epoch": 0.02023844329662476, "grad_norm": 1.6470872163772583, "learning_rate": 4.045454545454546e-06, "loss": 0.12731416523456573, "memory(GiB)": 18.4, "step": 623, "token_acc": 0.9529411764705882, "train_speed(iter/s)": 0.960907 }, { "epoch": 0.020270928759380177, "grad_norm": 1.6198598146438599, "learning_rate": 4.051948051948053e-06, "loss": 0.15100684762001038, "memory(GiB)": 18.4, "step": 624, "token_acc": 0.9155555555555556, "train_speed(iter/s)": 0.9612 }, { "epoch": 0.020303414222135593, "grad_norm": 1.7604566812515259, "learning_rate": 4.0584415584415584e-06, "loss": 0.1400255262851715, "memory(GiB)": 18.4, "step": 625, "token_acc": 0.9375, "train_speed(iter/s)": 0.961465 }, { "epoch": 0.020335899684891013, "grad_norm": 2.8186724185943604, "learning_rate": 4.064935064935066e-06, "loss": 0.14478683471679688, "memory(GiB)": 18.4, "step": 626, "token_acc": 0.9325396825396826, "train_speed(iter/s)": 0.961781 }, { "epoch": 0.02036838514764643, "grad_norm": 2.154452323913574, "learning_rate": 4.071428571428572e-06, "loss": 0.13256894052028656, "memory(GiB)": 18.4, "step": 627, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.962075 }, { "epoch": 0.020400870610401845, "grad_norm": 2.889845371246338, "learning_rate": 4.077922077922078e-06, "loss": 0.1523556411266327, "memory(GiB)": 18.4, "step": 628, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.962372 }, { "epoch": 0.02043335607315726, "grad_norm": 1.7118773460388184, "learning_rate": 4.084415584415585e-06, "loss": 0.14193856716156006, "memory(GiB)": 18.4, "step": 629, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.962667 }, { "epoch": 0.02046584153591268, "grad_norm": 2.0572402477264404, "learning_rate": 4.0909090909090915e-06, "loss": 0.1305232048034668, "memory(GiB)": 18.4, "step": 630, "token_acc": 0.9291666666666667, "train_speed(iter/s)": 0.962944 }, { "epoch": 0.020498326998668097, "grad_norm": 2.1665873527526855, "learning_rate": 4.097402597402598e-06, "loss": 0.1502055823802948, "memory(GiB)": 18.4, "step": 631, "token_acc": 0.9323308270676691, "train_speed(iter/s)": 0.963251 }, { "epoch": 0.020530812461423513, "grad_norm": 2.6409823894500732, "learning_rate": 4.103896103896105e-06, "loss": 0.14471879601478577, "memory(GiB)": 18.4, "step": 632, "token_acc": 0.9498327759197325, "train_speed(iter/s)": 0.963557 }, { "epoch": 0.02056329792417893, "grad_norm": 1.2212940454483032, "learning_rate": 4.1103896103896104e-06, "loss": 0.1332002580165863, "memory(GiB)": 18.4, "step": 633, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.963841 }, { "epoch": 0.02059578338693435, "grad_norm": 1.2449272871017456, "learning_rate": 4.116883116883118e-06, "loss": 0.12768658995628357, "memory(GiB)": 18.4, "step": 634, "token_acc": 0.9263157894736842, "train_speed(iter/s)": 0.964109 }, { "epoch": 0.020628268849689765, "grad_norm": 4.6277899742126465, "learning_rate": 4.123376623376624e-06, "loss": 0.11917711049318314, "memory(GiB)": 18.4, "step": 635, "token_acc": 0.9296296296296296, "train_speed(iter/s)": 0.964394 }, { "epoch": 0.02066075431244518, "grad_norm": 1.917693018913269, "learning_rate": 4.12987012987013e-06, "loss": 0.1314074546098709, "memory(GiB)": 18.4, "step": 636, "token_acc": 0.96875, "train_speed(iter/s)": 0.964659 }, { "epoch": 0.020693239775200597, "grad_norm": 1.916771411895752, "learning_rate": 4.136363636363637e-06, "loss": 0.12416985630989075, "memory(GiB)": 18.4, "step": 637, "token_acc": 0.952191235059761, "train_speed(iter/s)": 0.964953 }, { "epoch": 0.020725725237956016, "grad_norm": 1.6079760789871216, "learning_rate": 4.1428571428571435e-06, "loss": 0.1328718364238739, "memory(GiB)": 18.4, "step": 638, "token_acc": 0.9409090909090909, "train_speed(iter/s)": 0.965212 }, { "epoch": 0.020758210700711432, "grad_norm": 1.4648782014846802, "learning_rate": 4.14935064935065e-06, "loss": 0.1319912225008011, "memory(GiB)": 18.4, "step": 639, "token_acc": 0.9490740740740741, "train_speed(iter/s)": 0.965472 }, { "epoch": 0.02079069616346685, "grad_norm": 2.258331060409546, "learning_rate": 4.155844155844157e-06, "loss": 0.1337721198797226, "memory(GiB)": 18.4, "step": 640, "token_acc": 0.9649805447470817, "train_speed(iter/s)": 0.965761 }, { "epoch": 0.020823181626222265, "grad_norm": 2.191901445388794, "learning_rate": 4.1623376623376625e-06, "loss": 0.1253553181886673, "memory(GiB)": 18.4, "step": 641, "token_acc": 0.941908713692946, "train_speed(iter/s)": 0.966026 }, { "epoch": 0.020855667088977684, "grad_norm": 2.562286376953125, "learning_rate": 4.168831168831169e-06, "loss": 0.12472181022167206, "memory(GiB)": 18.4, "step": 642, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.966297 }, { "epoch": 0.0208881525517331, "grad_norm": 2.379490375518799, "learning_rate": 4.175324675324676e-06, "loss": 0.1357421576976776, "memory(GiB)": 18.4, "step": 643, "token_acc": 0.9104477611940298, "train_speed(iter/s)": 0.966591 }, { "epoch": 0.020920638014488516, "grad_norm": 2.685410499572754, "learning_rate": 4.181818181818182e-06, "loss": 0.1418502926826477, "memory(GiB)": 18.4, "step": 644, "token_acc": 0.9504132231404959, "train_speed(iter/s)": 0.96688 }, { "epoch": 0.020953123477243932, "grad_norm": 2.75215482711792, "learning_rate": 4.188311688311689e-06, "loss": 0.13369368016719818, "memory(GiB)": 18.4, "step": 645, "token_acc": 0.9640718562874252, "train_speed(iter/s)": 0.967157 }, { "epoch": 0.020985608939999352, "grad_norm": 2.100287437438965, "learning_rate": 4.194805194805195e-06, "loss": 0.14634230732917786, "memory(GiB)": 18.4, "step": 646, "token_acc": 0.9494163424124513, "train_speed(iter/s)": 0.967461 }, { "epoch": 0.021018094402754768, "grad_norm": 1.575284481048584, "learning_rate": 4.201298701298701e-06, "loss": 0.13621786236763, "memory(GiB)": 18.4, "step": 647, "token_acc": 0.953125, "train_speed(iter/s)": 0.967726 }, { "epoch": 0.021050579865510184, "grad_norm": 1.9755064249038696, "learning_rate": 4.207792207792208e-06, "loss": 0.1382291465997696, "memory(GiB)": 18.4, "step": 648, "token_acc": 0.9575289575289575, "train_speed(iter/s)": 0.967995 }, { "epoch": 0.0210830653282656, "grad_norm": 3.0388832092285156, "learning_rate": 4.2142857142857145e-06, "loss": 0.1437102109193802, "memory(GiB)": 18.4, "step": 649, "token_acc": 0.9477911646586346, "train_speed(iter/s)": 0.968269 }, { "epoch": 0.02111555079102102, "grad_norm": 1.8949596881866455, "learning_rate": 4.220779220779221e-06, "loss": 0.13397137820720673, "memory(GiB)": 18.4, "step": 650, "token_acc": 0.9299610894941635, "train_speed(iter/s)": 0.968561 }, { "epoch": 0.021148036253776436, "grad_norm": 2.5269477367401123, "learning_rate": 4.227272727272728e-06, "loss": 0.14579366147518158, "memory(GiB)": 18.4, "step": 651, "token_acc": 0.9409090909090909, "train_speed(iter/s)": 0.968828 }, { "epoch": 0.021180521716531852, "grad_norm": 1.9753761291503906, "learning_rate": 4.2337662337662334e-06, "loss": 0.11859243363142014, "memory(GiB)": 18.4, "step": 652, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.969061 }, { "epoch": 0.021213007179287268, "grad_norm": 1.165493130683899, "learning_rate": 4.240259740259741e-06, "loss": 0.11536288261413574, "memory(GiB)": 18.4, "step": 653, "token_acc": 0.9446808510638298, "train_speed(iter/s)": 0.969256 }, { "epoch": 0.021245492642042688, "grad_norm": 1.4718811511993408, "learning_rate": 4.246753246753247e-06, "loss": 0.14099857211112976, "memory(GiB)": 18.4, "step": 654, "token_acc": 0.9479166666666666, "train_speed(iter/s)": 0.969474 }, { "epoch": 0.021277978104798104, "grad_norm": 2.447542905807495, "learning_rate": 4.253246753246753e-06, "loss": 0.11980672180652618, "memory(GiB)": 18.4, "step": 655, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.969688 }, { "epoch": 0.02131046356755352, "grad_norm": 2.052361249923706, "learning_rate": 4.25974025974026e-06, "loss": 0.1480347216129303, "memory(GiB)": 18.4, "step": 656, "token_acc": 0.9477911646586346, "train_speed(iter/s)": 0.969873 }, { "epoch": 0.021342949030308936, "grad_norm": 1.972739815711975, "learning_rate": 4.2662337662337665e-06, "loss": 0.15879634022712708, "memory(GiB)": 18.4, "step": 657, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.970053 }, { "epoch": 0.021375434493064355, "grad_norm": 1.3225791454315186, "learning_rate": 4.272727272727273e-06, "loss": 0.1361464262008667, "memory(GiB)": 18.4, "step": 658, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.970265 }, { "epoch": 0.02140791995581977, "grad_norm": 2.2138750553131104, "learning_rate": 4.27922077922078e-06, "loss": 0.1257157325744629, "memory(GiB)": 18.4, "step": 659, "token_acc": 0.9147286821705426, "train_speed(iter/s)": 0.970422 }, { "epoch": 0.021440405418575188, "grad_norm": 1.867083191871643, "learning_rate": 4.2857142857142855e-06, "loss": 0.1272042691707611, "memory(GiB)": 18.4, "step": 660, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.970594 }, { "epoch": 0.021472890881330604, "grad_norm": 1.7099072933197021, "learning_rate": 4.292207792207793e-06, "loss": 0.1350218653678894, "memory(GiB)": 18.4, "step": 661, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.970783 }, { "epoch": 0.021505376344086023, "grad_norm": 1.454772710800171, "learning_rate": 4.298701298701299e-06, "loss": 0.13662555813789368, "memory(GiB)": 18.4, "step": 662, "token_acc": 0.9365671641791045, "train_speed(iter/s)": 0.97096 }, { "epoch": 0.02153786180684144, "grad_norm": 2.3759753704071045, "learning_rate": 4.305194805194805e-06, "loss": 0.12961630523204803, "memory(GiB)": 18.4, "step": 663, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.971154 }, { "epoch": 0.021570347269596855, "grad_norm": 1.2429311275482178, "learning_rate": 4.311688311688312e-06, "loss": 0.12961307168006897, "memory(GiB)": 18.4, "step": 664, "token_acc": 0.9282700421940928, "train_speed(iter/s)": 0.971318 }, { "epoch": 0.02160283273235227, "grad_norm": 1.7367992401123047, "learning_rate": 4.3181818181818185e-06, "loss": 0.14512911438941956, "memory(GiB)": 18.4, "step": 665, "token_acc": 0.9340659340659341, "train_speed(iter/s)": 0.9715 }, { "epoch": 0.02163531819510769, "grad_norm": 1.3625614643096924, "learning_rate": 4.324675324675325e-06, "loss": 0.13038796186447144, "memory(GiB)": 18.4, "step": 666, "token_acc": 0.9525862068965517, "train_speed(iter/s)": 0.971686 }, { "epoch": 0.021667803657863107, "grad_norm": 3.0212438106536865, "learning_rate": 4.331168831168832e-06, "loss": 0.12496817111968994, "memory(GiB)": 18.4, "step": 667, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.971861 }, { "epoch": 0.021700289120618523, "grad_norm": 1.4211980104446411, "learning_rate": 4.3376623376623375e-06, "loss": 0.11950061470270157, "memory(GiB)": 18.4, "step": 668, "token_acc": 0.9414225941422594, "train_speed(iter/s)": 0.972033 }, { "epoch": 0.02173277458337394, "grad_norm": 1.1058924198150635, "learning_rate": 4.344155844155845e-06, "loss": 0.1306895911693573, "memory(GiB)": 18.4, "step": 669, "token_acc": 0.9421487603305785, "train_speed(iter/s)": 0.972194 }, { "epoch": 0.02176526004612936, "grad_norm": 1.450027585029602, "learning_rate": 4.350649350649351e-06, "loss": 0.1279670000076294, "memory(GiB)": 18.4, "step": 670, "token_acc": 0.9241379310344827, "train_speed(iter/s)": 0.972329 }, { "epoch": 0.021797745508884775, "grad_norm": 1.6039588451385498, "learning_rate": 4.357142857142857e-06, "loss": 0.1290554404258728, "memory(GiB)": 18.4, "step": 671, "token_acc": 0.9426229508196722, "train_speed(iter/s)": 0.972515 }, { "epoch": 0.02183023097164019, "grad_norm": 1.8293240070343018, "learning_rate": 4.363636363636364e-06, "loss": 0.14449749886989594, "memory(GiB)": 18.4, "step": 672, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.972681 }, { "epoch": 0.021862716434395607, "grad_norm": 2.9082162380218506, "learning_rate": 4.3701298701298705e-06, "loss": 0.14544397592544556, "memory(GiB)": 18.4, "step": 673, "token_acc": 0.946236559139785, "train_speed(iter/s)": 0.972831 }, { "epoch": 0.021895201897151027, "grad_norm": 1.5920289754867554, "learning_rate": 4.376623376623377e-06, "loss": 0.1297915279865265, "memory(GiB)": 18.4, "step": 674, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.973034 }, { "epoch": 0.021927687359906443, "grad_norm": 2.2066738605499268, "learning_rate": 4.383116883116884e-06, "loss": 0.14252160489559174, "memory(GiB)": 18.4, "step": 675, "token_acc": 0.9292035398230089, "train_speed(iter/s)": 0.973203 }, { "epoch": 0.02196017282266186, "grad_norm": 1.6389966011047363, "learning_rate": 4.3896103896103895e-06, "loss": 0.13371361792087555, "memory(GiB)": 18.4, "step": 676, "token_acc": 0.9184549356223176, "train_speed(iter/s)": 0.973362 }, { "epoch": 0.021992658285417275, "grad_norm": 1.4900966882705688, "learning_rate": 4.396103896103897e-06, "loss": 0.12182532250881195, "memory(GiB)": 18.4, "step": 677, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.973554 }, { "epoch": 0.022025143748172694, "grad_norm": 1.768061876296997, "learning_rate": 4.402597402597403e-06, "loss": 0.15169882774353027, "memory(GiB)": 18.4, "step": 678, "token_acc": 0.927038626609442, "train_speed(iter/s)": 0.973749 }, { "epoch": 0.02205762921092811, "grad_norm": 1.7222323417663574, "learning_rate": 4.409090909090909e-06, "loss": 0.12969818711280823, "memory(GiB)": 18.4, "step": 679, "token_acc": 0.94921875, "train_speed(iter/s)": 0.973947 }, { "epoch": 0.022090114673683527, "grad_norm": 2.2077834606170654, "learning_rate": 4.415584415584416e-06, "loss": 0.1362258940935135, "memory(GiB)": 18.4, "step": 680, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.974219 }, { "epoch": 0.022122600136438943, "grad_norm": 1.6929327249526978, "learning_rate": 4.4220779220779225e-06, "loss": 0.14531674981117249, "memory(GiB)": 18.4, "step": 681, "token_acc": 0.9393939393939394, "train_speed(iter/s)": 0.97445 }, { "epoch": 0.022155085599194362, "grad_norm": 1.7953084707260132, "learning_rate": 4.428571428571429e-06, "loss": 0.13564790785312653, "memory(GiB)": 18.4, "step": 682, "token_acc": 0.9315589353612167, "train_speed(iter/s)": 0.974717 }, { "epoch": 0.02218757106194978, "grad_norm": 6.527939796447754, "learning_rate": 4.435064935064936e-06, "loss": 0.1203548014163971, "memory(GiB)": 18.4, "step": 683, "token_acc": 0.95703125, "train_speed(iter/s)": 0.974993 }, { "epoch": 0.022220056524705194, "grad_norm": 1.6386873722076416, "learning_rate": 4.4415584415584415e-06, "loss": 0.11462704092264175, "memory(GiB)": 18.4, "step": 684, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.975261 }, { "epoch": 0.02225254198746061, "grad_norm": 3.051670551300049, "learning_rate": 4.448051948051948e-06, "loss": 0.14608708024024963, "memory(GiB)": 18.4, "step": 685, "token_acc": 0.9339622641509434, "train_speed(iter/s)": 0.975514 }, { "epoch": 0.02228502745021603, "grad_norm": 1.7090394496917725, "learning_rate": 4.454545454545455e-06, "loss": 0.12292235344648361, "memory(GiB)": 18.4, "step": 686, "token_acc": 0.9367588932806324, "train_speed(iter/s)": 0.975751 }, { "epoch": 0.022317512912971446, "grad_norm": 1.733617901802063, "learning_rate": 4.461038961038961e-06, "loss": 0.13372541964054108, "memory(GiB)": 18.4, "step": 687, "token_acc": 0.94, "train_speed(iter/s)": 0.976 }, { "epoch": 0.022349998375726862, "grad_norm": 1.803652048110962, "learning_rate": 4.467532467532468e-06, "loss": 0.1225358322262764, "memory(GiB)": 18.4, "step": 688, "token_acc": 0.9409448818897638, "train_speed(iter/s)": 0.97625 }, { "epoch": 0.02238248383848228, "grad_norm": 1.842800259590149, "learning_rate": 4.4740259740259745e-06, "loss": 0.13185393810272217, "memory(GiB)": 18.4, "step": 689, "token_acc": 0.9416666666666667, "train_speed(iter/s)": 0.976487 }, { "epoch": 0.022414969301237698, "grad_norm": 1.8290425539016724, "learning_rate": 4.48051948051948e-06, "loss": 0.13162526488304138, "memory(GiB)": 18.4, "step": 690, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.976741 }, { "epoch": 0.022447454763993114, "grad_norm": 2.569114923477173, "learning_rate": 4.487012987012988e-06, "loss": 0.14589568972587585, "memory(GiB)": 18.4, "step": 691, "token_acc": 0.9390243902439024, "train_speed(iter/s)": 0.976977 }, { "epoch": 0.02247994022674853, "grad_norm": 1.439103364944458, "learning_rate": 4.4935064935064935e-06, "loss": 0.11947663128376007, "memory(GiB)": 18.4, "step": 692, "token_acc": 0.9490740740740741, "train_speed(iter/s)": 0.977235 }, { "epoch": 0.022512425689503946, "grad_norm": 1.3703852891921997, "learning_rate": 4.5e-06, "loss": 0.1129908561706543, "memory(GiB)": 18.4, "step": 693, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.977474 }, { "epoch": 0.022544911152259366, "grad_norm": 2.8113949298858643, "learning_rate": 4.506493506493507e-06, "loss": 0.12058036774396896, "memory(GiB)": 18.4, "step": 694, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.977723 }, { "epoch": 0.02257739661501478, "grad_norm": 2.7922983169555664, "learning_rate": 4.512987012987013e-06, "loss": 0.13641513884067535, "memory(GiB)": 18.4, "step": 695, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 0.977989 }, { "epoch": 0.022609882077770198, "grad_norm": 2.3089351654052734, "learning_rate": 4.51948051948052e-06, "loss": 0.12821899354457855, "memory(GiB)": 18.4, "step": 696, "token_acc": 0.9409282700421941, "train_speed(iter/s)": 0.978238 }, { "epoch": 0.022642367540525614, "grad_norm": 1.4552056789398193, "learning_rate": 4.5259740259740265e-06, "loss": 0.11666131019592285, "memory(GiB)": 18.4, "step": 697, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.978459 }, { "epoch": 0.022674853003281033, "grad_norm": 1.6683647632598877, "learning_rate": 4.532467532467532e-06, "loss": 0.1317666471004486, "memory(GiB)": 18.4, "step": 698, "token_acc": 0.9362745098039216, "train_speed(iter/s)": 0.97872 }, { "epoch": 0.02270733846603645, "grad_norm": 1.4124629497528076, "learning_rate": 4.53896103896104e-06, "loss": 0.12112519145011902, "memory(GiB)": 18.4, "step": 699, "token_acc": 0.9360730593607306, "train_speed(iter/s)": 0.978944 }, { "epoch": 0.022739823928791866, "grad_norm": 3.2214198112487793, "learning_rate": 4.5454545454545455e-06, "loss": 0.15263351798057556, "memory(GiB)": 18.4, "step": 700, "token_acc": 0.9462809917355371, "train_speed(iter/s)": 0.979205 }, { "epoch": 0.02277230939154728, "grad_norm": 5.214594841003418, "learning_rate": 4.551948051948052e-06, "loss": 0.14319467544555664, "memory(GiB)": 18.4, "step": 701, "token_acc": 0.9279279279279279, "train_speed(iter/s)": 0.97947 }, { "epoch": 0.0228047948543027, "grad_norm": 1.857222557067871, "learning_rate": 4.558441558441559e-06, "loss": 0.1333225816488266, "memory(GiB)": 18.4, "step": 702, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.979711 }, { "epoch": 0.022837280317058117, "grad_norm": 1.7265225648880005, "learning_rate": 4.564935064935065e-06, "loss": 0.12834376096725464, "memory(GiB)": 18.4, "step": 703, "token_acc": 0.9300699300699301, "train_speed(iter/s)": 0.979964 }, { "epoch": 0.022869765779813533, "grad_norm": 1.5168421268463135, "learning_rate": 4.571428571428572e-06, "loss": 0.1422974169254303, "memory(GiB)": 18.4, "step": 704, "token_acc": 0.944206008583691, "train_speed(iter/s)": 0.980218 }, { "epoch": 0.02290225124256895, "grad_norm": 3.5286173820495605, "learning_rate": 4.5779220779220786e-06, "loss": 0.13715538382530212, "memory(GiB)": 18.4, "step": 705, "token_acc": 0.9330855018587361, "train_speed(iter/s)": 0.980385 }, { "epoch": 0.02293473670532437, "grad_norm": 1.4224106073379517, "learning_rate": 4.584415584415584e-06, "loss": 0.11379346251487732, "memory(GiB)": 18.4, "step": 706, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.980583 }, { "epoch": 0.022967222168079785, "grad_norm": 1.516851544380188, "learning_rate": 4.590909090909092e-06, "loss": 0.1276238113641739, "memory(GiB)": 18.4, "step": 707, "token_acc": 0.946078431372549, "train_speed(iter/s)": 0.980776 }, { "epoch": 0.0229997076308352, "grad_norm": 1.4114338159561157, "learning_rate": 4.5974025974025975e-06, "loss": 0.13727155327796936, "memory(GiB)": 18.4, "step": 708, "token_acc": 0.9342105263157895, "train_speed(iter/s)": 0.980919 }, { "epoch": 0.023032193093590617, "grad_norm": 1.4322707653045654, "learning_rate": 4.603896103896104e-06, "loss": 0.12209237366914749, "memory(GiB)": 18.4, "step": 709, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.98111 }, { "epoch": 0.023064678556346037, "grad_norm": 1.1662241220474243, "learning_rate": 4.610389610389611e-06, "loss": 0.11518315970897675, "memory(GiB)": 18.4, "step": 710, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.981262 }, { "epoch": 0.023097164019101453, "grad_norm": 1.514088749885559, "learning_rate": 4.616883116883117e-06, "loss": 0.13543514907360077, "memory(GiB)": 18.4, "step": 711, "token_acc": 0.9296296296296296, "train_speed(iter/s)": 0.981416 }, { "epoch": 0.02312964948185687, "grad_norm": 2.3001482486724854, "learning_rate": 4.623376623376624e-06, "loss": 0.14007630944252014, "memory(GiB)": 18.4, "step": 712, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 0.981621 }, { "epoch": 0.023162134944612285, "grad_norm": 1.9693948030471802, "learning_rate": 4.6298701298701306e-06, "loss": 0.12112469971179962, "memory(GiB)": 18.4, "step": 713, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.981782 }, { "epoch": 0.023194620407367705, "grad_norm": 1.419751524925232, "learning_rate": 4.636363636363636e-06, "loss": 0.11433538049459457, "memory(GiB)": 18.4, "step": 714, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.981923 }, { "epoch": 0.02322710587012312, "grad_norm": 1.32814621925354, "learning_rate": 4.642857142857144e-06, "loss": 0.11970062553882599, "memory(GiB)": 18.4, "step": 715, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.982115 }, { "epoch": 0.023259591332878537, "grad_norm": 1.7625035047531128, "learning_rate": 4.6493506493506495e-06, "loss": 0.11969545483589172, "memory(GiB)": 18.4, "step": 716, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.98227 }, { "epoch": 0.023292076795633953, "grad_norm": 1.3544669151306152, "learning_rate": 4.655844155844156e-06, "loss": 0.12271414697170258, "memory(GiB)": 18.4, "step": 717, "token_acc": 0.9375, "train_speed(iter/s)": 0.982441 }, { "epoch": 0.023324562258389372, "grad_norm": 1.471563458442688, "learning_rate": 4.662337662337663e-06, "loss": 0.1341915726661682, "memory(GiB)": 18.4, "step": 718, "token_acc": 0.9409090909090909, "train_speed(iter/s)": 0.982622 }, { "epoch": 0.02335704772114479, "grad_norm": 1.8874025344848633, "learning_rate": 4.668831168831169e-06, "loss": 0.12278202176094055, "memory(GiB)": 18.4, "step": 719, "token_acc": 0.9186046511627907, "train_speed(iter/s)": 0.98278 }, { "epoch": 0.023389533183900205, "grad_norm": 1.384177565574646, "learning_rate": 4.675324675324676e-06, "loss": 0.13749167323112488, "memory(GiB)": 18.4, "step": 720, "token_acc": 0.9617224880382775, "train_speed(iter/s)": 0.982929 }, { "epoch": 0.02342201864665562, "grad_norm": 1.3915249109268188, "learning_rate": 4.681818181818183e-06, "loss": 0.12026362121105194, "memory(GiB)": 18.4, "step": 721, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.983087 }, { "epoch": 0.02345450410941104, "grad_norm": 2.183145761489868, "learning_rate": 4.688311688311688e-06, "loss": 0.12708023190498352, "memory(GiB)": 18.4, "step": 722, "token_acc": 0.941908713692946, "train_speed(iter/s)": 0.983235 }, { "epoch": 0.023486989572166456, "grad_norm": 2.4014534950256348, "learning_rate": 4.694805194805195e-06, "loss": 0.15049168467521667, "memory(GiB)": 18.4, "step": 723, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.983372 }, { "epoch": 0.023519475034921872, "grad_norm": 1.3622794151306152, "learning_rate": 4.7012987012987016e-06, "loss": 0.1214037537574768, "memory(GiB)": 18.4, "step": 724, "token_acc": 0.9357429718875502, "train_speed(iter/s)": 0.983563 }, { "epoch": 0.02355196049767729, "grad_norm": 1.2135623693466187, "learning_rate": 4.707792207792208e-06, "loss": 0.13324381411075592, "memory(GiB)": 18.4, "step": 725, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.983726 }, { "epoch": 0.023584445960432708, "grad_norm": 1.1665124893188477, "learning_rate": 4.714285714285715e-06, "loss": 0.12944522500038147, "memory(GiB)": 18.4, "step": 726, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.983909 }, { "epoch": 0.023616931423188124, "grad_norm": 2.6846117973327637, "learning_rate": 4.720779220779221e-06, "loss": 0.1298886239528656, "memory(GiB)": 18.4, "step": 727, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.984053 }, { "epoch": 0.02364941688594354, "grad_norm": 2.4349663257598877, "learning_rate": 4.727272727272728e-06, "loss": 0.12768661975860596, "memory(GiB)": 18.4, "step": 728, "token_acc": 0.9597069597069597, "train_speed(iter/s)": 0.984217 }, { "epoch": 0.023681902348698956, "grad_norm": 2.227724075317383, "learning_rate": 4.733766233766235e-06, "loss": 0.13428634405136108, "memory(GiB)": 18.4, "step": 729, "token_acc": 0.9439252336448598, "train_speed(iter/s)": 0.984384 }, { "epoch": 0.023714387811454376, "grad_norm": 1.236191987991333, "learning_rate": 4.74025974025974e-06, "loss": 0.14410418272018433, "memory(GiB)": 18.4, "step": 730, "token_acc": 0.9407114624505929, "train_speed(iter/s)": 0.984544 }, { "epoch": 0.023746873274209792, "grad_norm": 1.0531202554702759, "learning_rate": 4.746753246753247e-06, "loss": 0.1310749650001526, "memory(GiB)": 18.4, "step": 731, "token_acc": 0.9216589861751152, "train_speed(iter/s)": 0.984718 }, { "epoch": 0.023779358736965208, "grad_norm": 1.2541375160217285, "learning_rate": 4.7532467532467536e-06, "loss": 0.13051092624664307, "memory(GiB)": 18.4, "step": 732, "token_acc": 0.945273631840796, "train_speed(iter/s)": 0.984891 }, { "epoch": 0.023811844199720624, "grad_norm": 1.6375279426574707, "learning_rate": 4.75974025974026e-06, "loss": 0.12946540117263794, "memory(GiB)": 18.4, "step": 733, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.985045 }, { "epoch": 0.023844329662476044, "grad_norm": 2.1811578273773193, "learning_rate": 4.766233766233767e-06, "loss": 0.1242784857749939, "memory(GiB)": 18.4, "step": 734, "token_acc": 0.9434628975265018, "train_speed(iter/s)": 0.985232 }, { "epoch": 0.02387681512523146, "grad_norm": 2.1442666053771973, "learning_rate": 4.772727272727273e-06, "loss": 0.12598353624343872, "memory(GiB)": 18.4, "step": 735, "token_acc": 0.9414225941422594, "train_speed(iter/s)": 0.985418 }, { "epoch": 0.023909300587986876, "grad_norm": 1.7355177402496338, "learning_rate": 4.779220779220779e-06, "loss": 0.13972237706184387, "memory(GiB)": 18.4, "step": 736, "token_acc": 0.9313304721030042, "train_speed(iter/s)": 0.98558 }, { "epoch": 0.023941786050742292, "grad_norm": 1.55962336063385, "learning_rate": 4.785714285714287e-06, "loss": 0.12149414420127869, "memory(GiB)": 18.4, "step": 737, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.985745 }, { "epoch": 0.02397427151349771, "grad_norm": 1.8325881958007812, "learning_rate": 4.792207792207792e-06, "loss": 0.15115007758140564, "memory(GiB)": 18.4, "step": 738, "token_acc": 0.9405940594059405, "train_speed(iter/s)": 0.985892 }, { "epoch": 0.024006756976253128, "grad_norm": 1.8758331537246704, "learning_rate": 4.798701298701299e-06, "loss": 0.14072644710540771, "memory(GiB)": 18.4, "step": 739, "token_acc": 0.9507575757575758, "train_speed(iter/s)": 0.986058 }, { "epoch": 0.024039242439008544, "grad_norm": 1.4173588752746582, "learning_rate": 4.805194805194806e-06, "loss": 0.13340219855308533, "memory(GiB)": 18.4, "step": 740, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.986164 }, { "epoch": 0.02407172790176396, "grad_norm": 1.1218287944793701, "learning_rate": 4.811688311688312e-06, "loss": 0.13594618439674377, "memory(GiB)": 18.4, "step": 741, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.986367 }, { "epoch": 0.02410421336451938, "grad_norm": 1.78139328956604, "learning_rate": 4.818181818181819e-06, "loss": 0.11692480742931366, "memory(GiB)": 18.4, "step": 742, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.986592 }, { "epoch": 0.024136698827274795, "grad_norm": 1.8472625017166138, "learning_rate": 4.824675324675325e-06, "loss": 0.12840233743190765, "memory(GiB)": 18.4, "step": 743, "token_acc": 0.9372197309417041, "train_speed(iter/s)": 0.98683 }, { "epoch": 0.02416918429003021, "grad_norm": 1.97733736038208, "learning_rate": 4.831168831168831e-06, "loss": 0.12121787667274475, "memory(GiB)": 18.4, "step": 744, "token_acc": 0.9324894514767933, "train_speed(iter/s)": 0.987037 }, { "epoch": 0.024201669752785628, "grad_norm": 1.2237697839736938, "learning_rate": 4.837662337662339e-06, "loss": 0.1187613308429718, "memory(GiB)": 18.4, "step": 745, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.987266 }, { "epoch": 0.024234155215541047, "grad_norm": 1.5421807765960693, "learning_rate": 4.844155844155844e-06, "loss": 0.11885815113782883, "memory(GiB)": 18.4, "step": 746, "token_acc": 0.9660194174757282, "train_speed(iter/s)": 0.987519 }, { "epoch": 0.024266640678296463, "grad_norm": 1.3534233570098877, "learning_rate": 4.850649350649351e-06, "loss": 0.11641678214073181, "memory(GiB)": 18.4, "step": 747, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.98775 }, { "epoch": 0.02429912614105188, "grad_norm": 1.4405906200408936, "learning_rate": 4.857142857142858e-06, "loss": 0.12814852595329285, "memory(GiB)": 18.4, "step": 748, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.987962 }, { "epoch": 0.024331611603807295, "grad_norm": 1.4574737548828125, "learning_rate": 4.863636363636364e-06, "loss": 0.12337721884250641, "memory(GiB)": 18.4, "step": 749, "token_acc": 0.9330985915492958, "train_speed(iter/s)": 0.988199 }, { "epoch": 0.024364097066562715, "grad_norm": 1.4788663387298584, "learning_rate": 4.870129870129871e-06, "loss": 0.13267071545124054, "memory(GiB)": 18.4, "step": 750, "token_acc": 0.9300411522633745, "train_speed(iter/s)": 0.988411 }, { "epoch": 0.02439658252931813, "grad_norm": 1.2936866283416748, "learning_rate": 4.876623376623377e-06, "loss": 0.12078171223402023, "memory(GiB)": 18.4, "step": 751, "token_acc": 0.9465648854961832, "train_speed(iter/s)": 0.988618 }, { "epoch": 0.024429067992073547, "grad_norm": 2.178541898727417, "learning_rate": 4.883116883116883e-06, "loss": 0.1297510862350464, "memory(GiB)": 18.4, "step": 752, "token_acc": 0.9530685920577617, "train_speed(iter/s)": 0.98885 }, { "epoch": 0.024461553454828963, "grad_norm": 1.5810322761535645, "learning_rate": 4.889610389610391e-06, "loss": 0.13488468527793884, "memory(GiB)": 18.4, "step": 753, "token_acc": 0.9389671361502347, "train_speed(iter/s)": 0.989079 }, { "epoch": 0.024494038917584383, "grad_norm": 1.3292334079742432, "learning_rate": 4.896103896103896e-06, "loss": 0.14084678888320923, "memory(GiB)": 18.4, "step": 754, "token_acc": 0.9613259668508287, "train_speed(iter/s)": 0.989294 }, { "epoch": 0.0245265243803398, "grad_norm": 1.7491446733474731, "learning_rate": 4.902597402597403e-06, "loss": 0.12733925879001617, "memory(GiB)": 18.4, "step": 755, "token_acc": 0.9372937293729373, "train_speed(iter/s)": 0.989517 }, { "epoch": 0.024559009843095215, "grad_norm": 1.4110761880874634, "learning_rate": 4.90909090909091e-06, "loss": 0.15123283863067627, "memory(GiB)": 18.4, "step": 756, "token_acc": 0.9330855018587361, "train_speed(iter/s)": 0.989712 }, { "epoch": 0.02459149530585063, "grad_norm": 1.4154174327850342, "learning_rate": 4.915584415584416e-06, "loss": 0.1258794516324997, "memory(GiB)": 18.4, "step": 757, "token_acc": 0.9452054794520548, "train_speed(iter/s)": 0.989951 }, { "epoch": 0.02462398076860605, "grad_norm": 1.3245508670806885, "learning_rate": 4.922077922077923e-06, "loss": 0.13405510783195496, "memory(GiB)": 18.4, "step": 758, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.990168 }, { "epoch": 0.024656466231361467, "grad_norm": 1.3446342945098877, "learning_rate": 4.928571428571429e-06, "loss": 0.12651222944259644, "memory(GiB)": 18.4, "step": 759, "token_acc": 0.9288702928870293, "train_speed(iter/s)": 0.990398 }, { "epoch": 0.024688951694116883, "grad_norm": 1.2653632164001465, "learning_rate": 4.935064935064935e-06, "loss": 0.1283072531223297, "memory(GiB)": 18.4, "step": 760, "token_acc": 0.9402985074626866, "train_speed(iter/s)": 0.990547 }, { "epoch": 0.0247214371568723, "grad_norm": 1.3722578287124634, "learning_rate": 4.941558441558443e-06, "loss": 0.13480696082115173, "memory(GiB)": 18.4, "step": 761, "token_acc": 0.9252669039145908, "train_speed(iter/s)": 0.990703 }, { "epoch": 0.02475392261962772, "grad_norm": 1.6188082695007324, "learning_rate": 4.948051948051948e-06, "loss": 0.12684646248817444, "memory(GiB)": 18.4, "step": 762, "token_acc": 0.9515418502202643, "train_speed(iter/s)": 0.99084 }, { "epoch": 0.024786408082383134, "grad_norm": 1.1995824575424194, "learning_rate": 4.954545454545455e-06, "loss": 0.12157900631427765, "memory(GiB)": 18.4, "step": 763, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.990968 }, { "epoch": 0.02481889354513855, "grad_norm": 1.6223152875900269, "learning_rate": 4.961038961038962e-06, "loss": 0.12779712677001953, "memory(GiB)": 18.4, "step": 764, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.991113 }, { "epoch": 0.024851379007893967, "grad_norm": 1.45064115524292, "learning_rate": 4.967532467532468e-06, "loss": 0.13159094750881195, "memory(GiB)": 18.4, "step": 765, "token_acc": 0.9442508710801394, "train_speed(iter/s)": 0.991252 }, { "epoch": 0.024883864470649386, "grad_norm": 0.945288896560669, "learning_rate": 4.974025974025975e-06, "loss": 0.11715640127658844, "memory(GiB)": 18.4, "step": 766, "token_acc": 0.9466666666666667, "train_speed(iter/s)": 0.991408 }, { "epoch": 0.024916349933404802, "grad_norm": 1.732515573501587, "learning_rate": 4.9805194805194814e-06, "loss": 0.1432270109653473, "memory(GiB)": 18.4, "step": 767, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.991526 }, { "epoch": 0.02494883539616022, "grad_norm": 1.2015422582626343, "learning_rate": 4.987012987012987e-06, "loss": 0.11105281114578247, "memory(GiB)": 18.4, "step": 768, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.991652 }, { "epoch": 0.024981320858915634, "grad_norm": 1.4651365280151367, "learning_rate": 4.993506493506494e-06, "loss": 0.1272260844707489, "memory(GiB)": 18.4, "step": 769, "token_acc": 0.9358974358974359, "train_speed(iter/s)": 0.991812 }, { "epoch": 0.025013806321671054, "grad_norm": 1.3341342210769653, "learning_rate": 5e-06, "loss": 0.10961878299713135, "memory(GiB)": 18.4, "step": 770, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.991941 }, { "epoch": 0.02504629178442647, "grad_norm": 1.5533872842788696, "learning_rate": 5.006493506493507e-06, "loss": 0.11577486246824265, "memory(GiB)": 18.4, "step": 771, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.992065 }, { "epoch": 0.025078777247181886, "grad_norm": 1.3255159854888916, "learning_rate": 5.012987012987013e-06, "loss": 0.1228143721818924, "memory(GiB)": 18.4, "step": 772, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.992207 }, { "epoch": 0.025111262709937302, "grad_norm": 1.4924116134643555, "learning_rate": 5.019480519480519e-06, "loss": 0.1282653957605362, "memory(GiB)": 18.4, "step": 773, "token_acc": 0.9393939393939394, "train_speed(iter/s)": 0.992335 }, { "epoch": 0.02514374817269272, "grad_norm": 1.5341670513153076, "learning_rate": 5.025974025974026e-06, "loss": 0.1345045417547226, "memory(GiB)": 18.4, "step": 774, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.992492 }, { "epoch": 0.025176233635448138, "grad_norm": 1.7496485710144043, "learning_rate": 5.0324675324675334e-06, "loss": 0.11894910782575607, "memory(GiB)": 18.4, "step": 775, "token_acc": 0.9471830985915493, "train_speed(iter/s)": 0.992619 }, { "epoch": 0.025208719098203554, "grad_norm": 1.4154289960861206, "learning_rate": 5.038961038961039e-06, "loss": 0.11404263228178024, "memory(GiB)": 18.4, "step": 776, "token_acc": 0.9400921658986175, "train_speed(iter/s)": 0.992785 }, { "epoch": 0.02524120456095897, "grad_norm": 2.17484188079834, "learning_rate": 5.045454545454546e-06, "loss": 0.12223901599645615, "memory(GiB)": 18.4, "step": 777, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.992993 }, { "epoch": 0.02527369002371439, "grad_norm": 1.6437907218933105, "learning_rate": 5.051948051948052e-06, "loss": 0.12437193840742111, "memory(GiB)": 18.4, "step": 778, "token_acc": 0.9372693726937269, "train_speed(iter/s)": 0.993214 }, { "epoch": 0.025306175486469806, "grad_norm": 1.1620230674743652, "learning_rate": 5.058441558441559e-06, "loss": 0.1151333600282669, "memory(GiB)": 18.4, "step": 779, "token_acc": 0.9481865284974094, "train_speed(iter/s)": 0.993417 }, { "epoch": 0.02533866094922522, "grad_norm": 1.5161999464035034, "learning_rate": 5.064935064935065e-06, "loss": 0.14683105051517487, "memory(GiB)": 18.4, "step": 780, "token_acc": 0.9526627218934911, "train_speed(iter/s)": 0.993614 }, { "epoch": 0.025371146411980638, "grad_norm": 1.4644652605056763, "learning_rate": 5.071428571428571e-06, "loss": 0.11563290655612946, "memory(GiB)": 18.4, "step": 781, "token_acc": 0.9434782608695652, "train_speed(iter/s)": 0.993773 }, { "epoch": 0.025403631874736057, "grad_norm": 1.0425831079483032, "learning_rate": 5.077922077922078e-06, "loss": 0.12097755819559097, "memory(GiB)": 18.4, "step": 782, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 0.993908 }, { "epoch": 0.025436117337491473, "grad_norm": 1.0926579236984253, "learning_rate": 5.0844155844155855e-06, "loss": 0.13440492749214172, "memory(GiB)": 18.4, "step": 783, "token_acc": 0.946078431372549, "train_speed(iter/s)": 0.994073 }, { "epoch": 0.02546860280024689, "grad_norm": 1.2385950088500977, "learning_rate": 5.090909090909091e-06, "loss": 0.12625673413276672, "memory(GiB)": 18.4, "step": 784, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.994187 }, { "epoch": 0.025501088263002306, "grad_norm": 1.1199182271957397, "learning_rate": 5.097402597402598e-06, "loss": 0.12883880734443665, "memory(GiB)": 18.4, "step": 785, "token_acc": 0.9248826291079812, "train_speed(iter/s)": 0.994329 }, { "epoch": 0.025533573725757725, "grad_norm": 1.6249769926071167, "learning_rate": 5.1038961038961044e-06, "loss": 0.12423421442508698, "memory(GiB)": 18.4, "step": 786, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.994476 }, { "epoch": 0.02556605918851314, "grad_norm": 2.3541438579559326, "learning_rate": 5.110389610389611e-06, "loss": 0.1255137324333191, "memory(GiB)": 18.4, "step": 787, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.994626 }, { "epoch": 0.025598544651268557, "grad_norm": 1.4716876745224, "learning_rate": 5.116883116883117e-06, "loss": 0.13367564976215363, "memory(GiB)": 18.4, "step": 788, "token_acc": 0.9399141630901288, "train_speed(iter/s)": 0.994764 }, { "epoch": 0.025631030114023973, "grad_norm": 1.5901833772659302, "learning_rate": 5.123376623376623e-06, "loss": 0.11503693461418152, "memory(GiB)": 18.4, "step": 789, "token_acc": 0.943609022556391, "train_speed(iter/s)": 0.994923 }, { "epoch": 0.025663515576779393, "grad_norm": 2.293154239654541, "learning_rate": 5.12987012987013e-06, "loss": 0.13373412191867828, "memory(GiB)": 18.4, "step": 790, "token_acc": 0.9253731343283582, "train_speed(iter/s)": 0.99504 }, { "epoch": 0.02569600103953481, "grad_norm": 1.6297615766525269, "learning_rate": 5.1363636363636375e-06, "loss": 0.11766761541366577, "memory(GiB)": 18.4, "step": 791, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.995185 }, { "epoch": 0.025728486502290225, "grad_norm": 1.5256356000900269, "learning_rate": 5.142857142857142e-06, "loss": 0.12004158645868301, "memory(GiB)": 18.4, "step": 792, "token_acc": 0.9375, "train_speed(iter/s)": 0.99533 }, { "epoch": 0.02576097196504564, "grad_norm": 1.7951551675796509, "learning_rate": 5.14935064935065e-06, "loss": 0.11942791938781738, "memory(GiB)": 18.4, "step": 793, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.995474 }, { "epoch": 0.02579345742780106, "grad_norm": 3.142455577850342, "learning_rate": 5.1558441558441564e-06, "loss": 0.13586091995239258, "memory(GiB)": 18.4, "step": 794, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.995625 }, { "epoch": 0.025825942890556477, "grad_norm": 1.4734610319137573, "learning_rate": 5.162337662337663e-06, "loss": 0.11572647094726562, "memory(GiB)": 18.4, "step": 795, "token_acc": 0.9395161290322581, "train_speed(iter/s)": 0.995766 }, { "epoch": 0.025858428353311893, "grad_norm": 1.599536657333374, "learning_rate": 5.168831168831169e-06, "loss": 0.11684343963861465, "memory(GiB)": 18.4, "step": 796, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.995914 }, { "epoch": 0.02589091381606731, "grad_norm": 1.2834031581878662, "learning_rate": 5.175324675324675e-06, "loss": 0.13365954160690308, "memory(GiB)": 18.4, "step": 797, "token_acc": 0.9414634146341463, "train_speed(iter/s)": 0.996045 }, { "epoch": 0.02592339927882273, "grad_norm": 1.0900702476501465, "learning_rate": 5.181818181818182e-06, "loss": 0.12496879696846008, "memory(GiB)": 18.4, "step": 798, "token_acc": 0.9456066945606695, "train_speed(iter/s)": 0.996159 }, { "epoch": 0.025955884741578145, "grad_norm": 1.5731481313705444, "learning_rate": 5.1883116883116895e-06, "loss": 0.1309160739183426, "memory(GiB)": 18.4, "step": 799, "token_acc": 0.9457013574660633, "train_speed(iter/s)": 0.996285 }, { "epoch": 0.02598837020433356, "grad_norm": 1.2841546535491943, "learning_rate": 5.194805194805194e-06, "loss": 0.10549096763134003, "memory(GiB)": 18.4, "step": 800, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.99644 }, { "epoch": 0.026020855667088977, "grad_norm": 1.7218537330627441, "learning_rate": 5.201298701298702e-06, "loss": 0.12057928740978241, "memory(GiB)": 18.4, "step": 801, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.996584 }, { "epoch": 0.026053341129844396, "grad_norm": 1.7588653564453125, "learning_rate": 5.2077922077922085e-06, "loss": 0.13864535093307495, "memory(GiB)": 18.4, "step": 802, "token_acc": 0.9567099567099567, "train_speed(iter/s)": 0.996767 }, { "epoch": 0.026085826592599812, "grad_norm": 1.0121818780899048, "learning_rate": 5.214285714285715e-06, "loss": 0.11245552450418472, "memory(GiB)": 18.4, "step": 803, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.996967 }, { "epoch": 0.02611831205535523, "grad_norm": 0.8881147503852844, "learning_rate": 5.220779220779221e-06, "loss": 0.11621152609586716, "memory(GiB)": 18.4, "step": 804, "token_acc": 0.9672727272727273, "train_speed(iter/s)": 0.997178 }, { "epoch": 0.026150797518110645, "grad_norm": 1.3499999046325684, "learning_rate": 5.2272727272727274e-06, "loss": 0.1165727823972702, "memory(GiB)": 18.4, "step": 805, "token_acc": 0.97265625, "train_speed(iter/s)": 0.99736 }, { "epoch": 0.026183282980866064, "grad_norm": 1.3202235698699951, "learning_rate": 5.233766233766234e-06, "loss": 0.11672846227884293, "memory(GiB)": 18.4, "step": 806, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.99756 }, { "epoch": 0.02621576844362148, "grad_norm": 1.9649673700332642, "learning_rate": 5.240259740259741e-06, "loss": 0.12326739728450775, "memory(GiB)": 18.4, "step": 807, "token_acc": 0.9519650655021834, "train_speed(iter/s)": 0.997758 }, { "epoch": 0.026248253906376896, "grad_norm": 1.8578966856002808, "learning_rate": 5.246753246753246e-06, "loss": 0.14569294452667236, "memory(GiB)": 18.4, "step": 808, "token_acc": 0.9477911646586346, "train_speed(iter/s)": 0.997956 }, { "epoch": 0.026280739369132312, "grad_norm": 1.359864354133606, "learning_rate": 5.253246753246754e-06, "loss": 0.12651655077934265, "memory(GiB)": 18.4, "step": 809, "token_acc": 0.9426229508196722, "train_speed(iter/s)": 0.998139 }, { "epoch": 0.026313224831887732, "grad_norm": 1.253057599067688, "learning_rate": 5.2597402597402605e-06, "loss": 0.1296204924583435, "memory(GiB)": 18.4, "step": 810, "token_acc": 0.955, "train_speed(iter/s)": 0.998338 }, { "epoch": 0.026345710294643148, "grad_norm": 1.4847201108932495, "learning_rate": 5.266233766233767e-06, "loss": 0.11894674599170685, "memory(GiB)": 18.4, "step": 811, "token_acc": 0.9627659574468085, "train_speed(iter/s)": 0.998507 }, { "epoch": 0.026378195757398564, "grad_norm": 1.5995721817016602, "learning_rate": 5.272727272727273e-06, "loss": 0.13377529382705688, "memory(GiB)": 18.4, "step": 812, "token_acc": 0.9219330855018587, "train_speed(iter/s)": 0.998712 }, { "epoch": 0.02641068122015398, "grad_norm": 1.830904483795166, "learning_rate": 5.2792207792207794e-06, "loss": 0.12351098656654358, "memory(GiB)": 18.4, "step": 813, "token_acc": 0.9453125, "train_speed(iter/s)": 0.998868 }, { "epoch": 0.0264431666829094, "grad_norm": 1.5327033996582031, "learning_rate": 5.285714285714286e-06, "loss": 0.11600581556558609, "memory(GiB)": 18.4, "step": 814, "token_acc": 0.9340659340659341, "train_speed(iter/s)": 0.999029 }, { "epoch": 0.026475652145664816, "grad_norm": 1.840722680091858, "learning_rate": 5.292207792207793e-06, "loss": 0.12055781483650208, "memory(GiB)": 18.4, "step": 815, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.999193 }, { "epoch": 0.026508137608420232, "grad_norm": 1.275641679763794, "learning_rate": 5.298701298701298e-06, "loss": 0.11691394448280334, "memory(GiB)": 18.4, "step": 816, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.999341 }, { "epoch": 0.026540623071175648, "grad_norm": 1.401612639427185, "learning_rate": 5.305194805194806e-06, "loss": 0.12796804308891296, "memory(GiB)": 18.4, "step": 817, "token_acc": 0.9609375, "train_speed(iter/s)": 0.999472 }, { "epoch": 0.026573108533931068, "grad_norm": 1.5083916187286377, "learning_rate": 5.3116883116883125e-06, "loss": 0.11735939979553223, "memory(GiB)": 18.4, "step": 818, "token_acc": 0.9300699300699301, "train_speed(iter/s)": 0.999595 }, { "epoch": 0.026605593996686484, "grad_norm": 1.8164781332015991, "learning_rate": 5.318181818181819e-06, "loss": 0.1313074231147766, "memory(GiB)": 18.4, "step": 819, "token_acc": 0.9421487603305785, "train_speed(iter/s)": 0.999729 }, { "epoch": 0.0266380794594419, "grad_norm": 1.548670768737793, "learning_rate": 5.324675324675325e-06, "loss": 0.12286180257797241, "memory(GiB)": 18.4, "step": 820, "token_acc": 0.9532710280373832, "train_speed(iter/s)": 0.99986 }, { "epoch": 0.026670564922197316, "grad_norm": 1.1152273416519165, "learning_rate": 5.3311688311688315e-06, "loss": 0.12053531408309937, "memory(GiB)": 18.4, "step": 821, "token_acc": 0.9359430604982206, "train_speed(iter/s)": 1.00001 }, { "epoch": 0.026703050384952735, "grad_norm": 6.444648265838623, "learning_rate": 5.337662337662338e-06, "loss": 0.13196724653244019, "memory(GiB)": 18.4, "step": 822, "token_acc": 0.9210526315789473, "train_speed(iter/s)": 1.000116 }, { "epoch": 0.02673553584770815, "grad_norm": 1.4684603214263916, "learning_rate": 5.344155844155845e-06, "loss": 0.11331529915332794, "memory(GiB)": 18.4, "step": 823, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 1.000245 }, { "epoch": 0.026768021310463568, "grad_norm": 1.0775994062423706, "learning_rate": 5.3506493506493504e-06, "loss": 0.13711008429527283, "memory(GiB)": 18.4, "step": 824, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 1.000346 }, { "epoch": 0.026800506773218984, "grad_norm": 1.0994797945022583, "learning_rate": 5.357142857142857e-06, "loss": 0.13843384385108948, "memory(GiB)": 18.4, "step": 825, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 1.000472 }, { "epoch": 0.026832992235974403, "grad_norm": 0.999367892742157, "learning_rate": 5.3636363636363645e-06, "loss": 0.12680584192276, "memory(GiB)": 18.4, "step": 826, "token_acc": 0.9337979094076655, "train_speed(iter/s)": 1.00059 }, { "epoch": 0.02686547769872982, "grad_norm": 3.6780598163604736, "learning_rate": 5.370129870129871e-06, "loss": 0.11282849311828613, "memory(GiB)": 18.4, "step": 827, "token_acc": 0.9513274336283186, "train_speed(iter/s)": 1.000716 }, { "epoch": 0.026897963161485235, "grad_norm": 0.9133362174034119, "learning_rate": 5.376623376623377e-06, "loss": 0.11344708502292633, "memory(GiB)": 18.4, "step": 828, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 1.000841 }, { "epoch": 0.02693044862424065, "grad_norm": 1.5556864738464355, "learning_rate": 5.3831168831168835e-06, "loss": 0.12860706448554993, "memory(GiB)": 18.4, "step": 829, "token_acc": 0.9453125, "train_speed(iter/s)": 1.000967 }, { "epoch": 0.02696293408699607, "grad_norm": 1.1131988763809204, "learning_rate": 5.38961038961039e-06, "loss": 0.11909634619951248, "memory(GiB)": 18.4, "step": 830, "token_acc": 0.924901185770751, "train_speed(iter/s)": 1.001149 }, { "epoch": 0.026995419549751487, "grad_norm": 1.2399417161941528, "learning_rate": 5.396103896103897e-06, "loss": 0.1229708269238472, "memory(GiB)": 18.4, "step": 831, "token_acc": 0.9330855018587361, "train_speed(iter/s)": 1.001303 }, { "epoch": 0.027027905012506903, "grad_norm": 1.6992751359939575, "learning_rate": 5.4025974025974024e-06, "loss": 0.12161214649677277, "memory(GiB)": 18.4, "step": 832, "token_acc": 0.9439252336448598, "train_speed(iter/s)": 1.001477 }, { "epoch": 0.02706039047526232, "grad_norm": 1.762672781944275, "learning_rate": 5.409090909090909e-06, "loss": 0.11947324872016907, "memory(GiB)": 18.4, "step": 833, "token_acc": 0.945054945054945, "train_speed(iter/s)": 1.001644 }, { "epoch": 0.02709287593801774, "grad_norm": 1.3086698055267334, "learning_rate": 5.4155844155844165e-06, "loss": 0.11633876711130142, "memory(GiB)": 18.4, "step": 834, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 1.001836 }, { "epoch": 0.027125361400773155, "grad_norm": 1.537609577178955, "learning_rate": 5.422077922077923e-06, "loss": 0.11594240367412567, "memory(GiB)": 18.4, "step": 835, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 1.002013 }, { "epoch": 0.02715784686352857, "grad_norm": 1.1526014804840088, "learning_rate": 5.428571428571429e-06, "loss": 0.1173354908823967, "memory(GiB)": 18.4, "step": 836, "token_acc": 0.9520295202952029, "train_speed(iter/s)": 1.002194 }, { "epoch": 0.027190332326283987, "grad_norm": 2.121044397354126, "learning_rate": 5.4350649350649355e-06, "loss": 0.12878309190273285, "memory(GiB)": 18.4, "step": 837, "token_acc": 0.9529411764705882, "train_speed(iter/s)": 1.002391 }, { "epoch": 0.027222817789039407, "grad_norm": 1.2134689092636108, "learning_rate": 5.441558441558442e-06, "loss": 0.12901967763900757, "memory(GiB)": 18.4, "step": 838, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 1.002595 }, { "epoch": 0.027255303251794823, "grad_norm": 1.6059950590133667, "learning_rate": 5.448051948051949e-06, "loss": 0.12883111834526062, "memory(GiB)": 18.4, "step": 839, "token_acc": 0.9353233830845771, "train_speed(iter/s)": 1.002786 }, { "epoch": 0.02728778871455024, "grad_norm": 1.8363240957260132, "learning_rate": 5.4545454545454545e-06, "loss": 0.11771868169307709, "memory(GiB)": 18.4, "step": 840, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 1.00298 }, { "epoch": 0.027320274177305655, "grad_norm": 1.552764654159546, "learning_rate": 5.461038961038961e-06, "loss": 0.12154272198677063, "memory(GiB)": 18.4, "step": 841, "token_acc": 0.9319148936170213, "train_speed(iter/s)": 1.003116 }, { "epoch": 0.027352759640061074, "grad_norm": 1.5750386714935303, "learning_rate": 5.4675324675324685e-06, "loss": 0.11616876721382141, "memory(GiB)": 18.4, "step": 842, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 1.003231 }, { "epoch": 0.02738524510281649, "grad_norm": 0.9633020162582397, "learning_rate": 5.474025974025975e-06, "loss": 0.10813839733600616, "memory(GiB)": 18.4, "step": 843, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 1.003374 }, { "epoch": 0.027417730565571907, "grad_norm": 1.6774057149887085, "learning_rate": 5.480519480519481e-06, "loss": 0.11509817838668823, "memory(GiB)": 18.4, "step": 844, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 1.003512 }, { "epoch": 0.027450216028327323, "grad_norm": 1.212708830833435, "learning_rate": 5.4870129870129875e-06, "loss": 0.1257399618625641, "memory(GiB)": 18.4, "step": 845, "token_acc": 0.9509433962264151, "train_speed(iter/s)": 1.003605 }, { "epoch": 0.027482701491082742, "grad_norm": 1.2848888635635376, "learning_rate": 5.493506493506494e-06, "loss": 0.1237981840968132, "memory(GiB)": 18.4, "step": 846, "token_acc": 0.9453781512605042, "train_speed(iter/s)": 1.003759 }, { "epoch": 0.02751518695383816, "grad_norm": 1.336894154548645, "learning_rate": 5.500000000000001e-06, "loss": 0.12898191809654236, "memory(GiB)": 18.4, "step": 847, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 1.003874 }, { "epoch": 0.027547672416593574, "grad_norm": 1.1989940404891968, "learning_rate": 5.5064935064935065e-06, "loss": 0.1244945228099823, "memory(GiB)": 18.4, "step": 848, "token_acc": 0.9375, "train_speed(iter/s)": 1.003996 }, { "epoch": 0.02758015787934899, "grad_norm": 1.4582233428955078, "learning_rate": 5.512987012987013e-06, "loss": 0.1350947916507721, "memory(GiB)": 18.4, "step": 849, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 1.00415 }, { "epoch": 0.02761264334210441, "grad_norm": 3.876556158065796, "learning_rate": 5.5194805194805205e-06, "loss": 0.12058240175247192, "memory(GiB)": 18.4, "step": 850, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 1.004284 }, { "epoch": 0.027645128804859826, "grad_norm": 1.5876522064208984, "learning_rate": 5.525974025974027e-06, "loss": 0.11194992065429688, "memory(GiB)": 18.4, "step": 851, "token_acc": 0.9416666666666667, "train_speed(iter/s)": 1.004398 }, { "epoch": 0.027677614267615242, "grad_norm": 1.9009722471237183, "learning_rate": 5.532467532467533e-06, "loss": 0.13943110406398773, "memory(GiB)": 18.4, "step": 852, "token_acc": 0.9207920792079208, "train_speed(iter/s)": 1.004529 }, { "epoch": 0.027710099730370658, "grad_norm": 1.5323941707611084, "learning_rate": 5.5389610389610395e-06, "loss": 0.13022121787071228, "memory(GiB)": 18.4, "step": 853, "token_acc": 0.94, "train_speed(iter/s)": 1.004657 }, { "epoch": 0.027742585193126078, "grad_norm": 1.387209415435791, "learning_rate": 5.545454545454546e-06, "loss": 0.12389177083969116, "memory(GiB)": 18.4, "step": 854, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 1.004775 }, { "epoch": 0.027775070655881494, "grad_norm": 2.385033130645752, "learning_rate": 5.551948051948053e-06, "loss": 0.13871636986732483, "memory(GiB)": 18.4, "step": 855, "token_acc": 0.946058091286307, "train_speed(iter/s)": 1.00492 }, { "epoch": 0.02780755611863691, "grad_norm": 1.3813872337341309, "learning_rate": 5.5584415584415585e-06, "loss": 0.11594700813293457, "memory(GiB)": 18.4, "step": 856, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 1.005038 }, { "epoch": 0.027840041581392326, "grad_norm": 1.5412713289260864, "learning_rate": 5.564935064935065e-06, "loss": 0.13601793348789215, "memory(GiB)": 18.4, "step": 857, "token_acc": 0.9407894736842105, "train_speed(iter/s)": 1.00518 }, { "epoch": 0.027872527044147746, "grad_norm": 1.2503178119659424, "learning_rate": 5.571428571428572e-06, "loss": 0.12563839554786682, "memory(GiB)": 18.4, "step": 858, "token_acc": 0.9315068493150684, "train_speed(iter/s)": 1.005304 }, { "epoch": 0.02790501250690316, "grad_norm": 1.2584341764450073, "learning_rate": 5.577922077922079e-06, "loss": 0.12918420135974884, "memory(GiB)": 18.4, "step": 859, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 1.005394 }, { "epoch": 0.027937497969658578, "grad_norm": 1.931824803352356, "learning_rate": 5.584415584415585e-06, "loss": 0.13772344589233398, "memory(GiB)": 18.4, "step": 860, "token_acc": 0.9361702127659575, "train_speed(iter/s)": 1.005515 }, { "epoch": 0.027969983432413994, "grad_norm": 1.7343053817749023, "learning_rate": 5.5909090909090915e-06, "loss": 0.12307379394769669, "memory(GiB)": 18.4, "step": 861, "token_acc": 0.9368029739776952, "train_speed(iter/s)": 1.005661 }, { "epoch": 0.028002468895169413, "grad_norm": 1.2766320705413818, "learning_rate": 5.597402597402598e-06, "loss": 0.11769473552703857, "memory(GiB)": 18.4, "step": 862, "token_acc": 0.9507042253521126, "train_speed(iter/s)": 1.005792 }, { "epoch": 0.02803495435792483, "grad_norm": 1.6461639404296875, "learning_rate": 5.603896103896105e-06, "loss": 0.12753018736839294, "memory(GiB)": 18.4, "step": 863, "token_acc": 0.9453125, "train_speed(iter/s)": 1.005958 }, { "epoch": 0.028067439820680246, "grad_norm": 1.3996005058288574, "learning_rate": 5.6103896103896105e-06, "loss": 0.12119852006435394, "memory(GiB)": 18.4, "step": 864, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 1.006095 }, { "epoch": 0.02809992528343566, "grad_norm": 1.3464043140411377, "learning_rate": 5.616883116883117e-06, "loss": 0.12671753764152527, "memory(GiB)": 18.4, "step": 865, "token_acc": 0.9479553903345725, "train_speed(iter/s)": 1.006245 }, { "epoch": 0.02813241074619108, "grad_norm": 0.9888947606086731, "learning_rate": 5.623376623376624e-06, "loss": 0.12752358615398407, "memory(GiB)": 18.4, "step": 866, "token_acc": 0.9375, "train_speed(iter/s)": 1.00638 }, { "epoch": 0.028164896208946497, "grad_norm": 1.4618924856185913, "learning_rate": 5.629870129870131e-06, "loss": 0.12126607447862625, "memory(GiB)": 18.4, "step": 867, "token_acc": 0.955, "train_speed(iter/s)": 1.006491 }, { "epoch": 0.028197381671701913, "grad_norm": 1.3588534593582153, "learning_rate": 5.636363636363636e-06, "loss": 0.10700225830078125, "memory(GiB)": 18.4, "step": 868, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 1.006602 }, { "epoch": 0.02822986713445733, "grad_norm": 1.8044499158859253, "learning_rate": 5.6428571428571435e-06, "loss": 0.10213294625282288, "memory(GiB)": 18.4, "step": 869, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 1.006712 }, { "epoch": 0.02826235259721275, "grad_norm": 1.4077924489974976, "learning_rate": 5.64935064935065e-06, "loss": 0.12308530509471893, "memory(GiB)": 18.4, "step": 870, "token_acc": 0.9479553903345725, "train_speed(iter/s)": 1.006788 }, { "epoch": 0.028294838059968165, "grad_norm": 1.8364239931106567, "learning_rate": 5.655844155844157e-06, "loss": 0.1214507669210434, "memory(GiB)": 18.4, "step": 871, "token_acc": 0.9458128078817734, "train_speed(iter/s)": 1.006874 }, { "epoch": 0.02832732352272358, "grad_norm": 2.9410853385925293, "learning_rate": 5.6623376623376625e-06, "loss": 0.12470120191574097, "memory(GiB)": 18.4, "step": 872, "token_acc": 0.9518072289156626, "train_speed(iter/s)": 1.006971 }, { "epoch": 0.028359808985478997, "grad_norm": 1.6411103010177612, "learning_rate": 5.668831168831169e-06, "loss": 0.12010324001312256, "memory(GiB)": 18.4, "step": 873, "token_acc": 0.951310861423221, "train_speed(iter/s)": 1.007031 }, { "epoch": 0.028392294448234417, "grad_norm": 1.902302861213684, "learning_rate": 5.675324675324676e-06, "loss": 0.12439334392547607, "memory(GiB)": 18.4, "step": 874, "token_acc": 0.9504950495049505, "train_speed(iter/s)": 1.007122 }, { "epoch": 0.028424779910989833, "grad_norm": 1.8651036024093628, "learning_rate": 5.681818181818183e-06, "loss": 0.10731581598520279, "memory(GiB)": 18.4, "step": 875, "token_acc": 0.966824644549763, "train_speed(iter/s)": 1.007199 }, { "epoch": 0.02845726537374525, "grad_norm": 0.6730501651763916, "learning_rate": 5.688311688311688e-06, "loss": 0.1011265367269516, "memory(GiB)": 18.4, "step": 876, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 1.007305 }, { "epoch": 0.028489750836500665, "grad_norm": 2.8615291118621826, "learning_rate": 5.6948051948051955e-06, "loss": 0.12189295142889023, "memory(GiB)": 18.4, "step": 877, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 1.007366 }, { "epoch": 0.028522236299256085, "grad_norm": 1.9691100120544434, "learning_rate": 5.701298701298702e-06, "loss": 0.12894199788570404, "memory(GiB)": 18.4, "step": 878, "token_acc": 0.9562841530054644, "train_speed(iter/s)": 1.007452 }, { "epoch": 0.0285547217620115, "grad_norm": 1.0937113761901855, "learning_rate": 5.707792207792209e-06, "loss": 0.11518525332212448, "memory(GiB)": 18.4, "step": 879, "token_acc": 0.9447004608294931, "train_speed(iter/s)": 1.007536 }, { "epoch": 0.028587207224766917, "grad_norm": 2.097700595855713, "learning_rate": 5.7142857142857145e-06, "loss": 0.11755889654159546, "memory(GiB)": 18.4, "step": 880, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 1.007624 }, { "epoch": 0.028619692687522333, "grad_norm": 2.0877692699432373, "learning_rate": 5.720779220779221e-06, "loss": 0.13533729314804077, "memory(GiB)": 18.4, "step": 881, "token_acc": 0.9475524475524476, "train_speed(iter/s)": 1.007704 }, { "epoch": 0.028652178150277752, "grad_norm": 1.3720123767852783, "learning_rate": 5.727272727272728e-06, "loss": 0.11719463020563126, "memory(GiB)": 18.4, "step": 882, "token_acc": 0.9588477366255144, "train_speed(iter/s)": 1.007808 }, { "epoch": 0.02868466361303317, "grad_norm": 1.2348421812057495, "learning_rate": 5.733766233766235e-06, "loss": 0.1147356703877449, "memory(GiB)": 18.4, "step": 883, "token_acc": 0.958904109589041, "train_speed(iter/s)": 1.007939 }, { "epoch": 0.028717149075788585, "grad_norm": 2.9498794078826904, "learning_rate": 5.74025974025974e-06, "loss": 0.13856534659862518, "memory(GiB)": 18.4, "step": 884, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 1.008051 }, { "epoch": 0.028749634538544, "grad_norm": 2.625483751296997, "learning_rate": 5.7467532467532475e-06, "loss": 0.13732849061489105, "memory(GiB)": 18.4, "step": 885, "token_acc": 0.9493087557603687, "train_speed(iter/s)": 1.00821 }, { "epoch": 0.02878212000129942, "grad_norm": 2.3653910160064697, "learning_rate": 5.753246753246754e-06, "loss": 0.13688060641288757, "memory(GiB)": 18.4, "step": 886, "token_acc": 0.963855421686747, "train_speed(iter/s)": 1.008376 }, { "epoch": 0.028814605464054836, "grad_norm": 2.4393258094787598, "learning_rate": 5.759740259740261e-06, "loss": 0.12141022831201553, "memory(GiB)": 18.4, "step": 887, "token_acc": 0.9399141630901288, "train_speed(iter/s)": 1.008555 }, { "epoch": 0.028847090926810252, "grad_norm": 1.4115554094314575, "learning_rate": 5.7662337662337665e-06, "loss": 0.12049825489521027, "memory(GiB)": 18.4, "step": 888, "token_acc": 0.9496124031007752, "train_speed(iter/s)": 1.008739 }, { "epoch": 0.02887957638956567, "grad_norm": 2.023071765899658, "learning_rate": 5.772727272727273e-06, "loss": 0.10065598785877228, "memory(GiB)": 18.4, "step": 889, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 1.008929 }, { "epoch": 0.028912061852321088, "grad_norm": 1.5514676570892334, "learning_rate": 5.77922077922078e-06, "loss": 0.1166735589504242, "memory(GiB)": 18.4, "step": 890, "token_acc": 0.9421487603305785, "train_speed(iter/s)": 1.009099 }, { "epoch": 0.028944547315076504, "grad_norm": 1.3380217552185059, "learning_rate": 5.785714285714286e-06, "loss": 0.12472101300954819, "memory(GiB)": 18.4, "step": 891, "token_acc": 0.9402390438247012, "train_speed(iter/s)": 1.009289 }, { "epoch": 0.02897703277783192, "grad_norm": 1.7134116888046265, "learning_rate": 5.792207792207792e-06, "loss": 0.12880341708660126, "memory(GiB)": 18.4, "step": 892, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 1.009446 }, { "epoch": 0.029009518240587336, "grad_norm": 1.6777997016906738, "learning_rate": 5.7987012987012996e-06, "loss": 0.11328098177909851, "memory(GiB)": 18.4, "step": 893, "token_acc": 0.9554655870445344, "train_speed(iter/s)": 1.009627 }, { "epoch": 0.029042003703342756, "grad_norm": 1.6700348854064941, "learning_rate": 5.805194805194806e-06, "loss": 0.12520702183246613, "memory(GiB)": 18.4, "step": 894, "token_acc": 0.9447513812154696, "train_speed(iter/s)": 1.009803 }, { "epoch": 0.029074489166098172, "grad_norm": 1.9212032556533813, "learning_rate": 5.811688311688313e-06, "loss": 0.1401883214712143, "memory(GiB)": 18.4, "step": 895, "token_acc": 0.9409090909090909, "train_speed(iter/s)": 1.00999 }, { "epoch": 0.029106974628853588, "grad_norm": 2.5350654125213623, "learning_rate": 5.8181818181818185e-06, "loss": 0.12837937474250793, "memory(GiB)": 18.4, "step": 896, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 1.010159 }, { "epoch": 0.029139460091609004, "grad_norm": 1.802293062210083, "learning_rate": 5.824675324675325e-06, "loss": 0.11033020913600922, "memory(GiB)": 18.4, "step": 897, "token_acc": 0.9559471365638766, "train_speed(iter/s)": 1.010321 }, { "epoch": 0.029171945554364424, "grad_norm": 1.3313475847244263, "learning_rate": 5.831168831168832e-06, "loss": 0.1288619488477707, "memory(GiB)": 18.4, "step": 898, "token_acc": 0.9537366548042705, "train_speed(iter/s)": 1.010492 }, { "epoch": 0.02920443101711984, "grad_norm": 1.9472346305847168, "learning_rate": 5.8376623376623375e-06, "loss": 0.12560558319091797, "memory(GiB)": 18.4, "step": 899, "token_acc": 0.94140625, "train_speed(iter/s)": 1.010662 }, { "epoch": 0.029236916479875256, "grad_norm": 1.8390768766403198, "learning_rate": 5.844155844155844e-06, "loss": 0.1183238998055458, "memory(GiB)": 18.4, "step": 900, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 1.010839 }, { "epoch": 0.029269401942630672, "grad_norm": 2.3578639030456543, "learning_rate": 5.850649350649351e-06, "loss": 0.12633371353149414, "memory(GiB)": 18.4, "step": 901, "token_acc": 0.9450980392156862, "train_speed(iter/s)": 1.011015 }, { "epoch": 0.02930188740538609, "grad_norm": 1.3784431219100952, "learning_rate": 5.857142857142858e-06, "loss": 0.1257329285144806, "memory(GiB)": 18.4, "step": 902, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 1.011168 }, { "epoch": 0.029334372868141508, "grad_norm": 1.017430067062378, "learning_rate": 5.863636363636364e-06, "loss": 0.12384308874607086, "memory(GiB)": 18.4, "step": 903, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 1.011337 }, { "epoch": 0.029366858330896924, "grad_norm": 1.4198073148727417, "learning_rate": 5.8701298701298705e-06, "loss": 0.14523190259933472, "memory(GiB)": 18.4, "step": 904, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 1.011458 }, { "epoch": 0.02939934379365234, "grad_norm": 1.664296269416809, "learning_rate": 5.876623376623377e-06, "loss": 0.12495817244052887, "memory(GiB)": 18.4, "step": 905, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.011628 }, { "epoch": 0.02943182925640776, "grad_norm": 1.7675179243087769, "learning_rate": 5.883116883116884e-06, "loss": 0.11756566166877747, "memory(GiB)": 18.4, "step": 906, "token_acc": 0.9173913043478261, "train_speed(iter/s)": 1.011816 }, { "epoch": 0.029464314719163175, "grad_norm": 1.2218259572982788, "learning_rate": 5.8896103896103895e-06, "loss": 0.11629650741815567, "memory(GiB)": 18.4, "step": 907, "token_acc": 0.948, "train_speed(iter/s)": 1.012011 }, { "epoch": 0.02949680018191859, "grad_norm": 1.1937119960784912, "learning_rate": 5.896103896103896e-06, "loss": 0.1141607016324997, "memory(GiB)": 18.4, "step": 908, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 1.012167 }, { "epoch": 0.029529285644674007, "grad_norm": 1.1321200132369995, "learning_rate": 5.902597402597403e-06, "loss": 0.11402187496423721, "memory(GiB)": 18.4, "step": 909, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 1.012349 }, { "epoch": 0.029561771107429427, "grad_norm": 1.7843079566955566, "learning_rate": 5.90909090909091e-06, "loss": 0.12180633842945099, "memory(GiB)": 18.4, "step": 910, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 1.01251 }, { "epoch": 0.029594256570184843, "grad_norm": 1.5233396291732788, "learning_rate": 5.915584415584416e-06, "loss": 0.11162830144166946, "memory(GiB)": 18.4, "step": 911, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 1.012689 }, { "epoch": 0.02962674203294026, "grad_norm": 1.403052568435669, "learning_rate": 5.9220779220779226e-06, "loss": 0.10394158214330673, "memory(GiB)": 18.4, "step": 912, "token_acc": 0.9367588932806324, "train_speed(iter/s)": 1.012873 }, { "epoch": 0.029659227495695675, "grad_norm": 1.735183596611023, "learning_rate": 5.928571428571429e-06, "loss": 0.12715843319892883, "memory(GiB)": 18.4, "step": 913, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 1.013057 }, { "epoch": 0.029691712958451095, "grad_norm": 1.7700836658477783, "learning_rate": 5.935064935064936e-06, "loss": 0.11522535234689713, "memory(GiB)": 18.4, "step": 914, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 1.01322 }, { "epoch": 0.02972419842120651, "grad_norm": 1.2923601865768433, "learning_rate": 5.9415584415584415e-06, "loss": 0.11232998967170715, "memory(GiB)": 18.4, "step": 915, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 1.013418 }, { "epoch": 0.029756683883961927, "grad_norm": 1.2077089548110962, "learning_rate": 5.948051948051948e-06, "loss": 0.1114836037158966, "memory(GiB)": 18.4, "step": 916, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 1.013573 }, { "epoch": 0.029789169346717343, "grad_norm": 1.454604148864746, "learning_rate": 5.954545454545455e-06, "loss": 0.12437650561332703, "memory(GiB)": 18.4, "step": 917, "token_acc": 0.9372693726937269, "train_speed(iter/s)": 1.013744 }, { "epoch": 0.029821654809472763, "grad_norm": 1.4963910579681396, "learning_rate": 5.961038961038962e-06, "loss": 0.10402917861938477, "memory(GiB)": 18.4, "step": 918, "token_acc": 0.9611650485436893, "train_speed(iter/s)": 1.01393 }, { "epoch": 0.02985414027222818, "grad_norm": 2.3826935291290283, "learning_rate": 5.967532467532467e-06, "loss": 0.1313626766204834, "memory(GiB)": 18.4, "step": 919, "token_acc": 0.946078431372549, "train_speed(iter/s)": 1.014113 }, { "epoch": 0.029886625734983595, "grad_norm": 0.9773569703102112, "learning_rate": 5.9740259740259746e-06, "loss": 0.10773418098688126, "memory(GiB)": 18.4, "step": 920, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 1.014271 }, { "epoch": 0.02991911119773901, "grad_norm": 2.741269111633301, "learning_rate": 5.980519480519481e-06, "loss": 0.12178853899240494, "memory(GiB)": 18.4, "step": 921, "token_acc": 0.9378238341968912, "train_speed(iter/s)": 1.014447 }, { "epoch": 0.02995159666049443, "grad_norm": 1.3601495027542114, "learning_rate": 5.987012987012988e-06, "loss": 0.12242618948221207, "memory(GiB)": 18.4, "step": 922, "token_acc": 0.9633699633699634, "train_speed(iter/s)": 1.014555 }, { "epoch": 0.029984082123249847, "grad_norm": 1.570269227027893, "learning_rate": 5.9935064935064935e-06, "loss": 0.11992010474205017, "memory(GiB)": 18.4, "step": 923, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 1.014674 }, { "epoch": 0.030016567586005263, "grad_norm": 1.1871687173843384, "learning_rate": 6e-06, "loss": 0.1088191568851471, "memory(GiB)": 18.4, "step": 924, "token_acc": 0.9360902255639098, "train_speed(iter/s)": 1.014785 }, { "epoch": 0.03004905304876068, "grad_norm": 1.6679133176803589, "learning_rate": 6.006493506493507e-06, "loss": 0.14514122903347015, "memory(GiB)": 18.4, "step": 925, "token_acc": 0.9402985074626866, "train_speed(iter/s)": 1.014877 }, { "epoch": 0.030081538511516098, "grad_norm": 1.0716502666473389, "learning_rate": 6.012987012987014e-06, "loss": 0.11027400195598602, "memory(GiB)": 18.4, "step": 926, "token_acc": 0.9406392694063926, "train_speed(iter/s)": 1.014958 }, { "epoch": 0.030114023974271514, "grad_norm": 1.3356883525848389, "learning_rate": 6.019480519480519e-06, "loss": 0.1193457543849945, "memory(GiB)": 18.4, "step": 927, "token_acc": 0.9407114624505929, "train_speed(iter/s)": 1.015068 }, { "epoch": 0.03014650943702693, "grad_norm": 1.026398777961731, "learning_rate": 6.025974025974027e-06, "loss": 0.10109605640172958, "memory(GiB)": 18.4, "step": 928, "token_acc": 0.9448529411764706, "train_speed(iter/s)": 1.015133 }, { "epoch": 0.030178994899782347, "grad_norm": 1.154034972190857, "learning_rate": 6.032467532467533e-06, "loss": 0.09588386118412018, "memory(GiB)": 18.4, "step": 929, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 1.015238 }, { "epoch": 0.030211480362537766, "grad_norm": 2.8897042274475098, "learning_rate": 6.03896103896104e-06, "loss": 0.11937545239925385, "memory(GiB)": 18.4, "step": 930, "token_acc": 0.9505494505494505, "train_speed(iter/s)": 1.015333 }, { "epoch": 0.030243965825293182, "grad_norm": 1.6086844205856323, "learning_rate": 6.0454545454545456e-06, "loss": 0.12442721426486969, "memory(GiB)": 18.4, "step": 931, "token_acc": 0.948936170212766, "train_speed(iter/s)": 1.015419 }, { "epoch": 0.030276451288048598, "grad_norm": 1.875017762184143, "learning_rate": 6.051948051948052e-06, "loss": 0.1252773404121399, "memory(GiB)": 18.4, "step": 932, "token_acc": 0.9440559440559441, "train_speed(iter/s)": 1.015521 }, { "epoch": 0.030308936750804014, "grad_norm": 1.342980980873108, "learning_rate": 6.058441558441559e-06, "loss": 0.12285616248846054, "memory(GiB)": 18.4, "step": 933, "token_acc": 0.9547511312217195, "train_speed(iter/s)": 1.015592 }, { "epoch": 0.030341422213559434, "grad_norm": 1.7063000202178955, "learning_rate": 6.064935064935065e-06, "loss": 0.12303241342306137, "memory(GiB)": 18.4, "step": 934, "token_acc": 0.9551569506726457, "train_speed(iter/s)": 1.015685 }, { "epoch": 0.03037390767631485, "grad_norm": 1.5343056917190552, "learning_rate": 6.071428571428571e-06, "loss": 0.10108087956905365, "memory(GiB)": 18.4, "step": 935, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 1.01578 }, { "epoch": 0.030406393139070266, "grad_norm": 0.893212080001831, "learning_rate": 6.077922077922079e-06, "loss": 0.10141535103321075, "memory(GiB)": 18.4, "step": 936, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 1.015677 }, { "epoch": 0.030438878601825682, "grad_norm": 1.6808509826660156, "learning_rate": 6.084415584415585e-06, "loss": 0.13247981667518616, "memory(GiB)": 18.4, "step": 937, "token_acc": 0.9420849420849421, "train_speed(iter/s)": 1.015801 }, { "epoch": 0.0304713640645811, "grad_norm": 1.9914629459381104, "learning_rate": 6.090909090909092e-06, "loss": 0.11647064238786697, "memory(GiB)": 18.4, "step": 938, "token_acc": 0.9465648854961832, "train_speed(iter/s)": 1.015912 }, { "epoch": 0.030503849527336518, "grad_norm": 1.42779541015625, "learning_rate": 6.0974025974025976e-06, "loss": 0.13475999236106873, "memory(GiB)": 18.4, "step": 939, "token_acc": 0.9314516129032258, "train_speed(iter/s)": 1.016032 }, { "epoch": 0.030536334990091934, "grad_norm": 0.866474986076355, "learning_rate": 6.103896103896104e-06, "loss": 0.10518632084131241, "memory(GiB)": 18.4, "step": 940, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 1.016159 }, { "epoch": 0.03056882045284735, "grad_norm": 2.383180618286133, "learning_rate": 6.110389610389611e-06, "loss": 0.14142560958862305, "memory(GiB)": 18.4, "step": 941, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 1.016254 }, { "epoch": 0.03060130591560277, "grad_norm": 1.4343712329864502, "learning_rate": 6.116883116883117e-06, "loss": 0.1182142049074173, "memory(GiB)": 18.4, "step": 942, "token_acc": 0.9488372093023256, "train_speed(iter/s)": 1.016348 }, { "epoch": 0.030633791378358186, "grad_norm": 1.6774200201034546, "learning_rate": 6.123376623376623e-06, "loss": 0.10239129513502121, "memory(GiB)": 18.4, "step": 943, "token_acc": 0.9660377358490566, "train_speed(iter/s)": 1.016464 }, { "epoch": 0.0306662768411136, "grad_norm": 1.1139193773269653, "learning_rate": 6.129870129870131e-06, "loss": 0.12106318771839142, "memory(GiB)": 18.4, "step": 944, "token_acc": 0.96484375, "train_speed(iter/s)": 1.016568 }, { "epoch": 0.030698762303869018, "grad_norm": 1.7841767072677612, "learning_rate": 6.136363636363637e-06, "loss": 0.13547460734844208, "memory(GiB)": 18.4, "step": 945, "token_acc": 0.9371980676328503, "train_speed(iter/s)": 1.01673 }, { "epoch": 0.030731247766624437, "grad_norm": 2.178461790084839, "learning_rate": 6.142857142857144e-06, "loss": 0.1167585551738739, "memory(GiB)": 18.4, "step": 946, "token_acc": 0.960352422907489, "train_speed(iter/s)": 1.016889 }, { "epoch": 0.030763733229379853, "grad_norm": 1.674356460571289, "learning_rate": 6.14935064935065e-06, "loss": 0.12063230574131012, "memory(GiB)": 18.4, "step": 947, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 1.017049 }, { "epoch": 0.03079621869213527, "grad_norm": 1.7719988822937012, "learning_rate": 6.155844155844156e-06, "loss": 0.13152308762073517, "memory(GiB)": 18.4, "step": 948, "token_acc": 0.9452054794520548, "train_speed(iter/s)": 1.017202 }, { "epoch": 0.030828704154890686, "grad_norm": 1.6655828952789307, "learning_rate": 6.162337662337663e-06, "loss": 0.116796113550663, "memory(GiB)": 18.4, "step": 949, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 1.017364 }, { "epoch": 0.030861189617646105, "grad_norm": 1.6434537172317505, "learning_rate": 6.168831168831169e-06, "loss": 0.12558993697166443, "memory(GiB)": 18.4, "step": 950, "token_acc": 0.9398496240601504, "train_speed(iter/s)": 1.017487 }, { "epoch": 0.03089367508040152, "grad_norm": 0.9194530248641968, "learning_rate": 6.175324675324675e-06, "loss": 0.10718187689781189, "memory(GiB)": 18.4, "step": 951, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 1.017634 }, { "epoch": 0.030926160543156937, "grad_norm": 1.687656283378601, "learning_rate": 6.181818181818182e-06, "loss": 0.12171398848295212, "memory(GiB)": 18.4, "step": 952, "token_acc": 0.9434628975265018, "train_speed(iter/s)": 1.017798 }, { "epoch": 0.030958646005912353, "grad_norm": 1.2771515846252441, "learning_rate": 6.188311688311689e-06, "loss": 0.10896728932857513, "memory(GiB)": 18.4, "step": 953, "token_acc": 0.961352657004831, "train_speed(iter/s)": 1.017965 }, { "epoch": 0.030991131468667773, "grad_norm": 1.0966315269470215, "learning_rate": 6.194805194805196e-06, "loss": 0.128767728805542, "memory(GiB)": 18.4, "step": 954, "token_acc": 0.932, "train_speed(iter/s)": 1.018106 }, { "epoch": 0.03102361693142319, "grad_norm": 0.9804403781890869, "learning_rate": 6.201298701298702e-06, "loss": 0.11149248480796814, "memory(GiB)": 18.4, "step": 955, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 1.01828 }, { "epoch": 0.031056102394178605, "grad_norm": 1.2910460233688354, "learning_rate": 6.207792207792208e-06, "loss": 0.11241868138313293, "memory(GiB)": 18.4, "step": 956, "token_acc": 0.9497907949790795, "train_speed(iter/s)": 1.018427 }, { "epoch": 0.03108858785693402, "grad_norm": 1.4713680744171143, "learning_rate": 6.214285714285715e-06, "loss": 0.1075463518500328, "memory(GiB)": 18.4, "step": 957, "token_acc": 0.9403669724770642, "train_speed(iter/s)": 1.018598 }, { "epoch": 0.03112107331968944, "grad_norm": 1.1207960844039917, "learning_rate": 6.220779220779221e-06, "loss": 0.11799220740795135, "memory(GiB)": 18.4, "step": 958, "token_acc": 0.9479553903345725, "train_speed(iter/s)": 1.018783 }, { "epoch": 0.031153558782444857, "grad_norm": 1.5597329139709473, "learning_rate": 6.227272727272727e-06, "loss": 0.11591734737157822, "memory(GiB)": 18.4, "step": 959, "token_acc": 0.9508928571428571, "train_speed(iter/s)": 1.018952 }, { "epoch": 0.031186044245200273, "grad_norm": 1.9492794275283813, "learning_rate": 6.233766233766234e-06, "loss": 0.12392018735408783, "memory(GiB)": 18.4, "step": 960, "token_acc": 0.9461883408071748, "train_speed(iter/s)": 1.019109 }, { "epoch": 0.03121852970795569, "grad_norm": 1.576126217842102, "learning_rate": 6.240259740259741e-06, "loss": 0.1213778629899025, "memory(GiB)": 18.4, "step": 961, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 1.01928 }, { "epoch": 0.031251015170711105, "grad_norm": 1.2984604835510254, "learning_rate": 6.246753246753248e-06, "loss": 0.12005715072154999, "memory(GiB)": 18.4, "step": 962, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 1.019428 }, { "epoch": 0.03128350063346652, "grad_norm": 2.1401987075805664, "learning_rate": 6.253246753246754e-06, "loss": 0.11984990537166595, "memory(GiB)": 18.4, "step": 963, "token_acc": 0.9529411764705882, "train_speed(iter/s)": 1.019582 }, { "epoch": 0.031315986096221944, "grad_norm": 2.203883409500122, "learning_rate": 6.25974025974026e-06, "loss": 0.1269518882036209, "memory(GiB)": 18.4, "step": 964, "token_acc": 0.9432314410480349, "train_speed(iter/s)": 1.019734 }, { "epoch": 0.03134847155897736, "grad_norm": 1.9389671087265015, "learning_rate": 6.266233766233767e-06, "loss": 0.12239396572113037, "memory(GiB)": 18.4, "step": 965, "token_acc": 0.9398148148148148, "train_speed(iter/s)": 1.019897 }, { "epoch": 0.031380957021732776, "grad_norm": 1.7745864391326904, "learning_rate": 6.2727272727272734e-06, "loss": 0.12895330786705017, "memory(GiB)": 18.4, "step": 966, "token_acc": 0.9447513812154696, "train_speed(iter/s)": 1.020047 }, { "epoch": 0.03141344248448819, "grad_norm": 1.1008999347686768, "learning_rate": 6.279220779220779e-06, "loss": 0.1129576563835144, "memory(GiB)": 18.4, "step": 967, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 1.020209 }, { "epoch": 0.03144592794724361, "grad_norm": 2.014509916305542, "learning_rate": 6.285714285714286e-06, "loss": 0.13617193698883057, "memory(GiB)": 18.4, "step": 968, "token_acc": 0.94, "train_speed(iter/s)": 1.020352 }, { "epoch": 0.031478413409999025, "grad_norm": 2.3588738441467285, "learning_rate": 6.292207792207793e-06, "loss": 0.12976020574569702, "memory(GiB)": 18.4, "step": 969, "token_acc": 0.967741935483871, "train_speed(iter/s)": 1.020516 }, { "epoch": 0.03151089887275444, "grad_norm": 1.8235116004943848, "learning_rate": 6.2987012987013e-06, "loss": 0.13740521669387817, "memory(GiB)": 18.4, "step": 970, "token_acc": 0.9433962264150944, "train_speed(iter/s)": 1.020665 }, { "epoch": 0.03154338433550986, "grad_norm": 1.505770206451416, "learning_rate": 6.305194805194806e-06, "loss": 0.13105104863643646, "memory(GiB)": 18.4, "step": 971, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 1.020836 }, { "epoch": 0.03157586979826528, "grad_norm": 0.9931778907775879, "learning_rate": 6.311688311688312e-06, "loss": 0.12426473200321198, "memory(GiB)": 18.4, "step": 972, "token_acc": 0.9623655913978495, "train_speed(iter/s)": 1.020986 }, { "epoch": 0.031608355261020696, "grad_norm": 1.1868669986724854, "learning_rate": 6.318181818181819e-06, "loss": 0.12376083433628082, "memory(GiB)": 18.4, "step": 973, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 1.02116 }, { "epoch": 0.03164084072377611, "grad_norm": 1.3422818183898926, "learning_rate": 6.3246753246753254e-06, "loss": 0.10995036363601685, "memory(GiB)": 18.4, "step": 974, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 1.021303 }, { "epoch": 0.03167332618653153, "grad_norm": 2.035072088241577, "learning_rate": 6.331168831168831e-06, "loss": 0.11712490022182465, "memory(GiB)": 18.4, "step": 975, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 1.021432 }, { "epoch": 0.031705811649286944, "grad_norm": 1.4045467376708984, "learning_rate": 6.337662337662338e-06, "loss": 0.1149037554860115, "memory(GiB)": 18.4, "step": 976, "token_acc": 0.9509803921568627, "train_speed(iter/s)": 1.021561 }, { "epoch": 0.03173829711204236, "grad_norm": 1.524244785308838, "learning_rate": 6.344155844155845e-06, "loss": 0.12036366015672684, "memory(GiB)": 18.4, "step": 977, "token_acc": 0.943127962085308, "train_speed(iter/s)": 1.02165 }, { "epoch": 0.031770782574797776, "grad_norm": 1.4342758655548096, "learning_rate": 6.350649350649352e-06, "loss": 0.1256023794412613, "memory(GiB)": 18.4, "step": 978, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 1.021718 }, { "epoch": 0.03180326803755319, "grad_norm": 1.7055249214172363, "learning_rate": 6.357142857142858e-06, "loss": 0.13344630599021912, "memory(GiB)": 18.4, "step": 979, "token_acc": 0.9380165289256198, "train_speed(iter/s)": 1.021812 }, { "epoch": 0.031835753500308615, "grad_norm": 1.4205269813537598, "learning_rate": 6.363636363636364e-06, "loss": 0.10992640256881714, "memory(GiB)": 18.4, "step": 980, "token_acc": 0.9657534246575342, "train_speed(iter/s)": 1.021887 }, { "epoch": 0.03186823896306403, "grad_norm": 1.8520326614379883, "learning_rate": 6.370129870129871e-06, "loss": 0.11045565456151962, "memory(GiB)": 18.4, "step": 981, "token_acc": 0.966804979253112, "train_speed(iter/s)": 1.021988 }, { "epoch": 0.03190072442581945, "grad_norm": 1.9003998041152954, "learning_rate": 6.3766233766233774e-06, "loss": 0.12418630719184875, "memory(GiB)": 18.4, "step": 982, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 1.022093 }, { "epoch": 0.031933209888574864, "grad_norm": 2.5226552486419678, "learning_rate": 6.383116883116883e-06, "loss": 0.12423547357320786, "memory(GiB)": 18.4, "step": 983, "token_acc": 0.9236947791164659, "train_speed(iter/s)": 1.022158 }, { "epoch": 0.03196569535133028, "grad_norm": 2.1297552585601807, "learning_rate": 6.38961038961039e-06, "loss": 0.12109016627073288, "memory(GiB)": 18.4, "step": 984, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 1.022235 }, { "epoch": 0.031998180814085696, "grad_norm": 2.0349647998809814, "learning_rate": 6.3961038961038964e-06, "loss": 0.12634465098381042, "memory(GiB)": 18.4, "step": 985, "token_acc": 0.924901185770751, "train_speed(iter/s)": 1.022303 }, { "epoch": 0.03203066627684111, "grad_norm": 3.1593263149261475, "learning_rate": 6.402597402597404e-06, "loss": 0.13453322649002075, "memory(GiB)": 18.4, "step": 986, "token_acc": 0.9481481481481482, "train_speed(iter/s)": 1.022371 }, { "epoch": 0.03206315173959653, "grad_norm": 2.032776117324829, "learning_rate": 6.40909090909091e-06, "loss": 0.12364798784255981, "memory(GiB)": 18.4, "step": 987, "token_acc": 0.932, "train_speed(iter/s)": 1.022457 }, { "epoch": 0.03209563720235195, "grad_norm": 1.6576926708221436, "learning_rate": 6.415584415584416e-06, "loss": 0.12147464603185654, "memory(GiB)": 18.4, "step": 988, "token_acc": 0.943609022556391, "train_speed(iter/s)": 1.022523 }, { "epoch": 0.03212812266510737, "grad_norm": 1.637753963470459, "learning_rate": 6.422077922077923e-06, "loss": 0.1193665862083435, "memory(GiB)": 18.4, "step": 989, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 1.02259 }, { "epoch": 0.03216060812786278, "grad_norm": 2.0656661987304688, "learning_rate": 6.4285714285714295e-06, "loss": 0.14289090037345886, "memory(GiB)": 18.4, "step": 990, "token_acc": 0.9497907949790795, "train_speed(iter/s)": 1.022677 }, { "epoch": 0.0321930935906182, "grad_norm": 2.236210584640503, "learning_rate": 6.435064935064935e-06, "loss": 0.1348947286605835, "memory(GiB)": 18.4, "step": 991, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 1.022735 }, { "epoch": 0.032225579053373615, "grad_norm": 1.6909993886947632, "learning_rate": 6.441558441558442e-06, "loss": 0.12970703840255737, "memory(GiB)": 18.4, "step": 992, "token_acc": 0.9435897435897436, "train_speed(iter/s)": 1.022822 }, { "epoch": 0.03225806451612903, "grad_norm": 1.1699352264404297, "learning_rate": 6.4480519480519484e-06, "loss": 0.12875957787036896, "memory(GiB)": 18.4, "step": 993, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 1.022931 }, { "epoch": 0.03229054997888445, "grad_norm": 0.854749858379364, "learning_rate": 6.454545454545456e-06, "loss": 0.10910366475582123, "memory(GiB)": 18.4, "step": 994, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 1.023008 }, { "epoch": 0.032323035441639864, "grad_norm": 1.2738475799560547, "learning_rate": 6.461038961038961e-06, "loss": 0.12143474817276001, "memory(GiB)": 18.4, "step": 995, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 1.023104 }, { "epoch": 0.03235552090439529, "grad_norm": 1.0181074142456055, "learning_rate": 6.467532467532468e-06, "loss": 0.11946876347064972, "memory(GiB)": 18.4, "step": 996, "token_acc": 0.961038961038961, "train_speed(iter/s)": 1.023186 }, { "epoch": 0.0323880063671507, "grad_norm": 1.1882433891296387, "learning_rate": 6.474025974025975e-06, "loss": 0.11545340716838837, "memory(GiB)": 18.4, "step": 997, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 1.023281 }, { "epoch": 0.03242049182990612, "grad_norm": 1.3655226230621338, "learning_rate": 6.4805194805194815e-06, "loss": 0.13432905077934265, "memory(GiB)": 18.4, "step": 998, "token_acc": 0.9448529411764706, "train_speed(iter/s)": 1.023363 }, { "epoch": 0.032452977292661535, "grad_norm": 1.4630626440048218, "learning_rate": 6.487012987012987e-06, "loss": 0.10564258694648743, "memory(GiB)": 18.4, "step": 999, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 1.023464 }, { "epoch": 0.03248546275541695, "grad_norm": 2.084989070892334, "learning_rate": 6.493506493506494e-06, "loss": 0.12281776964664459, "memory(GiB)": 18.4, "step": 1000, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 1.02355 }, { "epoch": 0.03248546275541695, "eval_loss": 0.11844995617866516, "eval_runtime": 81.3999, "eval_samples_per_second": 122.236, "eval_steps_per_second": 3.821, "eval_token_acc": 0.950461373188014, "step": 1000 }, { "epoch": 0.03251794821817237, "grad_norm": 1.7248598337173462, "learning_rate": 6.5000000000000004e-06, "loss": 0.11229832470417023, "memory(GiB)": 19.03, "step": 1001, "token_acc": 0.9510555570767504, "train_speed(iter/s)": 0.935349 }, { "epoch": 0.03255043368092778, "grad_norm": 15.222914695739746, "learning_rate": 6.506493506493508e-06, "loss": 0.10752017796039581, "memory(GiB)": 19.03, "step": 1002, "token_acc": 0.948339483394834, "train_speed(iter/s)": 0.935481 }, { "epoch": 0.0325829191436832, "grad_norm": 2.4395508766174316, "learning_rate": 6.512987012987013e-06, "loss": 0.12303078174591064, "memory(GiB)": 19.03, "step": 1003, "token_acc": 0.9543726235741445, "train_speed(iter/s)": 0.93566 }, { "epoch": 0.03261540460643862, "grad_norm": 1.9411144256591797, "learning_rate": 6.51948051948052e-06, "loss": 0.12887778878211975, "memory(GiB)": 19.03, "step": 1004, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.935808 }, { "epoch": 0.03264789006919404, "grad_norm": 1.608466625213623, "learning_rate": 6.525974025974027e-06, "loss": 0.1155504435300827, "memory(GiB)": 19.03, "step": 1005, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.935791 }, { "epoch": 0.032680375531949454, "grad_norm": 1.580984115600586, "learning_rate": 6.5324675324675335e-06, "loss": 0.12242251634597778, "memory(GiB)": 19.03, "step": 1006, "token_acc": 0.9327731092436975, "train_speed(iter/s)": 0.93597 }, { "epoch": 0.03271286099470487, "grad_norm": 2.2897109985351562, "learning_rate": 6.538961038961039e-06, "loss": 0.12377361953258514, "memory(GiB)": 19.03, "step": 1007, "token_acc": 0.95, "train_speed(iter/s)": 0.93613 }, { "epoch": 0.032745346457460286, "grad_norm": 1.8135051727294922, "learning_rate": 6.545454545454546e-06, "loss": 0.13651129603385925, "memory(GiB)": 19.03, "step": 1008, "token_acc": 0.954954954954955, "train_speed(iter/s)": 0.936269 }, { "epoch": 0.0327778319202157, "grad_norm": 1.88469398021698, "learning_rate": 6.5519480519480525e-06, "loss": 0.12441720068454742, "memory(GiB)": 19.03, "step": 1009, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 0.936434 }, { "epoch": 0.03281031738297112, "grad_norm": 2.209531545639038, "learning_rate": 6.55844155844156e-06, "loss": 0.1236550509929657, "memory(GiB)": 19.03, "step": 1010, "token_acc": 0.94140625, "train_speed(iter/s)": 0.936593 }, { "epoch": 0.032842802845726535, "grad_norm": 1.9136388301849365, "learning_rate": 6.564935064935065e-06, "loss": 0.11501000076532364, "memory(GiB)": 19.03, "step": 1011, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.936747 }, { "epoch": 0.03287528830848196, "grad_norm": 1.9766589403152466, "learning_rate": 6.571428571428572e-06, "loss": 0.10810334980487823, "memory(GiB)": 19.03, "step": 1012, "token_acc": 0.9446808510638298, "train_speed(iter/s)": 0.936932 }, { "epoch": 0.032907773771237374, "grad_norm": 1.5897953510284424, "learning_rate": 6.577922077922079e-06, "loss": 0.12330597639083862, "memory(GiB)": 19.03, "step": 1013, "token_acc": 0.9440298507462687, "train_speed(iter/s)": 0.937127 }, { "epoch": 0.03294025923399279, "grad_norm": 1.6129454374313354, "learning_rate": 6.5844155844155855e-06, "loss": 0.1311793327331543, "memory(GiB)": 19.03, "step": 1014, "token_acc": 0.9584905660377359, "train_speed(iter/s)": 0.93732 }, { "epoch": 0.032972744696748206, "grad_norm": 1.362149715423584, "learning_rate": 6.590909090909091e-06, "loss": 0.11080257594585419, "memory(GiB)": 19.03, "step": 1015, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.937511 }, { "epoch": 0.03300523015950362, "grad_norm": 1.2416139841079712, "learning_rate": 6.597402597402598e-06, "loss": 0.11952902376651764, "memory(GiB)": 19.03, "step": 1016, "token_acc": 0.9488372093023256, "train_speed(iter/s)": 0.937703 }, { "epoch": 0.03303771562225904, "grad_norm": 1.3992418050765991, "learning_rate": 6.6038961038961045e-06, "loss": 0.11653119325637817, "memory(GiB)": 19.03, "step": 1017, "token_acc": 0.9675675675675676, "train_speed(iter/s)": 0.937885 }, { "epoch": 0.033070201085014454, "grad_norm": 1.8040673732757568, "learning_rate": 6.610389610389611e-06, "loss": 0.11558365821838379, "memory(GiB)": 19.03, "step": 1018, "token_acc": 0.9536679536679536, "train_speed(iter/s)": 0.938049 }, { "epoch": 0.03310268654776987, "grad_norm": 1.9004161357879639, "learning_rate": 6.616883116883117e-06, "loss": 0.11342097073793411, "memory(GiB)": 19.03, "step": 1019, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.93819 }, { "epoch": 0.03313517201052529, "grad_norm": 1.7366111278533936, "learning_rate": 6.623376623376624e-06, "loss": 0.12085692584514618, "memory(GiB)": 19.03, "step": 1020, "token_acc": 0.9493087557603687, "train_speed(iter/s)": 0.938323 }, { "epoch": 0.03316765747328071, "grad_norm": 1.4578001499176025, "learning_rate": 6.629870129870131e-06, "loss": 0.12044530361890793, "memory(GiB)": 19.03, "step": 1021, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.938463 }, { "epoch": 0.033200142936036126, "grad_norm": 1.0991880893707275, "learning_rate": 6.6363636363636375e-06, "loss": 0.10972201079130173, "memory(GiB)": 19.03, "step": 1022, "token_acc": 0.9414893617021277, "train_speed(iter/s)": 0.938612 }, { "epoch": 0.03323262839879154, "grad_norm": 1.4227710962295532, "learning_rate": 6.642857142857143e-06, "loss": 0.10973408073186874, "memory(GiB)": 19.03, "step": 1023, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.938765 }, { "epoch": 0.03326511386154696, "grad_norm": 2.0128090381622314, "learning_rate": 6.64935064935065e-06, "loss": 0.12679670751094818, "memory(GiB)": 19.03, "step": 1024, "token_acc": 0.9399293286219081, "train_speed(iter/s)": 0.938917 }, { "epoch": 0.033297599324302374, "grad_norm": 2.5008208751678467, "learning_rate": 6.6558441558441565e-06, "loss": 0.11159087717533112, "memory(GiB)": 19.03, "step": 1025, "token_acc": 0.9484978540772532, "train_speed(iter/s)": 0.939039 }, { "epoch": 0.03333008478705779, "grad_norm": 1.4216914176940918, "learning_rate": 6.662337662337663e-06, "loss": 0.11798062920570374, "memory(GiB)": 19.03, "step": 1026, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.939201 }, { "epoch": 0.033362570249813206, "grad_norm": 1.2304306030273438, "learning_rate": 6.668831168831169e-06, "loss": 0.10411132872104645, "memory(GiB)": 19.03, "step": 1027, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.939327 }, { "epoch": 0.03339505571256863, "grad_norm": 1.5148497819900513, "learning_rate": 6.6753246753246755e-06, "loss": 0.11837862432003021, "memory(GiB)": 19.03, "step": 1028, "token_acc": 0.9567099567099567, "train_speed(iter/s)": 0.93947 }, { "epoch": 0.033427541175324045, "grad_norm": 1.8868963718414307, "learning_rate": 6.681818181818183e-06, "loss": 0.11664329469203949, "memory(GiB)": 19.03, "step": 1029, "token_acc": 0.9516908212560387, "train_speed(iter/s)": 0.939606 }, { "epoch": 0.03346002663807946, "grad_norm": 1.91621994972229, "learning_rate": 6.688311688311689e-06, "loss": 0.11846210062503815, "memory(GiB)": 19.03, "step": 1030, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.939738 }, { "epoch": 0.03349251210083488, "grad_norm": 1.424439549446106, "learning_rate": 6.694805194805195e-06, "loss": 0.12245932221412659, "memory(GiB)": 19.03, "step": 1031, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 0.939881 }, { "epoch": 0.03352499756359029, "grad_norm": 1.4566328525543213, "learning_rate": 6.701298701298702e-06, "loss": 0.11165519803762436, "memory(GiB)": 19.03, "step": 1032, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.940003 }, { "epoch": 0.03355748302634571, "grad_norm": 1.2540500164031982, "learning_rate": 6.7077922077922085e-06, "loss": 0.11152638494968414, "memory(GiB)": 19.03, "step": 1033, "token_acc": 0.95, "train_speed(iter/s)": 0.940138 }, { "epoch": 0.033589968489101125, "grad_norm": 1.3932018280029297, "learning_rate": 6.714285714285714e-06, "loss": 0.10631990432739258, "memory(GiB)": 19.03, "step": 1034, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.940302 }, { "epoch": 0.03362245395185654, "grad_norm": 1.6564801931381226, "learning_rate": 6.720779220779221e-06, "loss": 0.1270538866519928, "memory(GiB)": 19.03, "step": 1035, "token_acc": 0.96, "train_speed(iter/s)": 0.940453 }, { "epoch": 0.033654939414611965, "grad_norm": 1.3469147682189941, "learning_rate": 6.7272727272727275e-06, "loss": 0.1227177157998085, "memory(GiB)": 19.03, "step": 1036, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.940599 }, { "epoch": 0.03368742487736738, "grad_norm": 2.6516215801239014, "learning_rate": 6.733766233766235e-06, "loss": 0.11921602487564087, "memory(GiB)": 19.03, "step": 1037, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.940745 }, { "epoch": 0.0337199103401228, "grad_norm": 1.32436203956604, "learning_rate": 6.740259740259741e-06, "loss": 0.12822136282920837, "memory(GiB)": 19.03, "step": 1038, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.940876 }, { "epoch": 0.03375239580287821, "grad_norm": 0.8142095804214478, "learning_rate": 6.746753246753247e-06, "loss": 0.11175130307674408, "memory(GiB)": 19.03, "step": 1039, "token_acc": 0.9446640316205533, "train_speed(iter/s)": 0.941026 }, { "epoch": 0.03378488126563363, "grad_norm": 1.0585323572158813, "learning_rate": 6.753246753246754e-06, "loss": 0.11905161291360855, "memory(GiB)": 19.03, "step": 1040, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.941189 }, { "epoch": 0.033817366728389045, "grad_norm": 1.3498624563217163, "learning_rate": 6.7597402597402605e-06, "loss": 0.11492438614368439, "memory(GiB)": 19.03, "step": 1041, "token_acc": 0.9506726457399103, "train_speed(iter/s)": 0.94134 }, { "epoch": 0.03384985219114446, "grad_norm": 1.7162607908248901, "learning_rate": 6.766233766233766e-06, "loss": 0.11617793142795563, "memory(GiB)": 19.03, "step": 1042, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.941507 }, { "epoch": 0.03388233765389988, "grad_norm": 1.482121467590332, "learning_rate": 6.772727272727273e-06, "loss": 0.11646448075771332, "memory(GiB)": 19.03, "step": 1043, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.941661 }, { "epoch": 0.0339148231166553, "grad_norm": 1.390842318534851, "learning_rate": 6.7792207792207795e-06, "loss": 0.11287765949964523, "memory(GiB)": 19.03, "step": 1044, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.941812 }, { "epoch": 0.033947308579410716, "grad_norm": 1.409623146057129, "learning_rate": 6.785714285714287e-06, "loss": 0.11783687025308609, "memory(GiB)": 19.03, "step": 1045, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.941963 }, { "epoch": 0.03397979404216613, "grad_norm": 1.504737138748169, "learning_rate": 6.792207792207792e-06, "loss": 0.1089053601026535, "memory(GiB)": 19.03, "step": 1046, "token_acc": 0.9590163934426229, "train_speed(iter/s)": 0.942097 }, { "epoch": 0.03401227950492155, "grad_norm": 3.265803337097168, "learning_rate": 6.798701298701299e-06, "loss": 0.1215028464794159, "memory(GiB)": 19.03, "step": 1047, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.942228 }, { "epoch": 0.034044764967676965, "grad_norm": 1.292349100112915, "learning_rate": 6.805194805194806e-06, "loss": 0.13711747527122498, "memory(GiB)": 19.03, "step": 1048, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.942384 }, { "epoch": 0.03407725043043238, "grad_norm": 3.3804328441619873, "learning_rate": 6.8116883116883125e-06, "loss": 0.11265374720096588, "memory(GiB)": 19.03, "step": 1049, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.942531 }, { "epoch": 0.0341097358931878, "grad_norm": 1.0905207395553589, "learning_rate": 6.818181818181818e-06, "loss": 0.12392029911279678, "memory(GiB)": 19.03, "step": 1050, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.942698 }, { "epoch": 0.03414222135594321, "grad_norm": 2.252791404724121, "learning_rate": 6.824675324675325e-06, "loss": 0.13237503170967102, "memory(GiB)": 19.03, "step": 1051, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.942902 }, { "epoch": 0.034174706818698636, "grad_norm": 1.8417389392852783, "learning_rate": 6.8311688311688315e-06, "loss": 0.11708051711320877, "memory(GiB)": 19.03, "step": 1052, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.943102 }, { "epoch": 0.03420719228145405, "grad_norm": 1.0276213884353638, "learning_rate": 6.837662337662339e-06, "loss": 0.11564689874649048, "memory(GiB)": 19.03, "step": 1053, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.943284 }, { "epoch": 0.03423967774420947, "grad_norm": 4.746508598327637, "learning_rate": 6.844155844155844e-06, "loss": 0.10691121965646744, "memory(GiB)": 19.03, "step": 1054, "token_acc": 0.9458333333333333, "train_speed(iter/s)": 0.943476 }, { "epoch": 0.034272163206964884, "grad_norm": 1.0758601427078247, "learning_rate": 6.850649350649351e-06, "loss": 0.11857870221138, "memory(GiB)": 19.03, "step": 1055, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.943655 }, { "epoch": 0.0343046486697203, "grad_norm": 1.7648966312408447, "learning_rate": 6.857142857142858e-06, "loss": 0.10428984463214874, "memory(GiB)": 19.03, "step": 1056, "token_acc": 0.9314516129032258, "train_speed(iter/s)": 0.943841 }, { "epoch": 0.034337134132475716, "grad_norm": 1.2508293390274048, "learning_rate": 6.8636363636363645e-06, "loss": 0.11585046350955963, "memory(GiB)": 19.03, "step": 1057, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.944048 }, { "epoch": 0.03436961959523113, "grad_norm": 3.1175708770751953, "learning_rate": 6.87012987012987e-06, "loss": 0.1095009595155716, "memory(GiB)": 19.03, "step": 1058, "token_acc": 0.9597069597069597, "train_speed(iter/s)": 0.944245 }, { "epoch": 0.03440210505798655, "grad_norm": 1.7535265684127808, "learning_rate": 6.876623376623377e-06, "loss": 0.12682634592056274, "memory(GiB)": 19.03, "step": 1059, "token_acc": 0.965, "train_speed(iter/s)": 0.944429 }, { "epoch": 0.03443459052074197, "grad_norm": 1.1695144176483154, "learning_rate": 6.8831168831168835e-06, "loss": 0.11376123875379562, "memory(GiB)": 19.03, "step": 1060, "token_acc": 0.9392523364485982, "train_speed(iter/s)": 0.944603 }, { "epoch": 0.03446707598349739, "grad_norm": 1.3929080963134766, "learning_rate": 6.88961038961039e-06, "loss": 0.13037364184856415, "memory(GiB)": 19.03, "step": 1061, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.944784 }, { "epoch": 0.034499561446252804, "grad_norm": 1.3049405813217163, "learning_rate": 6.896103896103896e-06, "loss": 0.09866338223218918, "memory(GiB)": 19.03, "step": 1062, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.944966 }, { "epoch": 0.03453204690900822, "grad_norm": 1.2349286079406738, "learning_rate": 6.902597402597403e-06, "loss": 0.10837484896183014, "memory(GiB)": 19.03, "step": 1063, "token_acc": 0.9484126984126984, "train_speed(iter/s)": 0.945156 }, { "epoch": 0.034564532371763636, "grad_norm": 1.9093983173370361, "learning_rate": 6.90909090909091e-06, "loss": 0.11501003056764603, "memory(GiB)": 19.03, "step": 1064, "token_acc": 0.944, "train_speed(iter/s)": 0.945337 }, { "epoch": 0.03459701783451905, "grad_norm": 2.8312203884124756, "learning_rate": 6.9155844155844165e-06, "loss": 0.1135258898139, "memory(GiB)": 19.03, "step": 1065, "token_acc": 0.937037037037037, "train_speed(iter/s)": 0.945521 }, { "epoch": 0.03462950329727447, "grad_norm": 1.6568280458450317, "learning_rate": 6.922077922077922e-06, "loss": 0.11156360805034637, "memory(GiB)": 19.03, "step": 1066, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.945695 }, { "epoch": 0.034661988760029884, "grad_norm": 1.1495122909545898, "learning_rate": 6.928571428571429e-06, "loss": 0.11965097486972809, "memory(GiB)": 19.03, "step": 1067, "token_acc": 0.935361216730038, "train_speed(iter/s)": 0.945867 }, { "epoch": 0.03469447422278531, "grad_norm": 4.501299858093262, "learning_rate": 6.9350649350649355e-06, "loss": 0.1296221762895584, "memory(GiB)": 19.03, "step": 1068, "token_acc": 0.9453781512605042, "train_speed(iter/s)": 0.946034 }, { "epoch": 0.03472695968554072, "grad_norm": 3.440688371658325, "learning_rate": 6.941558441558442e-06, "loss": 0.13661089539527893, "memory(GiB)": 19.03, "step": 1069, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.946209 }, { "epoch": 0.03475944514829614, "grad_norm": 2.875985622406006, "learning_rate": 6.948051948051948e-06, "loss": 0.14225511252880096, "memory(GiB)": 19.03, "step": 1070, "token_acc": 0.9439252336448598, "train_speed(iter/s)": 0.946366 }, { "epoch": 0.034791930611051555, "grad_norm": 2.3241543769836426, "learning_rate": 6.954545454545455e-06, "loss": 0.1268131136894226, "memory(GiB)": 19.03, "step": 1071, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.946541 }, { "epoch": 0.03482441607380697, "grad_norm": 1.339053750038147, "learning_rate": 6.961038961038962e-06, "loss": 0.12970086932182312, "memory(GiB)": 19.03, "step": 1072, "token_acc": 0.9494163424124513, "train_speed(iter/s)": 0.946727 }, { "epoch": 0.03485690153656239, "grad_norm": 2.001800298690796, "learning_rate": 6.9675324675324686e-06, "loss": 0.12775015830993652, "memory(GiB)": 19.03, "step": 1073, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.946899 }, { "epoch": 0.034889386999317804, "grad_norm": 1.3145923614501953, "learning_rate": 6.974025974025974e-06, "loss": 0.13165748119354248, "memory(GiB)": 19.03, "step": 1074, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.947083 }, { "epoch": 0.03492187246207322, "grad_norm": 1.9042952060699463, "learning_rate": 6.980519480519481e-06, "loss": 0.12877489626407623, "memory(GiB)": 19.03, "step": 1075, "token_acc": 0.9409448818897638, "train_speed(iter/s)": 0.947261 }, { "epoch": 0.03495435792482864, "grad_norm": 1.3962857723236084, "learning_rate": 6.9870129870129875e-06, "loss": 0.11862422525882721, "memory(GiB)": 19.03, "step": 1076, "token_acc": 0.93359375, "train_speed(iter/s)": 0.947415 }, { "epoch": 0.03498684338758406, "grad_norm": 1.4636625051498413, "learning_rate": 6.993506493506494e-06, "loss": 0.11622898280620575, "memory(GiB)": 19.03, "step": 1077, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.947549 }, { "epoch": 0.035019328850339475, "grad_norm": 2.1368930339813232, "learning_rate": 7e-06, "loss": 0.12339451909065247, "memory(GiB)": 19.03, "step": 1078, "token_acc": 0.9520295202952029, "train_speed(iter/s)": 0.947686 }, { "epoch": 0.03505181431309489, "grad_norm": 1.303528070449829, "learning_rate": 7.0064935064935065e-06, "loss": 0.11736946552991867, "memory(GiB)": 19.03, "step": 1079, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.947819 }, { "epoch": 0.03508429977585031, "grad_norm": 2.3244662284851074, "learning_rate": 7.012987012987014e-06, "loss": 0.12908078730106354, "memory(GiB)": 19.03, "step": 1080, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.947966 }, { "epoch": 0.03511678523860572, "grad_norm": 1.7633711099624634, "learning_rate": 7.0194805194805206e-06, "loss": 0.1169075220823288, "memory(GiB)": 19.03, "step": 1081, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.94811 }, { "epoch": 0.03514927070136114, "grad_norm": 1.4691828489303589, "learning_rate": 7.025974025974026e-06, "loss": 0.14050322771072388, "memory(GiB)": 19.03, "step": 1082, "token_acc": 0.9543859649122807, "train_speed(iter/s)": 0.948243 }, { "epoch": 0.035181756164116555, "grad_norm": 1.328232765197754, "learning_rate": 7.032467532467533e-06, "loss": 0.11412632465362549, "memory(GiB)": 19.03, "step": 1083, "token_acc": 0.9518518518518518, "train_speed(iter/s)": 0.948384 }, { "epoch": 0.03521424162687198, "grad_norm": 1.126452088356018, "learning_rate": 7.0389610389610395e-06, "loss": 0.11711500585079193, "memory(GiB)": 19.03, "step": 1084, "token_acc": 0.96, "train_speed(iter/s)": 0.948487 }, { "epoch": 0.035246727089627394, "grad_norm": 1.4965966939926147, "learning_rate": 7.045454545454546e-06, "loss": 0.1045278012752533, "memory(GiB)": 19.03, "step": 1085, "token_acc": 0.9408866995073891, "train_speed(iter/s)": 0.94861 }, { "epoch": 0.03527921255238281, "grad_norm": 1.7811970710754395, "learning_rate": 7.051948051948052e-06, "loss": 0.10844597220420837, "memory(GiB)": 19.03, "step": 1086, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.948754 }, { "epoch": 0.035311698015138226, "grad_norm": 1.678468942642212, "learning_rate": 7.0584415584415585e-06, "loss": 0.1152379959821701, "memory(GiB)": 19.03, "step": 1087, "token_acc": 0.9539748953974896, "train_speed(iter/s)": 0.948894 }, { "epoch": 0.03534418347789364, "grad_norm": 1.69158136844635, "learning_rate": 7.064935064935066e-06, "loss": 0.11298082768917084, "memory(GiB)": 19.03, "step": 1088, "token_acc": 0.9458128078817734, "train_speed(iter/s)": 0.949012 }, { "epoch": 0.03537666894064906, "grad_norm": 1.153485655784607, "learning_rate": 7.0714285714285726e-06, "loss": 0.1015150249004364, "memory(GiB)": 19.03, "step": 1089, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.94914 }, { "epoch": 0.035409154403404475, "grad_norm": 1.3033716678619385, "learning_rate": 7.077922077922078e-06, "loss": 0.12413538247346878, "memory(GiB)": 19.03, "step": 1090, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.949257 }, { "epoch": 0.03544163986615989, "grad_norm": 1.726553201675415, "learning_rate": 7.084415584415585e-06, "loss": 0.14141057431697845, "memory(GiB)": 19.03, "step": 1091, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.949386 }, { "epoch": 0.035474125328915314, "grad_norm": 2.066311836242676, "learning_rate": 7.0909090909090916e-06, "loss": 0.13763223588466644, "memory(GiB)": 19.03, "step": 1092, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.949503 }, { "epoch": 0.03550661079167073, "grad_norm": 1.0294257402420044, "learning_rate": 7.097402597402598e-06, "loss": 0.10727710276842117, "memory(GiB)": 19.03, "step": 1093, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.949613 }, { "epoch": 0.035539096254426146, "grad_norm": 0.906927227973938, "learning_rate": 7.103896103896104e-06, "loss": 0.1034620851278305, "memory(GiB)": 19.03, "step": 1094, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.949755 }, { "epoch": 0.03557158171718156, "grad_norm": 3.7684543132781982, "learning_rate": 7.1103896103896105e-06, "loss": 0.11856289952993393, "memory(GiB)": 19.03, "step": 1095, "token_acc": 0.9509803921568627, "train_speed(iter/s)": 0.949885 }, { "epoch": 0.03560406717993698, "grad_norm": 1.2297096252441406, "learning_rate": 7.116883116883118e-06, "loss": 0.10223296284675598, "memory(GiB)": 19.03, "step": 1096, "token_acc": 0.9240506329113924, "train_speed(iter/s)": 0.950021 }, { "epoch": 0.035636552642692394, "grad_norm": 1.2456644773483276, "learning_rate": 7.123376623376625e-06, "loss": 0.10970111936330795, "memory(GiB)": 19.03, "step": 1097, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.950156 }, { "epoch": 0.03566903810544781, "grad_norm": 1.8228696584701538, "learning_rate": 7.12987012987013e-06, "loss": 0.1163945198059082, "memory(GiB)": 19.03, "step": 1098, "token_acc": 0.9482071713147411, "train_speed(iter/s)": 0.950295 }, { "epoch": 0.035701523568203226, "grad_norm": 1.4375075101852417, "learning_rate": 7.136363636363637e-06, "loss": 0.1227368637919426, "memory(GiB)": 19.03, "step": 1099, "token_acc": 0.964, "train_speed(iter/s)": 0.950414 }, { "epoch": 0.03573400903095865, "grad_norm": 1.2361490726470947, "learning_rate": 7.1428571428571436e-06, "loss": 0.11065907031297684, "memory(GiB)": 19.03, "step": 1100, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.950556 }, { "epoch": 0.035766494493714066, "grad_norm": 1.1875927448272705, "learning_rate": 7.14935064935065e-06, "loss": 0.11835463345050812, "memory(GiB)": 19.03, "step": 1101, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.950685 }, { "epoch": 0.03579897995646948, "grad_norm": 1.3283405303955078, "learning_rate": 7.155844155844156e-06, "loss": 0.10582318902015686, "memory(GiB)": 19.03, "step": 1102, "token_acc": 0.9401709401709402, "train_speed(iter/s)": 0.950825 }, { "epoch": 0.0358314654192249, "grad_norm": 1.7098917961120605, "learning_rate": 7.1623376623376625e-06, "loss": 0.11799848824739456, "memory(GiB)": 19.03, "step": 1103, "token_acc": 0.9377593360995851, "train_speed(iter/s)": 0.950982 }, { "epoch": 0.035863950881980314, "grad_norm": 1.7553770542144775, "learning_rate": 7.16883116883117e-06, "loss": 0.10921621322631836, "memory(GiB)": 19.03, "step": 1104, "token_acc": 0.9437751004016064, "train_speed(iter/s)": 0.951134 }, { "epoch": 0.03589643634473573, "grad_norm": 1.191877841949463, "learning_rate": 7.175324675324677e-06, "loss": 0.11940602958202362, "memory(GiB)": 19.03, "step": 1105, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.951283 }, { "epoch": 0.035928921807491146, "grad_norm": 1.0339078903198242, "learning_rate": 7.181818181818182e-06, "loss": 0.10930871218442917, "memory(GiB)": 19.03, "step": 1106, "token_acc": 0.9395348837209302, "train_speed(iter/s)": 0.951431 }, { "epoch": 0.03596140727024656, "grad_norm": 1.1299430131912231, "learning_rate": 7.188311688311689e-06, "loss": 0.1118626818060875, "memory(GiB)": 19.03, "step": 1107, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.951549 }, { "epoch": 0.035993892733001985, "grad_norm": 1.370383858680725, "learning_rate": 7.1948051948051956e-06, "loss": 0.1185707077383995, "memory(GiB)": 19.03, "step": 1108, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.951682 }, { "epoch": 0.0360263781957574, "grad_norm": 0.8709391355514526, "learning_rate": 7.201298701298702e-06, "loss": 0.12320475280284882, "memory(GiB)": 19.03, "step": 1109, "token_acc": 0.9313304721030042, "train_speed(iter/s)": 0.951806 }, { "epoch": 0.03605886365851282, "grad_norm": 1.0709998607635498, "learning_rate": 7.207792207792208e-06, "loss": 0.1337585747241974, "memory(GiB)": 19.03, "step": 1110, "token_acc": 0.9485294117647058, "train_speed(iter/s)": 0.951931 }, { "epoch": 0.03609134912126823, "grad_norm": 1.00570547580719, "learning_rate": 7.2142857142857145e-06, "loss": 0.11450836062431335, "memory(GiB)": 19.03, "step": 1111, "token_acc": 0.946058091286307, "train_speed(iter/s)": 0.952085 }, { "epoch": 0.03612383458402365, "grad_norm": 1.1042875051498413, "learning_rate": 7.220779220779221e-06, "loss": 0.10864148288965225, "memory(GiB)": 19.03, "step": 1112, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.952233 }, { "epoch": 0.036156320046779065, "grad_norm": 1.6274917125701904, "learning_rate": 7.227272727272729e-06, "loss": 0.11544129997491837, "memory(GiB)": 19.03, "step": 1113, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.952397 }, { "epoch": 0.03618880550953448, "grad_norm": 1.3566638231277466, "learning_rate": 7.233766233766234e-06, "loss": 0.11179506778717041, "memory(GiB)": 19.03, "step": 1114, "token_acc": 0.9458128078817734, "train_speed(iter/s)": 0.952557 }, { "epoch": 0.0362212909722899, "grad_norm": 1.0425900220870972, "learning_rate": 7.240259740259741e-06, "loss": 0.11775851994752884, "memory(GiB)": 19.03, "step": 1115, "token_acc": 0.9344262295081968, "train_speed(iter/s)": 0.95271 }, { "epoch": 0.03625377643504532, "grad_norm": 2.301769256591797, "learning_rate": 7.246753246753248e-06, "loss": 0.1216004341840744, "memory(GiB)": 19.03, "step": 1116, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.03628626189780074, "grad_norm": 1.0564719438552856, "learning_rate": 7.253246753246754e-06, "loss": 0.12631180882453918, "memory(GiB)": 19.03, "step": 1117, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.953049 }, { "epoch": 0.03631874736055615, "grad_norm": 1.4556406736373901, "learning_rate": 7.25974025974026e-06, "loss": 0.13685886561870575, "memory(GiB)": 19.03, "step": 1118, "token_acc": 0.9365853658536586, "train_speed(iter/s)": 0.953208 }, { "epoch": 0.03635123282331157, "grad_norm": 1.2216891050338745, "learning_rate": 7.2662337662337666e-06, "loss": 0.11174248158931732, "memory(GiB)": 19.03, "step": 1119, "token_acc": 0.9442231075697212, "train_speed(iter/s)": 0.953369 }, { "epoch": 0.036383718286066985, "grad_norm": 0.7913638353347778, "learning_rate": 7.272727272727273e-06, "loss": 0.10963559150695801, "memory(GiB)": 19.03, "step": 1120, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.953533 }, { "epoch": 0.0364162037488224, "grad_norm": 0.8853463530540466, "learning_rate": 7.279220779220781e-06, "loss": 0.11753085255622864, "memory(GiB)": 19.03, "step": 1121, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 0.953706 }, { "epoch": 0.03644868921157782, "grad_norm": 1.0672838687896729, "learning_rate": 7.285714285714286e-06, "loss": 0.10927227139472961, "memory(GiB)": 19.03, "step": 1122, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.953865 }, { "epoch": 0.03648117467433323, "grad_norm": 4.376131534576416, "learning_rate": 7.292207792207793e-06, "loss": 0.1152401715517044, "memory(GiB)": 19.03, "step": 1123, "token_acc": 0.94, "train_speed(iter/s)": 0.95404 }, { "epoch": 0.036513660137088656, "grad_norm": 1.0029730796813965, "learning_rate": 7.2987012987013e-06, "loss": 0.11621716618537903, "memory(GiB)": 19.03, "step": 1124, "token_acc": 0.9446808510638298, "train_speed(iter/s)": 0.954199 }, { "epoch": 0.03654614559984407, "grad_norm": 0.8468559384346008, "learning_rate": 7.305194805194806e-06, "loss": 0.11988602578639984, "memory(GiB)": 19.03, "step": 1125, "token_acc": 0.9391304347826087, "train_speed(iter/s)": 0.95436 }, { "epoch": 0.03657863106259949, "grad_norm": 0.9134792685508728, "learning_rate": 7.311688311688312e-06, "loss": 0.100124292075634, "memory(GiB)": 19.03, "step": 1126, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.954531 }, { "epoch": 0.036611116525354905, "grad_norm": 2.3263099193573, "learning_rate": 7.3181818181818186e-06, "loss": 0.12124033272266388, "memory(GiB)": 19.03, "step": 1127, "token_acc": 0.9369369369369369, "train_speed(iter/s)": 0.9547 }, { "epoch": 0.03664360198811032, "grad_norm": 1.1740325689315796, "learning_rate": 7.324675324675325e-06, "loss": 0.11357711255550385, "memory(GiB)": 19.03, "step": 1128, "token_acc": 0.96875, "train_speed(iter/s)": 0.954845 }, { "epoch": 0.03667608745086574, "grad_norm": 0.9864575266838074, "learning_rate": 7.331168831168833e-06, "loss": 0.10686098784208298, "memory(GiB)": 19.03, "step": 1129, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 0.95501 }, { "epoch": 0.03670857291362115, "grad_norm": 1.3510215282440186, "learning_rate": 7.3376623376623375e-06, "loss": 0.117940753698349, "memory(GiB)": 19.03, "step": 1130, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.955168 }, { "epoch": 0.03674105837637657, "grad_norm": 2.9425439834594727, "learning_rate": 7.344155844155845e-06, "loss": 0.10330928862094879, "memory(GiB)": 19.03, "step": 1131, "token_acc": 0.9403669724770642, "train_speed(iter/s)": 0.95533 }, { "epoch": 0.03677354383913199, "grad_norm": 1.1454411745071411, "learning_rate": 7.350649350649352e-06, "loss": 0.10861650109291077, "memory(GiB)": 19.03, "step": 1132, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.03680602930188741, "grad_norm": 1.2347067594528198, "learning_rate": 7.357142857142858e-06, "loss": 0.10914459824562073, "memory(GiB)": 19.03, "step": 1133, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.036838514764642824, "grad_norm": 2.0335159301757812, "learning_rate": 7.363636363636364e-06, "loss": 0.10770872235298157, "memory(GiB)": 19.03, "step": 1134, "token_acc": 0.9422222222222222, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.03687100022739824, "grad_norm": 2.857877254486084, "learning_rate": 7.370129870129871e-06, "loss": 0.09573137015104294, "memory(GiB)": 19.03, "step": 1135, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.95584 }, { "epoch": 0.036903485690153656, "grad_norm": 1.9056763648986816, "learning_rate": 7.376623376623377e-06, "loss": 0.11132442951202393, "memory(GiB)": 19.03, "step": 1136, "token_acc": 0.9515418502202643, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.03693597115290907, "grad_norm": 1.9218422174453735, "learning_rate": 7.383116883116885e-06, "loss": 0.11487458646297455, "memory(GiB)": 19.03, "step": 1137, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.03696845661566449, "grad_norm": 1.5413212776184082, "learning_rate": 7.3896103896103896e-06, "loss": 0.11155030876398087, "memory(GiB)": 19.03, "step": 1138, "token_acc": 0.937007874015748, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.037000942078419904, "grad_norm": 1.2977197170257568, "learning_rate": 7.396103896103897e-06, "loss": 0.11966560035943985, "memory(GiB)": 19.03, "step": 1139, "token_acc": 0.9395161290322581, "train_speed(iter/s)": 0.956313 }, { "epoch": 0.03703342754117533, "grad_norm": 1.457398533821106, "learning_rate": 7.402597402597404e-06, "loss": 0.12163826823234558, "memory(GiB)": 19.03, "step": 1140, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.956447 }, { "epoch": 0.037065913003930744, "grad_norm": 2.6305840015411377, "learning_rate": 7.40909090909091e-06, "loss": 0.12045927345752716, "memory(GiB)": 19.03, "step": 1141, "token_acc": 0.9441860465116279, "train_speed(iter/s)": 0.95655 }, { "epoch": 0.03709839846668616, "grad_norm": 0.9489102959632874, "learning_rate": 7.415584415584416e-06, "loss": 0.10248204320669174, "memory(GiB)": 19.03, "step": 1142, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.037130883929441576, "grad_norm": 1.2376447916030884, "learning_rate": 7.422077922077923e-06, "loss": 0.11060120910406113, "memory(GiB)": 19.03, "step": 1143, "token_acc": 0.9431818181818182, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.03716336939219699, "grad_norm": 1.1711493730545044, "learning_rate": 7.428571428571429e-06, "loss": 0.11502514779567719, "memory(GiB)": 19.03, "step": 1144, "token_acc": 0.9575971731448764, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.03719585485495241, "grad_norm": 1.6105867624282837, "learning_rate": 7.435064935064936e-06, "loss": 0.11690941452980042, "memory(GiB)": 19.03, "step": 1145, "token_acc": 0.9461883408071748, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.037228340317707824, "grad_norm": 0.9903091788291931, "learning_rate": 7.4415584415584416e-06, "loss": 0.10622458159923553, "memory(GiB)": 19.03, "step": 1146, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.03726082578046324, "grad_norm": 0.8823435306549072, "learning_rate": 7.448051948051949e-06, "loss": 0.1220688670873642, "memory(GiB)": 19.03, "step": 1147, "token_acc": 0.9360730593607306, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.03729331124321866, "grad_norm": 1.0301389694213867, "learning_rate": 7.454545454545456e-06, "loss": 0.10172637552022934, "memory(GiB)": 19.03, "step": 1148, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.03732579670597408, "grad_norm": 1.5351989269256592, "learning_rate": 7.461038961038962e-06, "loss": 0.10759841650724411, "memory(GiB)": 19.03, "step": 1149, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.95756 }, { "epoch": 0.037358282168729495, "grad_norm": 0.8982598185539246, "learning_rate": 7.467532467532468e-06, "loss": 0.10792499780654907, "memory(GiB)": 19.03, "step": 1150, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.03739076763148491, "grad_norm": 0.9806874394416809, "learning_rate": 7.474025974025975e-06, "loss": 0.094109907746315, "memory(GiB)": 19.03, "step": 1151, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.03742325309424033, "grad_norm": 1.277650237083435, "learning_rate": 7.480519480519481e-06, "loss": 0.11336036026477814, "memory(GiB)": 19.03, "step": 1152, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.957971 }, { "epoch": 0.037455738556995744, "grad_norm": 1.5479389429092407, "learning_rate": 7.487012987012988e-06, "loss": 0.11169306188821793, "memory(GiB)": 19.03, "step": 1153, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.958096 }, { "epoch": 0.03748822401975116, "grad_norm": 1.1119831800460815, "learning_rate": 7.493506493506494e-06, "loss": 0.10012218356132507, "memory(GiB)": 19.03, "step": 1154, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.037520709482506576, "grad_norm": 1.5988836288452148, "learning_rate": 7.500000000000001e-06, "loss": 0.11748024821281433, "memory(GiB)": 19.03, "step": 1155, "token_acc": 0.9453125, "train_speed(iter/s)": 0.95836 }, { "epoch": 0.037553194945262, "grad_norm": 1.2175673246383667, "learning_rate": 7.506493506493508e-06, "loss": 0.11584510654211044, "memory(GiB)": 19.03, "step": 1156, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.958468 }, { "epoch": 0.037585680408017415, "grad_norm": 1.3959815502166748, "learning_rate": 7.512987012987013e-06, "loss": 0.09997446835041046, "memory(GiB)": 19.03, "step": 1157, "token_acc": 0.954225352112676, "train_speed(iter/s)": 0.958611 }, { "epoch": 0.03761816587077283, "grad_norm": 1.0680618286132812, "learning_rate": 7.51948051948052e-06, "loss": 0.11293046921491623, "memory(GiB)": 19.03, "step": 1158, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.95873 }, { "epoch": 0.03765065133352825, "grad_norm": 1.1525390148162842, "learning_rate": 7.525974025974027e-06, "loss": 0.11434592306613922, "memory(GiB)": 19.03, "step": 1159, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.958843 }, { "epoch": 0.03768313679628366, "grad_norm": 1.7250590324401855, "learning_rate": 7.532467532467533e-06, "loss": 0.12366705387830734, "memory(GiB)": 19.03, "step": 1160, "token_acc": 0.9471698113207547, "train_speed(iter/s)": 0.958965 }, { "epoch": 0.03771562225903908, "grad_norm": 1.0645697116851807, "learning_rate": 7.538961038961039e-06, "loss": 0.12180925160646439, "memory(GiB)": 19.03, "step": 1161, "token_acc": 0.9303135888501742, "train_speed(iter/s)": 0.959091 }, { "epoch": 0.037748107721794495, "grad_norm": 5.076265811920166, "learning_rate": 7.545454545454546e-06, "loss": 0.10364590585231781, "memory(GiB)": 19.03, "step": 1162, "token_acc": 0.9485981308411215, "train_speed(iter/s)": 0.959215 }, { "epoch": 0.03778059318454991, "grad_norm": 1.7488789558410645, "learning_rate": 7.551948051948052e-06, "loss": 0.11656937003135681, "memory(GiB)": 19.03, "step": 1163, "token_acc": 0.9719626168224299, "train_speed(iter/s)": 0.959346 }, { "epoch": 0.037813078647305334, "grad_norm": 1.6911603212356567, "learning_rate": 7.55844155844156e-06, "loss": 0.12255531549453735, "memory(GiB)": 19.03, "step": 1164, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.95947 }, { "epoch": 0.03784556411006075, "grad_norm": 1.1356900930404663, "learning_rate": 7.564935064935065e-06, "loss": 0.11980679631233215, "memory(GiB)": 19.03, "step": 1165, "token_acc": 0.9388646288209607, "train_speed(iter/s)": 0.959609 }, { "epoch": 0.037878049572816166, "grad_norm": 1.4629344940185547, "learning_rate": 7.571428571428572e-06, "loss": 0.12747275829315186, "memory(GiB)": 19.03, "step": 1166, "token_acc": 0.9365671641791045, "train_speed(iter/s)": 0.95975 }, { "epoch": 0.03791053503557158, "grad_norm": 1.0561721324920654, "learning_rate": 7.577922077922079e-06, "loss": 0.10470512509346008, "memory(GiB)": 19.03, "step": 1167, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.959871 }, { "epoch": 0.037943020498327, "grad_norm": 1.0247182846069336, "learning_rate": 7.584415584415585e-06, "loss": 0.11374790221452713, "memory(GiB)": 19.03, "step": 1168, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.959982 }, { "epoch": 0.037975505961082415, "grad_norm": 0.9629071950912476, "learning_rate": 7.590909090909091e-06, "loss": 0.09959396719932556, "memory(GiB)": 19.03, "step": 1169, "token_acc": 0.96, "train_speed(iter/s)": 0.960105 }, { "epoch": 0.03800799142383783, "grad_norm": 0.834243893623352, "learning_rate": 7.597402597402598e-06, "loss": 0.11797137558460236, "memory(GiB)": 19.03, "step": 1170, "token_acc": 0.9560439560439561, "train_speed(iter/s)": 0.960216 }, { "epoch": 0.03804047688659325, "grad_norm": 1.023325800895691, "learning_rate": 7.603896103896104e-06, "loss": 0.11047108471393585, "memory(GiB)": 19.03, "step": 1171, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.960336 }, { "epoch": 0.03807296234934867, "grad_norm": 1.1453795433044434, "learning_rate": 7.610389610389612e-06, "loss": 0.11892938613891602, "memory(GiB)": 19.03, "step": 1172, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.960477 }, { "epoch": 0.038105447812104086, "grad_norm": 1.6427394151687622, "learning_rate": 7.616883116883117e-06, "loss": 0.10226384550333023, "memory(GiB)": 19.03, "step": 1173, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.960632 }, { "epoch": 0.0381379332748595, "grad_norm": 1.1539584398269653, "learning_rate": 7.623376623376624e-06, "loss": 0.09236869215965271, "memory(GiB)": 19.03, "step": 1174, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.960795 }, { "epoch": 0.03817041873761492, "grad_norm": 1.0958149433135986, "learning_rate": 7.62987012987013e-06, "loss": 0.10674478113651276, "memory(GiB)": 19.03, "step": 1175, "token_acc": 0.9312977099236641, "train_speed(iter/s)": 0.960933 }, { "epoch": 0.038202904200370334, "grad_norm": 1.1119860410690308, "learning_rate": 7.636363636363638e-06, "loss": 0.10889281332492828, "memory(GiB)": 19.03, "step": 1176, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.961087 }, { "epoch": 0.03823538966312575, "grad_norm": 2.3132376670837402, "learning_rate": 7.642857142857143e-06, "loss": 0.11238151788711548, "memory(GiB)": 19.03, "step": 1177, "token_acc": 0.9442231075697212, "train_speed(iter/s)": 0.961247 }, { "epoch": 0.038267875125881166, "grad_norm": 1.8683525323867798, "learning_rate": 7.64935064935065e-06, "loss": 0.13248932361602783, "memory(GiB)": 19.03, "step": 1178, "token_acc": 0.9393939393939394, "train_speed(iter/s)": 0.961396 }, { "epoch": 0.03830036058863658, "grad_norm": 5.020341873168945, "learning_rate": 7.655844155844156e-06, "loss": 0.11210963129997253, "memory(GiB)": 19.03, "step": 1179, "token_acc": 0.9497206703910615, "train_speed(iter/s)": 0.961545 }, { "epoch": 0.038332846051392006, "grad_norm": 2.35825252532959, "learning_rate": 7.662337662337663e-06, "loss": 0.12097223103046417, "memory(GiB)": 19.03, "step": 1180, "token_acc": 0.9271255060728745, "train_speed(iter/s)": 0.961708 }, { "epoch": 0.03836533151414742, "grad_norm": 4.221333026885986, "learning_rate": 7.66883116883117e-06, "loss": 0.1257541924715042, "memory(GiB)": 19.03, "step": 1181, "token_acc": 0.9562043795620438, "train_speed(iter/s)": 0.961848 }, { "epoch": 0.03839781697690284, "grad_norm": 1.2317134141921997, "learning_rate": 7.675324675324676e-06, "loss": 0.11920012533664703, "memory(GiB)": 19.03, "step": 1182, "token_acc": 0.9437751004016064, "train_speed(iter/s)": 0.961998 }, { "epoch": 0.038430302439658254, "grad_norm": 4.646579742431641, "learning_rate": 7.681818181818183e-06, "loss": 0.13111534714698792, "memory(GiB)": 19.03, "step": 1183, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.962157 }, { "epoch": 0.03846278790241367, "grad_norm": 3.1308460235595703, "learning_rate": 7.68831168831169e-06, "loss": 0.10595113784074783, "memory(GiB)": 19.03, "step": 1184, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.962299 }, { "epoch": 0.038495273365169086, "grad_norm": 1.363930106163025, "learning_rate": 7.694805194805194e-06, "loss": 0.10651031881570816, "memory(GiB)": 19.03, "step": 1185, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.962449 }, { "epoch": 0.0385277588279245, "grad_norm": 1.4698988199234009, "learning_rate": 7.701298701298702e-06, "loss": 0.12366712093353271, "memory(GiB)": 19.03, "step": 1186, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.962606 }, { "epoch": 0.03856024429067992, "grad_norm": 1.1452856063842773, "learning_rate": 7.707792207792209e-06, "loss": 0.10153789073228836, "memory(GiB)": 19.03, "step": 1187, "token_acc": 0.948, "train_speed(iter/s)": 0.962748 }, { "epoch": 0.03859272975343534, "grad_norm": 1.0761955976486206, "learning_rate": 7.714285714285716e-06, "loss": 0.11936156451702118, "memory(GiB)": 19.03, "step": 1188, "token_acc": 0.9340659340659341, "train_speed(iter/s)": 0.962878 }, { "epoch": 0.03862521521619076, "grad_norm": 1.3240736722946167, "learning_rate": 7.72077922077922e-06, "loss": 0.11742064356803894, "memory(GiB)": 19.03, "step": 1189, "token_acc": 0.9554655870445344, "train_speed(iter/s)": 0.962992 }, { "epoch": 0.03865770067894617, "grad_norm": 2.3224997520446777, "learning_rate": 7.727272727272727e-06, "loss": 0.12472076714038849, "memory(GiB)": 19.03, "step": 1190, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.963124 }, { "epoch": 0.03869018614170159, "grad_norm": 1.5174764394760132, "learning_rate": 7.733766233766234e-06, "loss": 0.1239856481552124, "memory(GiB)": 19.03, "step": 1191, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.963238 }, { "epoch": 0.038722671604457005, "grad_norm": 1.916271686553955, "learning_rate": 7.74025974025974e-06, "loss": 0.11472758650779724, "memory(GiB)": 19.03, "step": 1192, "token_acc": 0.9625468164794008, "train_speed(iter/s)": 0.963352 }, { "epoch": 0.03875515706721242, "grad_norm": 1.5963094234466553, "learning_rate": 7.746753246753247e-06, "loss": 0.11232201009988785, "memory(GiB)": 19.03, "step": 1193, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.963449 }, { "epoch": 0.03878764252996784, "grad_norm": 2.6315362453460693, "learning_rate": 7.753246753246754e-06, "loss": 0.11592760682106018, "memory(GiB)": 19.03, "step": 1194, "token_acc": 0.9567567567567568, "train_speed(iter/s)": 0.963564 }, { "epoch": 0.038820127992723254, "grad_norm": 1.593551516532898, "learning_rate": 7.75974025974026e-06, "loss": 0.1219787523150444, "memory(GiB)": 19.03, "step": 1195, "token_acc": 0.9455445544554455, "train_speed(iter/s)": 0.963538 }, { "epoch": 0.03885261345547868, "grad_norm": 1.890324592590332, "learning_rate": 7.766233766233767e-06, "loss": 0.11229418218135834, "memory(GiB)": 19.03, "step": 1196, "token_acc": 0.944, "train_speed(iter/s)": 0.963623 }, { "epoch": 0.03888509891823409, "grad_norm": 9.513687133789062, "learning_rate": 7.772727272727273e-06, "loss": 0.11904551833868027, "memory(GiB)": 19.03, "step": 1197, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.963745 }, { "epoch": 0.03891758438098951, "grad_norm": 4.346449851989746, "learning_rate": 7.77922077922078e-06, "loss": 0.12266823649406433, "memory(GiB)": 19.03, "step": 1198, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.963848 }, { "epoch": 0.038950069843744925, "grad_norm": 3.4060356616973877, "learning_rate": 7.785714285714287e-06, "loss": 0.11756904423236847, "memory(GiB)": 19.03, "step": 1199, "token_acc": 0.9606741573033708, "train_speed(iter/s)": 0.963958 }, { "epoch": 0.03898255530650034, "grad_norm": 2.8217742443084717, "learning_rate": 7.792207792207793e-06, "loss": 0.1165434941649437, "memory(GiB)": 19.03, "step": 1200, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.964073 }, { "epoch": 0.03901504076925576, "grad_norm": 5.034676551818848, "learning_rate": 7.798701298701298e-06, "loss": 0.12639997899532318, "memory(GiB)": 19.03, "step": 1201, "token_acc": 0.9383886255924171, "train_speed(iter/s)": 0.964188 }, { "epoch": 0.03904752623201117, "grad_norm": 4.600140571594238, "learning_rate": 7.805194805194806e-06, "loss": 0.12304335832595825, "memory(GiB)": 19.03, "step": 1202, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.964301 }, { "epoch": 0.03908001169476659, "grad_norm": 5.736802101135254, "learning_rate": 7.811688311688313e-06, "loss": 0.1347612738609314, "memory(GiB)": 19.03, "step": 1203, "token_acc": 0.9471544715447154, "train_speed(iter/s)": 0.964409 }, { "epoch": 0.03911249715752201, "grad_norm": 165.7540283203125, "learning_rate": 7.81818181818182e-06, "loss": 0.11103712022304535, "memory(GiB)": 19.03, "step": 1204, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.964491 }, { "epoch": 0.03914498262027743, "grad_norm": 1.9967870712280273, "learning_rate": 7.824675324675325e-06, "loss": 0.11194764822721481, "memory(GiB)": 19.03, "step": 1205, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.964641 }, { "epoch": 0.039177468083032845, "grad_norm": 4.786166191101074, "learning_rate": 7.831168831168831e-06, "loss": 0.11272575706243515, "memory(GiB)": 19.03, "step": 1206, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.964786 }, { "epoch": 0.03920995354578826, "grad_norm": 2.4326343536376953, "learning_rate": 7.837662337662338e-06, "loss": 0.12186399847269058, "memory(GiB)": 19.03, "step": 1207, "token_acc": 0.9340659340659341, "train_speed(iter/s)": 0.964934 }, { "epoch": 0.03924243900854368, "grad_norm": 2.5347607135772705, "learning_rate": 7.844155844155844e-06, "loss": 0.10145213454961777, "memory(GiB)": 19.03, "step": 1208, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.965081 }, { "epoch": 0.03927492447129909, "grad_norm": 3.9908416271209717, "learning_rate": 7.850649350649351e-06, "loss": 0.11901505291461945, "memory(GiB)": 19.03, "step": 1209, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.965231 }, { "epoch": 0.03930740993405451, "grad_norm": 1.8997429609298706, "learning_rate": 7.857142857142858e-06, "loss": 0.10132898390293121, "memory(GiB)": 19.03, "step": 1210, "token_acc": 0.956, "train_speed(iter/s)": 0.965376 }, { "epoch": 0.039339895396809925, "grad_norm": 3.345200300216675, "learning_rate": 7.863636363636364e-06, "loss": 0.11633709073066711, "memory(GiB)": 19.03, "step": 1211, "token_acc": 0.9540983606557377, "train_speed(iter/s)": 0.965498 }, { "epoch": 0.03937238085956535, "grad_norm": 2.6760733127593994, "learning_rate": 7.870129870129871e-06, "loss": 0.11455898731946945, "memory(GiB)": 19.03, "step": 1212, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.965621 }, { "epoch": 0.039404866322320764, "grad_norm": 3.496795415878296, "learning_rate": 7.876623376623377e-06, "loss": 0.11243975162506104, "memory(GiB)": 19.03, "step": 1213, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.965736 }, { "epoch": 0.03943735178507618, "grad_norm": 2.758495807647705, "learning_rate": 7.883116883116884e-06, "loss": 0.11328929662704468, "memory(GiB)": 19.03, "step": 1214, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.965854 }, { "epoch": 0.039469837247831596, "grad_norm": 3.3352715969085693, "learning_rate": 7.88961038961039e-06, "loss": 0.10662444680929184, "memory(GiB)": 19.03, "step": 1215, "token_acc": 0.9456066945606695, "train_speed(iter/s)": 0.965961 }, { "epoch": 0.03950232271058701, "grad_norm": 4.305285453796387, "learning_rate": 7.896103896103897e-06, "loss": 0.10559580475091934, "memory(GiB)": 19.03, "step": 1216, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.966079 }, { "epoch": 0.03953480817334243, "grad_norm": 3.8820598125457764, "learning_rate": 7.902597402597402e-06, "loss": 0.10469336807727814, "memory(GiB)": 19.03, "step": 1217, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.966203 }, { "epoch": 0.039567293636097844, "grad_norm": 2.046505928039551, "learning_rate": 7.909090909090909e-06, "loss": 0.0959661602973938, "memory(GiB)": 19.03, "step": 1218, "token_acc": 0.9647887323943662, "train_speed(iter/s)": 0.966311 }, { "epoch": 0.03959977909885326, "grad_norm": 1.7905980348587036, "learning_rate": 7.915584415584417e-06, "loss": 0.11431637406349182, "memory(GiB)": 19.03, "step": 1219, "token_acc": 0.9406392694063926, "train_speed(iter/s)": 0.966413 }, { "epoch": 0.039632264561608684, "grad_norm": 2.3669357299804688, "learning_rate": 7.922077922077924e-06, "loss": 0.11630900949239731, "memory(GiB)": 19.03, "step": 1220, "token_acc": 0.9452054794520548, "train_speed(iter/s)": 0.966517 }, { "epoch": 0.0396647500243641, "grad_norm": 1.4054880142211914, "learning_rate": 7.928571428571429e-06, "loss": 0.11644800007343292, "memory(GiB)": 19.03, "step": 1221, "token_acc": 0.9416666666666667, "train_speed(iter/s)": 0.966596 }, { "epoch": 0.039697235487119516, "grad_norm": 3.158482551574707, "learning_rate": 7.935064935064935e-06, "loss": 0.1131872832775116, "memory(GiB)": 19.03, "step": 1222, "token_acc": 0.9346733668341709, "train_speed(iter/s)": 0.966702 }, { "epoch": 0.03972972094987493, "grad_norm": 3.7763147354125977, "learning_rate": 7.941558441558442e-06, "loss": 0.12414997816085815, "memory(GiB)": 19.03, "step": 1223, "token_acc": 0.9453125, "train_speed(iter/s)": 0.966821 }, { "epoch": 0.03976220641263035, "grad_norm": 2.0466012954711914, "learning_rate": 7.948051948051948e-06, "loss": 0.10479725897312164, "memory(GiB)": 19.03, "step": 1224, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 0.966919 }, { "epoch": 0.039794691875385764, "grad_norm": 2.385138511657715, "learning_rate": 7.954545454545455e-06, "loss": 0.1247783899307251, "memory(GiB)": 19.03, "step": 1225, "token_acc": 0.9318181818181818, "train_speed(iter/s)": 0.967036 }, { "epoch": 0.03982717733814118, "grad_norm": 24.644607543945312, "learning_rate": 7.961038961038962e-06, "loss": 0.10605768859386444, "memory(GiB)": 19.03, "step": 1226, "token_acc": 0.9579439252336449, "train_speed(iter/s)": 0.967149 }, { "epoch": 0.039859662800896596, "grad_norm": 3.2364583015441895, "learning_rate": 7.967532467532468e-06, "loss": 0.11203309893608093, "memory(GiB)": 19.03, "step": 1227, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.967214 }, { "epoch": 0.03989214826365202, "grad_norm": 2.5531883239746094, "learning_rate": 7.974025974025975e-06, "loss": 0.11912326514720917, "memory(GiB)": 19.03, "step": 1228, "token_acc": 0.9597989949748744, "train_speed(iter/s)": 0.967322 }, { "epoch": 0.039924633726407435, "grad_norm": 1.5137133598327637, "learning_rate": 7.980519480519482e-06, "loss": 0.1009659394621849, "memory(GiB)": 19.03, "step": 1229, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.967417 }, { "epoch": 0.03995711918916285, "grad_norm": 4.164184093475342, "learning_rate": 7.987012987012988e-06, "loss": 0.13120704889297485, "memory(GiB)": 19.03, "step": 1230, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.967509 }, { "epoch": 0.03998960465191827, "grad_norm": 3.523951292037964, "learning_rate": 7.993506493506495e-06, "loss": 0.1122843474149704, "memory(GiB)": 19.03, "step": 1231, "token_acc": 0.9601769911504425, "train_speed(iter/s)": 0.967624 }, { "epoch": 0.040022090114673683, "grad_norm": 6.109689235687256, "learning_rate": 8.000000000000001e-06, "loss": 0.146931454539299, "memory(GiB)": 19.03, "step": 1232, "token_acc": 0.9330543933054394, "train_speed(iter/s)": 0.967731 }, { "epoch": 0.0400545755774291, "grad_norm": 1.8551937341690063, "learning_rate": 8.006493506493506e-06, "loss": 0.10614454746246338, "memory(GiB)": 19.03, "step": 1233, "token_acc": 0.9402390438247012, "train_speed(iter/s)": 0.967877 }, { "epoch": 0.040087061040184516, "grad_norm": 2.2432215213775635, "learning_rate": 8.012987012987013e-06, "loss": 0.09744884073734283, "memory(GiB)": 19.03, "step": 1234, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.968011 }, { "epoch": 0.04011954650293993, "grad_norm": 1.9422757625579834, "learning_rate": 8.019480519480521e-06, "loss": 0.09596653282642365, "memory(GiB)": 19.03, "step": 1235, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.968143 }, { "epoch": 0.040152031965695355, "grad_norm": 2.378645420074463, "learning_rate": 8.025974025974028e-06, "loss": 0.12148208916187286, "memory(GiB)": 19.03, "step": 1236, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.968265 }, { "epoch": 0.04018451742845077, "grad_norm": 2.4990453720092773, "learning_rate": 8.032467532467533e-06, "loss": 0.11578302830457687, "memory(GiB)": 19.03, "step": 1237, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.968405 }, { "epoch": 0.04021700289120619, "grad_norm": 1.5888954401016235, "learning_rate": 8.03896103896104e-06, "loss": 0.12184622883796692, "memory(GiB)": 19.03, "step": 1238, "token_acc": 0.9452054794520548, "train_speed(iter/s)": 0.968531 }, { "epoch": 0.0402494883539616, "grad_norm": 1.5413932800292969, "learning_rate": 8.045454545454546e-06, "loss": 0.11160596460103989, "memory(GiB)": 19.03, "step": 1239, "token_acc": 0.9647577092511013, "train_speed(iter/s)": 0.968667 }, { "epoch": 0.04028197381671702, "grad_norm": 3.4135401248931885, "learning_rate": 8.051948051948052e-06, "loss": 0.12300944328308105, "memory(GiB)": 19.03, "step": 1240, "token_acc": 0.9536423841059603, "train_speed(iter/s)": 0.968803 }, { "epoch": 0.040314459279472435, "grad_norm": 2.182868003845215, "learning_rate": 8.058441558441559e-06, "loss": 0.10923641920089722, "memory(GiB)": 19.03, "step": 1241, "token_acc": 0.96, "train_speed(iter/s)": 0.968936 }, { "epoch": 0.04034694474222785, "grad_norm": 2.935842514038086, "learning_rate": 8.064935064935066e-06, "loss": 0.12509292364120483, "memory(GiB)": 19.03, "step": 1242, "token_acc": 0.9498069498069498, "train_speed(iter/s)": 0.969079 }, { "epoch": 0.04037943020498327, "grad_norm": 1.5866011381149292, "learning_rate": 8.071428571428572e-06, "loss": 0.11981026828289032, "memory(GiB)": 19.03, "step": 1243, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.96921 }, { "epoch": 0.04041191566773869, "grad_norm": 1.8358228206634521, "learning_rate": 8.077922077922079e-06, "loss": 0.10640719532966614, "memory(GiB)": 19.03, "step": 1244, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.969306 }, { "epoch": 0.040444401130494106, "grad_norm": 1.1168339252471924, "learning_rate": 8.084415584415586e-06, "loss": 0.10978962481021881, "memory(GiB)": 19.03, "step": 1245, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.96941 }, { "epoch": 0.04047688659324952, "grad_norm": 1.7251309156417847, "learning_rate": 8.090909090909092e-06, "loss": 0.11238938570022583, "memory(GiB)": 19.03, "step": 1246, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.969523 }, { "epoch": 0.04050937205600494, "grad_norm": 2.7144558429718018, "learning_rate": 8.097402597402599e-06, "loss": 0.12172353267669678, "memory(GiB)": 19.03, "step": 1247, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.969253 }, { "epoch": 0.040541857518760355, "grad_norm": 3.9919769763946533, "learning_rate": 8.103896103896105e-06, "loss": 0.1113811731338501, "memory(GiB)": 19.03, "step": 1248, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.969359 }, { "epoch": 0.04057434298151577, "grad_norm": 2.3503596782684326, "learning_rate": 8.11038961038961e-06, "loss": 0.111538365483284, "memory(GiB)": 19.03, "step": 1249, "token_acc": 0.9365079365079365, "train_speed(iter/s)": 0.969452 }, { "epoch": 0.04060682844427119, "grad_norm": 2.0113987922668457, "learning_rate": 8.116883116883117e-06, "loss": 0.11706604808568954, "memory(GiB)": 19.03, "step": 1250, "token_acc": 0.9362549800796812, "train_speed(iter/s)": 0.969566 }, { "epoch": 0.0406393139070266, "grad_norm": 3.704298734664917, "learning_rate": 8.123376623376623e-06, "loss": 0.11685792356729507, "memory(GiB)": 19.03, "step": 1251, "token_acc": 0.9439655172413793, "train_speed(iter/s)": 0.969655 }, { "epoch": 0.040671799369782026, "grad_norm": 2.0856268405914307, "learning_rate": 8.129870129870132e-06, "loss": 0.11592592298984528, "memory(GiB)": 19.03, "step": 1252, "token_acc": 0.9348837209302325, "train_speed(iter/s)": 0.969751 }, { "epoch": 0.04070428483253744, "grad_norm": 2.0655417442321777, "learning_rate": 8.136363636363637e-06, "loss": 0.10861262679100037, "memory(GiB)": 19.03, "step": 1253, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.969864 }, { "epoch": 0.04073677029529286, "grad_norm": 1.6572829484939575, "learning_rate": 8.142857142857143e-06, "loss": 0.11735033243894577, "memory(GiB)": 19.03, "step": 1254, "token_acc": 0.9407407407407408, "train_speed(iter/s)": 0.969954 }, { "epoch": 0.040769255758048274, "grad_norm": 2.5503265857696533, "learning_rate": 8.14935064935065e-06, "loss": 0.10595997422933578, "memory(GiB)": 19.03, "step": 1255, "token_acc": 0.951048951048951, "train_speed(iter/s)": 0.970053 }, { "epoch": 0.04080174122080369, "grad_norm": 1.6111114025115967, "learning_rate": 8.155844155844157e-06, "loss": 0.11835421621799469, "memory(GiB)": 19.03, "step": 1256, "token_acc": 0.9360902255639098, "train_speed(iter/s)": 0.970145 }, { "epoch": 0.040834226683559106, "grad_norm": 1.7036842107772827, "learning_rate": 8.162337662337663e-06, "loss": 0.11199113726615906, "memory(GiB)": 19.03, "step": 1257, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.97025 }, { "epoch": 0.04086671214631452, "grad_norm": 1.4248842000961304, "learning_rate": 8.16883116883117e-06, "loss": 0.10783331096172333, "memory(GiB)": 19.03, "step": 1258, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 0.97035 }, { "epoch": 0.04089919760906994, "grad_norm": 1.953874111175537, "learning_rate": 8.175324675324676e-06, "loss": 0.11792222410440445, "memory(GiB)": 19.03, "step": 1259, "token_acc": 0.9450980392156862, "train_speed(iter/s)": 0.970442 }, { "epoch": 0.04093168307182536, "grad_norm": 2.66581654548645, "learning_rate": 8.181818181818183e-06, "loss": 0.12624813616275787, "memory(GiB)": 19.03, "step": 1260, "token_acc": 0.964, "train_speed(iter/s)": 0.970562 }, { "epoch": 0.04096416853458078, "grad_norm": 1.900818943977356, "learning_rate": 8.188311688311688e-06, "loss": 0.12444097548723221, "memory(GiB)": 19.03, "step": 1261, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.970706 }, { "epoch": 0.040996653997336194, "grad_norm": 1.117665410041809, "learning_rate": 8.194805194805196e-06, "loss": 0.11483652144670486, "memory(GiB)": 19.03, "step": 1262, "token_acc": 0.9357798165137615, "train_speed(iter/s)": 0.970841 }, { "epoch": 0.04102913946009161, "grad_norm": 2.8933920860290527, "learning_rate": 8.201298701298703e-06, "loss": 0.11484819650650024, "memory(GiB)": 19.03, "step": 1263, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 0.970965 }, { "epoch": 0.041061624922847026, "grad_norm": 1.7266772985458374, "learning_rate": 8.20779220779221e-06, "loss": 0.1139674037694931, "memory(GiB)": 19.03, "step": 1264, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 0.971094 }, { "epoch": 0.04109411038560244, "grad_norm": 1.7989543676376343, "learning_rate": 8.214285714285714e-06, "loss": 0.10960961133241653, "memory(GiB)": 19.03, "step": 1265, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.971235 }, { "epoch": 0.04112659584835786, "grad_norm": 1.1974174976348877, "learning_rate": 8.220779220779221e-06, "loss": 0.11715178936719894, "memory(GiB)": 19.03, "step": 1266, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.971365 }, { "epoch": 0.041159081311113274, "grad_norm": 2.959883451461792, "learning_rate": 8.227272727272728e-06, "loss": 0.11573798954486847, "memory(GiB)": 19.03, "step": 1267, "token_acc": 0.9315068493150684, "train_speed(iter/s)": 0.971512 }, { "epoch": 0.0411915667738687, "grad_norm": 1.8830432891845703, "learning_rate": 8.233766233766236e-06, "loss": 0.1129448413848877, "memory(GiB)": 19.03, "step": 1268, "token_acc": 0.949238578680203, "train_speed(iter/s)": 0.971647 }, { "epoch": 0.04122405223662411, "grad_norm": 2.312933921813965, "learning_rate": 8.24025974025974e-06, "loss": 0.11990976333618164, "memory(GiB)": 19.03, "step": 1269, "token_acc": 0.9518518518518518, "train_speed(iter/s)": 0.971785 }, { "epoch": 0.04125653769937953, "grad_norm": 3.91562819480896, "learning_rate": 8.246753246753247e-06, "loss": 0.11723528802394867, "memory(GiB)": 19.03, "step": 1270, "token_acc": 0.9538461538461539, "train_speed(iter/s)": 0.971933 }, { "epoch": 0.041289023162134945, "grad_norm": 1.6635807752609253, "learning_rate": 8.253246753246754e-06, "loss": 0.09820572286844254, "memory(GiB)": 19.03, "step": 1271, "token_acc": 0.9391304347826087, "train_speed(iter/s)": 0.972033 }, { "epoch": 0.04132150862489036, "grad_norm": 1.7665069103240967, "learning_rate": 8.25974025974026e-06, "loss": 0.13404494524002075, "memory(GiB)": 19.03, "step": 1272, "token_acc": 0.9420849420849421, "train_speed(iter/s)": 0.972134 }, { "epoch": 0.04135399408764578, "grad_norm": 1.6627482175827026, "learning_rate": 8.266233766233767e-06, "loss": 0.10716258734464645, "memory(GiB)": 19.03, "step": 1273, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.972245 }, { "epoch": 0.041386479550401194, "grad_norm": 2.5554513931274414, "learning_rate": 8.272727272727274e-06, "loss": 0.11314645409584045, "memory(GiB)": 19.03, "step": 1274, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.972356 }, { "epoch": 0.04141896501315661, "grad_norm": 1.3921358585357666, "learning_rate": 8.27922077922078e-06, "loss": 0.10078263282775879, "memory(GiB)": 19.03, "step": 1275, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.972453 }, { "epoch": 0.04145145047591203, "grad_norm": 1.3192622661590576, "learning_rate": 8.285714285714287e-06, "loss": 0.10768724977970123, "memory(GiB)": 19.03, "step": 1276, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.972564 }, { "epoch": 0.04148393593866745, "grad_norm": 4.7560272216796875, "learning_rate": 8.292207792207792e-06, "loss": 0.10236428678035736, "memory(GiB)": 19.03, "step": 1277, "token_acc": 0.954248366013072, "train_speed(iter/s)": 0.972667 }, { "epoch": 0.041516421401422865, "grad_norm": 9.234132766723633, "learning_rate": 8.2987012987013e-06, "loss": 0.1222279816865921, "memory(GiB)": 19.03, "step": 1278, "token_acc": 0.9541666666666667, "train_speed(iter/s)": 0.972775 }, { "epoch": 0.04154890686417828, "grad_norm": 1.541393518447876, "learning_rate": 8.305194805194807e-06, "loss": 0.11196985840797424, "memory(GiB)": 19.03, "step": 1279, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.972885 }, { "epoch": 0.0415813923269337, "grad_norm": 2.523130416870117, "learning_rate": 8.311688311688313e-06, "loss": 0.10236098617315292, "memory(GiB)": 19.03, "step": 1280, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.972993 }, { "epoch": 0.04161387778968911, "grad_norm": 2.9693946838378906, "learning_rate": 8.318181818181818e-06, "loss": 0.11924301087856293, "memory(GiB)": 19.03, "step": 1281, "token_acc": 0.9568345323741008, "train_speed(iter/s)": 0.973083 }, { "epoch": 0.04164636325244453, "grad_norm": 2.1369729042053223, "learning_rate": 8.324675324675325e-06, "loss": 0.11393178999423981, "memory(GiB)": 19.03, "step": 1282, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.973185 }, { "epoch": 0.041678848715199945, "grad_norm": 1.5309292078018188, "learning_rate": 8.331168831168832e-06, "loss": 0.10955236107110977, "memory(GiB)": 19.03, "step": 1283, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.973285 }, { "epoch": 0.04171133417795537, "grad_norm": 3.0558106899261475, "learning_rate": 8.337662337662338e-06, "loss": 0.10357911884784698, "memory(GiB)": 19.03, "step": 1284, "token_acc": 0.948936170212766, "train_speed(iter/s)": 0.973384 }, { "epoch": 0.041743819640710784, "grad_norm": 2.948127508163452, "learning_rate": 8.344155844155845e-06, "loss": 0.10900065302848816, "memory(GiB)": 19.03, "step": 1285, "token_acc": 0.9484978540772532, "train_speed(iter/s)": 0.973502 }, { "epoch": 0.0417763051034662, "grad_norm": 1.077353596687317, "learning_rate": 8.350649350649351e-06, "loss": 0.11346209794282913, "memory(GiB)": 19.03, "step": 1286, "token_acc": 0.9437751004016064, "train_speed(iter/s)": 0.973609 }, { "epoch": 0.04180879056622162, "grad_norm": 1.5992475748062134, "learning_rate": 8.357142857142858e-06, "loss": 0.10510231554508209, "memory(GiB)": 19.03, "step": 1287, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.973718 }, { "epoch": 0.04184127602897703, "grad_norm": 1.149908185005188, "learning_rate": 8.363636363636365e-06, "loss": 0.10383398085832596, "memory(GiB)": 19.03, "step": 1288, "token_acc": 0.952191235059761, "train_speed(iter/s)": 0.973817 }, { "epoch": 0.04187376149173245, "grad_norm": 1.3651057481765747, "learning_rate": 8.370129870129871e-06, "loss": 0.09957677125930786, "memory(GiB)": 19.03, "step": 1289, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.973888 }, { "epoch": 0.041906246954487865, "grad_norm": 2.345029592514038, "learning_rate": 8.376623376623378e-06, "loss": 0.1234864890575409, "memory(GiB)": 19.03, "step": 1290, "token_acc": 0.9433962264150944, "train_speed(iter/s)": 0.973999 }, { "epoch": 0.04193873241724328, "grad_norm": 0.9316086769104004, "learning_rate": 8.383116883116884e-06, "loss": 0.12018755823373795, "memory(GiB)": 19.03, "step": 1291, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.974109 }, { "epoch": 0.041971217879998704, "grad_norm": 0.8175917863845825, "learning_rate": 8.38961038961039e-06, "loss": 0.09209269285202026, "memory(GiB)": 19.03, "step": 1292, "token_acc": 0.965, "train_speed(iter/s)": 0.97421 }, { "epoch": 0.04200370334275412, "grad_norm": 1.3567559719085693, "learning_rate": 8.396103896103896e-06, "loss": 0.09946419298648834, "memory(GiB)": 19.03, "step": 1293, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.974342 }, { "epoch": 0.042036188805509536, "grad_norm": 1.717795968055725, "learning_rate": 8.402597402597403e-06, "loss": 0.094688281416893, "memory(GiB)": 19.03, "step": 1294, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.974474 }, { "epoch": 0.04206867426826495, "grad_norm": 1.8496462106704712, "learning_rate": 8.40909090909091e-06, "loss": 0.11609463393688202, "memory(GiB)": 19.03, "step": 1295, "token_acc": 0.9490909090909091, "train_speed(iter/s)": 0.974608 }, { "epoch": 0.04210115973102037, "grad_norm": 1.349489688873291, "learning_rate": 8.415584415584416e-06, "loss": 0.12274397164583206, "memory(GiB)": 19.03, "step": 1296, "token_acc": 0.9477911646586346, "train_speed(iter/s)": 0.974744 }, { "epoch": 0.042133645193775784, "grad_norm": 2.8393077850341797, "learning_rate": 8.422077922077922e-06, "loss": 0.10953880101442337, "memory(GiB)": 19.03, "step": 1297, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.974883 }, { "epoch": 0.0421661306565312, "grad_norm": 1.24148690700531, "learning_rate": 8.428571428571429e-06, "loss": 0.09846730530261993, "memory(GiB)": 19.03, "step": 1298, "token_acc": 0.9439655172413793, "train_speed(iter/s)": 0.975006 }, { "epoch": 0.04219861611928662, "grad_norm": 6.04234504699707, "learning_rate": 8.435064935064936e-06, "loss": 0.10215297341346741, "memory(GiB)": 19.03, "step": 1299, "token_acc": 0.9363636363636364, "train_speed(iter/s)": 0.975118 }, { "epoch": 0.04223110158204204, "grad_norm": 1.854438304901123, "learning_rate": 8.441558441558442e-06, "loss": 0.10649917274713516, "memory(GiB)": 19.03, "step": 1300, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.975203 }, { "epoch": 0.042263587044797456, "grad_norm": 0.9553690552711487, "learning_rate": 8.448051948051949e-06, "loss": 0.10821009427309036, "memory(GiB)": 19.03, "step": 1301, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.975294 }, { "epoch": 0.04229607250755287, "grad_norm": 1.0653141736984253, "learning_rate": 8.454545454545455e-06, "loss": 0.10735533386468887, "memory(GiB)": 19.03, "step": 1302, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.975384 }, { "epoch": 0.04232855797030829, "grad_norm": 1.6798816919326782, "learning_rate": 8.461038961038962e-06, "loss": 0.09916523098945618, "memory(GiB)": 19.03, "step": 1303, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.975481 }, { "epoch": 0.042361043433063704, "grad_norm": 1.231801986694336, "learning_rate": 8.467532467532467e-06, "loss": 0.11109618097543716, "memory(GiB)": 19.03, "step": 1304, "token_acc": 0.9490909090909091, "train_speed(iter/s)": 0.97557 }, { "epoch": 0.04239352889581912, "grad_norm": 1.193326711654663, "learning_rate": 8.474025974025975e-06, "loss": 0.09858749806880951, "memory(GiB)": 19.03, "step": 1305, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.975654 }, { "epoch": 0.042426014358574536, "grad_norm": 1.6137504577636719, "learning_rate": 8.480519480519482e-06, "loss": 0.10758975893259048, "memory(GiB)": 19.03, "step": 1306, "token_acc": 0.9463414634146341, "train_speed(iter/s)": 0.975736 }, { "epoch": 0.04245849982132995, "grad_norm": 1.252946376800537, "learning_rate": 8.487012987012988e-06, "loss": 0.12504738569259644, "memory(GiB)": 19.03, "step": 1307, "token_acc": 0.9383259911894273, "train_speed(iter/s)": 0.97584 }, { "epoch": 0.042490985284085375, "grad_norm": 1.1178768873214722, "learning_rate": 8.493506493506493e-06, "loss": 0.10356403887271881, "memory(GiB)": 19.03, "step": 1308, "token_acc": 0.9471830985915493, "train_speed(iter/s)": 0.975921 }, { "epoch": 0.04252347074684079, "grad_norm": 1.6716476678848267, "learning_rate": 8.5e-06, "loss": 0.12416244298219681, "memory(GiB)": 19.03, "step": 1309, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.975998 }, { "epoch": 0.04255595620959621, "grad_norm": 1.1634691953659058, "learning_rate": 8.506493506493507e-06, "loss": 0.10561488568782806, "memory(GiB)": 19.03, "step": 1310, "token_acc": 0.9695652173913043, "train_speed(iter/s)": 0.976092 }, { "epoch": 0.042588441672351623, "grad_norm": 1.483144998550415, "learning_rate": 8.512987012987015e-06, "loss": 0.11348151415586472, "memory(GiB)": 19.03, "step": 1311, "token_acc": 0.9468599033816425, "train_speed(iter/s)": 0.976142 }, { "epoch": 0.04262092713510704, "grad_norm": 1.5514239072799683, "learning_rate": 8.51948051948052e-06, "loss": 0.1293022632598877, "memory(GiB)": 19.03, "step": 1312, "token_acc": 0.968, "train_speed(iter/s)": 0.97623 }, { "epoch": 0.042653412597862456, "grad_norm": 1.2036546468734741, "learning_rate": 8.525974025974026e-06, "loss": 0.10153064131736755, "memory(GiB)": 19.03, "step": 1313, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.976322 }, { "epoch": 0.04268589806061787, "grad_norm": 0.9838938117027283, "learning_rate": 8.532467532467533e-06, "loss": 0.1091829389333725, "memory(GiB)": 19.03, "step": 1314, "token_acc": 0.9401709401709402, "train_speed(iter/s)": 0.976415 }, { "epoch": 0.04271838352337329, "grad_norm": 1.1710549592971802, "learning_rate": 8.53896103896104e-06, "loss": 0.10151071846485138, "memory(GiB)": 19.03, "step": 1315, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.976504 }, { "epoch": 0.04275086898612871, "grad_norm": 2.4475598335266113, "learning_rate": 8.545454545454546e-06, "loss": 0.12586745619773865, "memory(GiB)": 19.03, "step": 1316, "token_acc": 0.9179487179487179, "train_speed(iter/s)": 0.976635 }, { "epoch": 0.04278335444888413, "grad_norm": 0.7846313118934631, "learning_rate": 8.551948051948053e-06, "loss": 0.10780966281890869, "memory(GiB)": 19.03, "step": 1317, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.976759 }, { "epoch": 0.04281583991163954, "grad_norm": 1.116126298904419, "learning_rate": 8.55844155844156e-06, "loss": 0.11056773364543915, "memory(GiB)": 19.03, "step": 1318, "token_acc": 0.9424778761061947, "train_speed(iter/s)": 0.976895 }, { "epoch": 0.04284832537439496, "grad_norm": 0.9512715339660645, "learning_rate": 8.564935064935066e-06, "loss": 0.11353045701980591, "memory(GiB)": 19.03, "step": 1319, "token_acc": 0.9493087557603687, "train_speed(iter/s)": 0.977024 }, { "epoch": 0.042880810837150375, "grad_norm": 1.1153268814086914, "learning_rate": 8.571428571428571e-06, "loss": 0.12659329175949097, "memory(GiB)": 19.03, "step": 1320, "token_acc": 0.9537366548042705, "train_speed(iter/s)": 0.977145 }, { "epoch": 0.04291329629990579, "grad_norm": 1.1400253772735596, "learning_rate": 8.57792207792208e-06, "loss": 0.10606241971254349, "memory(GiB)": 19.03, "step": 1321, "token_acc": 0.9461538461538461, "train_speed(iter/s)": 0.97727 }, { "epoch": 0.04294578176266121, "grad_norm": 1.147007942199707, "learning_rate": 8.584415584415586e-06, "loss": 0.12298303842544556, "memory(GiB)": 19.03, "step": 1322, "token_acc": 0.97, "train_speed(iter/s)": 0.977403 }, { "epoch": 0.04297826722541662, "grad_norm": 0.7385469079017639, "learning_rate": 8.590909090909092e-06, "loss": 0.11772328615188599, "memory(GiB)": 19.03, "step": 1323, "token_acc": 0.9688888888888889, "train_speed(iter/s)": 0.977525 }, { "epoch": 0.043010752688172046, "grad_norm": 1.2105236053466797, "learning_rate": 8.597402597402597e-06, "loss": 0.10797017812728882, "memory(GiB)": 19.03, "step": 1324, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.977658 }, { "epoch": 0.04304323815092746, "grad_norm": 1.1858832836151123, "learning_rate": 8.603896103896104e-06, "loss": 0.1099347323179245, "memory(GiB)": 19.03, "step": 1325, "token_acc": 0.9417040358744395, "train_speed(iter/s)": 0.977793 }, { "epoch": 0.04307572361368288, "grad_norm": 0.8201935291290283, "learning_rate": 8.61038961038961e-06, "loss": 0.11384880542755127, "memory(GiB)": 19.03, "step": 1326, "token_acc": 0.9619565217391305, "train_speed(iter/s)": 0.977915 }, { "epoch": 0.043108209076438295, "grad_norm": 1.3314182758331299, "learning_rate": 8.616883116883117e-06, "loss": 0.116790272295475, "memory(GiB)": 19.03, "step": 1327, "token_acc": 0.9617224880382775, "train_speed(iter/s)": 0.978051 }, { "epoch": 0.04314069453919371, "grad_norm": 0.8666874170303345, "learning_rate": 8.623376623376624e-06, "loss": 0.10023462772369385, "memory(GiB)": 19.03, "step": 1328, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.978183 }, { "epoch": 0.04317318000194913, "grad_norm": 1.1209629774093628, "learning_rate": 8.62987012987013e-06, "loss": 0.11005251109600067, "memory(GiB)": 19.03, "step": 1329, "token_acc": 0.9607142857142857, "train_speed(iter/s)": 0.978309 }, { "epoch": 0.04320566546470454, "grad_norm": 2.4731781482696533, "learning_rate": 8.636363636363637e-06, "loss": 0.11979666352272034, "memory(GiB)": 19.03, "step": 1330, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.978449 }, { "epoch": 0.04323815092745996, "grad_norm": 2.0804646015167236, "learning_rate": 8.642857142857144e-06, "loss": 0.12533390522003174, "memory(GiB)": 19.03, "step": 1331, "token_acc": 0.9679144385026738, "train_speed(iter/s)": 0.978582 }, { "epoch": 0.04327063639021538, "grad_norm": 1.5173059701919556, "learning_rate": 8.64935064935065e-06, "loss": 0.12492454051971436, "memory(GiB)": 19.03, "step": 1332, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.978684 }, { "epoch": 0.0433031218529708, "grad_norm": 1.6014165878295898, "learning_rate": 8.655844155844157e-06, "loss": 0.12321127206087112, "memory(GiB)": 19.03, "step": 1333, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.978789 }, { "epoch": 0.043335607315726214, "grad_norm": 1.3098716735839844, "learning_rate": 8.662337662337663e-06, "loss": 0.10357752442359924, "memory(GiB)": 19.03, "step": 1334, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.978886 }, { "epoch": 0.04336809277848163, "grad_norm": 2.2294530868530273, "learning_rate": 8.66883116883117e-06, "loss": 0.12721270322799683, "memory(GiB)": 19.03, "step": 1335, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.978979 }, { "epoch": 0.043400578241237046, "grad_norm": 2.4011242389678955, "learning_rate": 8.675324675324675e-06, "loss": 0.12949484586715698, "memory(GiB)": 19.03, "step": 1336, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.979083 }, { "epoch": 0.04343306370399246, "grad_norm": 1.6439119577407837, "learning_rate": 8.681818181818182e-06, "loss": 0.11522349715232849, "memory(GiB)": 19.03, "step": 1337, "token_acc": 0.9596774193548387, "train_speed(iter/s)": 0.979178 }, { "epoch": 0.04346554916674788, "grad_norm": 1.0350139141082764, "learning_rate": 8.68831168831169e-06, "loss": 0.1138787716627121, "memory(GiB)": 19.03, "step": 1338, "token_acc": 0.9320754716981132, "train_speed(iter/s)": 0.979264 }, { "epoch": 0.043498034629503295, "grad_norm": 1.894225001335144, "learning_rate": 8.694805194805196e-06, "loss": 0.12632161378860474, "memory(GiB)": 19.03, "step": 1339, "token_acc": 0.9227272727272727, "train_speed(iter/s)": 0.979365 }, { "epoch": 0.04353052009225872, "grad_norm": 3.039815902709961, "learning_rate": 8.701298701298701e-06, "loss": 0.1431938111782074, "memory(GiB)": 19.03, "step": 1340, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.979458 }, { "epoch": 0.043563005555014134, "grad_norm": 2.1056582927703857, "learning_rate": 8.707792207792208e-06, "loss": 0.140745609998703, "memory(GiB)": 19.03, "step": 1341, "token_acc": 0.9377431906614786, "train_speed(iter/s)": 0.979557 }, { "epoch": 0.04359549101776955, "grad_norm": 1.3882129192352295, "learning_rate": 8.714285714285715e-06, "loss": 0.14018666744232178, "memory(GiB)": 19.03, "step": 1342, "token_acc": 0.944, "train_speed(iter/s)": 0.979659 }, { "epoch": 0.043627976480524966, "grad_norm": 2.5292978286743164, "learning_rate": 8.720779220779221e-06, "loss": 0.11687929928302765, "memory(GiB)": 19.03, "step": 1343, "token_acc": 0.9505703422053232, "train_speed(iter/s)": 0.979755 }, { "epoch": 0.04366046194328038, "grad_norm": 2.97808837890625, "learning_rate": 8.727272727272728e-06, "loss": 0.1099323034286499, "memory(GiB)": 19.03, "step": 1344, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.979856 }, { "epoch": 0.0436929474060358, "grad_norm": 1.3136235475540161, "learning_rate": 8.733766233766234e-06, "loss": 0.11225824058055878, "memory(GiB)": 19.03, "step": 1345, "token_acc": 0.9358490566037736, "train_speed(iter/s)": 0.97996 }, { "epoch": 0.043725432868791214, "grad_norm": 1.5582873821258545, "learning_rate": 8.740259740259741e-06, "loss": 0.1144762635231018, "memory(GiB)": 19.03, "step": 1346, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.980054 }, { "epoch": 0.04375791833154663, "grad_norm": 1.1338555812835693, "learning_rate": 8.746753246753248e-06, "loss": 0.10634718835353851, "memory(GiB)": 19.03, "step": 1347, "token_acc": 0.967479674796748, "train_speed(iter/s)": 0.980164 }, { "epoch": 0.04379040379430205, "grad_norm": 1.413562297821045, "learning_rate": 8.753246753246754e-06, "loss": 0.12009594589471817, "memory(GiB)": 19.03, "step": 1348, "token_acc": 0.9330357142857143, "train_speed(iter/s)": 0.980271 }, { "epoch": 0.04382288925705747, "grad_norm": 1.3841803073883057, "learning_rate": 8.75974025974026e-06, "loss": 0.11900272965431213, "memory(GiB)": 19.03, "step": 1349, "token_acc": 0.9448529411764706, "train_speed(iter/s)": 0.98036 }, { "epoch": 0.043855374719812885, "grad_norm": 1.2910805940628052, "learning_rate": 8.766233766233767e-06, "loss": 0.10963280498981476, "memory(GiB)": 19.03, "step": 1350, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.980436 }, { "epoch": 0.0438878601825683, "grad_norm": 0.9924920797348022, "learning_rate": 8.772727272727274e-06, "loss": 0.11679671704769135, "memory(GiB)": 19.03, "step": 1351, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.980526 }, { "epoch": 0.04392034564532372, "grad_norm": 1.7302151918411255, "learning_rate": 8.779220779220779e-06, "loss": 0.1317903995513916, "memory(GiB)": 19.03, "step": 1352, "token_acc": 0.9660377358490566, "train_speed(iter/s)": 0.980628 }, { "epoch": 0.043952831108079134, "grad_norm": 1.4979315996170044, "learning_rate": 8.785714285714286e-06, "loss": 0.12066927552223206, "memory(GiB)": 19.03, "step": 1353, "token_acc": 0.944, "train_speed(iter/s)": 0.980734 }, { "epoch": 0.04398531657083455, "grad_norm": 1.8292227983474731, "learning_rate": 8.792207792207794e-06, "loss": 0.1257379949092865, "memory(GiB)": 19.03, "step": 1354, "token_acc": 0.953307392996109, "train_speed(iter/s)": 0.980854 }, { "epoch": 0.044017802033589966, "grad_norm": 1.3985943794250488, "learning_rate": 8.7987012987013e-06, "loss": 0.1265990138053894, "memory(GiB)": 19.03, "step": 1355, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.980941 }, { "epoch": 0.04405028749634539, "grad_norm": 1.2744550704956055, "learning_rate": 8.805194805194805e-06, "loss": 0.10583695769309998, "memory(GiB)": 19.03, "step": 1356, "token_acc": 0.96484375, "train_speed(iter/s)": 0.981011 }, { "epoch": 0.044082772959100805, "grad_norm": 0.9744300246238708, "learning_rate": 8.811688311688312e-06, "loss": 0.10737933218479156, "memory(GiB)": 19.03, "step": 1357, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.981085 }, { "epoch": 0.04411525842185622, "grad_norm": 1.2626680135726929, "learning_rate": 8.818181818181819e-06, "loss": 0.12028273940086365, "memory(GiB)": 19.03, "step": 1358, "token_acc": 0.9414414414414415, "train_speed(iter/s)": 0.981174 }, { "epoch": 0.04414774388461164, "grad_norm": 0.8662140369415283, "learning_rate": 8.824675324675325e-06, "loss": 0.1119794100522995, "memory(GiB)": 19.03, "step": 1359, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.981273 }, { "epoch": 0.04418022934736705, "grad_norm": 1.571144461631775, "learning_rate": 8.831168831168832e-06, "loss": 0.13021859526634216, "memory(GiB)": 19.03, "step": 1360, "token_acc": 0.9488372093023256, "train_speed(iter/s)": 0.981352 }, { "epoch": 0.04421271481012247, "grad_norm": 0.9484578967094421, "learning_rate": 8.837662337662338e-06, "loss": 0.11130921542644501, "memory(GiB)": 19.03, "step": 1361, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.981442 }, { "epoch": 0.044245200272877885, "grad_norm": 1.8202202320098877, "learning_rate": 8.844155844155845e-06, "loss": 0.11834342032670975, "memory(GiB)": 19.03, "step": 1362, "token_acc": 0.9446494464944649, "train_speed(iter/s)": 0.981495 }, { "epoch": 0.0442776857356333, "grad_norm": 1.0125398635864258, "learning_rate": 8.850649350649352e-06, "loss": 0.10731440782546997, "memory(GiB)": 19.03, "step": 1363, "token_acc": 0.9300411522633745, "train_speed(iter/s)": 0.981556 }, { "epoch": 0.044310171198388724, "grad_norm": 0.9004778265953064, "learning_rate": 8.857142857142858e-06, "loss": 0.10212552547454834, "memory(GiB)": 19.03, "step": 1364, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.981645 }, { "epoch": 0.04434265666114414, "grad_norm": 1.1976300477981567, "learning_rate": 8.863636363636365e-06, "loss": 0.11335404962301254, "memory(GiB)": 19.03, "step": 1365, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.981728 }, { "epoch": 0.04437514212389956, "grad_norm": 2.056924343109131, "learning_rate": 8.870129870129871e-06, "loss": 0.11746753752231598, "memory(GiB)": 19.03, "step": 1366, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 0.981801 }, { "epoch": 0.04440762758665497, "grad_norm": 1.1526894569396973, "learning_rate": 8.876623376623378e-06, "loss": 0.10752218961715698, "memory(GiB)": 19.03, "step": 1367, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.981899 }, { "epoch": 0.04444011304941039, "grad_norm": 1.3110933303833008, "learning_rate": 8.883116883116883e-06, "loss": 0.10988103598356247, "memory(GiB)": 19.03, "step": 1368, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.981972 }, { "epoch": 0.044472598512165805, "grad_norm": 1.6492010354995728, "learning_rate": 8.88961038961039e-06, "loss": 0.12709853053092957, "memory(GiB)": 19.03, "step": 1369, "token_acc": 0.9396984924623115, "train_speed(iter/s)": 0.98207 }, { "epoch": 0.04450508397492122, "grad_norm": 1.4798755645751953, "learning_rate": 8.896103896103896e-06, "loss": 0.11066851019859314, "memory(GiB)": 19.03, "step": 1370, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.982159 }, { "epoch": 0.04453756943767664, "grad_norm": 1.8974977731704712, "learning_rate": 8.902597402597405e-06, "loss": 0.11431315541267395, "memory(GiB)": 19.03, "step": 1371, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.982234 }, { "epoch": 0.04457005490043206, "grad_norm": 1.9118762016296387, "learning_rate": 8.90909090909091e-06, "loss": 0.10542889684438705, "memory(GiB)": 19.03, "step": 1372, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.982348 }, { "epoch": 0.044602540363187476, "grad_norm": 1.0928947925567627, "learning_rate": 8.915584415584416e-06, "loss": 0.11281608045101166, "memory(GiB)": 19.03, "step": 1373, "token_acc": 0.9311740890688259, "train_speed(iter/s)": 0.982474 }, { "epoch": 0.04463502582594289, "grad_norm": 1.8283672332763672, "learning_rate": 8.922077922077923e-06, "loss": 0.12949615716934204, "memory(GiB)": 19.03, "step": 1374, "token_acc": 0.92, "train_speed(iter/s)": 0.982587 }, { "epoch": 0.04466751128869831, "grad_norm": 0.9021936655044556, "learning_rate": 8.92857142857143e-06, "loss": 0.09735478460788727, "memory(GiB)": 19.03, "step": 1375, "token_acc": 0.9698795180722891, "train_speed(iter/s)": 0.98271 }, { "epoch": 0.044699996751453724, "grad_norm": 1.4267410039901733, "learning_rate": 8.935064935064936e-06, "loss": 0.1035381481051445, "memory(GiB)": 19.03, "step": 1376, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.982843 }, { "epoch": 0.04473248221420914, "grad_norm": 3.6797239780426025, "learning_rate": 8.941558441558442e-06, "loss": 0.10961420834064484, "memory(GiB)": 19.03, "step": 1377, "token_acc": 0.9587628865979382, "train_speed(iter/s)": 0.982965 }, { "epoch": 0.04476496767696456, "grad_norm": 1.2852243185043335, "learning_rate": 8.948051948051949e-06, "loss": 0.09847654402256012, "memory(GiB)": 19.03, "step": 1378, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.983058 }, { "epoch": 0.04479745313971997, "grad_norm": 1.2434824705123901, "learning_rate": 8.954545454545456e-06, "loss": 0.09857206046581268, "memory(GiB)": 19.03, "step": 1379, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.983181 }, { "epoch": 0.044829938602475396, "grad_norm": 1.026296854019165, "learning_rate": 8.96103896103896e-06, "loss": 0.09667332470417023, "memory(GiB)": 19.03, "step": 1380, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 0.98327 }, { "epoch": 0.04486242406523081, "grad_norm": 2.791459798812866, "learning_rate": 8.967532467532469e-06, "loss": 0.1105557382106781, "memory(GiB)": 19.03, "step": 1381, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.983384 }, { "epoch": 0.04489490952798623, "grad_norm": 1.4593781232833862, "learning_rate": 8.974025974025975e-06, "loss": 0.12000773847103119, "memory(GiB)": 19.03, "step": 1382, "token_acc": 0.9392265193370166, "train_speed(iter/s)": 0.983509 }, { "epoch": 0.044927394990741644, "grad_norm": 2.038844347000122, "learning_rate": 8.980519480519482e-06, "loss": 0.12184679508209229, "memory(GiB)": 19.03, "step": 1383, "token_acc": 0.9320388349514563, "train_speed(iter/s)": 0.983626 }, { "epoch": 0.04495988045349706, "grad_norm": 1.32699716091156, "learning_rate": 8.987012987012987e-06, "loss": 0.11788757890462875, "memory(GiB)": 19.03, "step": 1384, "token_acc": 0.9267241379310345, "train_speed(iter/s)": 0.983754 }, { "epoch": 0.044992365916252476, "grad_norm": 1.069881558418274, "learning_rate": 8.993506493506494e-06, "loss": 0.09696875512599945, "memory(GiB)": 19.03, "step": 1385, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.983872 }, { "epoch": 0.04502485137900789, "grad_norm": 1.0103881359100342, "learning_rate": 9e-06, "loss": 0.10420476645231247, "memory(GiB)": 19.03, "step": 1386, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.983986 }, { "epoch": 0.04505733684176331, "grad_norm": 11.05120849609375, "learning_rate": 9.006493506493509e-06, "loss": 0.09866024553775787, "memory(GiB)": 19.03, "step": 1387, "token_acc": 0.963265306122449, "train_speed(iter/s)": 0.984112 }, { "epoch": 0.04508982230451873, "grad_norm": 1.1486858129501343, "learning_rate": 9.012987012987013e-06, "loss": 0.10543981194496155, "memory(GiB)": 19.03, "step": 1388, "token_acc": 0.9575471698113207, "train_speed(iter/s)": 0.984231 }, { "epoch": 0.04512230776727415, "grad_norm": 0.9947361946105957, "learning_rate": 9.01948051948052e-06, "loss": 0.09524935483932495, "memory(GiB)": 19.03, "step": 1389, "token_acc": 0.9598393574297188, "train_speed(iter/s)": 0.984341 }, { "epoch": 0.04515479323002956, "grad_norm": 1.0705136060714722, "learning_rate": 9.025974025974027e-06, "loss": 0.09389421343803406, "memory(GiB)": 19.03, "step": 1390, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.984468 }, { "epoch": 0.04518727869278498, "grad_norm": 1.9885765314102173, "learning_rate": 9.032467532467533e-06, "loss": 0.11640691012144089, "memory(GiB)": 19.03, "step": 1391, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.98458 }, { "epoch": 0.045219764155540396, "grad_norm": 3.0108673572540283, "learning_rate": 9.03896103896104e-06, "loss": 0.1114882379770279, "memory(GiB)": 19.03, "step": 1392, "token_acc": 0.951417004048583, "train_speed(iter/s)": 0.984692 }, { "epoch": 0.04525224961829581, "grad_norm": 1.7702515125274658, "learning_rate": 9.045454545454546e-06, "loss": 0.1029270589351654, "memory(GiB)": 19.03, "step": 1393, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.984764 }, { "epoch": 0.04528473508105123, "grad_norm": 2.368370771408081, "learning_rate": 9.051948051948053e-06, "loss": 0.13617578148841858, "memory(GiB)": 19.03, "step": 1394, "token_acc": 0.9316239316239316, "train_speed(iter/s)": 0.984861 }, { "epoch": 0.045317220543806644, "grad_norm": 1.9706239700317383, "learning_rate": 9.05844155844156e-06, "loss": 0.11732475459575653, "memory(GiB)": 19.03, "step": 1395, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.984951 }, { "epoch": 0.04534970600656207, "grad_norm": 1.093971848487854, "learning_rate": 9.064935064935065e-06, "loss": 0.10473925620317459, "memory(GiB)": 19.03, "step": 1396, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.985043 }, { "epoch": 0.04538219146931748, "grad_norm": 1.3214137554168701, "learning_rate": 9.071428571428573e-06, "loss": 0.10845336318016052, "memory(GiB)": 19.03, "step": 1397, "token_acc": 0.9805194805194806, "train_speed(iter/s)": 0.985131 }, { "epoch": 0.0454146769320729, "grad_norm": 1.270859956741333, "learning_rate": 9.07792207792208e-06, "loss": 0.09803827106952667, "memory(GiB)": 19.03, "step": 1398, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.98522 }, { "epoch": 0.045447162394828315, "grad_norm": 5.317333698272705, "learning_rate": 9.084415584415586e-06, "loss": 0.10898555815219879, "memory(GiB)": 19.03, "step": 1399, "token_acc": 0.9362745098039216, "train_speed(iter/s)": 0.985316 }, { "epoch": 0.04547964785758373, "grad_norm": 1.0878435373306274, "learning_rate": 9.090909090909091e-06, "loss": 0.11166281998157501, "memory(GiB)": 19.03, "step": 1400, "token_acc": 0.9551020408163265, "train_speed(iter/s)": 0.985411 }, { "epoch": 0.04551213332033915, "grad_norm": 4.237914562225342, "learning_rate": 9.097402597402598e-06, "loss": 0.1108710914850235, "memory(GiB)": 19.03, "step": 1401, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.9855 }, { "epoch": 0.04554461878309456, "grad_norm": 1.6284236907958984, "learning_rate": 9.103896103896104e-06, "loss": 0.12265600264072418, "memory(GiB)": 19.03, "step": 1402, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.985584 }, { "epoch": 0.04557710424584998, "grad_norm": 1.0945976972579956, "learning_rate": 9.110389610389611e-06, "loss": 0.10717125236988068, "memory(GiB)": 19.03, "step": 1403, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.985658 }, { "epoch": 0.0456095897086054, "grad_norm": 1.1479206085205078, "learning_rate": 9.116883116883117e-06, "loss": 0.11293444782495499, "memory(GiB)": 19.03, "step": 1404, "token_acc": 0.9609929078014184, "train_speed(iter/s)": 0.985743 }, { "epoch": 0.04564207517136082, "grad_norm": 1.4157943725585938, "learning_rate": 9.123376623376624e-06, "loss": 0.10952533781528473, "memory(GiB)": 19.03, "step": 1405, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.985835 }, { "epoch": 0.045674560634116235, "grad_norm": 0.8755702376365662, "learning_rate": 9.12987012987013e-06, "loss": 0.10404519736766815, "memory(GiB)": 19.03, "step": 1406, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.985926 }, { "epoch": 0.04570704609687165, "grad_norm": 1.2995957136154175, "learning_rate": 9.136363636363637e-06, "loss": 0.10724540799856186, "memory(GiB)": 19.03, "step": 1407, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.986024 }, { "epoch": 0.04573953155962707, "grad_norm": 1.1924833059310913, "learning_rate": 9.142857142857144e-06, "loss": 0.11105187237262726, "memory(GiB)": 19.03, "step": 1408, "token_acc": 0.9589552238805971, "train_speed(iter/s)": 0.986117 }, { "epoch": 0.04577201702238248, "grad_norm": 0.8772698640823364, "learning_rate": 9.14935064935065e-06, "loss": 0.10751917213201523, "memory(GiB)": 19.03, "step": 1409, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.986206 }, { "epoch": 0.0458045024851379, "grad_norm": 1.7487367391586304, "learning_rate": 9.155844155844157e-06, "loss": 0.12073390185832977, "memory(GiB)": 19.03, "step": 1410, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.986288 }, { "epoch": 0.045836987947893315, "grad_norm": 5.855916500091553, "learning_rate": 9.162337662337664e-06, "loss": 0.14757737517356873, "memory(GiB)": 19.03, "step": 1411, "token_acc": 0.9543147208121827, "train_speed(iter/s)": 0.986366 }, { "epoch": 0.04586947341064874, "grad_norm": 1.4983398914337158, "learning_rate": 9.168831168831169e-06, "loss": 0.1213953047990799, "memory(GiB)": 19.03, "step": 1412, "token_acc": 0.9477351916376306, "train_speed(iter/s)": 0.986434 }, { "epoch": 0.045901958873404154, "grad_norm": 2.0444018840789795, "learning_rate": 9.175324675324675e-06, "loss": 0.12581777572631836, "memory(GiB)": 19.03, "step": 1413, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 0.986512 }, { "epoch": 0.04593444433615957, "grad_norm": 7.236894130706787, "learning_rate": 9.181818181818184e-06, "loss": 0.11619959771633148, "memory(GiB)": 19.03, "step": 1414, "token_acc": 0.9606986899563319, "train_speed(iter/s)": 0.986575 }, { "epoch": 0.045966929798914986, "grad_norm": 4.0063581466674805, "learning_rate": 9.188311688311688e-06, "loss": 0.12692537903785706, "memory(GiB)": 19.03, "step": 1415, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.986661 }, { "epoch": 0.0459994152616704, "grad_norm": 1.6313987970352173, "learning_rate": 9.194805194805195e-06, "loss": 0.10449466109275818, "memory(GiB)": 19.03, "step": 1416, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.986742 }, { "epoch": 0.04603190072442582, "grad_norm": 1.2065153121948242, "learning_rate": 9.201298701298702e-06, "loss": 0.11446133255958557, "memory(GiB)": 19.03, "step": 1417, "token_acc": 0.9504950495049505, "train_speed(iter/s)": 0.986817 }, { "epoch": 0.046064386187181235, "grad_norm": 2.035290479660034, "learning_rate": 9.207792207792208e-06, "loss": 0.13013657927513123, "memory(GiB)": 19.03, "step": 1418, "token_acc": 0.9404255319148936, "train_speed(iter/s)": 0.986906 }, { "epoch": 0.04609687164993665, "grad_norm": 7.933816432952881, "learning_rate": 9.214285714285715e-06, "loss": 0.10720137506723404, "memory(GiB)": 19.03, "step": 1419, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.986971 }, { "epoch": 0.046129357112692074, "grad_norm": 7.031473159790039, "learning_rate": 9.220779220779221e-06, "loss": 0.10749493539333344, "memory(GiB)": 19.03, "step": 1420, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.987038 }, { "epoch": 0.04616184257544749, "grad_norm": 2.572568655014038, "learning_rate": 9.227272727272728e-06, "loss": 0.12373325228691101, "memory(GiB)": 19.03, "step": 1421, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.987124 }, { "epoch": 0.046194328038202906, "grad_norm": 3.885472297668457, "learning_rate": 9.233766233766235e-06, "loss": 0.12796427309513092, "memory(GiB)": 19.03, "step": 1422, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.987187 }, { "epoch": 0.04622681350095832, "grad_norm": 5.493196487426758, "learning_rate": 9.240259740259741e-06, "loss": 0.11078932881355286, "memory(GiB)": 19.03, "step": 1423, "token_acc": 0.9547325102880658, "train_speed(iter/s)": 0.987253 }, { "epoch": 0.04625929896371374, "grad_norm": 2.7909884452819824, "learning_rate": 9.246753246753248e-06, "loss": 0.12149854004383087, "memory(GiB)": 19.03, "step": 1424, "token_acc": 0.9455445544554455, "train_speed(iter/s)": 0.987331 }, { "epoch": 0.046291784426469154, "grad_norm": 1.2378573417663574, "learning_rate": 9.253246753246755e-06, "loss": 0.11551205813884735, "memory(GiB)": 19.03, "step": 1425, "token_acc": 0.9494949494949495, "train_speed(iter/s)": 0.9874 }, { "epoch": 0.04632426988922457, "grad_norm": 1.8574275970458984, "learning_rate": 9.259740259740261e-06, "loss": 0.13090412318706512, "memory(GiB)": 19.03, "step": 1426, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.987482 }, { "epoch": 0.046356755351979986, "grad_norm": 0.7904241681098938, "learning_rate": 9.266233766233766e-06, "loss": 0.11535279452800751, "memory(GiB)": 19.03, "step": 1427, "token_acc": 0.9506172839506173, "train_speed(iter/s)": 0.98756 }, { "epoch": 0.04638924081473541, "grad_norm": 1.036287784576416, "learning_rate": 9.272727272727273e-06, "loss": 0.10871884226799011, "memory(GiB)": 19.03, "step": 1428, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.987661 }, { "epoch": 0.046421726277490825, "grad_norm": 4.399527549743652, "learning_rate": 9.27922077922078e-06, "loss": 0.10639418661594391, "memory(GiB)": 19.03, "step": 1429, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.987775 }, { "epoch": 0.04645421174024624, "grad_norm": 1.2388978004455566, "learning_rate": 9.285714285714288e-06, "loss": 0.11463436484336853, "memory(GiB)": 19.03, "step": 1430, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.987872 }, { "epoch": 0.04648669720300166, "grad_norm": 0.997127890586853, "learning_rate": 9.292207792207792e-06, "loss": 0.10851512849330902, "memory(GiB)": 19.03, "step": 1431, "token_acc": 0.9291338582677166, "train_speed(iter/s)": 0.987981 }, { "epoch": 0.046519182665757074, "grad_norm": 12.834019660949707, "learning_rate": 9.298701298701299e-06, "loss": 0.11267761886119843, "memory(GiB)": 19.03, "step": 1432, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.988089 }, { "epoch": 0.04655166812851249, "grad_norm": 2.2304906845092773, "learning_rate": 9.305194805194806e-06, "loss": 0.10198165476322174, "memory(GiB)": 19.03, "step": 1433, "token_acc": 0.9664429530201343, "train_speed(iter/s)": 0.988194 }, { "epoch": 0.046584153591267906, "grad_norm": 1.0598734617233276, "learning_rate": 9.311688311688312e-06, "loss": 0.08896764367818832, "memory(GiB)": 19.03, "step": 1434, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.988297 }, { "epoch": 0.04661663905402332, "grad_norm": 1.3768290281295776, "learning_rate": 9.318181818181819e-06, "loss": 0.10669561475515366, "memory(GiB)": 19.03, "step": 1435, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.988413 }, { "epoch": 0.046649124516778745, "grad_norm": 1.2061909437179565, "learning_rate": 9.324675324675326e-06, "loss": 0.10881989449262619, "memory(GiB)": 19.03, "step": 1436, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.988515 }, { "epoch": 0.04668160997953416, "grad_norm": 1.4948101043701172, "learning_rate": 9.331168831168832e-06, "loss": 0.10656660795211792, "memory(GiB)": 19.03, "step": 1437, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.988622 }, { "epoch": 0.04671409544228958, "grad_norm": 1.7125327587127686, "learning_rate": 9.337662337662339e-06, "loss": 0.11414820700883865, "memory(GiB)": 19.03, "step": 1438, "token_acc": 0.96, "train_speed(iter/s)": 0.988721 }, { "epoch": 0.04674658090504499, "grad_norm": 1.0432711839675903, "learning_rate": 9.344155844155844e-06, "loss": 0.10251229256391525, "memory(GiB)": 19.03, "step": 1439, "token_acc": 0.9279661016949152, "train_speed(iter/s)": 0.988836 }, { "epoch": 0.04677906636780041, "grad_norm": 1.0501450300216675, "learning_rate": 9.350649350649352e-06, "loss": 0.10033433139324188, "memory(GiB)": 19.03, "step": 1440, "token_acc": 0.9288702928870293, "train_speed(iter/s)": 0.988943 }, { "epoch": 0.046811551830555825, "grad_norm": 1.1303547620773315, "learning_rate": 9.357142857142859e-06, "loss": 0.10811059176921844, "memory(GiB)": 19.03, "step": 1441, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.989054 }, { "epoch": 0.04684403729331124, "grad_norm": 1.8345422744750977, "learning_rate": 9.363636363636365e-06, "loss": 0.09923268854618073, "memory(GiB)": 19.03, "step": 1442, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.989148 }, { "epoch": 0.04687652275606666, "grad_norm": 1.06432044506073, "learning_rate": 9.37012987012987e-06, "loss": 0.09964623302221298, "memory(GiB)": 19.03, "step": 1443, "token_acc": 0.9357798165137615, "train_speed(iter/s)": 0.989246 }, { "epoch": 0.04690900821882208, "grad_norm": 1.453734040260315, "learning_rate": 9.376623376623377e-06, "loss": 0.11882385611534119, "memory(GiB)": 19.03, "step": 1444, "token_acc": 0.94, "train_speed(iter/s)": 0.989351 }, { "epoch": 0.0469414936815775, "grad_norm": 1.2516465187072754, "learning_rate": 9.383116883116883e-06, "loss": 0.09818877279758453, "memory(GiB)": 19.03, "step": 1445, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 0.989466 }, { "epoch": 0.04697397914433291, "grad_norm": 1.0961440801620483, "learning_rate": 9.38961038961039e-06, "loss": 0.1045180931687355, "memory(GiB)": 19.03, "step": 1446, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.989559 }, { "epoch": 0.04700646460708833, "grad_norm": 1.3244224786758423, "learning_rate": 9.396103896103896e-06, "loss": 0.09757628291845322, "memory(GiB)": 19.03, "step": 1447, "token_acc": 0.9288389513108615, "train_speed(iter/s)": 0.989663 }, { "epoch": 0.047038950069843745, "grad_norm": 2.117828607559204, "learning_rate": 9.402597402597403e-06, "loss": 0.1282835304737091, "memory(GiB)": 19.03, "step": 1448, "token_acc": 0.9485294117647058, "train_speed(iter/s)": 0.989752 }, { "epoch": 0.04707143553259916, "grad_norm": 1.2584621906280518, "learning_rate": 9.40909090909091e-06, "loss": 0.09710508584976196, "memory(GiB)": 19.03, "step": 1449, "token_acc": 0.955, "train_speed(iter/s)": 0.989863 }, { "epoch": 0.04710392099535458, "grad_norm": 1.1403436660766602, "learning_rate": 9.415584415584416e-06, "loss": 0.12057596445083618, "memory(GiB)": 19.03, "step": 1450, "token_acc": 0.9490740740740741, "train_speed(iter/s)": 0.989971 }, { "epoch": 0.04713640645810999, "grad_norm": 2.963613271713257, "learning_rate": 9.422077922077923e-06, "loss": 0.12223989516496658, "memory(GiB)": 19.03, "step": 1451, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.990079 }, { "epoch": 0.047168891920865416, "grad_norm": 2.418426275253296, "learning_rate": 9.42857142857143e-06, "loss": 0.10297059267759323, "memory(GiB)": 19.03, "step": 1452, "token_acc": 0.948, "train_speed(iter/s)": 0.990173 }, { "epoch": 0.04720137738362083, "grad_norm": 1.412007451057434, "learning_rate": 9.435064935064936e-06, "loss": 0.12253418564796448, "memory(GiB)": 19.03, "step": 1453, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.990274 }, { "epoch": 0.04723386284637625, "grad_norm": 2.1267175674438477, "learning_rate": 9.441558441558443e-06, "loss": 0.12713158130645752, "memory(GiB)": 19.03, "step": 1454, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.990345 }, { "epoch": 0.047266348309131664, "grad_norm": 1.5462197065353394, "learning_rate": 9.448051948051948e-06, "loss": 0.10347270965576172, "memory(GiB)": 19.03, "step": 1455, "token_acc": 0.9496124031007752, "train_speed(iter/s)": 0.990423 }, { "epoch": 0.04729883377188708, "grad_norm": 2.7102439403533936, "learning_rate": 9.454545454545456e-06, "loss": 0.12329673767089844, "memory(GiB)": 19.03, "step": 1456, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.990513 }, { "epoch": 0.0473313192346425, "grad_norm": 3.067762851715088, "learning_rate": 9.461038961038963e-06, "loss": 0.12147507071495056, "memory(GiB)": 19.03, "step": 1457, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.990596 }, { "epoch": 0.04736380469739791, "grad_norm": 2.374037504196167, "learning_rate": 9.46753246753247e-06, "loss": 0.1272575855255127, "memory(GiB)": 19.03, "step": 1458, "token_acc": 0.9572192513368984, "train_speed(iter/s)": 0.990682 }, { "epoch": 0.04739629016015333, "grad_norm": 1.3515053987503052, "learning_rate": 9.474025974025974e-06, "loss": 0.11994802206754684, "memory(GiB)": 19.03, "step": 1459, "token_acc": 0.9578947368421052, "train_speed(iter/s)": 0.990767 }, { "epoch": 0.04742877562290875, "grad_norm": 3.51845121383667, "learning_rate": 9.48051948051948e-06, "loss": 0.11474592238664627, "memory(GiB)": 19.03, "step": 1460, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.99085 }, { "epoch": 0.04746126108566417, "grad_norm": 1.0114248991012573, "learning_rate": 9.487012987012987e-06, "loss": 0.11122593283653259, "memory(GiB)": 19.03, "step": 1461, "token_acc": 0.9511111111111111, "train_speed(iter/s)": 0.990937 }, { "epoch": 0.047493746548419584, "grad_norm": 1.2042781114578247, "learning_rate": 9.493506493506494e-06, "loss": 0.11542006582021713, "memory(GiB)": 19.03, "step": 1462, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.991034 }, { "epoch": 0.047526232011175, "grad_norm": 1.0772912502288818, "learning_rate": 9.5e-06, "loss": 0.11469662189483643, "memory(GiB)": 19.03, "step": 1463, "token_acc": 0.9485981308411215, "train_speed(iter/s)": 0.991089 }, { "epoch": 0.047558717473930416, "grad_norm": 1.0837880373001099, "learning_rate": 9.506493506493507e-06, "loss": 0.11017635464668274, "memory(GiB)": 19.03, "step": 1464, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.991178 }, { "epoch": 0.04759120293668583, "grad_norm": 1.1896319389343262, "learning_rate": 9.512987012987014e-06, "loss": 0.12381631880998611, "memory(GiB)": 19.03, "step": 1465, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.991256 }, { "epoch": 0.04762368839944125, "grad_norm": 1.433010458946228, "learning_rate": 9.51948051948052e-06, "loss": 0.11182810366153717, "memory(GiB)": 19.03, "step": 1466, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.99133 }, { "epoch": 0.047656173862196664, "grad_norm": 1.0958012342453003, "learning_rate": 9.525974025974027e-06, "loss": 0.10294555127620697, "memory(GiB)": 19.03, "step": 1467, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.991409 }, { "epoch": 0.04768865932495209, "grad_norm": 0.9658533334732056, "learning_rate": 9.532467532467534e-06, "loss": 0.10016170144081116, "memory(GiB)": 19.03, "step": 1468, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 0.991471 }, { "epoch": 0.0477211447877075, "grad_norm": 0.9741178750991821, "learning_rate": 9.53896103896104e-06, "loss": 0.10295234620571136, "memory(GiB)": 19.03, "step": 1469, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.991544 }, { "epoch": 0.04775363025046292, "grad_norm": 3.447805643081665, "learning_rate": 9.545454545454547e-06, "loss": 0.12354600429534912, "memory(GiB)": 19.03, "step": 1470, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.991598 }, { "epoch": 0.047786115713218336, "grad_norm": 0.8503185510635376, "learning_rate": 9.551948051948052e-06, "loss": 0.10091185569763184, "memory(GiB)": 19.03, "step": 1471, "token_acc": 0.9291338582677166, "train_speed(iter/s)": 0.991639 }, { "epoch": 0.04781860117597375, "grad_norm": 0.916059136390686, "learning_rate": 9.558441558441558e-06, "loss": 0.10134127736091614, "memory(GiB)": 19.03, "step": 1472, "token_acc": 0.9571984435797666, "train_speed(iter/s)": 0.991699 }, { "epoch": 0.04785108663872917, "grad_norm": 1.0192631483078003, "learning_rate": 9.564935064935067e-06, "loss": 0.10534781217575073, "memory(GiB)": 19.03, "step": 1473, "token_acc": 0.95703125, "train_speed(iter/s)": 0.991767 }, { "epoch": 0.047883572101484584, "grad_norm": 1.1503000259399414, "learning_rate": 9.571428571428573e-06, "loss": 0.09425067901611328, "memory(GiB)": 19.03, "step": 1474, "token_acc": 0.9553571428571429, "train_speed(iter/s)": 0.991827 }, { "epoch": 0.04791605756424, "grad_norm": 1.5490649938583374, "learning_rate": 9.577922077922078e-06, "loss": 0.11805763095617294, "memory(GiB)": 19.03, "step": 1475, "token_acc": 0.9525547445255474, "train_speed(iter/s)": 0.991908 }, { "epoch": 0.04794854302699542, "grad_norm": 1.0289013385772705, "learning_rate": 9.584415584415585e-06, "loss": 0.0989815890789032, "memory(GiB)": 19.03, "step": 1476, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.991976 }, { "epoch": 0.04798102848975084, "grad_norm": 1.8989572525024414, "learning_rate": 9.590909090909091e-06, "loss": 0.11400450766086578, "memory(GiB)": 19.03, "step": 1477, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 0.99205 }, { "epoch": 0.048013513952506255, "grad_norm": 1.2626261711120605, "learning_rate": 9.597402597402598e-06, "loss": 0.12917087972164154, "memory(GiB)": 19.03, "step": 1478, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.992101 }, { "epoch": 0.04804599941526167, "grad_norm": 1.3207578659057617, "learning_rate": 9.603896103896105e-06, "loss": 0.12656836211681366, "memory(GiB)": 19.03, "step": 1479, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.992159 }, { "epoch": 0.04807848487801709, "grad_norm": 1.1995981931686401, "learning_rate": 9.610389610389611e-06, "loss": 0.11565694212913513, "memory(GiB)": 19.03, "step": 1480, "token_acc": 0.9364406779661016, "train_speed(iter/s)": 0.992228 }, { "epoch": 0.0481109703407725, "grad_norm": 0.9488938450813293, "learning_rate": 9.616883116883118e-06, "loss": 0.08927588164806366, "memory(GiB)": 19.03, "step": 1481, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.992309 }, { "epoch": 0.04814345580352792, "grad_norm": 1.6410167217254639, "learning_rate": 9.623376623376624e-06, "loss": 0.10651372373104095, "memory(GiB)": 19.03, "step": 1482, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.992364 }, { "epoch": 0.048175941266283336, "grad_norm": 0.9829304218292236, "learning_rate": 9.629870129870131e-06, "loss": 0.09273610264062881, "memory(GiB)": 19.03, "step": 1483, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.99245 }, { "epoch": 0.04820842672903876, "grad_norm": 0.9567086100578308, "learning_rate": 9.636363636363638e-06, "loss": 0.1059592217206955, "memory(GiB)": 19.03, "step": 1484, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.992559 }, { "epoch": 0.048240912191794175, "grad_norm": 0.976150631904602, "learning_rate": 9.642857142857144e-06, "loss": 0.10698091983795166, "memory(GiB)": 19.03, "step": 1485, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.992668 }, { "epoch": 0.04827339765454959, "grad_norm": 2.2044873237609863, "learning_rate": 9.64935064935065e-06, "loss": 0.11968735605478287, "memory(GiB)": 19.03, "step": 1486, "token_acc": 0.9621993127147767, "train_speed(iter/s)": 0.99278 }, { "epoch": 0.04830588311730501, "grad_norm": 1.4487920999526978, "learning_rate": 9.655844155844156e-06, "loss": 0.11514684557914734, "memory(GiB)": 19.03, "step": 1487, "token_acc": 0.9366666666666666, "train_speed(iter/s)": 0.992871 }, { "epoch": 0.04833836858006042, "grad_norm": 1.194684624671936, "learning_rate": 9.662337662337662e-06, "loss": 0.11214330792427063, "memory(GiB)": 19.03, "step": 1488, "token_acc": 0.9404761904761905, "train_speed(iter/s)": 0.992972 }, { "epoch": 0.04837085404281584, "grad_norm": 1.125386357307434, "learning_rate": 9.66883116883117e-06, "loss": 0.10432802140712738, "memory(GiB)": 19.03, "step": 1489, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.993074 }, { "epoch": 0.048403339505571255, "grad_norm": 1.045548915863037, "learning_rate": 9.675324675324677e-06, "loss": 0.11543193459510803, "memory(GiB)": 19.03, "step": 1490, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.993187 }, { "epoch": 0.04843582496832667, "grad_norm": 0.801432192325592, "learning_rate": 9.681818181818182e-06, "loss": 0.11088314652442932, "memory(GiB)": 19.03, "step": 1491, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.993287 }, { "epoch": 0.048468310431082094, "grad_norm": 1.014367938041687, "learning_rate": 9.688311688311689e-06, "loss": 0.10905793309211731, "memory(GiB)": 19.03, "step": 1492, "token_acc": 0.955, "train_speed(iter/s)": 0.993397 }, { "epoch": 0.04850079589383751, "grad_norm": 0.8891326189041138, "learning_rate": 9.694805194805195e-06, "loss": 0.11297484487295151, "memory(GiB)": 19.03, "step": 1493, "token_acc": 0.9316239316239316, "train_speed(iter/s)": 0.9935 }, { "epoch": 0.048533281356592926, "grad_norm": 1.5366554260253906, "learning_rate": 9.701298701298702e-06, "loss": 0.11618849635124207, "memory(GiB)": 19.03, "step": 1494, "token_acc": 0.919831223628692, "train_speed(iter/s)": 0.993583 }, { "epoch": 0.04856576681934834, "grad_norm": 1.08635675907135, "learning_rate": 9.707792207792209e-06, "loss": 0.1062355488538742, "memory(GiB)": 19.03, "step": 1495, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.993682 }, { "epoch": 0.04859825228210376, "grad_norm": 0.842925488948822, "learning_rate": 9.714285714285715e-06, "loss": 0.09929883480072021, "memory(GiB)": 19.03, "step": 1496, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.993776 }, { "epoch": 0.048630737744859175, "grad_norm": 0.6822811365127563, "learning_rate": 9.720779220779222e-06, "loss": 0.098576121032238, "memory(GiB)": 19.03, "step": 1497, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.993863 }, { "epoch": 0.04866322320761459, "grad_norm": 0.7601944208145142, "learning_rate": 9.727272727272728e-06, "loss": 0.10051198303699493, "memory(GiB)": 19.03, "step": 1498, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.993977 }, { "epoch": 0.04869570867037001, "grad_norm": 0.9975671172142029, "learning_rate": 9.733766233766235e-06, "loss": 0.10918864607810974, "memory(GiB)": 19.03, "step": 1499, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.994065 }, { "epoch": 0.04872819413312543, "grad_norm": 0.8531287312507629, "learning_rate": 9.740259740259742e-06, "loss": 0.10407248139381409, "memory(GiB)": 19.03, "step": 1500, "token_acc": 0.9385245901639344, "train_speed(iter/s)": 0.994167 }, { "epoch": 0.04872819413312543, "eval_loss": 0.10632568597793579, "eval_runtime": 80.3964, "eval_samples_per_second": 123.762, "eval_steps_per_second": 3.868, "eval_token_acc": 0.9557658348735321, "step": 1500 }, { "epoch": 0.048760679595880846, "grad_norm": 1.3401167392730713, "learning_rate": 9.746753246753248e-06, "loss": 0.12197702378034592, "memory(GiB)": 19.03, "step": 1501, "token_acc": 0.9570663855537149, "train_speed(iter/s)": 0.933758 }, { "epoch": 0.04879316505863626, "grad_norm": 1.9641474485397339, "learning_rate": 9.753246753246755e-06, "loss": 0.10654841363430023, "memory(GiB)": 19.03, "step": 1502, "token_acc": 0.935064935064935, "train_speed(iter/s)": 0.933882 }, { "epoch": 0.04882565052139168, "grad_norm": 3.2080514430999756, "learning_rate": 9.75974025974026e-06, "loss": 0.11657829582691193, "memory(GiB)": 19.03, "step": 1503, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.934023 }, { "epoch": 0.048858135984147094, "grad_norm": 1.3877323865890503, "learning_rate": 9.766233766233766e-06, "loss": 0.1037537083029747, "memory(GiB)": 19.03, "step": 1504, "token_acc": 0.97, "train_speed(iter/s)": 0.934149 }, { "epoch": 0.04889062144690251, "grad_norm": 0.8155764937400818, "learning_rate": 9.772727272727273e-06, "loss": 0.10103246569633484, "memory(GiB)": 19.03, "step": 1505, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.934282 }, { "epoch": 0.048923106909657926, "grad_norm": 1.212766170501709, "learning_rate": 9.779220779220781e-06, "loss": 0.11525934189558029, "memory(GiB)": 19.03, "step": 1506, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.934424 }, { "epoch": 0.04895559237241334, "grad_norm": 0.8016806840896606, "learning_rate": 9.785714285714286e-06, "loss": 0.0957048162817955, "memory(GiB)": 19.03, "step": 1507, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.934561 }, { "epoch": 0.048988077835168765, "grad_norm": 1.3264834880828857, "learning_rate": 9.792207792207793e-06, "loss": 0.09954054653644562, "memory(GiB)": 19.03, "step": 1508, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.934681 }, { "epoch": 0.04902056329792418, "grad_norm": 0.8747346997261047, "learning_rate": 9.7987012987013e-06, "loss": 0.10551904141902924, "memory(GiB)": 19.03, "step": 1509, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.934811 }, { "epoch": 0.0490530487606796, "grad_norm": 1.193203330039978, "learning_rate": 9.805194805194806e-06, "loss": 0.11177567392587662, "memory(GiB)": 19.03, "step": 1510, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.934942 }, { "epoch": 0.049085534223435014, "grad_norm": 1.1303300857543945, "learning_rate": 9.811688311688313e-06, "loss": 0.11171972006559372, "memory(GiB)": 19.03, "step": 1511, "token_acc": 0.9416342412451362, "train_speed(iter/s)": 0.935062 }, { "epoch": 0.04911801968619043, "grad_norm": 1.2445000410079956, "learning_rate": 9.81818181818182e-06, "loss": 0.1138649433851242, "memory(GiB)": 19.03, "step": 1512, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.935204 }, { "epoch": 0.049150505148945846, "grad_norm": 1.2070386409759521, "learning_rate": 9.824675324675326e-06, "loss": 0.10311259329319, "memory(GiB)": 19.03, "step": 1513, "token_acc": 0.945, "train_speed(iter/s)": 0.935324 }, { "epoch": 0.04918299061170126, "grad_norm": 2.697524070739746, "learning_rate": 9.831168831168832e-06, "loss": 0.10284800082445145, "memory(GiB)": 19.03, "step": 1514, "token_acc": 0.9520295202952029, "train_speed(iter/s)": 0.935453 }, { "epoch": 0.04921547607445668, "grad_norm": 0.9511576890945435, "learning_rate": 9.837662337662337e-06, "loss": 0.11283411830663681, "memory(GiB)": 19.03, "step": 1515, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.935452 }, { "epoch": 0.0492479615372121, "grad_norm": 2.893599510192871, "learning_rate": 9.844155844155846e-06, "loss": 0.10846654325723648, "memory(GiB)": 19.03, "step": 1516, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.935582 }, { "epoch": 0.04928044699996752, "grad_norm": 1.260319471359253, "learning_rate": 9.850649350649352e-06, "loss": 0.10571077466011047, "memory(GiB)": 19.03, "step": 1517, "token_acc": 0.9388646288209607, "train_speed(iter/s)": 0.935724 }, { "epoch": 0.04931293246272293, "grad_norm": 1.3773561716079712, "learning_rate": 9.857142857142859e-06, "loss": 0.11543036997318268, "memory(GiB)": 19.03, "step": 1518, "token_acc": 0.947565543071161, "train_speed(iter/s)": 0.935863 }, { "epoch": 0.04934541792547835, "grad_norm": 1.124707818031311, "learning_rate": 9.863636363636364e-06, "loss": 0.10422754287719727, "memory(GiB)": 19.03, "step": 1519, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.935992 }, { "epoch": 0.049377903388233765, "grad_norm": 1.1908290386199951, "learning_rate": 9.87012987012987e-06, "loss": 0.10975071787834167, "memory(GiB)": 19.03, "step": 1520, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.936122 }, { "epoch": 0.04941038885098918, "grad_norm": 0.952904224395752, "learning_rate": 9.876623376623377e-06, "loss": 0.10704676061868668, "memory(GiB)": 19.03, "step": 1521, "token_acc": 0.96484375, "train_speed(iter/s)": 0.936262 }, { "epoch": 0.0494428743137446, "grad_norm": 1.1883610486984253, "learning_rate": 9.883116883116885e-06, "loss": 0.10913357138633728, "memory(GiB)": 19.03, "step": 1522, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.936384 }, { "epoch": 0.049475359776500014, "grad_norm": 1.3604272603988647, "learning_rate": 9.88961038961039e-06, "loss": 0.10925380885601044, "memory(GiB)": 19.03, "step": 1523, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.936519 }, { "epoch": 0.04950784523925544, "grad_norm": 1.1302156448364258, "learning_rate": 9.896103896103897e-06, "loss": 0.10610757023096085, "memory(GiB)": 19.03, "step": 1524, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.936654 }, { "epoch": 0.04954033070201085, "grad_norm": 1.3866865634918213, "learning_rate": 9.902597402597403e-06, "loss": 0.1209351196885109, "memory(GiB)": 19.03, "step": 1525, "token_acc": 0.944, "train_speed(iter/s)": 0.936786 }, { "epoch": 0.04957281616476627, "grad_norm": 1.1001622676849365, "learning_rate": 9.90909090909091e-06, "loss": 0.1077088713645935, "memory(GiB)": 19.03, "step": 1526, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.936914 }, { "epoch": 0.049605301627521685, "grad_norm": 1.3784912824630737, "learning_rate": 9.915584415584417e-06, "loss": 0.09614747762680054, "memory(GiB)": 19.03, "step": 1527, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.937048 }, { "epoch": 0.0496377870902771, "grad_norm": 1.4563877582550049, "learning_rate": 9.922077922077923e-06, "loss": 0.11030574887990952, "memory(GiB)": 19.03, "step": 1528, "token_acc": 0.946078431372549, "train_speed(iter/s)": 0.937165 }, { "epoch": 0.04967027255303252, "grad_norm": 1.2068274021148682, "learning_rate": 9.92857142857143e-06, "loss": 0.10886827111244202, "memory(GiB)": 19.03, "step": 1529, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.937306 }, { "epoch": 0.04970275801578793, "grad_norm": 2.300017833709717, "learning_rate": 9.935064935064936e-06, "loss": 0.09379267692565918, "memory(GiB)": 19.03, "step": 1530, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.937443 }, { "epoch": 0.04973524347854335, "grad_norm": 1.4724829196929932, "learning_rate": 9.941558441558441e-06, "loss": 0.11474903672933578, "memory(GiB)": 19.03, "step": 1531, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.937555 }, { "epoch": 0.04976772894129877, "grad_norm": 8.117696762084961, "learning_rate": 9.94805194805195e-06, "loss": 0.11353565007448196, "memory(GiB)": 19.03, "step": 1532, "token_acc": 0.9496124031007752, "train_speed(iter/s)": 0.937662 }, { "epoch": 0.04980021440405419, "grad_norm": 1.1344211101531982, "learning_rate": 9.954545454545456e-06, "loss": 0.10232442617416382, "memory(GiB)": 19.03, "step": 1533, "token_acc": 0.9451476793248945, "train_speed(iter/s)": 0.937761 }, { "epoch": 0.049832699866809604, "grad_norm": 4.167481899261475, "learning_rate": 9.961038961038963e-06, "loss": 0.11378651857376099, "memory(GiB)": 19.03, "step": 1534, "token_acc": 0.9632352941176471, "train_speed(iter/s)": 0.937843 }, { "epoch": 0.04986518532956502, "grad_norm": 1.216051459312439, "learning_rate": 9.967532467532468e-06, "loss": 0.10666920989751816, "memory(GiB)": 19.03, "step": 1535, "token_acc": 0.9488188976377953, "train_speed(iter/s)": 0.937943 }, { "epoch": 0.04989767079232044, "grad_norm": 1.5433471202850342, "learning_rate": 9.974025974025974e-06, "loss": 0.1208677813410759, "memory(GiB)": 19.03, "step": 1536, "token_acc": 0.9469387755102041, "train_speed(iter/s)": 0.93804 }, { "epoch": 0.04993015625507585, "grad_norm": 6.4674248695373535, "learning_rate": 9.980519480519481e-06, "loss": 0.10891436040401459, "memory(GiB)": 19.03, "step": 1537, "token_acc": 0.96875, "train_speed(iter/s)": 0.938128 }, { "epoch": 0.04996264171783127, "grad_norm": 1.0099353790283203, "learning_rate": 9.987012987012988e-06, "loss": 0.09305854886770248, "memory(GiB)": 19.03, "step": 1538, "token_acc": 0.9632352941176471, "train_speed(iter/s)": 0.938225 }, { "epoch": 0.049995127180586685, "grad_norm": 1.417592167854309, "learning_rate": 9.993506493506494e-06, "loss": 0.10261162370443344, "memory(GiB)": 19.03, "step": 1539, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.938313 }, { "epoch": 0.05002761264334211, "grad_norm": 1.5035855770111084, "learning_rate": 1e-05, "loss": 0.10628693550825119, "memory(GiB)": 19.03, "step": 1540, "token_acc": 0.9625, "train_speed(iter/s)": 0.938405 }, { "epoch": 0.050060098106097524, "grad_norm": 0.9924011826515198, "learning_rate": 9.999999971146674e-06, "loss": 0.1101502925157547, "memory(GiB)": 19.03, "step": 1541, "token_acc": 0.9434782608695652, "train_speed(iter/s)": 0.938497 }, { "epoch": 0.05009258356885294, "grad_norm": 1.9740161895751953, "learning_rate": 9.999999884586691e-06, "loss": 0.10104024410247803, "memory(GiB)": 19.03, "step": 1542, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.938574 }, { "epoch": 0.050125069031608356, "grad_norm": 0.7871876358985901, "learning_rate": 9.999999740320055e-06, "loss": 0.10582801699638367, "memory(GiB)": 19.03, "step": 1543, "token_acc": 0.9395017793594306, "train_speed(iter/s)": 0.93866 }, { "epoch": 0.05015755449436377, "grad_norm": 1.1703202724456787, "learning_rate": 9.999999538346767e-06, "loss": 0.10097499936819077, "memory(GiB)": 19.03, "step": 1544, "token_acc": 0.9477611940298507, "train_speed(iter/s)": 0.938749 }, { "epoch": 0.05019003995711919, "grad_norm": 1.034203290939331, "learning_rate": 9.99999927866683e-06, "loss": 0.10501287132501602, "memory(GiB)": 19.03, "step": 1545, "token_acc": 0.9485981308411215, "train_speed(iter/s)": 0.938834 }, { "epoch": 0.050222525419874604, "grad_norm": 0.8568242192268372, "learning_rate": 9.999998961280247e-06, "loss": 0.10521844774484634, "memory(GiB)": 19.03, "step": 1546, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.938922 }, { "epoch": 0.05025501088263002, "grad_norm": 0.9615365266799927, "learning_rate": 9.999998586187019e-06, "loss": 0.10226628184318542, "memory(GiB)": 19.03, "step": 1547, "token_acc": 0.944954128440367, "train_speed(iter/s)": 0.939004 }, { "epoch": 0.05028749634538544, "grad_norm": 1.0402320623397827, "learning_rate": 9.999998153387154e-06, "loss": 0.10652769356966019, "memory(GiB)": 19.03, "step": 1548, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.939085 }, { "epoch": 0.05031998180814086, "grad_norm": 1.1403419971466064, "learning_rate": 9.999997662880654e-06, "loss": 0.1267072856426239, "memory(GiB)": 19.03, "step": 1549, "token_acc": 0.9438202247191011, "train_speed(iter/s)": 0.939175 }, { "epoch": 0.050352467270896276, "grad_norm": 1.7249699831008911, "learning_rate": 9.999997114667525e-06, "loss": 0.11242849379777908, "memory(GiB)": 19.03, "step": 1550, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.939253 }, { "epoch": 0.05038495273365169, "grad_norm": 1.2559843063354492, "learning_rate": 9.999996508747778e-06, "loss": 0.10824473202228546, "memory(GiB)": 19.03, "step": 1551, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.939224 }, { "epoch": 0.05041743819640711, "grad_norm": 1.1064810752868652, "learning_rate": 9.999995845121413e-06, "loss": 0.12039398401975632, "memory(GiB)": 19.03, "step": 1552, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.939337 }, { "epoch": 0.050449923659162524, "grad_norm": 1.1751759052276611, "learning_rate": 9.999995123788441e-06, "loss": 0.1032143384218216, "memory(GiB)": 19.03, "step": 1553, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.939447 }, { "epoch": 0.05048240912191794, "grad_norm": 1.2866352796554565, "learning_rate": 9.999994344748873e-06, "loss": 0.12209140509366989, "memory(GiB)": 19.03, "step": 1554, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 0.939553 }, { "epoch": 0.050514894584673356, "grad_norm": 2.2493040561676025, "learning_rate": 9.999993508002714e-06, "loss": 0.10683131217956543, "memory(GiB)": 19.03, "step": 1555, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.93968 }, { "epoch": 0.05054738004742878, "grad_norm": 1.5605465173721313, "learning_rate": 9.999992613549974e-06, "loss": 0.10578586161136627, "memory(GiB)": 19.03, "step": 1556, "token_acc": 0.937007874015748, "train_speed(iter/s)": 0.939799 }, { "epoch": 0.050579865510184195, "grad_norm": 1.2739540338516235, "learning_rate": 9.999991661390665e-06, "loss": 0.09360984712839127, "memory(GiB)": 19.03, "step": 1557, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 0.939935 }, { "epoch": 0.05061235097293961, "grad_norm": 1.2710176706314087, "learning_rate": 9.999990651524799e-06, "loss": 0.112013079226017, "memory(GiB)": 19.03, "step": 1558, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.940068 }, { "epoch": 0.05064483643569503, "grad_norm": 1.0936561822891235, "learning_rate": 9.999989583952383e-06, "loss": 0.1008772924542427, "memory(GiB)": 19.03, "step": 1559, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.940198 }, { "epoch": 0.05067732189845044, "grad_norm": 1.2323466539382935, "learning_rate": 9.999988458673433e-06, "loss": 0.10657203197479248, "memory(GiB)": 19.03, "step": 1560, "token_acc": 0.9321266968325792, "train_speed(iter/s)": 0.940219 }, { "epoch": 0.05070980736120586, "grad_norm": 0.9936209321022034, "learning_rate": 9.99998727568796e-06, "loss": 0.11380524188280106, "memory(GiB)": 19.03, "step": 1561, "token_acc": 0.959349593495935, "train_speed(iter/s)": 0.940341 }, { "epoch": 0.050742292823961276, "grad_norm": 1.8231664896011353, "learning_rate": 9.99998603499598e-06, "loss": 0.10586370527744293, "memory(GiB)": 19.03, "step": 1562, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.94046 }, { "epoch": 0.05077477828671669, "grad_norm": 1.3661390542984009, "learning_rate": 9.999984736597509e-06, "loss": 0.11616009473800659, "memory(GiB)": 19.03, "step": 1563, "token_acc": 0.96875, "train_speed(iter/s)": 0.940582 }, { "epoch": 0.050807263749472115, "grad_norm": 1.4507560729980469, "learning_rate": 9.999983380492556e-06, "loss": 0.123835988342762, "memory(GiB)": 19.03, "step": 1564, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.940714 }, { "epoch": 0.05083974921222753, "grad_norm": 1.4810824394226074, "learning_rate": 9.99998196668114e-06, "loss": 0.11703060567378998, "memory(GiB)": 19.03, "step": 1565, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.940829 }, { "epoch": 0.05087223467498295, "grad_norm": 2.6852240562438965, "learning_rate": 9.999980495163278e-06, "loss": 0.10617129504680634, "memory(GiB)": 19.03, "step": 1566, "token_acc": 0.9429657794676806, "train_speed(iter/s)": 0.940954 }, { "epoch": 0.05090472013773836, "grad_norm": 1.0791430473327637, "learning_rate": 9.999978965938986e-06, "loss": 0.11936276406049728, "memory(GiB)": 19.03, "step": 1567, "token_acc": 0.9551569506726457, "train_speed(iter/s)": 0.94107 }, { "epoch": 0.05093720560049378, "grad_norm": 1.178032398223877, "learning_rate": 9.999977379008282e-06, "loss": 0.12304064631462097, "memory(GiB)": 19.03, "step": 1568, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.941188 }, { "epoch": 0.050969691063249195, "grad_norm": 1.1647228002548218, "learning_rate": 9.999975734371183e-06, "loss": 0.11430341005325317, "memory(GiB)": 19.03, "step": 1569, "token_acc": 0.9728682170542635, "train_speed(iter/s)": 0.941321 }, { "epoch": 0.05100217652600461, "grad_norm": 1.2128068208694458, "learning_rate": 9.999974032027711e-06, "loss": 0.1133786290884018, "memory(GiB)": 19.03, "step": 1570, "token_acc": 0.9381443298969072, "train_speed(iter/s)": 0.941452 }, { "epoch": 0.05103466198876003, "grad_norm": 0.90090411901474, "learning_rate": 9.999972271977882e-06, "loss": 0.12352119386196136, "memory(GiB)": 19.03, "step": 1571, "token_acc": 0.944954128440367, "train_speed(iter/s)": 0.941566 }, { "epoch": 0.05106714745151545, "grad_norm": 0.9100275635719299, "learning_rate": 9.99997045422172e-06, "loss": 0.11319244652986526, "memory(GiB)": 19.03, "step": 1572, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.941695 }, { "epoch": 0.051099632914270866, "grad_norm": 1.1924388408660889, "learning_rate": 9.99996857875924e-06, "loss": 0.10592834651470184, "memory(GiB)": 19.03, "step": 1573, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.941815 }, { "epoch": 0.05113211837702628, "grad_norm": 1.9416965246200562, "learning_rate": 9.999966645590472e-06, "loss": 0.11447571218013763, "memory(GiB)": 19.03, "step": 1574, "token_acc": 0.9454545454545454, "train_speed(iter/s)": 0.941935 }, { "epoch": 0.0511646038397817, "grad_norm": 1.1873104572296143, "learning_rate": 9.999964654715432e-06, "loss": 0.11071648448705673, "memory(GiB)": 19.03, "step": 1575, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.942061 }, { "epoch": 0.051197089302537115, "grad_norm": 0.8920432329177856, "learning_rate": 9.999962606134145e-06, "loss": 0.10484567284584045, "memory(GiB)": 19.03, "step": 1576, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.942187 }, { "epoch": 0.05122957476529253, "grad_norm": 1.6920198202133179, "learning_rate": 9.999960499846636e-06, "loss": 0.1031871885061264, "memory(GiB)": 19.03, "step": 1577, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.94231 }, { "epoch": 0.05126206022804795, "grad_norm": 0.9464218616485596, "learning_rate": 9.999958335852925e-06, "loss": 0.10857698321342468, "memory(GiB)": 19.03, "step": 1578, "token_acc": 0.9518072289156626, "train_speed(iter/s)": 0.942439 }, { "epoch": 0.05129454569080336, "grad_norm": 1.5224647521972656, "learning_rate": 9.999956114153042e-06, "loss": 0.09243392944335938, "memory(GiB)": 19.03, "step": 1579, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.942558 }, { "epoch": 0.051327031153558786, "grad_norm": 1.2002359628677368, "learning_rate": 9.99995383474701e-06, "loss": 0.11236119270324707, "memory(GiB)": 19.03, "step": 1580, "token_acc": 0.948339483394834, "train_speed(iter/s)": 0.942683 }, { "epoch": 0.0513595166163142, "grad_norm": 1.3806334733963013, "learning_rate": 9.999951497634855e-06, "loss": 0.11025252193212509, "memory(GiB)": 19.03, "step": 1581, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.942811 }, { "epoch": 0.05139200207906962, "grad_norm": 0.9739217758178711, "learning_rate": 9.999949102816606e-06, "loss": 0.09972042590379715, "memory(GiB)": 19.03, "step": 1582, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.942938 }, { "epoch": 0.051424487541825034, "grad_norm": 0.8171902894973755, "learning_rate": 9.99994665029229e-06, "loss": 0.10322321951389313, "memory(GiB)": 19.03, "step": 1583, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.94306 }, { "epoch": 0.05145697300458045, "grad_norm": 1.4716016054153442, "learning_rate": 9.999944140061932e-06, "loss": 0.1130470260977745, "memory(GiB)": 19.03, "step": 1584, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.943189 }, { "epoch": 0.051489458467335866, "grad_norm": 1.0128350257873535, "learning_rate": 9.999941572125566e-06, "loss": 0.1005508303642273, "memory(GiB)": 19.03, "step": 1585, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.943302 }, { "epoch": 0.05152194393009128, "grad_norm": 0.7588074207305908, "learning_rate": 9.999938946483219e-06, "loss": 0.10810966789722443, "memory(GiB)": 19.03, "step": 1586, "token_acc": 0.9398148148148148, "train_speed(iter/s)": 0.943427 }, { "epoch": 0.0515544293928467, "grad_norm": 3.3543832302093506, "learning_rate": 9.999936263134921e-06, "loss": 0.10982674360275269, "memory(GiB)": 19.03, "step": 1587, "token_acc": 0.9581151832460733, "train_speed(iter/s)": 0.943544 }, { "epoch": 0.05158691485560212, "grad_norm": 1.0260447263717651, "learning_rate": 9.999933522080704e-06, "loss": 0.09994672238826752, "memory(GiB)": 19.03, "step": 1588, "token_acc": 0.9611650485436893, "train_speed(iter/s)": 0.943664 }, { "epoch": 0.05161940031835754, "grad_norm": 0.9245033860206604, "learning_rate": 9.999930723320601e-06, "loss": 0.12101927399635315, "memory(GiB)": 19.03, "step": 1589, "token_acc": 0.95703125, "train_speed(iter/s)": 0.943784 }, { "epoch": 0.051651885781112954, "grad_norm": 3.4783756732940674, "learning_rate": 9.99992786685464e-06, "loss": 0.10496828705072403, "memory(GiB)": 19.03, "step": 1590, "token_acc": 0.9620853080568721, "train_speed(iter/s)": 0.943893 }, { "epoch": 0.05168437124386837, "grad_norm": 2.603774070739746, "learning_rate": 9.999924952682858e-06, "loss": 0.12263926863670349, "memory(GiB)": 19.03, "step": 1591, "token_acc": 0.9453125, "train_speed(iter/s)": 0.943977 }, { "epoch": 0.051716856706623786, "grad_norm": 1.2231638431549072, "learning_rate": 9.999921980805287e-06, "loss": 0.10565292835235596, "memory(GiB)": 19.03, "step": 1592, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.944068 }, { "epoch": 0.0517493421693792, "grad_norm": 0.8777924180030823, "learning_rate": 9.99991895122196e-06, "loss": 0.10669244825839996, "memory(GiB)": 19.03, "step": 1593, "token_acc": 0.9437751004016064, "train_speed(iter/s)": 0.944162 }, { "epoch": 0.05178182763213462, "grad_norm": 0.963771402835846, "learning_rate": 9.999915863932915e-06, "loss": 0.10657601058483124, "memory(GiB)": 19.03, "step": 1594, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.94425 }, { "epoch": 0.051814313094890034, "grad_norm": 0.8928044438362122, "learning_rate": 9.999912718938186e-06, "loss": 0.10730797797441483, "memory(GiB)": 19.03, "step": 1595, "token_acc": 0.9589552238805971, "train_speed(iter/s)": 0.944329 }, { "epoch": 0.05184679855764546, "grad_norm": 1.3174517154693604, "learning_rate": 9.99990951623781e-06, "loss": 0.100282683968544, "memory(GiB)": 19.03, "step": 1596, "token_acc": 0.9505494505494505, "train_speed(iter/s)": 0.944413 }, { "epoch": 0.05187928402040087, "grad_norm": 0.9326351284980774, "learning_rate": 9.999906255831821e-06, "loss": 0.10910218954086304, "memory(GiB)": 19.03, "step": 1597, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.944487 }, { "epoch": 0.05191176948315629, "grad_norm": 1.2152841091156006, "learning_rate": 9.99990293772026e-06, "loss": 0.09307920187711716, "memory(GiB)": 19.03, "step": 1598, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.944565 }, { "epoch": 0.051944254945911705, "grad_norm": 0.8977797031402588, "learning_rate": 9.999899561903166e-06, "loss": 0.10221485048532486, "memory(GiB)": 19.03, "step": 1599, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.944652 }, { "epoch": 0.05197674040866712, "grad_norm": 0.923369824886322, "learning_rate": 9.999896128380574e-06, "loss": 0.09352453052997589, "memory(GiB)": 19.03, "step": 1600, "token_acc": 0.9579439252336449, "train_speed(iter/s)": 0.944736 }, { "epoch": 0.05200922587142254, "grad_norm": 1.1188327074050903, "learning_rate": 9.999892637152527e-06, "loss": 0.09272860735654831, "memory(GiB)": 19.03, "step": 1601, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.944824 }, { "epoch": 0.052041711334177954, "grad_norm": 0.9154172539710999, "learning_rate": 9.999889088219063e-06, "loss": 0.08957163989543915, "memory(GiB)": 19.03, "step": 1602, "token_acc": 0.9616858237547893, "train_speed(iter/s)": 0.944898 }, { "epoch": 0.05207419679693337, "grad_norm": 1.0115811824798584, "learning_rate": 9.999885481580224e-06, "loss": 0.10388088971376419, "memory(GiB)": 19.03, "step": 1603, "token_acc": 0.9446640316205533, "train_speed(iter/s)": 0.944981 }, { "epoch": 0.05210668225968879, "grad_norm": 0.9024046659469604, "learning_rate": 9.999881817236053e-06, "loss": 0.1016637533903122, "memory(GiB)": 19.03, "step": 1604, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.944945 }, { "epoch": 0.05213916772244421, "grad_norm": 0.9614505767822266, "learning_rate": 9.99987809518659e-06, "loss": 0.09797091037034988, "memory(GiB)": 19.03, "step": 1605, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.94501 }, { "epoch": 0.052171653185199625, "grad_norm": 1.4390522241592407, "learning_rate": 9.99987431543188e-06, "loss": 0.11817504465579987, "memory(GiB)": 19.03, "step": 1606, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.9451 }, { "epoch": 0.05220413864795504, "grad_norm": 1.0011621713638306, "learning_rate": 9.999870477971965e-06, "loss": 0.11545595526695251, "memory(GiB)": 19.03, "step": 1607, "token_acc": 0.9490196078431372, "train_speed(iter/s)": 0.945188 }, { "epoch": 0.05223662411071046, "grad_norm": 1.072740077972412, "learning_rate": 9.99986658280689e-06, "loss": 0.10282249748706818, "memory(GiB)": 19.03, "step": 1608, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.94528 }, { "epoch": 0.05226910957346587, "grad_norm": 1.93665611743927, "learning_rate": 9.9998626299367e-06, "loss": 0.10648128390312195, "memory(GiB)": 19.03, "step": 1609, "token_acc": 0.9439252336448598, "train_speed(iter/s)": 0.945369 }, { "epoch": 0.05230159503622129, "grad_norm": 1.4359716176986694, "learning_rate": 9.99985861936144e-06, "loss": 0.10610725730657578, "memory(GiB)": 19.03, "step": 1610, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 0.945457 }, { "epoch": 0.052334080498976705, "grad_norm": 1.131794810295105, "learning_rate": 9.999854551081157e-06, "loss": 0.10793445259332657, "memory(GiB)": 19.03, "step": 1611, "token_acc": 0.95, "train_speed(iter/s)": 0.945541 }, { "epoch": 0.05236656596173213, "grad_norm": 0.7242187857627869, "learning_rate": 9.999850425095897e-06, "loss": 0.10438014566898346, "memory(GiB)": 19.03, "step": 1612, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.945636 }, { "epoch": 0.052399051424487544, "grad_norm": 0.7183114886283875, "learning_rate": 9.99984624140571e-06, "loss": 0.08859557658433914, "memory(GiB)": 19.03, "step": 1613, "token_acc": 0.9495412844036697, "train_speed(iter/s)": 0.945728 }, { "epoch": 0.05243153688724296, "grad_norm": 1.2456505298614502, "learning_rate": 9.99984200001064e-06, "loss": 0.10157079249620438, "memory(GiB)": 19.03, "step": 1614, "token_acc": 0.9563318777292577, "train_speed(iter/s)": 0.945847 }, { "epoch": 0.05246402234999838, "grad_norm": 1.5364447832107544, "learning_rate": 9.99983770091074e-06, "loss": 0.10766783356666565, "memory(GiB)": 19.03, "step": 1615, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.945971 }, { "epoch": 0.05249650781275379, "grad_norm": 3.222099781036377, "learning_rate": 9.999833344106058e-06, "loss": 0.1107797846198082, "memory(GiB)": 19.03, "step": 1616, "token_acc": 0.9560439560439561, "train_speed(iter/s)": 0.946092 }, { "epoch": 0.05252899327550921, "grad_norm": 1.0106972455978394, "learning_rate": 9.999828929596644e-06, "loss": 0.10873228311538696, "memory(GiB)": 19.03, "step": 1617, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.946207 }, { "epoch": 0.052561478738264625, "grad_norm": 0.8745402693748474, "learning_rate": 9.999824457382551e-06, "loss": 0.11680688709020615, "memory(GiB)": 19.03, "step": 1618, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.946322 }, { "epoch": 0.05259396420102004, "grad_norm": 1.0121662616729736, "learning_rate": 9.999819927463827e-06, "loss": 0.0938086211681366, "memory(GiB)": 19.03, "step": 1619, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.946415 }, { "epoch": 0.052626449663775464, "grad_norm": 0.922465443611145, "learning_rate": 9.999815339840528e-06, "loss": 0.09356396645307541, "memory(GiB)": 19.03, "step": 1620, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.946535 }, { "epoch": 0.05265893512653088, "grad_norm": 0.5927395224571228, "learning_rate": 9.999810694512704e-06, "loss": 0.0826253741979599, "memory(GiB)": 19.03, "step": 1621, "token_acc": 0.9713114754098361, "train_speed(iter/s)": 0.946658 }, { "epoch": 0.052691420589286296, "grad_norm": 1.1791824102401733, "learning_rate": 9.99980599148041e-06, "loss": 0.09827820211648941, "memory(GiB)": 19.03, "step": 1622, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.946772 }, { "epoch": 0.05272390605204171, "grad_norm": 1.192817211151123, "learning_rate": 9.999801230743703e-06, "loss": 0.10284647345542908, "memory(GiB)": 19.03, "step": 1623, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.94689 }, { "epoch": 0.05275639151479713, "grad_norm": 0.8526455163955688, "learning_rate": 9.999796412302631e-06, "loss": 0.08855374157428741, "memory(GiB)": 19.03, "step": 1624, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.947003 }, { "epoch": 0.052788876977552544, "grad_norm": 0.9072294235229492, "learning_rate": 9.999791536157255e-06, "loss": 0.09961554408073425, "memory(GiB)": 19.03, "step": 1625, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.94712 }, { "epoch": 0.05282136244030796, "grad_norm": 1.3501272201538086, "learning_rate": 9.999786602307631e-06, "loss": 0.12890106439590454, "memory(GiB)": 19.03, "step": 1626, "token_acc": 0.9518072289156626, "train_speed(iter/s)": 0.947244 }, { "epoch": 0.052853847903063376, "grad_norm": 1.0141355991363525, "learning_rate": 9.999781610753813e-06, "loss": 0.09605114907026291, "memory(GiB)": 19.03, "step": 1627, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.947367 }, { "epoch": 0.0528863333658188, "grad_norm": 1.3215140104293823, "learning_rate": 9.999776561495864e-06, "loss": 0.10471422970294952, "memory(GiB)": 19.03, "step": 1628, "token_acc": 0.9453551912568307, "train_speed(iter/s)": 0.947485 }, { "epoch": 0.052918818828574216, "grad_norm": 1.1384968757629395, "learning_rate": 9.999771454533835e-06, "loss": 0.10154898464679718, "memory(GiB)": 19.03, "step": 1629, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.947603 }, { "epoch": 0.05295130429132963, "grad_norm": 1.220380425453186, "learning_rate": 9.99976628986779e-06, "loss": 0.10121374577283859, "memory(GiB)": 19.03, "step": 1630, "token_acc": 0.944206008583691, "train_speed(iter/s)": 0.947718 }, { "epoch": 0.05298378975408505, "grad_norm": 1.096673607826233, "learning_rate": 9.99976106749779e-06, "loss": 0.10267718136310577, "memory(GiB)": 19.03, "step": 1631, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.947827 }, { "epoch": 0.053016275216840464, "grad_norm": 1.488095998764038, "learning_rate": 9.999755787423889e-06, "loss": 0.09848150610923767, "memory(GiB)": 19.03, "step": 1632, "token_acc": 0.946524064171123, "train_speed(iter/s)": 0.947948 }, { "epoch": 0.05304876067959588, "grad_norm": 0.9975631237030029, "learning_rate": 9.999750449646151e-06, "loss": 0.10820238292217255, "memory(GiB)": 19.03, "step": 1633, "token_acc": 0.9414414414414415, "train_speed(iter/s)": 0.948067 }, { "epoch": 0.053081246142351296, "grad_norm": 1.030568242073059, "learning_rate": 9.99974505416464e-06, "loss": 0.1128392368555069, "memory(GiB)": 19.03, "step": 1634, "token_acc": 0.9482071713147411, "train_speed(iter/s)": 0.948174 }, { "epoch": 0.05311373160510671, "grad_norm": 1.4082633256912231, "learning_rate": 9.999739600979417e-06, "loss": 0.1105010136961937, "memory(GiB)": 19.03, "step": 1635, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.948294 }, { "epoch": 0.053146217067862135, "grad_norm": 0.704929769039154, "learning_rate": 9.999734090090544e-06, "loss": 0.10567453503608704, "memory(GiB)": 19.03, "step": 1636, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.948404 }, { "epoch": 0.05317870253061755, "grad_norm": 1.3465837240219116, "learning_rate": 9.999728521498084e-06, "loss": 0.09721944481134415, "memory(GiB)": 19.03, "step": 1637, "token_acc": 0.9409448818897638, "train_speed(iter/s)": 0.94851 }, { "epoch": 0.05321118799337297, "grad_norm": 0.9206941723823547, "learning_rate": 9.999722895202104e-06, "loss": 0.1154288798570633, "memory(GiB)": 19.03, "step": 1638, "token_acc": 0.9362549800796812, "train_speed(iter/s)": 0.948627 }, { "epoch": 0.05324367345612838, "grad_norm": 1.1169408559799194, "learning_rate": 9.999717211202665e-06, "loss": 0.1140485405921936, "memory(GiB)": 19.03, "step": 1639, "token_acc": 0.9408866995073891, "train_speed(iter/s)": 0.94874 }, { "epoch": 0.0532761589188838, "grad_norm": 0.6193817853927612, "learning_rate": 9.999711469499836e-06, "loss": 0.09330439567565918, "memory(GiB)": 19.03, "step": 1640, "token_acc": 0.9595588235294118, "train_speed(iter/s)": 0.948849 }, { "epoch": 0.053308644381639216, "grad_norm": 0.8554635047912598, "learning_rate": 9.999705670093682e-06, "loss": 0.09729541093111038, "memory(GiB)": 19.03, "step": 1641, "token_acc": 0.9395017793594306, "train_speed(iter/s)": 0.948968 }, { "epoch": 0.05334112984439463, "grad_norm": 0.7938609719276428, "learning_rate": 9.99969981298427e-06, "loss": 0.10596194118261337, "memory(GiB)": 19.03, "step": 1642, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.949079 }, { "epoch": 0.05337361530715005, "grad_norm": 1.2554999589920044, "learning_rate": 9.999693898171668e-06, "loss": 0.09781964123249054, "memory(GiB)": 19.03, "step": 1643, "token_acc": 0.96875, "train_speed(iter/s)": 0.949182 }, { "epoch": 0.05340610076990547, "grad_norm": 0.9915162920951843, "learning_rate": 9.999687925655943e-06, "loss": 0.08593706041574478, "memory(GiB)": 19.03, "step": 1644, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.9493 }, { "epoch": 0.05343858623266089, "grad_norm": 0.8631505966186523, "learning_rate": 9.999681895437165e-06, "loss": 0.10235679149627686, "memory(GiB)": 19.03, "step": 1645, "token_acc": 0.9602888086642599, "train_speed(iter/s)": 0.949423 }, { "epoch": 0.0534710716954163, "grad_norm": 1.0656663179397583, "learning_rate": 9.999675807515403e-06, "loss": 0.10021714866161346, "memory(GiB)": 19.03, "step": 1646, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.949509 }, { "epoch": 0.05350355715817172, "grad_norm": 1.7187856435775757, "learning_rate": 9.999669661890727e-06, "loss": 0.10137387365102768, "memory(GiB)": 19.03, "step": 1647, "token_acc": 0.9580152671755725, "train_speed(iter/s)": 0.949602 }, { "epoch": 0.053536042620927135, "grad_norm": 0.8285638689994812, "learning_rate": 9.999663458563211e-06, "loss": 0.09704267978668213, "memory(GiB)": 19.03, "step": 1648, "token_acc": 0.9395348837209302, "train_speed(iter/s)": 0.949682 }, { "epoch": 0.05356852808368255, "grad_norm": 0.9988080859184265, "learning_rate": 9.99965719753292e-06, "loss": 0.11155810952186584, "memory(GiB)": 19.03, "step": 1649, "token_acc": 0.9569377990430622, "train_speed(iter/s)": 0.949776 }, { "epoch": 0.05360101354643797, "grad_norm": 1.1587612628936768, "learning_rate": 9.999650878799935e-06, "loss": 0.10622982680797577, "memory(GiB)": 19.03, "step": 1650, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.949868 }, { "epoch": 0.05363349900919338, "grad_norm": 1.1843770742416382, "learning_rate": 9.999644502364323e-06, "loss": 0.09989932179450989, "memory(GiB)": 19.03, "step": 1651, "token_acc": 0.9495412844036697, "train_speed(iter/s)": 0.949955 }, { "epoch": 0.053665984471948806, "grad_norm": 4.557580471038818, "learning_rate": 9.999638068226158e-06, "loss": 0.10742561519145966, "memory(GiB)": 19.03, "step": 1652, "token_acc": 0.9402390438247012, "train_speed(iter/s)": 0.950042 }, { "epoch": 0.05369846993470422, "grad_norm": 0.9463300108909607, "learning_rate": 9.999631576385515e-06, "loss": 0.10167037695646286, "memory(GiB)": 19.03, "step": 1653, "token_acc": 0.946360153256705, "train_speed(iter/s)": 0.950117 }, { "epoch": 0.05373095539745964, "grad_norm": 1.3620014190673828, "learning_rate": 9.99962502684247e-06, "loss": 0.1344553828239441, "memory(GiB)": 19.03, "step": 1654, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.950182 }, { "epoch": 0.053763440860215055, "grad_norm": 0.9241288900375366, "learning_rate": 9.999618419597097e-06, "loss": 0.10471262037754059, "memory(GiB)": 19.03, "step": 1655, "token_acc": 0.9547169811320755, "train_speed(iter/s)": 0.950266 }, { "epoch": 0.05379592632297047, "grad_norm": 0.7770726084709167, "learning_rate": 9.999611754649474e-06, "loss": 0.10563976317644119, "memory(GiB)": 19.03, "step": 1656, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.950349 }, { "epoch": 0.05382841178572589, "grad_norm": 0.7564349174499512, "learning_rate": 9.999605031999677e-06, "loss": 0.10445106029510498, "memory(GiB)": 19.03, "step": 1657, "token_acc": 0.9612068965517241, "train_speed(iter/s)": 0.950421 }, { "epoch": 0.0538608972484813, "grad_norm": 0.5931096076965332, "learning_rate": 9.999598251647782e-06, "loss": 0.099838986992836, "memory(GiB)": 19.03, "step": 1658, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.950503 }, { "epoch": 0.05389338271123672, "grad_norm": 0.8415447473526001, "learning_rate": 9.99959141359387e-06, "loss": 0.10355577617883682, "memory(GiB)": 19.03, "step": 1659, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.950573 }, { "epoch": 0.05392586817399214, "grad_norm": 0.7389972805976868, "learning_rate": 9.999584517838018e-06, "loss": 0.09795008599758148, "memory(GiB)": 19.03, "step": 1660, "token_acc": 0.9440298507462687, "train_speed(iter/s)": 0.950646 }, { "epoch": 0.05395835363674756, "grad_norm": 0.8570007681846619, "learning_rate": 9.999577564380306e-06, "loss": 0.09658049046993256, "memory(GiB)": 19.03, "step": 1661, "token_acc": 0.9586776859504132, "train_speed(iter/s)": 0.950728 }, { "epoch": 0.053990839099502974, "grad_norm": 0.8239367008209229, "learning_rate": 9.999570553220814e-06, "loss": 0.09664911031723022, "memory(GiB)": 19.03, "step": 1662, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.950808 }, { "epoch": 0.05402332456225839, "grad_norm": 1.671470284461975, "learning_rate": 9.999563484359625e-06, "loss": 0.10230854153633118, "memory(GiB)": 19.03, "step": 1663, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.950892 }, { "epoch": 0.054055810025013806, "grad_norm": 1.3459875583648682, "learning_rate": 9.999556357796818e-06, "loss": 0.10425382107496262, "memory(GiB)": 19.03, "step": 1664, "token_acc": 0.9478260869565217, "train_speed(iter/s)": 0.950974 }, { "epoch": 0.05408829548776922, "grad_norm": 1.4691457748413086, "learning_rate": 9.999549173532477e-06, "loss": 0.10550912469625473, "memory(GiB)": 19.03, "step": 1665, "token_acc": 0.9504132231404959, "train_speed(iter/s)": 0.951063 }, { "epoch": 0.05412078095052464, "grad_norm": 1.0799261331558228, "learning_rate": 9.999541931566683e-06, "loss": 0.10663751512765884, "memory(GiB)": 19.03, "step": 1666, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.951153 }, { "epoch": 0.054153266413280055, "grad_norm": 1.1100717782974243, "learning_rate": 9.99953463189952e-06, "loss": 0.12127593159675598, "memory(GiB)": 19.03, "step": 1667, "token_acc": 0.9351851851851852, "train_speed(iter/s)": 0.951249 }, { "epoch": 0.05418575187603548, "grad_norm": 1.0917308330535889, "learning_rate": 9.999527274531076e-06, "loss": 0.10365559160709381, "memory(GiB)": 19.03, "step": 1668, "token_acc": 0.9409282700421941, "train_speed(iter/s)": 0.95134 }, { "epoch": 0.054218237338790894, "grad_norm": 1.1839061975479126, "learning_rate": 9.999519859461431e-06, "loss": 0.1079520583152771, "memory(GiB)": 19.03, "step": 1669, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.951428 }, { "epoch": 0.05425072280154631, "grad_norm": 0.7552312612533569, "learning_rate": 9.999512386690674e-06, "loss": 0.10466034710407257, "memory(GiB)": 19.03, "step": 1670, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.951512 }, { "epoch": 0.054283208264301726, "grad_norm": 0.889228105545044, "learning_rate": 9.999504856218886e-06, "loss": 0.10548105835914612, "memory(GiB)": 19.03, "step": 1671, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.951577 }, { "epoch": 0.05431569372705714, "grad_norm": 0.7347682118415833, "learning_rate": 9.999497268046161e-06, "loss": 0.10271325707435608, "memory(GiB)": 19.03, "step": 1672, "token_acc": 0.9625668449197861, "train_speed(iter/s)": 0.951663 }, { "epoch": 0.05434817918981256, "grad_norm": 1.2618063688278198, "learning_rate": 9.999489622172583e-06, "loss": 0.1036297157406807, "memory(GiB)": 19.03, "step": 1673, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.951762 }, { "epoch": 0.054380664652567974, "grad_norm": 0.7692162394523621, "learning_rate": 9.99948191859824e-06, "loss": 0.11102192103862762, "memory(GiB)": 19.03, "step": 1674, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.95185 }, { "epoch": 0.05441315011532339, "grad_norm": 0.9430281519889832, "learning_rate": 9.999474157323221e-06, "loss": 0.10969122499227524, "memory(GiB)": 19.03, "step": 1675, "token_acc": 0.9458483754512635, "train_speed(iter/s)": 0.951968 }, { "epoch": 0.05444563557807881, "grad_norm": 1.7558897733688354, "learning_rate": 9.999466338347616e-06, "loss": 0.11569657176733017, "memory(GiB)": 19.03, "step": 1676, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.952071 }, { "epoch": 0.05447812104083423, "grad_norm": 1.2268924713134766, "learning_rate": 9.999458461671515e-06, "loss": 0.103854238986969, "memory(GiB)": 19.03, "step": 1677, "token_acc": 0.9653679653679653, "train_speed(iter/s)": 0.952184 }, { "epoch": 0.054510606503589645, "grad_norm": 0.9198592901229858, "learning_rate": 9.99945052729501e-06, "loss": 0.10920913517475128, "memory(GiB)": 19.03, "step": 1678, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.952299 }, { "epoch": 0.05454309196634506, "grad_norm": 5.982636451721191, "learning_rate": 9.99944253521819e-06, "loss": 0.10753677785396576, "memory(GiB)": 19.03, "step": 1679, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.952407 }, { "epoch": 0.05457557742910048, "grad_norm": 0.7733917236328125, "learning_rate": 9.999434485441149e-06, "loss": 0.10438643395900726, "memory(GiB)": 19.03, "step": 1680, "token_acc": 0.968, "train_speed(iter/s)": 0.952513 }, { "epoch": 0.054608062891855894, "grad_norm": 5.182475566864014, "learning_rate": 9.999426377963981e-06, "loss": 0.10431042313575745, "memory(GiB)": 19.03, "step": 1681, "token_acc": 0.9484126984126984, "train_speed(iter/s)": 0.952628 }, { "epoch": 0.05464054835461131, "grad_norm": 2.1614136695861816, "learning_rate": 9.999418212786777e-06, "loss": 0.1131499633193016, "memory(GiB)": 19.03, "step": 1682, "token_acc": 0.941747572815534, "train_speed(iter/s)": 0.952723 }, { "epoch": 0.054673033817366726, "grad_norm": 1.3018907308578491, "learning_rate": 9.999409989909634e-06, "loss": 0.12362181395292282, "memory(GiB)": 19.03, "step": 1683, "token_acc": 0.9553264604810997, "train_speed(iter/s)": 0.952822 }, { "epoch": 0.05470551928012215, "grad_norm": 0.900160551071167, "learning_rate": 9.999401709332646e-06, "loss": 0.10193222761154175, "memory(GiB)": 19.03, "step": 1684, "token_acc": 0.9693877551020408, "train_speed(iter/s)": 0.952926 }, { "epoch": 0.054738004742877565, "grad_norm": 1.6179587841033936, "learning_rate": 9.999393371055907e-06, "loss": 0.10405656695365906, "memory(GiB)": 19.03, "step": 1685, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.953038 }, { "epoch": 0.05477049020563298, "grad_norm": 1.2659612894058228, "learning_rate": 9.999384975079515e-06, "loss": 0.11765481531620026, "memory(GiB)": 19.03, "step": 1686, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.953142 }, { "epoch": 0.0548029756683884, "grad_norm": 1.0101888179779053, "learning_rate": 9.999376521403566e-06, "loss": 0.10837981849908829, "memory(GiB)": 19.03, "step": 1687, "token_acc": 0.9311594202898551, "train_speed(iter/s)": 0.953259 }, { "epoch": 0.05483546113114381, "grad_norm": 0.6629707217216492, "learning_rate": 9.999368010028157e-06, "loss": 0.10331571102142334, "memory(GiB)": 19.03, "step": 1688, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.953365 }, { "epoch": 0.05486794659389923, "grad_norm": 1.9370371103286743, "learning_rate": 9.999359440953388e-06, "loss": 0.1021057665348053, "memory(GiB)": 19.03, "step": 1689, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.953477 }, { "epoch": 0.054900432056654645, "grad_norm": 1.074863076210022, "learning_rate": 9.999350814179356e-06, "loss": 0.11453744024038315, "memory(GiB)": 19.03, "step": 1690, "token_acc": 0.9469026548672567, "train_speed(iter/s)": 0.953596 }, { "epoch": 0.05493291751941006, "grad_norm": 0.9204773902893066, "learning_rate": 9.999342129706163e-06, "loss": 0.12028298527002335, "memory(GiB)": 19.03, "step": 1691, "token_acc": 0.9482071713147411, "train_speed(iter/s)": 0.953706 }, { "epoch": 0.054965402982165484, "grad_norm": 1.1911166906356812, "learning_rate": 9.999333387533905e-06, "loss": 0.10583219677209854, "memory(GiB)": 19.03, "step": 1692, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.953815 }, { "epoch": 0.0549978884449209, "grad_norm": 0.7293856143951416, "learning_rate": 9.999324587662689e-06, "loss": 0.09880882501602173, "memory(GiB)": 19.03, "step": 1693, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.953922 }, { "epoch": 0.05503037390767632, "grad_norm": 0.9505778551101685, "learning_rate": 9.99931573009261e-06, "loss": 0.09468618035316467, "memory(GiB)": 19.03, "step": 1694, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.954026 }, { "epoch": 0.05506285937043173, "grad_norm": 0.923938512802124, "learning_rate": 9.999306814823775e-06, "loss": 0.11601674556732178, "memory(GiB)": 19.03, "step": 1695, "token_acc": 0.9447004608294931, "train_speed(iter/s)": 0.954143 }, { "epoch": 0.05509534483318715, "grad_norm": 1.4527846574783325, "learning_rate": 9.999297841856286e-06, "loss": 0.10849010199308395, "memory(GiB)": 19.03, "step": 1696, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.954261 }, { "epoch": 0.055127830295942565, "grad_norm": 1.3489161729812622, "learning_rate": 9.999288811190245e-06, "loss": 0.09905403107404709, "memory(GiB)": 19.03, "step": 1697, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.954372 }, { "epoch": 0.05516031575869798, "grad_norm": 1.3938134908676147, "learning_rate": 9.999279722825757e-06, "loss": 0.11455792188644409, "memory(GiB)": 19.03, "step": 1698, "token_acc": 0.9467455621301775, "train_speed(iter/s)": 0.954485 }, { "epoch": 0.0551928012214534, "grad_norm": 1.0014628171920776, "learning_rate": 9.999270576762927e-06, "loss": 0.08988244831562042, "memory(GiB)": 19.03, "step": 1699, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.9546 }, { "epoch": 0.05522528668420882, "grad_norm": 0.8993846774101257, "learning_rate": 9.99926137300186e-06, "loss": 0.10918466001749039, "memory(GiB)": 19.03, "step": 1700, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.954705 }, { "epoch": 0.055257772146964236, "grad_norm": 1.3794589042663574, "learning_rate": 9.999252111542663e-06, "loss": 0.1216101199388504, "memory(GiB)": 19.03, "step": 1701, "token_acc": 0.9437086092715232, "train_speed(iter/s)": 0.954818 }, { "epoch": 0.05529025760971965, "grad_norm": 1.9721250534057617, "learning_rate": 9.999242792385443e-06, "loss": 0.09897299110889435, "memory(GiB)": 19.03, "step": 1702, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.954928 }, { "epoch": 0.05532274307247507, "grad_norm": 0.9764074087142944, "learning_rate": 9.999233415530308e-06, "loss": 0.10311876237392426, "memory(GiB)": 19.03, "step": 1703, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.955015 }, { "epoch": 0.055355228535230484, "grad_norm": 1.221593976020813, "learning_rate": 9.999223980977364e-06, "loss": 0.09442433714866638, "memory(GiB)": 19.03, "step": 1704, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.955105 }, { "epoch": 0.0553877139979859, "grad_norm": 1.3031598329544067, "learning_rate": 9.999214488726722e-06, "loss": 0.10355211049318314, "memory(GiB)": 19.03, "step": 1705, "token_acc": 0.9526627218934911, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.055420199460741316, "grad_norm": 1.861907958984375, "learning_rate": 9.999204938778493e-06, "loss": 0.08871562778949738, "memory(GiB)": 19.03, "step": 1706, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.05545268492349673, "grad_norm": 0.9780073761940002, "learning_rate": 9.999195331132781e-06, "loss": 0.08649738132953644, "memory(GiB)": 19.03, "step": 1707, "token_acc": 0.9673469387755103, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.055485170386252156, "grad_norm": 1.0447674989700317, "learning_rate": 9.999185665789704e-06, "loss": 0.08314535766839981, "memory(GiB)": 19.03, "step": 1708, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.955439 }, { "epoch": 0.05551765584900757, "grad_norm": 1.3370157480239868, "learning_rate": 9.999175942749368e-06, "loss": 0.1018797755241394, "memory(GiB)": 19.03, "step": 1709, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.9555 }, { "epoch": 0.05555014131176299, "grad_norm": 1.2507002353668213, "learning_rate": 9.99916616201189e-06, "loss": 0.10445015132427216, "memory(GiB)": 19.03, "step": 1710, "token_acc": 0.9467213114754098, "train_speed(iter/s)": 0.955587 }, { "epoch": 0.055582626774518404, "grad_norm": 1.7243435382843018, "learning_rate": 9.999156323577379e-06, "loss": 0.12492743134498596, "memory(GiB)": 19.03, "step": 1711, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.05561511223727382, "grad_norm": 0.7730182409286499, "learning_rate": 9.999146427445952e-06, "loss": 0.08789432048797607, "memory(GiB)": 19.03, "step": 1712, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.055647597700029236, "grad_norm": 0.7697861790657043, "learning_rate": 9.99913647361772e-06, "loss": 0.10401389002799988, "memory(GiB)": 19.03, "step": 1713, "token_acc": 0.9401408450704225, "train_speed(iter/s)": 0.955834 }, { "epoch": 0.05568008316278465, "grad_norm": 1.5729094743728638, "learning_rate": 9.999126462092799e-06, "loss": 0.09882514178752899, "memory(GiB)": 19.03, "step": 1714, "token_acc": 0.953125, "train_speed(iter/s)": 0.955906 }, { "epoch": 0.05571256862554007, "grad_norm": 0.6555546522140503, "learning_rate": 9.999116392871305e-06, "loss": 0.11553863435983658, "memory(GiB)": 19.03, "step": 1715, "token_acc": 0.9448818897637795, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.05574505408829549, "grad_norm": 0.9693384766578674, "learning_rate": 9.999106265953355e-06, "loss": 0.08846218138933182, "memory(GiB)": 19.03, "step": 1716, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.05577753955105091, "grad_norm": 1.3296383619308472, "learning_rate": 9.999096081339064e-06, "loss": 0.10427158325910568, "memory(GiB)": 19.03, "step": 1717, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.05581002501380632, "grad_norm": 0.6587861776351929, "learning_rate": 9.99908583902855e-06, "loss": 0.09618838131427765, "memory(GiB)": 19.03, "step": 1718, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.05584251047656174, "grad_norm": 1.635479211807251, "learning_rate": 9.999075539021932e-06, "loss": 0.08979026228189468, "memory(GiB)": 19.03, "step": 1719, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.055874995939317156, "grad_norm": 1.2032499313354492, "learning_rate": 9.99906518131933e-06, "loss": 0.11876831948757172, "memory(GiB)": 19.03, "step": 1720, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.05590748140207257, "grad_norm": 0.7335876226425171, "learning_rate": 9.999054765920862e-06, "loss": 0.10514887422323227, "memory(GiB)": 19.03, "step": 1721, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.05593996686482799, "grad_norm": 0.7214311957359314, "learning_rate": 9.999044292826644e-06, "loss": 0.09779596328735352, "memory(GiB)": 19.03, "step": 1722, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.055972452327583404, "grad_norm": 1.0930389165878296, "learning_rate": 9.999033762036807e-06, "loss": 0.1111205443739891, "memory(GiB)": 19.03, "step": 1723, "token_acc": 0.9559471365638766, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.05600493779033883, "grad_norm": 0.7503601908683777, "learning_rate": 9.999023173551464e-06, "loss": 0.10468579083681107, "memory(GiB)": 19.03, "step": 1724, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.956703 }, { "epoch": 0.05603742325309424, "grad_norm": 1.1777509450912476, "learning_rate": 9.99901252737074e-06, "loss": 0.10355231165885925, "memory(GiB)": 19.03, "step": 1725, "token_acc": 0.9664179104477612, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.05606990871584966, "grad_norm": 0.6491341590881348, "learning_rate": 9.999001823494757e-06, "loss": 0.09192083775997162, "memory(GiB)": 19.03, "step": 1726, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.056102394178605075, "grad_norm": 0.64290851354599, "learning_rate": 9.99899106192364e-06, "loss": 0.09616127610206604, "memory(GiB)": 19.03, "step": 1727, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.05613487964136049, "grad_norm": 0.866704523563385, "learning_rate": 9.998980242657511e-06, "loss": 0.10463366657495499, "memory(GiB)": 19.03, "step": 1728, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.05616736510411591, "grad_norm": 0.8682098388671875, "learning_rate": 9.998969365696499e-06, "loss": 0.09514594078063965, "memory(GiB)": 19.03, "step": 1729, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.05619985056687132, "grad_norm": 0.8512489795684814, "learning_rate": 9.998958431040725e-06, "loss": 0.10342284291982651, "memory(GiB)": 19.03, "step": 1730, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.957227 }, { "epoch": 0.05623233602962674, "grad_norm": 0.5882185697555542, "learning_rate": 9.998947438690319e-06, "loss": 0.09181282669305801, "memory(GiB)": 19.03, "step": 1731, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.05626482149238216, "grad_norm": 1.0432156324386597, "learning_rate": 9.998936388645403e-06, "loss": 0.08609647303819656, "memory(GiB)": 19.03, "step": 1732, "token_acc": 0.948936170212766, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.05629730695513758, "grad_norm": 0.9499170780181885, "learning_rate": 9.99892528090611e-06, "loss": 0.11438646912574768, "memory(GiB)": 19.03, "step": 1733, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.957462 }, { "epoch": 0.056329792417892995, "grad_norm": 0.851962149143219, "learning_rate": 9.998914115472566e-06, "loss": 0.0981171727180481, "memory(GiB)": 19.03, "step": 1734, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.95755 }, { "epoch": 0.05636227788064841, "grad_norm": 1.1511664390563965, "learning_rate": 9.998902892344899e-06, "loss": 0.10684943944215775, "memory(GiB)": 19.03, "step": 1735, "token_acc": 0.964, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.05639476334340383, "grad_norm": 0.8377902507781982, "learning_rate": 9.998891611523238e-06, "loss": 0.10394541919231415, "memory(GiB)": 19.03, "step": 1736, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.05642724880615924, "grad_norm": 0.7365651726722717, "learning_rate": 9.998880273007716e-06, "loss": 0.09452660381793976, "memory(GiB)": 19.03, "step": 1737, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.05645973426891466, "grad_norm": 0.8491837978363037, "learning_rate": 9.99886887679846e-06, "loss": 0.10485436022281647, "memory(GiB)": 19.03, "step": 1738, "token_acc": 0.9446808510638298, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.056492219731670075, "grad_norm": 0.9730795621871948, "learning_rate": 9.998857422895604e-06, "loss": 0.10957959294319153, "memory(GiB)": 19.03, "step": 1739, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.0565247051944255, "grad_norm": 0.698284924030304, "learning_rate": 9.998845911299281e-06, "loss": 0.09174811094999313, "memory(GiB)": 19.03, "step": 1740, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.056557190657180914, "grad_norm": 0.9614757299423218, "learning_rate": 9.998834342009623e-06, "loss": 0.09909717738628387, "memory(GiB)": 19.03, "step": 1741, "token_acc": 0.9497907949790795, "train_speed(iter/s)": 0.958277 }, { "epoch": 0.05658967611993633, "grad_norm": 0.9558191895484924, "learning_rate": 9.998822715026764e-06, "loss": 0.10172408819198608, "memory(GiB)": 19.03, "step": 1742, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.95839 }, { "epoch": 0.056622161582691746, "grad_norm": 0.8100528717041016, "learning_rate": 9.998811030350834e-06, "loss": 0.09692776948213577, "memory(GiB)": 19.03, "step": 1743, "token_acc": 0.9447236180904522, "train_speed(iter/s)": 0.958476 }, { "epoch": 0.05665464704544716, "grad_norm": 0.7404989004135132, "learning_rate": 9.998799287981975e-06, "loss": 0.09647701680660248, "memory(GiB)": 19.03, "step": 1744, "token_acc": 0.9484126984126984, "train_speed(iter/s)": 0.958589 }, { "epoch": 0.05668713250820258, "grad_norm": 1.6044702529907227, "learning_rate": 9.998787487920316e-06, "loss": 0.10387768596410751, "memory(GiB)": 19.03, "step": 1745, "token_acc": 0.9330357142857143, "train_speed(iter/s)": 0.95869 }, { "epoch": 0.056719617970957995, "grad_norm": 1.0467486381530762, "learning_rate": 9.998775630165997e-06, "loss": 0.09965188801288605, "memory(GiB)": 19.03, "step": 1746, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.958796 }, { "epoch": 0.05675210343371341, "grad_norm": 1.3338264226913452, "learning_rate": 9.998763714719156e-06, "loss": 0.10202742367982864, "memory(GiB)": 19.03, "step": 1747, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.958903 }, { "epoch": 0.056784588896468834, "grad_norm": 3.227611541748047, "learning_rate": 9.998751741579926e-06, "loss": 0.10716596245765686, "memory(GiB)": 19.03, "step": 1748, "token_acc": 0.953307392996109, "train_speed(iter/s)": 0.959011 }, { "epoch": 0.05681707435922425, "grad_norm": 1.9982671737670898, "learning_rate": 9.99873971074845e-06, "loss": 0.11478661000728607, "memory(GiB)": 19.03, "step": 1749, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.959116 }, { "epoch": 0.056849559821979666, "grad_norm": 0.812798798084259, "learning_rate": 9.99872762222486e-06, "loss": 0.10007994621992111, "memory(GiB)": 19.03, "step": 1750, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.959229 }, { "epoch": 0.05688204528473508, "grad_norm": 0.9339358806610107, "learning_rate": 9.998715476009305e-06, "loss": 0.11649112403392792, "memory(GiB)": 19.03, "step": 1751, "token_acc": 0.9401709401709402, "train_speed(iter/s)": 0.959333 }, { "epoch": 0.0569145307474905, "grad_norm": 0.9358962774276733, "learning_rate": 9.998703272101916e-06, "loss": 0.11432997137308121, "memory(GiB)": 19.03, "step": 1752, "token_acc": 0.9469387755102041, "train_speed(iter/s)": 0.959437 }, { "epoch": 0.056947016210245914, "grad_norm": 1.435568928718567, "learning_rate": 9.99869101050284e-06, "loss": 0.10503388941287994, "memory(GiB)": 19.03, "step": 1753, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.959552 }, { "epoch": 0.05697950167300133, "grad_norm": 1.701131820678711, "learning_rate": 9.998678691212215e-06, "loss": 0.1082969605922699, "memory(GiB)": 19.03, "step": 1754, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.959661 }, { "epoch": 0.057011987135756746, "grad_norm": 1.0716428756713867, "learning_rate": 9.998666314230187e-06, "loss": 0.11108951270580292, "memory(GiB)": 19.03, "step": 1755, "token_acc": 0.95703125, "train_speed(iter/s)": 0.959766 }, { "epoch": 0.05704447259851217, "grad_norm": 1.311429738998413, "learning_rate": 9.998653879556894e-06, "loss": 0.11750346422195435, "memory(GiB)": 19.03, "step": 1756, "token_acc": 0.9480968858131488, "train_speed(iter/s)": 0.959869 }, { "epoch": 0.057076958061267585, "grad_norm": 1.7434871196746826, "learning_rate": 9.998641387192485e-06, "loss": 0.10149619728326797, "memory(GiB)": 19.03, "step": 1757, "token_acc": 0.9446808510638298, "train_speed(iter/s)": 0.959968 }, { "epoch": 0.057109443524023, "grad_norm": 1.1030324697494507, "learning_rate": 9.998628837137098e-06, "loss": 0.09559129923582077, "memory(GiB)": 19.03, "step": 1758, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.960069 }, { "epoch": 0.05714192898677842, "grad_norm": 1.7793257236480713, "learning_rate": 9.998616229390884e-06, "loss": 0.11069518327713013, "memory(GiB)": 19.03, "step": 1759, "token_acc": 0.9584905660377359, "train_speed(iter/s)": 0.960159 }, { "epoch": 0.057174414449533834, "grad_norm": 0.5214760303497314, "learning_rate": 9.998603563953983e-06, "loss": 0.09861749410629272, "memory(GiB)": 19.03, "step": 1760, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.960236 }, { "epoch": 0.05720689991228925, "grad_norm": 0.9755805134773254, "learning_rate": 9.998590840826546e-06, "loss": 0.0883944034576416, "memory(GiB)": 19.03, "step": 1761, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.960309 }, { "epoch": 0.057239385375044666, "grad_norm": 1.0804228782653809, "learning_rate": 9.998578060008718e-06, "loss": 0.11056017875671387, "memory(GiB)": 19.03, "step": 1762, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.960392 }, { "epoch": 0.05727187083780008, "grad_norm": 1.0416747331619263, "learning_rate": 9.998565221500644e-06, "loss": 0.09578730165958405, "memory(GiB)": 19.03, "step": 1763, "token_acc": 0.9612068965517241, "train_speed(iter/s)": 0.96046 }, { "epoch": 0.057304356300555505, "grad_norm": 1.2310270071029663, "learning_rate": 9.998552325302475e-06, "loss": 0.09320732206106186, "memory(GiB)": 19.03, "step": 1764, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.960539 }, { "epoch": 0.05733684176331092, "grad_norm": 2.392787456512451, "learning_rate": 9.998539371414358e-06, "loss": 0.10280926525592804, "memory(GiB)": 19.03, "step": 1765, "token_acc": 0.9427480916030534, "train_speed(iter/s)": 0.960624 }, { "epoch": 0.05736932722606634, "grad_norm": 1.1959805488586426, "learning_rate": 9.998526359836444e-06, "loss": 0.10375452041625977, "memory(GiB)": 19.03, "step": 1766, "token_acc": 0.97165991902834, "train_speed(iter/s)": 0.96069 }, { "epoch": 0.05740181268882175, "grad_norm": 1.2458596229553223, "learning_rate": 9.998513290568884e-06, "loss": 0.10502030700445175, "memory(GiB)": 19.03, "step": 1767, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.960764 }, { "epoch": 0.05743429815157717, "grad_norm": 1.7736493349075317, "learning_rate": 9.998500163611828e-06, "loss": 0.11913222074508667, "memory(GiB)": 19.03, "step": 1768, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.960824 }, { "epoch": 0.057466783614332585, "grad_norm": 1.1391630172729492, "learning_rate": 9.998486978965424e-06, "loss": 0.11593800783157349, "memory(GiB)": 19.03, "step": 1769, "token_acc": 0.9640287769784173, "train_speed(iter/s)": 0.960897 }, { "epoch": 0.057499269077088, "grad_norm": 0.912681519985199, "learning_rate": 9.998473736629831e-06, "loss": 0.10564076900482178, "memory(GiB)": 19.03, "step": 1770, "token_acc": 0.9318181818181818, "train_speed(iter/s)": 0.960983 }, { "epoch": 0.05753175453984342, "grad_norm": 0.9880711436271667, "learning_rate": 9.998460436605198e-06, "loss": 0.11233499646186829, "memory(GiB)": 19.03, "step": 1771, "token_acc": 0.9521276595744681, "train_speed(iter/s)": 0.961064 }, { "epoch": 0.05756424000259884, "grad_norm": 1.1248953342437744, "learning_rate": 9.998447078891676e-06, "loss": 0.12203355133533478, "memory(GiB)": 19.03, "step": 1772, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.961132 }, { "epoch": 0.057596725465354257, "grad_norm": 0.7931410670280457, "learning_rate": 9.998433663489423e-06, "loss": 0.1018260195851326, "memory(GiB)": 19.03, "step": 1773, "token_acc": 0.9495412844036697, "train_speed(iter/s)": 0.961202 }, { "epoch": 0.05762921092810967, "grad_norm": 0.6658674478530884, "learning_rate": 9.998420190398592e-06, "loss": 0.10311029851436615, "memory(GiB)": 19.03, "step": 1774, "token_acc": 0.9471698113207547, "train_speed(iter/s)": 0.961263 }, { "epoch": 0.05766169639086509, "grad_norm": 1.1236225366592407, "learning_rate": 9.998406659619339e-06, "loss": 0.11633335053920746, "memory(GiB)": 19.03, "step": 1775, "token_acc": 0.94140625, "train_speed(iter/s)": 0.961326 }, { "epoch": 0.057694181853620505, "grad_norm": 0.6434727311134338, "learning_rate": 9.998393071151821e-06, "loss": 0.09665321558713913, "memory(GiB)": 19.03, "step": 1776, "token_acc": 0.9538461538461539, "train_speed(iter/s)": 0.961408 }, { "epoch": 0.05772666731637592, "grad_norm": 1.254672646522522, "learning_rate": 9.998379424996194e-06, "loss": 0.10522957146167755, "memory(GiB)": 19.03, "step": 1777, "token_acc": 0.9392523364485982, "train_speed(iter/s)": 0.961491 }, { "epoch": 0.05775915277913134, "grad_norm": 1.3269681930541992, "learning_rate": 9.998365721152616e-06, "loss": 0.10398777574300766, "memory(GiB)": 19.03, "step": 1778, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.96157 }, { "epoch": 0.05779163824188675, "grad_norm": 0.7977755665779114, "learning_rate": 9.998351959621246e-06, "loss": 0.10640843212604523, "memory(GiB)": 19.03, "step": 1779, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.961646 }, { "epoch": 0.057824123704642176, "grad_norm": 1.3031504154205322, "learning_rate": 9.99833814040224e-06, "loss": 0.10171167552471161, "memory(GiB)": 19.03, "step": 1780, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.961725 }, { "epoch": 0.05785660916739759, "grad_norm": 0.7636814117431641, "learning_rate": 9.998324263495759e-06, "loss": 0.09763728082180023, "memory(GiB)": 19.03, "step": 1781, "token_acc": 0.9509803921568627, "train_speed(iter/s)": 0.96181 }, { "epoch": 0.05788909463015301, "grad_norm": 0.7051041126251221, "learning_rate": 9.998310328901963e-06, "loss": 0.09572732448577881, "memory(GiB)": 19.03, "step": 1782, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.961886 }, { "epoch": 0.057921580092908424, "grad_norm": 0.8230692148208618, "learning_rate": 9.998296336621013e-06, "loss": 0.09053695201873779, "memory(GiB)": 19.03, "step": 1783, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.961958 }, { "epoch": 0.05795406555566384, "grad_norm": 0.9319314956665039, "learning_rate": 9.99828228665307e-06, "loss": 0.10069084912538528, "memory(GiB)": 19.03, "step": 1784, "token_acc": 0.9448818897637795, "train_speed(iter/s)": 0.962041 }, { "epoch": 0.057986551018419256, "grad_norm": 0.9134910106658936, "learning_rate": 9.9982681789983e-06, "loss": 0.1148688867688179, "memory(GiB)": 19.03, "step": 1785, "token_acc": 0.9296482412060302, "train_speed(iter/s)": 0.96212 }, { "epoch": 0.05801903648117467, "grad_norm": 0.8737581372261047, "learning_rate": 9.998254013656861e-06, "loss": 0.09391219168901443, "memory(GiB)": 19.03, "step": 1786, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.962204 }, { "epoch": 0.05805152194393009, "grad_norm": 0.8485227227210999, "learning_rate": 9.998239790628917e-06, "loss": 0.09271076321601868, "memory(GiB)": 19.03, "step": 1787, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.962295 }, { "epoch": 0.05808400740668551, "grad_norm": 0.728982150554657, "learning_rate": 9.998225509914632e-06, "loss": 0.09357473254203796, "memory(GiB)": 19.03, "step": 1788, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.96238 }, { "epoch": 0.05811649286944093, "grad_norm": 1.8865550756454468, "learning_rate": 9.998211171514174e-06, "loss": 0.09642571210861206, "memory(GiB)": 19.03, "step": 1789, "token_acc": 0.9719626168224299, "train_speed(iter/s)": 0.962456 }, { "epoch": 0.058148978332196344, "grad_norm": 0.6847565770149231, "learning_rate": 9.998196775427707e-06, "loss": 0.10972874611616135, "memory(GiB)": 19.03, "step": 1790, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.962533 }, { "epoch": 0.05818146379495176, "grad_norm": 0.8790081739425659, "learning_rate": 9.998182321655396e-06, "loss": 0.10650724172592163, "memory(GiB)": 19.03, "step": 1791, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.962588 }, { "epoch": 0.058213949257707176, "grad_norm": 0.8375013470649719, "learning_rate": 9.998167810197407e-06, "loss": 0.09352336823940277, "memory(GiB)": 19.03, "step": 1792, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.962645 }, { "epoch": 0.05824643472046259, "grad_norm": 0.7344439029693604, "learning_rate": 9.99815324105391e-06, "loss": 0.09903822839260101, "memory(GiB)": 19.03, "step": 1793, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.962732 }, { "epoch": 0.05827892018321801, "grad_norm": 0.7580509185791016, "learning_rate": 9.998138614225073e-06, "loss": 0.09202714264392853, "memory(GiB)": 19.03, "step": 1794, "token_acc": 0.9631147540983607, "train_speed(iter/s)": 0.962817 }, { "epoch": 0.058311405645973424, "grad_norm": 1.3482258319854736, "learning_rate": 9.998123929711062e-06, "loss": 0.10310247540473938, "memory(GiB)": 19.03, "step": 1795, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.962917 }, { "epoch": 0.05834389110872885, "grad_norm": 0.9992733597755432, "learning_rate": 9.998109187512049e-06, "loss": 0.10120387375354767, "memory(GiB)": 19.03, "step": 1796, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.963016 }, { "epoch": 0.05837637657148426, "grad_norm": 0.918083667755127, "learning_rate": 9.998094387628205e-06, "loss": 0.09673762321472168, "memory(GiB)": 19.03, "step": 1797, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 0.963117 }, { "epoch": 0.05840886203423968, "grad_norm": 0.7699006795883179, "learning_rate": 9.998079530059697e-06, "loss": 0.0987878292798996, "memory(GiB)": 19.03, "step": 1798, "token_acc": 0.9645390070921985, "train_speed(iter/s)": 0.963225 }, { "epoch": 0.058441347496995096, "grad_norm": 0.8435183763504028, "learning_rate": 9.9980646148067e-06, "loss": 0.09737975150346756, "memory(GiB)": 19.03, "step": 1799, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.963336 }, { "epoch": 0.05847383295975051, "grad_norm": 0.882760763168335, "learning_rate": 9.998049641869384e-06, "loss": 0.1088775098323822, "memory(GiB)": 19.03, "step": 1800, "token_acc": 0.9547325102880658, "train_speed(iter/s)": 0.963437 }, { "epoch": 0.05850631842250593, "grad_norm": 1.159725308418274, "learning_rate": 9.998034611247924e-06, "loss": 0.11755002290010452, "memory(GiB)": 19.03, "step": 1801, "token_acc": 0.946969696969697, "train_speed(iter/s)": 0.963541 }, { "epoch": 0.058538803885261344, "grad_norm": 1.067603588104248, "learning_rate": 9.998019522942492e-06, "loss": 0.10230077803134918, "memory(GiB)": 19.03, "step": 1802, "token_acc": 0.9606986899563319, "train_speed(iter/s)": 0.963632 }, { "epoch": 0.05857128934801676, "grad_norm": 1.004821538925171, "learning_rate": 9.998004376953262e-06, "loss": 0.0954475849866867, "memory(GiB)": 19.03, "step": 1803, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.963729 }, { "epoch": 0.05860377481077218, "grad_norm": 5.7209296226501465, "learning_rate": 9.99798917328041e-06, "loss": 0.10044978559017181, "memory(GiB)": 19.03, "step": 1804, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.963831 }, { "epoch": 0.0586362602735276, "grad_norm": 1.2456144094467163, "learning_rate": 9.99797391192411e-06, "loss": 0.10400199890136719, "memory(GiB)": 19.03, "step": 1805, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.963942 }, { "epoch": 0.058668745736283015, "grad_norm": 1.4302676916122437, "learning_rate": 9.997958592884538e-06, "loss": 0.113813616335392, "memory(GiB)": 19.03, "step": 1806, "token_acc": 0.9483568075117371, "train_speed(iter/s)": 0.964035 }, { "epoch": 0.05870123119903843, "grad_norm": 0.677443265914917, "learning_rate": 9.997943216161874e-06, "loss": 0.08673989027738571, "memory(GiB)": 19.03, "step": 1807, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.964137 }, { "epoch": 0.05873371666179385, "grad_norm": 0.813174843788147, "learning_rate": 9.99792778175629e-06, "loss": 0.10326943546533585, "memory(GiB)": 19.03, "step": 1808, "token_acc": 0.9484978540772532, "train_speed(iter/s)": 0.964224 }, { "epoch": 0.05876620212454926, "grad_norm": 1.045422911643982, "learning_rate": 9.997912289667969e-06, "loss": 0.10413546860218048, "memory(GiB)": 19.03, "step": 1809, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.964322 }, { "epoch": 0.05879868758730468, "grad_norm": 1.4711498022079468, "learning_rate": 9.997896739897088e-06, "loss": 0.11145594716072083, "memory(GiB)": 19.03, "step": 1810, "token_acc": 0.9468085106382979, "train_speed(iter/s)": 0.964418 }, { "epoch": 0.058831173050060095, "grad_norm": 0.776569664478302, "learning_rate": 9.997881132443826e-06, "loss": 0.10421781241893768, "memory(GiB)": 19.03, "step": 1811, "token_acc": 0.9366515837104072, "train_speed(iter/s)": 0.964522 }, { "epoch": 0.05886365851281552, "grad_norm": 1.700232982635498, "learning_rate": 9.997865467308364e-06, "loss": 0.10513122379779816, "memory(GiB)": 19.03, "step": 1812, "token_acc": 0.9402390438247012, "train_speed(iter/s)": 0.964615 }, { "epoch": 0.058896143975570935, "grad_norm": 1.1985726356506348, "learning_rate": 9.997849744490881e-06, "loss": 0.1070794016122818, "memory(GiB)": 19.03, "step": 1813, "token_acc": 0.9547511312217195, "train_speed(iter/s)": 0.964715 }, { "epoch": 0.05892862943832635, "grad_norm": 1.5794389247894287, "learning_rate": 9.997833963991561e-06, "loss": 0.09421505033969879, "memory(GiB)": 19.03, "step": 1814, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.964811 }, { "epoch": 0.05896111490108177, "grad_norm": 0.6097093820571899, "learning_rate": 9.997818125810583e-06, "loss": 0.10457009822130203, "memory(GiB)": 19.03, "step": 1815, "token_acc": 0.953125, "train_speed(iter/s)": 0.964886 }, { "epoch": 0.05899360036383718, "grad_norm": 0.9832553267478943, "learning_rate": 9.997802229948135e-06, "loss": 0.10253091156482697, "memory(GiB)": 19.03, "step": 1816, "token_acc": 0.9516728624535316, "train_speed(iter/s)": 0.964957 }, { "epoch": 0.0590260858265926, "grad_norm": 1.1760144233703613, "learning_rate": 9.997786276404396e-06, "loss": 0.10046274960041046, "memory(GiB)": 19.03, "step": 1817, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.965044 }, { "epoch": 0.059058571289348015, "grad_norm": 2.242436408996582, "learning_rate": 9.99777026517955e-06, "loss": 0.10325722396373749, "memory(GiB)": 19.03, "step": 1818, "token_acc": 0.9528301886792453, "train_speed(iter/s)": 0.965111 }, { "epoch": 0.05909105675210343, "grad_norm": 1.2056694030761719, "learning_rate": 9.997754196273784e-06, "loss": 0.11785315722227097, "memory(GiB)": 19.03, "step": 1819, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.965192 }, { "epoch": 0.059123542214858854, "grad_norm": 0.6609365940093994, "learning_rate": 9.997738069687282e-06, "loss": 0.1043817549943924, "memory(GiB)": 19.03, "step": 1820, "token_acc": 0.9539473684210527, "train_speed(iter/s)": 0.965254 }, { "epoch": 0.05915602767761427, "grad_norm": 0.9738099575042725, "learning_rate": 9.997721885420231e-06, "loss": 0.09695542603731155, "memory(GiB)": 19.03, "step": 1821, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.965331 }, { "epoch": 0.059188513140369686, "grad_norm": 0.8316149711608887, "learning_rate": 9.99770564347282e-06, "loss": 0.1035679280757904, "memory(GiB)": 19.03, "step": 1822, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.9654 }, { "epoch": 0.0592209986031251, "grad_norm": 0.7510182857513428, "learning_rate": 9.997689343845231e-06, "loss": 0.09791245311498642, "memory(GiB)": 19.03, "step": 1823, "token_acc": 0.9567099567099567, "train_speed(iter/s)": 0.965461 }, { "epoch": 0.05925348406588052, "grad_norm": 2.064094066619873, "learning_rate": 9.997672986537655e-06, "loss": 0.10917362570762634, "memory(GiB)": 19.03, "step": 1824, "token_acc": 0.9586776859504132, "train_speed(iter/s)": 0.965526 }, { "epoch": 0.059285969528635935, "grad_norm": 0.9108203053474426, "learning_rate": 9.997656571550283e-06, "loss": 0.0955846756696701, "memory(GiB)": 19.03, "step": 1825, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.965592 }, { "epoch": 0.05931845499139135, "grad_norm": 1.6964671611785889, "learning_rate": 9.9976400988833e-06, "loss": 0.10692156106233597, "memory(GiB)": 19.03, "step": 1826, "token_acc": 0.9392712550607287, "train_speed(iter/s)": 0.965652 }, { "epoch": 0.05935094045414677, "grad_norm": 0.9365697503089905, "learning_rate": 9.9976235685369e-06, "loss": 0.09588620066642761, "memory(GiB)": 19.03, "step": 1827, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.965727 }, { "epoch": 0.05938342591690219, "grad_norm": 0.6830318570137024, "learning_rate": 9.997606980511271e-06, "loss": 0.08996187150478363, "memory(GiB)": 19.03, "step": 1828, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.965804 }, { "epoch": 0.059415911379657606, "grad_norm": 1.077751636505127, "learning_rate": 9.997590334806606e-06, "loss": 0.11761036515235901, "memory(GiB)": 19.03, "step": 1829, "token_acc": 0.9330855018587361, "train_speed(iter/s)": 0.965869 }, { "epoch": 0.05944839684241302, "grad_norm": 0.8421110510826111, "learning_rate": 9.997573631423096e-06, "loss": 0.11681482195854187, "memory(GiB)": 19.03, "step": 1830, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.965947 }, { "epoch": 0.05948088230516844, "grad_norm": 0.8978528380393982, "learning_rate": 9.997556870360936e-06, "loss": 0.09418994933366776, "memory(GiB)": 19.03, "step": 1831, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.966007 }, { "epoch": 0.059513367767923854, "grad_norm": 1.1897140741348267, "learning_rate": 9.997540051620319e-06, "loss": 0.09821825474500656, "memory(GiB)": 19.03, "step": 1832, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.966084 }, { "epoch": 0.05954585323067927, "grad_norm": 0.8539817929267883, "learning_rate": 9.997523175201435e-06, "loss": 0.0907244086265564, "memory(GiB)": 19.03, "step": 1833, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.96616 }, { "epoch": 0.059578338693434686, "grad_norm": 0.675268292427063, "learning_rate": 9.997506241104485e-06, "loss": 0.09193561971187592, "memory(GiB)": 19.03, "step": 1834, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.966233 }, { "epoch": 0.0596108241561901, "grad_norm": 1.1937782764434814, "learning_rate": 9.997489249329659e-06, "loss": 0.11588729918003082, "memory(GiB)": 19.03, "step": 1835, "token_acc": 0.9660377358490566, "train_speed(iter/s)": 0.966312 }, { "epoch": 0.059643309618945525, "grad_norm": 1.1095216274261475, "learning_rate": 9.997472199877157e-06, "loss": 0.09852594137191772, "memory(GiB)": 19.03, "step": 1836, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.966394 }, { "epoch": 0.05967579508170094, "grad_norm": 0.7080984711647034, "learning_rate": 9.997455092747173e-06, "loss": 0.10040140897035599, "memory(GiB)": 19.03, "step": 1837, "token_acc": 0.9525862068965517, "train_speed(iter/s)": 0.966466 }, { "epoch": 0.05970828054445636, "grad_norm": 1.6726833581924438, "learning_rate": 9.997437927939906e-06, "loss": 0.09691277891397476, "memory(GiB)": 19.03, "step": 1838, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.966548 }, { "epoch": 0.059740766007211774, "grad_norm": 0.7564964294433594, "learning_rate": 9.997420705455554e-06, "loss": 0.09573421627283096, "memory(GiB)": 19.03, "step": 1839, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.966626 }, { "epoch": 0.05977325146996719, "grad_norm": 1.0620359182357788, "learning_rate": 9.997403425294315e-06, "loss": 0.10060004144906998, "memory(GiB)": 19.03, "step": 1840, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.966699 }, { "epoch": 0.059805736932722606, "grad_norm": 0.6884528994560242, "learning_rate": 9.99738608745639e-06, "loss": 0.08657022565603256, "memory(GiB)": 19.03, "step": 1841, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.96678 }, { "epoch": 0.05983822239547802, "grad_norm": 0.8105958700180054, "learning_rate": 9.997368691941976e-06, "loss": 0.10238479822874069, "memory(GiB)": 19.03, "step": 1842, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.966844 }, { "epoch": 0.05987070785823344, "grad_norm": 0.8204463124275208, "learning_rate": 9.997351238751277e-06, "loss": 0.10773113369941711, "memory(GiB)": 19.03, "step": 1843, "token_acc": 0.9554455445544554, "train_speed(iter/s)": 0.966923 }, { "epoch": 0.05990319332098886, "grad_norm": 0.8386618494987488, "learning_rate": 9.997333727884492e-06, "loss": 0.09194986522197723, "memory(GiB)": 19.03, "step": 1844, "token_acc": 0.9551569506726457, "train_speed(iter/s)": 0.967003 }, { "epoch": 0.05993567878374428, "grad_norm": 2.3426358699798584, "learning_rate": 9.997316159341828e-06, "loss": 0.1091890037059784, "memory(GiB)": 19.03, "step": 1845, "token_acc": 0.9397163120567376, "train_speed(iter/s)": 0.967084 }, { "epoch": 0.05996816424649969, "grad_norm": 1.1979042291641235, "learning_rate": 9.99729853312348e-06, "loss": 0.10076534003019333, "memory(GiB)": 19.03, "step": 1846, "token_acc": 0.9649805447470817, "train_speed(iter/s)": 0.967158 }, { "epoch": 0.06000064970925511, "grad_norm": 1.7979912757873535, "learning_rate": 9.997280849229658e-06, "loss": 0.09395594894886017, "memory(GiB)": 19.03, "step": 1847, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.967238 }, { "epoch": 0.060033135172010525, "grad_norm": 0.8965800404548645, "learning_rate": 9.997263107660563e-06, "loss": 0.11387057602405548, "memory(GiB)": 19.03, "step": 1848, "token_acc": 0.9653846153846154, "train_speed(iter/s)": 0.967312 }, { "epoch": 0.06006562063476594, "grad_norm": 0.8429722189903259, "learning_rate": 9.997245308416402e-06, "loss": 0.09378416836261749, "memory(GiB)": 19.03, "step": 1849, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.967372 }, { "epoch": 0.06009810609752136, "grad_norm": 0.9132452607154846, "learning_rate": 9.997227451497376e-06, "loss": 0.10419396311044693, "memory(GiB)": 19.03, "step": 1850, "token_acc": 0.9508928571428571, "train_speed(iter/s)": 0.967448 }, { "epoch": 0.060130591560276773, "grad_norm": 1.356950283050537, "learning_rate": 9.997209536903695e-06, "loss": 0.10714367032051086, "memory(GiB)": 19.03, "step": 1851, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.967524 }, { "epoch": 0.060163077023032197, "grad_norm": 0.9155381917953491, "learning_rate": 9.997191564635564e-06, "loss": 0.10460104793310165, "memory(GiB)": 19.03, "step": 1852, "token_acc": 0.9420289855072463, "train_speed(iter/s)": 0.967605 }, { "epoch": 0.06019556248578761, "grad_norm": 1.0342580080032349, "learning_rate": 9.997173534693191e-06, "loss": 0.09928993880748749, "memory(GiB)": 19.03, "step": 1853, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.967697 }, { "epoch": 0.06022804794854303, "grad_norm": 2.862072467803955, "learning_rate": 9.997155447076787e-06, "loss": 0.09294052422046661, "memory(GiB)": 19.03, "step": 1854, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 0.967789 }, { "epoch": 0.060260533411298445, "grad_norm": 0.7029007077217102, "learning_rate": 9.997137301786554e-06, "loss": 0.10540518164634705, "memory(GiB)": 19.03, "step": 1855, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.967891 }, { "epoch": 0.06029301887405386, "grad_norm": 0.9865570664405823, "learning_rate": 9.997119098822709e-06, "loss": 0.10360993444919586, "memory(GiB)": 19.03, "step": 1856, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.967995 }, { "epoch": 0.06032550433680928, "grad_norm": 0.9594144821166992, "learning_rate": 9.997100838185456e-06, "loss": 0.10834676772356033, "memory(GiB)": 19.03, "step": 1857, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.968084 }, { "epoch": 0.06035798979956469, "grad_norm": 1.78067147731781, "learning_rate": 9.99708251987501e-06, "loss": 0.1066911518573761, "memory(GiB)": 19.03, "step": 1858, "token_acc": 0.9446640316205533, "train_speed(iter/s)": 0.968182 }, { "epoch": 0.06039047526232011, "grad_norm": 0.9240760803222656, "learning_rate": 9.99706414389158e-06, "loss": 0.0949614942073822, "memory(GiB)": 19.03, "step": 1859, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.968282 }, { "epoch": 0.06042296072507553, "grad_norm": 0.8341506123542786, "learning_rate": 9.997045710235379e-06, "loss": 0.09825722873210907, "memory(GiB)": 19.03, "step": 1860, "token_acc": 0.9595588235294118, "train_speed(iter/s)": 0.96837 }, { "epoch": 0.06045544618783095, "grad_norm": 5.232631683349609, "learning_rate": 9.997027218906617e-06, "loss": 0.09876929223537445, "memory(GiB)": 19.03, "step": 1861, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.968473 }, { "epoch": 0.060487931650586364, "grad_norm": 1.0061630010604858, "learning_rate": 9.997008669905511e-06, "loss": 0.10339116305112839, "memory(GiB)": 19.03, "step": 1862, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.968575 }, { "epoch": 0.06052041711334178, "grad_norm": 3.108856201171875, "learning_rate": 9.996990063232275e-06, "loss": 0.09944078326225281, "memory(GiB)": 19.03, "step": 1863, "token_acc": 0.9520295202952029, "train_speed(iter/s)": 0.968664 }, { "epoch": 0.060552902576097196, "grad_norm": 0.8006719946861267, "learning_rate": 9.996971398887124e-06, "loss": 0.0905456691980362, "memory(GiB)": 19.03, "step": 1864, "token_acc": 0.9448818897637795, "train_speed(iter/s)": 0.968763 }, { "epoch": 0.06058538803885261, "grad_norm": 1.373308539390564, "learning_rate": 9.996952676870269e-06, "loss": 0.10102824866771698, "memory(GiB)": 19.03, "step": 1865, "token_acc": 0.9635627530364372, "train_speed(iter/s)": 0.968855 }, { "epoch": 0.06061787350160803, "grad_norm": 1.6090115308761597, "learning_rate": 9.99693389718193e-06, "loss": 0.09980136901140213, "memory(GiB)": 19.03, "step": 1866, "token_acc": 0.9367088607594937, "train_speed(iter/s)": 0.968933 }, { "epoch": 0.060650358964363445, "grad_norm": 1.091390609741211, "learning_rate": 9.996915059822324e-06, "loss": 0.09689085930585861, "memory(GiB)": 19.03, "step": 1867, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.969036 }, { "epoch": 0.06068284442711887, "grad_norm": 1.0667119026184082, "learning_rate": 9.996896164791667e-06, "loss": 0.11110547930002213, "memory(GiB)": 19.03, "step": 1868, "token_acc": 0.9510204081632653, "train_speed(iter/s)": 0.969132 }, { "epoch": 0.060715329889874284, "grad_norm": 0.7098801136016846, "learning_rate": 9.996877212090177e-06, "loss": 0.10077504813671112, "memory(GiB)": 19.03, "step": 1869, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.96922 }, { "epoch": 0.0607478153526297, "grad_norm": 0.6576372385025024, "learning_rate": 9.996858201718073e-06, "loss": 0.10380388796329498, "memory(GiB)": 19.03, "step": 1870, "token_acc": 0.9567567567567568, "train_speed(iter/s)": 0.969317 }, { "epoch": 0.060780300815385116, "grad_norm": 1.082249402999878, "learning_rate": 9.996839133675576e-06, "loss": 0.1073567122220993, "memory(GiB)": 19.03, "step": 1871, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 0.969392 }, { "epoch": 0.06081278627814053, "grad_norm": 1.1020936965942383, "learning_rate": 9.996820007962901e-06, "loss": 0.10688126087188721, "memory(GiB)": 19.03, "step": 1872, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.969459 }, { "epoch": 0.06084527174089595, "grad_norm": 0.8039716482162476, "learning_rate": 9.996800824580276e-06, "loss": 0.09067876636981964, "memory(GiB)": 19.03, "step": 1873, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 0.969542 }, { "epoch": 0.060877757203651364, "grad_norm": 0.9742637872695923, "learning_rate": 9.996781583527918e-06, "loss": 0.08791051059961319, "memory(GiB)": 19.03, "step": 1874, "token_acc": 0.9504950495049505, "train_speed(iter/s)": 0.969619 }, { "epoch": 0.06091024266640678, "grad_norm": 5.06351375579834, "learning_rate": 9.99676228480605e-06, "loss": 0.10156112909317017, "memory(GiB)": 19.03, "step": 1875, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 0.969678 }, { "epoch": 0.0609427281291622, "grad_norm": 1.1487390995025635, "learning_rate": 9.996742928414894e-06, "loss": 0.09492625296115875, "memory(GiB)": 19.03, "step": 1876, "token_acc": 0.9595141700404858, "train_speed(iter/s)": 0.969753 }, { "epoch": 0.06097521359191762, "grad_norm": 0.8257680535316467, "learning_rate": 9.996723514354674e-06, "loss": 0.08865664154291153, "memory(GiB)": 19.03, "step": 1877, "token_acc": 0.9514563106796117, "train_speed(iter/s)": 0.969811 }, { "epoch": 0.061007699054673035, "grad_norm": 0.6924368739128113, "learning_rate": 9.996704042625613e-06, "loss": 0.10556115955114365, "memory(GiB)": 19.03, "step": 1878, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.969883 }, { "epoch": 0.06104018451742845, "grad_norm": 0.7586001753807068, "learning_rate": 9.996684513227939e-06, "loss": 0.10607142746448517, "memory(GiB)": 19.03, "step": 1879, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.96995 }, { "epoch": 0.06107266998018387, "grad_norm": 0.7849826812744141, "learning_rate": 9.996664926161873e-06, "loss": 0.08532275259494781, "memory(GiB)": 19.03, "step": 1880, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.970018 }, { "epoch": 0.061105155442939284, "grad_norm": 1.1786812543869019, "learning_rate": 9.996645281427644e-06, "loss": 0.09357371926307678, "memory(GiB)": 19.03, "step": 1881, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.970087 }, { "epoch": 0.0611376409056947, "grad_norm": 0.8066494464874268, "learning_rate": 9.996625579025479e-06, "loss": 0.09581299871206284, "memory(GiB)": 19.03, "step": 1882, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.97015 }, { "epoch": 0.061170126368450116, "grad_norm": 0.7637162208557129, "learning_rate": 9.996605818955603e-06, "loss": 0.09765750169754028, "memory(GiB)": 19.03, "step": 1883, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.970208 }, { "epoch": 0.06120261183120554, "grad_norm": 0.9728261232376099, "learning_rate": 9.996586001218245e-06, "loss": 0.09370675683021545, "memory(GiB)": 19.03, "step": 1884, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.970279 }, { "epoch": 0.061235097293960955, "grad_norm": 0.7180876731872559, "learning_rate": 9.996566125813634e-06, "loss": 0.09518718719482422, "memory(GiB)": 19.03, "step": 1885, "token_acc": 0.9547511312217195, "train_speed(iter/s)": 0.970351 }, { "epoch": 0.06126758275671637, "grad_norm": 0.6388576626777649, "learning_rate": 9.996546192742e-06, "loss": 0.09823831915855408, "memory(GiB)": 19.03, "step": 1886, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.970413 }, { "epoch": 0.06130006821947179, "grad_norm": 0.8622382879257202, "learning_rate": 9.996526202003574e-06, "loss": 0.09384346008300781, "memory(GiB)": 19.03, "step": 1887, "token_acc": 0.954225352112676, "train_speed(iter/s)": 0.970481 }, { "epoch": 0.0613325536822272, "grad_norm": 0.9255167841911316, "learning_rate": 9.996506153598584e-06, "loss": 0.09905911982059479, "memory(GiB)": 19.03, "step": 1888, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.970554 }, { "epoch": 0.06136503914498262, "grad_norm": 1.2066134214401245, "learning_rate": 9.996486047527263e-06, "loss": 0.09913870692253113, "memory(GiB)": 19.03, "step": 1889, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.970644 }, { "epoch": 0.061397524607738035, "grad_norm": 1.5633926391601562, "learning_rate": 9.996465883789843e-06, "loss": 0.09143334627151489, "memory(GiB)": 19.03, "step": 1890, "token_acc": 0.9611650485436893, "train_speed(iter/s)": 0.970702 }, { "epoch": 0.06143001007049345, "grad_norm": 0.9293341636657715, "learning_rate": 9.996445662386556e-06, "loss": 0.09856268763542175, "memory(GiB)": 19.03, "step": 1891, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.970777 }, { "epoch": 0.061462495533248875, "grad_norm": 1.0030150413513184, "learning_rate": 9.996425383317635e-06, "loss": 0.10548315197229385, "memory(GiB)": 19.03, "step": 1892, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.970849 }, { "epoch": 0.06149498099600429, "grad_norm": 1.1007766723632812, "learning_rate": 9.996405046583316e-06, "loss": 0.09631555527448654, "memory(GiB)": 19.03, "step": 1893, "token_acc": 0.945, "train_speed(iter/s)": 0.970919 }, { "epoch": 0.06152746645875971, "grad_norm": 0.9927173852920532, "learning_rate": 9.996384652183832e-06, "loss": 0.10419304668903351, "memory(GiB)": 19.03, "step": 1894, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.97098 }, { "epoch": 0.06155995192151512, "grad_norm": 1.0993717908859253, "learning_rate": 9.99636420011942e-06, "loss": 0.09912440180778503, "memory(GiB)": 19.03, "step": 1895, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.971056 }, { "epoch": 0.06159243738427054, "grad_norm": 0.9028220176696777, "learning_rate": 9.996343690390313e-06, "loss": 0.09700895845890045, "memory(GiB)": 19.03, "step": 1896, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.971131 }, { "epoch": 0.061624922847025955, "grad_norm": 0.9552490711212158, "learning_rate": 9.99632312299675e-06, "loss": 0.08938316255807877, "memory(GiB)": 19.03, "step": 1897, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.971206 }, { "epoch": 0.06165740830978137, "grad_norm": 0.82471764087677, "learning_rate": 9.99630249793897e-06, "loss": 0.0973896011710167, "memory(GiB)": 19.03, "step": 1898, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.971266 }, { "epoch": 0.06168989377253679, "grad_norm": 1.0724821090698242, "learning_rate": 9.996281815217208e-06, "loss": 0.09371323883533478, "memory(GiB)": 19.03, "step": 1899, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.97133 }, { "epoch": 0.06172237923529221, "grad_norm": 1.530287265777588, "learning_rate": 9.996261074831703e-06, "loss": 0.10413035750389099, "memory(GiB)": 19.03, "step": 1900, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.971397 }, { "epoch": 0.061754864698047626, "grad_norm": 0.932511031627655, "learning_rate": 9.996240276782697e-06, "loss": 0.10205517709255219, "memory(GiB)": 19.03, "step": 1901, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.971473 }, { "epoch": 0.06178735016080304, "grad_norm": 0.927053689956665, "learning_rate": 9.996219421070425e-06, "loss": 0.09497418999671936, "memory(GiB)": 19.03, "step": 1902, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 0.971549 }, { "epoch": 0.06181983562355846, "grad_norm": 1.805476188659668, "learning_rate": 9.996198507695134e-06, "loss": 0.10658188164234161, "memory(GiB)": 19.03, "step": 1903, "token_acc": 0.9587628865979382, "train_speed(iter/s)": 0.971624 }, { "epoch": 0.061852321086313874, "grad_norm": 6.532589435577393, "learning_rate": 9.996177536657061e-06, "loss": 0.11333882808685303, "memory(GiB)": 19.03, "step": 1904, "token_acc": 0.9398496240601504, "train_speed(iter/s)": 0.971705 }, { "epoch": 0.06188480654906929, "grad_norm": 0.9876931309700012, "learning_rate": 9.99615650795645e-06, "loss": 0.10658414661884308, "memory(GiB)": 19.03, "step": 1905, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.971775 }, { "epoch": 0.06191729201182471, "grad_norm": 1.5391595363616943, "learning_rate": 9.996135421593543e-06, "loss": 0.12352508306503296, "memory(GiB)": 19.03, "step": 1906, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.971848 }, { "epoch": 0.06194977747458012, "grad_norm": 1.8268649578094482, "learning_rate": 9.996114277568583e-06, "loss": 0.09114623814821243, "memory(GiB)": 19.03, "step": 1907, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.971918 }, { "epoch": 0.061982262937335546, "grad_norm": 0.8429578542709351, "learning_rate": 9.996093075881814e-06, "loss": 0.1034480631351471, "memory(GiB)": 19.03, "step": 1908, "token_acc": 0.9579439252336449, "train_speed(iter/s)": 0.97198 }, { "epoch": 0.06201474840009096, "grad_norm": 0.8656787872314453, "learning_rate": 9.996071816533483e-06, "loss": 0.09862732887268066, "memory(GiB)": 19.03, "step": 1909, "token_acc": 0.944, "train_speed(iter/s)": 0.972046 }, { "epoch": 0.06204723386284638, "grad_norm": 0.9915949106216431, "learning_rate": 9.996050499523832e-06, "loss": 0.10417643934488297, "memory(GiB)": 19.03, "step": 1910, "token_acc": 0.9433198380566802, "train_speed(iter/s)": 0.972121 }, { "epoch": 0.062079719325601794, "grad_norm": 3.7131710052490234, "learning_rate": 9.996029124853109e-06, "loss": 0.09588667750358582, "memory(GiB)": 19.03, "step": 1911, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.972204 }, { "epoch": 0.06211220478835721, "grad_norm": 1.2082003355026245, "learning_rate": 9.996007692521559e-06, "loss": 0.09973326325416565, "memory(GiB)": 19.03, "step": 1912, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.97229 }, { "epoch": 0.062144690251112626, "grad_norm": 0.993817150592804, "learning_rate": 9.995986202529432e-06, "loss": 0.10830149054527283, "memory(GiB)": 19.03, "step": 1913, "token_acc": 0.9691780821917808, "train_speed(iter/s)": 0.972382 }, { "epoch": 0.06217717571386804, "grad_norm": 1.1454907655715942, "learning_rate": 9.995964654876973e-06, "loss": 0.09646180272102356, "memory(GiB)": 19.03, "step": 1914, "token_acc": 0.96, "train_speed(iter/s)": 0.972459 }, { "epoch": 0.06220966117662346, "grad_norm": 0.956513524055481, "learning_rate": 9.995943049564434e-06, "loss": 0.09481052309274673, "memory(GiB)": 19.03, "step": 1915, "token_acc": 0.9490196078431372, "train_speed(iter/s)": 0.972552 }, { "epoch": 0.06224214663937888, "grad_norm": 1.5386741161346436, "learning_rate": 9.995921386592062e-06, "loss": 0.10992394387722015, "memory(GiB)": 19.03, "step": 1916, "token_acc": 0.9221789883268483, "train_speed(iter/s)": 0.972647 }, { "epoch": 0.0622746321021343, "grad_norm": 1.5589091777801514, "learning_rate": 9.995899665960108e-06, "loss": 0.12017381191253662, "memory(GiB)": 19.03, "step": 1917, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.972733 }, { "epoch": 0.062307117564889714, "grad_norm": 1.060664176940918, "learning_rate": 9.995877887668822e-06, "loss": 0.1033254861831665, "memory(GiB)": 19.03, "step": 1918, "token_acc": 0.953307392996109, "train_speed(iter/s)": 0.972827 }, { "epoch": 0.06233960302764513, "grad_norm": 1.028976321220398, "learning_rate": 9.995856051718456e-06, "loss": 0.08536022901535034, "memory(GiB)": 19.03, "step": 1919, "token_acc": 0.9645390070921985, "train_speed(iter/s)": 0.97292 }, { "epoch": 0.062372088490400546, "grad_norm": 1.8796501159667969, "learning_rate": 9.99583415810926e-06, "loss": 0.10266905277967453, "memory(GiB)": 19.03, "step": 1920, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.972985 }, { "epoch": 0.06240457395315596, "grad_norm": 0.8948835134506226, "learning_rate": 9.99581220684149e-06, "loss": 0.08538710325956345, "memory(GiB)": 19.03, "step": 1921, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.973078 }, { "epoch": 0.06243705941591138, "grad_norm": 0.9339112639427185, "learning_rate": 9.995790197915397e-06, "loss": 0.10250930488109589, "memory(GiB)": 19.03, "step": 1922, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.973167 }, { "epoch": 0.062469544878666794, "grad_norm": 1.7839628458023071, "learning_rate": 9.995768131331236e-06, "loss": 0.10371601581573486, "memory(GiB)": 19.03, "step": 1923, "token_acc": 0.9397590361445783, "train_speed(iter/s)": 0.973255 }, { "epoch": 0.06250203034142221, "grad_norm": 1.6866998672485352, "learning_rate": 9.995746007089262e-06, "loss": 0.10624915361404419, "memory(GiB)": 19.03, "step": 1924, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.973346 }, { "epoch": 0.06253451580417763, "grad_norm": 0.7696824073791504, "learning_rate": 9.995723825189729e-06, "loss": 0.09257231652736664, "memory(GiB)": 19.03, "step": 1925, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.973442 }, { "epoch": 0.06256700126693304, "grad_norm": 0.7316637635231018, "learning_rate": 9.995701585632894e-06, "loss": 0.10010592639446259, "memory(GiB)": 19.03, "step": 1926, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.97353 }, { "epoch": 0.06259948672968846, "grad_norm": 1.215345025062561, "learning_rate": 9.995679288419014e-06, "loss": 0.10633023083209991, "memory(GiB)": 19.03, "step": 1927, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.973614 }, { "epoch": 0.06263197219244389, "grad_norm": 1.264557957649231, "learning_rate": 9.995656933548345e-06, "loss": 0.10965733230113983, "memory(GiB)": 19.03, "step": 1928, "token_acc": 0.9538461538461539, "train_speed(iter/s)": 0.973672 }, { "epoch": 0.0626644576551993, "grad_norm": 1.1577316522598267, "learning_rate": 9.995634521021144e-06, "loss": 0.10700786858797073, "memory(GiB)": 19.03, "step": 1929, "token_acc": 0.9490445859872612, "train_speed(iter/s)": 0.973746 }, { "epoch": 0.06269694311795472, "grad_norm": 0.6967170238494873, "learning_rate": 9.995612050837675e-06, "loss": 0.09452331066131592, "memory(GiB)": 19.03, "step": 1930, "token_acc": 0.9422222222222222, "train_speed(iter/s)": 0.973819 }, { "epoch": 0.06272942858071014, "grad_norm": 0.8567986488342285, "learning_rate": 9.995589522998192e-06, "loss": 0.09764935076236725, "memory(GiB)": 19.03, "step": 1931, "token_acc": 0.9440298507462687, "train_speed(iter/s)": 0.973889 }, { "epoch": 0.06276191404346555, "grad_norm": 2.032331705093384, "learning_rate": 9.995566937502956e-06, "loss": 0.11196336150169373, "memory(GiB)": 19.03, "step": 1932, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.973947 }, { "epoch": 0.06279439950622097, "grad_norm": 0.9281851649284363, "learning_rate": 9.995544294352231e-06, "loss": 0.10226495563983917, "memory(GiB)": 19.03, "step": 1933, "token_acc": 0.9331103678929766, "train_speed(iter/s)": 0.974014 }, { "epoch": 0.06282688496897638, "grad_norm": 1.0862011909484863, "learning_rate": 9.995521593546273e-06, "loss": 0.08434467762708664, "memory(GiB)": 19.03, "step": 1934, "token_acc": 0.9469026548672567, "train_speed(iter/s)": 0.974071 }, { "epoch": 0.0628593704317318, "grad_norm": 1.3715906143188477, "learning_rate": 9.995498835085348e-06, "loss": 0.0907745286822319, "memory(GiB)": 19.03, "step": 1935, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.974141 }, { "epoch": 0.06289185589448722, "grad_norm": 0.7470091581344604, "learning_rate": 9.995476018969718e-06, "loss": 0.10770999640226364, "memory(GiB)": 19.03, "step": 1936, "token_acc": 0.9251700680272109, "train_speed(iter/s)": 0.974211 }, { "epoch": 0.06292434135724263, "grad_norm": 1.185518741607666, "learning_rate": 9.995453145199646e-06, "loss": 0.1049325093626976, "memory(GiB)": 19.03, "step": 1937, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.974278 }, { "epoch": 0.06295682681999805, "grad_norm": 0.9119787216186523, "learning_rate": 9.995430213775395e-06, "loss": 0.1051088273525238, "memory(GiB)": 19.03, "step": 1938, "token_acc": 0.9598393574297188, "train_speed(iter/s)": 0.974347 }, { "epoch": 0.06298931228275347, "grad_norm": 1.1171953678131104, "learning_rate": 9.99540722469723e-06, "loss": 0.08798106014728546, "memory(GiB)": 19.03, "step": 1939, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.974401 }, { "epoch": 0.06302179774550888, "grad_norm": 1.0314379930496216, "learning_rate": 9.995384177965417e-06, "loss": 0.1102914959192276, "memory(GiB)": 19.03, "step": 1940, "token_acc": 0.948, "train_speed(iter/s)": 0.974462 }, { "epoch": 0.0630542832082643, "grad_norm": 1.900831937789917, "learning_rate": 9.995361073580223e-06, "loss": 0.11099867522716522, "memory(GiB)": 19.03, "step": 1941, "token_acc": 0.9479553903345725, "train_speed(iter/s)": 0.974535 }, { "epoch": 0.06308676867101971, "grad_norm": 2.891029119491577, "learning_rate": 9.995337911541911e-06, "loss": 0.1002589538693428, "memory(GiB)": 19.03, "step": 1942, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.974602 }, { "epoch": 0.06311925413377513, "grad_norm": 0.5536579489707947, "learning_rate": 9.995314691850752e-06, "loss": 0.09042665362358093, "memory(GiB)": 19.03, "step": 1943, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.974667 }, { "epoch": 0.06315173959653056, "grad_norm": 1.1068086624145508, "learning_rate": 9.995291414507014e-06, "loss": 0.09996241331100464, "memory(GiB)": 19.03, "step": 1944, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.97475 }, { "epoch": 0.06318422505928598, "grad_norm": 0.8540689945220947, "learning_rate": 9.995268079510962e-06, "loss": 0.09972493350505829, "memory(GiB)": 19.03, "step": 1945, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.974835 }, { "epoch": 0.06321671052204139, "grad_norm": 0.6790709495544434, "learning_rate": 9.99524468686287e-06, "loss": 0.07898908853530884, "memory(GiB)": 19.03, "step": 1946, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.974921 }, { "epoch": 0.06324919598479681, "grad_norm": 0.930706799030304, "learning_rate": 9.995221236563003e-06, "loss": 0.0823775976896286, "memory(GiB)": 19.03, "step": 1947, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.975007 }, { "epoch": 0.06328168144755222, "grad_norm": 2.2262799739837646, "learning_rate": 9.995197728611636e-06, "loss": 0.10403336584568024, "memory(GiB)": 19.03, "step": 1948, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.975078 }, { "epoch": 0.06331416691030764, "grad_norm": 0.560820996761322, "learning_rate": 9.995174163009039e-06, "loss": 0.08508514612913132, "memory(GiB)": 19.03, "step": 1949, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.975141 }, { "epoch": 0.06334665237306306, "grad_norm": 1.6209627389907837, "learning_rate": 9.995150539755483e-06, "loss": 0.08575010299682617, "memory(GiB)": 19.03, "step": 1950, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.975216 }, { "epoch": 0.06337913783581847, "grad_norm": 1.9595576524734497, "learning_rate": 9.995126858851242e-06, "loss": 0.08799061179161072, "memory(GiB)": 19.03, "step": 1951, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.975284 }, { "epoch": 0.06341162329857389, "grad_norm": 1.5594112873077393, "learning_rate": 9.995103120296588e-06, "loss": 0.09907252341508865, "memory(GiB)": 19.03, "step": 1952, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.975353 }, { "epoch": 0.0634441087613293, "grad_norm": 0.9922096133232117, "learning_rate": 9.995079324091795e-06, "loss": 0.1056385487318039, "memory(GiB)": 19.03, "step": 1953, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.975426 }, { "epoch": 0.06347659422408472, "grad_norm": 1.2532520294189453, "learning_rate": 9.995055470237138e-06, "loss": 0.08357599377632141, "memory(GiB)": 19.03, "step": 1954, "token_acc": 0.96, "train_speed(iter/s)": 0.975494 }, { "epoch": 0.06350907968684014, "grad_norm": 0.9965395927429199, "learning_rate": 9.995031558732893e-06, "loss": 0.10186263918876648, "memory(GiB)": 19.03, "step": 1955, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.975564 }, { "epoch": 0.06354156514959555, "grad_norm": 1.3314954042434692, "learning_rate": 9.995007589579335e-06, "loss": 0.10849639773368835, "memory(GiB)": 19.03, "step": 1956, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.975627 }, { "epoch": 0.06357405061235097, "grad_norm": 0.784031331539154, "learning_rate": 9.994983562776742e-06, "loss": 0.099761962890625, "memory(GiB)": 19.03, "step": 1957, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.975689 }, { "epoch": 0.06360653607510638, "grad_norm": 0.7259290814399719, "learning_rate": 9.99495947832539e-06, "loss": 0.09681031107902527, "memory(GiB)": 19.03, "step": 1958, "token_acc": 0.97, "train_speed(iter/s)": 0.975755 }, { "epoch": 0.0636390215378618, "grad_norm": 2.513420820236206, "learning_rate": 9.994935336225558e-06, "loss": 0.09456391632556915, "memory(GiB)": 19.03, "step": 1959, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.975824 }, { "epoch": 0.06367150700061723, "grad_norm": 1.0010647773742676, "learning_rate": 9.994911136477523e-06, "loss": 0.10413911938667297, "memory(GiB)": 19.03, "step": 1960, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.97589 }, { "epoch": 0.06370399246337265, "grad_norm": 0.6968393325805664, "learning_rate": 9.994886879081565e-06, "loss": 0.08498729765415192, "memory(GiB)": 19.03, "step": 1961, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.975955 }, { "epoch": 0.06373647792612806, "grad_norm": 1.2479326725006104, "learning_rate": 9.994862564037965e-06, "loss": 0.10001920163631439, "memory(GiB)": 19.03, "step": 1962, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.976024 }, { "epoch": 0.06376896338888348, "grad_norm": 1.1352020502090454, "learning_rate": 9.994838191347003e-06, "loss": 0.1054106131196022, "memory(GiB)": 19.03, "step": 1963, "token_acc": 0.95, "train_speed(iter/s)": 0.976085 }, { "epoch": 0.0638014488516389, "grad_norm": 0.9575924277305603, "learning_rate": 9.994813761008959e-06, "loss": 0.10926110297441483, "memory(GiB)": 19.03, "step": 1964, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.976158 }, { "epoch": 0.06383393431439431, "grad_norm": 0.7643014192581177, "learning_rate": 9.994789273024118e-06, "loss": 0.09356629848480225, "memory(GiB)": 19.03, "step": 1965, "token_acc": 0.9586776859504132, "train_speed(iter/s)": 0.976226 }, { "epoch": 0.06386641977714973, "grad_norm": 1.0017026662826538, "learning_rate": 9.994764727392758e-06, "loss": 0.09371823072433472, "memory(GiB)": 19.03, "step": 1966, "token_acc": 0.9484126984126984, "train_speed(iter/s)": 0.976269 }, { "epoch": 0.06389890523990514, "grad_norm": 1.728387713432312, "learning_rate": 9.994740124115167e-06, "loss": 0.09908565133810043, "memory(GiB)": 19.03, "step": 1967, "token_acc": 0.9392523364485982, "train_speed(iter/s)": 0.976335 }, { "epoch": 0.06393139070266056, "grad_norm": 1.1642811298370361, "learning_rate": 9.994715463191626e-06, "loss": 0.10046401619911194, "memory(GiB)": 19.03, "step": 1968, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.976402 }, { "epoch": 0.06396387616541598, "grad_norm": 1.2648745775222778, "learning_rate": 9.994690744622422e-06, "loss": 0.09328483790159225, "memory(GiB)": 19.03, "step": 1969, "token_acc": 0.9505703422053232, "train_speed(iter/s)": 0.976476 }, { "epoch": 0.06399636162817139, "grad_norm": 0.9497134685516357, "learning_rate": 9.994665968407837e-06, "loss": 0.09790489077568054, "memory(GiB)": 19.03, "step": 1970, "token_acc": 0.9641025641025641, "train_speed(iter/s)": 0.976566 }, { "epoch": 0.06402884709092681, "grad_norm": 1.0339946746826172, "learning_rate": 9.994641134548161e-06, "loss": 0.08985394239425659, "memory(GiB)": 19.03, "step": 1971, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.976651 }, { "epoch": 0.06406133255368222, "grad_norm": 0.7487345933914185, "learning_rate": 9.994616243043678e-06, "loss": 0.0855109840631485, "memory(GiB)": 19.03, "step": 1972, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.976739 }, { "epoch": 0.06409381801643764, "grad_norm": 0.7185418009757996, "learning_rate": 9.994591293894674e-06, "loss": 0.09175275266170502, "memory(GiB)": 19.03, "step": 1973, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.976817 }, { "epoch": 0.06412630347919306, "grad_norm": 0.8196381330490112, "learning_rate": 9.99456628710144e-06, "loss": 0.10420776903629303, "memory(GiB)": 19.03, "step": 1974, "token_acc": 0.9493087557603687, "train_speed(iter/s)": 0.976892 }, { "epoch": 0.06415878894194847, "grad_norm": 0.6628251671791077, "learning_rate": 9.994541222664263e-06, "loss": 0.08506769686937332, "memory(GiB)": 19.03, "step": 1975, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.976985 }, { "epoch": 0.0641912744047039, "grad_norm": 1.042975664138794, "learning_rate": 9.994516100583434e-06, "loss": 0.09504681825637817, "memory(GiB)": 19.03, "step": 1976, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.977074 }, { "epoch": 0.06422375986745932, "grad_norm": 0.9176746606826782, "learning_rate": 9.99449092085924e-06, "loss": 0.09553858637809753, "memory(GiB)": 19.03, "step": 1977, "token_acc": 0.955, "train_speed(iter/s)": 0.977159 }, { "epoch": 0.06425624533021473, "grad_norm": 1.4222795963287354, "learning_rate": 9.994465683491971e-06, "loss": 0.10284688323736191, "memory(GiB)": 19.03, "step": 1978, "token_acc": 0.9427312775330396, "train_speed(iter/s)": 0.977247 }, { "epoch": 0.06428873079297015, "grad_norm": 4.216034889221191, "learning_rate": 9.994440388481924e-06, "loss": 0.09788748621940613, "memory(GiB)": 19.03, "step": 1979, "token_acc": 0.9482071713147411, "train_speed(iter/s)": 0.977334 }, { "epoch": 0.06432121625572557, "grad_norm": 0.7328101396560669, "learning_rate": 9.994415035829385e-06, "loss": 0.08340948075056076, "memory(GiB)": 19.03, "step": 1980, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.977406 }, { "epoch": 0.06435370171848098, "grad_norm": 0.8983317613601685, "learning_rate": 9.99438962553465e-06, "loss": 0.1085081398487091, "memory(GiB)": 19.03, "step": 1981, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.977476 }, { "epoch": 0.0643861871812364, "grad_norm": 0.9908891916275024, "learning_rate": 9.994364157598012e-06, "loss": 0.09178655594587326, "memory(GiB)": 19.03, "step": 1982, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.977565 }, { "epoch": 0.06441867264399181, "grad_norm": 1.0082627534866333, "learning_rate": 9.994338632019764e-06, "loss": 0.09932099282741547, "memory(GiB)": 19.03, "step": 1983, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.977633 }, { "epoch": 0.06445115810674723, "grad_norm": 1.057597279548645, "learning_rate": 9.9943130488002e-06, "loss": 0.09646450728178024, "memory(GiB)": 19.03, "step": 1984, "token_acc": 0.9598765432098766, "train_speed(iter/s)": 0.977681 }, { "epoch": 0.06448364356950265, "grad_norm": 1.0921261310577393, "learning_rate": 9.994287407939615e-06, "loss": 0.09687386453151703, "memory(GiB)": 19.03, "step": 1985, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.977733 }, { "epoch": 0.06451612903225806, "grad_norm": 1.0765900611877441, "learning_rate": 9.994261709438307e-06, "loss": 0.10250961035490036, "memory(GiB)": 19.03, "step": 1986, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.977787 }, { "epoch": 0.06454861449501348, "grad_norm": 0.8266260027885437, "learning_rate": 9.994235953296572e-06, "loss": 0.08460132777690887, "memory(GiB)": 19.03, "step": 1987, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.977854 }, { "epoch": 0.0645810999577689, "grad_norm": 1.2205379009246826, "learning_rate": 9.994210139514706e-06, "loss": 0.10216343402862549, "memory(GiB)": 19.03, "step": 1988, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.977914 }, { "epoch": 0.06461358542052431, "grad_norm": 0.9163573980331421, "learning_rate": 9.994184268093009e-06, "loss": 0.09906702488660812, "memory(GiB)": 19.03, "step": 1989, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.977973 }, { "epoch": 0.06464607088327973, "grad_norm": 0.9437257647514343, "learning_rate": 9.994158339031776e-06, "loss": 0.09416935592889786, "memory(GiB)": 19.03, "step": 1990, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.978022 }, { "epoch": 0.06467855634603514, "grad_norm": 0.7351680994033813, "learning_rate": 9.994132352331309e-06, "loss": 0.08901570737361908, "memory(GiB)": 19.03, "step": 1991, "token_acc": 0.9478260869565217, "train_speed(iter/s)": 0.978072 }, { "epoch": 0.06471104180879057, "grad_norm": 0.8098472356796265, "learning_rate": 9.99410630799191e-06, "loss": 0.0992233157157898, "memory(GiB)": 19.03, "step": 1992, "token_acc": 0.9485294117647058, "train_speed(iter/s)": 0.97814 }, { "epoch": 0.06474352727154599, "grad_norm": 0.8253336548805237, "learning_rate": 9.994080206013875e-06, "loss": 0.09895959496498108, "memory(GiB)": 19.03, "step": 1993, "token_acc": 0.9608695652173913, "train_speed(iter/s)": 0.978206 }, { "epoch": 0.0647760127343014, "grad_norm": 1.3350534439086914, "learning_rate": 9.994054046397508e-06, "loss": 0.10665164887905121, "memory(GiB)": 19.03, "step": 1994, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.978257 }, { "epoch": 0.06480849819705682, "grad_norm": 1.014103651046753, "learning_rate": 9.99402782914311e-06, "loss": 0.09754057228565216, "memory(GiB)": 19.03, "step": 1995, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.978326 }, { "epoch": 0.06484098365981224, "grad_norm": 1.6274083852767944, "learning_rate": 9.994001554250983e-06, "loss": 0.10073807835578918, "memory(GiB)": 19.03, "step": 1996, "token_acc": 0.9558011049723757, "train_speed(iter/s)": 0.978378 }, { "epoch": 0.06487346912256765, "grad_norm": 1.467692494392395, "learning_rate": 9.993975221721433e-06, "loss": 0.10413680225610733, "memory(GiB)": 19.03, "step": 1997, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.978434 }, { "epoch": 0.06490595458532307, "grad_norm": 0.8422666192054749, "learning_rate": 9.99394883155476e-06, "loss": 0.10497703403234482, "memory(GiB)": 19.03, "step": 1998, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.9785 }, { "epoch": 0.06493844004807849, "grad_norm": 0.9572817087173462, "learning_rate": 9.993922383751271e-06, "loss": 0.09843571484088898, "memory(GiB)": 19.03, "step": 1999, "token_acc": 0.950381679389313, "train_speed(iter/s)": 0.97857 }, { "epoch": 0.0649709255108339, "grad_norm": 1.0731467008590698, "learning_rate": 9.993895878311271e-06, "loss": 0.1131550669670105, "memory(GiB)": 19.03, "step": 2000, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.978639 }, { "epoch": 0.0649709255108339, "eval_loss": 0.10056711733341217, "eval_runtime": 81.3863, "eval_samples_per_second": 122.257, "eval_steps_per_second": 3.821, "eval_token_acc": 0.9583846017878318, "step": 2000 }, { "epoch": 0.06500341097358932, "grad_norm": 0.9705544114112854, "learning_rate": 9.993869315235066e-06, "loss": 0.09984195232391357, "memory(GiB)": 19.03, "step": 2001, "token_acc": 0.959046797793564, "train_speed(iter/s)": 0.936713 }, { "epoch": 0.06503589643634473, "grad_norm": 0.7743807435035706, "learning_rate": 9.993842694522962e-06, "loss": 0.09899252653121948, "memory(GiB)": 19.03, "step": 2002, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.9368 }, { "epoch": 0.06506838189910015, "grad_norm": 10.008055686950684, "learning_rate": 9.993816016175268e-06, "loss": 0.11874155700206757, "memory(GiB)": 19.03, "step": 2003, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.936899 }, { "epoch": 0.06510086736185557, "grad_norm": 1.239529013633728, "learning_rate": 9.993789280192287e-06, "loss": 0.10808910429477692, "memory(GiB)": 19.03, "step": 2004, "token_acc": 0.9379310344827586, "train_speed(iter/s)": 0.936982 }, { "epoch": 0.06513335282461098, "grad_norm": 1.2389642000198364, "learning_rate": 9.993762486574335e-06, "loss": 0.1068509966135025, "memory(GiB)": 19.03, "step": 2005, "token_acc": 0.9457013574660633, "train_speed(iter/s)": 0.937069 }, { "epoch": 0.0651658382873664, "grad_norm": 1.0361202955245972, "learning_rate": 9.993735635321715e-06, "loss": 0.10937942564487457, "memory(GiB)": 19.03, "step": 2006, "token_acc": 0.9521276595744681, "train_speed(iter/s)": 0.937154 }, { "epoch": 0.06519832375012181, "grad_norm": 0.4805610179901123, "learning_rate": 9.993708726434738e-06, "loss": 0.08542826771736145, "memory(GiB)": 19.03, "step": 2007, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.937253 }, { "epoch": 0.06523080921287724, "grad_norm": 0.9819596409797668, "learning_rate": 9.993681759913718e-06, "loss": 0.1025337502360344, "memory(GiB)": 19.03, "step": 2008, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.937338 }, { "epoch": 0.06526329467563266, "grad_norm": 1.6087499856948853, "learning_rate": 9.993654735758962e-06, "loss": 0.09495403617620468, "memory(GiB)": 19.03, "step": 2009, "token_acc": 0.9510869565217391, "train_speed(iter/s)": 0.937435 }, { "epoch": 0.06529578013838808, "grad_norm": 0.7349395751953125, "learning_rate": 9.993627653970785e-06, "loss": 0.08645589649677277, "memory(GiB)": 19.03, "step": 2010, "token_acc": 0.9647887323943662, "train_speed(iter/s)": 0.93753 }, { "epoch": 0.06532826560114349, "grad_norm": 2.9466702938079834, "learning_rate": 9.9936005145495e-06, "loss": 0.09278059750795364, "memory(GiB)": 19.03, "step": 2011, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.937623 }, { "epoch": 0.06536075106389891, "grad_norm": 0.696299135684967, "learning_rate": 9.993573317495416e-06, "loss": 0.09348422288894653, "memory(GiB)": 19.03, "step": 2012, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.937714 }, { "epoch": 0.06539323652665432, "grad_norm": 0.8023068308830261, "learning_rate": 9.99354606280885e-06, "loss": 0.09145773947238922, "memory(GiB)": 19.03, "step": 2013, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.937691 }, { "epoch": 0.06542572198940974, "grad_norm": 0.9878606796264648, "learning_rate": 9.993518750490117e-06, "loss": 0.09376166760921478, "memory(GiB)": 19.03, "step": 2014, "token_acc": 0.9735449735449735, "train_speed(iter/s)": 0.937786 }, { "epoch": 0.06545820745216516, "grad_norm": 1.37967848777771, "learning_rate": 9.99349138053953e-06, "loss": 0.0890759825706482, "memory(GiB)": 19.03, "step": 2015, "token_acc": 0.9631578947368421, "train_speed(iter/s)": 0.937885 }, { "epoch": 0.06549069291492057, "grad_norm": 1.2526719570159912, "learning_rate": 9.993463952957406e-06, "loss": 0.0948832631111145, "memory(GiB)": 19.03, "step": 2016, "token_acc": 0.95703125, "train_speed(iter/s)": 0.937977 }, { "epoch": 0.06552317837767599, "grad_norm": 0.8277353644371033, "learning_rate": 9.993436467744062e-06, "loss": 0.09085410088300705, "memory(GiB)": 19.03, "step": 2017, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 0.938074 }, { "epoch": 0.0655556638404314, "grad_norm": 4.238307952880859, "learning_rate": 9.993408924899817e-06, "loss": 0.09065542370080948, "memory(GiB)": 19.03, "step": 2018, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.938177 }, { "epoch": 0.06558814930318682, "grad_norm": 1.1303648948669434, "learning_rate": 9.993381324424984e-06, "loss": 0.09572995454072952, "memory(GiB)": 19.03, "step": 2019, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.938254 }, { "epoch": 0.06562063476594224, "grad_norm": 1.0841630697250366, "learning_rate": 9.993353666319886e-06, "loss": 0.09084945917129517, "memory(GiB)": 19.03, "step": 2020, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.938338 }, { "epoch": 0.06565312022869765, "grad_norm": 0.5849719643592834, "learning_rate": 9.99332595058484e-06, "loss": 0.09368917346000671, "memory(GiB)": 19.03, "step": 2021, "token_acc": 0.9498069498069498, "train_speed(iter/s)": 0.938417 }, { "epoch": 0.06568560569145307, "grad_norm": 1.0074079036712646, "learning_rate": 9.993298177220166e-06, "loss": 0.11545403301715851, "memory(GiB)": 19.03, "step": 2022, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.938491 }, { "epoch": 0.06571809115420849, "grad_norm": 1.425687551498413, "learning_rate": 9.993270346226185e-06, "loss": 0.0937718003988266, "memory(GiB)": 19.03, "step": 2023, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.938576 }, { "epoch": 0.06575057661696392, "grad_norm": 1.0071170330047607, "learning_rate": 9.993242457603218e-06, "loss": 0.10968776047229767, "memory(GiB)": 19.03, "step": 2024, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.938661 }, { "epoch": 0.06578306207971933, "grad_norm": 0.7118818759918213, "learning_rate": 9.993214511351589e-06, "loss": 0.085542231798172, "memory(GiB)": 19.03, "step": 2025, "token_acc": 0.9735449735449735, "train_speed(iter/s)": 0.938742 }, { "epoch": 0.06581554754247475, "grad_norm": 1.6097460985183716, "learning_rate": 9.993186507471618e-06, "loss": 0.10886519402265549, "memory(GiB)": 19.03, "step": 2026, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.938827 }, { "epoch": 0.06584803300523016, "grad_norm": 0.5351201295852661, "learning_rate": 9.993158445963627e-06, "loss": 0.07560140639543533, "memory(GiB)": 19.03, "step": 2027, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.938903 }, { "epoch": 0.06588051846798558, "grad_norm": 0.9815587997436523, "learning_rate": 9.993130326827942e-06, "loss": 0.09592752903699875, "memory(GiB)": 19.03, "step": 2028, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.938984 }, { "epoch": 0.065913003930741, "grad_norm": 1.2805614471435547, "learning_rate": 9.993102150064888e-06, "loss": 0.09425526112318039, "memory(GiB)": 19.03, "step": 2029, "token_acc": 0.9365671641791045, "train_speed(iter/s)": 0.939059 }, { "epoch": 0.06594548939349641, "grad_norm": 2.0886316299438477, "learning_rate": 9.993073915674788e-06, "loss": 0.08949930220842361, "memory(GiB)": 19.03, "step": 2030, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.939138 }, { "epoch": 0.06597797485625183, "grad_norm": 0.6534184813499451, "learning_rate": 9.993045623657973e-06, "loss": 0.07728004455566406, "memory(GiB)": 19.03, "step": 2031, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.939205 }, { "epoch": 0.06601046031900724, "grad_norm": 1.0111967325210571, "learning_rate": 9.993017274014763e-06, "loss": 0.08398367464542389, "memory(GiB)": 19.03, "step": 2032, "token_acc": 0.9621621621621622, "train_speed(iter/s)": 0.939283 }, { "epoch": 0.06604294578176266, "grad_norm": 2.8174808025360107, "learning_rate": 9.992988866745487e-06, "loss": 0.0840839296579361, "memory(GiB)": 19.03, "step": 2033, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.939345 }, { "epoch": 0.06607543124451808, "grad_norm": 1.4635369777679443, "learning_rate": 9.992960401850475e-06, "loss": 0.093821220099926, "memory(GiB)": 19.03, "step": 2034, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.93942 }, { "epoch": 0.06610791670727349, "grad_norm": 1.0246893167495728, "learning_rate": 9.992931879330055e-06, "loss": 0.09049002826213837, "memory(GiB)": 19.03, "step": 2035, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.939495 }, { "epoch": 0.06614040217002891, "grad_norm": 1.1362204551696777, "learning_rate": 9.992903299184555e-06, "loss": 0.096904456615448, "memory(GiB)": 19.03, "step": 2036, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.939563 }, { "epoch": 0.06617288763278432, "grad_norm": 0.9162061214447021, "learning_rate": 9.992874661414304e-06, "loss": 0.08714266121387482, "memory(GiB)": 19.03, "step": 2037, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.939622 }, { "epoch": 0.06620537309553974, "grad_norm": 0.9916203022003174, "learning_rate": 9.992845966019636e-06, "loss": 0.10082180798053741, "memory(GiB)": 19.03, "step": 2038, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.939685 }, { "epoch": 0.06623785855829516, "grad_norm": 1.3899608850479126, "learning_rate": 9.99281721300088e-06, "loss": 0.09184043109416962, "memory(GiB)": 19.03, "step": 2039, "token_acc": 0.9461279461279462, "train_speed(iter/s)": 0.939748 }, { "epoch": 0.06627034402105059, "grad_norm": 0.6528631448745728, "learning_rate": 9.992788402358367e-06, "loss": 0.08991384506225586, "memory(GiB)": 19.03, "step": 2040, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.939814 }, { "epoch": 0.066302829483806, "grad_norm": 0.7594980597496033, "learning_rate": 9.992759534092432e-06, "loss": 0.09596117585897446, "memory(GiB)": 19.03, "step": 2041, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.939894 }, { "epoch": 0.06633531494656142, "grad_norm": 1.359972596168518, "learning_rate": 9.992730608203404e-06, "loss": 0.08601544052362442, "memory(GiB)": 19.03, "step": 2042, "token_acc": 0.96, "train_speed(iter/s)": 0.939967 }, { "epoch": 0.06636780040931683, "grad_norm": 0.8010476231575012, "learning_rate": 9.992701624691621e-06, "loss": 0.10500366240739822, "memory(GiB)": 19.03, "step": 2043, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.940043 }, { "epoch": 0.06640028587207225, "grad_norm": 0.8648406863212585, "learning_rate": 9.992672583557415e-06, "loss": 0.10122406482696533, "memory(GiB)": 19.03, "step": 2044, "token_acc": 0.95703125, "train_speed(iter/s)": 0.940113 }, { "epoch": 0.06643277133482767, "grad_norm": 0.6579434871673584, "learning_rate": 9.992643484801123e-06, "loss": 0.08827201277017593, "memory(GiB)": 19.03, "step": 2045, "token_acc": 0.9525691699604744, "train_speed(iter/s)": 0.940192 }, { "epoch": 0.06646525679758308, "grad_norm": 0.6798830628395081, "learning_rate": 9.99261432842308e-06, "loss": 0.09076206386089325, "memory(GiB)": 19.03, "step": 2046, "token_acc": 0.9550173010380623, "train_speed(iter/s)": 0.940267 }, { "epoch": 0.0664977422603385, "grad_norm": 0.7530714869499207, "learning_rate": 9.992585114423623e-06, "loss": 0.09445739537477493, "memory(GiB)": 19.03, "step": 2047, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.940338 }, { "epoch": 0.06653022772309392, "grad_norm": 1.2439043521881104, "learning_rate": 9.992555842803088e-06, "loss": 0.09402719885110855, "memory(GiB)": 19.03, "step": 2048, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.940409 }, { "epoch": 0.06656271318584933, "grad_norm": 1.1474796533584595, "learning_rate": 9.992526513561813e-06, "loss": 0.08851367980241776, "memory(GiB)": 19.03, "step": 2049, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.940493 }, { "epoch": 0.06659519864860475, "grad_norm": 0.7088181376457214, "learning_rate": 9.992497126700138e-06, "loss": 0.09750719368457794, "memory(GiB)": 19.03, "step": 2050, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.94058 }, { "epoch": 0.06662768411136016, "grad_norm": 0.6392426490783691, "learning_rate": 9.9924676822184e-06, "loss": 0.0826249048113823, "memory(GiB)": 19.03, "step": 2051, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.940665 }, { "epoch": 0.06666016957411558, "grad_norm": 0.9509619474411011, "learning_rate": 9.992438180116942e-06, "loss": 0.1040160059928894, "memory(GiB)": 19.03, "step": 2052, "token_acc": 0.945054945054945, "train_speed(iter/s)": 0.940739 }, { "epoch": 0.066692655036871, "grad_norm": 1.1633633375167847, "learning_rate": 9.992408620396102e-06, "loss": 0.09350547194480896, "memory(GiB)": 19.03, "step": 2053, "token_acc": 0.9450980392156862, "train_speed(iter/s)": 0.940808 }, { "epoch": 0.06672514049962641, "grad_norm": 1.060773253440857, "learning_rate": 9.992379003056219e-06, "loss": 0.08826699107885361, "memory(GiB)": 19.03, "step": 2054, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.94089 }, { "epoch": 0.06675762596238183, "grad_norm": 0.7772321105003357, "learning_rate": 9.99234932809764e-06, "loss": 0.09298914670944214, "memory(GiB)": 19.03, "step": 2055, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.940956 }, { "epoch": 0.06679011142513726, "grad_norm": 0.7186233997344971, "learning_rate": 9.992319595520703e-06, "loss": 0.09268681704998016, "memory(GiB)": 19.03, "step": 2056, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.941038 }, { "epoch": 0.06682259688789267, "grad_norm": 0.9794960021972656, "learning_rate": 9.992289805325754e-06, "loss": 0.09992562979459763, "memory(GiB)": 19.03, "step": 2057, "token_acc": 0.9506578947368421, "train_speed(iter/s)": 0.94111 }, { "epoch": 0.06685508235064809, "grad_norm": 2.8999788761138916, "learning_rate": 9.992259957513137e-06, "loss": 0.08428902924060822, "memory(GiB)": 19.03, "step": 2058, "token_acc": 0.9597069597069597, "train_speed(iter/s)": 0.941174 }, { "epoch": 0.0668875678134035, "grad_norm": 0.6831226348876953, "learning_rate": 9.992230052083195e-06, "loss": 0.08742092549800873, "memory(GiB)": 19.03, "step": 2059, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.941241 }, { "epoch": 0.06692005327615892, "grad_norm": 1.503009557723999, "learning_rate": 9.992200089036274e-06, "loss": 0.10316461324691772, "memory(GiB)": 19.03, "step": 2060, "token_acc": 0.9548872180451128, "train_speed(iter/s)": 0.941307 }, { "epoch": 0.06695253873891434, "grad_norm": 1.8841559886932373, "learning_rate": 9.992170068372717e-06, "loss": 0.0974697470664978, "memory(GiB)": 19.03, "step": 2061, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.94137 }, { "epoch": 0.06698502420166975, "grad_norm": 0.7391629219055176, "learning_rate": 9.992139990092875e-06, "loss": 0.08709393441677094, "memory(GiB)": 19.03, "step": 2062, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.941441 }, { "epoch": 0.06701750966442517, "grad_norm": 0.7335381507873535, "learning_rate": 9.992109854197093e-06, "loss": 0.083705835044384, "memory(GiB)": 19.03, "step": 2063, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.941511 }, { "epoch": 0.06704999512718059, "grad_norm": 1.0585025548934937, "learning_rate": 9.992079660685719e-06, "loss": 0.09918875992298126, "memory(GiB)": 19.03, "step": 2064, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.941571 }, { "epoch": 0.067082480589936, "grad_norm": 0.8661616444587708, "learning_rate": 9.9920494095591e-06, "loss": 0.10783716291189194, "memory(GiB)": 19.03, "step": 2065, "token_acc": 0.943609022556391, "train_speed(iter/s)": 0.941643 }, { "epoch": 0.06711496605269142, "grad_norm": 0.7735105752944946, "learning_rate": 9.992019100817587e-06, "loss": 0.10698948800563812, "memory(GiB)": 19.03, "step": 2066, "token_acc": 0.948339483394834, "train_speed(iter/s)": 0.941708 }, { "epoch": 0.06714745151544683, "grad_norm": 1.8910795450210571, "learning_rate": 9.991988734461529e-06, "loss": 0.0909765437245369, "memory(GiB)": 19.03, "step": 2067, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.941784 }, { "epoch": 0.06717993697820225, "grad_norm": 0.8759816288948059, "learning_rate": 9.991958310491276e-06, "loss": 0.09879008680582047, "memory(GiB)": 19.03, "step": 2068, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.941864 }, { "epoch": 0.06721242244095767, "grad_norm": 1.0785884857177734, "learning_rate": 9.991927828907181e-06, "loss": 0.09537839889526367, "memory(GiB)": 19.03, "step": 2069, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.941951 }, { "epoch": 0.06724490790371308, "grad_norm": 0.7639315128326416, "learning_rate": 9.991897289709595e-06, "loss": 0.0986255332827568, "memory(GiB)": 19.03, "step": 2070, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.94204 }, { "epoch": 0.0672773933664685, "grad_norm": 0.6995908617973328, "learning_rate": 9.991866692898869e-06, "loss": 0.09377016127109528, "memory(GiB)": 19.03, "step": 2071, "token_acc": 0.9617486338797814, "train_speed(iter/s)": 0.942131 }, { "epoch": 0.06730987882922393, "grad_norm": 1.362367868423462, "learning_rate": 9.991836038475357e-06, "loss": 0.09772047400474548, "memory(GiB)": 19.03, "step": 2072, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 0.942223 }, { "epoch": 0.06734236429197935, "grad_norm": 0.980414092540741, "learning_rate": 9.991805326439415e-06, "loss": 0.10703569650650024, "memory(GiB)": 19.03, "step": 2073, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.942318 }, { "epoch": 0.06737484975473476, "grad_norm": 2.5190412998199463, "learning_rate": 9.991774556791392e-06, "loss": 0.08280480653047562, "memory(GiB)": 19.03, "step": 2074, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.942413 }, { "epoch": 0.06740733521749018, "grad_norm": 0.8692382574081421, "learning_rate": 9.99174372953165e-06, "loss": 0.09489762037992477, "memory(GiB)": 19.03, "step": 2075, "token_acc": 0.9511278195488722, "train_speed(iter/s)": 0.942511 }, { "epoch": 0.0674398206802456, "grad_norm": 0.8766717314720154, "learning_rate": 9.991712844660539e-06, "loss": 0.09972919523715973, "memory(GiB)": 19.03, "step": 2076, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.942596 }, { "epoch": 0.06747230614300101, "grad_norm": 0.6643247604370117, "learning_rate": 9.991681902178418e-06, "loss": 0.08498449623584747, "memory(GiB)": 19.03, "step": 2077, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.942691 }, { "epoch": 0.06750479160575643, "grad_norm": 3.7491166591644287, "learning_rate": 9.991650902085645e-06, "loss": 0.0868983119726181, "memory(GiB)": 19.03, "step": 2078, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.942778 }, { "epoch": 0.06753727706851184, "grad_norm": 0.9112921953201294, "learning_rate": 9.991619844382577e-06, "loss": 0.0930052399635315, "memory(GiB)": 19.03, "step": 2079, "token_acc": 0.9601769911504425, "train_speed(iter/s)": 0.942858 }, { "epoch": 0.06756976253126726, "grad_norm": 1.451727271080017, "learning_rate": 9.991588729069572e-06, "loss": 0.10224359482526779, "memory(GiB)": 19.03, "step": 2080, "token_acc": 0.9494163424124513, "train_speed(iter/s)": 0.942937 }, { "epoch": 0.06760224799402267, "grad_norm": 1.2065588235855103, "learning_rate": 9.99155755614699e-06, "loss": 0.0819234773516655, "memory(GiB)": 19.03, "step": 2081, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.943008 }, { "epoch": 0.06763473345677809, "grad_norm": 0.9389881491661072, "learning_rate": 9.991526325615188e-06, "loss": 0.09942255914211273, "memory(GiB)": 19.03, "step": 2082, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.943083 }, { "epoch": 0.0676672189195335, "grad_norm": 1.0424059629440308, "learning_rate": 9.991495037474529e-06, "loss": 0.1083730161190033, "memory(GiB)": 19.03, "step": 2083, "token_acc": 0.9607142857142857, "train_speed(iter/s)": 0.943154 }, { "epoch": 0.06769970438228892, "grad_norm": 0.7234706878662109, "learning_rate": 9.991463691725373e-06, "loss": 0.09683792293071747, "memory(GiB)": 19.03, "step": 2084, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.943231 }, { "epoch": 0.06773218984504434, "grad_norm": 1.0398393869400024, "learning_rate": 9.991432288368084e-06, "loss": 0.09034731984138489, "memory(GiB)": 19.03, "step": 2085, "token_acc": 0.9516728624535316, "train_speed(iter/s)": 0.943292 }, { "epoch": 0.06776467530779975, "grad_norm": 0.83295077085495, "learning_rate": 9.991400827403022e-06, "loss": 0.09439084678888321, "memory(GiB)": 19.03, "step": 2086, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.94337 }, { "epoch": 0.06779716077055517, "grad_norm": 0.6525197625160217, "learning_rate": 9.991369308830551e-06, "loss": 0.0875461995601654, "memory(GiB)": 19.03, "step": 2087, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.943441 }, { "epoch": 0.0678296462333106, "grad_norm": 0.895116925239563, "learning_rate": 9.991337732651036e-06, "loss": 0.10629227757453918, "memory(GiB)": 19.03, "step": 2088, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.943515 }, { "epoch": 0.06786213169606602, "grad_norm": 0.6510108113288879, "learning_rate": 9.991306098864839e-06, "loss": 0.0857095941901207, "memory(GiB)": 19.03, "step": 2089, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.943582 }, { "epoch": 0.06789461715882143, "grad_norm": 6.413848876953125, "learning_rate": 9.991274407472324e-06, "loss": 0.11743227392435074, "memory(GiB)": 19.03, "step": 2090, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.943651 }, { "epoch": 0.06792710262157685, "grad_norm": 0.7513626217842102, "learning_rate": 9.991242658473864e-06, "loss": 0.0846511721611023, "memory(GiB)": 19.03, "step": 2091, "token_acc": 0.9668874172185431, "train_speed(iter/s)": 0.943726 }, { "epoch": 0.06795958808433226, "grad_norm": 0.8540104627609253, "learning_rate": 9.991210851869816e-06, "loss": 0.08692145347595215, "memory(GiB)": 19.03, "step": 2092, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.9438 }, { "epoch": 0.06799207354708768, "grad_norm": 0.8426315784454346, "learning_rate": 9.991178987660552e-06, "loss": 0.09322428703308105, "memory(GiB)": 19.03, "step": 2093, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.943874 }, { "epoch": 0.0680245590098431, "grad_norm": 1.039961814880371, "learning_rate": 9.99114706584644e-06, "loss": 0.08495107293128967, "memory(GiB)": 19.03, "step": 2094, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.943953 }, { "epoch": 0.06805704447259851, "grad_norm": 0.9307515025138855, "learning_rate": 9.99111508642785e-06, "loss": 0.10141541063785553, "memory(GiB)": 19.03, "step": 2095, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.944015 }, { "epoch": 0.06808952993535393, "grad_norm": 1.0104373693466187, "learning_rate": 9.991083049405147e-06, "loss": 0.09035305678844452, "memory(GiB)": 19.03, "step": 2096, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.944073 }, { "epoch": 0.06812201539810935, "grad_norm": 1.8368444442749023, "learning_rate": 9.9910509547787e-06, "loss": 0.10006788372993469, "memory(GiB)": 19.03, "step": 2097, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.944145 }, { "epoch": 0.06815450086086476, "grad_norm": 0.9650951027870178, "learning_rate": 9.991018802548885e-06, "loss": 0.10554195940494537, "memory(GiB)": 19.03, "step": 2098, "token_acc": 0.9435483870967742, "train_speed(iter/s)": 0.944212 }, { "epoch": 0.06818698632362018, "grad_norm": 0.8578829765319824, "learning_rate": 9.990986592716068e-06, "loss": 0.0931406319141388, "memory(GiB)": 19.03, "step": 2099, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.944284 }, { "epoch": 0.0682194717863756, "grad_norm": 0.7875182032585144, "learning_rate": 9.990954325280624e-06, "loss": 0.08391446620225906, "memory(GiB)": 19.03, "step": 2100, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.944368 }, { "epoch": 0.06825195724913101, "grad_norm": 1.4952589273452759, "learning_rate": 9.990922000242924e-06, "loss": 0.09612477570772171, "memory(GiB)": 19.03, "step": 2101, "token_acc": 0.9482758620689655, "train_speed(iter/s)": 0.94446 }, { "epoch": 0.06828444271188643, "grad_norm": 1.412405252456665, "learning_rate": 9.990889617603341e-06, "loss": 0.08255545794963837, "memory(GiB)": 19.03, "step": 2102, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.94455 }, { "epoch": 0.06831692817464184, "grad_norm": 1.1708141565322876, "learning_rate": 9.990857177362248e-06, "loss": 0.09579389542341232, "memory(GiB)": 19.03, "step": 2103, "token_acc": 0.96875, "train_speed(iter/s)": 0.944638 }, { "epoch": 0.06834941363739727, "grad_norm": 1.2037497758865356, "learning_rate": 9.990824679520022e-06, "loss": 0.08791956305503845, "memory(GiB)": 19.03, "step": 2104, "token_acc": 0.9625468164794008, "train_speed(iter/s)": 0.944715 }, { "epoch": 0.06838189910015269, "grad_norm": 0.8319950103759766, "learning_rate": 9.990792124077036e-06, "loss": 0.08941464871168137, "memory(GiB)": 19.03, "step": 2105, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.94481 }, { "epoch": 0.0684143845629081, "grad_norm": 1.48871910572052, "learning_rate": 9.990759511033665e-06, "loss": 0.10003501176834106, "memory(GiB)": 19.03, "step": 2106, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.944883 }, { "epoch": 0.06844687002566352, "grad_norm": 7.202681064605713, "learning_rate": 9.990726840390288e-06, "loss": 0.09372793138027191, "memory(GiB)": 19.03, "step": 2107, "token_acc": 0.9731800766283525, "train_speed(iter/s)": 0.944953 }, { "epoch": 0.06847935548841894, "grad_norm": 1.02649986743927, "learning_rate": 9.990694112147282e-06, "loss": 0.09221183508634567, "memory(GiB)": 19.03, "step": 2108, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.945006 }, { "epoch": 0.06851184095117435, "grad_norm": 0.8495153784751892, "learning_rate": 9.99066132630502e-06, "loss": 0.09474661946296692, "memory(GiB)": 19.03, "step": 2109, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.945078 }, { "epoch": 0.06854432641392977, "grad_norm": 1.2647444009780884, "learning_rate": 9.990628482863885e-06, "loss": 0.09791576117277145, "memory(GiB)": 19.03, "step": 2110, "token_acc": 0.9402985074626866, "train_speed(iter/s)": 0.945151 }, { "epoch": 0.06857681187668518, "grad_norm": 1.3425672054290771, "learning_rate": 9.990595581824255e-06, "loss": 0.10517533868551254, "memory(GiB)": 19.03, "step": 2111, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.945214 }, { "epoch": 0.0686092973394406, "grad_norm": 1.6391137838363647, "learning_rate": 9.99056262318651e-06, "loss": 0.10849488526582718, "memory(GiB)": 19.03, "step": 2112, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.945284 }, { "epoch": 0.06864178280219602, "grad_norm": 1.0620638132095337, "learning_rate": 9.99052960695103e-06, "loss": 0.09699441492557526, "memory(GiB)": 19.03, "step": 2113, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.945354 }, { "epoch": 0.06867426826495143, "grad_norm": 1.1156026124954224, "learning_rate": 9.990496533118194e-06, "loss": 0.0948362648487091, "memory(GiB)": 19.03, "step": 2114, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.945418 }, { "epoch": 0.06870675372770685, "grad_norm": 0.810838520526886, "learning_rate": 9.990463401688388e-06, "loss": 0.09632031619548798, "memory(GiB)": 19.03, "step": 2115, "token_acc": 0.955, "train_speed(iter/s)": 0.945485 }, { "epoch": 0.06873923919046226, "grad_norm": 3.8370370864868164, "learning_rate": 9.99043021266199e-06, "loss": 0.09016026556491852, "memory(GiB)": 19.03, "step": 2116, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.945559 }, { "epoch": 0.06877172465321768, "grad_norm": 1.0077526569366455, "learning_rate": 9.990396966039386e-06, "loss": 0.10054630786180496, "memory(GiB)": 19.03, "step": 2117, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.945622 }, { "epoch": 0.0688042101159731, "grad_norm": 1.1029248237609863, "learning_rate": 9.990363661820959e-06, "loss": 0.10276905447244644, "memory(GiB)": 19.03, "step": 2118, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.945686 }, { "epoch": 0.06883669557872851, "grad_norm": 0.765781044960022, "learning_rate": 9.990330300007093e-06, "loss": 0.10321540385484695, "memory(GiB)": 19.03, "step": 2119, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.945762 }, { "epoch": 0.06886918104148394, "grad_norm": 0.9208530187606812, "learning_rate": 9.990296880598173e-06, "loss": 0.09711186587810516, "memory(GiB)": 19.03, "step": 2120, "token_acc": 0.9396984924623115, "train_speed(iter/s)": 0.945826 }, { "epoch": 0.06890166650423936, "grad_norm": 0.859665036201477, "learning_rate": 9.990263403594586e-06, "loss": 0.08159208297729492, "memory(GiB)": 19.03, "step": 2121, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.945892 }, { "epoch": 0.06893415196699477, "grad_norm": 0.6648647785186768, "learning_rate": 9.990229868996717e-06, "loss": 0.08833252638578415, "memory(GiB)": 19.03, "step": 2122, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.945962 }, { "epoch": 0.06896663742975019, "grad_norm": 0.933071494102478, "learning_rate": 9.990196276804953e-06, "loss": 0.09009328484535217, "memory(GiB)": 19.03, "step": 2123, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.946021 }, { "epoch": 0.06899912289250561, "grad_norm": 0.9190893769264221, "learning_rate": 9.990162627019682e-06, "loss": 0.09526045620441437, "memory(GiB)": 19.03, "step": 2124, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.946113 }, { "epoch": 0.06903160835526102, "grad_norm": 0.7979125380516052, "learning_rate": 9.990128919641294e-06, "loss": 0.10260426253080368, "memory(GiB)": 19.03, "step": 2125, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.946201 }, { "epoch": 0.06906409381801644, "grad_norm": 2.171133279800415, "learning_rate": 9.990095154670174e-06, "loss": 0.0903102234005928, "memory(GiB)": 19.03, "step": 2126, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.946294 }, { "epoch": 0.06909657928077186, "grad_norm": 0.789832592010498, "learning_rate": 9.990061332106715e-06, "loss": 0.0937206968665123, "memory(GiB)": 19.03, "step": 2127, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.946371 }, { "epoch": 0.06912906474352727, "grad_norm": 0.6950978636741638, "learning_rate": 9.990027451951305e-06, "loss": 0.09498515725135803, "memory(GiB)": 19.03, "step": 2128, "token_acc": 0.9692982456140351, "train_speed(iter/s)": 0.94645 }, { "epoch": 0.06916155020628269, "grad_norm": 0.8150939345359802, "learning_rate": 9.989993514204339e-06, "loss": 0.09730921685695648, "memory(GiB)": 19.03, "step": 2129, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.94653 }, { "epoch": 0.0691940356690381, "grad_norm": 1.2181953191757202, "learning_rate": 9.989959518866206e-06, "loss": 0.10302393138408661, "memory(GiB)": 19.03, "step": 2130, "token_acc": 0.9660194174757282, "train_speed(iter/s)": 0.946612 }, { "epoch": 0.06922652113179352, "grad_norm": 0.9194772839546204, "learning_rate": 9.989925465937297e-06, "loss": 0.10458661615848541, "memory(GiB)": 19.03, "step": 2131, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.946695 }, { "epoch": 0.06925900659454894, "grad_norm": 1.6365588903427124, "learning_rate": 9.989891355418007e-06, "loss": 0.10418374836444855, "memory(GiB)": 19.03, "step": 2132, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.94678 }, { "epoch": 0.06929149205730435, "grad_norm": 1.2510932683944702, "learning_rate": 9.989857187308729e-06, "loss": 0.10901006311178207, "memory(GiB)": 19.03, "step": 2133, "token_acc": 0.949238578680203, "train_speed(iter/s)": 0.946871 }, { "epoch": 0.06932397752005977, "grad_norm": 0.6992551684379578, "learning_rate": 9.989822961609858e-06, "loss": 0.09228618443012238, "memory(GiB)": 19.03, "step": 2134, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.946962 }, { "epoch": 0.06935646298281518, "grad_norm": 0.6832252740859985, "learning_rate": 9.989788678321788e-06, "loss": 0.09050916135311127, "memory(GiB)": 19.03, "step": 2135, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.947049 }, { "epoch": 0.06938894844557061, "grad_norm": 1.0167226791381836, "learning_rate": 9.989754337444917e-06, "loss": 0.10038110613822937, "memory(GiB)": 19.03, "step": 2136, "token_acc": 0.9400921658986175, "train_speed(iter/s)": 0.947136 }, { "epoch": 0.06942143390832603, "grad_norm": 0.6242355108261108, "learning_rate": 9.989719938979638e-06, "loss": 0.09708505868911743, "memory(GiB)": 19.03, "step": 2137, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.947207 }, { "epoch": 0.06945391937108145, "grad_norm": 1.1312568187713623, "learning_rate": 9.98968548292635e-06, "loss": 0.10253921151161194, "memory(GiB)": 19.03, "step": 2138, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.947277 }, { "epoch": 0.06948640483383686, "grad_norm": 0.8107582330703735, "learning_rate": 9.98965096928545e-06, "loss": 0.09246417135000229, "memory(GiB)": 19.03, "step": 2139, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.947349 }, { "epoch": 0.06951889029659228, "grad_norm": 0.6613621711730957, "learning_rate": 9.989616398057337e-06, "loss": 0.09796233475208282, "memory(GiB)": 19.03, "step": 2140, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.947412 }, { "epoch": 0.0695513757593477, "grad_norm": 1.342791199684143, "learning_rate": 9.989581769242408e-06, "loss": 0.10345231741666794, "memory(GiB)": 19.03, "step": 2141, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.94748 }, { "epoch": 0.06958386122210311, "grad_norm": 0.7842831015586853, "learning_rate": 9.989547082841067e-06, "loss": 0.09384679049253464, "memory(GiB)": 19.03, "step": 2142, "token_acc": 0.9548611111111112, "train_speed(iter/s)": 0.947545 }, { "epoch": 0.06961634668485853, "grad_norm": 0.8985412120819092, "learning_rate": 9.98951233885371e-06, "loss": 0.09645045548677444, "memory(GiB)": 19.03, "step": 2143, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.947623 }, { "epoch": 0.06964883214761394, "grad_norm": 0.7866504192352295, "learning_rate": 9.98947753728074e-06, "loss": 0.08949614316225052, "memory(GiB)": 19.03, "step": 2144, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.94769 }, { "epoch": 0.06968131761036936, "grad_norm": 1.646414041519165, "learning_rate": 9.989442678122558e-06, "loss": 0.09388962388038635, "memory(GiB)": 19.03, "step": 2145, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.947768 }, { "epoch": 0.06971380307312477, "grad_norm": 1.0492032766342163, "learning_rate": 9.989407761379567e-06, "loss": 0.09469819813966751, "memory(GiB)": 19.03, "step": 2146, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.947829 }, { "epoch": 0.06974628853588019, "grad_norm": 0.7553348541259766, "learning_rate": 9.989372787052169e-06, "loss": 0.09912457317113876, "memory(GiB)": 19.03, "step": 2147, "token_acc": 0.95578231292517, "train_speed(iter/s)": 0.947901 }, { "epoch": 0.06977877399863561, "grad_norm": 0.6108880043029785, "learning_rate": 9.98933775514077e-06, "loss": 0.08926169574260712, "memory(GiB)": 19.03, "step": 2148, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.947976 }, { "epoch": 0.06981125946139102, "grad_norm": 0.7023391127586365, "learning_rate": 9.98930266564577e-06, "loss": 0.09020265936851501, "memory(GiB)": 19.03, "step": 2149, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.948047 }, { "epoch": 0.06984374492414644, "grad_norm": 1.2585233449935913, "learning_rate": 9.989267518567578e-06, "loss": 0.09577582031488419, "memory(GiB)": 19.03, "step": 2150, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.948115 }, { "epoch": 0.06987623038690186, "grad_norm": 0.7102580666542053, "learning_rate": 9.989232313906597e-06, "loss": 0.08887208998203278, "memory(GiB)": 19.03, "step": 2151, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.948184 }, { "epoch": 0.06990871584965729, "grad_norm": 0.9212521910667419, "learning_rate": 9.989197051663236e-06, "loss": 0.09578132629394531, "memory(GiB)": 19.03, "step": 2152, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.948251 }, { "epoch": 0.0699412013124127, "grad_norm": 0.8749202489852905, "learning_rate": 9.989161731837899e-06, "loss": 0.0831960067152977, "memory(GiB)": 19.03, "step": 2153, "token_acc": 0.9530685920577617, "train_speed(iter/s)": 0.948316 }, { "epoch": 0.06997368677516812, "grad_norm": 0.747065544128418, "learning_rate": 9.989126354430996e-06, "loss": 0.08136998116970062, "memory(GiB)": 19.03, "step": 2154, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.948384 }, { "epoch": 0.07000617223792353, "grad_norm": 1.6386613845825195, "learning_rate": 9.989090919442933e-06, "loss": 0.08460835367441177, "memory(GiB)": 19.03, "step": 2155, "token_acc": 0.9485981308411215, "train_speed(iter/s)": 0.948447 }, { "epoch": 0.07003865770067895, "grad_norm": 0.7944446206092834, "learning_rate": 9.989055426874122e-06, "loss": 0.08542267978191376, "memory(GiB)": 19.03, "step": 2156, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.948521 }, { "epoch": 0.07007114316343437, "grad_norm": 1.1261917352676392, "learning_rate": 9.98901987672497e-06, "loss": 0.08556956797838211, "memory(GiB)": 19.03, "step": 2157, "token_acc": 0.9547511312217195, "train_speed(iter/s)": 0.948584 }, { "epoch": 0.07010362862618978, "grad_norm": 0.8397229909896851, "learning_rate": 9.988984268995887e-06, "loss": 0.08945208787918091, "memory(GiB)": 19.03, "step": 2158, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.948669 }, { "epoch": 0.0701361140889452, "grad_norm": 0.9114766716957092, "learning_rate": 9.988948603687288e-06, "loss": 0.09281028807163239, "memory(GiB)": 19.03, "step": 2159, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.948766 }, { "epoch": 0.07016859955170061, "grad_norm": 0.67564857006073, "learning_rate": 9.98891288079958e-06, "loss": 0.08086007833480835, "memory(GiB)": 19.03, "step": 2160, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.948858 }, { "epoch": 0.07020108501445603, "grad_norm": 0.9790970087051392, "learning_rate": 9.988877100333178e-06, "loss": 0.09344129264354706, "memory(GiB)": 19.03, "step": 2161, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.948945 }, { "epoch": 0.07023357047721145, "grad_norm": 0.9330422282218933, "learning_rate": 9.988841262288493e-06, "loss": 0.095049649477005, "memory(GiB)": 19.03, "step": 2162, "token_acc": 0.9577464788732394, "train_speed(iter/s)": 0.949034 }, { "epoch": 0.07026605593996686, "grad_norm": 0.8229904174804688, "learning_rate": 9.98880536666594e-06, "loss": 0.09526363760232925, "memory(GiB)": 19.03, "step": 2163, "token_acc": 0.96, "train_speed(iter/s)": 0.949096 }, { "epoch": 0.07029854140272228, "grad_norm": 0.82728511095047, "learning_rate": 9.988769413465933e-06, "loss": 0.09654141962528229, "memory(GiB)": 19.03, "step": 2164, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.949169 }, { "epoch": 0.0703310268654777, "grad_norm": 0.8253698945045471, "learning_rate": 9.988733402688889e-06, "loss": 0.09206096827983856, "memory(GiB)": 19.03, "step": 2165, "token_acc": 0.9575971731448764, "train_speed(iter/s)": 0.949241 }, { "epoch": 0.07036351232823311, "grad_norm": 0.7670581340789795, "learning_rate": 9.988697334335219e-06, "loss": 0.08298195898532867, "memory(GiB)": 19.03, "step": 2166, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.949315 }, { "epoch": 0.07039599779098853, "grad_norm": 0.5169503688812256, "learning_rate": 9.988661208405343e-06, "loss": 0.09116601198911667, "memory(GiB)": 19.03, "step": 2167, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.949375 }, { "epoch": 0.07042848325374396, "grad_norm": 0.5377506017684937, "learning_rate": 9.988625024899677e-06, "loss": 0.08032737672328949, "memory(GiB)": 19.03, "step": 2168, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.949442 }, { "epoch": 0.07046096871649937, "grad_norm": 0.9068633317947388, "learning_rate": 9.988588783818636e-06, "loss": 0.0961495190858841, "memory(GiB)": 19.03, "step": 2169, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.949498 }, { "epoch": 0.07049345417925479, "grad_norm": 0.9878231883049011, "learning_rate": 9.988552485162644e-06, "loss": 0.09826954454183578, "memory(GiB)": 19.03, "step": 2170, "token_acc": 0.9658536585365853, "train_speed(iter/s)": 0.949571 }, { "epoch": 0.0705259396420102, "grad_norm": 0.6760252714157104, "learning_rate": 9.988516128932114e-06, "loss": 0.09756503254175186, "memory(GiB)": 19.03, "step": 2171, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.949638 }, { "epoch": 0.07055842510476562, "grad_norm": 0.7167002558708191, "learning_rate": 9.98847971512747e-06, "loss": 0.09883460402488708, "memory(GiB)": 19.03, "step": 2172, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.949696 }, { "epoch": 0.07059091056752104, "grad_norm": 0.7520925402641296, "learning_rate": 9.988443243749128e-06, "loss": 0.08392193913459778, "memory(GiB)": 19.03, "step": 2173, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.949768 }, { "epoch": 0.07062339603027645, "grad_norm": 0.5682413578033447, "learning_rate": 9.988406714797513e-06, "loss": 0.08329280465841293, "memory(GiB)": 19.03, "step": 2174, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.949819 }, { "epoch": 0.07065588149303187, "grad_norm": 0.7390419244766235, "learning_rate": 9.988370128273045e-06, "loss": 0.09725342690944672, "memory(GiB)": 19.03, "step": 2175, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 0.949878 }, { "epoch": 0.07068836695578729, "grad_norm": 0.7310824990272522, "learning_rate": 9.988333484176148e-06, "loss": 0.09174151718616486, "memory(GiB)": 19.03, "step": 2176, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.949941 }, { "epoch": 0.0707208524185427, "grad_norm": 1.3214659690856934, "learning_rate": 9.98829678250724e-06, "loss": 0.10159988701343536, "memory(GiB)": 19.03, "step": 2177, "token_acc": 0.9400749063670412, "train_speed(iter/s)": 0.95001 }, { "epoch": 0.07075333788129812, "grad_norm": 0.8997796177864075, "learning_rate": 9.98826002326675e-06, "loss": 0.09681074321269989, "memory(GiB)": 19.03, "step": 2178, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.95007 }, { "epoch": 0.07078582334405353, "grad_norm": 1.188812494277954, "learning_rate": 9.988223206455097e-06, "loss": 0.08368952572345734, "memory(GiB)": 19.03, "step": 2179, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.950141 }, { "epoch": 0.07081830880680895, "grad_norm": 0.87801194190979, "learning_rate": 9.988186332072711e-06, "loss": 0.10323686897754669, "memory(GiB)": 19.03, "step": 2180, "token_acc": 0.9496124031007752, "train_speed(iter/s)": 0.950219 }, { "epoch": 0.07085079426956437, "grad_norm": 0.9664713740348816, "learning_rate": 9.988149400120017e-06, "loss": 0.10267595946788788, "memory(GiB)": 19.03, "step": 2181, "token_acc": 0.9633699633699634, "train_speed(iter/s)": 0.950309 }, { "epoch": 0.07088327973231978, "grad_norm": 0.7482767105102539, "learning_rate": 9.988112410597437e-06, "loss": 0.1055629700422287, "memory(GiB)": 19.03, "step": 2182, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.950398 }, { "epoch": 0.0709157651950752, "grad_norm": 0.6622495055198669, "learning_rate": 9.988075363505402e-06, "loss": 0.07937309145927429, "memory(GiB)": 19.03, "step": 2183, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.950486 }, { "epoch": 0.07094825065783063, "grad_norm": 0.7884204387664795, "learning_rate": 9.988038258844336e-06, "loss": 0.1087145134806633, "memory(GiB)": 19.03, "step": 2184, "token_acc": 0.9481481481481482, "train_speed(iter/s)": 0.950573 }, { "epoch": 0.07098073612058604, "grad_norm": 0.873724102973938, "learning_rate": 9.988001096614673e-06, "loss": 0.09818445146083832, "memory(GiB)": 19.03, "step": 2185, "token_acc": 0.9572953736654805, "train_speed(iter/s)": 0.950657 }, { "epoch": 0.07101322158334146, "grad_norm": 0.7613703608512878, "learning_rate": 9.987963876816837e-06, "loss": 0.09634524583816528, "memory(GiB)": 19.03, "step": 2186, "token_acc": 0.9704797047970479, "train_speed(iter/s)": 0.950742 }, { "epoch": 0.07104570704609688, "grad_norm": 1.1102640628814697, "learning_rate": 9.987926599451259e-06, "loss": 0.08300583809614182, "memory(GiB)": 19.03, "step": 2187, "token_acc": 0.9706959706959707, "train_speed(iter/s)": 0.950834 }, { "epoch": 0.07107819250885229, "grad_norm": 2.1319615840911865, "learning_rate": 9.987889264518367e-06, "loss": 0.09246227145195007, "memory(GiB)": 19.03, "step": 2188, "token_acc": 0.9456066945606695, "train_speed(iter/s)": 0.950918 }, { "epoch": 0.07111067797160771, "grad_norm": 1.1376137733459473, "learning_rate": 9.987851872018597e-06, "loss": 0.10147862136363983, "memory(GiB)": 19.03, "step": 2189, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.951008 }, { "epoch": 0.07114316343436312, "grad_norm": 0.589139997959137, "learning_rate": 9.987814421952376e-06, "loss": 0.08381085097789764, "memory(GiB)": 19.03, "step": 2190, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.951087 }, { "epoch": 0.07117564889711854, "grad_norm": 1.1571664810180664, "learning_rate": 9.987776914320137e-06, "loss": 0.10093700885772705, "memory(GiB)": 19.03, "step": 2191, "token_acc": 0.9575289575289575, "train_speed(iter/s)": 0.951168 }, { "epoch": 0.07120813435987396, "grad_norm": 0.5512689352035522, "learning_rate": 9.987739349122315e-06, "loss": 0.08792775124311447, "memory(GiB)": 19.03, "step": 2192, "token_acc": 0.975, "train_speed(iter/s)": 0.95125 }, { "epoch": 0.07124061982262937, "grad_norm": 0.9283116459846497, "learning_rate": 9.987701726359342e-06, "loss": 0.0874822735786438, "memory(GiB)": 19.03, "step": 2193, "token_acc": 0.96875, "train_speed(iter/s)": 0.951343 }, { "epoch": 0.07127310528538479, "grad_norm": 0.7358200550079346, "learning_rate": 9.987664046031654e-06, "loss": 0.10018230229616165, "memory(GiB)": 19.03, "step": 2194, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.951432 }, { "epoch": 0.0713055907481402, "grad_norm": 0.5552287101745605, "learning_rate": 9.987626308139682e-06, "loss": 0.0809207335114479, "memory(GiB)": 19.03, "step": 2195, "token_acc": 0.9611650485436893, "train_speed(iter/s)": 0.951516 }, { "epoch": 0.07133807621089562, "grad_norm": 0.7031753063201904, "learning_rate": 9.987588512683864e-06, "loss": 0.08347009122371674, "memory(GiB)": 19.03, "step": 2196, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.951589 }, { "epoch": 0.07137056167365104, "grad_norm": 0.8329357504844666, "learning_rate": 9.987550659664637e-06, "loss": 0.09298288822174072, "memory(GiB)": 19.03, "step": 2197, "token_acc": 0.9612403100775194, "train_speed(iter/s)": 0.95166 }, { "epoch": 0.07140304713640645, "grad_norm": 0.9295387864112854, "learning_rate": 9.987512749082437e-06, "loss": 0.10344589501619339, "memory(GiB)": 19.03, "step": 2198, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.95172 }, { "epoch": 0.07143553259916187, "grad_norm": 0.49313294887542725, "learning_rate": 9.9874747809377e-06, "loss": 0.07850392162799835, "memory(GiB)": 19.03, "step": 2199, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.951784 }, { "epoch": 0.0714680180619173, "grad_norm": 1.5135473012924194, "learning_rate": 9.987436755230868e-06, "loss": 0.0834362655878067, "memory(GiB)": 19.03, "step": 2200, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.951852 }, { "epoch": 0.07150050352467271, "grad_norm": 0.9282638430595398, "learning_rate": 9.987398671962375e-06, "loss": 0.07979840785264969, "memory(GiB)": 19.03, "step": 2201, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.951921 }, { "epoch": 0.07153298898742813, "grad_norm": 0.7486870884895325, "learning_rate": 9.987360531132666e-06, "loss": 0.09573403000831604, "memory(GiB)": 19.03, "step": 2202, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.951998 }, { "epoch": 0.07156547445018355, "grad_norm": 0.8284701108932495, "learning_rate": 9.987322332742177e-06, "loss": 0.08131900429725647, "memory(GiB)": 19.03, "step": 2203, "token_acc": 0.9710144927536232, "train_speed(iter/s)": 0.952062 }, { "epoch": 0.07159795991293896, "grad_norm": 1.3250668048858643, "learning_rate": 9.987284076791351e-06, "loss": 0.08810510486364365, "memory(GiB)": 19.03, "step": 2204, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.952136 }, { "epoch": 0.07163044537569438, "grad_norm": 0.7041580080986023, "learning_rate": 9.987245763280626e-06, "loss": 0.09929907321929932, "memory(GiB)": 19.03, "step": 2205, "token_acc": 0.9453781512605042, "train_speed(iter/s)": 0.952206 }, { "epoch": 0.0716629308384498, "grad_norm": 0.9726312160491943, "learning_rate": 9.98720739221045e-06, "loss": 0.0973874107003212, "memory(GiB)": 19.03, "step": 2206, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.952281 }, { "epoch": 0.07169541630120521, "grad_norm": 0.8983069062232971, "learning_rate": 9.987168963581261e-06, "loss": 0.08796371519565582, "memory(GiB)": 19.03, "step": 2207, "token_acc": 0.9652777777777778, "train_speed(iter/s)": 0.95235 }, { "epoch": 0.07172790176396063, "grad_norm": 0.8548983335494995, "learning_rate": 9.987130477393505e-06, "loss": 0.08774516731500626, "memory(GiB)": 19.03, "step": 2208, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.952418 }, { "epoch": 0.07176038722671604, "grad_norm": 0.7248504161834717, "learning_rate": 9.987091933647624e-06, "loss": 0.08666897565126419, "memory(GiB)": 19.03, "step": 2209, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.952487 }, { "epoch": 0.07179287268947146, "grad_norm": 1.0066884756088257, "learning_rate": 9.987053332344066e-06, "loss": 0.0921156108379364, "memory(GiB)": 19.03, "step": 2210, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.952564 }, { "epoch": 0.07182535815222688, "grad_norm": 0.8530367612838745, "learning_rate": 9.987014673483273e-06, "loss": 0.08453868329524994, "memory(GiB)": 19.03, "step": 2211, "token_acc": 0.9453551912568307, "train_speed(iter/s)": 0.952637 }, { "epoch": 0.07185784361498229, "grad_norm": 0.9361763000488281, "learning_rate": 9.986975957065696e-06, "loss": 0.09317632764577866, "memory(GiB)": 19.03, "step": 2212, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.952705 }, { "epoch": 0.07189032907773771, "grad_norm": 0.6383073329925537, "learning_rate": 9.986937183091776e-06, "loss": 0.08042019605636597, "memory(GiB)": 19.03, "step": 2213, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.952766 }, { "epoch": 0.07192281454049312, "grad_norm": 1.140254020690918, "learning_rate": 9.986898351561964e-06, "loss": 0.10175823420286179, "memory(GiB)": 19.03, "step": 2214, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.95283 }, { "epoch": 0.07195530000324854, "grad_norm": 0.7276111245155334, "learning_rate": 9.986859462476708e-06, "loss": 0.09194919466972351, "memory(GiB)": 19.03, "step": 2215, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.952901 }, { "epoch": 0.07198778546600397, "grad_norm": 0.688831627368927, "learning_rate": 9.986820515836457e-06, "loss": 0.07874627411365509, "memory(GiB)": 19.03, "step": 2216, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.952967 }, { "epoch": 0.07202027092875939, "grad_norm": 0.6210655570030212, "learning_rate": 9.986781511641658e-06, "loss": 0.08650314807891846, "memory(GiB)": 19.03, "step": 2217, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.953056 }, { "epoch": 0.0720527563915148, "grad_norm": 0.7068778872489929, "learning_rate": 9.986742449892763e-06, "loss": 0.1032620444893837, "memory(GiB)": 19.03, "step": 2218, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.953135 }, { "epoch": 0.07208524185427022, "grad_norm": 0.9162643551826477, "learning_rate": 9.986703330590226e-06, "loss": 0.09096981585025787, "memory(GiB)": 19.03, "step": 2219, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 0.953199 }, { "epoch": 0.07211772731702563, "grad_norm": 0.4845772683620453, "learning_rate": 9.986664153734491e-06, "loss": 0.0869179368019104, "memory(GiB)": 19.03, "step": 2220, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.953254 }, { "epoch": 0.07215021277978105, "grad_norm": 1.312678575515747, "learning_rate": 9.986624919326018e-06, "loss": 0.08942563831806183, "memory(GiB)": 19.03, "step": 2221, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.953325 }, { "epoch": 0.07218269824253647, "grad_norm": 0.7467425465583801, "learning_rate": 9.986585627365253e-06, "loss": 0.08565516769886017, "memory(GiB)": 19.03, "step": 2222, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.953396 }, { "epoch": 0.07221518370529188, "grad_norm": 0.8236146569252014, "learning_rate": 9.986546277852656e-06, "loss": 0.08399791270494461, "memory(GiB)": 19.03, "step": 2223, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.953461 }, { "epoch": 0.0722476691680473, "grad_norm": 0.8040243983268738, "learning_rate": 9.986506870788676e-06, "loss": 0.0883677750825882, "memory(GiB)": 19.03, "step": 2224, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.953522 }, { "epoch": 0.07228015463080271, "grad_norm": 0.8642016649246216, "learning_rate": 9.986467406173769e-06, "loss": 0.09339103102684021, "memory(GiB)": 19.03, "step": 2225, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.953579 }, { "epoch": 0.07231264009355813, "grad_norm": 0.9935875535011292, "learning_rate": 9.986427884008392e-06, "loss": 0.0993088036775589, "memory(GiB)": 19.03, "step": 2226, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.953635 }, { "epoch": 0.07234512555631355, "grad_norm": 1.1072111129760742, "learning_rate": 9.986388304293e-06, "loss": 0.09657884389162064, "memory(GiB)": 19.03, "step": 2227, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.953696 }, { "epoch": 0.07237761101906896, "grad_norm": 1.0231142044067383, "learning_rate": 9.98634866702805e-06, "loss": 0.09460698813199997, "memory(GiB)": 19.03, "step": 2228, "token_acc": 0.9453551912568307, "train_speed(iter/s)": 0.953766 }, { "epoch": 0.07241009648182438, "grad_norm": 1.0269311666488647, "learning_rate": 9.986308972214e-06, "loss": 0.09705725312232971, "memory(GiB)": 19.03, "step": 2229, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.953822 }, { "epoch": 0.0724425819445798, "grad_norm": 1.0178570747375488, "learning_rate": 9.986269219851308e-06, "loss": 0.08978940546512604, "memory(GiB)": 19.03, "step": 2230, "token_acc": 0.9653679653679653, "train_speed(iter/s)": 0.953887 }, { "epoch": 0.07247506740733521, "grad_norm": 0.7072281241416931, "learning_rate": 9.986229409940433e-06, "loss": 0.09895648062229156, "memory(GiB)": 19.03, "step": 2231, "token_acc": 0.9488188976377953, "train_speed(iter/s)": 0.953936 }, { "epoch": 0.07250755287009064, "grad_norm": 1.562839388847351, "learning_rate": 9.986189542481832e-06, "loss": 0.0912286713719368, "memory(GiB)": 19.03, "step": 2232, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.954001 }, { "epoch": 0.07254003833284606, "grad_norm": 0.9133356809616089, "learning_rate": 9.986149617475968e-06, "loss": 0.11545425653457642, "memory(GiB)": 19.03, "step": 2233, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.954068 }, { "epoch": 0.07257252379560147, "grad_norm": 0.9208292365074158, "learning_rate": 9.986109634923302e-06, "loss": 0.09123177081346512, "memory(GiB)": 19.03, "step": 2234, "token_acc": 0.96484375, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.07260500925835689, "grad_norm": 0.5289310216903687, "learning_rate": 9.986069594824294e-06, "loss": 0.09020653367042542, "memory(GiB)": 19.03, "step": 2235, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.954191 }, { "epoch": 0.0726374947211123, "grad_norm": 0.7014942765235901, "learning_rate": 9.986029497179405e-06, "loss": 0.08800013363361359, "memory(GiB)": 19.03, "step": 2236, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.954279 }, { "epoch": 0.07266998018386772, "grad_norm": 0.7613483667373657, "learning_rate": 9.985989341989099e-06, "loss": 0.090381920337677, "memory(GiB)": 19.03, "step": 2237, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.95436 }, { "epoch": 0.07270246564662314, "grad_norm": 0.5534802079200745, "learning_rate": 9.985949129253842e-06, "loss": 0.08884772658348083, "memory(GiB)": 19.03, "step": 2238, "token_acc": 0.9436619718309859, "train_speed(iter/s)": 0.954452 }, { "epoch": 0.07273495110937855, "grad_norm": 0.7867254018783569, "learning_rate": 9.985908858974093e-06, "loss": 0.10244566202163696, "memory(GiB)": 19.03, "step": 2239, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.95454 }, { "epoch": 0.07276743657213397, "grad_norm": 0.7722902297973633, "learning_rate": 9.985868531150322e-06, "loss": 0.103721022605896, "memory(GiB)": 19.03, "step": 2240, "token_acc": 0.9551020408163265, "train_speed(iter/s)": 0.954627 }, { "epoch": 0.07279992203488939, "grad_norm": 0.5666378140449524, "learning_rate": 9.985828145782991e-06, "loss": 0.08298172056674957, "memory(GiB)": 19.03, "step": 2241, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.954713 }, { "epoch": 0.0728324074976448, "grad_norm": 0.6791301965713501, "learning_rate": 9.985787702872567e-06, "loss": 0.087458536028862, "memory(GiB)": 19.03, "step": 2242, "token_acc": 0.967032967032967, "train_speed(iter/s)": 0.954797 }, { "epoch": 0.07286489296040022, "grad_norm": 6.390538215637207, "learning_rate": 9.985747202419517e-06, "loss": 0.09088660776615143, "memory(GiB)": 19.03, "step": 2243, "token_acc": 0.9509433962264151, "train_speed(iter/s)": 0.954879 }, { "epoch": 0.07289737842315563, "grad_norm": 0.839015781879425, "learning_rate": 9.98570664442431e-06, "loss": 0.08235777169466019, "memory(GiB)": 19.03, "step": 2244, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.954953 }, { "epoch": 0.07292986388591105, "grad_norm": 0.7640619277954102, "learning_rate": 9.985666028887408e-06, "loss": 0.0817883163690567, "memory(GiB)": 19.03, "step": 2245, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.955039 }, { "epoch": 0.07296234934866647, "grad_norm": 0.7117531895637512, "learning_rate": 9.985625355809288e-06, "loss": 0.08434141427278519, "memory(GiB)": 19.03, "step": 2246, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.07299483481142188, "grad_norm": 0.7721126079559326, "learning_rate": 9.985584625190415e-06, "loss": 0.0917787104845047, "memory(GiB)": 19.03, "step": 2247, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.955203 }, { "epoch": 0.07302732027417731, "grad_norm": 0.6706045269966125, "learning_rate": 9.985543837031257e-06, "loss": 0.09286805987358093, "memory(GiB)": 19.03, "step": 2248, "token_acc": 0.9467680608365019, "train_speed(iter/s)": 0.955288 }, { "epoch": 0.07305980573693273, "grad_norm": 0.9773775339126587, "learning_rate": 9.98550299133229e-06, "loss": 0.09124286472797394, "memory(GiB)": 19.03, "step": 2249, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.07309229119968814, "grad_norm": 1.1313456296920776, "learning_rate": 9.985462088093982e-06, "loss": 0.09194958955049515, "memory(GiB)": 19.03, "step": 2250, "token_acc": 0.9477911646586346, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.07312477666244356, "grad_norm": 0.7699487209320068, "learning_rate": 9.985421127316806e-06, "loss": 0.08789689838886261, "memory(GiB)": 19.03, "step": 2251, "token_acc": 0.9715639810426541, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.07315726212519898, "grad_norm": 0.9211583733558655, "learning_rate": 9.985380109001234e-06, "loss": 0.09691252559423447, "memory(GiB)": 19.03, "step": 2252, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.07318974758795439, "grad_norm": 0.9247029423713684, "learning_rate": 9.98533903314774e-06, "loss": 0.08045495301485062, "memory(GiB)": 19.03, "step": 2253, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.07322223305070981, "grad_norm": 1.0818241834640503, "learning_rate": 9.985297899756799e-06, "loss": 0.09513242542743683, "memory(GiB)": 19.03, "step": 2254, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.955755 }, { "epoch": 0.07325471851346523, "grad_norm": 1.722193956375122, "learning_rate": 9.985256708828884e-06, "loss": 0.08334611356258392, "memory(GiB)": 19.03, "step": 2255, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.07328720397622064, "grad_norm": 0.53691166639328, "learning_rate": 9.985215460364472e-06, "loss": 0.09118302166461945, "memory(GiB)": 19.03, "step": 2256, "token_acc": 0.954954954954955, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.07331968943897606, "grad_norm": 0.9469232559204102, "learning_rate": 9.985174154364038e-06, "loss": 0.08774576336145401, "memory(GiB)": 19.03, "step": 2257, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 0.955954 }, { "epoch": 0.07335217490173147, "grad_norm": 0.7444537281990051, "learning_rate": 9.985132790828059e-06, "loss": 0.09229488670825958, "memory(GiB)": 19.03, "step": 2258, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.07338466036448689, "grad_norm": 0.8869407176971436, "learning_rate": 9.985091369757011e-06, "loss": 0.09165199846029282, "memory(GiB)": 19.03, "step": 2259, "token_acc": 0.9556451612903226, "train_speed(iter/s)": 0.956091 }, { "epoch": 0.0734171458272423, "grad_norm": 2.0167179107666016, "learning_rate": 9.985049891151376e-06, "loss": 0.0904463455080986, "memory(GiB)": 19.03, "step": 2260, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.956153 }, { "epoch": 0.07344963128999772, "grad_norm": 1.787517786026001, "learning_rate": 9.985008355011629e-06, "loss": 0.08147768676280975, "memory(GiB)": 19.03, "step": 2261, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.07348211675275314, "grad_norm": 0.7617782354354858, "learning_rate": 9.98496676133825e-06, "loss": 0.08801206201314926, "memory(GiB)": 19.03, "step": 2262, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.07351460221550855, "grad_norm": 0.7366697788238525, "learning_rate": 9.98492511013172e-06, "loss": 0.08441674709320068, "memory(GiB)": 19.03, "step": 2263, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.956352 }, { "epoch": 0.07354708767826398, "grad_norm": 0.8824705481529236, "learning_rate": 9.98488340139252e-06, "loss": 0.09003929048776627, "memory(GiB)": 19.03, "step": 2264, "token_acc": 0.9712918660287081, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.0735795731410194, "grad_norm": 0.8452993035316467, "learning_rate": 9.98484163512113e-06, "loss": 0.09346142411231995, "memory(GiB)": 19.03, "step": 2265, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.07361205860377482, "grad_norm": 1.040207028388977, "learning_rate": 9.984799811318034e-06, "loss": 0.09143625199794769, "memory(GiB)": 19.03, "step": 2266, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.07364454406653023, "grad_norm": 0.8418054580688477, "learning_rate": 9.984757929983711e-06, "loss": 0.08525888621807098, "memory(GiB)": 19.03, "step": 2267, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.07367702952928565, "grad_norm": 0.8625137805938721, "learning_rate": 9.984715991118648e-06, "loss": 0.0926496684551239, "memory(GiB)": 19.03, "step": 2268, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.95665 }, { "epoch": 0.07370951499204106, "grad_norm": 0.8107585310935974, "learning_rate": 9.984673994723327e-06, "loss": 0.09848382323980331, "memory(GiB)": 19.03, "step": 2269, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.956706 }, { "epoch": 0.07374200045479648, "grad_norm": 0.9816144108772278, "learning_rate": 9.984631940798235e-06, "loss": 0.08955592662096024, "memory(GiB)": 19.03, "step": 2270, "token_acc": 0.9508196721311475, "train_speed(iter/s)": 0.956776 }, { "epoch": 0.0737744859175519, "grad_norm": 0.7624189257621765, "learning_rate": 9.984589829343854e-06, "loss": 0.08440675586462021, "memory(GiB)": 19.03, "step": 2271, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.07380697138030731, "grad_norm": 0.6717184782028198, "learning_rate": 9.984547660360675e-06, "loss": 0.08318851888179779, "memory(GiB)": 19.03, "step": 2272, "token_acc": 0.9442379182156134, "train_speed(iter/s)": 0.956885 }, { "epoch": 0.07383945684306273, "grad_norm": 1.0165501832962036, "learning_rate": 9.984505433849179e-06, "loss": 0.10502735525369644, "memory(GiB)": 19.03, "step": 2273, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.07387194230581814, "grad_norm": 0.67645263671875, "learning_rate": 9.984463149809857e-06, "loss": 0.08380503952503204, "memory(GiB)": 19.03, "step": 2274, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.07390442776857356, "grad_norm": 1.078870177268982, "learning_rate": 9.984420808243196e-06, "loss": 0.09955176711082458, "memory(GiB)": 19.03, "step": 2275, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.07393691323132898, "grad_norm": 0.8152749538421631, "learning_rate": 9.984378409149684e-06, "loss": 0.08967338502407074, "memory(GiB)": 19.03, "step": 2276, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.07396939869408439, "grad_norm": 0.7460400462150574, "learning_rate": 9.98433595252981e-06, "loss": 0.09007086604833603, "memory(GiB)": 19.03, "step": 2277, "token_acc": 0.94140625, "train_speed(iter/s)": 0.957196 }, { "epoch": 0.07400188415683981, "grad_norm": 0.8280956149101257, "learning_rate": 9.984293438384066e-06, "loss": 0.08773826062679291, "memory(GiB)": 19.03, "step": 2278, "token_acc": 0.9731800766283525, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.07403436961959522, "grad_norm": 1.0875167846679688, "learning_rate": 9.984250866712943e-06, "loss": 0.09416821599006653, "memory(GiB)": 19.03, "step": 2279, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.957317 }, { "epoch": 0.07406685508235065, "grad_norm": 0.9790240526199341, "learning_rate": 9.984208237516927e-06, "loss": 0.09147719293832779, "memory(GiB)": 19.03, "step": 2280, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.957383 }, { "epoch": 0.07409934054510607, "grad_norm": 0.6000155806541443, "learning_rate": 9.984165550796519e-06, "loss": 0.08801262080669403, "memory(GiB)": 19.03, "step": 2281, "token_acc": 0.9495412844036697, "train_speed(iter/s)": 0.957441 }, { "epoch": 0.07413182600786149, "grad_norm": 0.6592977046966553, "learning_rate": 9.984122806552203e-06, "loss": 0.08476297557353973, "memory(GiB)": 19.03, "step": 2282, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.0741643114706169, "grad_norm": 0.9598533511161804, "learning_rate": 9.984080004784478e-06, "loss": 0.10116227716207504, "memory(GiB)": 19.03, "step": 2283, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.07419679693337232, "grad_norm": 0.6639151573181152, "learning_rate": 9.984037145493834e-06, "loss": 0.08814560621976852, "memory(GiB)": 19.03, "step": 2284, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.07422928239612774, "grad_norm": 0.956191897392273, "learning_rate": 9.98399422868077e-06, "loss": 0.08606580644845963, "memory(GiB)": 19.03, "step": 2285, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.957666 }, { "epoch": 0.07426176785888315, "grad_norm": 0.6845441460609436, "learning_rate": 9.983951254345778e-06, "loss": 0.08536702394485474, "memory(GiB)": 19.03, "step": 2286, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.07429425332163857, "grad_norm": 1.4207571744918823, "learning_rate": 9.983908222489354e-06, "loss": 0.10807891190052032, "memory(GiB)": 19.03, "step": 2287, "token_acc": 0.9461883408071748, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.07432673878439398, "grad_norm": 1.4478964805603027, "learning_rate": 9.983865133111995e-06, "loss": 0.08046454191207886, "memory(GiB)": 19.03, "step": 2288, "token_acc": 0.9490740740740741, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.0743592242471494, "grad_norm": 0.7058472633361816, "learning_rate": 9.983821986214201e-06, "loss": 0.09309174120426178, "memory(GiB)": 19.03, "step": 2289, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.957908 }, { "epoch": 0.07439170970990482, "grad_norm": 0.991851270198822, "learning_rate": 9.983778781796468e-06, "loss": 0.0922141820192337, "memory(GiB)": 19.03, "step": 2290, "token_acc": 0.9633507853403142, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.07442419517266023, "grad_norm": 0.6723524332046509, "learning_rate": 9.983735519859293e-06, "loss": 0.08840037137269974, "memory(GiB)": 19.03, "step": 2291, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.07445668063541565, "grad_norm": 0.6783084869384766, "learning_rate": 9.983692200403176e-06, "loss": 0.09173006564378738, "memory(GiB)": 19.03, "step": 2292, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.958109 }, { "epoch": 0.07448916609817106, "grad_norm": 0.7987493276596069, "learning_rate": 9.98364882342862e-06, "loss": 0.09284874051809311, "memory(GiB)": 19.03, "step": 2293, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.958194 }, { "epoch": 0.07452165156092648, "grad_norm": 0.7883319854736328, "learning_rate": 9.983605388936122e-06, "loss": 0.08896889537572861, "memory(GiB)": 19.03, "step": 2294, "token_acc": 0.9688715953307393, "train_speed(iter/s)": 0.958276 }, { "epoch": 0.0745541370236819, "grad_norm": 0.5427132844924927, "learning_rate": 9.983561896926185e-06, "loss": 0.0874178558588028, "memory(GiB)": 19.03, "step": 2295, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.07458662248643733, "grad_norm": 2.060706377029419, "learning_rate": 9.98351834739931e-06, "loss": 0.07289113849401474, "memory(GiB)": 19.03, "step": 2296, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.958426 }, { "epoch": 0.07461910794919274, "grad_norm": 0.9963043332099915, "learning_rate": 9.983474740356e-06, "loss": 0.09061767160892487, "memory(GiB)": 19.03, "step": 2297, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.958512 }, { "epoch": 0.07465159341194816, "grad_norm": 1.0036038160324097, "learning_rate": 9.983431075796762e-06, "loss": 0.08119291812181473, "memory(GiB)": 19.03, "step": 2298, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.958594 }, { "epoch": 0.07468407887470357, "grad_norm": 0.7219527363777161, "learning_rate": 9.983387353722092e-06, "loss": 0.08752806484699249, "memory(GiB)": 19.03, "step": 2299, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.958675 }, { "epoch": 0.07471656433745899, "grad_norm": 0.8596091270446777, "learning_rate": 9.9833435741325e-06, "loss": 0.09918058663606644, "memory(GiB)": 19.03, "step": 2300, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.958759 }, { "epoch": 0.0747490498002144, "grad_norm": 0.7700859308242798, "learning_rate": 9.98329973702849e-06, "loss": 0.07623043656349182, "memory(GiB)": 19.03, "step": 2301, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.958839 }, { "epoch": 0.07478153526296982, "grad_norm": 0.5213108658790588, "learning_rate": 9.983255842410572e-06, "loss": 0.07502593100070953, "memory(GiB)": 19.03, "step": 2302, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958923 }, { "epoch": 0.07481402072572524, "grad_norm": 0.7700233459472656, "learning_rate": 9.983211890279246e-06, "loss": 0.08979514241218567, "memory(GiB)": 19.03, "step": 2303, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.959004 }, { "epoch": 0.07484650618848065, "grad_norm": 0.8053324222564697, "learning_rate": 9.983167880635021e-06, "loss": 0.08548030257225037, "memory(GiB)": 19.03, "step": 2304, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.959084 }, { "epoch": 0.07487899165123607, "grad_norm": 0.8910031318664551, "learning_rate": 9.983123813478409e-06, "loss": 0.0933723896741867, "memory(GiB)": 19.03, "step": 2305, "token_acc": 0.96, "train_speed(iter/s)": 0.959166 }, { "epoch": 0.07491147711399149, "grad_norm": 0.8200814723968506, "learning_rate": 9.983079688809914e-06, "loss": 0.08228404819965363, "memory(GiB)": 19.03, "step": 2306, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95925 }, { "epoch": 0.0749439625767469, "grad_norm": 1.0195921659469604, "learning_rate": 9.983035506630045e-06, "loss": 0.09728285670280457, "memory(GiB)": 19.03, "step": 2307, "token_acc": 0.9607142857142857, "train_speed(iter/s)": 0.959331 }, { "epoch": 0.07497644803950232, "grad_norm": 0.7693842053413391, "learning_rate": 9.982991266939317e-06, "loss": 0.08508709818124771, "memory(GiB)": 19.03, "step": 2308, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.959417 }, { "epoch": 0.07500893350225774, "grad_norm": 1.4146058559417725, "learning_rate": 9.982946969738236e-06, "loss": 0.086153544485569, "memory(GiB)": 19.03, "step": 2309, "token_acc": 0.96875, "train_speed(iter/s)": 0.9595 }, { "epoch": 0.07504141896501315, "grad_norm": 0.9285668134689331, "learning_rate": 9.982902615027314e-06, "loss": 0.10639176517724991, "memory(GiB)": 19.03, "step": 2310, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.959583 }, { "epoch": 0.07507390442776857, "grad_norm": 1.0997511148452759, "learning_rate": 9.982858202807066e-06, "loss": 0.10582193732261658, "memory(GiB)": 19.03, "step": 2311, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.959663 }, { "epoch": 0.075106389890524, "grad_norm": 0.6681762337684631, "learning_rate": 9.982813733078e-06, "loss": 0.07393607497215271, "memory(GiB)": 19.03, "step": 2312, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.959739 }, { "epoch": 0.07513887535327941, "grad_norm": 0.6130222678184509, "learning_rate": 9.982769205840631e-06, "loss": 0.08369697630405426, "memory(GiB)": 19.03, "step": 2313, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.959793 }, { "epoch": 0.07517136081603483, "grad_norm": 2.910210371017456, "learning_rate": 9.982724621095476e-06, "loss": 0.09537028521299362, "memory(GiB)": 19.03, "step": 2314, "token_acc": 0.9706959706959707, "train_speed(iter/s)": 0.95986 }, { "epoch": 0.07520384627879025, "grad_norm": 0.859457790851593, "learning_rate": 9.982679978843047e-06, "loss": 0.0993267223238945, "memory(GiB)": 19.03, "step": 2315, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.959922 }, { "epoch": 0.07523633174154566, "grad_norm": 0.8222335577011108, "learning_rate": 9.982635279083857e-06, "loss": 0.0897761806845665, "memory(GiB)": 19.03, "step": 2316, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.95999 }, { "epoch": 0.07526881720430108, "grad_norm": 0.47271594405174255, "learning_rate": 9.982590521818422e-06, "loss": 0.08831792324781418, "memory(GiB)": 19.03, "step": 2317, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.960056 }, { "epoch": 0.0753013026670565, "grad_norm": 0.48905885219573975, "learning_rate": 9.982545707047264e-06, "loss": 0.07507407665252686, "memory(GiB)": 19.03, "step": 2318, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.960121 }, { "epoch": 0.07533378812981191, "grad_norm": 0.6543373465538025, "learning_rate": 9.982500834770897e-06, "loss": 0.0903581976890564, "memory(GiB)": 19.03, "step": 2319, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.960188 }, { "epoch": 0.07536627359256733, "grad_norm": 0.9068905711174011, "learning_rate": 9.982455904989837e-06, "loss": 0.08279725909233093, "memory(GiB)": 19.03, "step": 2320, "token_acc": 0.9640718562874252, "train_speed(iter/s)": 0.960251 }, { "epoch": 0.07539875905532274, "grad_norm": 0.6354385018348694, "learning_rate": 9.982410917704604e-06, "loss": 0.09525449573993683, "memory(GiB)": 19.03, "step": 2321, "token_acc": 0.9493087557603687, "train_speed(iter/s)": 0.960315 }, { "epoch": 0.07543124451807816, "grad_norm": 0.6202157735824585, "learning_rate": 9.982365872915718e-06, "loss": 0.07626716792583466, "memory(GiB)": 19.03, "step": 2322, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.960375 }, { "epoch": 0.07546372998083357, "grad_norm": 0.8420051336288452, "learning_rate": 9.9823207706237e-06, "loss": 0.08808505535125732, "memory(GiB)": 19.03, "step": 2323, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.960436 }, { "epoch": 0.07549621544358899, "grad_norm": 0.796623706817627, "learning_rate": 9.982275610829066e-06, "loss": 0.09011819958686829, "memory(GiB)": 19.03, "step": 2324, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.960503 }, { "epoch": 0.0755287009063444, "grad_norm": 1.2684510946273804, "learning_rate": 9.982230393532342e-06, "loss": 0.09277836233377457, "memory(GiB)": 19.03, "step": 2325, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.960571 }, { "epoch": 0.07556118636909982, "grad_norm": 0.8296024799346924, "learning_rate": 9.982185118734048e-06, "loss": 0.08699493110179901, "memory(GiB)": 19.03, "step": 2326, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.96064 }, { "epoch": 0.07559367183185524, "grad_norm": 0.6607963442802429, "learning_rate": 9.982139786434705e-06, "loss": 0.08573596179485321, "memory(GiB)": 19.03, "step": 2327, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.960704 }, { "epoch": 0.07562615729461067, "grad_norm": 1.0156230926513672, "learning_rate": 9.982094396634839e-06, "loss": 0.09520766884088516, "memory(GiB)": 19.03, "step": 2328, "token_acc": 0.9557522123893806, "train_speed(iter/s)": 0.960772 }, { "epoch": 0.07565864275736608, "grad_norm": 1.2681994438171387, "learning_rate": 9.982048949334971e-06, "loss": 0.08837535977363586, "memory(GiB)": 19.03, "step": 2329, "token_acc": 0.96, "train_speed(iter/s)": 0.960832 }, { "epoch": 0.0756911282201215, "grad_norm": 0.6859672665596008, "learning_rate": 9.982003444535629e-06, "loss": 0.08514314889907837, "memory(GiB)": 19.03, "step": 2330, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.960886 }, { "epoch": 0.07572361368287692, "grad_norm": 0.7044030427932739, "learning_rate": 9.981957882237335e-06, "loss": 0.09136845171451569, "memory(GiB)": 19.03, "step": 2331, "token_acc": 0.951310861423221, "train_speed(iter/s)": 0.960931 }, { "epoch": 0.07575609914563233, "grad_norm": 0.7692452669143677, "learning_rate": 9.981912262440615e-06, "loss": 0.09315098077058792, "memory(GiB)": 19.03, "step": 2332, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.960983 }, { "epoch": 0.07578858460838775, "grad_norm": 0.9040567278862, "learning_rate": 9.981866585145998e-06, "loss": 0.09376388788223267, "memory(GiB)": 19.03, "step": 2333, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.961042 }, { "epoch": 0.07582107007114317, "grad_norm": 0.7168840169906616, "learning_rate": 9.981820850354009e-06, "loss": 0.09992939233779907, "memory(GiB)": 19.03, "step": 2334, "token_acc": 0.9547325102880658, "train_speed(iter/s)": 0.9611 }, { "epoch": 0.07585355553389858, "grad_norm": 0.6807868480682373, "learning_rate": 9.981775058065177e-06, "loss": 0.08875676989555359, "memory(GiB)": 19.03, "step": 2335, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.961161 }, { "epoch": 0.075886040996654, "grad_norm": 0.9004185199737549, "learning_rate": 9.98172920828003e-06, "loss": 0.08105487376451492, "memory(GiB)": 19.03, "step": 2336, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.961206 }, { "epoch": 0.07591852645940941, "grad_norm": 0.5154204964637756, "learning_rate": 9.981683300999099e-06, "loss": 0.08127855509519577, "memory(GiB)": 19.03, "step": 2337, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.961269 }, { "epoch": 0.07595101192216483, "grad_norm": 0.7639763951301575, "learning_rate": 9.98163733622291e-06, "loss": 0.09434022754430771, "memory(GiB)": 19.03, "step": 2338, "token_acc": 0.9619565217391305, "train_speed(iter/s)": 0.961325 }, { "epoch": 0.07598349738492025, "grad_norm": 0.6565103530883789, "learning_rate": 9.981591313951998e-06, "loss": 0.07832655310630798, "memory(GiB)": 19.03, "step": 2339, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.961391 }, { "epoch": 0.07601598284767566, "grad_norm": 1.0534995794296265, "learning_rate": 9.981545234186889e-06, "loss": 0.09693054854869843, "memory(GiB)": 19.03, "step": 2340, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.961451 }, { "epoch": 0.07604846831043108, "grad_norm": 0.6040367484092712, "learning_rate": 9.981499096928118e-06, "loss": 0.08212514966726303, "memory(GiB)": 19.03, "step": 2341, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.961516 }, { "epoch": 0.0760809537731865, "grad_norm": 0.8437787890434265, "learning_rate": 9.981452902176218e-06, "loss": 0.08825404196977615, "memory(GiB)": 19.03, "step": 2342, "token_acc": 0.9330357142857143, "train_speed(iter/s)": 0.961581 }, { "epoch": 0.07611343923594191, "grad_norm": 0.5512387752532959, "learning_rate": 9.981406649931722e-06, "loss": 0.0790330246090889, "memory(GiB)": 19.03, "step": 2343, "token_acc": 0.9407114624505929, "train_speed(iter/s)": 0.961639 }, { "epoch": 0.07614592469869734, "grad_norm": 0.7944846749305725, "learning_rate": 9.98136034019516e-06, "loss": 0.08094438165426254, "memory(GiB)": 19.03, "step": 2344, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.961704 }, { "epoch": 0.07617841016145276, "grad_norm": 0.7252054810523987, "learning_rate": 9.981313972967073e-06, "loss": 0.08594989031553268, "memory(GiB)": 19.03, "step": 2345, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.961757 }, { "epoch": 0.07621089562420817, "grad_norm": 0.6512803435325623, "learning_rate": 9.981267548247991e-06, "loss": 0.08057133853435516, "memory(GiB)": 19.03, "step": 2346, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.961814 }, { "epoch": 0.07624338108696359, "grad_norm": 0.7419253587722778, "learning_rate": 9.98122106603845e-06, "loss": 0.08743013441562653, "memory(GiB)": 19.03, "step": 2347, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.961875 }, { "epoch": 0.076275866549719, "grad_norm": 0.799351692199707, "learning_rate": 9.98117452633899e-06, "loss": 0.07869810611009598, "memory(GiB)": 19.03, "step": 2348, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.961949 }, { "epoch": 0.07630835201247442, "grad_norm": 1.524869441986084, "learning_rate": 9.981127929150144e-06, "loss": 0.08823879808187485, "memory(GiB)": 19.03, "step": 2349, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.962029 }, { "epoch": 0.07634083747522984, "grad_norm": 2.4341893196105957, "learning_rate": 9.981081274472454e-06, "loss": 0.09499892592430115, "memory(GiB)": 19.03, "step": 2350, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.962115 }, { "epoch": 0.07637332293798525, "grad_norm": 0.950829029083252, "learning_rate": 9.981034562306455e-06, "loss": 0.09019780904054642, "memory(GiB)": 19.03, "step": 2351, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.962193 }, { "epoch": 0.07640580840074067, "grad_norm": 0.646977424621582, "learning_rate": 9.98098779265269e-06, "loss": 0.07782807946205139, "memory(GiB)": 19.03, "step": 2352, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.962275 }, { "epoch": 0.07643829386349608, "grad_norm": 1.5436772108078003, "learning_rate": 9.980940965511693e-06, "loss": 0.0724945068359375, "memory(GiB)": 19.03, "step": 2353, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.962353 }, { "epoch": 0.0764707793262515, "grad_norm": 1.1015503406524658, "learning_rate": 9.98089408088401e-06, "loss": 0.10563838481903076, "memory(GiB)": 19.03, "step": 2354, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.962435 }, { "epoch": 0.07650326478900692, "grad_norm": 0.7644921541213989, "learning_rate": 9.980847138770178e-06, "loss": 0.08874733746051788, "memory(GiB)": 19.03, "step": 2355, "token_acc": 0.9308510638297872, "train_speed(iter/s)": 0.962513 }, { "epoch": 0.07653575025176233, "grad_norm": 1.2896804809570312, "learning_rate": 9.980800139170741e-06, "loss": 0.09276826679706573, "memory(GiB)": 19.03, "step": 2356, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.962594 }, { "epoch": 0.07656823571451775, "grad_norm": 0.8119263648986816, "learning_rate": 9.980753082086242e-06, "loss": 0.0930163711309433, "memory(GiB)": 19.03, "step": 2357, "token_acc": 0.9704797047970479, "train_speed(iter/s)": 0.962664 }, { "epoch": 0.07660072117727316, "grad_norm": 0.6665164828300476, "learning_rate": 9.98070596751722e-06, "loss": 0.08670724928379059, "memory(GiB)": 19.03, "step": 2358, "token_acc": 0.9403669724770642, "train_speed(iter/s)": 0.962743 }, { "epoch": 0.07663320664002858, "grad_norm": 0.719807505607605, "learning_rate": 9.980658795464224e-06, "loss": 0.09166368097066879, "memory(GiB)": 19.03, "step": 2359, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.962823 }, { "epoch": 0.07666569210278401, "grad_norm": 0.8982640504837036, "learning_rate": 9.980611565927795e-06, "loss": 0.091895692050457, "memory(GiB)": 19.03, "step": 2360, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.962906 }, { "epoch": 0.07669817756553943, "grad_norm": 1.110273838043213, "learning_rate": 9.980564278908482e-06, "loss": 0.09849734604358673, "memory(GiB)": 19.03, "step": 2361, "token_acc": 0.9520295202952029, "train_speed(iter/s)": 0.962985 }, { "epoch": 0.07673066302829484, "grad_norm": 0.6057907938957214, "learning_rate": 9.980516934406826e-06, "loss": 0.09043239802122116, "memory(GiB)": 19.03, "step": 2362, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.963068 }, { "epoch": 0.07676314849105026, "grad_norm": 0.7160767912864685, "learning_rate": 9.980469532423376e-06, "loss": 0.08057639002799988, "memory(GiB)": 19.03, "step": 2363, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.96314 }, { "epoch": 0.07679563395380568, "grad_norm": 1.1358795166015625, "learning_rate": 9.98042207295868e-06, "loss": 0.09823673963546753, "memory(GiB)": 19.03, "step": 2364, "token_acc": 0.9407894736842105, "train_speed(iter/s)": 0.96322 }, { "epoch": 0.07682811941656109, "grad_norm": 0.8260435461997986, "learning_rate": 9.980374556013283e-06, "loss": 0.09098070114850998, "memory(GiB)": 19.03, "step": 2365, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.963299 }, { "epoch": 0.07686060487931651, "grad_norm": 0.7341353297233582, "learning_rate": 9.980326981587734e-06, "loss": 0.10027094185352325, "memory(GiB)": 19.03, "step": 2366, "token_acc": 0.948936170212766, "train_speed(iter/s)": 0.963372 }, { "epoch": 0.07689309034207192, "grad_norm": 0.6307118535041809, "learning_rate": 9.980279349682584e-06, "loss": 0.08019773662090302, "memory(GiB)": 19.03, "step": 2367, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.963452 }, { "epoch": 0.07692557580482734, "grad_norm": 0.895309567451477, "learning_rate": 9.98023166029838e-06, "loss": 0.08412398397922516, "memory(GiB)": 19.03, "step": 2368, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.963537 }, { "epoch": 0.07695806126758276, "grad_norm": 3.688023567199707, "learning_rate": 9.980183913435677e-06, "loss": 0.09544117003679276, "memory(GiB)": 19.03, "step": 2369, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.963614 }, { "epoch": 0.07699054673033817, "grad_norm": 0.7147188186645508, "learning_rate": 9.98013610909502e-06, "loss": 0.09072522819042206, "memory(GiB)": 19.03, "step": 2370, "token_acc": 0.9641025641025641, "train_speed(iter/s)": 0.963697 }, { "epoch": 0.07702303219309359, "grad_norm": 0.8960369825363159, "learning_rate": 9.980088247276966e-06, "loss": 0.08577392995357513, "memory(GiB)": 19.03, "step": 2371, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.963772 }, { "epoch": 0.077055517655849, "grad_norm": 0.7403032779693604, "learning_rate": 9.980040327982064e-06, "loss": 0.07927387952804565, "memory(GiB)": 19.03, "step": 2372, "token_acc": 0.9688888888888889, "train_speed(iter/s)": 0.963842 }, { "epoch": 0.07708800311860442, "grad_norm": 0.6671332120895386, "learning_rate": 9.979992351210867e-06, "loss": 0.09111785143613815, "memory(GiB)": 19.03, "step": 2373, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.963904 }, { "epoch": 0.07712048858135984, "grad_norm": 1.0474578142166138, "learning_rate": 9.979944316963931e-06, "loss": 0.09690140187740326, "memory(GiB)": 19.03, "step": 2374, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.963887 }, { "epoch": 0.07715297404411525, "grad_norm": 0.9804646968841553, "learning_rate": 9.97989622524181e-06, "loss": 0.08652149140834808, "memory(GiB)": 19.03, "step": 2375, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.963945 }, { "epoch": 0.07718545950687068, "grad_norm": 0.6102199554443359, "learning_rate": 9.97984807604506e-06, "loss": 0.08283479511737823, "memory(GiB)": 19.03, "step": 2376, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.96401 }, { "epoch": 0.0772179449696261, "grad_norm": 0.6752263903617859, "learning_rate": 9.979799869374232e-06, "loss": 0.08750645071268082, "memory(GiB)": 19.03, "step": 2377, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.964074 }, { "epoch": 0.07725043043238151, "grad_norm": 0.8768412470817566, "learning_rate": 9.979751605229886e-06, "loss": 0.08315866440534592, "memory(GiB)": 19.03, "step": 2378, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.964138 }, { "epoch": 0.07728291589513693, "grad_norm": 0.7131137251853943, "learning_rate": 9.979703283612579e-06, "loss": 0.08801453560590744, "memory(GiB)": 19.03, "step": 2379, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.964203 }, { "epoch": 0.07731540135789235, "grad_norm": 0.6334106922149658, "learning_rate": 9.979654904522867e-06, "loss": 0.0884973406791687, "memory(GiB)": 19.03, "step": 2380, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.964267 }, { "epoch": 0.07734788682064776, "grad_norm": 0.9799544215202332, "learning_rate": 9.979606467961312e-06, "loss": 0.09371689707040787, "memory(GiB)": 19.03, "step": 2381, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.964332 }, { "epoch": 0.07738037228340318, "grad_norm": 0.9001302719116211, "learning_rate": 9.979557973928468e-06, "loss": 0.08919326215982437, "memory(GiB)": 19.03, "step": 2382, "token_acc": 0.9369747899159664, "train_speed(iter/s)": 0.964391 }, { "epoch": 0.0774128577461586, "grad_norm": 0.6527213454246521, "learning_rate": 9.979509422424898e-06, "loss": 0.0889388918876648, "memory(GiB)": 19.03, "step": 2383, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.964458 }, { "epoch": 0.07744534320891401, "grad_norm": 0.6260448694229126, "learning_rate": 9.979460813451161e-06, "loss": 0.08005627989768982, "memory(GiB)": 19.03, "step": 2384, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.964522 }, { "epoch": 0.07747782867166943, "grad_norm": 0.6152176856994629, "learning_rate": 9.97941214700782e-06, "loss": 0.08824262022972107, "memory(GiB)": 19.03, "step": 2385, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.964588 }, { "epoch": 0.07751031413442484, "grad_norm": 0.6638330221176147, "learning_rate": 9.979363423095434e-06, "loss": 0.09150990843772888, "memory(GiB)": 19.03, "step": 2386, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.964654 }, { "epoch": 0.07754279959718026, "grad_norm": 0.7432165741920471, "learning_rate": 9.979314641714568e-06, "loss": 0.08622948080301285, "memory(GiB)": 19.03, "step": 2387, "token_acc": 0.9528985507246377, "train_speed(iter/s)": 0.96472 }, { "epoch": 0.07757528505993568, "grad_norm": 1.061033844947815, "learning_rate": 9.979265802865782e-06, "loss": 0.0805772989988327, "memory(GiB)": 19.03, "step": 2388, "token_acc": 0.9621621621621622, "train_speed(iter/s)": 0.964757 }, { "epoch": 0.07760777052269109, "grad_norm": 1.040858268737793, "learning_rate": 9.979216906549642e-06, "loss": 0.08384191244840622, "memory(GiB)": 19.03, "step": 2389, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.964791 }, { "epoch": 0.07764025598544651, "grad_norm": 0.831661581993103, "learning_rate": 9.97916795276671e-06, "loss": 0.08343202620744705, "memory(GiB)": 19.03, "step": 2390, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.964836 }, { "epoch": 0.07767274144820192, "grad_norm": 0.7689361572265625, "learning_rate": 9.979118941517556e-06, "loss": 0.09343266487121582, "memory(GiB)": 19.03, "step": 2391, "token_acc": 0.9560975609756097, "train_speed(iter/s)": 0.964881 }, { "epoch": 0.07770522691095735, "grad_norm": 0.7696644067764282, "learning_rate": 9.97906987280274e-06, "loss": 0.08758348226547241, "memory(GiB)": 19.03, "step": 2392, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.964939 }, { "epoch": 0.07773771237371277, "grad_norm": 0.634218156337738, "learning_rate": 9.979020746622831e-06, "loss": 0.08804789185523987, "memory(GiB)": 19.03, "step": 2393, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.964999 }, { "epoch": 0.07777019783646819, "grad_norm": 0.6783912181854248, "learning_rate": 9.978971562978395e-06, "loss": 0.09015955030918121, "memory(GiB)": 19.03, "step": 2394, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.965062 }, { "epoch": 0.0778026832992236, "grad_norm": 0.9552816152572632, "learning_rate": 9.97892232187e-06, "loss": 0.09080219268798828, "memory(GiB)": 19.03, "step": 2395, "token_acc": 0.96484375, "train_speed(iter/s)": 0.965125 }, { "epoch": 0.07783516876197902, "grad_norm": 0.8295803070068359, "learning_rate": 9.978873023298215e-06, "loss": 0.0817522183060646, "memory(GiB)": 19.03, "step": 2396, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.965187 }, { "epoch": 0.07786765422473443, "grad_norm": 0.9857364296913147, "learning_rate": 9.978823667263609e-06, "loss": 0.08185654878616333, "memory(GiB)": 19.03, "step": 2397, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.965238 }, { "epoch": 0.07790013968748985, "grad_norm": 0.6156433820724487, "learning_rate": 9.97877425376675e-06, "loss": 0.0886150449514389, "memory(GiB)": 19.03, "step": 2398, "token_acc": 0.9665071770334929, "train_speed(iter/s)": 0.965302 }, { "epoch": 0.07793262515024527, "grad_norm": 0.8466494083404541, "learning_rate": 9.97872478280821e-06, "loss": 0.0927450954914093, "memory(GiB)": 19.03, "step": 2399, "token_acc": 0.9553571428571429, "train_speed(iter/s)": 0.965362 }, { "epoch": 0.07796511061300068, "grad_norm": 0.5990666747093201, "learning_rate": 9.97867525438856e-06, "loss": 0.08620832860469818, "memory(GiB)": 19.03, "step": 2400, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.965421 }, { "epoch": 0.0779975960757561, "grad_norm": 0.7187096476554871, "learning_rate": 9.97862566850837e-06, "loss": 0.08863073587417603, "memory(GiB)": 19.03, "step": 2401, "token_acc": 0.9647577092511013, "train_speed(iter/s)": 0.965474 }, { "epoch": 0.07803008153851151, "grad_norm": 0.6124057769775391, "learning_rate": 9.978576025168212e-06, "loss": 0.08214080333709717, "memory(GiB)": 19.03, "step": 2402, "token_acc": 0.9593908629441624, "train_speed(iter/s)": 0.965531 }, { "epoch": 0.07806256700126693, "grad_norm": 0.8569777011871338, "learning_rate": 9.978526324368662e-06, "loss": 0.08783911168575287, "memory(GiB)": 19.03, "step": 2403, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.96559 }, { "epoch": 0.07809505246402235, "grad_norm": 0.6600889563560486, "learning_rate": 9.978476566110291e-06, "loss": 0.07482828199863434, "memory(GiB)": 19.03, "step": 2404, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.965655 }, { "epoch": 0.07812753792677776, "grad_norm": 0.9085114598274231, "learning_rate": 9.978426750393675e-06, "loss": 0.0788489580154419, "memory(GiB)": 19.03, "step": 2405, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.96571 }, { "epoch": 0.07816002338953318, "grad_norm": 0.7070927023887634, "learning_rate": 9.978376877219387e-06, "loss": 0.09373053908348083, "memory(GiB)": 19.03, "step": 2406, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.965785 }, { "epoch": 0.0781925088522886, "grad_norm": 0.6724722981452942, "learning_rate": 9.978326946588004e-06, "loss": 0.0856209322810173, "memory(GiB)": 19.03, "step": 2407, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.965853 }, { "epoch": 0.07822499431504402, "grad_norm": 1.4107661247253418, "learning_rate": 9.978276958500105e-06, "loss": 0.07936863601207733, "memory(GiB)": 19.03, "step": 2408, "token_acc": 0.953125, "train_speed(iter/s)": 0.965929 }, { "epoch": 0.07825747977779944, "grad_norm": 0.763042688369751, "learning_rate": 9.97822691295626e-06, "loss": 0.08928479254245758, "memory(GiB)": 19.03, "step": 2409, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.966005 }, { "epoch": 0.07828996524055486, "grad_norm": 0.9208130836486816, "learning_rate": 9.97817680995705e-06, "loss": 0.09264370799064636, "memory(GiB)": 19.03, "step": 2410, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.96608 }, { "epoch": 0.07832245070331027, "grad_norm": 0.7175596952438354, "learning_rate": 9.978126649503054e-06, "loss": 0.07371307909488678, "memory(GiB)": 19.03, "step": 2411, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.966158 }, { "epoch": 0.07835493616606569, "grad_norm": 0.6781087517738342, "learning_rate": 9.978076431594853e-06, "loss": 0.07861894369125366, "memory(GiB)": 19.03, "step": 2412, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.96623 }, { "epoch": 0.0783874216288211, "grad_norm": 0.9245063662528992, "learning_rate": 9.978026156233023e-06, "loss": 0.07958079874515533, "memory(GiB)": 19.03, "step": 2413, "token_acc": 0.96, "train_speed(iter/s)": 0.966306 }, { "epoch": 0.07841990709157652, "grad_norm": 1.0491880178451538, "learning_rate": 9.977975823418145e-06, "loss": 0.09768374264240265, "memory(GiB)": 19.03, "step": 2414, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.966383 }, { "epoch": 0.07845239255433194, "grad_norm": 0.8253234624862671, "learning_rate": 9.9779254331508e-06, "loss": 0.081407330930233, "memory(GiB)": 19.03, "step": 2415, "token_acc": 0.9660194174757282, "train_speed(iter/s)": 0.966459 }, { "epoch": 0.07848487801708735, "grad_norm": 0.8990654945373535, "learning_rate": 9.97787498543157e-06, "loss": 0.07583751529455185, "memory(GiB)": 19.03, "step": 2416, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.966538 }, { "epoch": 0.07851736347984277, "grad_norm": 0.8191389441490173, "learning_rate": 9.977824480261037e-06, "loss": 0.09051991254091263, "memory(GiB)": 19.03, "step": 2417, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.966618 }, { "epoch": 0.07854984894259819, "grad_norm": 0.8027973771095276, "learning_rate": 9.977773917639787e-06, "loss": 0.07757943868637085, "memory(GiB)": 19.03, "step": 2418, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.966697 }, { "epoch": 0.0785823344053536, "grad_norm": 2.0361599922180176, "learning_rate": 9.977723297568398e-06, "loss": 0.0930439680814743, "memory(GiB)": 19.03, "step": 2419, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.966774 }, { "epoch": 0.07861481986810902, "grad_norm": 0.9917873740196228, "learning_rate": 9.977672620047456e-06, "loss": 0.08540048450231552, "memory(GiB)": 19.03, "step": 2420, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.966851 }, { "epoch": 0.07864730533086443, "grad_norm": 0.8191654086112976, "learning_rate": 9.977621885077547e-06, "loss": 0.09131309390068054, "memory(GiB)": 19.03, "step": 2421, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.966927 }, { "epoch": 0.07867979079361985, "grad_norm": 0.7749219536781311, "learning_rate": 9.977571092659256e-06, "loss": 0.08692830055952072, "memory(GiB)": 19.03, "step": 2422, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.96701 }, { "epoch": 0.07871227625637527, "grad_norm": 0.8465043902397156, "learning_rate": 9.977520242793172e-06, "loss": 0.09272095561027527, "memory(GiB)": 19.03, "step": 2423, "token_acc": 0.9522058823529411, "train_speed(iter/s)": 0.967079 }, { "epoch": 0.0787447617191307, "grad_norm": 2.04961895942688, "learning_rate": 9.977469335479876e-06, "loss": 0.09514767676591873, "memory(GiB)": 19.03, "step": 2424, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.967152 }, { "epoch": 0.07877724718188611, "grad_norm": 0.4839288592338562, "learning_rate": 9.977418370719963e-06, "loss": 0.08786974847316742, "memory(GiB)": 19.03, "step": 2425, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.967226 }, { "epoch": 0.07880973264464153, "grad_norm": 1.0711803436279297, "learning_rate": 9.977367348514014e-06, "loss": 0.08909184485673904, "memory(GiB)": 19.03, "step": 2426, "token_acc": 0.964, "train_speed(iter/s)": 0.967298 }, { "epoch": 0.07884221810739694, "grad_norm": 0.7668153643608093, "learning_rate": 9.977316268862622e-06, "loss": 0.09054496884346008, "memory(GiB)": 19.03, "step": 2427, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.967372 }, { "epoch": 0.07887470357015236, "grad_norm": 0.6492622494697571, "learning_rate": 9.977265131766374e-06, "loss": 0.09663684666156769, "memory(GiB)": 19.03, "step": 2428, "token_acc": 0.9271523178807947, "train_speed(iter/s)": 0.967437 }, { "epoch": 0.07890718903290778, "grad_norm": 1.064974308013916, "learning_rate": 9.977213937225863e-06, "loss": 0.08188037574291229, "memory(GiB)": 19.03, "step": 2429, "token_acc": 0.9633699633699634, "train_speed(iter/s)": 0.967517 }, { "epoch": 0.07893967449566319, "grad_norm": 0.6018089652061462, "learning_rate": 9.97716268524168e-06, "loss": 0.08323021978139877, "memory(GiB)": 19.03, "step": 2430, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.967593 }, { "epoch": 0.07897215995841861, "grad_norm": 0.5234221816062927, "learning_rate": 9.977111375814412e-06, "loss": 0.07705414295196533, "memory(GiB)": 19.03, "step": 2431, "token_acc": 0.96, "train_speed(iter/s)": 0.96765 }, { "epoch": 0.07900464542117402, "grad_norm": 0.7185481786727905, "learning_rate": 9.977060008944655e-06, "loss": 0.08355129510164261, "memory(GiB)": 19.03, "step": 2432, "token_acc": 0.972, "train_speed(iter/s)": 0.967704 }, { "epoch": 0.07903713088392944, "grad_norm": 0.6317185759544373, "learning_rate": 9.977008584633e-06, "loss": 0.07827122509479523, "memory(GiB)": 19.03, "step": 2433, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.967763 }, { "epoch": 0.07906961634668486, "grad_norm": 0.5829789638519287, "learning_rate": 9.976957102880044e-06, "loss": 0.09457459300756454, "memory(GiB)": 19.03, "step": 2434, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.967824 }, { "epoch": 0.07910210180944027, "grad_norm": 0.577201247215271, "learning_rate": 9.976905563686379e-06, "loss": 0.09624660015106201, "memory(GiB)": 19.03, "step": 2435, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.967889 }, { "epoch": 0.07913458727219569, "grad_norm": 0.7170349359512329, "learning_rate": 9.976853967052597e-06, "loss": 0.08781111985445023, "memory(GiB)": 19.03, "step": 2436, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.96795 }, { "epoch": 0.0791670727349511, "grad_norm": 1.7112321853637695, "learning_rate": 9.976802312979299e-06, "loss": 0.08750182390213013, "memory(GiB)": 19.03, "step": 2437, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.968014 }, { "epoch": 0.07919955819770652, "grad_norm": 0.6242662668228149, "learning_rate": 9.976750601467075e-06, "loss": 0.07652169466018677, "memory(GiB)": 19.03, "step": 2438, "token_acc": 0.9735849056603774, "train_speed(iter/s)": 0.96807 }, { "epoch": 0.07923204366046194, "grad_norm": 0.5827071070671082, "learning_rate": 9.976698832516527e-06, "loss": 0.08348537981510162, "memory(GiB)": 19.03, "step": 2439, "token_acc": 0.9632352941176471, "train_speed(iter/s)": 0.968132 }, { "epoch": 0.07926452912321737, "grad_norm": 0.6394525766372681, "learning_rate": 9.97664700612825e-06, "loss": 0.08404514193534851, "memory(GiB)": 19.03, "step": 2440, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.968188 }, { "epoch": 0.07929701458597278, "grad_norm": 0.7699972987174988, "learning_rate": 9.976595122302843e-06, "loss": 0.08231617510318756, "memory(GiB)": 19.03, "step": 2441, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.968246 }, { "epoch": 0.0793295000487282, "grad_norm": 1.171790599822998, "learning_rate": 9.976543181040905e-06, "loss": 0.08265578001737595, "memory(GiB)": 19.03, "step": 2442, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 0.968293 }, { "epoch": 0.07936198551148362, "grad_norm": 0.6131710410118103, "learning_rate": 9.976491182343033e-06, "loss": 0.07657009363174438, "memory(GiB)": 19.03, "step": 2443, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.968351 }, { "epoch": 0.07939447097423903, "grad_norm": 0.9988771080970764, "learning_rate": 9.97643912620983e-06, "loss": 0.1051192432641983, "memory(GiB)": 19.03, "step": 2444, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.968416 }, { "epoch": 0.07942695643699445, "grad_norm": 0.6906008720397949, "learning_rate": 9.976387012641897e-06, "loss": 0.08078354597091675, "memory(GiB)": 19.03, "step": 2445, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.968472 }, { "epoch": 0.07945944189974986, "grad_norm": 0.639553427696228, "learning_rate": 9.976334841639832e-06, "loss": 0.08319368958473206, "memory(GiB)": 19.03, "step": 2446, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.968528 }, { "epoch": 0.07949192736250528, "grad_norm": 0.8194611668586731, "learning_rate": 9.97628261320424e-06, "loss": 0.08011627942323685, "memory(GiB)": 19.03, "step": 2447, "token_acc": 0.9571984435797666, "train_speed(iter/s)": 0.968583 }, { "epoch": 0.0795244128252607, "grad_norm": 0.7256738543510437, "learning_rate": 9.976230327335725e-06, "loss": 0.08621636033058167, "memory(GiB)": 19.03, "step": 2448, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.968623 }, { "epoch": 0.07955689828801611, "grad_norm": 1.1249356269836426, "learning_rate": 9.976177984034887e-06, "loss": 0.08727636933326721, "memory(GiB)": 19.03, "step": 2449, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.968663 }, { "epoch": 0.07958938375077153, "grad_norm": 0.7521206736564636, "learning_rate": 9.976125583302333e-06, "loss": 0.08620887994766235, "memory(GiB)": 19.03, "step": 2450, "token_acc": 0.9554655870445344, "train_speed(iter/s)": 0.968718 }, { "epoch": 0.07962186921352694, "grad_norm": 1.0183566808700562, "learning_rate": 9.976073125138666e-06, "loss": 0.08979624509811401, "memory(GiB)": 19.03, "step": 2451, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.968769 }, { "epoch": 0.07965435467628236, "grad_norm": 1.245913028717041, "learning_rate": 9.976020609544493e-06, "loss": 0.10835575312376022, "memory(GiB)": 19.03, "step": 2452, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.96883 }, { "epoch": 0.07968684013903778, "grad_norm": 0.6657974720001221, "learning_rate": 9.975968036520418e-06, "loss": 0.08602295070886612, "memory(GiB)": 19.03, "step": 2453, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.968889 }, { "epoch": 0.07971932560179319, "grad_norm": 0.6257211565971375, "learning_rate": 9.97591540606705e-06, "loss": 0.08222110569477081, "memory(GiB)": 19.03, "step": 2454, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.96895 }, { "epoch": 0.07975181106454861, "grad_norm": 1.042253851890564, "learning_rate": 9.975862718184992e-06, "loss": 0.08503258228302002, "memory(GiB)": 19.03, "step": 2455, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.968998 }, { "epoch": 0.07978429652730404, "grad_norm": 0.5482000112533569, "learning_rate": 9.975809972874858e-06, "loss": 0.07898584008216858, "memory(GiB)": 19.03, "step": 2456, "token_acc": 0.9539170506912442, "train_speed(iter/s)": 0.969055 }, { "epoch": 0.07981678199005945, "grad_norm": 0.699492871761322, "learning_rate": 9.975757170137255e-06, "loss": 0.09005390852689743, "memory(GiB)": 19.03, "step": 2457, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.969113 }, { "epoch": 0.07984926745281487, "grad_norm": 0.5926472544670105, "learning_rate": 9.975704309972789e-06, "loss": 0.08648562431335449, "memory(GiB)": 19.03, "step": 2458, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.969174 }, { "epoch": 0.07988175291557029, "grad_norm": 0.530603289604187, "learning_rate": 9.975651392382074e-06, "loss": 0.08341687172651291, "memory(GiB)": 19.03, "step": 2459, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.96923 }, { "epoch": 0.0799142383783257, "grad_norm": 0.6865925788879395, "learning_rate": 9.97559841736572e-06, "loss": 0.0898025631904602, "memory(GiB)": 19.03, "step": 2460, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.969277 }, { "epoch": 0.07994672384108112, "grad_norm": 0.9242945313453674, "learning_rate": 9.975545384924337e-06, "loss": 0.08351834118366241, "memory(GiB)": 19.03, "step": 2461, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.969334 }, { "epoch": 0.07997920930383653, "grad_norm": 0.6214008927345276, "learning_rate": 9.975492295058538e-06, "loss": 0.08079898357391357, "memory(GiB)": 19.03, "step": 2462, "token_acc": 0.9566929133858267, "train_speed(iter/s)": 0.969409 }, { "epoch": 0.08001169476659195, "grad_norm": 0.7623364925384521, "learning_rate": 9.975439147768936e-06, "loss": 0.09187982976436615, "memory(GiB)": 19.03, "step": 2463, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.969487 }, { "epoch": 0.08004418022934737, "grad_norm": 0.7631397247314453, "learning_rate": 9.975385943056141e-06, "loss": 0.08630837500095367, "memory(GiB)": 19.03, "step": 2464, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.969563 }, { "epoch": 0.08007666569210278, "grad_norm": 0.8234983682632446, "learning_rate": 9.975332680920772e-06, "loss": 0.07512276619672775, "memory(GiB)": 19.03, "step": 2465, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.969628 }, { "epoch": 0.0801091511548582, "grad_norm": 0.6168221235275269, "learning_rate": 9.975279361363444e-06, "loss": 0.09003226459026337, "memory(GiB)": 19.03, "step": 2466, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.969701 }, { "epoch": 0.08014163661761362, "grad_norm": 0.7846832275390625, "learning_rate": 9.975225984384768e-06, "loss": 0.07804057002067566, "memory(GiB)": 19.03, "step": 2467, "token_acc": 0.976878612716763, "train_speed(iter/s)": 0.969776 }, { "epoch": 0.08017412208036903, "grad_norm": 0.6948996186256409, "learning_rate": 9.975172549985361e-06, "loss": 0.07740208506584167, "memory(GiB)": 19.03, "step": 2468, "token_acc": 0.9612068965517241, "train_speed(iter/s)": 0.969852 }, { "epoch": 0.08020660754312445, "grad_norm": 0.6519885063171387, "learning_rate": 9.975119058165843e-06, "loss": 0.08090018481016159, "memory(GiB)": 19.03, "step": 2469, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.969925 }, { "epoch": 0.08023909300587986, "grad_norm": 0.5953985452651978, "learning_rate": 9.975065508926829e-06, "loss": 0.07464385032653809, "memory(GiB)": 19.03, "step": 2470, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.969996 }, { "epoch": 0.08027157846863528, "grad_norm": 0.5486710071563721, "learning_rate": 9.975011902268937e-06, "loss": 0.07645764946937561, "memory(GiB)": 19.03, "step": 2471, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.970073 }, { "epoch": 0.08030406393139071, "grad_norm": 0.8183301687240601, "learning_rate": 9.974958238192785e-06, "loss": 0.09815624356269836, "memory(GiB)": 19.03, "step": 2472, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.970142 }, { "epoch": 0.08033654939414613, "grad_norm": 0.759232223033905, "learning_rate": 9.974904516698993e-06, "loss": 0.08397895097732544, "memory(GiB)": 19.03, "step": 2473, "token_acc": 0.946236559139785, "train_speed(iter/s)": 0.970214 }, { "epoch": 0.08036903485690154, "grad_norm": 2.0053975582122803, "learning_rate": 9.974850737788182e-06, "loss": 0.09460464119911194, "memory(GiB)": 19.03, "step": 2474, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.970288 }, { "epoch": 0.08040152031965696, "grad_norm": 0.7851563692092896, "learning_rate": 9.974796901460973e-06, "loss": 0.10073128342628479, "memory(GiB)": 19.03, "step": 2475, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.970356 }, { "epoch": 0.08043400578241237, "grad_norm": 0.7118868827819824, "learning_rate": 9.974743007717983e-06, "loss": 0.0850856751203537, "memory(GiB)": 19.03, "step": 2476, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.970432 }, { "epoch": 0.08046649124516779, "grad_norm": 1.4351422786712646, "learning_rate": 9.97468905655984e-06, "loss": 0.08135702461004257, "memory(GiB)": 19.03, "step": 2477, "token_acc": 0.9537366548042705, "train_speed(iter/s)": 0.970506 }, { "epoch": 0.0804989767079232, "grad_norm": 0.6427842378616333, "learning_rate": 9.974635047987164e-06, "loss": 0.0847126692533493, "memory(GiB)": 19.03, "step": 2478, "token_acc": 0.9598393574297188, "train_speed(iter/s)": 0.970579 }, { "epoch": 0.08053146217067862, "grad_norm": 0.9829986095428467, "learning_rate": 9.97458098200058e-06, "loss": 0.09055396914482117, "memory(GiB)": 19.03, "step": 2479, "token_acc": 0.975, "train_speed(iter/s)": 0.970653 }, { "epoch": 0.08056394763343404, "grad_norm": 0.8603424429893494, "learning_rate": 9.974526858600705e-06, "loss": 0.07723745703697205, "memory(GiB)": 19.03, "step": 2480, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.970724 }, { "epoch": 0.08059643309618945, "grad_norm": 0.8948754668235779, "learning_rate": 9.974472677788174e-06, "loss": 0.08816203474998474, "memory(GiB)": 19.03, "step": 2481, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.9708 }, { "epoch": 0.08062891855894487, "grad_norm": 0.8052870035171509, "learning_rate": 9.974418439563606e-06, "loss": 0.07306023687124252, "memory(GiB)": 19.03, "step": 2482, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.970875 }, { "epoch": 0.08066140402170029, "grad_norm": 0.5998266339302063, "learning_rate": 9.974364143927627e-06, "loss": 0.08424581587314606, "memory(GiB)": 19.03, "step": 2483, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.970953 }, { "epoch": 0.0806938894844557, "grad_norm": 1.7190757989883423, "learning_rate": 9.974309790880867e-06, "loss": 0.08398495614528656, "memory(GiB)": 19.03, "step": 2484, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.971025 }, { "epoch": 0.08072637494721112, "grad_norm": 0.9361393451690674, "learning_rate": 9.97425538042395e-06, "loss": 0.08917238563299179, "memory(GiB)": 19.03, "step": 2485, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.9711 }, { "epoch": 0.08075886040996653, "grad_norm": 0.9965783357620239, "learning_rate": 9.974200912557504e-06, "loss": 0.08410167694091797, "memory(GiB)": 19.03, "step": 2486, "token_acc": 0.9507389162561576, "train_speed(iter/s)": 0.971177 }, { "epoch": 0.08079134587272195, "grad_norm": 0.7874189019203186, "learning_rate": 9.97414638728216e-06, "loss": 0.0934533178806305, "memory(GiB)": 19.03, "step": 2487, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.97125 }, { "epoch": 0.08082383133547738, "grad_norm": 0.8197523951530457, "learning_rate": 9.974091804598545e-06, "loss": 0.09151874482631683, "memory(GiB)": 19.03, "step": 2488, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.971323 }, { "epoch": 0.0808563167982328, "grad_norm": 0.7078588604927063, "learning_rate": 9.974037164507293e-06, "loss": 0.07014675438404083, "memory(GiB)": 19.03, "step": 2489, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.971396 }, { "epoch": 0.08088880226098821, "grad_norm": 2.5300071239471436, "learning_rate": 9.973982467009029e-06, "loss": 0.07451482117176056, "memory(GiB)": 19.03, "step": 2490, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.971457 }, { "epoch": 0.08092128772374363, "grad_norm": 0.7167807221412659, "learning_rate": 9.973927712104388e-06, "loss": 0.082495778799057, "memory(GiB)": 19.03, "step": 2491, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.971516 }, { "epoch": 0.08095377318649905, "grad_norm": 0.5523871183395386, "learning_rate": 9.973872899794e-06, "loss": 0.06875412166118622, "memory(GiB)": 19.03, "step": 2492, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.971577 }, { "epoch": 0.08098625864925446, "grad_norm": 0.5877563953399658, "learning_rate": 9.9738180300785e-06, "loss": 0.07286401838064194, "memory(GiB)": 19.03, "step": 2493, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.971633 }, { "epoch": 0.08101874411200988, "grad_norm": 0.8965755105018616, "learning_rate": 9.973763102958518e-06, "loss": 0.08245465904474258, "memory(GiB)": 19.03, "step": 2494, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.97169 }, { "epoch": 0.0810512295747653, "grad_norm": 1.0815017223358154, "learning_rate": 9.97370811843469e-06, "loss": 0.08257019519805908, "memory(GiB)": 19.03, "step": 2495, "token_acc": 0.94, "train_speed(iter/s)": 0.971743 }, { "epoch": 0.08108371503752071, "grad_norm": 0.6763813495635986, "learning_rate": 9.973653076507652e-06, "loss": 0.08987686038017273, "memory(GiB)": 19.03, "step": 2496, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.971795 }, { "epoch": 0.08111620050027613, "grad_norm": 0.6063116788864136, "learning_rate": 9.973597977178037e-06, "loss": 0.08705811202526093, "memory(GiB)": 19.03, "step": 2497, "token_acc": 0.9567099567099567, "train_speed(iter/s)": 0.971851 }, { "epoch": 0.08114868596303154, "grad_norm": 0.8399289846420288, "learning_rate": 9.97354282044648e-06, "loss": 0.0835312008857727, "memory(GiB)": 19.03, "step": 2498, "token_acc": 0.9710982658959537, "train_speed(iter/s)": 0.971909 }, { "epoch": 0.08118117142578696, "grad_norm": 0.8837868571281433, "learning_rate": 9.97348760631362e-06, "loss": 0.10056497156620026, "memory(GiB)": 19.03, "step": 2499, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.971966 }, { "epoch": 0.08121365688854237, "grad_norm": 0.6677971482276917, "learning_rate": 9.973432334780095e-06, "loss": 0.07976683229207993, "memory(GiB)": 19.03, "step": 2500, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.972018 }, { "epoch": 0.08121365688854237, "eval_loss": 0.08685367554426193, "eval_runtime": 80.2117, "eval_samples_per_second": 124.047, "eval_steps_per_second": 3.877, "eval_token_acc": 0.964698129623814, "step": 2500 }, { "epoch": 0.08124614235129779, "grad_norm": 0.9885827898979187, "learning_rate": 9.97337700584654e-06, "loss": 0.07737167179584503, "memory(GiB)": 19.03, "step": 2501, "token_acc": 0.9650115030674846, "train_speed(iter/s)": 0.938977 }, { "epoch": 0.0812786278140532, "grad_norm": 0.5990250110626221, "learning_rate": 9.973321619513595e-06, "loss": 0.0819665938615799, "memory(GiB)": 19.03, "step": 2502, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.93904 }, { "epoch": 0.08131111327680862, "grad_norm": 0.5941383838653564, "learning_rate": 9.973266175781898e-06, "loss": 0.08425220847129822, "memory(GiB)": 19.03, "step": 2503, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.939094 }, { "epoch": 0.08134359873956405, "grad_norm": 0.8824098110198975, "learning_rate": 9.97321067465209e-06, "loss": 0.07888403534889221, "memory(GiB)": 19.03, "step": 2504, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.939157 }, { "epoch": 0.08137608420231947, "grad_norm": 1.1548315286636353, "learning_rate": 9.973155116124814e-06, "loss": 0.09823019057512283, "memory(GiB)": 19.03, "step": 2505, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.939211 }, { "epoch": 0.08140856966507488, "grad_norm": 0.580597460269928, "learning_rate": 9.973099500200706e-06, "loss": 0.08116397261619568, "memory(GiB)": 19.03, "step": 2506, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.939272 }, { "epoch": 0.0814410551278303, "grad_norm": 1.123109221458435, "learning_rate": 9.97304382688041e-06, "loss": 0.08970879763364792, "memory(GiB)": 19.03, "step": 2507, "token_acc": 0.975, "train_speed(iter/s)": 0.939345 }, { "epoch": 0.08147354059058572, "grad_norm": 0.6199605464935303, "learning_rate": 9.97298809616457e-06, "loss": 0.07928690314292908, "memory(GiB)": 19.03, "step": 2508, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.939402 }, { "epoch": 0.08150602605334113, "grad_norm": 0.8918476700782776, "learning_rate": 9.972932308053831e-06, "loss": 0.09790872037410736, "memory(GiB)": 19.03, "step": 2509, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.939453 }, { "epoch": 0.08153851151609655, "grad_norm": 0.6087450981140137, "learning_rate": 9.972876462548831e-06, "loss": 0.09190620481967926, "memory(GiB)": 19.03, "step": 2510, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.939513 }, { "epoch": 0.08157099697885196, "grad_norm": 0.7469656467437744, "learning_rate": 9.97282055965022e-06, "loss": 0.08044842630624771, "memory(GiB)": 19.03, "step": 2511, "token_acc": 0.9417040358744395, "train_speed(iter/s)": 0.939577 }, { "epoch": 0.08160348244160738, "grad_norm": 1.0298999547958374, "learning_rate": 9.972764599358638e-06, "loss": 0.08266548812389374, "memory(GiB)": 19.03, "step": 2512, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.939636 }, { "epoch": 0.0816359679043628, "grad_norm": 0.9817577004432678, "learning_rate": 9.972708581674737e-06, "loss": 0.08007313311100006, "memory(GiB)": 19.03, "step": 2513, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.939699 }, { "epoch": 0.08166845336711821, "grad_norm": 0.6270835995674133, "learning_rate": 9.97265250659916e-06, "loss": 0.08287997543811798, "memory(GiB)": 19.03, "step": 2514, "token_acc": 0.9573643410852714, "train_speed(iter/s)": 0.939763 }, { "epoch": 0.08170093882987363, "grad_norm": 0.8180480003356934, "learning_rate": 9.972596374132551e-06, "loss": 0.09205232560634613, "memory(GiB)": 19.03, "step": 2515, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.939821 }, { "epoch": 0.08173342429262904, "grad_norm": 0.7003759741783142, "learning_rate": 9.972540184275564e-06, "loss": 0.09359030425548553, "memory(GiB)": 19.03, "step": 2516, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.939884 }, { "epoch": 0.08176590975538446, "grad_norm": 0.7048999071121216, "learning_rate": 9.972483937028844e-06, "loss": 0.08759421110153198, "memory(GiB)": 19.03, "step": 2517, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.939949 }, { "epoch": 0.08179839521813988, "grad_norm": 0.7177832722663879, "learning_rate": 9.972427632393042e-06, "loss": 0.07670068740844727, "memory(GiB)": 19.03, "step": 2518, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.940014 }, { "epoch": 0.0818308806808953, "grad_norm": 0.5750634074211121, "learning_rate": 9.972371270368806e-06, "loss": 0.07850503921508789, "memory(GiB)": 19.03, "step": 2519, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.940074 }, { "epoch": 0.08186336614365072, "grad_norm": 0.9640226364135742, "learning_rate": 9.972314850956788e-06, "loss": 0.0861537754535675, "memory(GiB)": 19.03, "step": 2520, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.940133 }, { "epoch": 0.08189585160640614, "grad_norm": 2.089916944503784, "learning_rate": 9.972258374157637e-06, "loss": 0.09710089862346649, "memory(GiB)": 19.03, "step": 2521, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.940198 }, { "epoch": 0.08192833706916156, "grad_norm": 0.7232937216758728, "learning_rate": 9.97220183997201e-06, "loss": 0.07956381142139435, "memory(GiB)": 19.03, "step": 2522, "token_acc": 0.9704797047970479, "train_speed(iter/s)": 0.94026 }, { "epoch": 0.08196082253191697, "grad_norm": 0.7336341738700867, "learning_rate": 9.97214524840055e-06, "loss": 0.08255855739116669, "memory(GiB)": 19.03, "step": 2523, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.940325 }, { "epoch": 0.08199330799467239, "grad_norm": 0.6804583072662354, "learning_rate": 9.97208859944392e-06, "loss": 0.08270128071308136, "memory(GiB)": 19.03, "step": 2524, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.940392 }, { "epoch": 0.0820257934574278, "grad_norm": 0.6922446489334106, "learning_rate": 9.97203189310277e-06, "loss": 0.07367898523807526, "memory(GiB)": 19.03, "step": 2525, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.940465 }, { "epoch": 0.08205827892018322, "grad_norm": 0.9735689759254456, "learning_rate": 9.971975129377752e-06, "loss": 0.07359373569488525, "memory(GiB)": 19.03, "step": 2526, "token_acc": 0.976, "train_speed(iter/s)": 0.940544 }, { "epoch": 0.08209076438293864, "grad_norm": 0.9128717184066772, "learning_rate": 9.971918308269524e-06, "loss": 0.08737847208976746, "memory(GiB)": 19.03, "step": 2527, "token_acc": 0.9551569506726457, "train_speed(iter/s)": 0.940625 }, { "epoch": 0.08212324984569405, "grad_norm": 0.6593810319900513, "learning_rate": 9.971861429778743e-06, "loss": 0.07728707790374756, "memory(GiB)": 19.03, "step": 2528, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.940703 }, { "epoch": 0.08215573530844947, "grad_norm": 1.1598937511444092, "learning_rate": 9.971804493906061e-06, "loss": 0.0906732976436615, "memory(GiB)": 19.03, "step": 2529, "token_acc": 0.9760765550239234, "train_speed(iter/s)": 0.940767 }, { "epoch": 0.08218822077120488, "grad_norm": 0.7432015538215637, "learning_rate": 9.971747500652139e-06, "loss": 0.08312825113534927, "memory(GiB)": 19.03, "step": 2530, "token_acc": 0.95703125, "train_speed(iter/s)": 0.940837 }, { "epoch": 0.0822207062339603, "grad_norm": 0.7147486209869385, "learning_rate": 9.971690450017633e-06, "loss": 0.08284284174442291, "memory(GiB)": 19.03, "step": 2531, "token_acc": 0.9597989949748744, "train_speed(iter/s)": 0.940891 }, { "epoch": 0.08225319169671572, "grad_norm": 1.0670679807662964, "learning_rate": 9.971633342003201e-06, "loss": 0.08715814352035522, "memory(GiB)": 19.03, "step": 2532, "token_acc": 0.9586776859504132, "train_speed(iter/s)": 0.940959 }, { "epoch": 0.08228567715947113, "grad_norm": 0.7305157780647278, "learning_rate": 9.971576176609506e-06, "loss": 0.10016795992851257, "memory(GiB)": 19.03, "step": 2533, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.941022 }, { "epoch": 0.08231816262222655, "grad_norm": 1.237687110900879, "learning_rate": 9.971518953837202e-06, "loss": 0.09136120229959488, "memory(GiB)": 19.03, "step": 2534, "token_acc": 0.9511111111111111, "train_speed(iter/s)": 0.941087 }, { "epoch": 0.08235064808498196, "grad_norm": 3.84004545211792, "learning_rate": 9.971461673686953e-06, "loss": 0.09406659007072449, "memory(GiB)": 19.03, "step": 2535, "token_acc": 0.9572953736654805, "train_speed(iter/s)": 0.941155 }, { "epoch": 0.0823831335477374, "grad_norm": 0.7554854154586792, "learning_rate": 9.97140433615942e-06, "loss": 0.08471006155014038, "memory(GiB)": 19.03, "step": 2536, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.941217 }, { "epoch": 0.08241561901049281, "grad_norm": 1.90143620967865, "learning_rate": 9.971346941255264e-06, "loss": 0.09099462628364563, "memory(GiB)": 19.03, "step": 2537, "token_acc": 0.9651567944250871, "train_speed(iter/s)": 0.941282 }, { "epoch": 0.08244810447324823, "grad_norm": 0.6953761577606201, "learning_rate": 9.971289488975146e-06, "loss": 0.09068990498781204, "memory(GiB)": 19.03, "step": 2538, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.941344 }, { "epoch": 0.08248058993600364, "grad_norm": 0.7250992059707642, "learning_rate": 9.97123197931973e-06, "loss": 0.08854002505540848, "memory(GiB)": 19.03, "step": 2539, "token_acc": 0.9584905660377359, "train_speed(iter/s)": 0.94141 }, { "epoch": 0.08251307539875906, "grad_norm": 0.8202682733535767, "learning_rate": 9.971174412289683e-06, "loss": 0.08697245270013809, "memory(GiB)": 19.03, "step": 2540, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.941471 }, { "epoch": 0.08254556086151447, "grad_norm": 0.9501035213470459, "learning_rate": 9.971116787885665e-06, "loss": 0.09326522797346115, "memory(GiB)": 19.03, "step": 2541, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.94153 }, { "epoch": 0.08257804632426989, "grad_norm": 0.6667494773864746, "learning_rate": 9.971059106108342e-06, "loss": 0.08388079702854156, "memory(GiB)": 19.03, "step": 2542, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.941597 }, { "epoch": 0.08261053178702531, "grad_norm": 0.8241135478019714, "learning_rate": 9.971001366958382e-06, "loss": 0.0854191780090332, "memory(GiB)": 19.03, "step": 2543, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.941658 }, { "epoch": 0.08264301724978072, "grad_norm": 0.7238147258758545, "learning_rate": 9.97094357043645e-06, "loss": 0.0879618301987648, "memory(GiB)": 19.03, "step": 2544, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.941721 }, { "epoch": 0.08267550271253614, "grad_norm": 0.6596829891204834, "learning_rate": 9.970885716543212e-06, "loss": 0.08506764471530914, "memory(GiB)": 19.03, "step": 2545, "token_acc": 0.96, "train_speed(iter/s)": 0.9418 }, { "epoch": 0.08270798817529156, "grad_norm": 1.138204574584961, "learning_rate": 9.970827805279337e-06, "loss": 0.09144680947065353, "memory(GiB)": 19.03, "step": 2546, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.941881 }, { "epoch": 0.08274047363804697, "grad_norm": 0.7096880078315735, "learning_rate": 9.970769836645492e-06, "loss": 0.083457812666893, "memory(GiB)": 19.03, "step": 2547, "token_acc": 0.9612676056338029, "train_speed(iter/s)": 0.941955 }, { "epoch": 0.08277295910080239, "grad_norm": 0.8453806638717651, "learning_rate": 9.970711810642348e-06, "loss": 0.09285221993923187, "memory(GiB)": 19.03, "step": 2548, "token_acc": 0.946236559139785, "train_speed(iter/s)": 0.942031 }, { "epoch": 0.0828054445635578, "grad_norm": 3.5531182289123535, "learning_rate": 9.970653727270573e-06, "loss": 0.08406879007816315, "memory(GiB)": 19.03, "step": 2549, "token_acc": 0.972, "train_speed(iter/s)": 0.942106 }, { "epoch": 0.08283793002631322, "grad_norm": 0.9470916390419006, "learning_rate": 9.97059558653084e-06, "loss": 0.09015986323356628, "memory(GiB)": 19.03, "step": 2550, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.942183 }, { "epoch": 0.08287041548906864, "grad_norm": 0.7490593194961548, "learning_rate": 9.970537388423816e-06, "loss": 0.09559771418571472, "memory(GiB)": 19.03, "step": 2551, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.942264 }, { "epoch": 0.08290290095182407, "grad_norm": 0.8330678939819336, "learning_rate": 9.970479132950174e-06, "loss": 0.0910002812743187, "memory(GiB)": 19.03, "step": 2552, "token_acc": 0.9708029197080292, "train_speed(iter/s)": 0.942342 }, { "epoch": 0.08293538641457948, "grad_norm": 0.6711969971656799, "learning_rate": 9.97042082011059e-06, "loss": 0.07941645383834839, "memory(GiB)": 19.03, "step": 2553, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.942419 }, { "epoch": 0.0829678718773349, "grad_norm": 0.7144663333892822, "learning_rate": 9.970362449905732e-06, "loss": 0.08966796100139618, "memory(GiB)": 19.03, "step": 2554, "token_acc": 0.953307392996109, "train_speed(iter/s)": 0.942499 }, { "epoch": 0.08300035734009031, "grad_norm": 0.7868592739105225, "learning_rate": 9.970304022336277e-06, "loss": 0.08754625916481018, "memory(GiB)": 19.03, "step": 2555, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.942579 }, { "epoch": 0.08303284280284573, "grad_norm": 1.1162357330322266, "learning_rate": 9.970245537402897e-06, "loss": 0.08827120065689087, "memory(GiB)": 19.03, "step": 2556, "token_acc": 0.9697986577181208, "train_speed(iter/s)": 0.942658 }, { "epoch": 0.08306532826560115, "grad_norm": 0.7101072072982788, "learning_rate": 9.97018699510627e-06, "loss": 0.09484183043241501, "memory(GiB)": 19.03, "step": 2557, "token_acc": 0.9563318777292577, "train_speed(iter/s)": 0.942737 }, { "epoch": 0.08309781372835656, "grad_norm": 1.0333400964736938, "learning_rate": 9.970128395447067e-06, "loss": 0.1065572053194046, "memory(GiB)": 19.03, "step": 2558, "token_acc": 0.9518518518518518, "train_speed(iter/s)": 0.942821 }, { "epoch": 0.08313029919111198, "grad_norm": 0.9848531484603882, "learning_rate": 9.97006973842597e-06, "loss": 0.09422922879457474, "memory(GiB)": 19.03, "step": 2559, "token_acc": 0.9429657794676806, "train_speed(iter/s)": 0.942803 }, { "epoch": 0.0831627846538674, "grad_norm": 1.0303503274917603, "learning_rate": 9.970011024043651e-06, "loss": 0.07988125830888748, "memory(GiB)": 19.03, "step": 2560, "token_acc": 0.9619771863117871, "train_speed(iter/s)": 0.942878 }, { "epoch": 0.08319527011662281, "grad_norm": 0.5761581063270569, "learning_rate": 9.96995225230079e-06, "loss": 0.07663784921169281, "memory(GiB)": 19.03, "step": 2561, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.942945 }, { "epoch": 0.08322775557937823, "grad_norm": 1.3510725498199463, "learning_rate": 9.969893423198066e-06, "loss": 0.09747068583965302, "memory(GiB)": 19.03, "step": 2562, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.943021 }, { "epoch": 0.08326024104213364, "grad_norm": 0.6983695030212402, "learning_rate": 9.969834536736156e-06, "loss": 0.07948489487171173, "memory(GiB)": 19.03, "step": 2563, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.943095 }, { "epoch": 0.08329272650488906, "grad_norm": 0.9609214067459106, "learning_rate": 9.96977559291574e-06, "loss": 0.07624049484729767, "memory(GiB)": 19.03, "step": 2564, "token_acc": 0.9591836734693877, "train_speed(iter/s)": 0.943161 }, { "epoch": 0.08332521196764447, "grad_norm": 0.9469867944717407, "learning_rate": 9.969716591737499e-06, "loss": 0.09071516990661621, "memory(GiB)": 19.03, "step": 2565, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.943217 }, { "epoch": 0.08335769743039989, "grad_norm": 0.9564200639724731, "learning_rate": 9.969657533202115e-06, "loss": 0.07354632019996643, "memory(GiB)": 19.03, "step": 2566, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.943274 }, { "epoch": 0.0833901828931553, "grad_norm": 0.5761579871177673, "learning_rate": 9.969598417310268e-06, "loss": 0.07315623015165329, "memory(GiB)": 19.03, "step": 2567, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.943337 }, { "epoch": 0.08342266835591074, "grad_norm": 0.7777626514434814, "learning_rate": 9.969539244062642e-06, "loss": 0.089256152510643, "memory(GiB)": 19.03, "step": 2568, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.943397 }, { "epoch": 0.08345515381866615, "grad_norm": 0.7890588641166687, "learning_rate": 9.969480013459918e-06, "loss": 0.10022437572479248, "memory(GiB)": 19.03, "step": 2569, "token_acc": 0.9635761589403974, "train_speed(iter/s)": 0.943458 }, { "epoch": 0.08348763928142157, "grad_norm": 0.5141564607620239, "learning_rate": 9.969420725502779e-06, "loss": 0.06256399303674698, "memory(GiB)": 19.03, "step": 2570, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.94352 }, { "epoch": 0.08352012474417699, "grad_norm": 0.7146399617195129, "learning_rate": 9.96936138019191e-06, "loss": 0.08032156527042389, "memory(GiB)": 19.03, "step": 2571, "token_acc": 0.975, "train_speed(iter/s)": 0.94358 }, { "epoch": 0.0835526102069324, "grad_norm": 0.5523791313171387, "learning_rate": 9.969301977527999e-06, "loss": 0.07992774993181229, "memory(GiB)": 19.03, "step": 2572, "token_acc": 0.976, "train_speed(iter/s)": 0.943637 }, { "epoch": 0.08358509566968782, "grad_norm": 0.7156312465667725, "learning_rate": 9.969242517511727e-06, "loss": 0.09510967135429382, "memory(GiB)": 19.03, "step": 2573, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.9437 }, { "epoch": 0.08361758113244323, "grad_norm": 1.0979708433151245, "learning_rate": 9.969183000143785e-06, "loss": 0.08129977434873581, "memory(GiB)": 19.03, "step": 2574, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.943766 }, { "epoch": 0.08365006659519865, "grad_norm": 0.8942223191261292, "learning_rate": 9.969123425424855e-06, "loss": 0.07542682439088821, "memory(GiB)": 19.03, "step": 2575, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.943827 }, { "epoch": 0.08368255205795407, "grad_norm": 0.7854306101799011, "learning_rate": 9.969063793355627e-06, "loss": 0.08626842498779297, "memory(GiB)": 19.03, "step": 2576, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.943885 }, { "epoch": 0.08371503752070948, "grad_norm": 0.7430237531661987, "learning_rate": 9.969004103936789e-06, "loss": 0.08949299156665802, "memory(GiB)": 19.03, "step": 2577, "token_acc": 0.9497716894977168, "train_speed(iter/s)": 0.943943 }, { "epoch": 0.0837475229834649, "grad_norm": 0.8600520491600037, "learning_rate": 9.96894435716903e-06, "loss": 0.0820687785744667, "memory(GiB)": 19.03, "step": 2578, "token_acc": 0.9411764705882353, "train_speed(iter/s)": 0.943998 }, { "epoch": 0.08378000844622031, "grad_norm": 0.623229444026947, "learning_rate": 9.96888455305304e-06, "loss": 0.08019477128982544, "memory(GiB)": 19.03, "step": 2579, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.944058 }, { "epoch": 0.08381249390897573, "grad_norm": 0.7000870704650879, "learning_rate": 9.968824691589509e-06, "loss": 0.07352883368730545, "memory(GiB)": 19.03, "step": 2580, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.944119 }, { "epoch": 0.08384497937173115, "grad_norm": 0.9955625534057617, "learning_rate": 9.968764772779125e-06, "loss": 0.0827411636710167, "memory(GiB)": 19.03, "step": 2581, "token_acc": 0.9568965517241379, "train_speed(iter/s)": 0.944188 }, { "epoch": 0.08387746483448656, "grad_norm": 1.2958694696426392, "learning_rate": 9.968704796622585e-06, "loss": 0.08807335793972015, "memory(GiB)": 19.03, "step": 2582, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.944263 }, { "epoch": 0.08390995029724198, "grad_norm": 0.9269784688949585, "learning_rate": 9.968644763120576e-06, "loss": 0.08742215484380722, "memory(GiB)": 19.03, "step": 2583, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.944318 }, { "epoch": 0.08394243575999741, "grad_norm": 0.7362868189811707, "learning_rate": 9.968584672273795e-06, "loss": 0.08713410794734955, "memory(GiB)": 19.03, "step": 2584, "token_acc": 0.9671532846715328, "train_speed(iter/s)": 0.944382 }, { "epoch": 0.08397492122275282, "grad_norm": 0.9421110153198242, "learning_rate": 9.968524524082934e-06, "loss": 0.09671364724636078, "memory(GiB)": 19.03, "step": 2585, "token_acc": 0.95703125, "train_speed(iter/s)": 0.944446 }, { "epoch": 0.08400740668550824, "grad_norm": 0.8197634220123291, "learning_rate": 9.968464318548686e-06, "loss": 0.08343791961669922, "memory(GiB)": 19.03, "step": 2586, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.944507 }, { "epoch": 0.08403989214826366, "grad_norm": 1.0441569089889526, "learning_rate": 9.968404055671747e-06, "loss": 0.08384934812784195, "memory(GiB)": 19.03, "step": 2587, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.944568 }, { "epoch": 0.08407237761101907, "grad_norm": 0.7536499500274658, "learning_rate": 9.968343735452813e-06, "loss": 0.09140534698963165, "memory(GiB)": 19.03, "step": 2588, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.944625 }, { "epoch": 0.08410486307377449, "grad_norm": 1.0289700031280518, "learning_rate": 9.968283357892577e-06, "loss": 0.09566997736692429, "memory(GiB)": 19.03, "step": 2589, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.944686 }, { "epoch": 0.0841373485365299, "grad_norm": 0.804440438747406, "learning_rate": 9.968222922991739e-06, "loss": 0.08499307930469513, "memory(GiB)": 19.03, "step": 2590, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.944748 }, { "epoch": 0.08416983399928532, "grad_norm": 0.7266100645065308, "learning_rate": 9.968162430750997e-06, "loss": 0.07254085689783096, "memory(GiB)": 19.03, "step": 2591, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.944814 }, { "epoch": 0.08420231946204074, "grad_norm": 1.0817370414733887, "learning_rate": 9.968101881171048e-06, "loss": 0.09224577248096466, "memory(GiB)": 19.03, "step": 2592, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.944876 }, { "epoch": 0.08423480492479615, "grad_norm": 0.9822847843170166, "learning_rate": 9.96804127425259e-06, "loss": 0.0790063664317131, "memory(GiB)": 19.03, "step": 2593, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.944932 }, { "epoch": 0.08426729038755157, "grad_norm": 0.5726041197776794, "learning_rate": 9.967980609996324e-06, "loss": 0.06901998817920685, "memory(GiB)": 19.03, "step": 2594, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.944986 }, { "epoch": 0.08429977585030698, "grad_norm": 0.7592462301254272, "learning_rate": 9.967919888402948e-06, "loss": 0.08056890964508057, "memory(GiB)": 19.03, "step": 2595, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.945051 }, { "epoch": 0.0843322613130624, "grad_norm": 1.1816028356552124, "learning_rate": 9.967859109473165e-06, "loss": 0.08014986664056778, "memory(GiB)": 19.03, "step": 2596, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.945116 }, { "epoch": 0.08436474677581782, "grad_norm": 0.8587270975112915, "learning_rate": 9.967798273207676e-06, "loss": 0.09346948564052582, "memory(GiB)": 19.03, "step": 2597, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.94518 }, { "epoch": 0.08439723223857323, "grad_norm": 0.8494638800621033, "learning_rate": 9.967737379607182e-06, "loss": 0.08564458042383194, "memory(GiB)": 19.03, "step": 2598, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.945243 }, { "epoch": 0.08442971770132865, "grad_norm": 0.9873694181442261, "learning_rate": 9.967676428672387e-06, "loss": 0.08442206680774689, "memory(GiB)": 19.03, "step": 2599, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.945305 }, { "epoch": 0.08446220316408408, "grad_norm": 0.9151745438575745, "learning_rate": 9.967615420403995e-06, "loss": 0.07540185749530792, "memory(GiB)": 19.03, "step": 2600, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.945368 }, { "epoch": 0.0844946886268395, "grad_norm": 0.8047921061515808, "learning_rate": 9.967554354802708e-06, "loss": 0.07695053517818451, "memory(GiB)": 19.03, "step": 2601, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.945416 }, { "epoch": 0.08452717408959491, "grad_norm": 0.8050855994224548, "learning_rate": 9.967493231869235e-06, "loss": 0.07344545423984528, "memory(GiB)": 19.03, "step": 2602, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.94548 }, { "epoch": 0.08455965955235033, "grad_norm": 0.7860557436943054, "learning_rate": 9.967432051604274e-06, "loss": 0.07571852207183838, "memory(GiB)": 19.03, "step": 2603, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.945542 }, { "epoch": 0.08459214501510574, "grad_norm": 0.7949820160865784, "learning_rate": 9.967370814008537e-06, "loss": 0.07923534512519836, "memory(GiB)": 19.03, "step": 2604, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.9456 }, { "epoch": 0.08462463047786116, "grad_norm": 1.5229240655899048, "learning_rate": 9.96730951908273e-06, "loss": 0.07238069176673889, "memory(GiB)": 19.03, "step": 2605, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.945676 }, { "epoch": 0.08465711594061658, "grad_norm": 0.9692617654800415, "learning_rate": 9.967248166827558e-06, "loss": 0.08564446866512299, "memory(GiB)": 19.03, "step": 2606, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.945749 }, { "epoch": 0.08468960140337199, "grad_norm": 0.7498562932014465, "learning_rate": 9.967186757243731e-06, "loss": 0.09610751271247864, "memory(GiB)": 19.03, "step": 2607, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.945826 }, { "epoch": 0.08472208686612741, "grad_norm": 0.6291596293449402, "learning_rate": 9.96712529033196e-06, "loss": 0.083609439432621, "memory(GiB)": 19.03, "step": 2608, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.945904 }, { "epoch": 0.08475457232888282, "grad_norm": 0.6229294538497925, "learning_rate": 9.96706376609295e-06, "loss": 0.08154206722974777, "memory(GiB)": 19.03, "step": 2609, "token_acc": 0.9651567944250871, "train_speed(iter/s)": 0.945982 }, { "epoch": 0.08478705779163824, "grad_norm": 0.7127304673194885, "learning_rate": 9.967002184527415e-06, "loss": 0.08736283332109451, "memory(GiB)": 19.03, "step": 2610, "token_acc": 0.9754601226993865, "train_speed(iter/s)": 0.946057 }, { "epoch": 0.08481954325439366, "grad_norm": 1.398972988128662, "learning_rate": 9.966940545636062e-06, "loss": 0.09394694864749908, "memory(GiB)": 19.03, "step": 2611, "token_acc": 0.9641025641025641, "train_speed(iter/s)": 0.946129 }, { "epoch": 0.08485202871714907, "grad_norm": 0.7219924330711365, "learning_rate": 9.966878849419604e-06, "loss": 0.07534236460924149, "memory(GiB)": 19.03, "step": 2612, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.9462 }, { "epoch": 0.08488451417990449, "grad_norm": 0.6446008682250977, "learning_rate": 9.966817095878752e-06, "loss": 0.08449894934892654, "memory(GiB)": 19.03, "step": 2613, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.94628 }, { "epoch": 0.0849169996426599, "grad_norm": 0.6292667388916016, "learning_rate": 9.966755285014223e-06, "loss": 0.08570923656225204, "memory(GiB)": 19.03, "step": 2614, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.946351 }, { "epoch": 0.08494948510541532, "grad_norm": 0.5627049803733826, "learning_rate": 9.966693416826724e-06, "loss": 0.08360950648784637, "memory(GiB)": 19.03, "step": 2615, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.946427 }, { "epoch": 0.08498197056817075, "grad_norm": 0.7225366234779358, "learning_rate": 9.966631491316974e-06, "loss": 0.0808691531419754, "memory(GiB)": 19.03, "step": 2616, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.946499 }, { "epoch": 0.08501445603092617, "grad_norm": 0.5423561334609985, "learning_rate": 9.966569508485687e-06, "loss": 0.07375362515449524, "memory(GiB)": 19.03, "step": 2617, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 0.946574 }, { "epoch": 0.08504694149368158, "grad_norm": 1.0190937519073486, "learning_rate": 9.966507468333576e-06, "loss": 0.09423995018005371, "memory(GiB)": 19.03, "step": 2618, "token_acc": 0.9438202247191011, "train_speed(iter/s)": 0.946647 }, { "epoch": 0.085079426956437, "grad_norm": 0.7096089720726013, "learning_rate": 9.966445370861358e-06, "loss": 0.08142055571079254, "memory(GiB)": 19.03, "step": 2619, "token_acc": 0.957345971563981, "train_speed(iter/s)": 0.946721 }, { "epoch": 0.08511191241919241, "grad_norm": 0.6829182505607605, "learning_rate": 9.96638321606975e-06, "loss": 0.08380042016506195, "memory(GiB)": 19.03, "step": 2620, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.946796 }, { "epoch": 0.08514439788194783, "grad_norm": 0.8584003448486328, "learning_rate": 9.966321003959471e-06, "loss": 0.09952618926763535, "memory(GiB)": 19.03, "step": 2621, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.946851 }, { "epoch": 0.08517688334470325, "grad_norm": 1.094260573387146, "learning_rate": 9.966258734531237e-06, "loss": 0.089865542948246, "memory(GiB)": 19.03, "step": 2622, "token_acc": 0.9396984924623115, "train_speed(iter/s)": 0.946906 }, { "epoch": 0.08520936880745866, "grad_norm": 0.7777911424636841, "learning_rate": 9.966196407785766e-06, "loss": 0.08396366238594055, "memory(GiB)": 19.03, "step": 2623, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.946966 }, { "epoch": 0.08524185427021408, "grad_norm": 0.955968976020813, "learning_rate": 9.966134023723779e-06, "loss": 0.08559709042310715, "memory(GiB)": 19.03, "step": 2624, "token_acc": 0.9597989949748744, "train_speed(iter/s)": 0.947024 }, { "epoch": 0.0852743397329695, "grad_norm": 0.6570503115653992, "learning_rate": 9.966071582345997e-06, "loss": 0.0772952139377594, "memory(GiB)": 19.03, "step": 2625, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.947086 }, { "epoch": 0.08530682519572491, "grad_norm": 1.2691214084625244, "learning_rate": 9.966009083653137e-06, "loss": 0.08548182249069214, "memory(GiB)": 19.03, "step": 2626, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.947145 }, { "epoch": 0.08533931065848033, "grad_norm": 0.9967650771141052, "learning_rate": 9.965946527645923e-06, "loss": 0.08649571239948273, "memory(GiB)": 19.03, "step": 2627, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.947208 }, { "epoch": 0.08537179612123574, "grad_norm": 0.8137326240539551, "learning_rate": 9.965883914325076e-06, "loss": 0.0921715795993805, "memory(GiB)": 19.03, "step": 2628, "token_acc": 0.9568345323741008, "train_speed(iter/s)": 0.947266 }, { "epoch": 0.08540428158399116, "grad_norm": 0.9087755084037781, "learning_rate": 9.96582124369132e-06, "loss": 0.09015519917011261, "memory(GiB)": 19.03, "step": 2629, "token_acc": 0.9688888888888889, "train_speed(iter/s)": 0.947327 }, { "epoch": 0.08543676704674658, "grad_norm": 0.583630383014679, "learning_rate": 9.965758515745379e-06, "loss": 0.06712053716182709, "memory(GiB)": 19.03, "step": 2630, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.947385 }, { "epoch": 0.08546925250950199, "grad_norm": 0.6956184506416321, "learning_rate": 9.965695730487973e-06, "loss": 0.08603571355342865, "memory(GiB)": 19.03, "step": 2631, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.947441 }, { "epoch": 0.08550173797225742, "grad_norm": 0.6262181401252747, "learning_rate": 9.965632887919828e-06, "loss": 0.07300916314125061, "memory(GiB)": 19.03, "step": 2632, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.947498 }, { "epoch": 0.08553422343501284, "grad_norm": 0.8187152743339539, "learning_rate": 9.965569988041673e-06, "loss": 0.08640924096107483, "memory(GiB)": 19.03, "step": 2633, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.947555 }, { "epoch": 0.08556670889776825, "grad_norm": 0.7511226534843445, "learning_rate": 9.96550703085423e-06, "loss": 0.07706648856401443, "memory(GiB)": 19.03, "step": 2634, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.947611 }, { "epoch": 0.08559919436052367, "grad_norm": 0.6974802017211914, "learning_rate": 9.965444016358225e-06, "loss": 0.0762433409690857, "memory(GiB)": 19.03, "step": 2635, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.947669 }, { "epoch": 0.08563167982327909, "grad_norm": 0.8571101427078247, "learning_rate": 9.965380944554388e-06, "loss": 0.09196335077285767, "memory(GiB)": 19.03, "step": 2636, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.947729 }, { "epoch": 0.0856641652860345, "grad_norm": 1.2532411813735962, "learning_rate": 9.965317815443447e-06, "loss": 0.07688084244728088, "memory(GiB)": 19.03, "step": 2637, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.947785 }, { "epoch": 0.08569665074878992, "grad_norm": 0.6905393600463867, "learning_rate": 9.965254629026128e-06, "loss": 0.07891238480806351, "memory(GiB)": 19.03, "step": 2638, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.947851 }, { "epoch": 0.08572913621154533, "grad_norm": 0.7598648071289062, "learning_rate": 9.965191385303164e-06, "loss": 0.07410594075918198, "memory(GiB)": 19.03, "step": 2639, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.947923 }, { "epoch": 0.08576162167430075, "grad_norm": 0.8389125466346741, "learning_rate": 9.965128084275281e-06, "loss": 0.08489760011434555, "memory(GiB)": 19.03, "step": 2640, "token_acc": 0.960431654676259, "train_speed(iter/s)": 0.947998 }, { "epoch": 0.08579410713705617, "grad_norm": 1.0791823863983154, "learning_rate": 9.965064725943213e-06, "loss": 0.09685684740543365, "memory(GiB)": 19.03, "step": 2641, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.948073 }, { "epoch": 0.08582659259981158, "grad_norm": 0.7965162396430969, "learning_rate": 9.965001310307689e-06, "loss": 0.08694857358932495, "memory(GiB)": 19.03, "step": 2642, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.948146 }, { "epoch": 0.085859078062567, "grad_norm": 0.7109536528587341, "learning_rate": 9.964937837369441e-06, "loss": 0.08477698266506195, "memory(GiB)": 19.03, "step": 2643, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.948213 }, { "epoch": 0.08589156352532241, "grad_norm": 0.9626200795173645, "learning_rate": 9.964874307129202e-06, "loss": 0.08204120397567749, "memory(GiB)": 19.03, "step": 2644, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.948274 }, { "epoch": 0.08592404898807783, "grad_norm": 0.8041571378707886, "learning_rate": 9.964810719587705e-06, "loss": 0.08259275555610657, "memory(GiB)": 19.03, "step": 2645, "token_acc": 0.9504950495049505, "train_speed(iter/s)": 0.948333 }, { "epoch": 0.08595653445083325, "grad_norm": 0.7461684942245483, "learning_rate": 9.964747074745685e-06, "loss": 0.0894659012556076, "memory(GiB)": 19.03, "step": 2646, "token_acc": 0.964, "train_speed(iter/s)": 0.948386 }, { "epoch": 0.08598901991358866, "grad_norm": 0.6307697296142578, "learning_rate": 9.964683372603876e-06, "loss": 0.08230659365653992, "memory(GiB)": 19.03, "step": 2647, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.94844 }, { "epoch": 0.08602150537634409, "grad_norm": 1.1400554180145264, "learning_rate": 9.964619613163012e-06, "loss": 0.08933192491531372, "memory(GiB)": 19.03, "step": 2648, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.948501 }, { "epoch": 0.08605399083909951, "grad_norm": 1.0077471733093262, "learning_rate": 9.96455579642383e-06, "loss": 0.07195717096328735, "memory(GiB)": 19.03, "step": 2649, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.948564 }, { "epoch": 0.08608647630185493, "grad_norm": 0.756048858165741, "learning_rate": 9.964491922387066e-06, "loss": 0.08738012611865997, "memory(GiB)": 19.03, "step": 2650, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.948617 }, { "epoch": 0.08611896176461034, "grad_norm": 0.8305519223213196, "learning_rate": 9.964427991053457e-06, "loss": 0.10128122568130493, "memory(GiB)": 19.03, "step": 2651, "token_acc": 0.9614035087719298, "train_speed(iter/s)": 0.94866 }, { "epoch": 0.08615144722736576, "grad_norm": 0.5829722285270691, "learning_rate": 9.964364002423743e-06, "loss": 0.07733596861362457, "memory(GiB)": 19.03, "step": 2652, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.948718 }, { "epoch": 0.08618393269012117, "grad_norm": 0.7133053541183472, "learning_rate": 9.96429995649866e-06, "loss": 0.07640406489372253, "memory(GiB)": 19.03, "step": 2653, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.948779 }, { "epoch": 0.08621641815287659, "grad_norm": 0.8866073489189148, "learning_rate": 9.964235853278947e-06, "loss": 0.07246033847332001, "memory(GiB)": 19.03, "step": 2654, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.948835 }, { "epoch": 0.086248903615632, "grad_norm": 0.538545548915863, "learning_rate": 9.964171692765348e-06, "loss": 0.06456372141838074, "memory(GiB)": 19.03, "step": 2655, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.948891 }, { "epoch": 0.08628138907838742, "grad_norm": 0.9165710806846619, "learning_rate": 9.964107474958598e-06, "loss": 0.07775835692882538, "memory(GiB)": 19.03, "step": 2656, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.948949 }, { "epoch": 0.08631387454114284, "grad_norm": 3.2263264656066895, "learning_rate": 9.96404319985944e-06, "loss": 0.09749752283096313, "memory(GiB)": 19.03, "step": 2657, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.949008 }, { "epoch": 0.08634636000389825, "grad_norm": 0.7559115886688232, "learning_rate": 9.963978867468618e-06, "loss": 0.08970664441585541, "memory(GiB)": 19.03, "step": 2658, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.94907 }, { "epoch": 0.08637884546665367, "grad_norm": 1.0380489826202393, "learning_rate": 9.963914477786872e-06, "loss": 0.08677025884389877, "memory(GiB)": 19.03, "step": 2659, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.949133 }, { "epoch": 0.08641133092940909, "grad_norm": 0.637735903263092, "learning_rate": 9.963850030814946e-06, "loss": 0.08614605665206909, "memory(GiB)": 19.03, "step": 2660, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.94919 }, { "epoch": 0.0864438163921645, "grad_norm": 1.770216703414917, "learning_rate": 9.963785526553581e-06, "loss": 0.0897940993309021, "memory(GiB)": 19.03, "step": 2661, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.949252 }, { "epoch": 0.08647630185491992, "grad_norm": 0.6830598711967468, "learning_rate": 9.963720965003528e-06, "loss": 0.0748726949095726, "memory(GiB)": 19.03, "step": 2662, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.949294 }, { "epoch": 0.08650878731767533, "grad_norm": 0.681269109249115, "learning_rate": 9.963656346165527e-06, "loss": 0.07905005663633347, "memory(GiB)": 19.03, "step": 2663, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.949349 }, { "epoch": 0.08654127278043076, "grad_norm": 0.705033004283905, "learning_rate": 9.963591670040327e-06, "loss": 0.09298329055309296, "memory(GiB)": 19.03, "step": 2664, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.949409 }, { "epoch": 0.08657375824318618, "grad_norm": 0.6094020009040833, "learning_rate": 9.96352693662867e-06, "loss": 0.08024557679891586, "memory(GiB)": 19.03, "step": 2665, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.949473 }, { "epoch": 0.0866062437059416, "grad_norm": 0.5556514263153076, "learning_rate": 9.963462145931307e-06, "loss": 0.07687041163444519, "memory(GiB)": 19.03, "step": 2666, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.949546 }, { "epoch": 0.08663872916869701, "grad_norm": 0.8017401099205017, "learning_rate": 9.963397297948984e-06, "loss": 0.08629292249679565, "memory(GiB)": 19.03, "step": 2667, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.949617 }, { "epoch": 0.08667121463145243, "grad_norm": 0.8515058755874634, "learning_rate": 9.963332392682449e-06, "loss": 0.0822349488735199, "memory(GiB)": 19.03, "step": 2668, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.949688 }, { "epoch": 0.08670370009420784, "grad_norm": 0.6506521701812744, "learning_rate": 9.963267430132455e-06, "loss": 0.07645432651042938, "memory(GiB)": 19.03, "step": 2669, "token_acc": 0.967479674796748, "train_speed(iter/s)": 0.949751 }, { "epoch": 0.08673618555696326, "grad_norm": 0.6575880646705627, "learning_rate": 9.963202410299746e-06, "loss": 0.08252683281898499, "memory(GiB)": 19.03, "step": 2670, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.949818 }, { "epoch": 0.08676867101971868, "grad_norm": 0.8612130880355835, "learning_rate": 9.963137333185076e-06, "loss": 0.08700234442949295, "memory(GiB)": 19.03, "step": 2671, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.949887 }, { "epoch": 0.08680115648247409, "grad_norm": 0.9509863257408142, "learning_rate": 9.963072198789196e-06, "loss": 0.09174513816833496, "memory(GiB)": 19.03, "step": 2672, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.949961 }, { "epoch": 0.08683364194522951, "grad_norm": 1.068924069404602, "learning_rate": 9.963007007112858e-06, "loss": 0.08443643152713776, "memory(GiB)": 19.03, "step": 2673, "token_acc": 0.9623287671232876, "train_speed(iter/s)": 0.950033 }, { "epoch": 0.08686612740798492, "grad_norm": 0.9011671543121338, "learning_rate": 9.96294175815681e-06, "loss": 0.08466857671737671, "memory(GiB)": 19.03, "step": 2674, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.950098 }, { "epoch": 0.08689861287074034, "grad_norm": 0.7286176681518555, "learning_rate": 9.96287645192181e-06, "loss": 0.07613290101289749, "memory(GiB)": 19.03, "step": 2675, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.950168 }, { "epoch": 0.08693109833349576, "grad_norm": 0.9771103262901306, "learning_rate": 9.96281108840861e-06, "loss": 0.0838882327079773, "memory(GiB)": 19.03, "step": 2676, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.950238 }, { "epoch": 0.08696358379625117, "grad_norm": 0.7383798956871033, "learning_rate": 9.962745667617966e-06, "loss": 0.07769639045000076, "memory(GiB)": 19.03, "step": 2677, "token_acc": 0.972, "train_speed(iter/s)": 0.950294 }, { "epoch": 0.08699606925900659, "grad_norm": 0.6884837746620178, "learning_rate": 9.962680189550631e-06, "loss": 0.08470125496387482, "memory(GiB)": 19.03, "step": 2678, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.950342 }, { "epoch": 0.087028554721762, "grad_norm": 0.5754629373550415, "learning_rate": 9.96261465420736e-06, "loss": 0.07888229936361313, "memory(GiB)": 19.03, "step": 2679, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.950403 }, { "epoch": 0.08706104018451744, "grad_norm": 1.026227355003357, "learning_rate": 9.962549061588911e-06, "loss": 0.08490549027919769, "memory(GiB)": 19.03, "step": 2680, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.950455 }, { "epoch": 0.08709352564727285, "grad_norm": 1.2402710914611816, "learning_rate": 9.962483411696043e-06, "loss": 0.07427018135786057, "memory(GiB)": 19.03, "step": 2681, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.950506 }, { "epoch": 0.08712601111002827, "grad_norm": 0.6902554035186768, "learning_rate": 9.962417704529508e-06, "loss": 0.08805342018604279, "memory(GiB)": 19.03, "step": 2682, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.950558 }, { "epoch": 0.08715849657278368, "grad_norm": 0.7165570855140686, "learning_rate": 9.96235194009007e-06, "loss": 0.07855924963951111, "memory(GiB)": 19.03, "step": 2683, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.950613 }, { "epoch": 0.0871909820355391, "grad_norm": 0.5337916016578674, "learning_rate": 9.962286118378483e-06, "loss": 0.08588971197605133, "memory(GiB)": 19.03, "step": 2684, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.950671 }, { "epoch": 0.08722346749829452, "grad_norm": 0.6657738089561462, "learning_rate": 9.962220239395513e-06, "loss": 0.07449394464492798, "memory(GiB)": 19.03, "step": 2685, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.950725 }, { "epoch": 0.08725595296104993, "grad_norm": 1.1779210567474365, "learning_rate": 9.962154303141916e-06, "loss": 0.0889645516872406, "memory(GiB)": 19.03, "step": 2686, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.950784 }, { "epoch": 0.08728843842380535, "grad_norm": 0.7385329604148865, "learning_rate": 9.962088309618452e-06, "loss": 0.07294042408466339, "memory(GiB)": 19.03, "step": 2687, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.950835 }, { "epoch": 0.08732092388656076, "grad_norm": 0.7802113890647888, "learning_rate": 9.962022258825885e-06, "loss": 0.08016354590654373, "memory(GiB)": 19.03, "step": 2688, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.950887 }, { "epoch": 0.08735340934931618, "grad_norm": 0.8473635911941528, "learning_rate": 9.961956150764978e-06, "loss": 0.07840383797883987, "memory(GiB)": 19.03, "step": 2689, "token_acc": 0.975, "train_speed(iter/s)": 0.950945 }, { "epoch": 0.0873858948120716, "grad_norm": 0.9096043109893799, "learning_rate": 9.961889985436491e-06, "loss": 0.07456551492214203, "memory(GiB)": 19.03, "step": 2690, "token_acc": 0.96875, "train_speed(iter/s)": 0.950988 }, { "epoch": 0.08741838027482701, "grad_norm": 1.0074350833892822, "learning_rate": 9.961823762841189e-06, "loss": 0.07821498811244965, "memory(GiB)": 19.03, "step": 2691, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.951042 }, { "epoch": 0.08745086573758243, "grad_norm": 5.282806873321533, "learning_rate": 9.961757482979837e-06, "loss": 0.08250147849321365, "memory(GiB)": 19.03, "step": 2692, "token_acc": 0.9449152542372882, "train_speed(iter/s)": 0.951096 }, { "epoch": 0.08748335120033784, "grad_norm": 1.2699737548828125, "learning_rate": 9.9616911458532e-06, "loss": 0.08293008804321289, "memory(GiB)": 19.03, "step": 2693, "token_acc": 0.9527896995708155, "train_speed(iter/s)": 0.951154 }, { "epoch": 0.08751583666309326, "grad_norm": 1.277709722518921, "learning_rate": 9.961624751462044e-06, "loss": 0.08892849832773209, "memory(GiB)": 19.03, "step": 2694, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.951216 }, { "epoch": 0.08754832212584868, "grad_norm": 0.9093829393386841, "learning_rate": 9.961558299807135e-06, "loss": 0.0773598849773407, "memory(GiB)": 19.03, "step": 2695, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.951288 }, { "epoch": 0.0875808075886041, "grad_norm": 0.8129280805587769, "learning_rate": 9.961491790889237e-06, "loss": 0.07943755388259888, "memory(GiB)": 19.03, "step": 2696, "token_acc": 0.9619771863117871, "train_speed(iter/s)": 0.951352 }, { "epoch": 0.08761329305135952, "grad_norm": 0.8651934266090393, "learning_rate": 9.96142522470912e-06, "loss": 0.07411271333694458, "memory(GiB)": 19.03, "step": 2697, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.951422 }, { "epoch": 0.08764577851411494, "grad_norm": 1.2121989727020264, "learning_rate": 9.961358601267555e-06, "loss": 0.07790502905845642, "memory(GiB)": 19.03, "step": 2698, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.951484 }, { "epoch": 0.08767826397687035, "grad_norm": 1.2028858661651611, "learning_rate": 9.961291920565307e-06, "loss": 0.08160047233104706, "memory(GiB)": 19.03, "step": 2699, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.951557 }, { "epoch": 0.08771074943962577, "grad_norm": 1.1892521381378174, "learning_rate": 9.961225182603146e-06, "loss": 0.08974587172269821, "memory(GiB)": 19.03, "step": 2700, "token_acc": 0.9739583333333334, "train_speed(iter/s)": 0.951625 }, { "epoch": 0.08774323490238119, "grad_norm": 0.9856963157653809, "learning_rate": 9.961158387381844e-06, "loss": 0.0688902884721756, "memory(GiB)": 19.03, "step": 2701, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.951694 }, { "epoch": 0.0877757203651366, "grad_norm": 0.7748425602912903, "learning_rate": 9.961091534902169e-06, "loss": 0.06648541241884232, "memory(GiB)": 19.03, "step": 2702, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.951758 }, { "epoch": 0.08780820582789202, "grad_norm": 0.9641231894493103, "learning_rate": 9.961024625164897e-06, "loss": 0.08354319632053375, "memory(GiB)": 19.03, "step": 2703, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.951815 }, { "epoch": 0.08784069129064744, "grad_norm": 0.993906557559967, "learning_rate": 9.960957658170796e-06, "loss": 0.08503574877977371, "memory(GiB)": 19.03, "step": 2704, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.951866 }, { "epoch": 0.08787317675340285, "grad_norm": 0.701358437538147, "learning_rate": 9.960890633920641e-06, "loss": 0.08044244349002838, "memory(GiB)": 19.03, "step": 2705, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 0.951924 }, { "epoch": 0.08790566221615827, "grad_norm": 1.0395774841308594, "learning_rate": 9.960823552415206e-06, "loss": 0.0946757048368454, "memory(GiB)": 19.03, "step": 2706, "token_acc": 0.9658536585365853, "train_speed(iter/s)": 0.951979 }, { "epoch": 0.08793814767891368, "grad_norm": 1.6227540969848633, "learning_rate": 9.960756413655264e-06, "loss": 0.0856645405292511, "memory(GiB)": 19.03, "step": 2707, "token_acc": 0.9817518248175182, "train_speed(iter/s)": 0.952031 }, { "epoch": 0.0879706331416691, "grad_norm": 1.6336970329284668, "learning_rate": 9.960689217641588e-06, "loss": 0.08669689297676086, "memory(GiB)": 19.03, "step": 2708, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.952083 }, { "epoch": 0.08800311860442452, "grad_norm": 0.7615264058113098, "learning_rate": 9.960621964374959e-06, "loss": 0.07976622879505157, "memory(GiB)": 19.03, "step": 2709, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.952138 }, { "epoch": 0.08803560406717993, "grad_norm": 0.7567470073699951, "learning_rate": 9.960554653856148e-06, "loss": 0.09333159774541855, "memory(GiB)": 19.03, "step": 2710, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.952189 }, { "epoch": 0.08806808952993535, "grad_norm": 1.8653064966201782, "learning_rate": 9.960487286085935e-06, "loss": 0.08611643314361572, "memory(GiB)": 19.03, "step": 2711, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.952246 }, { "epoch": 0.08810057499269078, "grad_norm": 3.8237929344177246, "learning_rate": 9.960419861065096e-06, "loss": 0.09691356122493744, "memory(GiB)": 19.03, "step": 2712, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.952298 }, { "epoch": 0.0881330604554462, "grad_norm": 2.1902103424072266, "learning_rate": 9.960352378794409e-06, "loss": 0.07732785493135452, "memory(GiB)": 19.03, "step": 2713, "token_acc": 0.9716981132075472, "train_speed(iter/s)": 0.952348 }, { "epoch": 0.08816554591820161, "grad_norm": 0.7603536248207092, "learning_rate": 9.960284839274654e-06, "loss": 0.08855737000703812, "memory(GiB)": 19.03, "step": 2714, "token_acc": 0.9612403100775194, "train_speed(iter/s)": 0.952403 }, { "epoch": 0.08819803138095703, "grad_norm": 0.654371976852417, "learning_rate": 9.960217242506608e-06, "loss": 0.08288569003343582, "memory(GiB)": 19.03, "step": 2715, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.952457 }, { "epoch": 0.08823051684371244, "grad_norm": 0.9787799715995789, "learning_rate": 9.960149588491055e-06, "loss": 0.08205296844244003, "memory(GiB)": 19.03, "step": 2716, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.952513 }, { "epoch": 0.08826300230646786, "grad_norm": 0.5885191559791565, "learning_rate": 9.960081877228773e-06, "loss": 0.07023275643587112, "memory(GiB)": 19.03, "step": 2717, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.952573 }, { "epoch": 0.08829548776922327, "grad_norm": 0.9075709581375122, "learning_rate": 9.960014108720544e-06, "loss": 0.08317787945270538, "memory(GiB)": 19.03, "step": 2718, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.952634 }, { "epoch": 0.08832797323197869, "grad_norm": 0.5469554662704468, "learning_rate": 9.959946282967152e-06, "loss": 0.0817553922533989, "memory(GiB)": 19.03, "step": 2719, "token_acc": 0.984375, "train_speed(iter/s)": 0.952688 }, { "epoch": 0.0883604586947341, "grad_norm": 0.9938265085220337, "learning_rate": 9.959878399969375e-06, "loss": 0.08259884268045425, "memory(GiB)": 19.03, "step": 2720, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.952734 }, { "epoch": 0.08839294415748952, "grad_norm": 0.5389849543571472, "learning_rate": 9.959810459728003e-06, "loss": 0.08845497667789459, "memory(GiB)": 19.03, "step": 2721, "token_acc": 0.9488188976377953, "train_speed(iter/s)": 0.952785 }, { "epoch": 0.08842542962024494, "grad_norm": 1.3181545734405518, "learning_rate": 9.959742462243815e-06, "loss": 0.09767574071884155, "memory(GiB)": 19.03, "step": 2722, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.95284 }, { "epoch": 0.08845791508300035, "grad_norm": 1.2060855627059937, "learning_rate": 9.9596744075176e-06, "loss": 0.08114677667617798, "memory(GiB)": 19.03, "step": 2723, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.952899 }, { "epoch": 0.08849040054575577, "grad_norm": 1.0506017208099365, "learning_rate": 9.959606295550138e-06, "loss": 0.07606805115938187, "memory(GiB)": 19.03, "step": 2724, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.952964 }, { "epoch": 0.08852288600851119, "grad_norm": 0.8204914927482605, "learning_rate": 9.95953812634222e-06, "loss": 0.07076141238212585, "memory(GiB)": 19.03, "step": 2725, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953033 }, { "epoch": 0.0885553714712666, "grad_norm": 1.1793324947357178, "learning_rate": 9.959469899894629e-06, "loss": 0.07613267749547958, "memory(GiB)": 19.03, "step": 2726, "token_acc": 0.9695652173913043, "train_speed(iter/s)": 0.953099 }, { "epoch": 0.08858785693402202, "grad_norm": 0.864443838596344, "learning_rate": 9.959401616208156e-06, "loss": 0.0823945552110672, "memory(GiB)": 19.03, "step": 2727, "token_acc": 0.9404761904761905, "train_speed(iter/s)": 0.953169 }, { "epoch": 0.08862034239677745, "grad_norm": 1.134104609489441, "learning_rate": 9.959333275283587e-06, "loss": 0.08682843297719955, "memory(GiB)": 19.03, "step": 2728, "token_acc": 0.99, "train_speed(iter/s)": 0.953238 }, { "epoch": 0.08865282785953287, "grad_norm": 0.6848740577697754, "learning_rate": 9.959264877121712e-06, "loss": 0.07691282778978348, "memory(GiB)": 19.03, "step": 2729, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.953309 }, { "epoch": 0.08868531332228828, "grad_norm": 0.7858251333236694, "learning_rate": 9.959196421723319e-06, "loss": 0.06907106190919876, "memory(GiB)": 19.03, "step": 2730, "token_acc": 0.9783393501805054, "train_speed(iter/s)": 0.953381 }, { "epoch": 0.0887177987850437, "grad_norm": 1.1631004810333252, "learning_rate": 9.959127909089196e-06, "loss": 0.08344705402851105, "memory(GiB)": 19.03, "step": 2731, "token_acc": 0.9508928571428571, "train_speed(iter/s)": 0.953449 }, { "epoch": 0.08875028424779911, "grad_norm": 1.0408618450164795, "learning_rate": 9.959059339220141e-06, "loss": 0.07557880878448486, "memory(GiB)": 19.03, "step": 2732, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.953516 }, { "epoch": 0.08878276971055453, "grad_norm": 1.3707748651504517, "learning_rate": 9.958990712116938e-06, "loss": 0.08185242116451263, "memory(GiB)": 19.03, "step": 2733, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.953575 }, { "epoch": 0.08881525517330995, "grad_norm": 0.9725292325019836, "learning_rate": 9.958922027780382e-06, "loss": 0.09784917533397675, "memory(GiB)": 19.03, "step": 2734, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.08884774063606536, "grad_norm": 0.7890673875808716, "learning_rate": 9.958853286211266e-06, "loss": 0.08234788477420807, "memory(GiB)": 19.03, "step": 2735, "token_acc": 0.9726775956284153, "train_speed(iter/s)": 0.953694 }, { "epoch": 0.08888022609882078, "grad_norm": 0.8569478988647461, "learning_rate": 9.958784487410383e-06, "loss": 0.07838236540555954, "memory(GiB)": 19.03, "step": 2736, "token_acc": 0.965, "train_speed(iter/s)": 0.953751 }, { "epoch": 0.0889127115615762, "grad_norm": 1.9784826040267944, "learning_rate": 9.958715631378524e-06, "loss": 0.08132419735193253, "memory(GiB)": 19.03, "step": 2737, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.95381 }, { "epoch": 0.08894519702433161, "grad_norm": 0.7701783180236816, "learning_rate": 9.95864671811649e-06, "loss": 0.07736831903457642, "memory(GiB)": 19.03, "step": 2738, "token_acc": 0.9495798319327731, "train_speed(iter/s)": 0.953864 }, { "epoch": 0.08897768248708703, "grad_norm": 1.0028055906295776, "learning_rate": 9.958577747625071e-06, "loss": 0.08843681216239929, "memory(GiB)": 19.03, "step": 2739, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.953917 }, { "epoch": 0.08901016794984244, "grad_norm": 1.2067004442214966, "learning_rate": 9.958508719905067e-06, "loss": 0.0806068629026413, "memory(GiB)": 19.03, "step": 2740, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.953966 }, { "epoch": 0.08904265341259786, "grad_norm": 0.9109794497489929, "learning_rate": 9.95843963495727e-06, "loss": 0.09703415632247925, "memory(GiB)": 19.03, "step": 2741, "token_acc": 0.96875, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.08907513887535327, "grad_norm": 1.2610015869140625, "learning_rate": 9.958370492782483e-06, "loss": 0.09016942232847214, "memory(GiB)": 19.03, "step": 2742, "token_acc": 0.95703125, "train_speed(iter/s)": 0.954078 }, { "epoch": 0.08910762433810869, "grad_norm": 1.4845083951950073, "learning_rate": 9.958301293381498e-06, "loss": 0.09355151653289795, "memory(GiB)": 19.03, "step": 2743, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.08914010980086412, "grad_norm": 26.247468948364258, "learning_rate": 9.95823203675512e-06, "loss": 0.0852680429816246, "memory(GiB)": 19.03, "step": 2744, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.954188 }, { "epoch": 0.08917259526361954, "grad_norm": 2.965209722518921, "learning_rate": 9.958162722904141e-06, "loss": 0.08307583630084991, "memory(GiB)": 19.03, "step": 2745, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.954235 }, { "epoch": 0.08920508072637495, "grad_norm": 0.8357020020484924, "learning_rate": 9.958093351829368e-06, "loss": 0.08362355828285217, "memory(GiB)": 19.03, "step": 2746, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.954293 }, { "epoch": 0.08923756618913037, "grad_norm": 1.3465385437011719, "learning_rate": 9.958023923531596e-06, "loss": 0.0909222885966301, "memory(GiB)": 19.03, "step": 2747, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.954348 }, { "epoch": 0.08927005165188578, "grad_norm": 0.8535029888153076, "learning_rate": 9.95795443801163e-06, "loss": 0.0915590301156044, "memory(GiB)": 19.03, "step": 2748, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.954402 }, { "epoch": 0.0893025371146412, "grad_norm": 1.0591989755630493, "learning_rate": 9.95788489527027e-06, "loss": 0.07703325897455215, "memory(GiB)": 19.03, "step": 2749, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.954455 }, { "epoch": 0.08933502257739662, "grad_norm": 1.1026452779769897, "learning_rate": 9.957815295308322e-06, "loss": 0.07302553206682205, "memory(GiB)": 19.03, "step": 2750, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.954514 }, { "epoch": 0.08936750804015203, "grad_norm": 1.2151076793670654, "learning_rate": 9.957745638126583e-06, "loss": 0.0914171040058136, "memory(GiB)": 19.03, "step": 2751, "token_acc": 0.9611307420494699, "train_speed(iter/s)": 0.954587 }, { "epoch": 0.08939999350290745, "grad_norm": 1.8260078430175781, "learning_rate": 9.957675923725863e-06, "loss": 0.09049788117408752, "memory(GiB)": 19.03, "step": 2752, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.954663 }, { "epoch": 0.08943247896566286, "grad_norm": 1.2348246574401855, "learning_rate": 9.957606152106964e-06, "loss": 0.09276694059371948, "memory(GiB)": 19.03, "step": 2753, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.08946496442841828, "grad_norm": 1.2014182806015015, "learning_rate": 9.957536323270691e-06, "loss": 0.0871177464723587, "memory(GiB)": 19.03, "step": 2754, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.954808 }, { "epoch": 0.0894974498911737, "grad_norm": 1.3159016370773315, "learning_rate": 9.957466437217851e-06, "loss": 0.08679299801588058, "memory(GiB)": 19.03, "step": 2755, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.954883 }, { "epoch": 0.08952993535392911, "grad_norm": 0.9437525272369385, "learning_rate": 9.957396493949248e-06, "loss": 0.10072062909603119, "memory(GiB)": 19.03, "step": 2756, "token_acc": 0.9590909090909091, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.08956242081668453, "grad_norm": 1.0993291139602661, "learning_rate": 9.957326493465693e-06, "loss": 0.08007986098527908, "memory(GiB)": 19.03, "step": 2757, "token_acc": 0.967032967032967, "train_speed(iter/s)": 0.955025 }, { "epoch": 0.08959490627943995, "grad_norm": 0.5258447527885437, "learning_rate": 9.957256435767992e-06, "loss": 0.061827704310417175, "memory(GiB)": 19.03, "step": 2758, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.08962739174219536, "grad_norm": 1.4087610244750977, "learning_rate": 9.957186320856953e-06, "loss": 0.08946101367473602, "memory(GiB)": 19.03, "step": 2759, "token_acc": 0.9567307692307693, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.08965987720495079, "grad_norm": 0.9848347306251526, "learning_rate": 9.957116148733385e-06, "loss": 0.07394783198833466, "memory(GiB)": 19.03, "step": 2760, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.08969236266770621, "grad_norm": 1.5264239311218262, "learning_rate": 9.9570459193981e-06, "loss": 0.09077748656272888, "memory(GiB)": 19.03, "step": 2761, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.08972484813046162, "grad_norm": 1.2537763118743896, "learning_rate": 9.956975632851908e-06, "loss": 0.07349695265293121, "memory(GiB)": 19.03, "step": 2762, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.08975733359321704, "grad_norm": 1.5803806781768799, "learning_rate": 9.95690528909562e-06, "loss": 0.09296610951423645, "memory(GiB)": 19.03, "step": 2763, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.08978981905597246, "grad_norm": 1.458146095275879, "learning_rate": 9.956834888130045e-06, "loss": 0.06972995400428772, "memory(GiB)": 19.03, "step": 2764, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.08982230451872787, "grad_norm": 1.8768471479415894, "learning_rate": 9.956764429955998e-06, "loss": 0.095486119389534, "memory(GiB)": 19.03, "step": 2765, "token_acc": 0.9437229437229437, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.08985478998148329, "grad_norm": 1.0692239999771118, "learning_rate": 9.956693914574294e-06, "loss": 0.09134198725223541, "memory(GiB)": 19.03, "step": 2766, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.0898872754442387, "grad_norm": 2.3612003326416016, "learning_rate": 9.956623341985743e-06, "loss": 0.07692457735538483, "memory(GiB)": 19.03, "step": 2767, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.08991976090699412, "grad_norm": 1.320365309715271, "learning_rate": 9.956552712191161e-06, "loss": 0.09171493351459503, "memory(GiB)": 19.03, "step": 2768, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.08995224636974954, "grad_norm": 0.6363579034805298, "learning_rate": 9.956482025191366e-06, "loss": 0.08298298716545105, "memory(GiB)": 19.03, "step": 2769, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.08998473183250495, "grad_norm": 2.512437582015991, "learning_rate": 9.95641128098717e-06, "loss": 0.07634911686182022, "memory(GiB)": 19.03, "step": 2770, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.09001721729526037, "grad_norm": 2.6094937324523926, "learning_rate": 9.95634047957939e-06, "loss": 0.08391080796718597, "memory(GiB)": 19.03, "step": 2771, "token_acc": 0.9584775086505191, "train_speed(iter/s)": 0.955843 }, { "epoch": 0.09004970275801578, "grad_norm": 1.518955111503601, "learning_rate": 9.956269620968844e-06, "loss": 0.08194103091955185, "memory(GiB)": 19.03, "step": 2772, "token_acc": 0.9513274336283186, "train_speed(iter/s)": 0.955894 }, { "epoch": 0.0900821882207712, "grad_norm": 0.6205125451087952, "learning_rate": 9.95619870515635e-06, "loss": 0.0744204968214035, "memory(GiB)": 19.03, "step": 2773, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.955948 }, { "epoch": 0.09011467368352662, "grad_norm": 0.8959762454032898, "learning_rate": 9.956127732142728e-06, "loss": 0.07727936655282974, "memory(GiB)": 19.03, "step": 2774, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.09014715914628203, "grad_norm": 1.0751063823699951, "learning_rate": 9.956056701928794e-06, "loss": 0.07818402349948883, "memory(GiB)": 19.03, "step": 2775, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.956061 }, { "epoch": 0.09017964460903746, "grad_norm": 2.3480188846588135, "learning_rate": 9.95598561451537e-06, "loss": 0.07176894694566727, "memory(GiB)": 19.03, "step": 2776, "token_acc": 0.9647577092511013, "train_speed(iter/s)": 0.956117 }, { "epoch": 0.09021213007179288, "grad_norm": 1.077719807624817, "learning_rate": 9.955914469903275e-06, "loss": 0.07867705076932907, "memory(GiB)": 19.03, "step": 2777, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.956174 }, { "epoch": 0.0902446155345483, "grad_norm": 0.7572709918022156, "learning_rate": 9.95584326809333e-06, "loss": 0.0796855241060257, "memory(GiB)": 19.03, "step": 2778, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.956221 }, { "epoch": 0.09027710099730371, "grad_norm": 1.3116176128387451, "learning_rate": 9.95577200908636e-06, "loss": 0.08083476126194, "memory(GiB)": 19.03, "step": 2779, "token_acc": 0.96875, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.09030958646005913, "grad_norm": 1.7053133249282837, "learning_rate": 9.955700692883182e-06, "loss": 0.08988969773054123, "memory(GiB)": 19.03, "step": 2780, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.09034207192281454, "grad_norm": 1.8778178691864014, "learning_rate": 9.955629319484623e-06, "loss": 0.0714363306760788, "memory(GiB)": 19.03, "step": 2781, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.09037455738556996, "grad_norm": 1.3415942192077637, "learning_rate": 9.955557888891505e-06, "loss": 0.09475121647119522, "memory(GiB)": 19.03, "step": 2782, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.09040704284832538, "grad_norm": 0.6515042185783386, "learning_rate": 9.955486401104654e-06, "loss": 0.07341879606246948, "memory(GiB)": 19.03, "step": 2783, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.09043952831108079, "grad_norm": 1.228107213973999, "learning_rate": 9.955414856124893e-06, "loss": 0.0896410346031189, "memory(GiB)": 19.03, "step": 2784, "token_acc": 0.956, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.09047201377383621, "grad_norm": 0.7130407691001892, "learning_rate": 9.955343253953051e-06, "loss": 0.0763707160949707, "memory(GiB)": 19.03, "step": 2785, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.09050449923659162, "grad_norm": 0.6166519522666931, "learning_rate": 9.955271594589953e-06, "loss": 0.07152141630649567, "memory(GiB)": 19.03, "step": 2786, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.09053698469934704, "grad_norm": 0.7553943991661072, "learning_rate": 9.955199878036423e-06, "loss": 0.07396192848682404, "memory(GiB)": 19.03, "step": 2787, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.09056947016210246, "grad_norm": 1.2954671382904053, "learning_rate": 9.955128104293292e-06, "loss": 0.08125832676887512, "memory(GiB)": 19.03, "step": 2788, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.09060195562485787, "grad_norm": 1.0011717081069946, "learning_rate": 9.955056273361388e-06, "loss": 0.07205291092395782, "memory(GiB)": 19.03, "step": 2789, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.09063444108761329, "grad_norm": 0.7249203324317932, "learning_rate": 9.954984385241537e-06, "loss": 0.07568427175283432, "memory(GiB)": 19.03, "step": 2790, "token_acc": 0.976, "train_speed(iter/s)": 0.956939 }, { "epoch": 0.0906669265503687, "grad_norm": 1.8860636949539185, "learning_rate": 9.954912439934574e-06, "loss": 0.0786585658788681, "memory(GiB)": 19.03, "step": 2791, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.956986 }, { "epoch": 0.09069941201312413, "grad_norm": 0.8626765012741089, "learning_rate": 9.954840437441324e-06, "loss": 0.07849402725696564, "memory(GiB)": 19.03, "step": 2792, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.09073189747587955, "grad_norm": 1.6045526266098022, "learning_rate": 9.954768377762623e-06, "loss": 0.08008350431919098, "memory(GiB)": 19.03, "step": 2793, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.09076438293863497, "grad_norm": 0.809001624584198, "learning_rate": 9.954696260899298e-06, "loss": 0.07621605694293976, "memory(GiB)": 19.03, "step": 2794, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.09079686840139038, "grad_norm": 0.6629965305328369, "learning_rate": 9.954624086852185e-06, "loss": 0.07652056962251663, "memory(GiB)": 19.03, "step": 2795, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.0908293538641458, "grad_norm": 0.8973855376243591, "learning_rate": 9.954551855622114e-06, "loss": 0.07369338721036911, "memory(GiB)": 19.03, "step": 2796, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.95726 }, { "epoch": 0.09086183932690121, "grad_norm": 0.9102932214736938, "learning_rate": 9.95447956720992e-06, "loss": 0.07707761973142624, "memory(GiB)": 19.03, "step": 2797, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.09089432478965663, "grad_norm": 0.6395492553710938, "learning_rate": 9.954407221616438e-06, "loss": 0.06831837445497513, "memory(GiB)": 19.03, "step": 2798, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.95737 }, { "epoch": 0.09092681025241205, "grad_norm": 1.0913431644439697, "learning_rate": 9.954334818842503e-06, "loss": 0.07689669728279114, "memory(GiB)": 19.03, "step": 2799, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.09095929571516746, "grad_norm": 1.619471549987793, "learning_rate": 9.954262358888949e-06, "loss": 0.08782881498336792, "memory(GiB)": 19.03, "step": 2800, "token_acc": 0.9518072289156626, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.09099178117792288, "grad_norm": 0.8371376395225525, "learning_rate": 9.954189841756613e-06, "loss": 0.08076002448797226, "memory(GiB)": 19.03, "step": 2801, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.0910242666406783, "grad_norm": 0.8939622640609741, "learning_rate": 9.954117267446335e-06, "loss": 0.07401107251644135, "memory(GiB)": 19.03, "step": 2802, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.09105675210343371, "grad_norm": 1.431161642074585, "learning_rate": 9.954044635958949e-06, "loss": 0.085774265229702, "memory(GiB)": 19.03, "step": 2803, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.957627 }, { "epoch": 0.09108923756618913, "grad_norm": 0.7411792278289795, "learning_rate": 9.95397194729529e-06, "loss": 0.07393805682659149, "memory(GiB)": 19.03, "step": 2804, "token_acc": 0.96, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.09112172302894454, "grad_norm": 1.027295708656311, "learning_rate": 9.953899201456206e-06, "loss": 0.08215205371379852, "memory(GiB)": 19.03, "step": 2805, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.957729 }, { "epoch": 0.09115420849169996, "grad_norm": 0.9658451080322266, "learning_rate": 9.95382639844253e-06, "loss": 0.08155372738838196, "memory(GiB)": 19.03, "step": 2806, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.09118669395445538, "grad_norm": 1.8298982381820679, "learning_rate": 9.953753538255102e-06, "loss": 0.07617634534835815, "memory(GiB)": 19.03, "step": 2807, "token_acc": 0.9819277108433735, "train_speed(iter/s)": 0.957844 }, { "epoch": 0.0912191794172108, "grad_norm": 1.833068609237671, "learning_rate": 9.953680620894767e-06, "loss": 0.0895802229642868, "memory(GiB)": 19.03, "step": 2808, "token_acc": 0.9712918660287081, "train_speed(iter/s)": 0.957912 }, { "epoch": 0.09125166487996622, "grad_norm": 0.7426263689994812, "learning_rate": 9.953607646362363e-06, "loss": 0.07869081199169159, "memory(GiB)": 19.03, "step": 2809, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.09128415034272164, "grad_norm": 0.9140830636024475, "learning_rate": 9.953534614658733e-06, "loss": 0.09240315854549408, "memory(GiB)": 19.03, "step": 2810, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.09131663580547705, "grad_norm": 0.8756877779960632, "learning_rate": 9.953461525784722e-06, "loss": 0.07664790749549866, "memory(GiB)": 19.03, "step": 2811, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.09134912126823247, "grad_norm": 1.1156549453735352, "learning_rate": 9.953388379741171e-06, "loss": 0.07895709574222565, "memory(GiB)": 19.03, "step": 2812, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.09138160673098789, "grad_norm": 0.6726787090301514, "learning_rate": 9.953315176528926e-06, "loss": 0.07912573218345642, "memory(GiB)": 19.03, "step": 2813, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.0914140921937433, "grad_norm": 1.2310439348220825, "learning_rate": 9.95324191614883e-06, "loss": 0.07361370325088501, "memory(GiB)": 19.03, "step": 2814, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.958327 }, { "epoch": 0.09144657765649872, "grad_norm": 0.6821451783180237, "learning_rate": 9.95316859860173e-06, "loss": 0.07504817843437195, "memory(GiB)": 19.03, "step": 2815, "token_acc": 0.976027397260274, "train_speed(iter/s)": 0.958394 }, { "epoch": 0.09147906311925413, "grad_norm": 0.558399498462677, "learning_rate": 9.953095223888471e-06, "loss": 0.07826226949691772, "memory(GiB)": 19.03, "step": 2816, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.958467 }, { "epoch": 0.09151154858200955, "grad_norm": 1.9507142305374146, "learning_rate": 9.953021792009902e-06, "loss": 0.0746583566069603, "memory(GiB)": 19.03, "step": 2817, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.09154403404476497, "grad_norm": 0.8604590892791748, "learning_rate": 9.952948302966869e-06, "loss": 0.07219459116458893, "memory(GiB)": 19.03, "step": 2818, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.09157651950752038, "grad_norm": 1.2098853588104248, "learning_rate": 9.952874756760221e-06, "loss": 0.08454916626214981, "memory(GiB)": 19.03, "step": 2819, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.958676 }, { "epoch": 0.0916090049702758, "grad_norm": 0.8927851319313049, "learning_rate": 9.952801153390802e-06, "loss": 0.07724124193191528, "memory(GiB)": 19.03, "step": 2820, "token_acc": 0.956, "train_speed(iter/s)": 0.958747 }, { "epoch": 0.09164149043303121, "grad_norm": 0.9070430397987366, "learning_rate": 9.952727492859469e-06, "loss": 0.07688956707715988, "memory(GiB)": 19.03, "step": 2821, "token_acc": 0.96875, "train_speed(iter/s)": 0.958802 }, { "epoch": 0.09167397589578663, "grad_norm": 0.7756034731864929, "learning_rate": 9.952653775167069e-06, "loss": 0.06781323254108429, "memory(GiB)": 19.03, "step": 2822, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.958858 }, { "epoch": 0.09170646135854205, "grad_norm": 0.9405665397644043, "learning_rate": 9.95258000031445e-06, "loss": 0.07912502437829971, "memory(GiB)": 19.03, "step": 2823, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 0.95891 }, { "epoch": 0.09173894682129748, "grad_norm": 0.8999767303466797, "learning_rate": 9.952506168302467e-06, "loss": 0.08499065041542053, "memory(GiB)": 19.03, "step": 2824, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.958965 }, { "epoch": 0.09177143228405289, "grad_norm": 0.6307839751243591, "learning_rate": 9.952432279131972e-06, "loss": 0.07260750979185104, "memory(GiB)": 19.03, "step": 2825, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.959021 }, { "epoch": 0.09180391774680831, "grad_norm": 1.2020964622497559, "learning_rate": 9.952358332803815e-06, "loss": 0.08085177838802338, "memory(GiB)": 19.03, "step": 2826, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.959077 }, { "epoch": 0.09183640320956372, "grad_norm": 0.9293944835662842, "learning_rate": 9.952284329318853e-06, "loss": 0.07180030643939972, "memory(GiB)": 19.03, "step": 2827, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.959135 }, { "epoch": 0.09186888867231914, "grad_norm": 0.92210453748703, "learning_rate": 9.952210268677938e-06, "loss": 0.0718270093202591, "memory(GiB)": 19.03, "step": 2828, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.959191 }, { "epoch": 0.09190137413507456, "grad_norm": 0.722338855266571, "learning_rate": 9.952136150881924e-06, "loss": 0.07618753612041473, "memory(GiB)": 19.03, "step": 2829, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.959244 }, { "epoch": 0.09193385959782997, "grad_norm": 0.8401261568069458, "learning_rate": 9.952061975931669e-06, "loss": 0.0768740177154541, "memory(GiB)": 19.03, "step": 2830, "token_acc": 0.9724137931034482, "train_speed(iter/s)": 0.959293 }, { "epoch": 0.09196634506058539, "grad_norm": 0.8169196844100952, "learning_rate": 9.951987743828027e-06, "loss": 0.08270107209682465, "memory(GiB)": 19.03, "step": 2831, "token_acc": 0.9695652173913043, "train_speed(iter/s)": 0.959348 }, { "epoch": 0.0919988305233408, "grad_norm": 0.740222156047821, "learning_rate": 9.951913454571855e-06, "loss": 0.07898068428039551, "memory(GiB)": 19.03, "step": 2832, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.959403 }, { "epoch": 0.09203131598609622, "grad_norm": 1.2475254535675049, "learning_rate": 9.951839108164012e-06, "loss": 0.08500777930021286, "memory(GiB)": 19.03, "step": 2833, "token_acc": 0.944, "train_speed(iter/s)": 0.959459 }, { "epoch": 0.09206380144885164, "grad_norm": 1.2469408512115479, "learning_rate": 9.951764704605353e-06, "loss": 0.09006454050540924, "memory(GiB)": 19.03, "step": 2834, "token_acc": 0.945054945054945, "train_speed(iter/s)": 0.959511 }, { "epoch": 0.09209628691160705, "grad_norm": 1.349714994430542, "learning_rate": 9.95169024389674e-06, "loss": 0.0743512362241745, "memory(GiB)": 19.03, "step": 2835, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.959562 }, { "epoch": 0.09212877237436247, "grad_norm": 1.083499789237976, "learning_rate": 9.95161572603903e-06, "loss": 0.08757691085338593, "memory(GiB)": 19.03, "step": 2836, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.959614 }, { "epoch": 0.09216125783711789, "grad_norm": 0.6857799887657166, "learning_rate": 9.951541151033084e-06, "loss": 0.07533463090658188, "memory(GiB)": 19.03, "step": 2837, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.959665 }, { "epoch": 0.0921937432998733, "grad_norm": 0.7474648952484131, "learning_rate": 9.951466518879763e-06, "loss": 0.06969520449638367, "memory(GiB)": 19.03, "step": 2838, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.959716 }, { "epoch": 0.09222622876262872, "grad_norm": 0.787670373916626, "learning_rate": 9.951391829579928e-06, "loss": 0.09190666675567627, "memory(GiB)": 19.03, "step": 2839, "token_acc": 0.9398148148148148, "train_speed(iter/s)": 0.959759 }, { "epoch": 0.09225871422538415, "grad_norm": 1.6345640420913696, "learning_rate": 9.951317083134441e-06, "loss": 0.08813174068927765, "memory(GiB)": 19.03, "step": 2840, "token_acc": 0.9550561797752809, "train_speed(iter/s)": 0.959811 }, { "epoch": 0.09229119968813956, "grad_norm": 0.5319864749908447, "learning_rate": 9.951242279544165e-06, "loss": 0.07711634039878845, "memory(GiB)": 19.03, "step": 2841, "token_acc": 0.9673469387755103, "train_speed(iter/s)": 0.959866 }, { "epoch": 0.09232368515089498, "grad_norm": 1.3530532121658325, "learning_rate": 9.951167418809962e-06, "loss": 0.0833684504032135, "memory(GiB)": 19.03, "step": 2842, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.959927 }, { "epoch": 0.0923561706136504, "grad_norm": 0.7608046531677246, "learning_rate": 9.951092500932698e-06, "loss": 0.07111599296331406, "memory(GiB)": 19.03, "step": 2843, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.959995 }, { "epoch": 0.09238865607640581, "grad_norm": 1.109020709991455, "learning_rate": 9.951017525913236e-06, "loss": 0.08334846794605255, "memory(GiB)": 19.03, "step": 2844, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.96006 }, { "epoch": 0.09242114153916123, "grad_norm": 0.7367124557495117, "learning_rate": 9.950942493752442e-06, "loss": 0.07934638857841492, "memory(GiB)": 19.03, "step": 2845, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.960125 }, { "epoch": 0.09245362700191664, "grad_norm": 0.8627714514732361, "learning_rate": 9.950867404451183e-06, "loss": 0.08036893606185913, "memory(GiB)": 19.03, "step": 2846, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.960193 }, { "epoch": 0.09248611246467206, "grad_norm": 0.6657738089561462, "learning_rate": 9.950792258010323e-06, "loss": 0.0807446539402008, "memory(GiB)": 19.03, "step": 2847, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.960247 }, { "epoch": 0.09251859792742748, "grad_norm": 1.3379111289978027, "learning_rate": 9.950717054430731e-06, "loss": 0.09501496702432632, "memory(GiB)": 19.03, "step": 2848, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.960295 }, { "epoch": 0.09255108339018289, "grad_norm": 1.1429849863052368, "learning_rate": 9.950641793713277e-06, "loss": 0.08007878065109253, "memory(GiB)": 19.03, "step": 2849, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.960346 }, { "epoch": 0.09258356885293831, "grad_norm": 0.7449871301651001, "learning_rate": 9.950566475858825e-06, "loss": 0.07272373139858246, "memory(GiB)": 19.03, "step": 2850, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.960397 }, { "epoch": 0.09261605431569372, "grad_norm": 1.3514176607131958, "learning_rate": 9.950491100868247e-06, "loss": 0.07511929422616959, "memory(GiB)": 19.03, "step": 2851, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.960444 }, { "epoch": 0.09264853977844914, "grad_norm": 0.6254976391792297, "learning_rate": 9.950415668742412e-06, "loss": 0.08171546459197998, "memory(GiB)": 19.03, "step": 2852, "token_acc": 0.95, "train_speed(iter/s)": 0.960492 }, { "epoch": 0.09268102524120456, "grad_norm": 0.9410030841827393, "learning_rate": 9.950340179482193e-06, "loss": 0.07102398574352264, "memory(GiB)": 19.03, "step": 2853, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.960518 }, { "epoch": 0.09271351070395997, "grad_norm": 1.0777647495269775, "learning_rate": 9.95026463308846e-06, "loss": 0.08644836395978928, "memory(GiB)": 19.03, "step": 2854, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.960563 }, { "epoch": 0.09274599616671539, "grad_norm": 1.1439927816390991, "learning_rate": 9.950189029562082e-06, "loss": 0.08094671368598938, "memory(GiB)": 19.03, "step": 2855, "token_acc": 0.96875, "train_speed(iter/s)": 0.960616 }, { "epoch": 0.09277848162947082, "grad_norm": 1.010550618171692, "learning_rate": 9.950113368903935e-06, "loss": 0.08523768931627274, "memory(GiB)": 19.03, "step": 2856, "token_acc": 0.950530035335689, "train_speed(iter/s)": 0.960668 }, { "epoch": 0.09281096709222623, "grad_norm": 0.6447160840034485, "learning_rate": 9.95003765111489e-06, "loss": 0.08771918714046478, "memory(GiB)": 19.03, "step": 2857, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.960715 }, { "epoch": 0.09284345255498165, "grad_norm": 0.9120330810546875, "learning_rate": 9.949961876195824e-06, "loss": 0.08989525586366653, "memory(GiB)": 19.03, "step": 2858, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.960768 }, { "epoch": 0.09287593801773707, "grad_norm": 0.6401166319847107, "learning_rate": 9.949886044147607e-06, "loss": 0.0754464864730835, "memory(GiB)": 19.03, "step": 2859, "token_acc": 0.9679144385026738, "train_speed(iter/s)": 0.960816 }, { "epoch": 0.09290842348049248, "grad_norm": 0.5819965600967407, "learning_rate": 9.94981015497112e-06, "loss": 0.07079485803842545, "memory(GiB)": 19.03, "step": 2860, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.960869 }, { "epoch": 0.0929409089432479, "grad_norm": 0.5416539907455444, "learning_rate": 9.949734208667234e-06, "loss": 0.07937707751989365, "memory(GiB)": 19.03, "step": 2861, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.960917 }, { "epoch": 0.09297339440600332, "grad_norm": 1.0611252784729004, "learning_rate": 9.949658205236828e-06, "loss": 0.09191697835922241, "memory(GiB)": 19.03, "step": 2862, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.960963 }, { "epoch": 0.09300587986875873, "grad_norm": 1.7241549491882324, "learning_rate": 9.949582144680776e-06, "loss": 0.09257610142230988, "memory(GiB)": 19.03, "step": 2863, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.961011 }, { "epoch": 0.09303836533151415, "grad_norm": 0.7798686623573303, "learning_rate": 9.94950602699996e-06, "loss": 0.08762842416763306, "memory(GiB)": 19.03, "step": 2864, "token_acc": 0.9563106796116505, "train_speed(iter/s)": 0.961081 }, { "epoch": 0.09307085079426956, "grad_norm": 1.1415677070617676, "learning_rate": 9.949429852195257e-06, "loss": 0.0691278949379921, "memory(GiB)": 19.03, "step": 2865, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.961149 }, { "epoch": 0.09310333625702498, "grad_norm": 0.8177337646484375, "learning_rate": 9.949353620267545e-06, "loss": 0.07526298612356186, "memory(GiB)": 19.03, "step": 2866, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.961218 }, { "epoch": 0.0931358217197804, "grad_norm": 0.6755644679069519, "learning_rate": 9.949277331217704e-06, "loss": 0.07971079647541046, "memory(GiB)": 19.03, "step": 2867, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.961285 }, { "epoch": 0.09316830718253581, "grad_norm": 0.8763577938079834, "learning_rate": 9.949200985046617e-06, "loss": 0.08403130620718002, "memory(GiB)": 19.03, "step": 2868, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.961353 }, { "epoch": 0.09320079264529123, "grad_norm": 0.5792109966278076, "learning_rate": 9.949124581755162e-06, "loss": 0.0715545117855072, "memory(GiB)": 19.03, "step": 2869, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.961413 }, { "epoch": 0.09323327810804664, "grad_norm": 0.6531532406806946, "learning_rate": 9.949048121344223e-06, "loss": 0.06848738342523575, "memory(GiB)": 19.03, "step": 2870, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.961474 }, { "epoch": 0.09326576357080206, "grad_norm": 0.6039267778396606, "learning_rate": 9.948971603814678e-06, "loss": 0.07365638017654419, "memory(GiB)": 19.03, "step": 2871, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.961531 }, { "epoch": 0.09329824903355749, "grad_norm": 0.995461106300354, "learning_rate": 9.948895029167419e-06, "loss": 0.09442679584026337, "memory(GiB)": 19.03, "step": 2872, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.961597 }, { "epoch": 0.0933307344963129, "grad_norm": 0.7517752647399902, "learning_rate": 9.948818397403322e-06, "loss": 0.07743117213249207, "memory(GiB)": 19.03, "step": 2873, "token_acc": 0.953307392996109, "train_speed(iter/s)": 0.961665 }, { "epoch": 0.09336321995906832, "grad_norm": 1.2460436820983887, "learning_rate": 9.948741708523273e-06, "loss": 0.08798260986804962, "memory(GiB)": 19.03, "step": 2874, "token_acc": 0.9535864978902954, "train_speed(iter/s)": 0.961728 }, { "epoch": 0.09339570542182374, "grad_norm": 0.965196967124939, "learning_rate": 9.948664962528158e-06, "loss": 0.06645718216896057, "memory(GiB)": 19.03, "step": 2875, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.961796 }, { "epoch": 0.09342819088457915, "grad_norm": 0.7978284955024719, "learning_rate": 9.948588159418864e-06, "loss": 0.07772055268287659, "memory(GiB)": 19.03, "step": 2876, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.961862 }, { "epoch": 0.09346067634733457, "grad_norm": 0.9030027389526367, "learning_rate": 9.948511299196275e-06, "loss": 0.07629594206809998, "memory(GiB)": 19.03, "step": 2877, "token_acc": 0.9585062240663901, "train_speed(iter/s)": 0.961932 }, { "epoch": 0.09349316181008999, "grad_norm": 1.074485182762146, "learning_rate": 9.948434381861278e-06, "loss": 0.08685775101184845, "memory(GiB)": 19.03, "step": 2878, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.961999 }, { "epoch": 0.0935256472728454, "grad_norm": 1.4217123985290527, "learning_rate": 9.948357407414764e-06, "loss": 0.07681125402450562, "memory(GiB)": 19.03, "step": 2879, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.962068 }, { "epoch": 0.09355813273560082, "grad_norm": 0.7941718697547913, "learning_rate": 9.948280375857619e-06, "loss": 0.07327406108379364, "memory(GiB)": 19.03, "step": 2880, "token_acc": 0.9679144385026738, "train_speed(iter/s)": 0.962118 }, { "epoch": 0.09359061819835623, "grad_norm": 0.8585439920425415, "learning_rate": 9.948203287190731e-06, "loss": 0.08316251635551453, "memory(GiB)": 19.03, "step": 2881, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.962171 }, { "epoch": 0.09362310366111165, "grad_norm": 1.1215320825576782, "learning_rate": 9.948126141414993e-06, "loss": 0.09156983345746994, "memory(GiB)": 19.03, "step": 2882, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.962223 }, { "epoch": 0.09365558912386707, "grad_norm": 1.9011329412460327, "learning_rate": 9.948048938531291e-06, "loss": 0.08549679815769196, "memory(GiB)": 19.03, "step": 2883, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.962276 }, { "epoch": 0.09368807458662248, "grad_norm": 0.8504839539527893, "learning_rate": 9.94797167854052e-06, "loss": 0.08548856526613235, "memory(GiB)": 19.03, "step": 2884, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.962326 }, { "epoch": 0.0937205600493779, "grad_norm": 0.5975990295410156, "learning_rate": 9.94789436144357e-06, "loss": 0.06806392967700958, "memory(GiB)": 19.03, "step": 2885, "token_acc": 0.965, "train_speed(iter/s)": 0.96238 }, { "epoch": 0.09375304551213332, "grad_norm": 0.6708632707595825, "learning_rate": 9.947816987241334e-06, "loss": 0.0849432423710823, "memory(GiB)": 19.03, "step": 2886, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.962432 }, { "epoch": 0.09378553097488873, "grad_norm": 0.7404010891914368, "learning_rate": 9.947739555934703e-06, "loss": 0.08668317645788193, "memory(GiB)": 19.03, "step": 2887, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.962479 }, { "epoch": 0.09381801643764416, "grad_norm": 0.5843888521194458, "learning_rate": 9.947662067524574e-06, "loss": 0.06153935566544533, "memory(GiB)": 19.03, "step": 2888, "token_acc": 0.9623655913978495, "train_speed(iter/s)": 0.962532 }, { "epoch": 0.09385050190039958, "grad_norm": 0.8285689949989319, "learning_rate": 9.947584522011837e-06, "loss": 0.05650540068745613, "memory(GiB)": 19.03, "step": 2889, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.96258 }, { "epoch": 0.093882987363155, "grad_norm": 0.6882195472717285, "learning_rate": 9.947506919397392e-06, "loss": 0.07210970669984818, "memory(GiB)": 19.03, "step": 2890, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.962628 }, { "epoch": 0.09391547282591041, "grad_norm": 1.0450001955032349, "learning_rate": 9.947429259682131e-06, "loss": 0.07443077862262726, "memory(GiB)": 19.03, "step": 2891, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.962682 }, { "epoch": 0.09394795828866583, "grad_norm": 1.190226674079895, "learning_rate": 9.94735154286695e-06, "loss": 0.0804421603679657, "memory(GiB)": 19.03, "step": 2892, "token_acc": 0.9577464788732394, "train_speed(iter/s)": 0.962734 }, { "epoch": 0.09398044375142124, "grad_norm": 1.2554802894592285, "learning_rate": 9.947273768952749e-06, "loss": 0.08608561754226685, "memory(GiB)": 19.03, "step": 2893, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.962788 }, { "epoch": 0.09401292921417666, "grad_norm": 1.3040193319320679, "learning_rate": 9.947195937940423e-06, "loss": 0.07958008348941803, "memory(GiB)": 19.03, "step": 2894, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.962843 }, { "epoch": 0.09404541467693207, "grad_norm": 0.8479582667350769, "learning_rate": 9.947118049830871e-06, "loss": 0.09094032645225525, "memory(GiB)": 19.03, "step": 2895, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.962899 }, { "epoch": 0.09407790013968749, "grad_norm": 1.1358628273010254, "learning_rate": 9.947040104624992e-06, "loss": 0.10023113340139389, "memory(GiB)": 19.03, "step": 2896, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.96295 }, { "epoch": 0.0941103856024429, "grad_norm": 1.3889098167419434, "learning_rate": 9.946962102323687e-06, "loss": 0.07505187392234802, "memory(GiB)": 19.03, "step": 2897, "token_acc": 0.9436619718309859, "train_speed(iter/s)": 0.962999 }, { "epoch": 0.09414287106519832, "grad_norm": 0.692681610584259, "learning_rate": 9.946884042927854e-06, "loss": 0.07020288705825806, "memory(GiB)": 19.03, "step": 2898, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.963039 }, { "epoch": 0.09417535652795374, "grad_norm": 1.6747711896896362, "learning_rate": 9.946805926438395e-06, "loss": 0.0720566064119339, "memory(GiB)": 19.03, "step": 2899, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.963089 }, { "epoch": 0.09420784199070915, "grad_norm": 1.036643385887146, "learning_rate": 9.94672775285621e-06, "loss": 0.07855640351772308, "memory(GiB)": 19.03, "step": 2900, "token_acc": 0.9612676056338029, "train_speed(iter/s)": 0.963142 }, { "epoch": 0.09424032745346457, "grad_norm": 1.0488319396972656, "learning_rate": 9.946649522182205e-06, "loss": 0.08830864727497101, "memory(GiB)": 19.03, "step": 2901, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.963207 }, { "epoch": 0.09427281291621999, "grad_norm": 0.56574946641922, "learning_rate": 9.94657123441728e-06, "loss": 0.06714038550853729, "memory(GiB)": 19.03, "step": 2902, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.963274 }, { "epoch": 0.0943052983789754, "grad_norm": 0.4882464110851288, "learning_rate": 9.946492889562338e-06, "loss": 0.06449103355407715, "memory(GiB)": 19.03, "step": 2903, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.96333 }, { "epoch": 0.09433778384173083, "grad_norm": 0.4440561532974243, "learning_rate": 9.946414487618286e-06, "loss": 0.06139367073774338, "memory(GiB)": 19.03, "step": 2904, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.963375 }, { "epoch": 0.09437026930448625, "grad_norm": 0.5683476328849792, "learning_rate": 9.946336028586025e-06, "loss": 0.07166112959384918, "memory(GiB)": 19.03, "step": 2905, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.963421 }, { "epoch": 0.09440275476724166, "grad_norm": 1.1134397983551025, "learning_rate": 9.946257512466464e-06, "loss": 0.08929745852947235, "memory(GiB)": 19.03, "step": 2906, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.963471 }, { "epoch": 0.09443524022999708, "grad_norm": 4.6677398681640625, "learning_rate": 9.946178939260508e-06, "loss": 0.0830373466014862, "memory(GiB)": 19.03, "step": 2907, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.96352 }, { "epoch": 0.0944677256927525, "grad_norm": 0.7378718852996826, "learning_rate": 9.946100308969064e-06, "loss": 0.07401490211486816, "memory(GiB)": 19.03, "step": 2908, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.963566 }, { "epoch": 0.09450021115550791, "grad_norm": 0.6659077405929565, "learning_rate": 9.946021621593038e-06, "loss": 0.07426668703556061, "memory(GiB)": 19.03, "step": 2909, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.963615 }, { "epoch": 0.09453269661826333, "grad_norm": 0.9727808237075806, "learning_rate": 9.94594287713334e-06, "loss": 0.09046545624732971, "memory(GiB)": 19.03, "step": 2910, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.963664 }, { "epoch": 0.09456518208101874, "grad_norm": 1.0457602739334106, "learning_rate": 9.945864075590878e-06, "loss": 0.06783707439899445, "memory(GiB)": 19.03, "step": 2911, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.96371 }, { "epoch": 0.09459766754377416, "grad_norm": 0.8904719948768616, "learning_rate": 9.94578521696656e-06, "loss": 0.0790492445230484, "memory(GiB)": 19.03, "step": 2912, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.963762 }, { "epoch": 0.09463015300652958, "grad_norm": 0.621700644493103, "learning_rate": 9.945706301261301e-06, "loss": 0.08201710879802704, "memory(GiB)": 19.03, "step": 2913, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.963809 }, { "epoch": 0.094662638469285, "grad_norm": 0.7437366843223572, "learning_rate": 9.945627328476006e-06, "loss": 0.08306287974119186, "memory(GiB)": 19.03, "step": 2914, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.963857 }, { "epoch": 0.09469512393204041, "grad_norm": 0.6852102875709534, "learning_rate": 9.945548298611592e-06, "loss": 0.06746922433376312, "memory(GiB)": 19.03, "step": 2915, "token_acc": 0.9706959706959707, "train_speed(iter/s)": 0.963901 }, { "epoch": 0.09472760939479583, "grad_norm": 0.7017092108726501, "learning_rate": 9.945469211668965e-06, "loss": 0.0843857005238533, "memory(GiB)": 19.03, "step": 2916, "token_acc": 0.9456066945606695, "train_speed(iter/s)": 0.963952 }, { "epoch": 0.09476009485755124, "grad_norm": 0.735157310962677, "learning_rate": 9.945390067649041e-06, "loss": 0.08294130116701126, "memory(GiB)": 19.03, "step": 2917, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.964002 }, { "epoch": 0.09479258032030666, "grad_norm": 0.6251114010810852, "learning_rate": 9.945310866552735e-06, "loss": 0.07651315629482269, "memory(GiB)": 19.03, "step": 2918, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.964049 }, { "epoch": 0.09482506578306207, "grad_norm": 0.7557598948478699, "learning_rate": 9.94523160838096e-06, "loss": 0.06952314078807831, "memory(GiB)": 19.03, "step": 2919, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.964098 }, { "epoch": 0.0948575512458175, "grad_norm": 0.661461591720581, "learning_rate": 9.945152293134628e-06, "loss": 0.07374878972768784, "memory(GiB)": 19.03, "step": 2920, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.964153 }, { "epoch": 0.09489003670857292, "grad_norm": 0.7991959452629089, "learning_rate": 9.945072920814658e-06, "loss": 0.07821877300739288, "memory(GiB)": 19.03, "step": 2921, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.964215 }, { "epoch": 0.09492252217132834, "grad_norm": 0.7274643182754517, "learning_rate": 9.944993491421963e-06, "loss": 0.06891559064388275, "memory(GiB)": 19.03, "step": 2922, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.964275 }, { "epoch": 0.09495500763408375, "grad_norm": 0.6767438054084778, "learning_rate": 9.944914004957462e-06, "loss": 0.08941908180713654, "memory(GiB)": 19.03, "step": 2923, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.964332 }, { "epoch": 0.09498749309683917, "grad_norm": 1.427422046661377, "learning_rate": 9.944834461422072e-06, "loss": 0.07917185872793198, "memory(GiB)": 19.03, "step": 2924, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.964399 }, { "epoch": 0.09501997855959458, "grad_norm": 1.187018871307373, "learning_rate": 9.944754860816712e-06, "loss": 0.07326343655586243, "memory(GiB)": 19.03, "step": 2925, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.964465 }, { "epoch": 0.09505246402235, "grad_norm": 0.6733173727989197, "learning_rate": 9.944675203142297e-06, "loss": 0.06600278615951538, "memory(GiB)": 19.03, "step": 2926, "token_acc": 0.9740740740740741, "train_speed(iter/s)": 0.964527 }, { "epoch": 0.09508494948510542, "grad_norm": 1.1546334028244019, "learning_rate": 9.94459548839975e-06, "loss": 0.060829758644104004, "memory(GiB)": 19.03, "step": 2927, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.964589 }, { "epoch": 0.09511743494786083, "grad_norm": 0.7722840309143066, "learning_rate": 9.94451571658999e-06, "loss": 0.07714591175317764, "memory(GiB)": 19.03, "step": 2928, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.964646 }, { "epoch": 0.09514992041061625, "grad_norm": 0.6664137840270996, "learning_rate": 9.944435887713939e-06, "loss": 0.07660181820392609, "memory(GiB)": 19.03, "step": 2929, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.96471 }, { "epoch": 0.09518240587337166, "grad_norm": 0.693915843963623, "learning_rate": 9.944356001772515e-06, "loss": 0.0764828473329544, "memory(GiB)": 19.03, "step": 2930, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.964776 }, { "epoch": 0.09521489133612708, "grad_norm": 0.6832191348075867, "learning_rate": 9.944276058766643e-06, "loss": 0.07546588778495789, "memory(GiB)": 19.03, "step": 2931, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.964838 }, { "epoch": 0.0952473767988825, "grad_norm": 0.975719153881073, "learning_rate": 9.944196058697241e-06, "loss": 0.0840032696723938, "memory(GiB)": 19.03, "step": 2932, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.964897 }, { "epoch": 0.09527986226163791, "grad_norm": 1.0110855102539062, "learning_rate": 9.944116001565241e-06, "loss": 0.07295078039169312, "memory(GiB)": 19.03, "step": 2933, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.964959 }, { "epoch": 0.09531234772439333, "grad_norm": 0.7052246332168579, "learning_rate": 9.944035887371559e-06, "loss": 0.06831495463848114, "memory(GiB)": 19.03, "step": 2934, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.965022 }, { "epoch": 0.09534483318714874, "grad_norm": 0.9394789934158325, "learning_rate": 9.943955716117123e-06, "loss": 0.0894884392619133, "memory(GiB)": 19.03, "step": 2935, "token_acc": 0.96, "train_speed(iter/s)": 0.965086 }, { "epoch": 0.09537731864990417, "grad_norm": 1.5132215023040771, "learning_rate": 9.943875487802857e-06, "loss": 0.08142434060573578, "memory(GiB)": 19.03, "step": 2936, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.96515 }, { "epoch": 0.09540980411265959, "grad_norm": 0.8003035187721252, "learning_rate": 9.943795202429688e-06, "loss": 0.08746936917304993, "memory(GiB)": 19.03, "step": 2937, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.965215 }, { "epoch": 0.095442289575415, "grad_norm": 0.5889090299606323, "learning_rate": 9.943714859998541e-06, "loss": 0.07307077199220657, "memory(GiB)": 19.03, "step": 2938, "token_acc": 0.972027972027972, "train_speed(iter/s)": 0.965279 }, { "epoch": 0.09547477503817042, "grad_norm": 0.5290942192077637, "learning_rate": 9.943634460510345e-06, "loss": 0.06329327076673508, "memory(GiB)": 19.03, "step": 2939, "token_acc": 0.9658536585365853, "train_speed(iter/s)": 0.965328 }, { "epoch": 0.09550726050092584, "grad_norm": 0.7486604452133179, "learning_rate": 9.943554003966027e-06, "loss": 0.0638146698474884, "memory(GiB)": 19.03, "step": 2940, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.965374 }, { "epoch": 0.09553974596368126, "grad_norm": 0.6698053479194641, "learning_rate": 9.943473490366518e-06, "loss": 0.07355813682079315, "memory(GiB)": 19.03, "step": 2941, "token_acc": 0.960352422907489, "train_speed(iter/s)": 0.965424 }, { "epoch": 0.09557223142643667, "grad_norm": 1.2841564416885376, "learning_rate": 9.943392919712742e-06, "loss": 0.07507658749818802, "memory(GiB)": 19.03, "step": 2942, "token_acc": 0.9534883720930233, "train_speed(iter/s)": 0.965474 }, { "epoch": 0.09560471688919209, "grad_norm": 0.6845530867576599, "learning_rate": 9.943312292005635e-06, "loss": 0.07565230876207352, "memory(GiB)": 19.03, "step": 2943, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.965526 }, { "epoch": 0.0956372023519475, "grad_norm": 0.683408796787262, "learning_rate": 9.943231607246122e-06, "loss": 0.07406693696975708, "memory(GiB)": 19.03, "step": 2944, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.965578 }, { "epoch": 0.09566968781470292, "grad_norm": 0.730289101600647, "learning_rate": 9.94315086543514e-06, "loss": 0.08212918043136597, "memory(GiB)": 19.03, "step": 2945, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.96563 }, { "epoch": 0.09570217327745834, "grad_norm": 0.8498369455337524, "learning_rate": 9.943070066573614e-06, "loss": 0.07196767628192902, "memory(GiB)": 19.03, "step": 2946, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.965665 }, { "epoch": 0.09573465874021375, "grad_norm": 1.1198192834854126, "learning_rate": 9.942989210662484e-06, "loss": 0.08750639855861664, "memory(GiB)": 19.03, "step": 2947, "token_acc": 0.9765625, "train_speed(iter/s)": 0.965711 }, { "epoch": 0.09576714420296917, "grad_norm": 0.8243756890296936, "learning_rate": 9.942908297702677e-06, "loss": 0.08542798459529877, "memory(GiB)": 19.03, "step": 2948, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.965758 }, { "epoch": 0.09579962966572458, "grad_norm": 0.7374327778816223, "learning_rate": 9.94282732769513e-06, "loss": 0.08442467451095581, "memory(GiB)": 19.03, "step": 2949, "token_acc": 0.9765625, "train_speed(iter/s)": 0.965807 }, { "epoch": 0.09583211512848, "grad_norm": 0.6271401047706604, "learning_rate": 9.942746300640777e-06, "loss": 0.06972265988588333, "memory(GiB)": 19.03, "step": 2950, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.965846 }, { "epoch": 0.09586460059123542, "grad_norm": 0.919545590877533, "learning_rate": 9.942665216540552e-06, "loss": 0.07881011068820953, "memory(GiB)": 19.03, "step": 2951, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.965895 }, { "epoch": 0.09589708605399085, "grad_norm": 0.75862056016922, "learning_rate": 9.942584075395392e-06, "loss": 0.07230202108621597, "memory(GiB)": 19.03, "step": 2952, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.965947 }, { "epoch": 0.09592957151674626, "grad_norm": 0.8485084176063538, "learning_rate": 9.942502877206233e-06, "loss": 0.08565931767225266, "memory(GiB)": 19.03, "step": 2953, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.965999 }, { "epoch": 0.09596205697950168, "grad_norm": 0.8259167075157166, "learning_rate": 9.942421621974014e-06, "loss": 0.07010084390640259, "memory(GiB)": 19.03, "step": 2954, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.966053 }, { "epoch": 0.0959945424422571, "grad_norm": 0.49308228492736816, "learning_rate": 9.94234030969967e-06, "loss": 0.07030892372131348, "memory(GiB)": 19.03, "step": 2955, "token_acc": 0.98046875, "train_speed(iter/s)": 0.966103 }, { "epoch": 0.09602702790501251, "grad_norm": 0.722210168838501, "learning_rate": 9.94225894038414e-06, "loss": 0.07911556959152222, "memory(GiB)": 19.03, "step": 2956, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.96615 }, { "epoch": 0.09605951336776793, "grad_norm": 0.7364451289176941, "learning_rate": 9.942177514028364e-06, "loss": 0.05630648136138916, "memory(GiB)": 19.03, "step": 2957, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.966192 }, { "epoch": 0.09609199883052334, "grad_norm": 1.308759093284607, "learning_rate": 9.942096030633282e-06, "loss": 0.07474972307682037, "memory(GiB)": 19.03, "step": 2958, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.966241 }, { "epoch": 0.09612448429327876, "grad_norm": 0.9737496376037598, "learning_rate": 9.942014490199834e-06, "loss": 0.07658389955759048, "memory(GiB)": 19.03, "step": 2959, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.96629 }, { "epoch": 0.09615696975603417, "grad_norm": 0.7594196200370789, "learning_rate": 9.94193289272896e-06, "loss": 0.07294897735118866, "memory(GiB)": 19.03, "step": 2960, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.966338 }, { "epoch": 0.09618945521878959, "grad_norm": 1.1400505304336548, "learning_rate": 9.941851238221602e-06, "loss": 0.07369302213191986, "memory(GiB)": 19.03, "step": 2961, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.966382 }, { "epoch": 0.096221940681545, "grad_norm": 0.6664777398109436, "learning_rate": 9.941769526678706e-06, "loss": 0.0711243599653244, "memory(GiB)": 19.03, "step": 2962, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.966436 }, { "epoch": 0.09625442614430042, "grad_norm": 0.6449641585350037, "learning_rate": 9.941687758101211e-06, "loss": 0.06796813756227493, "memory(GiB)": 19.03, "step": 2963, "token_acc": 0.96484375, "train_speed(iter/s)": 0.966482 }, { "epoch": 0.09628691160705584, "grad_norm": 0.608903706073761, "learning_rate": 9.94160593249006e-06, "loss": 0.07043834030628204, "memory(GiB)": 19.03, "step": 2964, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.966529 }, { "epoch": 0.09631939706981126, "grad_norm": 0.4976212680339813, "learning_rate": 9.9415240498462e-06, "loss": 0.0676230788230896, "memory(GiB)": 19.03, "step": 2965, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.966574 }, { "epoch": 0.09635188253256667, "grad_norm": 1.3830174207687378, "learning_rate": 9.941442110170578e-06, "loss": 0.07179993391036987, "memory(GiB)": 19.03, "step": 2966, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.966627 }, { "epoch": 0.09638436799532209, "grad_norm": 0.533056914806366, "learning_rate": 9.941360113464134e-06, "loss": 0.06730122864246368, "memory(GiB)": 19.03, "step": 2967, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.96667 }, { "epoch": 0.09641685345807752, "grad_norm": 0.674015998840332, "learning_rate": 9.941278059727819e-06, "loss": 0.08426119387149811, "memory(GiB)": 19.03, "step": 2968, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.966709 }, { "epoch": 0.09644933892083293, "grad_norm": 0.7406248450279236, "learning_rate": 9.941195948962578e-06, "loss": 0.07938981801271439, "memory(GiB)": 19.03, "step": 2969, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.966759 }, { "epoch": 0.09648182438358835, "grad_norm": 0.7298331260681152, "learning_rate": 9.941113781169359e-06, "loss": 0.06397716701030731, "memory(GiB)": 19.03, "step": 2970, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.966808 }, { "epoch": 0.09651430984634377, "grad_norm": 0.5975102782249451, "learning_rate": 9.94103155634911e-06, "loss": 0.07394979894161224, "memory(GiB)": 19.03, "step": 2971, "token_acc": 0.9586466165413534, "train_speed(iter/s)": 0.966851 }, { "epoch": 0.09654679530909918, "grad_norm": 0.5696817636489868, "learning_rate": 9.94094927450278e-06, "loss": 0.06897443532943726, "memory(GiB)": 19.03, "step": 2972, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.966899 }, { "epoch": 0.0965792807718546, "grad_norm": 0.9271450638771057, "learning_rate": 9.94086693563132e-06, "loss": 0.08550567924976349, "memory(GiB)": 19.03, "step": 2973, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.966951 }, { "epoch": 0.09661176623461001, "grad_norm": 0.8793708682060242, "learning_rate": 9.940784539735678e-06, "loss": 0.09441708773374557, "memory(GiB)": 19.03, "step": 2974, "token_acc": 0.9644128113879004, "train_speed(iter/s)": 0.966992 }, { "epoch": 0.09664425169736543, "grad_norm": 0.7006412148475647, "learning_rate": 9.940702086816806e-06, "loss": 0.07137378305196762, "memory(GiB)": 19.03, "step": 2975, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.967039 }, { "epoch": 0.09667673716012085, "grad_norm": 0.735395073890686, "learning_rate": 9.94061957687566e-06, "loss": 0.07861697673797607, "memory(GiB)": 19.03, "step": 2976, "token_acc": 0.9511278195488722, "train_speed(iter/s)": 0.967081 }, { "epoch": 0.09670922262287626, "grad_norm": 0.8283094167709351, "learning_rate": 9.940537009913183e-06, "loss": 0.07882337272167206, "memory(GiB)": 19.03, "step": 2977, "token_acc": 0.9580645161290322, "train_speed(iter/s)": 0.967137 }, { "epoch": 0.09674170808563168, "grad_norm": 0.5823056697845459, "learning_rate": 9.940454385930335e-06, "loss": 0.07409951090812683, "memory(GiB)": 19.03, "step": 2978, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.967201 }, { "epoch": 0.0967741935483871, "grad_norm": 0.4770132303237915, "learning_rate": 9.940371704928067e-06, "loss": 0.062036458402872086, "memory(GiB)": 19.03, "step": 2979, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.967268 }, { "epoch": 0.09680667901114251, "grad_norm": 1.160772681236267, "learning_rate": 9.940288966907336e-06, "loss": 0.08454637229442596, "memory(GiB)": 19.03, "step": 2980, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.967327 }, { "epoch": 0.09683916447389793, "grad_norm": 1.0459659099578857, "learning_rate": 9.940206171869094e-06, "loss": 0.08896853774785995, "memory(GiB)": 19.03, "step": 2981, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.967393 }, { "epoch": 0.09687164993665334, "grad_norm": 0.6202738285064697, "learning_rate": 9.940123319814297e-06, "loss": 0.07706144452095032, "memory(GiB)": 19.03, "step": 2982, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.967456 }, { "epoch": 0.09690413539940876, "grad_norm": 0.7052833437919617, "learning_rate": 9.9400404107439e-06, "loss": 0.06809236854314804, "memory(GiB)": 19.03, "step": 2983, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.967521 }, { "epoch": 0.09693662086216419, "grad_norm": 2.070434093475342, "learning_rate": 9.939957444658864e-06, "loss": 0.08412039279937744, "memory(GiB)": 19.03, "step": 2984, "token_acc": 0.9661654135338346, "train_speed(iter/s)": 0.967584 }, { "epoch": 0.0969691063249196, "grad_norm": 0.9004663228988647, "learning_rate": 9.939874421560143e-06, "loss": 0.08184386044740677, "memory(GiB)": 19.03, "step": 2985, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.967648 }, { "epoch": 0.09700159178767502, "grad_norm": 0.7961081266403198, "learning_rate": 9.939791341448694e-06, "loss": 0.0769474133849144, "memory(GiB)": 19.03, "step": 2986, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.967709 }, { "epoch": 0.09703407725043044, "grad_norm": 0.8994372487068176, "learning_rate": 9.939708204325483e-06, "loss": 0.09668401628732681, "memory(GiB)": 19.03, "step": 2987, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.96777 }, { "epoch": 0.09706656271318585, "grad_norm": 1.0047852993011475, "learning_rate": 9.93962501019146e-06, "loss": 0.08762884885072708, "memory(GiB)": 19.03, "step": 2988, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.967834 }, { "epoch": 0.09709904817594127, "grad_norm": 0.5631330609321594, "learning_rate": 9.939541759047592e-06, "loss": 0.06858903169631958, "memory(GiB)": 19.03, "step": 2989, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.9679 }, { "epoch": 0.09713153363869668, "grad_norm": 0.6788526773452759, "learning_rate": 9.939458450894837e-06, "loss": 0.06854008883237839, "memory(GiB)": 19.03, "step": 2990, "token_acc": 0.9716981132075472, "train_speed(iter/s)": 0.967966 }, { "epoch": 0.0971640191014521, "grad_norm": 0.7548760771751404, "learning_rate": 9.939375085734158e-06, "loss": 0.08769212663173676, "memory(GiB)": 19.03, "step": 2991, "token_acc": 0.9625, "train_speed(iter/s)": 0.968032 }, { "epoch": 0.09719650456420752, "grad_norm": 0.7597069144248962, "learning_rate": 9.939291663566514e-06, "loss": 0.07764947414398193, "memory(GiB)": 19.03, "step": 2992, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.968088 }, { "epoch": 0.09722899002696293, "grad_norm": 3.69529128074646, "learning_rate": 9.939208184392872e-06, "loss": 0.08067478239536285, "memory(GiB)": 19.03, "step": 2993, "token_acc": 0.9633333333333334, "train_speed(iter/s)": 0.968157 }, { "epoch": 0.09726147548971835, "grad_norm": 0.7418416738510132, "learning_rate": 9.939124648214194e-06, "loss": 0.07458284497261047, "memory(GiB)": 19.03, "step": 2994, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.968224 }, { "epoch": 0.09729396095247377, "grad_norm": 0.7309051156044006, "learning_rate": 9.939041055031442e-06, "loss": 0.09076301753520966, "memory(GiB)": 19.03, "step": 2995, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.968289 }, { "epoch": 0.09732644641522918, "grad_norm": 0.7058188915252686, "learning_rate": 9.938957404845582e-06, "loss": 0.0801725834608078, "memory(GiB)": 19.03, "step": 2996, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.968352 }, { "epoch": 0.0973589318779846, "grad_norm": 1.9168790578842163, "learning_rate": 9.938873697657581e-06, "loss": 0.08864878118038177, "memory(GiB)": 19.03, "step": 2997, "token_acc": 0.9541666666666667, "train_speed(iter/s)": 0.968416 }, { "epoch": 0.09739141734074001, "grad_norm": 1.3039178848266602, "learning_rate": 9.938789933468404e-06, "loss": 0.0689111053943634, "memory(GiB)": 19.03, "step": 2998, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.96847 }, { "epoch": 0.09742390280349543, "grad_norm": 0.4900878667831421, "learning_rate": 9.938706112279018e-06, "loss": 0.07016658782958984, "memory(GiB)": 19.03, "step": 2999, "token_acc": 0.9653679653679653, "train_speed(iter/s)": 0.968523 }, { "epoch": 0.09745638826625086, "grad_norm": 0.6023080348968506, "learning_rate": 9.93862223409039e-06, "loss": 0.07396408915519714, "memory(GiB)": 19.03, "step": 3000, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.968572 }, { "epoch": 0.09745638826625086, "eval_loss": 0.07437926530838013, "eval_runtime": 81.3393, "eval_samples_per_second": 122.327, "eval_steps_per_second": 3.823, "eval_token_acc": 0.9703835652640991, "step": 3000 }, { "epoch": 0.09748887372900628, "grad_norm": 0.5730461478233337, "learning_rate": 9.938538298903486e-06, "loss": 0.06973172724246979, "memory(GiB)": 19.03, "step": 3001, "token_acc": 0.9707687250013698, "train_speed(iter/s)": 0.940882 }, { "epoch": 0.09752135919176169, "grad_norm": 0.7310908436775208, "learning_rate": 9.938454306719279e-06, "loss": 0.0869249552488327, "memory(GiB)": 19.03, "step": 3002, "token_acc": 0.965, "train_speed(iter/s)": 0.940943 }, { "epoch": 0.09755384465451711, "grad_norm": 2.0884745121002197, "learning_rate": 9.938370257538734e-06, "loss": 0.08896368741989136, "memory(GiB)": 19.03, "step": 3003, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.941011 }, { "epoch": 0.09758633011727252, "grad_norm": 0.6196920275688171, "learning_rate": 9.938286151362825e-06, "loss": 0.0721873939037323, "memory(GiB)": 19.03, "step": 3004, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.94108 }, { "epoch": 0.09761881558002794, "grad_norm": 17.963703155517578, "learning_rate": 9.938201988192521e-06, "loss": 0.0839584469795227, "memory(GiB)": 19.03, "step": 3005, "token_acc": 0.952755905511811, "train_speed(iter/s)": 0.941136 }, { "epoch": 0.09765130104278336, "grad_norm": 0.624912679195404, "learning_rate": 9.938117768028794e-06, "loss": 0.0728529840707779, "memory(GiB)": 19.03, "step": 3006, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.941203 }, { "epoch": 0.09768378650553877, "grad_norm": 0.695837140083313, "learning_rate": 9.938033490872614e-06, "loss": 0.06713058054447174, "memory(GiB)": 19.03, "step": 3007, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.941272 }, { "epoch": 0.09771627196829419, "grad_norm": 0.9100970029830933, "learning_rate": 9.937949156724955e-06, "loss": 0.08569774031639099, "memory(GiB)": 19.03, "step": 3008, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.941334 }, { "epoch": 0.0977487574310496, "grad_norm": 0.7303193211555481, "learning_rate": 9.93786476558679e-06, "loss": 0.08807022869586945, "memory(GiB)": 19.03, "step": 3009, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.941392 }, { "epoch": 0.09778124289380502, "grad_norm": 0.8443099856376648, "learning_rate": 9.937780317459096e-06, "loss": 0.07103121280670166, "memory(GiB)": 19.03, "step": 3010, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.941457 }, { "epoch": 0.09781372835656044, "grad_norm": 0.8033761978149414, "learning_rate": 9.937695812342843e-06, "loss": 0.07161830365657806, "memory(GiB)": 19.03, "step": 3011, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.941525 }, { "epoch": 0.09784621381931585, "grad_norm": 0.8399711847305298, "learning_rate": 9.937611250239008e-06, "loss": 0.08138477802276611, "memory(GiB)": 19.03, "step": 3012, "token_acc": 0.9504504504504504, "train_speed(iter/s)": 0.941592 }, { "epoch": 0.09787869928207127, "grad_norm": 0.8299967646598816, "learning_rate": 9.937526631148568e-06, "loss": 0.08147039264440536, "memory(GiB)": 19.03, "step": 3013, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.941658 }, { "epoch": 0.09791118474482668, "grad_norm": 0.7147085070610046, "learning_rate": 9.937441955072497e-06, "loss": 0.06656788289546967, "memory(GiB)": 19.03, "step": 3014, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.941725 }, { "epoch": 0.0979436702075821, "grad_norm": 0.6418490409851074, "learning_rate": 9.937357222011777e-06, "loss": 0.06336890161037445, "memory(GiB)": 19.03, "step": 3015, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.941788 }, { "epoch": 0.09797615567033753, "grad_norm": 0.5323011875152588, "learning_rate": 9.937272431967381e-06, "loss": 0.06847162544727325, "memory(GiB)": 19.03, "step": 3016, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.941846 }, { "epoch": 0.09800864113309295, "grad_norm": 0.6607373952865601, "learning_rate": 9.93718758494029e-06, "loss": 0.06480216234922409, "memory(GiB)": 19.03, "step": 3017, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.941913 }, { "epoch": 0.09804112659584836, "grad_norm": 0.6531466245651245, "learning_rate": 9.937102680931483e-06, "loss": 0.07120025902986526, "memory(GiB)": 19.03, "step": 3018, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.941982 }, { "epoch": 0.09807361205860378, "grad_norm": 0.6706451773643494, "learning_rate": 9.93701771994194e-06, "loss": 0.06768166273832321, "memory(GiB)": 19.03, "step": 3019, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.942048 }, { "epoch": 0.0981060975213592, "grad_norm": 0.7871644496917725, "learning_rate": 9.936932701972642e-06, "loss": 0.07372355461120605, "memory(GiB)": 19.03, "step": 3020, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.942114 }, { "epoch": 0.09813858298411461, "grad_norm": 0.6569106578826904, "learning_rate": 9.936847627024569e-06, "loss": 0.07385784387588501, "memory(GiB)": 19.03, "step": 3021, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.942178 }, { "epoch": 0.09817106844687003, "grad_norm": 1.005409836769104, "learning_rate": 9.936762495098702e-06, "loss": 0.07503630220890045, "memory(GiB)": 19.03, "step": 3022, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.94217 }, { "epoch": 0.09820355390962544, "grad_norm": 1.7563680410385132, "learning_rate": 9.936677306196027e-06, "loss": 0.08822697401046753, "memory(GiB)": 19.03, "step": 3023, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.942236 }, { "epoch": 0.09823603937238086, "grad_norm": 1.0068414211273193, "learning_rate": 9.936592060317525e-06, "loss": 0.08101557195186615, "memory(GiB)": 19.03, "step": 3024, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.942302 }, { "epoch": 0.09826852483513628, "grad_norm": 1.1634732484817505, "learning_rate": 9.936506757464179e-06, "loss": 0.07264405488967896, "memory(GiB)": 19.03, "step": 3025, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.942363 }, { "epoch": 0.09830101029789169, "grad_norm": 0.8203467130661011, "learning_rate": 9.936421397636975e-06, "loss": 0.0862591415643692, "memory(GiB)": 19.03, "step": 3026, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.942412 }, { "epoch": 0.09833349576064711, "grad_norm": 1.7863869667053223, "learning_rate": 9.936335980836898e-06, "loss": 0.0752292275428772, "memory(GiB)": 19.03, "step": 3027, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.942466 }, { "epoch": 0.09836598122340252, "grad_norm": 0.7534563541412354, "learning_rate": 9.936250507064932e-06, "loss": 0.08061221241950989, "memory(GiB)": 19.03, "step": 3028, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.942518 }, { "epoch": 0.09839846668615794, "grad_norm": 0.8517409563064575, "learning_rate": 9.936164976322067e-06, "loss": 0.08432618528604507, "memory(GiB)": 19.03, "step": 3029, "token_acc": 0.9529411764705882, "train_speed(iter/s)": 0.942569 }, { "epoch": 0.09843095214891336, "grad_norm": 1.1714131832122803, "learning_rate": 9.936079388609288e-06, "loss": 0.08424025774002075, "memory(GiB)": 19.03, "step": 3030, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.942621 }, { "epoch": 0.09846343761166877, "grad_norm": 0.8360739946365356, "learning_rate": 9.935993743927582e-06, "loss": 0.0819239467382431, "memory(GiB)": 19.03, "step": 3031, "token_acc": 0.972, "train_speed(iter/s)": 0.94267 }, { "epoch": 0.0984959230744242, "grad_norm": 0.5586559176445007, "learning_rate": 9.93590804227794e-06, "loss": 0.05935732647776604, "memory(GiB)": 19.03, "step": 3032, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.942721 }, { "epoch": 0.09852840853717962, "grad_norm": 0.6851299405097961, "learning_rate": 9.935822283661347e-06, "loss": 0.07301946729421616, "memory(GiB)": 19.03, "step": 3033, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.942779 }, { "epoch": 0.09856089399993503, "grad_norm": 0.729796290397644, "learning_rate": 9.935736468078796e-06, "loss": 0.07240535318851471, "memory(GiB)": 19.03, "step": 3034, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.942835 }, { "epoch": 0.09859337946269045, "grad_norm": 0.8334944248199463, "learning_rate": 9.93565059553128e-06, "loss": 0.0782325267791748, "memory(GiB)": 19.03, "step": 3035, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.942894 }, { "epoch": 0.09862586492544587, "grad_norm": 0.5745286345481873, "learning_rate": 9.935564666019782e-06, "loss": 0.061859145760536194, "memory(GiB)": 19.03, "step": 3036, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.942948 }, { "epoch": 0.09865835038820128, "grad_norm": 0.9535024166107178, "learning_rate": 9.9354786795453e-06, "loss": 0.07969269901514053, "memory(GiB)": 19.03, "step": 3037, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.942992 }, { "epoch": 0.0986908358509567, "grad_norm": 0.6063013076782227, "learning_rate": 9.935392636108827e-06, "loss": 0.0721963495016098, "memory(GiB)": 19.03, "step": 3038, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.943036 }, { "epoch": 0.09872332131371211, "grad_norm": 0.62352055311203, "learning_rate": 9.935306535711352e-06, "loss": 0.08587959408760071, "memory(GiB)": 19.03, "step": 3039, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.943069 }, { "epoch": 0.09875580677646753, "grad_norm": 1.2772060632705688, "learning_rate": 9.935220378353872e-06, "loss": 0.07886286079883575, "memory(GiB)": 19.03, "step": 3040, "token_acc": 0.9579439252336449, "train_speed(iter/s)": 0.943107 }, { "epoch": 0.09878829223922295, "grad_norm": 0.7558570504188538, "learning_rate": 9.935134164037379e-06, "loss": 0.08465718477964401, "memory(GiB)": 19.03, "step": 3041, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.943154 }, { "epoch": 0.09882077770197836, "grad_norm": 0.7034116387367249, "learning_rate": 9.93504789276287e-06, "loss": 0.07277683913707733, "memory(GiB)": 19.03, "step": 3042, "token_acc": 0.9786476868327402, "train_speed(iter/s)": 0.943199 }, { "epoch": 0.09885326316473378, "grad_norm": 0.7360888719558716, "learning_rate": 9.934961564531337e-06, "loss": 0.07598140835762024, "memory(GiB)": 19.03, "step": 3043, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.943241 }, { "epoch": 0.0988857486274892, "grad_norm": 0.7165845632553101, "learning_rate": 9.934875179343782e-06, "loss": 0.06897839903831482, "memory(GiB)": 19.03, "step": 3044, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.943278 }, { "epoch": 0.09891823409024461, "grad_norm": 0.6059913635253906, "learning_rate": 9.9347887372012e-06, "loss": 0.07156245410442352, "memory(GiB)": 19.03, "step": 3045, "token_acc": 0.9725274725274725, "train_speed(iter/s)": 0.943318 }, { "epoch": 0.09895071955300003, "grad_norm": 0.7349590063095093, "learning_rate": 9.934702238104585e-06, "loss": 0.07690738141536713, "memory(GiB)": 19.03, "step": 3046, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.94336 }, { "epoch": 0.09898320501575544, "grad_norm": 0.7477587461471558, "learning_rate": 9.93461568205494e-06, "loss": 0.066651351749897, "memory(GiB)": 19.03, "step": 3047, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.943406 }, { "epoch": 0.09901569047851087, "grad_norm": 0.6351206302642822, "learning_rate": 9.93452906905326e-06, "loss": 0.06580336391925812, "memory(GiB)": 19.03, "step": 3048, "token_acc": 0.9593023255813954, "train_speed(iter/s)": 0.943455 }, { "epoch": 0.09904817594126629, "grad_norm": 0.6721289157867432, "learning_rate": 9.934442399100548e-06, "loss": 0.07283315807580948, "memory(GiB)": 19.03, "step": 3049, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.9435 }, { "epoch": 0.0990806614040217, "grad_norm": 1.8495436906814575, "learning_rate": 9.934355672197804e-06, "loss": 0.07755924761295319, "memory(GiB)": 20.68, "step": 3050, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.94355 }, { "epoch": 0.09911314686677712, "grad_norm": 0.9157683253288269, "learning_rate": 9.934268888346027e-06, "loss": 0.0851370245218277, "memory(GiB)": 20.68, "step": 3051, "token_acc": 0.954337899543379, "train_speed(iter/s)": 0.943596 }, { "epoch": 0.09914563232953254, "grad_norm": 0.8986899256706238, "learning_rate": 9.934182047546219e-06, "loss": 0.0725414976477623, "memory(GiB)": 20.68, "step": 3052, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.943646 }, { "epoch": 0.09917811779228795, "grad_norm": 0.5240496397018433, "learning_rate": 9.934095149799384e-06, "loss": 0.07412838190793991, "memory(GiB)": 20.68, "step": 3053, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.943694 }, { "epoch": 0.09921060325504337, "grad_norm": 1.1517434120178223, "learning_rate": 9.934008195106523e-06, "loss": 0.07674240320920944, "memory(GiB)": 20.68, "step": 3054, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.943752 }, { "epoch": 0.09924308871779879, "grad_norm": 0.48508909344673157, "learning_rate": 9.93392118346864e-06, "loss": 0.07186367362737656, "memory(GiB)": 20.68, "step": 3055, "token_acc": 0.9695817490494296, "train_speed(iter/s)": 0.943817 }, { "epoch": 0.0992755741805542, "grad_norm": 0.6635390520095825, "learning_rate": 9.93383411488674e-06, "loss": 0.0793994590640068, "memory(GiB)": 20.68, "step": 3056, "token_acc": 0.9719298245614035, "train_speed(iter/s)": 0.943883 }, { "epoch": 0.09930805964330962, "grad_norm": 0.5799040794372559, "learning_rate": 9.933746989361828e-06, "loss": 0.07497306913137436, "memory(GiB)": 20.68, "step": 3057, "token_acc": 0.968, "train_speed(iter/s)": 0.943948 }, { "epoch": 0.09934054510606503, "grad_norm": 0.7102189064025879, "learning_rate": 9.933659806894908e-06, "loss": 0.07197335362434387, "memory(GiB)": 20.68, "step": 3058, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.944014 }, { "epoch": 0.09937303056882045, "grad_norm": 0.846279501914978, "learning_rate": 9.933572567486987e-06, "loss": 0.06955894827842712, "memory(GiB)": 20.68, "step": 3059, "token_acc": 0.9486166007905138, "train_speed(iter/s)": 0.944077 }, { "epoch": 0.09940551603157587, "grad_norm": 0.8207210898399353, "learning_rate": 9.933485271139071e-06, "loss": 0.07596295326948166, "memory(GiB)": 20.68, "step": 3060, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.944141 }, { "epoch": 0.09943800149433128, "grad_norm": 1.0093876123428345, "learning_rate": 9.933397917852171e-06, "loss": 0.08034417033195496, "memory(GiB)": 20.68, "step": 3061, "token_acc": 0.95703125, "train_speed(iter/s)": 0.944204 }, { "epoch": 0.0994704869570867, "grad_norm": 0.5552330613136292, "learning_rate": 9.933310507627292e-06, "loss": 0.06831679493188858, "memory(GiB)": 20.68, "step": 3062, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.944268 }, { "epoch": 0.09950297241984211, "grad_norm": 0.6676405668258667, "learning_rate": 9.933223040465444e-06, "loss": 0.07212768495082855, "memory(GiB)": 20.68, "step": 3063, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.944332 }, { "epoch": 0.09953545788259754, "grad_norm": 0.6184694766998291, "learning_rate": 9.933135516367633e-06, "loss": 0.056311748921871185, "memory(GiB)": 20.68, "step": 3064, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.944389 }, { "epoch": 0.09956794334535296, "grad_norm": 1.2290345430374146, "learning_rate": 9.933047935334875e-06, "loss": 0.08061401546001434, "memory(GiB)": 20.68, "step": 3065, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.944454 }, { "epoch": 0.09960042880810838, "grad_norm": 0.9856883883476257, "learning_rate": 9.932960297368177e-06, "loss": 0.0771806538105011, "memory(GiB)": 20.68, "step": 3066, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.944518 }, { "epoch": 0.09963291427086379, "grad_norm": 3.054460048675537, "learning_rate": 9.932872602468551e-06, "loss": 0.07944636791944504, "memory(GiB)": 20.68, "step": 3067, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.94458 }, { "epoch": 0.09966539973361921, "grad_norm": 0.7244783043861389, "learning_rate": 9.932784850637012e-06, "loss": 0.07309587299823761, "memory(GiB)": 20.68, "step": 3068, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.944646 }, { "epoch": 0.09969788519637462, "grad_norm": 0.7768533229827881, "learning_rate": 9.932697041874567e-06, "loss": 0.06913876533508301, "memory(GiB)": 20.68, "step": 3069, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.944713 }, { "epoch": 0.09973037065913004, "grad_norm": 0.6946508288383484, "learning_rate": 9.932609176182235e-06, "loss": 0.06444967538118362, "memory(GiB)": 20.68, "step": 3070, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.944776 }, { "epoch": 0.09976285612188546, "grad_norm": 1.1888840198516846, "learning_rate": 9.932521253561026e-06, "loss": 0.07798488438129425, "memory(GiB)": 20.68, "step": 3071, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.944839 }, { "epoch": 0.09979534158464087, "grad_norm": 0.7662537693977356, "learning_rate": 9.932433274011959e-06, "loss": 0.0767226591706276, "memory(GiB)": 20.68, "step": 3072, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.944901 }, { "epoch": 0.09982782704739629, "grad_norm": 0.8879541158676147, "learning_rate": 9.932345237536046e-06, "loss": 0.07634178549051285, "memory(GiB)": 20.68, "step": 3073, "token_acc": 0.958041958041958, "train_speed(iter/s)": 0.944968 }, { "epoch": 0.0998603125101517, "grad_norm": 0.6766273379325867, "learning_rate": 9.932257144134301e-06, "loss": 0.07234887778759003, "memory(GiB)": 20.68, "step": 3074, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.945034 }, { "epoch": 0.09989279797290712, "grad_norm": 0.673820436000824, "learning_rate": 9.932168993807747e-06, "loss": 0.06891416013240814, "memory(GiB)": 20.68, "step": 3075, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.945099 }, { "epoch": 0.09992528343566254, "grad_norm": 0.5558773875236511, "learning_rate": 9.932080786557398e-06, "loss": 0.06903943419456482, "memory(GiB)": 20.68, "step": 3076, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.945162 }, { "epoch": 0.09995776889841795, "grad_norm": 1.4102956056594849, "learning_rate": 9.93199252238427e-06, "loss": 0.08314499258995056, "memory(GiB)": 20.68, "step": 3077, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.945223 }, { "epoch": 0.09999025436117337, "grad_norm": 0.5891425609588623, "learning_rate": 9.931904201289385e-06, "loss": 0.06402162462472916, "memory(GiB)": 20.68, "step": 3078, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.945289 }, { "epoch": 0.10002273982392879, "grad_norm": 0.5219710469245911, "learning_rate": 9.931815823273763e-06, "loss": 0.06295676529407501, "memory(GiB)": 20.68, "step": 3079, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.945356 }, { "epoch": 0.10005522528668422, "grad_norm": 0.5875341892242432, "learning_rate": 9.93172738833842e-06, "loss": 0.06944975256919861, "memory(GiB)": 20.68, "step": 3080, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.945421 }, { "epoch": 0.10008771074943963, "grad_norm": 0.8555480241775513, "learning_rate": 9.93163889648438e-06, "loss": 0.08036084473133087, "memory(GiB)": 20.68, "step": 3081, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.945484 }, { "epoch": 0.10012019621219505, "grad_norm": 0.8259435892105103, "learning_rate": 9.931550347712663e-06, "loss": 0.07780319452285767, "memory(GiB)": 20.68, "step": 3082, "token_acc": 0.9506726457399103, "train_speed(iter/s)": 0.945549 }, { "epoch": 0.10015268167495046, "grad_norm": 0.7514585256576538, "learning_rate": 9.93146174202429e-06, "loss": 0.06423718482255936, "memory(GiB)": 20.68, "step": 3083, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.945614 }, { "epoch": 0.10018516713770588, "grad_norm": 2.1818273067474365, "learning_rate": 9.931373079420285e-06, "loss": 0.06867465376853943, "memory(GiB)": 20.68, "step": 3084, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.945672 }, { "epoch": 0.1002176526004613, "grad_norm": 1.248096227645874, "learning_rate": 9.931284359901672e-06, "loss": 0.06727901846170425, "memory(GiB)": 20.68, "step": 3085, "token_acc": 0.9710144927536232, "train_speed(iter/s)": 0.945736 }, { "epoch": 0.10025013806321671, "grad_norm": 1.3361258506774902, "learning_rate": 9.931195583469473e-06, "loss": 0.08165009319782257, "memory(GiB)": 20.68, "step": 3086, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.945797 }, { "epoch": 0.10028262352597213, "grad_norm": 0.9457708597183228, "learning_rate": 9.931106750124714e-06, "loss": 0.07991062104701996, "memory(GiB)": 20.68, "step": 3087, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.945851 }, { "epoch": 0.10031510898872754, "grad_norm": 0.8757532238960266, "learning_rate": 9.93101785986842e-06, "loss": 0.06773561239242554, "memory(GiB)": 20.68, "step": 3088, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.945905 }, { "epoch": 0.10034759445148296, "grad_norm": 0.9130939841270447, "learning_rate": 9.930928912701616e-06, "loss": 0.07730744779109955, "memory(GiB)": 20.68, "step": 3089, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.945957 }, { "epoch": 0.10038007991423838, "grad_norm": 1.12534761428833, "learning_rate": 9.93083990862533e-06, "loss": 0.08531362563371658, "memory(GiB)": 20.68, "step": 3090, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.946008 }, { "epoch": 0.10041256537699379, "grad_norm": 1.2290382385253906, "learning_rate": 9.930750847640588e-06, "loss": 0.07708372175693512, "memory(GiB)": 20.68, "step": 3091, "token_acc": 0.9528795811518325, "train_speed(iter/s)": 0.946059 }, { "epoch": 0.10044505083974921, "grad_norm": 0.5120660662651062, "learning_rate": 9.93066172974842e-06, "loss": 0.06099604442715645, "memory(GiB)": 20.68, "step": 3092, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.946112 }, { "epoch": 0.10047753630250462, "grad_norm": 0.7932248711585999, "learning_rate": 9.93057255494985e-06, "loss": 0.07182978093624115, "memory(GiB)": 20.68, "step": 3093, "token_acc": 0.9625, "train_speed(iter/s)": 0.946164 }, { "epoch": 0.10051002176526004, "grad_norm": 1.0146316289901733, "learning_rate": 9.930483323245914e-06, "loss": 0.07840126007795334, "memory(GiB)": 20.68, "step": 3094, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.946207 }, { "epoch": 0.10054250722801546, "grad_norm": 0.7069908380508423, "learning_rate": 9.930394034637636e-06, "loss": 0.0828220471739769, "memory(GiB)": 20.68, "step": 3095, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.946246 }, { "epoch": 0.10057499269077089, "grad_norm": 0.6999727487564087, "learning_rate": 9.930304689126046e-06, "loss": 0.07744438201189041, "memory(GiB)": 20.68, "step": 3096, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.946289 }, { "epoch": 0.1006074781535263, "grad_norm": 0.61991286277771, "learning_rate": 9.930215286712182e-06, "loss": 0.0727359801530838, "memory(GiB)": 20.68, "step": 3097, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.946336 }, { "epoch": 0.10063996361628172, "grad_norm": 0.845770537853241, "learning_rate": 9.930125827397069e-06, "loss": 0.08370011299848557, "memory(GiB)": 20.68, "step": 3098, "token_acc": 0.9478672985781991, "train_speed(iter/s)": 0.946379 }, { "epoch": 0.10067244907903714, "grad_norm": 0.7408902645111084, "learning_rate": 9.930036311181743e-06, "loss": 0.08463327586650848, "memory(GiB)": 20.68, "step": 3099, "token_acc": 0.9589552238805971, "train_speed(iter/s)": 0.946423 }, { "epoch": 0.10070493454179255, "grad_norm": 0.945551872253418, "learning_rate": 9.929946738067236e-06, "loss": 0.08487477898597717, "memory(GiB)": 20.68, "step": 3100, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.946465 }, { "epoch": 0.10073742000454797, "grad_norm": 4.251328945159912, "learning_rate": 9.92985710805458e-06, "loss": 0.07962723076343536, "memory(GiB)": 20.68, "step": 3101, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.946503 }, { "epoch": 0.10076990546730338, "grad_norm": 0.8027471303939819, "learning_rate": 9.929767421144814e-06, "loss": 0.06979215145111084, "memory(GiB)": 20.68, "step": 3102, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.946546 }, { "epoch": 0.1008023909300588, "grad_norm": 0.7930017709732056, "learning_rate": 9.92967767733897e-06, "loss": 0.0784413069486618, "memory(GiB)": 20.68, "step": 3103, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.946589 }, { "epoch": 0.10083487639281422, "grad_norm": 0.7759977579116821, "learning_rate": 9.929587876638082e-06, "loss": 0.06645148992538452, "memory(GiB)": 20.68, "step": 3104, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.94663 }, { "epoch": 0.10086736185556963, "grad_norm": 0.4954149127006531, "learning_rate": 9.929498019043191e-06, "loss": 0.06254847347736359, "memory(GiB)": 20.68, "step": 3105, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.946673 }, { "epoch": 0.10089984731832505, "grad_norm": 0.5204784274101257, "learning_rate": 9.92940810455533e-06, "loss": 0.06295117735862732, "memory(GiB)": 20.68, "step": 3106, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.946714 }, { "epoch": 0.10093233278108046, "grad_norm": 0.7027305960655212, "learning_rate": 9.929318133175539e-06, "loss": 0.07382561266422272, "memory(GiB)": 20.68, "step": 3107, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.946749 }, { "epoch": 0.10096481824383588, "grad_norm": 0.6153311729431152, "learning_rate": 9.929228104904857e-06, "loss": 0.07093043625354767, "memory(GiB)": 20.68, "step": 3108, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.946798 }, { "epoch": 0.1009973037065913, "grad_norm": 0.7093693614006042, "learning_rate": 9.92913801974432e-06, "loss": 0.0736963301897049, "memory(GiB)": 20.68, "step": 3109, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.946845 }, { "epoch": 0.10102978916934671, "grad_norm": 0.8152119517326355, "learning_rate": 9.92904787769497e-06, "loss": 0.0774102509021759, "memory(GiB)": 20.68, "step": 3110, "token_acc": 0.984, "train_speed(iter/s)": 0.946889 }, { "epoch": 0.10106227463210213, "grad_norm": 0.8415824174880981, "learning_rate": 9.92895767875785e-06, "loss": 0.08053997159004211, "memory(GiB)": 20.68, "step": 3111, "token_acc": 0.9669117647058824, "train_speed(iter/s)": 0.946949 }, { "epoch": 0.10109476009485756, "grad_norm": 0.8151805400848389, "learning_rate": 9.928867422933995e-06, "loss": 0.06843218207359314, "memory(GiB)": 20.68, "step": 3112, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.947011 }, { "epoch": 0.10112724555761297, "grad_norm": 0.6353883147239685, "learning_rate": 9.92877711022445e-06, "loss": 0.07282982766628265, "memory(GiB)": 20.68, "step": 3113, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.947073 }, { "epoch": 0.10115973102036839, "grad_norm": 0.943492591381073, "learning_rate": 9.928686740630256e-06, "loss": 0.07167229801416397, "memory(GiB)": 20.68, "step": 3114, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.947138 }, { "epoch": 0.1011922164831238, "grad_norm": 0.9323459267616272, "learning_rate": 9.92859631415246e-06, "loss": 0.06278639286756516, "memory(GiB)": 20.68, "step": 3115, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.947199 }, { "epoch": 0.10122470194587922, "grad_norm": 0.7886053919792175, "learning_rate": 9.928505830792099e-06, "loss": 0.06900937855243683, "memory(GiB)": 20.68, "step": 3116, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.94726 }, { "epoch": 0.10125718740863464, "grad_norm": 0.7508367300033569, "learning_rate": 9.928415290550221e-06, "loss": 0.07218459993600845, "memory(GiB)": 20.68, "step": 3117, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.94732 }, { "epoch": 0.10128967287139005, "grad_norm": 0.8767149448394775, "learning_rate": 9.928324693427873e-06, "loss": 0.07231895625591278, "memory(GiB)": 20.68, "step": 3118, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.947383 }, { "epoch": 0.10132215833414547, "grad_norm": 0.8096905946731567, "learning_rate": 9.9282340394261e-06, "loss": 0.07848409563302994, "memory(GiB)": 20.68, "step": 3119, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.947444 }, { "epoch": 0.10135464379690089, "grad_norm": 0.7325544357299805, "learning_rate": 9.928143328545944e-06, "loss": 0.07714875042438507, "memory(GiB)": 20.68, "step": 3120, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.947506 }, { "epoch": 0.1013871292596563, "grad_norm": 0.9976064562797546, "learning_rate": 9.928052560788455e-06, "loss": 0.06552133709192276, "memory(GiB)": 20.68, "step": 3121, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.947571 }, { "epoch": 0.10141961472241172, "grad_norm": 0.5679382681846619, "learning_rate": 9.92796173615468e-06, "loss": 0.06909304857254028, "memory(GiB)": 20.68, "step": 3122, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.94763 }, { "epoch": 0.10145210018516714, "grad_norm": 0.7237767577171326, "learning_rate": 9.927870854645668e-06, "loss": 0.06250953674316406, "memory(GiB)": 20.68, "step": 3123, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.947693 }, { "epoch": 0.10148458564792255, "grad_norm": 0.8481102585792542, "learning_rate": 9.927779916262467e-06, "loss": 0.08067098259925842, "memory(GiB)": 20.68, "step": 3124, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.947749 }, { "epoch": 0.10151707111067797, "grad_norm": 0.8205970525741577, "learning_rate": 9.927688921006127e-06, "loss": 0.06724463403224945, "memory(GiB)": 20.68, "step": 3125, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.947812 }, { "epoch": 0.10154955657343338, "grad_norm": 1.677468180656433, "learning_rate": 9.927597868877698e-06, "loss": 0.0772709771990776, "memory(GiB)": 20.68, "step": 3126, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.947875 }, { "epoch": 0.1015820420361888, "grad_norm": 1.5518074035644531, "learning_rate": 9.927506759878232e-06, "loss": 0.07925370335578918, "memory(GiB)": 20.68, "step": 3127, "token_acc": 0.9606299212598425, "train_speed(iter/s)": 0.947937 }, { "epoch": 0.10161452749894423, "grad_norm": 0.6928038001060486, "learning_rate": 9.927415594008778e-06, "loss": 0.07087377458810806, "memory(GiB)": 20.68, "step": 3128, "token_acc": 0.9653679653679653, "train_speed(iter/s)": 0.947999 }, { "epoch": 0.10164701296169965, "grad_norm": 0.733669102191925, "learning_rate": 9.92732437127039e-06, "loss": 0.06843449920415878, "memory(GiB)": 20.68, "step": 3129, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.94806 }, { "epoch": 0.10167949842445506, "grad_norm": 0.5783708691596985, "learning_rate": 9.92723309166412e-06, "loss": 0.06356583535671234, "memory(GiB)": 20.68, "step": 3130, "token_acc": 0.988, "train_speed(iter/s)": 0.948123 }, { "epoch": 0.10171198388721048, "grad_norm": 0.9073710441589355, "learning_rate": 9.927141755191024e-06, "loss": 0.07353630661964417, "memory(GiB)": 20.68, "step": 3131, "token_acc": 0.9717314487632509, "train_speed(iter/s)": 0.948184 }, { "epoch": 0.1017444693499659, "grad_norm": 3.987478494644165, "learning_rate": 9.927050361852153e-06, "loss": 0.08622057735919952, "memory(GiB)": 20.68, "step": 3132, "token_acc": 0.9589552238805971, "train_speed(iter/s)": 0.948246 }, { "epoch": 0.10177695481272131, "grad_norm": 0.9374231696128845, "learning_rate": 9.926958911648563e-06, "loss": 0.0866837352514267, "memory(GiB)": 20.68, "step": 3133, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.948308 }, { "epoch": 0.10180944027547673, "grad_norm": 0.7751898169517517, "learning_rate": 9.926867404581309e-06, "loss": 0.0732598528265953, "memory(GiB)": 20.68, "step": 3134, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.948369 }, { "epoch": 0.10184192573823214, "grad_norm": 0.6517844200134277, "learning_rate": 9.926775840651448e-06, "loss": 0.06800785660743713, "memory(GiB)": 20.68, "step": 3135, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.948433 }, { "epoch": 0.10187441120098756, "grad_norm": 1.0608069896697998, "learning_rate": 9.926684219860036e-06, "loss": 0.08767350018024445, "memory(GiB)": 20.68, "step": 3136, "token_acc": 0.9588014981273408, "train_speed(iter/s)": 0.948497 }, { "epoch": 0.10190689666374297, "grad_norm": 0.7655553817749023, "learning_rate": 9.926592542208131e-06, "loss": 0.07578517496585846, "memory(GiB)": 20.68, "step": 3137, "token_acc": 0.9584905660377359, "train_speed(iter/s)": 0.948561 }, { "epoch": 0.10193938212649839, "grad_norm": 0.5422791838645935, "learning_rate": 9.92650080769679e-06, "loss": 0.07663168013095856, "memory(GiB)": 20.68, "step": 3138, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.948622 }, { "epoch": 0.1019718675892538, "grad_norm": 0.6466179490089417, "learning_rate": 9.926409016327075e-06, "loss": 0.07260797917842865, "memory(GiB)": 20.68, "step": 3139, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.948684 }, { "epoch": 0.10200435305200922, "grad_norm": 0.7680858969688416, "learning_rate": 9.92631716810004e-06, "loss": 0.07196136564016342, "memory(GiB)": 20.68, "step": 3140, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.948746 }, { "epoch": 0.10203683851476464, "grad_norm": 0.8227232694625854, "learning_rate": 9.926225263016749e-06, "loss": 0.0855460911989212, "memory(GiB)": 20.68, "step": 3141, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.948808 }, { "epoch": 0.10206932397752005, "grad_norm": 0.5827568173408508, "learning_rate": 9.926133301078261e-06, "loss": 0.07267946004867554, "memory(GiB)": 20.68, "step": 3142, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.948867 }, { "epoch": 0.10210180944027547, "grad_norm": 1.5323072671890259, "learning_rate": 9.926041282285639e-06, "loss": 0.09080443531274796, "memory(GiB)": 20.68, "step": 3143, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.94893 }, { "epoch": 0.1021342949030309, "grad_norm": 1.14429771900177, "learning_rate": 9.925949206639944e-06, "loss": 0.08512899279594421, "memory(GiB)": 20.68, "step": 3144, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.948994 }, { "epoch": 0.10216678036578632, "grad_norm": 0.5303366780281067, "learning_rate": 9.925857074142237e-06, "loss": 0.059522829949855804, "memory(GiB)": 20.68, "step": 3145, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.949058 }, { "epoch": 0.10219926582854173, "grad_norm": 3.932936191558838, "learning_rate": 9.925764884793585e-06, "loss": 0.06318632513284683, "memory(GiB)": 20.68, "step": 3146, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.949111 }, { "epoch": 0.10223175129129715, "grad_norm": 0.6245332956314087, "learning_rate": 9.92567263859505e-06, "loss": 0.06599199771881104, "memory(GiB)": 20.68, "step": 3147, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.94916 }, { "epoch": 0.10226423675405256, "grad_norm": 0.903313398361206, "learning_rate": 9.925580335547696e-06, "loss": 0.0809607058763504, "memory(GiB)": 20.68, "step": 3148, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.949209 }, { "epoch": 0.10229672221680798, "grad_norm": 0.6330001354217529, "learning_rate": 9.92548797565259e-06, "loss": 0.0751815065741539, "memory(GiB)": 20.68, "step": 3149, "token_acc": 0.9707792207792207, "train_speed(iter/s)": 0.949264 }, { "epoch": 0.1023292076795634, "grad_norm": 1.0688323974609375, "learning_rate": 9.925395558910795e-06, "loss": 0.07125210762023926, "memory(GiB)": 20.68, "step": 3150, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.949317 }, { "epoch": 0.10236169314231881, "grad_norm": 0.6011118292808533, "learning_rate": 9.925303085323382e-06, "loss": 0.06576958298683167, "memory(GiB)": 20.68, "step": 3151, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.949363 }, { "epoch": 0.10239417860507423, "grad_norm": 0.610819399356842, "learning_rate": 9.925210554891414e-06, "loss": 0.06956830620765686, "memory(GiB)": 20.68, "step": 3152, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.949404 }, { "epoch": 0.10242666406782965, "grad_norm": 0.5488271117210388, "learning_rate": 9.925117967615962e-06, "loss": 0.06741597503423691, "memory(GiB)": 20.68, "step": 3153, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.949446 }, { "epoch": 0.10245914953058506, "grad_norm": 0.5258380174636841, "learning_rate": 9.925025323498092e-06, "loss": 0.06410062313079834, "memory(GiB)": 20.68, "step": 3154, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.949483 }, { "epoch": 0.10249163499334048, "grad_norm": 0.6665315628051758, "learning_rate": 9.924932622538877e-06, "loss": 0.07191772758960724, "memory(GiB)": 20.68, "step": 3155, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.949525 }, { "epoch": 0.1025241204560959, "grad_norm": 0.8936150670051575, "learning_rate": 9.924839864739382e-06, "loss": 0.08122453093528748, "memory(GiB)": 20.68, "step": 3156, "token_acc": 0.9551020408163265, "train_speed(iter/s)": 0.949571 }, { "epoch": 0.10255660591885131, "grad_norm": 1.766105055809021, "learning_rate": 9.924747050100681e-06, "loss": 0.08026763051748276, "memory(GiB)": 20.68, "step": 3157, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.949612 }, { "epoch": 0.10258909138160673, "grad_norm": 0.9220066070556641, "learning_rate": 9.924654178623844e-06, "loss": 0.08135482668876648, "memory(GiB)": 20.68, "step": 3158, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.949654 }, { "epoch": 0.10262157684436214, "grad_norm": 1.0399657487869263, "learning_rate": 9.924561250309943e-06, "loss": 0.07414384931325912, "memory(GiB)": 20.68, "step": 3159, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.949692 }, { "epoch": 0.10265406230711757, "grad_norm": 0.6301799416542053, "learning_rate": 9.92446826516005e-06, "loss": 0.06710133701562881, "memory(GiB)": 20.68, "step": 3160, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.949732 }, { "epoch": 0.10268654776987299, "grad_norm": 0.9102529287338257, "learning_rate": 9.92437522317524e-06, "loss": 0.0894947499036789, "memory(GiB)": 20.68, "step": 3161, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.949774 }, { "epoch": 0.1027190332326284, "grad_norm": 0.6813625693321228, "learning_rate": 9.924282124356585e-06, "loss": 0.07443945109844208, "memory(GiB)": 20.68, "step": 3162, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.949817 }, { "epoch": 0.10275151869538382, "grad_norm": 0.924472987651825, "learning_rate": 9.924188968705159e-06, "loss": 0.0779007077217102, "memory(GiB)": 20.68, "step": 3163, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.94985 }, { "epoch": 0.10278400415813924, "grad_norm": 0.5489302277565002, "learning_rate": 9.92409575622204e-06, "loss": 0.06040709465742111, "memory(GiB)": 20.68, "step": 3164, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.949878 }, { "epoch": 0.10281648962089465, "grad_norm": 0.5721663236618042, "learning_rate": 9.924002486908301e-06, "loss": 0.06857690960168839, "memory(GiB)": 20.68, "step": 3165, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.949923 }, { "epoch": 0.10284897508365007, "grad_norm": 0.5136591196060181, "learning_rate": 9.92390916076502e-06, "loss": 0.07005759328603745, "memory(GiB)": 20.68, "step": 3166, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.949967 }, { "epoch": 0.10288146054640548, "grad_norm": 0.6414309144020081, "learning_rate": 9.923815777793273e-06, "loss": 0.06709519773721695, "memory(GiB)": 20.68, "step": 3167, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 0.950016 }, { "epoch": 0.1029139460091609, "grad_norm": 1.2503294944763184, "learning_rate": 9.92372233799414e-06, "loss": 0.07394835352897644, "memory(GiB)": 20.68, "step": 3168, "token_acc": 0.9536679536679536, "train_speed(iter/s)": 0.950076 }, { "epoch": 0.10294643147191632, "grad_norm": 0.6955702304840088, "learning_rate": 9.923628841368695e-06, "loss": 0.08246999979019165, "memory(GiB)": 20.68, "step": 3169, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.95012 }, { "epoch": 0.10297891693467173, "grad_norm": 0.3917957544326782, "learning_rate": 9.923535287918022e-06, "loss": 0.061671316623687744, "memory(GiB)": 20.68, "step": 3170, "token_acc": 0.9732441471571907, "train_speed(iter/s)": 0.950168 }, { "epoch": 0.10301140239742715, "grad_norm": 0.6144543290138245, "learning_rate": 9.923441677643198e-06, "loss": 0.06479145586490631, "memory(GiB)": 20.68, "step": 3171, "token_acc": 0.9651162790697675, "train_speed(iter/s)": 0.950217 }, { "epoch": 0.10304388786018256, "grad_norm": 1.0255571603775024, "learning_rate": 9.923348010545304e-06, "loss": 0.08954846858978271, "memory(GiB)": 20.68, "step": 3172, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.950262 }, { "epoch": 0.10307637332293798, "grad_norm": 1.3788552284240723, "learning_rate": 9.923254286625422e-06, "loss": 0.07600128650665283, "memory(GiB)": 20.68, "step": 3173, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.950307 }, { "epoch": 0.1031088587856934, "grad_norm": 0.5700105428695679, "learning_rate": 9.92316050588463e-06, "loss": 0.0680149719119072, "memory(GiB)": 20.68, "step": 3174, "token_acc": 0.9716981132075472, "train_speed(iter/s)": 0.950355 }, { "epoch": 0.10314134424844881, "grad_norm": 0.7824639081954956, "learning_rate": 9.923066668324015e-06, "loss": 0.08612668514251709, "memory(GiB)": 20.68, "step": 3175, "token_acc": 0.9455252918287937, "train_speed(iter/s)": 0.950403 }, { "epoch": 0.10317382971120424, "grad_norm": 0.7541754245758057, "learning_rate": 9.922972773944658e-06, "loss": 0.08024860918521881, "memory(GiB)": 20.68, "step": 3176, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.950446 }, { "epoch": 0.10320631517395966, "grad_norm": 0.5594886541366577, "learning_rate": 9.922878822747641e-06, "loss": 0.07417634129524231, "memory(GiB)": 20.68, "step": 3177, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.95049 }, { "epoch": 0.10323880063671508, "grad_norm": 0.720460832118988, "learning_rate": 9.922784814734052e-06, "loss": 0.07264809310436249, "memory(GiB)": 20.68, "step": 3178, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.950536 }, { "epoch": 0.10327128609947049, "grad_norm": 0.4343318045139313, "learning_rate": 9.922690749904973e-06, "loss": 0.052025943994522095, "memory(GiB)": 20.68, "step": 3179, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.950581 }, { "epoch": 0.10330377156222591, "grad_norm": 0.859940767288208, "learning_rate": 9.922596628261492e-06, "loss": 0.0759953111410141, "memory(GiB)": 20.68, "step": 3180, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.950622 }, { "epoch": 0.10333625702498132, "grad_norm": 0.5914706587791443, "learning_rate": 9.922502449804692e-06, "loss": 0.0703853964805603, "memory(GiB)": 20.68, "step": 3181, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.950657 }, { "epoch": 0.10336874248773674, "grad_norm": 0.8934201598167419, "learning_rate": 9.922408214535663e-06, "loss": 0.08817695081233978, "memory(GiB)": 20.68, "step": 3182, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.950705 }, { "epoch": 0.10340122795049216, "grad_norm": 1.0011976957321167, "learning_rate": 9.922313922455494e-06, "loss": 0.06803512573242188, "memory(GiB)": 20.68, "step": 3183, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.950754 }, { "epoch": 0.10343371341324757, "grad_norm": 1.6377942562103271, "learning_rate": 9.922219573565267e-06, "loss": 0.0733824372291565, "memory(GiB)": 20.68, "step": 3184, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.950802 }, { "epoch": 0.10346619887600299, "grad_norm": 1.0960856676101685, "learning_rate": 9.922125167866077e-06, "loss": 0.07362417876720428, "memory(GiB)": 20.68, "step": 3185, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.950846 }, { "epoch": 0.1034986843387584, "grad_norm": 1.4040015935897827, "learning_rate": 9.922030705359009e-06, "loss": 0.07875316590070724, "memory(GiB)": 20.68, "step": 3186, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.950903 }, { "epoch": 0.10353116980151382, "grad_norm": 1.3303422927856445, "learning_rate": 9.921936186045155e-06, "loss": 0.07470355927944183, "memory(GiB)": 20.68, "step": 3187, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.950962 }, { "epoch": 0.10356365526426924, "grad_norm": 0.4962524473667145, "learning_rate": 9.921841609925609e-06, "loss": 0.07059714198112488, "memory(GiB)": 20.68, "step": 3188, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951022 }, { "epoch": 0.10359614072702465, "grad_norm": 0.7267467379570007, "learning_rate": 9.921746977001458e-06, "loss": 0.07291547954082489, "memory(GiB)": 20.68, "step": 3189, "token_acc": 0.9605734767025089, "train_speed(iter/s)": 0.951081 }, { "epoch": 0.10362862618978007, "grad_norm": 0.5997881293296814, "learning_rate": 9.921652287273798e-06, "loss": 0.07193388044834137, "memory(GiB)": 20.68, "step": 3190, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.951142 }, { "epoch": 0.10366111165253548, "grad_norm": 0.8923045992851257, "learning_rate": 9.921557540743718e-06, "loss": 0.06468243896961212, "memory(GiB)": 20.68, "step": 3191, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.951206 }, { "epoch": 0.10369359711529091, "grad_norm": 1.234663486480713, "learning_rate": 9.921462737412316e-06, "loss": 0.05987425148487091, "memory(GiB)": 20.68, "step": 3192, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.10372608257804633, "grad_norm": 0.5544816851615906, "learning_rate": 9.92136787728068e-06, "loss": 0.06045300513505936, "memory(GiB)": 20.68, "step": 3193, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.951325 }, { "epoch": 0.10375856804080175, "grad_norm": 0.5481221675872803, "learning_rate": 9.92127296034991e-06, "loss": 0.06330916285514832, "memory(GiB)": 20.68, "step": 3194, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.951386 }, { "epoch": 0.10379105350355716, "grad_norm": 0.6461724638938904, "learning_rate": 9.9211779866211e-06, "loss": 0.06615142524242401, "memory(GiB)": 20.68, "step": 3195, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.951448 }, { "epoch": 0.10382353896631258, "grad_norm": 0.907303512096405, "learning_rate": 9.921082956095346e-06, "loss": 0.0791928842663765, "memory(GiB)": 20.68, "step": 3196, "token_acc": 0.9685314685314685, "train_speed(iter/s)": 0.95151 }, { "epoch": 0.103856024429068, "grad_norm": 0.4834800362586975, "learning_rate": 9.920987868773743e-06, "loss": 0.056866008788347244, "memory(GiB)": 20.68, "step": 3197, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951566 }, { "epoch": 0.10388850989182341, "grad_norm": 0.7580191493034363, "learning_rate": 9.920892724657393e-06, "loss": 0.08290649950504303, "memory(GiB)": 20.68, "step": 3198, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.951624 }, { "epoch": 0.10392099535457883, "grad_norm": 2.203162431716919, "learning_rate": 9.92079752374739e-06, "loss": 0.07177584618330002, "memory(GiB)": 20.68, "step": 3199, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.951685 }, { "epoch": 0.10395348081733424, "grad_norm": 0.6716164946556091, "learning_rate": 9.920702266044834e-06, "loss": 0.06395512819290161, "memory(GiB)": 20.68, "step": 3200, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.951746 }, { "epoch": 0.10398596628008966, "grad_norm": 0.7838526964187622, "learning_rate": 9.920606951550824e-06, "loss": 0.07126069068908691, "memory(GiB)": 20.68, "step": 3201, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.95181 }, { "epoch": 0.10401845174284508, "grad_norm": 0.5510602593421936, "learning_rate": 9.92051158026646e-06, "loss": 0.06714533269405365, "memory(GiB)": 20.68, "step": 3202, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.951873 }, { "epoch": 0.10405093720560049, "grad_norm": 0.6476977467536926, "learning_rate": 9.920416152192845e-06, "loss": 0.0780482217669487, "memory(GiB)": 20.68, "step": 3203, "token_acc": 0.96875, "train_speed(iter/s)": 0.951937 }, { "epoch": 0.10408342266835591, "grad_norm": 0.6457006931304932, "learning_rate": 9.920320667331077e-06, "loss": 0.07111009210348129, "memory(GiB)": 20.68, "step": 3204, "token_acc": 0.976, "train_speed(iter/s)": 0.951995 }, { "epoch": 0.10411590813111132, "grad_norm": 0.49746373295783997, "learning_rate": 9.920225125682261e-06, "loss": 0.07351090013980865, "memory(GiB)": 20.68, "step": 3205, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.952042 }, { "epoch": 0.10414839359386674, "grad_norm": 0.5586633682250977, "learning_rate": 9.920129527247497e-06, "loss": 0.0654924064874649, "memory(GiB)": 20.68, "step": 3206, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.952092 }, { "epoch": 0.10418087905662216, "grad_norm": 0.8362380266189575, "learning_rate": 9.92003387202789e-06, "loss": 0.0791136771440506, "memory(GiB)": 20.68, "step": 3207, "token_acc": 0.96875, "train_speed(iter/s)": 0.952142 }, { "epoch": 0.10421336451937759, "grad_norm": 0.5082883238792419, "learning_rate": 9.919938160024545e-06, "loss": 0.07373875379562378, "memory(GiB)": 20.68, "step": 3208, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.952191 }, { "epoch": 0.104245849982133, "grad_norm": 0.6787512898445129, "learning_rate": 9.919842391238563e-06, "loss": 0.07963597774505615, "memory(GiB)": 20.68, "step": 3209, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.95224 }, { "epoch": 0.10427833544488842, "grad_norm": 0.7550033330917358, "learning_rate": 9.919746565671052e-06, "loss": 0.06817633658647537, "memory(GiB)": 20.68, "step": 3210, "token_acc": 0.972, "train_speed(iter/s)": 0.952291 }, { "epoch": 0.10431082090764383, "grad_norm": 0.8163186311721802, "learning_rate": 9.91965068332312e-06, "loss": 0.07393019646406174, "memory(GiB)": 20.68, "step": 3211, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.95234 }, { "epoch": 0.10434330637039925, "grad_norm": 0.6206654906272888, "learning_rate": 9.919554744195868e-06, "loss": 0.07169956713914871, "memory(GiB)": 20.68, "step": 3212, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.952388 }, { "epoch": 0.10437579183315467, "grad_norm": 0.5783643126487732, "learning_rate": 9.919458748290406e-06, "loss": 0.06638604402542114, "memory(GiB)": 20.68, "step": 3213, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.952438 }, { "epoch": 0.10440827729591008, "grad_norm": 0.5995649099349976, "learning_rate": 9.919362695607844e-06, "loss": 0.07270704209804535, "memory(GiB)": 20.68, "step": 3214, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.952485 }, { "epoch": 0.1044407627586655, "grad_norm": 0.566425085067749, "learning_rate": 9.919266586149287e-06, "loss": 0.0722808986902237, "memory(GiB)": 20.68, "step": 3215, "token_acc": 0.9575471698113207, "train_speed(iter/s)": 0.952526 }, { "epoch": 0.10447324822142091, "grad_norm": 1.277557611465454, "learning_rate": 9.919170419915848e-06, "loss": 0.0869130939245224, "memory(GiB)": 20.68, "step": 3216, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.952576 }, { "epoch": 0.10450573368417633, "grad_norm": 0.6117698550224304, "learning_rate": 9.919074196908633e-06, "loss": 0.082496277987957, "memory(GiB)": 20.68, "step": 3217, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.952628 }, { "epoch": 0.10453821914693175, "grad_norm": 0.5791113972663879, "learning_rate": 9.918977917128757e-06, "loss": 0.061613332480192184, "memory(GiB)": 20.68, "step": 3218, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95268 }, { "epoch": 0.10457070460968716, "grad_norm": 0.6507553458213806, "learning_rate": 9.918881580577328e-06, "loss": 0.06663067638874054, "memory(GiB)": 20.68, "step": 3219, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.952731 }, { "epoch": 0.10460319007244258, "grad_norm": 0.6892642974853516, "learning_rate": 9.918785187255457e-06, "loss": 0.08323191851377487, "memory(GiB)": 20.68, "step": 3220, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.952782 }, { "epoch": 0.104635675535198, "grad_norm": 0.8423733711242676, "learning_rate": 9.918688737164259e-06, "loss": 0.08843529969453812, "memory(GiB)": 20.68, "step": 3221, "token_acc": 0.9632352941176471, "train_speed(iter/s)": 0.95283 }, { "epoch": 0.10466816099795341, "grad_norm": 0.7408856153488159, "learning_rate": 9.918592230304846e-06, "loss": 0.07498259097337723, "memory(GiB)": 20.68, "step": 3222, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.10470064646070883, "grad_norm": 0.6788366436958313, "learning_rate": 9.918495666678335e-06, "loss": 0.06892971694469452, "memory(GiB)": 20.68, "step": 3223, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.952922 }, { "epoch": 0.10473313192346426, "grad_norm": 0.9567059874534607, "learning_rate": 9.918399046285835e-06, "loss": 0.06235939636826515, "memory(GiB)": 20.68, "step": 3224, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.952968 }, { "epoch": 0.10476561738621967, "grad_norm": 0.8152669072151184, "learning_rate": 9.918302369128461e-06, "loss": 0.0769052505493164, "memory(GiB)": 20.68, "step": 3225, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.953015 }, { "epoch": 0.10479810284897509, "grad_norm": 0.8417527675628662, "learning_rate": 9.918205635207334e-06, "loss": 0.07608025521039963, "memory(GiB)": 20.68, "step": 3226, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.953058 }, { "epoch": 0.1048305883117305, "grad_norm": 0.5954437851905823, "learning_rate": 9.918108844523569e-06, "loss": 0.060899458825588226, "memory(GiB)": 20.68, "step": 3227, "token_acc": 0.9644670050761421, "train_speed(iter/s)": 0.953105 }, { "epoch": 0.10486307377448592, "grad_norm": 0.6989308595657349, "learning_rate": 9.91801199707828e-06, "loss": 0.07380266487598419, "memory(GiB)": 20.68, "step": 3228, "token_acc": 0.9357798165137615, "train_speed(iter/s)": 0.953149 }, { "epoch": 0.10489555923724134, "grad_norm": 0.5993467569351196, "learning_rate": 9.917915092872588e-06, "loss": 0.0646834596991539, "memory(GiB)": 20.68, "step": 3229, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.953197 }, { "epoch": 0.10492804469999675, "grad_norm": 0.6729225516319275, "learning_rate": 9.917818131907608e-06, "loss": 0.07422619313001633, "memory(GiB)": 20.68, "step": 3230, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.953238 }, { "epoch": 0.10496053016275217, "grad_norm": 0.6798508763313293, "learning_rate": 9.917721114184464e-06, "loss": 0.06267231702804565, "memory(GiB)": 20.68, "step": 3231, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.953286 }, { "epoch": 0.10499301562550759, "grad_norm": 0.7373266220092773, "learning_rate": 9.91762403970427e-06, "loss": 0.09059175848960876, "memory(GiB)": 20.68, "step": 3232, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.953326 }, { "epoch": 0.105025501088263, "grad_norm": 1.7490994930267334, "learning_rate": 9.917526908468151e-06, "loss": 0.07539068907499313, "memory(GiB)": 20.68, "step": 3233, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.953364 }, { "epoch": 0.10505798655101842, "grad_norm": 0.8064090609550476, "learning_rate": 9.917429720477226e-06, "loss": 0.08239847421646118, "memory(GiB)": 20.68, "step": 3234, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.953412 }, { "epoch": 0.10509047201377383, "grad_norm": 0.7815049886703491, "learning_rate": 9.917332475732618e-06, "loss": 0.07204727083444595, "memory(GiB)": 20.68, "step": 3235, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.953464 }, { "epoch": 0.10512295747652925, "grad_norm": 0.717724084854126, "learning_rate": 9.917235174235446e-06, "loss": 0.07273285835981369, "memory(GiB)": 20.68, "step": 3236, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.953508 }, { "epoch": 0.10515544293928467, "grad_norm": 0.6051080822944641, "learning_rate": 9.917137815986837e-06, "loss": 0.07510250061750412, "memory(GiB)": 20.68, "step": 3237, "token_acc": 0.9554455445544554, "train_speed(iter/s)": 0.953548 }, { "epoch": 0.10518792840204008, "grad_norm": 0.5603002905845642, "learning_rate": 9.917040400987912e-06, "loss": 0.07717113196849823, "memory(GiB)": 20.68, "step": 3238, "token_acc": 0.97, "train_speed(iter/s)": 0.953589 }, { "epoch": 0.1052204138647955, "grad_norm": 0.5052600502967834, "learning_rate": 9.916942929239797e-06, "loss": 0.07144543528556824, "memory(GiB)": 20.68, "step": 3239, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.10525289932755093, "grad_norm": 0.49785566329956055, "learning_rate": 9.916845400743616e-06, "loss": 0.07540800422430038, "memory(GiB)": 20.68, "step": 3240, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.953676 }, { "epoch": 0.10528538479030634, "grad_norm": 0.48994871973991394, "learning_rate": 9.916747815500494e-06, "loss": 0.07190405577421188, "memory(GiB)": 20.68, "step": 3241, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.953717 }, { "epoch": 0.10531787025306176, "grad_norm": 0.5160288214683533, "learning_rate": 9.916650173511558e-06, "loss": 0.06959061324596405, "memory(GiB)": 20.68, "step": 3242, "token_acc": 0.9592760180995475, "train_speed(iter/s)": 0.953766 }, { "epoch": 0.10535035571581718, "grad_norm": 0.4634712338447571, "learning_rate": 9.916552474777936e-06, "loss": 0.06387029588222504, "memory(GiB)": 20.68, "step": 3243, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.953825 }, { "epoch": 0.10538284117857259, "grad_norm": 0.5251427292823792, "learning_rate": 9.916454719300755e-06, "loss": 0.07875953614711761, "memory(GiB)": 20.68, "step": 3244, "token_acc": 0.952191235059761, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.10541532664132801, "grad_norm": 0.5570263266563416, "learning_rate": 9.91635690708114e-06, "loss": 0.06630124896764755, "memory(GiB)": 20.68, "step": 3245, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.953945 }, { "epoch": 0.10544781210408342, "grad_norm": 0.5715221762657166, "learning_rate": 9.916259038120225e-06, "loss": 0.056980155408382416, "memory(GiB)": 20.68, "step": 3246, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.954006 }, { "epoch": 0.10548029756683884, "grad_norm": 0.479914128780365, "learning_rate": 9.916161112419136e-06, "loss": 0.0738113522529602, "memory(GiB)": 20.68, "step": 3247, "token_acc": 0.9595588235294118, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.10551278302959426, "grad_norm": 0.5630338788032532, "learning_rate": 9.916063129979005e-06, "loss": 0.0699821263551712, "memory(GiB)": 20.68, "step": 3248, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.954131 }, { "epoch": 0.10554526849234967, "grad_norm": 0.6982818841934204, "learning_rate": 9.915965090800963e-06, "loss": 0.07676489651203156, "memory(GiB)": 20.68, "step": 3249, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.95419 }, { "epoch": 0.10557775395510509, "grad_norm": 0.6362705826759338, "learning_rate": 9.91586699488614e-06, "loss": 0.07820861041545868, "memory(GiB)": 20.68, "step": 3250, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.954247 }, { "epoch": 0.1056102394178605, "grad_norm": 0.6795176267623901, "learning_rate": 9.91576884223567e-06, "loss": 0.07716383039951324, "memory(GiB)": 20.68, "step": 3251, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.954301 }, { "epoch": 0.10564272488061592, "grad_norm": 0.546572744846344, "learning_rate": 9.915670632850682e-06, "loss": 0.06979314237833023, "memory(GiB)": 20.68, "step": 3252, "token_acc": 0.9660194174757282, "train_speed(iter/s)": 0.954358 }, { "epoch": 0.10567521034337134, "grad_norm": 0.5574122667312622, "learning_rate": 9.915572366732314e-06, "loss": 0.0624147430062294, "memory(GiB)": 20.68, "step": 3253, "token_acc": 0.9832402234636871, "train_speed(iter/s)": 0.954415 }, { "epoch": 0.10570769580612675, "grad_norm": 0.6656304597854614, "learning_rate": 9.915474043881696e-06, "loss": 0.0667933002114296, "memory(GiB)": 20.68, "step": 3254, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.954472 }, { "epoch": 0.10574018126888217, "grad_norm": 0.5642129182815552, "learning_rate": 9.915375664299968e-06, "loss": 0.06859434396028519, "memory(GiB)": 20.68, "step": 3255, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.954528 }, { "epoch": 0.1057726667316376, "grad_norm": 0.7298140525817871, "learning_rate": 9.91527722798826e-06, "loss": 0.07630513608455658, "memory(GiB)": 20.68, "step": 3256, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.954582 }, { "epoch": 0.10580515219439302, "grad_norm": 1.2443921566009521, "learning_rate": 9.915178734947714e-06, "loss": 0.08033822476863861, "memory(GiB)": 20.68, "step": 3257, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.954637 }, { "epoch": 0.10583763765714843, "grad_norm": 1.6289021968841553, "learning_rate": 9.91508018517946e-06, "loss": 0.06767436861991882, "memory(GiB)": 20.68, "step": 3258, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.954694 }, { "epoch": 0.10587012311990385, "grad_norm": 0.5212937593460083, "learning_rate": 9.91498157868464e-06, "loss": 0.06535626947879791, "memory(GiB)": 20.68, "step": 3259, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.10590260858265926, "grad_norm": 0.7195817232131958, "learning_rate": 9.914882915464391e-06, "loss": 0.06756436079740524, "memory(GiB)": 20.68, "step": 3260, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.10593509404541468, "grad_norm": 0.7436690926551819, "learning_rate": 9.914784195519851e-06, "loss": 0.07080067694187164, "memory(GiB)": 20.68, "step": 3261, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.1059675795081701, "grad_norm": 0.6793832778930664, "learning_rate": 9.914685418852162e-06, "loss": 0.056230366230010986, "memory(GiB)": 20.68, "step": 3262, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.95492 }, { "epoch": 0.10600006497092551, "grad_norm": 0.7064168453216553, "learning_rate": 9.914586585462458e-06, "loss": 0.0660921186208725, "memory(GiB)": 20.68, "step": 3263, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.954971 }, { "epoch": 0.10603255043368093, "grad_norm": 0.5720588564872742, "learning_rate": 9.914487695351887e-06, "loss": 0.06948281824588776, "memory(GiB)": 20.68, "step": 3264, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.955013 }, { "epoch": 0.10606503589643634, "grad_norm": 0.6671064496040344, "learning_rate": 9.914388748521584e-06, "loss": 0.06478352099657059, "memory(GiB)": 20.68, "step": 3265, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.95505 }, { "epoch": 0.10609752135919176, "grad_norm": 0.6682943105697632, "learning_rate": 9.914289744972695e-06, "loss": 0.08044472336769104, "memory(GiB)": 20.68, "step": 3266, "token_acc": 0.9800664451827242, "train_speed(iter/s)": 0.955099 }, { "epoch": 0.10613000682194718, "grad_norm": 0.6685208678245544, "learning_rate": 9.914190684706362e-06, "loss": 0.07801105082035065, "memory(GiB)": 20.68, "step": 3267, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.955138 }, { "epoch": 0.10616249228470259, "grad_norm": 2.6603920459747314, "learning_rate": 9.914091567723727e-06, "loss": 0.07304275035858154, "memory(GiB)": 20.68, "step": 3268, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.955181 }, { "epoch": 0.10619497774745801, "grad_norm": 0.5716231465339661, "learning_rate": 9.913992394025934e-06, "loss": 0.06325723230838776, "memory(GiB)": 20.68, "step": 3269, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.10622746321021342, "grad_norm": 0.6240772008895874, "learning_rate": 9.91389316361413e-06, "loss": 0.06905360519886017, "memory(GiB)": 20.68, "step": 3270, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.955277 }, { "epoch": 0.10625994867296884, "grad_norm": 0.9549640417098999, "learning_rate": 9.913793876489457e-06, "loss": 0.06480211019515991, "memory(GiB)": 20.68, "step": 3271, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.955326 }, { "epoch": 0.10629243413572427, "grad_norm": 0.7079087495803833, "learning_rate": 9.913694532653064e-06, "loss": 0.07102873921394348, "memory(GiB)": 20.68, "step": 3272, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.10632491959847969, "grad_norm": 0.7960822582244873, "learning_rate": 9.913595132106093e-06, "loss": 0.07521796226501465, "memory(GiB)": 20.68, "step": 3273, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955411 }, { "epoch": 0.1063574050612351, "grad_norm": 0.8194037079811096, "learning_rate": 9.913495674849698e-06, "loss": 0.09987880289554596, "memory(GiB)": 20.68, "step": 3274, "token_acc": 0.9664429530201343, "train_speed(iter/s)": 0.955449 }, { "epoch": 0.10638989052399052, "grad_norm": 0.73719722032547, "learning_rate": 9.913396160885021e-06, "loss": 0.08234100043773651, "memory(GiB)": 20.68, "step": 3275, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.10642237598674593, "grad_norm": 0.713227391242981, "learning_rate": 9.913296590213213e-06, "loss": 0.07759253680706024, "memory(GiB)": 20.68, "step": 3276, "token_acc": 0.9518072289156626, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.10645486144950135, "grad_norm": 0.9558910727500916, "learning_rate": 9.913196962835422e-06, "loss": 0.0761188417673111, "memory(GiB)": 20.68, "step": 3277, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.10648734691225677, "grad_norm": 0.6135875582695007, "learning_rate": 9.913097278752798e-06, "loss": 0.06648609042167664, "memory(GiB)": 20.68, "step": 3278, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.955633 }, { "epoch": 0.10651983237501218, "grad_norm": 0.5476568937301636, "learning_rate": 9.912997537966495e-06, "loss": 0.07458348572254181, "memory(GiB)": 20.68, "step": 3279, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.1065523178377676, "grad_norm": 0.5257256627082825, "learning_rate": 9.912897740477659e-06, "loss": 0.0692717432975769, "memory(GiB)": 20.68, "step": 3280, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.10658480330052301, "grad_norm": 1.0412225723266602, "learning_rate": 9.912797886287444e-06, "loss": 0.08991164714097977, "memory(GiB)": 20.68, "step": 3281, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.10661728876327843, "grad_norm": 0.7632762789726257, "learning_rate": 9.912697975397003e-06, "loss": 0.07266752421855927, "memory(GiB)": 20.68, "step": 3282, "token_acc": 0.9616858237547893, "train_speed(iter/s)": 0.955792 }, { "epoch": 0.10664977422603385, "grad_norm": 0.480461448431015, "learning_rate": 9.912598007807489e-06, "loss": 0.07230134308338165, "memory(GiB)": 20.68, "step": 3283, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.10668225968878926, "grad_norm": 0.664483904838562, "learning_rate": 9.912497983520056e-06, "loss": 0.07257169485092163, "memory(GiB)": 20.68, "step": 3284, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.10671474515154468, "grad_norm": 0.6973877549171448, "learning_rate": 9.912397902535855e-06, "loss": 0.07383192330598831, "memory(GiB)": 20.68, "step": 3285, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.955907 }, { "epoch": 0.1067472306143001, "grad_norm": 0.47674131393432617, "learning_rate": 9.912297764856047e-06, "loss": 0.06641754508018494, "memory(GiB)": 20.68, "step": 3286, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.10677971607705551, "grad_norm": 0.6113052368164062, "learning_rate": 9.912197570481782e-06, "loss": 0.0631365180015564, "memory(GiB)": 20.68, "step": 3287, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.10681220153981094, "grad_norm": 1.162591576576233, "learning_rate": 9.91209731941422e-06, "loss": 0.0685025230050087, "memory(GiB)": 20.68, "step": 3288, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955994 }, { "epoch": 0.10684468700256636, "grad_norm": 0.696290910243988, "learning_rate": 9.911997011654517e-06, "loss": 0.06169332563877106, "memory(GiB)": 20.68, "step": 3289, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956026 }, { "epoch": 0.10687717246532177, "grad_norm": 0.7452683448791504, "learning_rate": 9.911896647203831e-06, "loss": 0.06942038238048553, "memory(GiB)": 20.68, "step": 3290, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956067 }, { "epoch": 0.10690965792807719, "grad_norm": 0.753641664981842, "learning_rate": 9.91179622606332e-06, "loss": 0.07913971692323685, "memory(GiB)": 20.68, "step": 3291, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.1069421433908326, "grad_norm": 0.565043568611145, "learning_rate": 9.911695748234142e-06, "loss": 0.07186571508646011, "memory(GiB)": 20.68, "step": 3292, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.10697462885358802, "grad_norm": 0.699171245098114, "learning_rate": 9.911595213717458e-06, "loss": 0.0650998055934906, "memory(GiB)": 20.68, "step": 3293, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.956177 }, { "epoch": 0.10700711431634344, "grad_norm": 0.4265855550765991, "learning_rate": 9.911494622514429e-06, "loss": 0.0559389665722847, "memory(GiB)": 20.68, "step": 3294, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.10703959977909885, "grad_norm": 0.9817423224449158, "learning_rate": 9.911393974626213e-06, "loss": 0.07524693757295609, "memory(GiB)": 20.68, "step": 3295, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.956254 }, { "epoch": 0.10707208524185427, "grad_norm": 0.8326624631881714, "learning_rate": 9.911293270053974e-06, "loss": 0.06837715208530426, "memory(GiB)": 20.68, "step": 3296, "token_acc": 0.9746192893401016, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.10710457070460969, "grad_norm": 1.2275521755218506, "learning_rate": 9.911192508798875e-06, "loss": 0.07564420998096466, "memory(GiB)": 20.68, "step": 3297, "token_acc": 0.9708029197080292, "train_speed(iter/s)": 0.956336 }, { "epoch": 0.1071370561673651, "grad_norm": 0.621748685836792, "learning_rate": 9.911091690862075e-06, "loss": 0.06375443935394287, "memory(GiB)": 20.68, "step": 3298, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.10716954163012052, "grad_norm": 0.5911543965339661, "learning_rate": 9.910990816244743e-06, "loss": 0.07506280392408371, "memory(GiB)": 20.68, "step": 3299, "token_acc": 0.9711934156378601, "train_speed(iter/s)": 0.956427 }, { "epoch": 0.10720202709287593, "grad_norm": 0.9109557867050171, "learning_rate": 9.91088988494804e-06, "loss": 0.06428885459899902, "memory(GiB)": 20.68, "step": 3300, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.10723451255563135, "grad_norm": 0.6359692215919495, "learning_rate": 9.91078889697313e-06, "loss": 0.07129107415676117, "memory(GiB)": 20.68, "step": 3301, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.10726699801838677, "grad_norm": 0.9365057945251465, "learning_rate": 9.910687852321182e-06, "loss": 0.06740190088748932, "memory(GiB)": 20.68, "step": 3302, "token_acc": 0.9802631578947368, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.10729948348114218, "grad_norm": 0.712665319442749, "learning_rate": 9.910586750993358e-06, "loss": 0.07141876220703125, "memory(GiB)": 20.68, "step": 3303, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.956615 }, { "epoch": 0.10733196894389761, "grad_norm": 0.765669047832489, "learning_rate": 9.910485592990829e-06, "loss": 0.07214425504207611, "memory(GiB)": 20.68, "step": 3304, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.10736445440665303, "grad_norm": 0.6008631587028503, "learning_rate": 9.910384378314757e-06, "loss": 0.06639177352190018, "memory(GiB)": 20.68, "step": 3305, "token_acc": 0.96875, "train_speed(iter/s)": 0.956724 }, { "epoch": 0.10739693986940844, "grad_norm": 0.6265866160392761, "learning_rate": 9.910283106966316e-06, "loss": 0.07046587020158768, "memory(GiB)": 20.68, "step": 3306, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.956777 }, { "epoch": 0.10742942533216386, "grad_norm": 0.5807814598083496, "learning_rate": 9.910181778946672e-06, "loss": 0.0760042741894722, "memory(GiB)": 20.68, "step": 3307, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.956832 }, { "epoch": 0.10746191079491928, "grad_norm": 0.8752961754798889, "learning_rate": 9.910080394256995e-06, "loss": 0.07914184033870697, "memory(GiB)": 20.68, "step": 3308, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.1074943962576747, "grad_norm": 0.7739365100860596, "learning_rate": 9.909978952898455e-06, "loss": 0.07384086400270462, "memory(GiB)": 20.68, "step": 3309, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.10752688172043011, "grad_norm": 0.648760974407196, "learning_rate": 9.909877454872222e-06, "loss": 0.06284073740243912, "memory(GiB)": 20.68, "step": 3310, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956982 }, { "epoch": 0.10755936718318553, "grad_norm": 0.8409816026687622, "learning_rate": 9.90977590017947e-06, "loss": 0.08119416236877441, "memory(GiB)": 20.68, "step": 3311, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.957026 }, { "epoch": 0.10759185264594094, "grad_norm": 0.6168798208236694, "learning_rate": 9.909674288821367e-06, "loss": 0.07407891750335693, "memory(GiB)": 20.68, "step": 3312, "token_acc": 0.9653979238754326, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.10762433810869636, "grad_norm": 0.6546890139579773, "learning_rate": 9.90957262079909e-06, "loss": 0.07029427587985992, "memory(GiB)": 20.68, "step": 3313, "token_acc": 0.98, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.10765682357145177, "grad_norm": 0.6462401151657104, "learning_rate": 9.909470896113809e-06, "loss": 0.076771080493927, "memory(GiB)": 20.68, "step": 3314, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.10768930903420719, "grad_norm": 0.7232184410095215, "learning_rate": 9.909369114766699e-06, "loss": 0.0802670568227768, "memory(GiB)": 20.68, "step": 3315, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.1077217944969626, "grad_norm": 0.9316089153289795, "learning_rate": 9.909267276758935e-06, "loss": 0.0907629132270813, "memory(GiB)": 20.68, "step": 3316, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.10775427995971802, "grad_norm": 0.5554357171058655, "learning_rate": 9.909165382091693e-06, "loss": 0.0639520138502121, "memory(GiB)": 20.68, "step": 3317, "token_acc": 0.9652173913043478, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.10778676542247344, "grad_norm": 0.8980207443237305, "learning_rate": 9.909063430766147e-06, "loss": 0.07354798913002014, "memory(GiB)": 20.68, "step": 3318, "token_acc": 0.967032967032967, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.10781925088522885, "grad_norm": 0.9377707839012146, "learning_rate": 9.908961422783478e-06, "loss": 0.08599071949720383, "memory(GiB)": 20.68, "step": 3319, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.10785173634798428, "grad_norm": 0.47032254934310913, "learning_rate": 9.90885935814486e-06, "loss": 0.05903608351945877, "memory(GiB)": 20.68, "step": 3320, "token_acc": 0.9673469387755103, "train_speed(iter/s)": 0.957392 }, { "epoch": 0.1078842218107397, "grad_norm": 0.5719032287597656, "learning_rate": 9.908757236851469e-06, "loss": 0.07065272331237793, "memory(GiB)": 20.68, "step": 3321, "token_acc": 0.9725274725274725, "train_speed(iter/s)": 0.957434 }, { "epoch": 0.10791670727349512, "grad_norm": 0.9890976548194885, "learning_rate": 9.908655058904488e-06, "loss": 0.08681212365627289, "memory(GiB)": 20.68, "step": 3322, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.10794919273625053, "grad_norm": 0.6267290711402893, "learning_rate": 9.908552824305094e-06, "loss": 0.07154443114995956, "memory(GiB)": 20.68, "step": 3323, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.957517 }, { "epoch": 0.10798167819900595, "grad_norm": 0.7349103093147278, "learning_rate": 9.908450533054466e-06, "loss": 0.07329584658145905, "memory(GiB)": 20.68, "step": 3324, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.10801416366176136, "grad_norm": 0.5522254705429077, "learning_rate": 9.908348185153785e-06, "loss": 0.07164254784584045, "memory(GiB)": 20.68, "step": 3325, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.10804664912451678, "grad_norm": 0.748806893825531, "learning_rate": 9.908245780604235e-06, "loss": 0.08336609601974487, "memory(GiB)": 20.68, "step": 3326, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.1080791345872722, "grad_norm": 0.6983041167259216, "learning_rate": 9.908143319406995e-06, "loss": 0.06982161849737167, "memory(GiB)": 20.68, "step": 3327, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.10811162005002761, "grad_norm": 0.8511934280395508, "learning_rate": 9.908040801563248e-06, "loss": 0.07953263819217682, "memory(GiB)": 20.68, "step": 3328, "token_acc": 0.9458333333333333, "train_speed(iter/s)": 0.957713 }, { "epoch": 0.10814410551278303, "grad_norm": 0.8207528591156006, "learning_rate": 9.90793822707418e-06, "loss": 0.06989921629428864, "memory(GiB)": 20.68, "step": 3329, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.10817659097553844, "grad_norm": 1.2959060668945312, "learning_rate": 9.90783559594097e-06, "loss": 0.09354592859745026, "memory(GiB)": 20.68, "step": 3330, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.10820907643829386, "grad_norm": 0.8338470458984375, "learning_rate": 9.907732908164803e-06, "loss": 0.06055685505270958, "memory(GiB)": 20.68, "step": 3331, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.10824156190104928, "grad_norm": 0.4289439618587494, "learning_rate": 9.907630163746868e-06, "loss": 0.061301201581954956, "memory(GiB)": 20.68, "step": 3332, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.10827404736380469, "grad_norm": 0.6582610011100769, "learning_rate": 9.90752736268835e-06, "loss": 0.0618230439722538, "memory(GiB)": 20.68, "step": 3333, "token_acc": 0.9671532846715328, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.10830653282656011, "grad_norm": 0.5462192296981812, "learning_rate": 9.907424504990433e-06, "loss": 0.060274094343185425, "memory(GiB)": 20.68, "step": 3334, "token_acc": 0.9724137931034482, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.10833901828931553, "grad_norm": 0.4982307255268097, "learning_rate": 9.907321590654306e-06, "loss": 0.06597933918237686, "memory(GiB)": 20.68, "step": 3335, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.10837150375207096, "grad_norm": 0.5920155048370361, "learning_rate": 9.907218619681156e-06, "loss": 0.06205233559012413, "memory(GiB)": 20.68, "step": 3336, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.10840398921482637, "grad_norm": 1.4837902784347534, "learning_rate": 9.907115592072173e-06, "loss": 0.06805935502052307, "memory(GiB)": 20.68, "step": 3337, "token_acc": 0.9621621621621622, "train_speed(iter/s)": 0.95813 }, { "epoch": 0.10843647467758179, "grad_norm": 0.5989611744880676, "learning_rate": 9.907012507828542e-06, "loss": 0.06794501841068268, "memory(GiB)": 20.68, "step": 3338, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.1084689601403372, "grad_norm": 0.7174801826477051, "learning_rate": 9.906909366951459e-06, "loss": 0.08013282716274261, "memory(GiB)": 20.68, "step": 3339, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.10850144560309262, "grad_norm": 0.8614025712013245, "learning_rate": 9.906806169442107e-06, "loss": 0.07368443161249161, "memory(GiB)": 20.68, "step": 3340, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.9583 }, { "epoch": 0.10853393106584804, "grad_norm": 0.5418305397033691, "learning_rate": 9.906702915301682e-06, "loss": 0.07103689014911652, "memory(GiB)": 20.68, "step": 3341, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.958355 }, { "epoch": 0.10856641652860345, "grad_norm": 0.6476753354072571, "learning_rate": 9.906599604531377e-06, "loss": 0.061690159142017365, "memory(GiB)": 20.68, "step": 3342, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.958415 }, { "epoch": 0.10859890199135887, "grad_norm": 0.607428789138794, "learning_rate": 9.906496237132379e-06, "loss": 0.07499537616968155, "memory(GiB)": 20.68, "step": 3343, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.10863138745411428, "grad_norm": 0.4679381549358368, "learning_rate": 9.906392813105885e-06, "loss": 0.05715227127075195, "memory(GiB)": 20.68, "step": 3344, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.958529 }, { "epoch": 0.1086638729168697, "grad_norm": 0.8340011835098267, "learning_rate": 9.906289332453087e-06, "loss": 0.07892870903015137, "memory(GiB)": 20.68, "step": 3345, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.958587 }, { "epoch": 0.10869635837962512, "grad_norm": 0.7806830406188965, "learning_rate": 9.90618579517518e-06, "loss": 0.07065209746360779, "memory(GiB)": 20.68, "step": 3346, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.958642 }, { "epoch": 0.10872884384238053, "grad_norm": 0.626278817653656, "learning_rate": 9.90608220127336e-06, "loss": 0.06545288860797882, "memory(GiB)": 20.68, "step": 3347, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.958697 }, { "epoch": 0.10876132930513595, "grad_norm": 0.5953407287597656, "learning_rate": 9.905978550748821e-06, "loss": 0.05960187315940857, "memory(GiB)": 20.68, "step": 3348, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.958755 }, { "epoch": 0.10879381476789136, "grad_norm": 0.5217024683952332, "learning_rate": 9.905874843602759e-06, "loss": 0.05651625245809555, "memory(GiB)": 20.68, "step": 3349, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.958813 }, { "epoch": 0.10882630023064678, "grad_norm": 0.7175347208976746, "learning_rate": 9.905771079836372e-06, "loss": 0.0708460733294487, "memory(GiB)": 20.68, "step": 3350, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.958865 }, { "epoch": 0.1088587856934022, "grad_norm": 0.6204443573951721, "learning_rate": 9.905667259450858e-06, "loss": 0.06335458159446716, "memory(GiB)": 20.68, "step": 3351, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.958919 }, { "epoch": 0.10889127115615763, "grad_norm": 1.201230525970459, "learning_rate": 9.905563382447411e-06, "loss": 0.07091140747070312, "memory(GiB)": 20.68, "step": 3352, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.958971 }, { "epoch": 0.10892375661891304, "grad_norm": 0.6073061227798462, "learning_rate": 9.905459448827238e-06, "loss": 0.07625474035739899, "memory(GiB)": 20.68, "step": 3353, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.959026 }, { "epoch": 0.10895624208166846, "grad_norm": 0.8245109915733337, "learning_rate": 9.90535545859153e-06, "loss": 0.07701342552900314, "memory(GiB)": 20.68, "step": 3354, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.959081 }, { "epoch": 0.10898872754442387, "grad_norm": 0.6278905868530273, "learning_rate": 9.905251411741493e-06, "loss": 0.07448849081993103, "memory(GiB)": 20.68, "step": 3355, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.959132 }, { "epoch": 0.10902121300717929, "grad_norm": 0.49539053440093994, "learning_rate": 9.905147308278325e-06, "loss": 0.06289780884981155, "memory(GiB)": 20.68, "step": 3356, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.959182 }, { "epoch": 0.1090536984699347, "grad_norm": 0.5712940692901611, "learning_rate": 9.905043148203229e-06, "loss": 0.0656241849064827, "memory(GiB)": 20.68, "step": 3357, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.95924 }, { "epoch": 0.10908618393269012, "grad_norm": 0.8388696312904358, "learning_rate": 9.904938931517406e-06, "loss": 0.0668010413646698, "memory(GiB)": 20.68, "step": 3358, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.959294 }, { "epoch": 0.10911866939544554, "grad_norm": 0.6184671521186829, "learning_rate": 9.90483465822206e-06, "loss": 0.06479325145483017, "memory(GiB)": 20.68, "step": 3359, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.959347 }, { "epoch": 0.10915115485820095, "grad_norm": 1.0175855159759521, "learning_rate": 9.904730328318392e-06, "loss": 0.06368741393089294, "memory(GiB)": 20.68, "step": 3360, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.959403 }, { "epoch": 0.10918364032095637, "grad_norm": 0.5702999830245972, "learning_rate": 9.90462594180761e-06, "loss": 0.07427515089511871, "memory(GiB)": 20.68, "step": 3361, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.959457 }, { "epoch": 0.10921612578371179, "grad_norm": 0.6093025207519531, "learning_rate": 9.904521498690916e-06, "loss": 0.06573084741830826, "memory(GiB)": 20.68, "step": 3362, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.959514 }, { "epoch": 0.1092486112464672, "grad_norm": 0.8069022297859192, "learning_rate": 9.904416998969514e-06, "loss": 0.07266612350940704, "memory(GiB)": 20.68, "step": 3363, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.959569 }, { "epoch": 0.10928109670922262, "grad_norm": 0.6813663244247437, "learning_rate": 9.904312442644614e-06, "loss": 0.06560172885656357, "memory(GiB)": 20.68, "step": 3364, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.959624 }, { "epoch": 0.10931358217197804, "grad_norm": 1.035228967666626, "learning_rate": 9.90420782971742e-06, "loss": 0.0764450952410698, "memory(GiB)": 20.68, "step": 3365, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.95968 }, { "epoch": 0.10934606763473345, "grad_norm": 0.7662821412086487, "learning_rate": 9.904103160189142e-06, "loss": 0.0642855167388916, "memory(GiB)": 20.68, "step": 3366, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.959729 }, { "epoch": 0.10937855309748887, "grad_norm": 0.4435080587863922, "learning_rate": 9.903998434060983e-06, "loss": 0.057464007288217545, "memory(GiB)": 20.68, "step": 3367, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.959784 }, { "epoch": 0.1094110385602443, "grad_norm": 0.5146552920341492, "learning_rate": 9.903893651334158e-06, "loss": 0.06427812576293945, "memory(GiB)": 20.68, "step": 3368, "token_acc": 0.9610894941634242, "train_speed(iter/s)": 0.959832 }, { "epoch": 0.10944352402299971, "grad_norm": 1.0614955425262451, "learning_rate": 9.903788812009872e-06, "loss": 0.08445417881011963, "memory(GiB)": 20.68, "step": 3369, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.959869 }, { "epoch": 0.10947600948575513, "grad_norm": 1.4602693319320679, "learning_rate": 9.903683916089334e-06, "loss": 0.09513762593269348, "memory(GiB)": 20.68, "step": 3370, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.959913 }, { "epoch": 0.10950849494851055, "grad_norm": 0.9231863617897034, "learning_rate": 9.903578963573762e-06, "loss": 0.06905633211135864, "memory(GiB)": 20.68, "step": 3371, "token_acc": 0.968, "train_speed(iter/s)": 0.959958 }, { "epoch": 0.10954098041126596, "grad_norm": 0.8710924983024597, "learning_rate": 9.903473954464357e-06, "loss": 0.07569344341754913, "memory(GiB)": 20.68, "step": 3372, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.960004 }, { "epoch": 0.10957346587402138, "grad_norm": 0.7954419851303101, "learning_rate": 9.903368888762339e-06, "loss": 0.07851579785346985, "memory(GiB)": 20.68, "step": 3373, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.960047 }, { "epoch": 0.1096059513367768, "grad_norm": 1.02206289768219, "learning_rate": 9.903263766468916e-06, "loss": 0.06628181040287018, "memory(GiB)": 20.68, "step": 3374, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.960092 }, { "epoch": 0.10963843679953221, "grad_norm": 0.6022622585296631, "learning_rate": 9.903158587585302e-06, "loss": 0.07419717311859131, "memory(GiB)": 20.68, "step": 3375, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.960137 }, { "epoch": 0.10967092226228763, "grad_norm": 0.7527148723602295, "learning_rate": 9.903053352112714e-06, "loss": 0.0725550577044487, "memory(GiB)": 20.68, "step": 3376, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.960184 }, { "epoch": 0.10970340772504304, "grad_norm": 0.5278344750404358, "learning_rate": 9.902948060052364e-06, "loss": 0.06306146830320358, "memory(GiB)": 20.68, "step": 3377, "token_acc": 0.97265625, "train_speed(iter/s)": 0.960217 }, { "epoch": 0.10973589318779846, "grad_norm": 0.5913702845573425, "learning_rate": 9.902842711405467e-06, "loss": 0.07420830428600311, "memory(GiB)": 20.68, "step": 3378, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.960259 }, { "epoch": 0.10976837865055387, "grad_norm": 0.5308956503868103, "learning_rate": 9.90273730617324e-06, "loss": 0.0689634308218956, "memory(GiB)": 20.68, "step": 3379, "token_acc": 0.9595959595959596, "train_speed(iter/s)": 0.960299 }, { "epoch": 0.10980086411330929, "grad_norm": 0.4878631830215454, "learning_rate": 9.9026318443569e-06, "loss": 0.067339688539505, "memory(GiB)": 20.68, "step": 3380, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.960338 }, { "epoch": 0.1098333495760647, "grad_norm": 0.5773593187332153, "learning_rate": 9.902526325957662e-06, "loss": 0.06795363128185272, "memory(GiB)": 20.68, "step": 3381, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.960375 }, { "epoch": 0.10986583503882012, "grad_norm": 1.0705435276031494, "learning_rate": 9.902420750976745e-06, "loss": 0.0774630531668663, "memory(GiB)": 20.68, "step": 3382, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.960411 }, { "epoch": 0.10989832050157554, "grad_norm": 0.4634064733982086, "learning_rate": 9.902315119415367e-06, "loss": 0.07212384790182114, "memory(GiB)": 20.68, "step": 3383, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.960448 }, { "epoch": 0.10993080596433097, "grad_norm": 0.5295431613922119, "learning_rate": 9.902209431274748e-06, "loss": 0.06298486888408661, "memory(GiB)": 20.68, "step": 3384, "token_acc": 0.9641025641025641, "train_speed(iter/s)": 0.960489 }, { "epoch": 0.10996329142708638, "grad_norm": 0.5651629567146301, "learning_rate": 9.902103686556106e-06, "loss": 0.05987007915973663, "memory(GiB)": 20.68, "step": 3385, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.960527 }, { "epoch": 0.1099957768898418, "grad_norm": 0.6527592539787292, "learning_rate": 9.901997885260663e-06, "loss": 0.08210144937038422, "memory(GiB)": 20.68, "step": 3386, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.960569 }, { "epoch": 0.11002826235259722, "grad_norm": 0.6614488363265991, "learning_rate": 9.901892027389641e-06, "loss": 0.06383085995912552, "memory(GiB)": 20.68, "step": 3387, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.960607 }, { "epoch": 0.11006074781535263, "grad_norm": 0.5894842147827148, "learning_rate": 9.90178611294426e-06, "loss": 0.07109402120113373, "memory(GiB)": 20.68, "step": 3388, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.960649 }, { "epoch": 0.11009323327810805, "grad_norm": 0.4777211844921112, "learning_rate": 9.901680141925745e-06, "loss": 0.060584791004657745, "memory(GiB)": 20.68, "step": 3389, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.960688 }, { "epoch": 0.11012571874086347, "grad_norm": 0.5316739082336426, "learning_rate": 9.901574114335315e-06, "loss": 0.0688563734292984, "memory(GiB)": 20.68, "step": 3390, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.96073 }, { "epoch": 0.11015820420361888, "grad_norm": 0.5776918530464172, "learning_rate": 9.901468030174194e-06, "loss": 0.07211905717849731, "memory(GiB)": 20.68, "step": 3391, "token_acc": 0.968, "train_speed(iter/s)": 0.960774 }, { "epoch": 0.1101906896663743, "grad_norm": 0.6128042340278625, "learning_rate": 9.901361889443612e-06, "loss": 0.060946084558963776, "memory(GiB)": 20.68, "step": 3392, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.960811 }, { "epoch": 0.11022317512912971, "grad_norm": 0.7529333233833313, "learning_rate": 9.901255692144789e-06, "loss": 0.0610809251666069, "memory(GiB)": 20.68, "step": 3393, "token_acc": 0.98, "train_speed(iter/s)": 0.960858 }, { "epoch": 0.11025566059188513, "grad_norm": 0.642336368560791, "learning_rate": 9.90114943827895e-06, "loss": 0.07250040769577026, "memory(GiB)": 20.68, "step": 3394, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.960903 }, { "epoch": 0.11028814605464055, "grad_norm": 0.6887847185134888, "learning_rate": 9.901043127847324e-06, "loss": 0.06970959156751633, "memory(GiB)": 20.68, "step": 3395, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.960943 }, { "epoch": 0.11032063151739596, "grad_norm": 0.8184999823570251, "learning_rate": 9.900936760851137e-06, "loss": 0.06849296391010284, "memory(GiB)": 20.68, "step": 3396, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.960983 }, { "epoch": 0.11035311698015138, "grad_norm": 0.6604970693588257, "learning_rate": 9.900830337291617e-06, "loss": 0.07529912143945694, "memory(GiB)": 20.68, "step": 3397, "token_acc": 0.95, "train_speed(iter/s)": 0.961022 }, { "epoch": 0.1103856024429068, "grad_norm": 1.6746516227722168, "learning_rate": 9.900723857169991e-06, "loss": 0.08658204227685928, "memory(GiB)": 20.68, "step": 3398, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.961063 }, { "epoch": 0.11041808790566221, "grad_norm": 1.1203992366790771, "learning_rate": 9.90061732048749e-06, "loss": 0.07440784573554993, "memory(GiB)": 20.68, "step": 3399, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.961097 }, { "epoch": 0.11045057336841764, "grad_norm": 0.6903350353240967, "learning_rate": 9.900510727245343e-06, "loss": 0.07254608720541, "memory(GiB)": 20.68, "step": 3400, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.961153 }, { "epoch": 0.11048305883117306, "grad_norm": 0.7420654892921448, "learning_rate": 9.900404077444777e-06, "loss": 0.05990337207913399, "memory(GiB)": 20.68, "step": 3401, "token_acc": 0.9708029197080292, "train_speed(iter/s)": 0.961207 }, { "epoch": 0.11051554429392847, "grad_norm": 0.48300251364707947, "learning_rate": 9.900297371087027e-06, "loss": 0.05982176586985588, "memory(GiB)": 20.68, "step": 3402, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.961262 }, { "epoch": 0.11054802975668389, "grad_norm": 0.7285882830619812, "learning_rate": 9.900190608173324e-06, "loss": 0.07879573106765747, "memory(GiB)": 20.68, "step": 3403, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.961317 }, { "epoch": 0.1105805152194393, "grad_norm": 0.7617006897926331, "learning_rate": 9.900083788704899e-06, "loss": 0.08483521640300751, "memory(GiB)": 20.68, "step": 3404, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.961372 }, { "epoch": 0.11061300068219472, "grad_norm": 0.9730177521705627, "learning_rate": 9.899976912682984e-06, "loss": 0.08085031807422638, "memory(GiB)": 20.68, "step": 3405, "token_acc": 0.961352657004831, "train_speed(iter/s)": 0.961425 }, { "epoch": 0.11064548614495014, "grad_norm": 1.7992627620697021, "learning_rate": 9.899869980108813e-06, "loss": 0.06416232883930206, "memory(GiB)": 20.68, "step": 3406, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.961481 }, { "epoch": 0.11067797160770555, "grad_norm": 0.7795623540878296, "learning_rate": 9.899762990983624e-06, "loss": 0.06237843632698059, "memory(GiB)": 20.68, "step": 3407, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.961537 }, { "epoch": 0.11071045707046097, "grad_norm": 0.6639599204063416, "learning_rate": 9.899655945308645e-06, "loss": 0.06905815005302429, "memory(GiB)": 20.68, "step": 3408, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.961591 }, { "epoch": 0.11074294253321638, "grad_norm": 0.7073138356208801, "learning_rate": 9.899548843085117e-06, "loss": 0.0685853511095047, "memory(GiB)": 20.68, "step": 3409, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.961646 }, { "epoch": 0.1107754279959718, "grad_norm": 0.6517741680145264, "learning_rate": 9.899441684314273e-06, "loss": 0.07897888123989105, "memory(GiB)": 20.68, "step": 3410, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.961701 }, { "epoch": 0.11080791345872722, "grad_norm": 0.5906786918640137, "learning_rate": 9.89933446899735e-06, "loss": 0.060528382658958435, "memory(GiB)": 20.68, "step": 3411, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.961754 }, { "epoch": 0.11084039892148263, "grad_norm": 0.6749367117881775, "learning_rate": 9.899227197135587e-06, "loss": 0.07761388272047043, "memory(GiB)": 20.68, "step": 3412, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.961809 }, { "epoch": 0.11087288438423805, "grad_norm": 0.5766519904136658, "learning_rate": 9.899119868730223e-06, "loss": 0.06685103476047516, "memory(GiB)": 20.68, "step": 3413, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.961864 }, { "epoch": 0.11090536984699347, "grad_norm": 0.5569117665290833, "learning_rate": 9.899012483782493e-06, "loss": 0.06928230077028275, "memory(GiB)": 20.68, "step": 3414, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.961919 }, { "epoch": 0.11093785530974888, "grad_norm": 0.694809079170227, "learning_rate": 9.898905042293639e-06, "loss": 0.062452249228954315, "memory(GiB)": 20.68, "step": 3415, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.961968 }, { "epoch": 0.11097034077250431, "grad_norm": 0.8833841681480408, "learning_rate": 9.8987975442649e-06, "loss": 0.07989691197872162, "memory(GiB)": 20.68, "step": 3416, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.962024 }, { "epoch": 0.11100282623525973, "grad_norm": 0.7016657590866089, "learning_rate": 9.898689989697518e-06, "loss": 0.05814899131655693, "memory(GiB)": 20.68, "step": 3417, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.96208 }, { "epoch": 0.11103531169801514, "grad_norm": 2.0300040245056152, "learning_rate": 9.898582378592734e-06, "loss": 0.07839377969503403, "memory(GiB)": 20.68, "step": 3418, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.962133 }, { "epoch": 0.11106779716077056, "grad_norm": 0.7299396395683289, "learning_rate": 9.898474710951787e-06, "loss": 0.07342514395713806, "memory(GiB)": 20.68, "step": 3419, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.962188 }, { "epoch": 0.11110028262352598, "grad_norm": 0.5686634182929993, "learning_rate": 9.898366986775923e-06, "loss": 0.06158447265625, "memory(GiB)": 20.68, "step": 3420, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.962245 }, { "epoch": 0.11113276808628139, "grad_norm": 0.7827538847923279, "learning_rate": 9.898259206066385e-06, "loss": 0.06179556995630264, "memory(GiB)": 20.68, "step": 3421, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.962298 }, { "epoch": 0.11116525354903681, "grad_norm": 0.9792085886001587, "learning_rate": 9.898151368824416e-06, "loss": 0.08077429234981537, "memory(GiB)": 20.68, "step": 3422, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.962349 }, { "epoch": 0.11119773901179222, "grad_norm": 0.6234903335571289, "learning_rate": 9.89804347505126e-06, "loss": 0.06821803748607635, "memory(GiB)": 20.68, "step": 3423, "token_acc": 0.9664179104477612, "train_speed(iter/s)": 0.962404 }, { "epoch": 0.11123022447454764, "grad_norm": 1.0024973154067993, "learning_rate": 9.897935524748164e-06, "loss": 0.06984390318393707, "memory(GiB)": 20.68, "step": 3424, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.962458 }, { "epoch": 0.11126270993730306, "grad_norm": 0.5786640644073486, "learning_rate": 9.897827517916372e-06, "loss": 0.0733070820569992, "memory(GiB)": 20.68, "step": 3425, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.962502 }, { "epoch": 0.11129519540005847, "grad_norm": 0.760658323764801, "learning_rate": 9.897719454557131e-06, "loss": 0.07618475705385208, "memory(GiB)": 20.68, "step": 3426, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.96254 }, { "epoch": 0.11132768086281389, "grad_norm": 0.6643386483192444, "learning_rate": 9.897611334671691e-06, "loss": 0.07101771235466003, "memory(GiB)": 20.68, "step": 3427, "token_acc": 0.9724137931034482, "train_speed(iter/s)": 0.962582 }, { "epoch": 0.1113601663255693, "grad_norm": 1.6728123426437378, "learning_rate": 9.897503158261294e-06, "loss": 0.061760030686855316, "memory(GiB)": 20.68, "step": 3428, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.962623 }, { "epoch": 0.11139265178832472, "grad_norm": 0.5359727144241333, "learning_rate": 9.897394925327196e-06, "loss": 0.08107733726501465, "memory(GiB)": 20.68, "step": 3429, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.962665 }, { "epoch": 0.11142513725108014, "grad_norm": 0.6635317206382751, "learning_rate": 9.89728663587064e-06, "loss": 0.05850449204444885, "memory(GiB)": 20.68, "step": 3430, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.962709 }, { "epoch": 0.11145762271383555, "grad_norm": 0.6247852444648743, "learning_rate": 9.897178289892876e-06, "loss": 0.0681585967540741, "memory(GiB)": 20.68, "step": 3431, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.962753 }, { "epoch": 0.11149010817659098, "grad_norm": 0.8182331919670105, "learning_rate": 9.89706988739516e-06, "loss": 0.06821797788143158, "memory(GiB)": 20.68, "step": 3432, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.9628 }, { "epoch": 0.1115225936393464, "grad_norm": 0.7355588674545288, "learning_rate": 9.896961428378738e-06, "loss": 0.06114058941602707, "memory(GiB)": 20.68, "step": 3433, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.96284 }, { "epoch": 0.11155507910210181, "grad_norm": 1.1182972192764282, "learning_rate": 9.896852912844863e-06, "loss": 0.06906445324420929, "memory(GiB)": 20.68, "step": 3434, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.962881 }, { "epoch": 0.11158756456485723, "grad_norm": 0.9072927236557007, "learning_rate": 9.896744340794789e-06, "loss": 0.07583059370517731, "memory(GiB)": 20.68, "step": 3435, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.962926 }, { "epoch": 0.11162005002761265, "grad_norm": 0.8962860107421875, "learning_rate": 9.896635712229765e-06, "loss": 0.05610717087984085, "memory(GiB)": 20.68, "step": 3436, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.962968 }, { "epoch": 0.11165253549036806, "grad_norm": 0.675264298915863, "learning_rate": 9.89652702715105e-06, "loss": 0.0674489289522171, "memory(GiB)": 20.68, "step": 3437, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.963012 }, { "epoch": 0.11168502095312348, "grad_norm": 0.6326958537101746, "learning_rate": 9.896418285559894e-06, "loss": 0.07112406194210052, "memory(GiB)": 20.68, "step": 3438, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.963052 }, { "epoch": 0.1117175064158789, "grad_norm": 0.6080583333969116, "learning_rate": 9.896309487457556e-06, "loss": 0.057331375777721405, "memory(GiB)": 20.68, "step": 3439, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.963087 }, { "epoch": 0.11174999187863431, "grad_norm": 0.6721695065498352, "learning_rate": 9.896200632845288e-06, "loss": 0.060951441526412964, "memory(GiB)": 20.68, "step": 3440, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.963121 }, { "epoch": 0.11178247734138973, "grad_norm": 0.8905166983604431, "learning_rate": 9.896091721724349e-06, "loss": 0.07244749367237091, "memory(GiB)": 20.68, "step": 3441, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.963159 }, { "epoch": 0.11181496280414514, "grad_norm": 0.7504507899284363, "learning_rate": 9.895982754095993e-06, "loss": 0.06310068070888519, "memory(GiB)": 20.68, "step": 3442, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.963192 }, { "epoch": 0.11184744826690056, "grad_norm": 0.8650351762771606, "learning_rate": 9.89587372996148e-06, "loss": 0.0673624724149704, "memory(GiB)": 20.68, "step": 3443, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.963229 }, { "epoch": 0.11187993372965598, "grad_norm": 0.7008918523788452, "learning_rate": 9.895764649322068e-06, "loss": 0.07043881714344025, "memory(GiB)": 20.68, "step": 3444, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.96327 }, { "epoch": 0.11191241919241139, "grad_norm": 0.8042405247688293, "learning_rate": 9.895655512179017e-06, "loss": 0.06751203536987305, "memory(GiB)": 20.68, "step": 3445, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.963315 }, { "epoch": 0.11194490465516681, "grad_norm": 0.63236403465271, "learning_rate": 9.895546318533585e-06, "loss": 0.0855790227651596, "memory(GiB)": 20.68, "step": 3446, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.963357 }, { "epoch": 0.11197739011792222, "grad_norm": 0.7364899516105652, "learning_rate": 9.895437068387031e-06, "loss": 0.0777834951877594, "memory(GiB)": 20.68, "step": 3447, "token_acc": 0.9514925373134329, "train_speed(iter/s)": 0.963402 }, { "epoch": 0.11200987558067765, "grad_norm": 0.9617971777915955, "learning_rate": 9.895327761740618e-06, "loss": 0.0640711635351181, "memory(GiB)": 20.68, "step": 3448, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.963444 }, { "epoch": 0.11204236104343307, "grad_norm": 0.8790045976638794, "learning_rate": 9.895218398595608e-06, "loss": 0.07505908608436584, "memory(GiB)": 20.68, "step": 3449, "token_acc": 0.9346733668341709, "train_speed(iter/s)": 0.963482 }, { "epoch": 0.11207484650618849, "grad_norm": 0.479474276304245, "learning_rate": 9.89510897895326e-06, "loss": 0.06258539110422134, "memory(GiB)": 20.68, "step": 3450, "token_acc": 0.9723756906077348, "train_speed(iter/s)": 0.963525 }, { "epoch": 0.1121073319689439, "grad_norm": 0.6936307549476624, "learning_rate": 9.89499950281484e-06, "loss": 0.06982512772083282, "memory(GiB)": 20.68, "step": 3451, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.96357 }, { "epoch": 0.11213981743169932, "grad_norm": 0.5127546191215515, "learning_rate": 9.894889970181613e-06, "loss": 0.06346875429153442, "memory(GiB)": 20.68, "step": 3452, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.963616 }, { "epoch": 0.11217230289445473, "grad_norm": 1.111433982849121, "learning_rate": 9.894780381054838e-06, "loss": 0.07273837924003601, "memory(GiB)": 20.68, "step": 3453, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.963662 }, { "epoch": 0.11220478835721015, "grad_norm": 0.8642215132713318, "learning_rate": 9.894670735435784e-06, "loss": 0.08518756926059723, "memory(GiB)": 20.68, "step": 3454, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.963709 }, { "epoch": 0.11223727381996557, "grad_norm": 0.6866515278816223, "learning_rate": 9.894561033325713e-06, "loss": 0.08551348745822906, "memory(GiB)": 20.68, "step": 3455, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.963743 }, { "epoch": 0.11226975928272098, "grad_norm": 0.652215301990509, "learning_rate": 9.894451274725893e-06, "loss": 0.07753702998161316, "memory(GiB)": 20.68, "step": 3456, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.963773 }, { "epoch": 0.1123022447454764, "grad_norm": 0.8730278015136719, "learning_rate": 9.894341459637594e-06, "loss": 0.07623840123414993, "memory(GiB)": 20.68, "step": 3457, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.96381 }, { "epoch": 0.11233473020823181, "grad_norm": 0.5533686280250549, "learning_rate": 9.894231588062077e-06, "loss": 0.06456322968006134, "memory(GiB)": 20.68, "step": 3458, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 0.96385 }, { "epoch": 0.11236721567098723, "grad_norm": 0.8023123741149902, "learning_rate": 9.894121660000613e-06, "loss": 0.07615602016448975, "memory(GiB)": 20.68, "step": 3459, "token_acc": 0.98046875, "train_speed(iter/s)": 0.963894 }, { "epoch": 0.11239970113374265, "grad_norm": 0.5184816122055054, "learning_rate": 9.894011675454473e-06, "loss": 0.06383182108402252, "memory(GiB)": 20.68, "step": 3460, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.963943 }, { "epoch": 0.11243218659649806, "grad_norm": 1.023026466369629, "learning_rate": 9.893901634424924e-06, "loss": 0.06616024672985077, "memory(GiB)": 20.68, "step": 3461, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.963984 }, { "epoch": 0.11246467205925348, "grad_norm": 0.7731894850730896, "learning_rate": 9.893791536913235e-06, "loss": 0.07096712291240692, "memory(GiB)": 20.68, "step": 3462, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.96403 }, { "epoch": 0.1124971575220089, "grad_norm": 0.659416913986206, "learning_rate": 9.89368138292068e-06, "loss": 0.07394076138734818, "memory(GiB)": 20.68, "step": 3463, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.964083 }, { "epoch": 0.11252964298476432, "grad_norm": 0.568689227104187, "learning_rate": 9.893571172448527e-06, "loss": 0.06780490279197693, "memory(GiB)": 20.68, "step": 3464, "token_acc": 0.974910394265233, "train_speed(iter/s)": 0.964134 }, { "epoch": 0.11256212844751974, "grad_norm": 0.6106505990028381, "learning_rate": 9.89346090549805e-06, "loss": 0.06480759382247925, "memory(GiB)": 20.68, "step": 3465, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.964186 }, { "epoch": 0.11259461391027516, "grad_norm": 0.603284478187561, "learning_rate": 9.89335058207052e-06, "loss": 0.0803668200969696, "memory(GiB)": 20.68, "step": 3466, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.964236 }, { "epoch": 0.11262709937303057, "grad_norm": 0.5930566191673279, "learning_rate": 9.893240202167213e-06, "loss": 0.07281936705112457, "memory(GiB)": 20.68, "step": 3467, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.964286 }, { "epoch": 0.11265958483578599, "grad_norm": 0.42533838748931885, "learning_rate": 9.8931297657894e-06, "loss": 0.05302418768405914, "memory(GiB)": 20.68, "step": 3468, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.964336 }, { "epoch": 0.1126920702985414, "grad_norm": 0.5064296722412109, "learning_rate": 9.893019272938356e-06, "loss": 0.06566335260868073, "memory(GiB)": 20.68, "step": 3469, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.96439 }, { "epoch": 0.11272455576129682, "grad_norm": 0.5251708626747131, "learning_rate": 9.892908723615358e-06, "loss": 0.07756248116493225, "memory(GiB)": 20.68, "step": 3470, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.964439 }, { "epoch": 0.11275704122405224, "grad_norm": 0.5755910873413086, "learning_rate": 9.89279811782168e-06, "loss": 0.06166764721274376, "memory(GiB)": 20.68, "step": 3471, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.964491 }, { "epoch": 0.11278952668680765, "grad_norm": 0.5479860305786133, "learning_rate": 9.8926874555586e-06, "loss": 0.06596611440181732, "memory(GiB)": 20.68, "step": 3472, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.964542 }, { "epoch": 0.11282201214956307, "grad_norm": 0.7275401949882507, "learning_rate": 9.892576736827395e-06, "loss": 0.06467694044113159, "memory(GiB)": 20.68, "step": 3473, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.964591 }, { "epoch": 0.11285449761231849, "grad_norm": 0.6080517172813416, "learning_rate": 9.892465961629342e-06, "loss": 0.05770993232727051, "memory(GiB)": 20.68, "step": 3474, "token_acc": 0.97265625, "train_speed(iter/s)": 0.964644 }, { "epoch": 0.1128869830750739, "grad_norm": 0.693845808506012, "learning_rate": 9.89235512996572e-06, "loss": 0.0710601732134819, "memory(GiB)": 20.68, "step": 3475, "token_acc": 0.9710144927536232, "train_speed(iter/s)": 0.964697 }, { "epoch": 0.11291946853782932, "grad_norm": 1.2297375202178955, "learning_rate": 9.89224424183781e-06, "loss": 0.07117503136396408, "memory(GiB)": 20.68, "step": 3476, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.96475 }, { "epoch": 0.11295195400058473, "grad_norm": 0.8489493131637573, "learning_rate": 9.892133297246887e-06, "loss": 0.05471678450703621, "memory(GiB)": 20.68, "step": 3477, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.964799 }, { "epoch": 0.11298443946334015, "grad_norm": 0.7494305968284607, "learning_rate": 9.892022296194236e-06, "loss": 0.07283944636583328, "memory(GiB)": 20.68, "step": 3478, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.964853 }, { "epoch": 0.11301692492609557, "grad_norm": 0.4028865694999695, "learning_rate": 9.891911238681136e-06, "loss": 0.05537343770265579, "memory(GiB)": 20.68, "step": 3479, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.964906 }, { "epoch": 0.113049410388851, "grad_norm": 0.7941654324531555, "learning_rate": 9.891800124708868e-06, "loss": 0.05658455565571785, "memory(GiB)": 20.68, "step": 3480, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.964959 }, { "epoch": 0.11308189585160641, "grad_norm": 0.6633936762809753, "learning_rate": 9.891688954278717e-06, "loss": 0.07690051198005676, "memory(GiB)": 20.68, "step": 3481, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.965007 }, { "epoch": 0.11311438131436183, "grad_norm": 0.6963066458702087, "learning_rate": 9.891577727391965e-06, "loss": 0.06993909180164337, "memory(GiB)": 20.68, "step": 3482, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.965042 }, { "epoch": 0.11314686677711724, "grad_norm": 0.5655003786087036, "learning_rate": 9.891466444049896e-06, "loss": 0.056816525757312775, "memory(GiB)": 20.68, "step": 3483, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.965086 }, { "epoch": 0.11317935223987266, "grad_norm": 0.8338471055030823, "learning_rate": 9.891355104253791e-06, "loss": 0.07020431756973267, "memory(GiB)": 20.68, "step": 3484, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.965128 }, { "epoch": 0.11321183770262808, "grad_norm": 1.7032512426376343, "learning_rate": 9.891243708004939e-06, "loss": 0.08469955623149872, "memory(GiB)": 20.68, "step": 3485, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.965173 }, { "epoch": 0.11324432316538349, "grad_norm": 0.5929979085922241, "learning_rate": 9.891132255304626e-06, "loss": 0.05317224562168121, "memory(GiB)": 20.68, "step": 3486, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.965212 }, { "epoch": 0.11327680862813891, "grad_norm": 0.5006632804870605, "learning_rate": 9.891020746154134e-06, "loss": 0.05294874310493469, "memory(GiB)": 20.68, "step": 3487, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.965254 }, { "epoch": 0.11330929409089432, "grad_norm": 1.0637354850769043, "learning_rate": 9.890909180554753e-06, "loss": 0.07033158093690872, "memory(GiB)": 20.68, "step": 3488, "token_acc": 0.9679715302491103, "train_speed(iter/s)": 0.965299 }, { "epoch": 0.11334177955364974, "grad_norm": 0.40469110012054443, "learning_rate": 9.890797558507773e-06, "loss": 0.061011988669633865, "memory(GiB)": 20.68, "step": 3489, "token_acc": 0.988, "train_speed(iter/s)": 0.965343 }, { "epoch": 0.11337426501640516, "grad_norm": 0.6013482213020325, "learning_rate": 9.890685880014476e-06, "loss": 0.06608173251152039, "memory(GiB)": 20.68, "step": 3490, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.965381 }, { "epoch": 0.11340675047916057, "grad_norm": 0.4628869891166687, "learning_rate": 9.890574145076158e-06, "loss": 0.05731318145990372, "memory(GiB)": 20.68, "step": 3491, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.965422 }, { "epoch": 0.11343923594191599, "grad_norm": 0.5421643853187561, "learning_rate": 9.890462353694102e-06, "loss": 0.056685805320739746, "memory(GiB)": 20.68, "step": 3492, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.965463 }, { "epoch": 0.1134717214046714, "grad_norm": 1.1066738367080688, "learning_rate": 9.890350505869602e-06, "loss": 0.08029597997665405, "memory(GiB)": 20.68, "step": 3493, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.965503 }, { "epoch": 0.11350420686742682, "grad_norm": 0.5263987183570862, "learning_rate": 9.89023860160395e-06, "loss": 0.06968070566654205, "memory(GiB)": 20.68, "step": 3494, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.96554 }, { "epoch": 0.11353669233018224, "grad_norm": 0.6252188086509705, "learning_rate": 9.890126640898431e-06, "loss": 0.06411786377429962, "memory(GiB)": 20.68, "step": 3495, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.965584 }, { "epoch": 0.11356917779293767, "grad_norm": 0.5419985055923462, "learning_rate": 9.890014623754348e-06, "loss": 0.06116675212979317, "memory(GiB)": 20.68, "step": 3496, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.965623 }, { "epoch": 0.11360166325569308, "grad_norm": 0.565809965133667, "learning_rate": 9.889902550172982e-06, "loss": 0.07163722813129425, "memory(GiB)": 20.68, "step": 3497, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.965661 }, { "epoch": 0.1136341487184485, "grad_norm": 0.6330645680427551, "learning_rate": 9.889790420155636e-06, "loss": 0.07113635540008545, "memory(GiB)": 20.68, "step": 3498, "token_acc": 0.9590163934426229, "train_speed(iter/s)": 0.965702 }, { "epoch": 0.11366663418120392, "grad_norm": 1.0583806037902832, "learning_rate": 9.8896782337036e-06, "loss": 0.0891592726111412, "memory(GiB)": 20.68, "step": 3499, "token_acc": 0.9497907949790795, "train_speed(iter/s)": 0.965746 }, { "epoch": 0.11369911964395933, "grad_norm": 0.882654070854187, "learning_rate": 9.889565990818166e-06, "loss": 0.07542847096920013, "memory(GiB)": 20.68, "step": 3500, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.965784 }, { "epoch": 0.11369911964395933, "eval_loss": 0.06848622858524323, "eval_runtime": 80.2714, "eval_samples_per_second": 123.954, "eval_steps_per_second": 3.874, "eval_token_acc": 0.9725046634987481, "step": 3500 }, { "epoch": 0.11373160510671475, "grad_norm": 1.5367910861968994, "learning_rate": 9.889453691500636e-06, "loss": 0.06282329559326172, "memory(GiB)": 21.32, "step": 3501, "token_acc": 0.9726483317810771, "train_speed(iter/s)": 0.942357 }, { "epoch": 0.11376409056947016, "grad_norm": 0.8109822869300842, "learning_rate": 9.8893413357523e-06, "loss": 0.06917384266853333, "memory(GiB)": 21.32, "step": 3502, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.942406 }, { "epoch": 0.11379657603222558, "grad_norm": 0.9053179025650024, "learning_rate": 9.88922892357446e-06, "loss": 0.07209712266921997, "memory(GiB)": 21.32, "step": 3503, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.942452 }, { "epoch": 0.113829061494981, "grad_norm": 0.7480307221412659, "learning_rate": 9.889116454968408e-06, "loss": 0.07581749558448792, "memory(GiB)": 21.32, "step": 3504, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.942508 }, { "epoch": 0.11386154695773641, "grad_norm": 0.448429673910141, "learning_rate": 9.889003929935446e-06, "loss": 0.05691508203744888, "memory(GiB)": 21.32, "step": 3505, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.942551 }, { "epoch": 0.11389403242049183, "grad_norm": 1.0083144903182983, "learning_rate": 9.88889134847687e-06, "loss": 0.08454428613185883, "memory(GiB)": 21.32, "step": 3506, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.942596 }, { "epoch": 0.11392651788324724, "grad_norm": 0.7198508381843567, "learning_rate": 9.888778710593982e-06, "loss": 0.0797581672668457, "memory(GiB)": 21.32, "step": 3507, "token_acc": 0.956, "train_speed(iter/s)": 0.942642 }, { "epoch": 0.11395900334600266, "grad_norm": 0.8283786177635193, "learning_rate": 9.88866601628808e-06, "loss": 0.0726553201675415, "memory(GiB)": 21.32, "step": 3508, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.942689 }, { "epoch": 0.11399148880875808, "grad_norm": 0.6673696041107178, "learning_rate": 9.888553265560467e-06, "loss": 0.06944537162780762, "memory(GiB)": 21.32, "step": 3509, "token_acc": 0.9612068965517241, "train_speed(iter/s)": 0.942732 }, { "epoch": 0.11402397427151349, "grad_norm": 0.568730354309082, "learning_rate": 9.888440458412442e-06, "loss": 0.07214794307947159, "memory(GiB)": 21.32, "step": 3510, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.942778 }, { "epoch": 0.11405645973426891, "grad_norm": 0.6518213152885437, "learning_rate": 9.888327594845306e-06, "loss": 0.0646996945142746, "memory(GiB)": 21.32, "step": 3511, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.942818 }, { "epoch": 0.11408894519702434, "grad_norm": 0.9715672135353088, "learning_rate": 9.888214674860366e-06, "loss": 0.06641332805156708, "memory(GiB)": 21.32, "step": 3512, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.94286 }, { "epoch": 0.11412143065977975, "grad_norm": 0.5773979425430298, "learning_rate": 9.88810169845892e-06, "loss": 0.06953202933073044, "memory(GiB)": 21.32, "step": 3513, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.942903 }, { "epoch": 0.11415391612253517, "grad_norm": 0.5671955347061157, "learning_rate": 9.887988665642274e-06, "loss": 0.0645144060254097, "memory(GiB)": 21.32, "step": 3514, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.942947 }, { "epoch": 0.11418640158529059, "grad_norm": 0.5691182017326355, "learning_rate": 9.887875576411737e-06, "loss": 0.0628136619925499, "memory(GiB)": 21.32, "step": 3515, "token_acc": 0.96875, "train_speed(iter/s)": 0.942986 }, { "epoch": 0.114218887048046, "grad_norm": 0.5147386789321899, "learning_rate": 9.887762430768605e-06, "loss": 0.06436972320079803, "memory(GiB)": 21.32, "step": 3516, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.94303 }, { "epoch": 0.11425137251080142, "grad_norm": 0.660342812538147, "learning_rate": 9.88764922871419e-06, "loss": 0.06383509933948517, "memory(GiB)": 21.32, "step": 3517, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.943069 }, { "epoch": 0.11428385797355683, "grad_norm": 0.5938838720321655, "learning_rate": 9.887535970249799e-06, "loss": 0.07588109374046326, "memory(GiB)": 21.32, "step": 3518, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.943112 }, { "epoch": 0.11431634343631225, "grad_norm": 0.6188628077507019, "learning_rate": 9.88742265537674e-06, "loss": 0.06881894171237946, "memory(GiB)": 21.32, "step": 3519, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.943152 }, { "epoch": 0.11434882889906767, "grad_norm": 0.5617279410362244, "learning_rate": 9.887309284096313e-06, "loss": 0.06803961843252182, "memory(GiB)": 21.32, "step": 3520, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.943195 }, { "epoch": 0.11438131436182308, "grad_norm": 0.5595202445983887, "learning_rate": 9.887195856409836e-06, "loss": 0.05436262488365173, "memory(GiB)": 21.32, "step": 3521, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.94324 }, { "epoch": 0.1144137998245785, "grad_norm": 0.7201046347618103, "learning_rate": 9.887082372318612e-06, "loss": 0.061785243451595306, "memory(GiB)": 21.32, "step": 3522, "token_acc": 0.9709090909090909, "train_speed(iter/s)": 0.943288 }, { "epoch": 0.11444628528733392, "grad_norm": 0.7285455465316772, "learning_rate": 9.886968831823953e-06, "loss": 0.05971861630678177, "memory(GiB)": 21.32, "step": 3523, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.943339 }, { "epoch": 0.11447877075008933, "grad_norm": 0.5929615497589111, "learning_rate": 9.88685523492717e-06, "loss": 0.08345744013786316, "memory(GiB)": 21.32, "step": 3524, "token_acc": 0.9516908212560387, "train_speed(iter/s)": 0.943393 }, { "epoch": 0.11451125621284475, "grad_norm": 0.49377182126045227, "learning_rate": 9.886741581629573e-06, "loss": 0.06455755233764648, "memory(GiB)": 21.32, "step": 3525, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.94345 }, { "epoch": 0.11454374167560016, "grad_norm": 0.6338061690330505, "learning_rate": 9.886627871932472e-06, "loss": 0.0670732781291008, "memory(GiB)": 21.32, "step": 3526, "token_acc": 0.9541984732824428, "train_speed(iter/s)": 0.943506 }, { "epoch": 0.11457622713835558, "grad_norm": 0.6603916883468628, "learning_rate": 9.886514105837184e-06, "loss": 0.06559749692678452, "memory(GiB)": 21.32, "step": 3527, "token_acc": 0.9719298245614035, "train_speed(iter/s)": 0.943559 }, { "epoch": 0.11460871260111101, "grad_norm": 0.6073186993598938, "learning_rate": 9.886400283345019e-06, "loss": 0.07501488924026489, "memory(GiB)": 21.32, "step": 3528, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.943614 }, { "epoch": 0.11464119806386643, "grad_norm": 1.2248048782348633, "learning_rate": 9.886286404457288e-06, "loss": 0.06820036470890045, "memory(GiB)": 21.32, "step": 3529, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.943672 }, { "epoch": 0.11467368352662184, "grad_norm": 0.4916437864303589, "learning_rate": 9.88617246917531e-06, "loss": 0.05354912951588631, "memory(GiB)": 21.32, "step": 3530, "token_acc": 0.9728682170542635, "train_speed(iter/s)": 0.943729 }, { "epoch": 0.11470616898937726, "grad_norm": 0.6234354972839355, "learning_rate": 9.8860584775004e-06, "loss": 0.06482250243425369, "memory(GiB)": 21.32, "step": 3531, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.943786 }, { "epoch": 0.11473865445213267, "grad_norm": 1.6373921632766724, "learning_rate": 9.88594442943387e-06, "loss": 0.06840908527374268, "memory(GiB)": 21.32, "step": 3532, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.943836 }, { "epoch": 0.11477113991488809, "grad_norm": 1.2099796533584595, "learning_rate": 9.885830324977038e-06, "loss": 0.06738464534282684, "memory(GiB)": 21.32, "step": 3533, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.943894 }, { "epoch": 0.1148036253776435, "grad_norm": 0.6045695543289185, "learning_rate": 9.885716164131222e-06, "loss": 0.06605510413646698, "memory(GiB)": 21.32, "step": 3534, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.943951 }, { "epoch": 0.11483611084039892, "grad_norm": 0.5473800897598267, "learning_rate": 9.885601946897739e-06, "loss": 0.0682394802570343, "memory(GiB)": 21.32, "step": 3535, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.944006 }, { "epoch": 0.11486859630315434, "grad_norm": 0.5410234332084656, "learning_rate": 9.885487673277904e-06, "loss": 0.06199974566698074, "memory(GiB)": 21.32, "step": 3536, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.944058 }, { "epoch": 0.11490108176590975, "grad_norm": 3.457310199737549, "learning_rate": 9.885373343273041e-06, "loss": 0.0770789310336113, "memory(GiB)": 21.32, "step": 3537, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 0.944112 }, { "epoch": 0.11493356722866517, "grad_norm": 0.4993315041065216, "learning_rate": 9.885258956884467e-06, "loss": 0.06072874367237091, "memory(GiB)": 21.32, "step": 3538, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.944167 }, { "epoch": 0.11496605269142059, "grad_norm": 1.2889926433563232, "learning_rate": 9.885144514113503e-06, "loss": 0.0689152330160141, "memory(GiB)": 21.32, "step": 3539, "token_acc": 0.9378238341968912, "train_speed(iter/s)": 0.944223 }, { "epoch": 0.114998538154176, "grad_norm": 0.7731736302375793, "learning_rate": 9.885030014961468e-06, "loss": 0.07779543101787567, "memory(GiB)": 21.32, "step": 3540, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.94428 }, { "epoch": 0.11503102361693142, "grad_norm": 0.6151630878448486, "learning_rate": 9.884915459429685e-06, "loss": 0.06151938438415527, "memory(GiB)": 21.32, "step": 3541, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.944335 }, { "epoch": 0.11506350907968683, "grad_norm": 0.748664915561676, "learning_rate": 9.884800847519475e-06, "loss": 0.07277768850326538, "memory(GiB)": 21.32, "step": 3542, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.944382 }, { "epoch": 0.11509599454244225, "grad_norm": 0.7693845629692078, "learning_rate": 9.884686179232162e-06, "loss": 0.07005181163549423, "memory(GiB)": 21.32, "step": 3543, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.944431 }, { "epoch": 0.11512848000519768, "grad_norm": 0.6458098292350769, "learning_rate": 9.88457145456907e-06, "loss": 0.06381382048130035, "memory(GiB)": 21.32, "step": 3544, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.944478 }, { "epoch": 0.1151609654679531, "grad_norm": 0.8679433465003967, "learning_rate": 9.884456673531522e-06, "loss": 0.07280730456113815, "memory(GiB)": 21.32, "step": 3545, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.944526 }, { "epoch": 0.11519345093070851, "grad_norm": 1.0469825267791748, "learning_rate": 9.88434183612084e-06, "loss": 0.0640954077243805, "memory(GiB)": 21.32, "step": 3546, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.944574 }, { "epoch": 0.11522593639346393, "grad_norm": 0.5774411559104919, "learning_rate": 9.884226942338356e-06, "loss": 0.060779452323913574, "memory(GiB)": 21.32, "step": 3547, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.944618 }, { "epoch": 0.11525842185621935, "grad_norm": 0.47124865651130676, "learning_rate": 9.88411199218539e-06, "loss": 0.061195507645606995, "memory(GiB)": 21.32, "step": 3548, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.944662 }, { "epoch": 0.11529090731897476, "grad_norm": 0.5148605108261108, "learning_rate": 9.883996985663272e-06, "loss": 0.06855244934558868, "memory(GiB)": 21.32, "step": 3549, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.94471 }, { "epoch": 0.11532339278173018, "grad_norm": 0.8765760660171509, "learning_rate": 9.883881922773326e-06, "loss": 0.06777232885360718, "memory(GiB)": 21.32, "step": 3550, "token_acc": 0.9647577092511013, "train_speed(iter/s)": 0.944758 }, { "epoch": 0.1153558782444856, "grad_norm": 0.6484487056732178, "learning_rate": 9.883766803516886e-06, "loss": 0.06728310137987137, "memory(GiB)": 21.32, "step": 3551, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.944801 }, { "epoch": 0.11538836370724101, "grad_norm": 0.5332297086715698, "learning_rate": 9.883651627895274e-06, "loss": 0.069080650806427, "memory(GiB)": 21.32, "step": 3552, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.944849 }, { "epoch": 0.11542084916999643, "grad_norm": 0.5705745220184326, "learning_rate": 9.883536395909821e-06, "loss": 0.06717929989099503, "memory(GiB)": 21.32, "step": 3553, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.944891 }, { "epoch": 0.11545333463275184, "grad_norm": 0.4983521103858948, "learning_rate": 9.883421107561861e-06, "loss": 0.057645753026008606, "memory(GiB)": 21.32, "step": 3554, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.944926 }, { "epoch": 0.11548582009550726, "grad_norm": 0.8001604080200195, "learning_rate": 9.88330576285272e-06, "loss": 0.08214036375284195, "memory(GiB)": 21.32, "step": 3555, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.944971 }, { "epoch": 0.11551830555826267, "grad_norm": 0.6268150210380554, "learning_rate": 9.883190361783731e-06, "loss": 0.06357019394636154, "memory(GiB)": 21.32, "step": 3556, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.945019 }, { "epoch": 0.11555079102101809, "grad_norm": 0.8597248196601868, "learning_rate": 9.883074904356226e-06, "loss": 0.06906326860189438, "memory(GiB)": 21.32, "step": 3557, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.945067 }, { "epoch": 0.1155832764837735, "grad_norm": 0.9476456046104431, "learning_rate": 9.882959390571537e-06, "loss": 0.0675433874130249, "memory(GiB)": 21.32, "step": 3558, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.945116 }, { "epoch": 0.11561576194652892, "grad_norm": 0.48002350330352783, "learning_rate": 9.882843820430997e-06, "loss": 0.06137887388467789, "memory(GiB)": 21.32, "step": 3559, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.945158 }, { "epoch": 0.11564824740928435, "grad_norm": 0.6795002818107605, "learning_rate": 9.882728193935941e-06, "loss": 0.06560377776622772, "memory(GiB)": 21.32, "step": 3560, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.945201 }, { "epoch": 0.11568073287203977, "grad_norm": 2.0393853187561035, "learning_rate": 9.882612511087704e-06, "loss": 0.06940630078315735, "memory(GiB)": 21.32, "step": 3561, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.945243 }, { "epoch": 0.11571321833479518, "grad_norm": 0.6291011571884155, "learning_rate": 9.882496771887618e-06, "loss": 0.0742526575922966, "memory(GiB)": 21.32, "step": 3562, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.945281 }, { "epoch": 0.1157457037975506, "grad_norm": 0.6621025800704956, "learning_rate": 9.88238097633702e-06, "loss": 0.06270883977413177, "memory(GiB)": 21.32, "step": 3563, "token_acc": 0.9552845528455285, "train_speed(iter/s)": 0.945323 }, { "epoch": 0.11577818926030602, "grad_norm": 0.8603979349136353, "learning_rate": 9.882265124437249e-06, "loss": 0.07719279825687408, "memory(GiB)": 21.32, "step": 3564, "token_acc": 0.9614035087719298, "train_speed(iter/s)": 0.945358 }, { "epoch": 0.11581067472306143, "grad_norm": 0.595397412776947, "learning_rate": 9.882149216189638e-06, "loss": 0.06275568902492523, "memory(GiB)": 21.32, "step": 3565, "token_acc": 0.9768211920529801, "train_speed(iter/s)": 0.9454 }, { "epoch": 0.11584316018581685, "grad_norm": 0.6485040187835693, "learning_rate": 9.88203325159553e-06, "loss": 0.07786500453948975, "memory(GiB)": 21.32, "step": 3566, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.945442 }, { "epoch": 0.11587564564857226, "grad_norm": 0.636132538318634, "learning_rate": 9.881917230656258e-06, "loss": 0.06307798624038696, "memory(GiB)": 21.32, "step": 3567, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.945485 }, { "epoch": 0.11590813111132768, "grad_norm": 0.5957414507865906, "learning_rate": 9.881801153373162e-06, "loss": 0.06403177976608276, "memory(GiB)": 21.32, "step": 3568, "token_acc": 0.9644670050761421, "train_speed(iter/s)": 0.945457 }, { "epoch": 0.1159406165740831, "grad_norm": 0.6168814301490784, "learning_rate": 9.881685019747587e-06, "loss": 0.06667979061603546, "memory(GiB)": 21.32, "step": 3569, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.945499 }, { "epoch": 0.11597310203683851, "grad_norm": 0.47071659564971924, "learning_rate": 9.881568829780866e-06, "loss": 0.05505011975765228, "memory(GiB)": 21.32, "step": 3570, "token_acc": 0.9765625, "train_speed(iter/s)": 0.94554 }, { "epoch": 0.11600558749959393, "grad_norm": 0.5018852949142456, "learning_rate": 9.881452583474345e-06, "loss": 0.052911046892404556, "memory(GiB)": 21.32, "step": 3571, "token_acc": 0.9875, "train_speed(iter/s)": 0.945584 }, { "epoch": 0.11603807296234935, "grad_norm": 0.7078726291656494, "learning_rate": 9.881336280829364e-06, "loss": 0.06215300038456917, "memory(GiB)": 21.32, "step": 3572, "token_acc": 0.98, "train_speed(iter/s)": 0.945624 }, { "epoch": 0.11607055842510476, "grad_norm": 0.5078190565109253, "learning_rate": 9.881219921847264e-06, "loss": 0.06449717283248901, "memory(GiB)": 21.32, "step": 3573, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.945666 }, { "epoch": 0.11610304388786018, "grad_norm": 0.6262562870979309, "learning_rate": 9.88110350652939e-06, "loss": 0.06639737635850906, "memory(GiB)": 21.32, "step": 3574, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.945708 }, { "epoch": 0.1161355293506156, "grad_norm": 0.6051746606826782, "learning_rate": 9.880987034877085e-06, "loss": 0.059409528970718384, "memory(GiB)": 21.32, "step": 3575, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.94575 }, { "epoch": 0.11616801481337102, "grad_norm": 1.0188745260238647, "learning_rate": 9.880870506891694e-06, "loss": 0.06493696570396423, "memory(GiB)": 21.32, "step": 3576, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.945795 }, { "epoch": 0.11620050027612644, "grad_norm": 0.6706192493438721, "learning_rate": 9.88075392257456e-06, "loss": 0.0680142492055893, "memory(GiB)": 21.32, "step": 3577, "token_acc": 0.9833887043189369, "train_speed(iter/s)": 0.945841 }, { "epoch": 0.11623298573888186, "grad_norm": 0.6238633394241333, "learning_rate": 9.880637281927028e-06, "loss": 0.05950310826301575, "memory(GiB)": 21.32, "step": 3578, "token_acc": 0.9792746113989638, "train_speed(iter/s)": 0.945885 }, { "epoch": 0.11626547120163727, "grad_norm": 0.8344075083732605, "learning_rate": 9.880520584950448e-06, "loss": 0.07333478331565857, "memory(GiB)": 21.32, "step": 3579, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.94593 }, { "epoch": 0.11629795666439269, "grad_norm": 0.5863826274871826, "learning_rate": 9.880403831646164e-06, "loss": 0.05997547507286072, "memory(GiB)": 21.32, "step": 3580, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.945975 }, { "epoch": 0.1163304421271481, "grad_norm": 0.614749014377594, "learning_rate": 9.880287022015523e-06, "loss": 0.0639534443616867, "memory(GiB)": 21.32, "step": 3581, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.946022 }, { "epoch": 0.11636292758990352, "grad_norm": 1.8718364238739014, "learning_rate": 9.880170156059875e-06, "loss": 0.0564606636762619, "memory(GiB)": 21.32, "step": 3582, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.946076 }, { "epoch": 0.11639541305265894, "grad_norm": 0.8722122311592102, "learning_rate": 9.880053233780567e-06, "loss": 0.06986656785011292, "memory(GiB)": 21.32, "step": 3583, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.946131 }, { "epoch": 0.11642789851541435, "grad_norm": 0.6893534660339355, "learning_rate": 9.87993625517895e-06, "loss": 0.07222805172204971, "memory(GiB)": 21.32, "step": 3584, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.946187 }, { "epoch": 0.11646038397816977, "grad_norm": 0.49780401587486267, "learning_rate": 9.879819220256373e-06, "loss": 0.052431877702474594, "memory(GiB)": 21.32, "step": 3585, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.946243 }, { "epoch": 0.11649286944092518, "grad_norm": 0.6510421633720398, "learning_rate": 9.879702129014189e-06, "loss": 0.06196262687444687, "memory(GiB)": 21.32, "step": 3586, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.9463 }, { "epoch": 0.1165253549036806, "grad_norm": 1.1260632276535034, "learning_rate": 9.879584981453744e-06, "loss": 0.07308200001716614, "memory(GiB)": 21.32, "step": 3587, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.946354 }, { "epoch": 0.11655784036643602, "grad_norm": 0.6778168678283691, "learning_rate": 9.879467777576397e-06, "loss": 0.0677362009882927, "memory(GiB)": 21.32, "step": 3588, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.946411 }, { "epoch": 0.11659032582919143, "grad_norm": 0.6899349093437195, "learning_rate": 9.879350517383495e-06, "loss": 0.07616588473320007, "memory(GiB)": 21.32, "step": 3589, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.946467 }, { "epoch": 0.11662281129194685, "grad_norm": 0.6015456318855286, "learning_rate": 9.879233200876395e-06, "loss": 0.07270337641239166, "memory(GiB)": 21.32, "step": 3590, "token_acc": 0.961864406779661, "train_speed(iter/s)": 0.946521 }, { "epoch": 0.11665529675470226, "grad_norm": 0.5631749033927917, "learning_rate": 9.879115828056449e-06, "loss": 0.06693387031555176, "memory(GiB)": 21.32, "step": 3591, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.946578 }, { "epoch": 0.1166877822174577, "grad_norm": 0.6198277473449707, "learning_rate": 9.878998398925013e-06, "loss": 0.0656019002199173, "memory(GiB)": 21.32, "step": 3592, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.946633 }, { "epoch": 0.11672026768021311, "grad_norm": 0.5212474465370178, "learning_rate": 9.87888091348344e-06, "loss": 0.05658119544386864, "memory(GiB)": 21.32, "step": 3593, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.946687 }, { "epoch": 0.11675275314296853, "grad_norm": 0.5275497436523438, "learning_rate": 9.878763371733089e-06, "loss": 0.06464091688394547, "memory(GiB)": 21.32, "step": 3594, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.946741 }, { "epoch": 0.11678523860572394, "grad_norm": 0.9334132671356201, "learning_rate": 9.878645773675315e-06, "loss": 0.0861952155828476, "memory(GiB)": 21.32, "step": 3595, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.94679 }, { "epoch": 0.11681772406847936, "grad_norm": 0.4351399540901184, "learning_rate": 9.878528119311474e-06, "loss": 0.06183900684118271, "memory(GiB)": 21.32, "step": 3596, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.946844 }, { "epoch": 0.11685020953123477, "grad_norm": 0.6055203676223755, "learning_rate": 9.878410408642927e-06, "loss": 0.057289112359285355, "memory(GiB)": 21.32, "step": 3597, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.9469 }, { "epoch": 0.11688269499399019, "grad_norm": 1.2202112674713135, "learning_rate": 9.878292641671029e-06, "loss": 0.05459648370742798, "memory(GiB)": 21.32, "step": 3598, "token_acc": 0.9781818181818182, "train_speed(iter/s)": 0.946958 }, { "epoch": 0.11691518045674561, "grad_norm": 0.4945952296257019, "learning_rate": 9.878174818397142e-06, "loss": 0.05680454149842262, "memory(GiB)": 21.32, "step": 3599, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.947013 }, { "epoch": 0.11694766591950102, "grad_norm": 0.7395952343940735, "learning_rate": 9.878056938822624e-06, "loss": 0.0649820938706398, "memory(GiB)": 21.32, "step": 3600, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.947068 }, { "epoch": 0.11698015138225644, "grad_norm": 0.5526925921440125, "learning_rate": 9.877939002948838e-06, "loss": 0.060578037053346634, "memory(GiB)": 21.32, "step": 3601, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.947121 }, { "epoch": 0.11701263684501186, "grad_norm": 0.8559767007827759, "learning_rate": 9.87782101077714e-06, "loss": 0.06861118972301483, "memory(GiB)": 21.32, "step": 3602, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.947178 }, { "epoch": 0.11704512230776727, "grad_norm": 0.6404910683631897, "learning_rate": 9.877702962308898e-06, "loss": 0.06320013105869293, "memory(GiB)": 21.32, "step": 3603, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.947236 }, { "epoch": 0.11707760777052269, "grad_norm": 0.8112232089042664, "learning_rate": 9.87758485754547e-06, "loss": 0.06576088070869446, "memory(GiB)": 21.32, "step": 3604, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.947293 }, { "epoch": 0.1171100932332781, "grad_norm": 0.758415699005127, "learning_rate": 9.877466696488223e-06, "loss": 0.07633177936077118, "memory(GiB)": 21.32, "step": 3605, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.947349 }, { "epoch": 0.11714257869603352, "grad_norm": 0.6043670773506165, "learning_rate": 9.877348479138516e-06, "loss": 0.07106780260801315, "memory(GiB)": 21.32, "step": 3606, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.947396 }, { "epoch": 0.11717506415878894, "grad_norm": 0.6129400134086609, "learning_rate": 9.877230205497716e-06, "loss": 0.06508587300777435, "memory(GiB)": 21.32, "step": 3607, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.947442 }, { "epoch": 0.11720754962154437, "grad_norm": 0.640758752822876, "learning_rate": 9.87711187556719e-06, "loss": 0.07434984296560287, "memory(GiB)": 21.32, "step": 3608, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.947486 }, { "epoch": 0.11724003508429978, "grad_norm": 0.5155763030052185, "learning_rate": 9.876993489348299e-06, "loss": 0.0682985931634903, "memory(GiB)": 21.32, "step": 3609, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.94753 }, { "epoch": 0.1172725205470552, "grad_norm": 0.5615640878677368, "learning_rate": 9.876875046842413e-06, "loss": 0.0711871013045311, "memory(GiB)": 21.32, "step": 3610, "token_acc": 0.9653179190751445, "train_speed(iter/s)": 0.947576 }, { "epoch": 0.11730500600981061, "grad_norm": 0.7201379537582397, "learning_rate": 9.8767565480509e-06, "loss": 0.07203719019889832, "memory(GiB)": 21.32, "step": 3611, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.947616 }, { "epoch": 0.11733749147256603, "grad_norm": 0.6995542645454407, "learning_rate": 9.876637992975122e-06, "loss": 0.07317051291465759, "memory(GiB)": 21.32, "step": 3612, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.947662 }, { "epoch": 0.11736997693532145, "grad_norm": 1.0931209325790405, "learning_rate": 9.876519381616452e-06, "loss": 0.08667656779289246, "memory(GiB)": 21.32, "step": 3613, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.947707 }, { "epoch": 0.11740246239807686, "grad_norm": 0.6012918949127197, "learning_rate": 9.876400713976258e-06, "loss": 0.06280993670225143, "memory(GiB)": 21.32, "step": 3614, "token_acc": 0.976, "train_speed(iter/s)": 0.947751 }, { "epoch": 0.11743494786083228, "grad_norm": 0.6077184081077576, "learning_rate": 9.876281990055909e-06, "loss": 0.06923571228981018, "memory(GiB)": 21.32, "step": 3615, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.947793 }, { "epoch": 0.1174674333235877, "grad_norm": 0.5916396975517273, "learning_rate": 9.876163209856776e-06, "loss": 0.07417058944702148, "memory(GiB)": 21.32, "step": 3616, "token_acc": 0.958041958041958, "train_speed(iter/s)": 0.947833 }, { "epoch": 0.11749991878634311, "grad_norm": 0.5603042840957642, "learning_rate": 9.87604437338023e-06, "loss": 0.0682222917675972, "memory(GiB)": 21.32, "step": 3617, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.947872 }, { "epoch": 0.11753240424909853, "grad_norm": 0.5942981839179993, "learning_rate": 9.87592548062764e-06, "loss": 0.08148558437824249, "memory(GiB)": 21.32, "step": 3618, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.947916 }, { "epoch": 0.11756488971185394, "grad_norm": 0.5265445709228516, "learning_rate": 9.87580653160038e-06, "loss": 0.06287264078855515, "memory(GiB)": 21.32, "step": 3619, "token_acc": 0.9619565217391305, "train_speed(iter/s)": 0.947957 }, { "epoch": 0.11759737517460936, "grad_norm": 0.734009861946106, "learning_rate": 9.875687526299826e-06, "loss": 0.06765037775039673, "memory(GiB)": 21.32, "step": 3620, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.947995 }, { "epoch": 0.11762986063736477, "grad_norm": 0.7764918208122253, "learning_rate": 9.875568464727345e-06, "loss": 0.07820350676774979, "memory(GiB)": 21.32, "step": 3621, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.948033 }, { "epoch": 0.11766234610012019, "grad_norm": 0.6837438344955444, "learning_rate": 9.875449346884316e-06, "loss": 0.07285420596599579, "memory(GiB)": 21.32, "step": 3622, "token_acc": 0.97, "train_speed(iter/s)": 0.948069 }, { "epoch": 0.11769483156287561, "grad_norm": 0.612646222114563, "learning_rate": 9.875330172772113e-06, "loss": 0.06834319233894348, "memory(GiB)": 21.32, "step": 3623, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.948106 }, { "epoch": 0.11772731702563104, "grad_norm": 0.6171053051948547, "learning_rate": 9.87521094239211e-06, "loss": 0.05307364836335182, "memory(GiB)": 21.32, "step": 3624, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.94814 }, { "epoch": 0.11775980248838645, "grad_norm": 0.6171717047691345, "learning_rate": 9.875091655745683e-06, "loss": 0.06489074230194092, "memory(GiB)": 21.32, "step": 3625, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.94818 }, { "epoch": 0.11779228795114187, "grad_norm": 0.6965634226799011, "learning_rate": 9.874972312834212e-06, "loss": 0.0736566036939621, "memory(GiB)": 21.32, "step": 3626, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.948219 }, { "epoch": 0.11782477341389729, "grad_norm": 0.8167582750320435, "learning_rate": 9.87485291365907e-06, "loss": 0.07803525775671005, "memory(GiB)": 21.32, "step": 3627, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.948257 }, { "epoch": 0.1178572588766527, "grad_norm": 0.7433799505233765, "learning_rate": 9.874733458221637e-06, "loss": 0.08167792856693268, "memory(GiB)": 21.32, "step": 3628, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.948289 }, { "epoch": 0.11788974433940812, "grad_norm": 0.545035183429718, "learning_rate": 9.874613946523293e-06, "loss": 0.07435660064220428, "memory(GiB)": 21.32, "step": 3629, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.948326 }, { "epoch": 0.11792222980216353, "grad_norm": 0.5907387137413025, "learning_rate": 9.874494378565413e-06, "loss": 0.07199359685182571, "memory(GiB)": 21.32, "step": 3630, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.948369 }, { "epoch": 0.11795471526491895, "grad_norm": 0.7008956670761108, "learning_rate": 9.874374754349383e-06, "loss": 0.06598702818155289, "memory(GiB)": 21.32, "step": 3631, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.948413 }, { "epoch": 0.11798720072767437, "grad_norm": 0.7952604293823242, "learning_rate": 9.87425507387658e-06, "loss": 0.07358574867248535, "memory(GiB)": 21.32, "step": 3632, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.948454 }, { "epoch": 0.11801968619042978, "grad_norm": 0.5711778402328491, "learning_rate": 9.874135337148385e-06, "loss": 0.060779038816690445, "memory(GiB)": 21.32, "step": 3633, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.948495 }, { "epoch": 0.1180521716531852, "grad_norm": 0.6504243016242981, "learning_rate": 9.874015544166181e-06, "loss": 0.0625113844871521, "memory(GiB)": 21.32, "step": 3634, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.948539 }, { "epoch": 0.11808465711594061, "grad_norm": 0.5678210258483887, "learning_rate": 9.87389569493135e-06, "loss": 0.07603861391544342, "memory(GiB)": 21.32, "step": 3635, "token_acc": 0.9715302491103203, "train_speed(iter/s)": 0.94858 }, { "epoch": 0.11811714257869603, "grad_norm": 0.5647684335708618, "learning_rate": 9.873775789445276e-06, "loss": 0.06013965606689453, "memory(GiB)": 21.32, "step": 3636, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.948624 }, { "epoch": 0.11814962804145145, "grad_norm": 0.9836865067481995, "learning_rate": 9.873655827709343e-06, "loss": 0.07717593014240265, "memory(GiB)": 21.32, "step": 3637, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.948667 }, { "epoch": 0.11818211350420686, "grad_norm": 0.688793957233429, "learning_rate": 9.873535809724934e-06, "loss": 0.06721919029951096, "memory(GiB)": 21.32, "step": 3638, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.948711 }, { "epoch": 0.11821459896696228, "grad_norm": 0.491619735956192, "learning_rate": 9.873415735493435e-06, "loss": 0.058257102966308594, "memory(GiB)": 21.32, "step": 3639, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.948766 }, { "epoch": 0.11824708442971771, "grad_norm": 0.5251837968826294, "learning_rate": 9.873295605016233e-06, "loss": 0.05248165875673294, "memory(GiB)": 21.32, "step": 3640, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.94882 }, { "epoch": 0.11827956989247312, "grad_norm": 0.7801982164382935, "learning_rate": 9.873175418294711e-06, "loss": 0.07418768852949142, "memory(GiB)": 21.32, "step": 3641, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.948868 }, { "epoch": 0.11831205535522854, "grad_norm": 0.49114707112312317, "learning_rate": 9.873055175330262e-06, "loss": 0.05326875299215317, "memory(GiB)": 21.32, "step": 3642, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.948918 }, { "epoch": 0.11834454081798396, "grad_norm": 0.7122487425804138, "learning_rate": 9.872934876124266e-06, "loss": 0.06582796573638916, "memory(GiB)": 21.32, "step": 3643, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.948972 }, { "epoch": 0.11837702628073937, "grad_norm": 0.5757998824119568, "learning_rate": 9.87281452067812e-06, "loss": 0.07018554210662842, "memory(GiB)": 21.32, "step": 3644, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.949027 }, { "epoch": 0.11840951174349479, "grad_norm": 0.8572556376457214, "learning_rate": 9.872694108993206e-06, "loss": 0.0847000777721405, "memory(GiB)": 21.32, "step": 3645, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.949082 }, { "epoch": 0.1184419972062502, "grad_norm": 0.9360970258712769, "learning_rate": 9.872573641070917e-06, "loss": 0.06388399749994278, "memory(GiB)": 21.32, "step": 3646, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.949135 }, { "epoch": 0.11847448266900562, "grad_norm": 0.5675379037857056, "learning_rate": 9.872453116912643e-06, "loss": 0.05831889808177948, "memory(GiB)": 21.32, "step": 3647, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949189 }, { "epoch": 0.11850696813176104, "grad_norm": 0.5930244326591492, "learning_rate": 9.872332536519776e-06, "loss": 0.056421130895614624, "memory(GiB)": 21.32, "step": 3648, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.949244 }, { "epoch": 0.11853945359451645, "grad_norm": 0.6481235027313232, "learning_rate": 9.872211899893704e-06, "loss": 0.06947848200798035, "memory(GiB)": 21.32, "step": 3649, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.9493 }, { "epoch": 0.11857193905727187, "grad_norm": 0.6866351366043091, "learning_rate": 9.872091207035823e-06, "loss": 0.05944456905126572, "memory(GiB)": 21.32, "step": 3650, "token_acc": 0.96484375, "train_speed(iter/s)": 0.949359 }, { "epoch": 0.11860442452002729, "grad_norm": 0.8256071209907532, "learning_rate": 9.871970457947525e-06, "loss": 0.06560017168521881, "memory(GiB)": 21.32, "step": 3651, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.949411 }, { "epoch": 0.1186369099827827, "grad_norm": 0.7143709659576416, "learning_rate": 9.871849652630204e-06, "loss": 0.06973225623369217, "memory(GiB)": 21.32, "step": 3652, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.949465 }, { "epoch": 0.11866939544553812, "grad_norm": 1.0660412311553955, "learning_rate": 9.871728791085253e-06, "loss": 0.07414139062166214, "memory(GiB)": 21.32, "step": 3653, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.949519 }, { "epoch": 0.11870188090829353, "grad_norm": 0.8778581619262695, "learning_rate": 9.871607873314067e-06, "loss": 0.08514313399791718, "memory(GiB)": 21.32, "step": 3654, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.949574 }, { "epoch": 0.11873436637104895, "grad_norm": 0.6789993643760681, "learning_rate": 9.871486899318042e-06, "loss": 0.06903108209371567, "memory(GiB)": 21.32, "step": 3655, "token_acc": 0.96, "train_speed(iter/s)": 0.949628 }, { "epoch": 0.11876685183380438, "grad_norm": 0.6105265021324158, "learning_rate": 9.871365869098575e-06, "loss": 0.05630072206258774, "memory(GiB)": 21.32, "step": 3656, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.949681 }, { "epoch": 0.1187993372965598, "grad_norm": 0.7915802597999573, "learning_rate": 9.871244782657061e-06, "loss": 0.065755695104599, "memory(GiB)": 21.32, "step": 3657, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.949736 }, { "epoch": 0.11883182275931521, "grad_norm": 0.8493935465812683, "learning_rate": 9.871123639994899e-06, "loss": 0.07153300940990448, "memory(GiB)": 21.32, "step": 3658, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.94979 }, { "epoch": 0.11886430822207063, "grad_norm": 0.48876655101776123, "learning_rate": 9.871002441113488e-06, "loss": 0.06871432065963745, "memory(GiB)": 21.32, "step": 3659, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.949843 }, { "epoch": 0.11889679368482604, "grad_norm": 0.9179493188858032, "learning_rate": 9.870881186014224e-06, "loss": 0.08599187433719635, "memory(GiB)": 21.32, "step": 3660, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.949901 }, { "epoch": 0.11892927914758146, "grad_norm": 0.6338877081871033, "learning_rate": 9.870759874698509e-06, "loss": 0.0653836578130722, "memory(GiB)": 21.32, "step": 3661, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.949956 }, { "epoch": 0.11896176461033688, "grad_norm": 0.7681235074996948, "learning_rate": 9.87063850716774e-06, "loss": 0.06526094675064087, "memory(GiB)": 21.32, "step": 3662, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.95001 }, { "epoch": 0.11899425007309229, "grad_norm": 0.7505870461463928, "learning_rate": 9.870517083423323e-06, "loss": 0.07725077122449875, "memory(GiB)": 21.32, "step": 3663, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.950061 }, { "epoch": 0.11902673553584771, "grad_norm": 1.083283543586731, "learning_rate": 9.870395603466654e-06, "loss": 0.0715777575969696, "memory(GiB)": 21.32, "step": 3664, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.950115 }, { "epoch": 0.11905922099860312, "grad_norm": 0.6181684136390686, "learning_rate": 9.870274067299138e-06, "loss": 0.06145503744482994, "memory(GiB)": 21.32, "step": 3665, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.950164 }, { "epoch": 0.11909170646135854, "grad_norm": 0.5808840990066528, "learning_rate": 9.870152474922176e-06, "loss": 0.05887969955801964, "memory(GiB)": 21.32, "step": 3666, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.950215 }, { "epoch": 0.11912419192411396, "grad_norm": 0.6542032361030579, "learning_rate": 9.870030826337173e-06, "loss": 0.07265190780162811, "memory(GiB)": 21.32, "step": 3667, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.950265 }, { "epoch": 0.11915667738686937, "grad_norm": 0.9440714716911316, "learning_rate": 9.869909121545533e-06, "loss": 0.0696304440498352, "memory(GiB)": 21.32, "step": 3668, "token_acc": 0.976, "train_speed(iter/s)": 0.95031 }, { "epoch": 0.11918916284962479, "grad_norm": 0.5507796406745911, "learning_rate": 9.869787360548659e-06, "loss": 0.06311091035604477, "memory(GiB)": 21.32, "step": 3669, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.950347 }, { "epoch": 0.1192216483123802, "grad_norm": 0.7212348580360413, "learning_rate": 9.869665543347957e-06, "loss": 0.06420175731182098, "memory(GiB)": 21.32, "step": 3670, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.950381 }, { "epoch": 0.11925413377513562, "grad_norm": 0.6356643438339233, "learning_rate": 9.869543669944835e-06, "loss": 0.060278307646512985, "memory(GiB)": 21.32, "step": 3671, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.950424 }, { "epoch": 0.11928661923789105, "grad_norm": 0.7109523415565491, "learning_rate": 9.869421740340694e-06, "loss": 0.0671020895242691, "memory(GiB)": 21.32, "step": 3672, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.950467 }, { "epoch": 0.11931910470064647, "grad_norm": 0.46717944741249084, "learning_rate": 9.869299754536949e-06, "loss": 0.061354342848062515, "memory(GiB)": 21.32, "step": 3673, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.950507 }, { "epoch": 0.11935159016340188, "grad_norm": 0.6480467915534973, "learning_rate": 9.869177712535e-06, "loss": 0.0703817829489708, "memory(GiB)": 21.32, "step": 3674, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.95055 }, { "epoch": 0.1193840756261573, "grad_norm": 0.6088579297065735, "learning_rate": 9.869055614336263e-06, "loss": 0.06405505537986755, "memory(GiB)": 21.32, "step": 3675, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.95059 }, { "epoch": 0.11941656108891271, "grad_norm": 1.050530195236206, "learning_rate": 9.868933459942141e-06, "loss": 0.06073296070098877, "memory(GiB)": 21.32, "step": 3676, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.950631 }, { "epoch": 0.11944904655166813, "grad_norm": 1.0986970663070679, "learning_rate": 9.868811249354048e-06, "loss": 0.0708693340420723, "memory(GiB)": 21.32, "step": 3677, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.950664 }, { "epoch": 0.11948153201442355, "grad_norm": 0.6239076852798462, "learning_rate": 9.86868898257339e-06, "loss": 0.06819692999124527, "memory(GiB)": 21.32, "step": 3678, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.950701 }, { "epoch": 0.11951401747717896, "grad_norm": 0.5672226548194885, "learning_rate": 9.868566659601584e-06, "loss": 0.05794967710971832, "memory(GiB)": 21.32, "step": 3679, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.950737 }, { "epoch": 0.11954650293993438, "grad_norm": 0.5884811878204346, "learning_rate": 9.868444280440038e-06, "loss": 0.059194959700107574, "memory(GiB)": 21.32, "step": 3680, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.950764 }, { "epoch": 0.1195789884026898, "grad_norm": 0.6893274188041687, "learning_rate": 9.868321845090165e-06, "loss": 0.07632304728031158, "memory(GiB)": 21.32, "step": 3681, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.950797 }, { "epoch": 0.11961147386544521, "grad_norm": 1.0019690990447998, "learning_rate": 9.868199353553379e-06, "loss": 0.0545383021235466, "memory(GiB)": 21.32, "step": 3682, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.950832 }, { "epoch": 0.11964395932820063, "grad_norm": 0.6890993118286133, "learning_rate": 9.86807680583109e-06, "loss": 0.0566399022936821, "memory(GiB)": 21.32, "step": 3683, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.950867 }, { "epoch": 0.11967644479095604, "grad_norm": 0.5595564842224121, "learning_rate": 9.867954201924718e-06, "loss": 0.05268838256597519, "memory(GiB)": 21.32, "step": 3684, "token_acc": 0.988, "train_speed(iter/s)": 0.950903 }, { "epoch": 0.11970893025371146, "grad_norm": 0.4141315817832947, "learning_rate": 9.867831541835674e-06, "loss": 0.06044661998748779, "memory(GiB)": 21.32, "step": 3685, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.950937 }, { "epoch": 0.11974141571646688, "grad_norm": 0.5545333027839661, "learning_rate": 9.867708825565376e-06, "loss": 0.06953034549951553, "memory(GiB)": 21.32, "step": 3686, "token_acc": 0.9706959706959707, "train_speed(iter/s)": 0.95097 }, { "epoch": 0.11977390117922229, "grad_norm": 0.8949661254882812, "learning_rate": 9.867586053115237e-06, "loss": 0.0687599629163742, "memory(GiB)": 21.32, "step": 3687, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.951006 }, { "epoch": 0.11980638664197772, "grad_norm": 0.7049123048782349, "learning_rate": 9.86746322448668e-06, "loss": 0.07247427105903625, "memory(GiB)": 21.32, "step": 3688, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.951044 }, { "epoch": 0.11983887210473314, "grad_norm": 0.6225540637969971, "learning_rate": 9.867340339681116e-06, "loss": 0.07530337572097778, "memory(GiB)": 21.32, "step": 3689, "token_acc": 0.9548872180451128, "train_speed(iter/s)": 0.95108 }, { "epoch": 0.11987135756748855, "grad_norm": 0.5692489147186279, "learning_rate": 9.867217398699966e-06, "loss": 0.07146332412958145, "memory(GiB)": 21.32, "step": 3690, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.951122 }, { "epoch": 0.11990384303024397, "grad_norm": 0.6027265191078186, "learning_rate": 9.86709440154465e-06, "loss": 0.06653852015733719, "memory(GiB)": 21.32, "step": 3691, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.951162 }, { "epoch": 0.11993632849299939, "grad_norm": 1.7847994565963745, "learning_rate": 9.866971348216586e-06, "loss": 0.06949734687805176, "memory(GiB)": 21.32, "step": 3692, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.951204 }, { "epoch": 0.1199688139557548, "grad_norm": 0.6309729814529419, "learning_rate": 9.866848238717195e-06, "loss": 0.06332416832447052, "memory(GiB)": 21.32, "step": 3693, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.951244 }, { "epoch": 0.12000129941851022, "grad_norm": 0.5079809427261353, "learning_rate": 9.866725073047898e-06, "loss": 0.059383418411016464, "memory(GiB)": 21.32, "step": 3694, "token_acc": 0.968, "train_speed(iter/s)": 0.951289 }, { "epoch": 0.12003378488126563, "grad_norm": 0.5522938966751099, "learning_rate": 9.866601851210113e-06, "loss": 0.06780166178941727, "memory(GiB)": 21.32, "step": 3695, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.951335 }, { "epoch": 0.12006627034402105, "grad_norm": 0.6251894235610962, "learning_rate": 9.866478573205269e-06, "loss": 0.06129010394215584, "memory(GiB)": 21.32, "step": 3696, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.951388 }, { "epoch": 0.12009875580677647, "grad_norm": 0.9265168905258179, "learning_rate": 9.866355239034784e-06, "loss": 0.06242179125547409, "memory(GiB)": 21.32, "step": 3697, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.951441 }, { "epoch": 0.12013124126953188, "grad_norm": 0.5076262354850769, "learning_rate": 9.866231848700081e-06, "loss": 0.06498248875141144, "memory(GiB)": 21.32, "step": 3698, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951494 }, { "epoch": 0.1201637267322873, "grad_norm": 0.49617746472358704, "learning_rate": 9.866108402202587e-06, "loss": 0.0585898756980896, "memory(GiB)": 21.32, "step": 3699, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.951548 }, { "epoch": 0.12019621219504271, "grad_norm": 0.6853405237197876, "learning_rate": 9.865984899543726e-06, "loss": 0.07120176404714584, "memory(GiB)": 21.32, "step": 3700, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.951597 }, { "epoch": 0.12022869765779813, "grad_norm": 0.7784059047698975, "learning_rate": 9.86586134072492e-06, "loss": 0.05978522449731827, "memory(GiB)": 21.32, "step": 3701, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.95165 }, { "epoch": 0.12026118312055355, "grad_norm": 0.9118242859840393, "learning_rate": 9.8657377257476e-06, "loss": 0.07199369370937347, "memory(GiB)": 21.32, "step": 3702, "token_acc": 0.9610894941634242, "train_speed(iter/s)": 0.951703 }, { "epoch": 0.12029366858330896, "grad_norm": 2.386495351791382, "learning_rate": 9.865614054613189e-06, "loss": 0.06634257733821869, "memory(GiB)": 21.32, "step": 3703, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.951755 }, { "epoch": 0.12032615404606439, "grad_norm": 0.5262395739555359, "learning_rate": 9.865490327323118e-06, "loss": 0.06874866783618927, "memory(GiB)": 21.32, "step": 3704, "token_acc": 0.9728260869565217, "train_speed(iter/s)": 0.951808 }, { "epoch": 0.12035863950881981, "grad_norm": 1.0708050727844238, "learning_rate": 9.86536654387881e-06, "loss": 0.07590296864509583, "memory(GiB)": 21.32, "step": 3705, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.951861 }, { "epoch": 0.12039112497157523, "grad_norm": 0.8525128960609436, "learning_rate": 9.865242704281697e-06, "loss": 0.07090984284877777, "memory(GiB)": 21.32, "step": 3706, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.951911 }, { "epoch": 0.12042361043433064, "grad_norm": 0.5603441596031189, "learning_rate": 9.865118808533207e-06, "loss": 0.06689269095659256, "memory(GiB)": 21.32, "step": 3707, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.951964 }, { "epoch": 0.12045609589708606, "grad_norm": 0.5424713492393494, "learning_rate": 9.864994856634772e-06, "loss": 0.058425627648830414, "memory(GiB)": 21.32, "step": 3708, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.952016 }, { "epoch": 0.12048858135984147, "grad_norm": 0.569709300994873, "learning_rate": 9.86487084858782e-06, "loss": 0.06234664469957352, "memory(GiB)": 21.32, "step": 3709, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.952065 }, { "epoch": 0.12052106682259689, "grad_norm": 0.5590887665748596, "learning_rate": 9.864746784393783e-06, "loss": 0.05959659442305565, "memory(GiB)": 21.32, "step": 3710, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.952114 }, { "epoch": 0.1205535522853523, "grad_norm": 0.5174791812896729, "learning_rate": 9.864622664054095e-06, "loss": 0.05859886109828949, "memory(GiB)": 21.32, "step": 3711, "token_acc": 0.984375, "train_speed(iter/s)": 0.952164 }, { "epoch": 0.12058603774810772, "grad_norm": 1.0251989364624023, "learning_rate": 9.864498487570186e-06, "loss": 0.08286018669605255, "memory(GiB)": 21.32, "step": 3712, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.952214 }, { "epoch": 0.12061852321086314, "grad_norm": 0.5911740064620972, "learning_rate": 9.864374254943489e-06, "loss": 0.0639147013425827, "memory(GiB)": 21.32, "step": 3713, "token_acc": 0.97265625, "train_speed(iter/s)": 0.952268 }, { "epoch": 0.12065100867361855, "grad_norm": 0.5159309506416321, "learning_rate": 9.864249966175438e-06, "loss": 0.06070488318800926, "memory(GiB)": 21.32, "step": 3714, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.952321 }, { "epoch": 0.12068349413637397, "grad_norm": 0.5454161167144775, "learning_rate": 9.86412562126747e-06, "loss": 0.06919969618320465, "memory(GiB)": 21.32, "step": 3715, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.952371 }, { "epoch": 0.12071597959912939, "grad_norm": 0.9577913880348206, "learning_rate": 9.864001220221016e-06, "loss": 0.07289640605449677, "memory(GiB)": 21.32, "step": 3716, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.952421 }, { "epoch": 0.1207484650618848, "grad_norm": 0.5034980177879333, "learning_rate": 9.863876763037516e-06, "loss": 0.06804874539375305, "memory(GiB)": 21.32, "step": 3717, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.952474 }, { "epoch": 0.12078095052464022, "grad_norm": 0.6130902767181396, "learning_rate": 9.863752249718404e-06, "loss": 0.05092020332813263, "memory(GiB)": 21.32, "step": 3718, "token_acc": 0.9813664596273292, "train_speed(iter/s)": 0.952529 }, { "epoch": 0.12081343598739563, "grad_norm": 0.6416887640953064, "learning_rate": 9.863627680265115e-06, "loss": 0.06492668390274048, "memory(GiB)": 21.32, "step": 3719, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.952576 }, { "epoch": 0.12084592145015106, "grad_norm": 0.7397835850715637, "learning_rate": 9.863503054679092e-06, "loss": 0.06867145746946335, "memory(GiB)": 21.32, "step": 3720, "token_acc": 0.972, "train_speed(iter/s)": 0.95263 }, { "epoch": 0.12087840691290648, "grad_norm": 0.5851339101791382, "learning_rate": 9.863378372961769e-06, "loss": 0.06348608434200287, "memory(GiB)": 21.32, "step": 3721, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.952678 }, { "epoch": 0.1209108923756619, "grad_norm": 0.5761855840682983, "learning_rate": 9.863253635114586e-06, "loss": 0.06178475171327591, "memory(GiB)": 21.32, "step": 3722, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.952732 }, { "epoch": 0.12094337783841731, "grad_norm": 0.7127552628517151, "learning_rate": 9.863128841138984e-06, "loss": 0.06587561219930649, "memory(GiB)": 21.32, "step": 3723, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.952787 }, { "epoch": 0.12097586330117273, "grad_norm": 0.7702516317367554, "learning_rate": 9.863003991036404e-06, "loss": 0.07745645940303802, "memory(GiB)": 21.32, "step": 3724, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.952841 }, { "epoch": 0.12100834876392814, "grad_norm": 0.8899868130683899, "learning_rate": 9.862879084808284e-06, "loss": 0.06881070137023926, "memory(GiB)": 21.32, "step": 3725, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.952895 }, { "epoch": 0.12104083422668356, "grad_norm": 0.7340007424354553, "learning_rate": 9.862754122456067e-06, "loss": 0.07327194511890411, "memory(GiB)": 21.32, "step": 3726, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.952948 }, { "epoch": 0.12107331968943898, "grad_norm": 1.6928231716156006, "learning_rate": 9.862629103981194e-06, "loss": 0.07138261198997498, "memory(GiB)": 21.32, "step": 3727, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.953 }, { "epoch": 0.12110580515219439, "grad_norm": 0.624792754650116, "learning_rate": 9.86250402938511e-06, "loss": 0.06860102713108063, "memory(GiB)": 21.32, "step": 3728, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.953051 }, { "epoch": 0.12113829061494981, "grad_norm": 0.53010493516922, "learning_rate": 9.862378898669255e-06, "loss": 0.06001121550798416, "memory(GiB)": 21.32, "step": 3729, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.9531 }, { "epoch": 0.12117077607770523, "grad_norm": 0.7106550931930542, "learning_rate": 9.86225371183508e-06, "loss": 0.07299665361642838, "memory(GiB)": 21.32, "step": 3730, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.95314 }, { "epoch": 0.12120326154046064, "grad_norm": 1.3448028564453125, "learning_rate": 9.862128468884022e-06, "loss": 0.06575513631105423, "memory(GiB)": 21.32, "step": 3731, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.95318 }, { "epoch": 0.12123574700321606, "grad_norm": 2.448375701904297, "learning_rate": 9.86200316981753e-06, "loss": 0.0628293976187706, "memory(GiB)": 21.32, "step": 3732, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95322 }, { "epoch": 0.12126823246597147, "grad_norm": 0.5757957100868225, "learning_rate": 9.861877814637052e-06, "loss": 0.05908634513616562, "memory(GiB)": 21.32, "step": 3733, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953258 }, { "epoch": 0.12130071792872689, "grad_norm": 0.5588783621788025, "learning_rate": 9.861752403344033e-06, "loss": 0.06516286730766296, "memory(GiB)": 21.32, "step": 3734, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.953295 }, { "epoch": 0.1213332033914823, "grad_norm": 0.6737457513809204, "learning_rate": 9.861626935939919e-06, "loss": 0.06059717386960983, "memory(GiB)": 21.32, "step": 3735, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.953327 }, { "epoch": 0.12136568885423774, "grad_norm": 0.6038116216659546, "learning_rate": 9.86150141242616e-06, "loss": 0.0661223754286766, "memory(GiB)": 21.32, "step": 3736, "token_acc": 0.956, "train_speed(iter/s)": 0.953364 }, { "epoch": 0.12139817431699315, "grad_norm": 0.47041040658950806, "learning_rate": 9.861375832804205e-06, "loss": 0.05383528769016266, "memory(GiB)": 21.32, "step": 3737, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.953401 }, { "epoch": 0.12143065977974857, "grad_norm": 0.6153523325920105, "learning_rate": 9.8612501970755e-06, "loss": 0.06322170048952103, "memory(GiB)": 21.32, "step": 3738, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.953436 }, { "epoch": 0.12146314524250398, "grad_norm": 0.611295223236084, "learning_rate": 9.861124505241499e-06, "loss": 0.05092940479516983, "memory(GiB)": 21.32, "step": 3739, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.953468 }, { "epoch": 0.1214956307052594, "grad_norm": 0.9587796926498413, "learning_rate": 9.860998757303651e-06, "loss": 0.07522747665643692, "memory(GiB)": 21.32, "step": 3740, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.953503 }, { "epoch": 0.12152811616801482, "grad_norm": 0.6706305742263794, "learning_rate": 9.860872953263407e-06, "loss": 0.06589193642139435, "memory(GiB)": 21.32, "step": 3741, "token_acc": 0.9728682170542635, "train_speed(iter/s)": 0.953541 }, { "epoch": 0.12156060163077023, "grad_norm": 0.6646230220794678, "learning_rate": 9.86074709312222e-06, "loss": 0.06866039335727692, "memory(GiB)": 21.32, "step": 3742, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.953576 }, { "epoch": 0.12159308709352565, "grad_norm": 0.8066015839576721, "learning_rate": 9.860621176881543e-06, "loss": 0.05600529909133911, "memory(GiB)": 21.32, "step": 3743, "token_acc": 0.984, "train_speed(iter/s)": 0.953608 }, { "epoch": 0.12162557255628106, "grad_norm": 0.6231004595756531, "learning_rate": 9.860495204542825e-06, "loss": 0.05885782092809677, "memory(GiB)": 21.32, "step": 3744, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.95364 }, { "epoch": 0.12165805801903648, "grad_norm": 0.8878596425056458, "learning_rate": 9.860369176107525e-06, "loss": 0.0856696143746376, "memory(GiB)": 21.32, "step": 3745, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.953676 }, { "epoch": 0.1216905434817919, "grad_norm": 0.781171977519989, "learning_rate": 9.860243091577096e-06, "loss": 0.07477039098739624, "memory(GiB)": 21.32, "step": 3746, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.953709 }, { "epoch": 0.12172302894454731, "grad_norm": 0.7174242734909058, "learning_rate": 9.860116950952991e-06, "loss": 0.06823185086250305, "memory(GiB)": 21.32, "step": 3747, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.953737 }, { "epoch": 0.12175551440730273, "grad_norm": 0.5023305416107178, "learning_rate": 9.859990754236668e-06, "loss": 0.05888304114341736, "memory(GiB)": 21.32, "step": 3748, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.953773 }, { "epoch": 0.12178799987005814, "grad_norm": 0.7913303375244141, "learning_rate": 9.859864501429583e-06, "loss": 0.07289209216833115, "memory(GiB)": 21.32, "step": 3749, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.953804 }, { "epoch": 0.12182048533281356, "grad_norm": 0.9231916666030884, "learning_rate": 9.859738192533195e-06, "loss": 0.07919517159461975, "memory(GiB)": 21.32, "step": 3750, "token_acc": 0.954954954954955, "train_speed(iter/s)": 0.953841 }, { "epoch": 0.12185297079556898, "grad_norm": 0.9917929172515869, "learning_rate": 9.859611827548958e-06, "loss": 0.07358551025390625, "memory(GiB)": 21.32, "step": 3751, "token_acc": 0.9760765550239234, "train_speed(iter/s)": 0.953881 }, { "epoch": 0.1218854562583244, "grad_norm": 0.53992760181427, "learning_rate": 9.859485406478335e-06, "loss": 0.0728837251663208, "memory(GiB)": 21.32, "step": 3752, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.953923 }, { "epoch": 0.12191794172107982, "grad_norm": 0.9231628775596619, "learning_rate": 9.85935892932278e-06, "loss": 0.06619885563850403, "memory(GiB)": 21.32, "step": 3753, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.953972 }, { "epoch": 0.12195042718383524, "grad_norm": 0.8168053030967712, "learning_rate": 9.859232396083756e-06, "loss": 0.06080468371510506, "memory(GiB)": 21.32, "step": 3754, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.954026 }, { "epoch": 0.12198291264659065, "grad_norm": 12.903891563415527, "learning_rate": 9.859105806762721e-06, "loss": 0.07168234884738922, "memory(GiB)": 21.32, "step": 3755, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.954076 }, { "epoch": 0.12201539810934607, "grad_norm": 0.47862115502357483, "learning_rate": 9.85897916136114e-06, "loss": 0.05742523819208145, "memory(GiB)": 21.32, "step": 3756, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.95413 }, { "epoch": 0.12204788357210149, "grad_norm": 0.6226024627685547, "learning_rate": 9.85885245988047e-06, "loss": 0.0644143745303154, "memory(GiB)": 21.32, "step": 3757, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.95418 }, { "epoch": 0.1220803690348569, "grad_norm": 0.8749231100082397, "learning_rate": 9.858725702322177e-06, "loss": 0.06973579525947571, "memory(GiB)": 21.32, "step": 3758, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954232 }, { "epoch": 0.12211285449761232, "grad_norm": 0.7489557862281799, "learning_rate": 9.858598888687722e-06, "loss": 0.06970921158790588, "memory(GiB)": 21.32, "step": 3759, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954283 }, { "epoch": 0.12214533996036774, "grad_norm": 1.1313159465789795, "learning_rate": 9.858472018978568e-06, "loss": 0.04773560166358948, "memory(GiB)": 21.32, "step": 3760, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.954337 }, { "epoch": 0.12217782542312315, "grad_norm": 0.6073885560035706, "learning_rate": 9.858345093196182e-06, "loss": 0.05993792414665222, "memory(GiB)": 21.32, "step": 3761, "token_acc": 0.9715447154471545, "train_speed(iter/s)": 0.954384 }, { "epoch": 0.12221031088587857, "grad_norm": 0.7262254953384399, "learning_rate": 9.858218111342026e-06, "loss": 0.06838306784629822, "memory(GiB)": 21.32, "step": 3762, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.954435 }, { "epoch": 0.12224279634863398, "grad_norm": 0.7064791321754456, "learning_rate": 9.858091073417568e-06, "loss": 0.08167734742164612, "memory(GiB)": 21.32, "step": 3763, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.954487 }, { "epoch": 0.1222752818113894, "grad_norm": 0.6082113981246948, "learning_rate": 9.857963979424271e-06, "loss": 0.05724600329995155, "memory(GiB)": 21.32, "step": 3764, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.954539 }, { "epoch": 0.12230776727414482, "grad_norm": 0.4926033318042755, "learning_rate": 9.857836829363605e-06, "loss": 0.0563475638628006, "memory(GiB)": 21.32, "step": 3765, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.954593 }, { "epoch": 0.12234025273690023, "grad_norm": 0.5793667435646057, "learning_rate": 9.857709623237037e-06, "loss": 0.057048067450523376, "memory(GiB)": 21.32, "step": 3766, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.954644 }, { "epoch": 0.12237273819965565, "grad_norm": 0.44478118419647217, "learning_rate": 9.857582361046033e-06, "loss": 0.05531664937734604, "memory(GiB)": 21.32, "step": 3767, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954692 }, { "epoch": 0.12240522366241108, "grad_norm": 0.6054823994636536, "learning_rate": 9.857455042792064e-06, "loss": 0.05540275573730469, "memory(GiB)": 21.32, "step": 3768, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.954729 }, { "epoch": 0.1224377091251665, "grad_norm": 0.6091305613517761, "learning_rate": 9.857327668476598e-06, "loss": 0.06434875726699829, "memory(GiB)": 21.32, "step": 3769, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.954781 }, { "epoch": 0.12247019458792191, "grad_norm": 0.5661777257919312, "learning_rate": 9.857200238101105e-06, "loss": 0.061632491648197174, "memory(GiB)": 21.32, "step": 3770, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.954832 }, { "epoch": 0.12250268005067733, "grad_norm": 0.5670631527900696, "learning_rate": 9.857072751667059e-06, "loss": 0.06431633234024048, "memory(GiB)": 21.32, "step": 3771, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954885 }, { "epoch": 0.12253516551343274, "grad_norm": 0.7681165933609009, "learning_rate": 9.856945209175925e-06, "loss": 0.07077513635158539, "memory(GiB)": 21.32, "step": 3772, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.954938 }, { "epoch": 0.12256765097618816, "grad_norm": 1.0496033430099487, "learning_rate": 9.85681761062918e-06, "loss": 0.07242334634065628, "memory(GiB)": 21.32, "step": 3773, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95499 }, { "epoch": 0.12260013643894357, "grad_norm": 0.9727992415428162, "learning_rate": 9.856689956028296e-06, "loss": 0.08052436262369156, "memory(GiB)": 21.32, "step": 3774, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.12263262190169899, "grad_norm": 0.8467047214508057, "learning_rate": 9.856562245374746e-06, "loss": 0.07561562955379486, "memory(GiB)": 21.32, "step": 3775, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.1226651073644544, "grad_norm": 0.5391219854354858, "learning_rate": 9.856434478670003e-06, "loss": 0.053883425891399384, "memory(GiB)": 21.32, "step": 3776, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.12269759282720982, "grad_norm": 0.6469852328300476, "learning_rate": 9.856306655915542e-06, "loss": 0.06351371109485626, "memory(GiB)": 21.32, "step": 3777, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.955192 }, { "epoch": 0.12273007828996524, "grad_norm": 1.008920431137085, "learning_rate": 9.85617877711284e-06, "loss": 0.07813961803913116, "memory(GiB)": 21.32, "step": 3778, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.955244 }, { "epoch": 0.12276256375272065, "grad_norm": 0.7006402611732483, "learning_rate": 9.85605084226337e-06, "loss": 0.0662280023097992, "memory(GiB)": 21.32, "step": 3779, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.955294 }, { "epoch": 0.12279504921547607, "grad_norm": 0.5003388524055481, "learning_rate": 9.85592285136861e-06, "loss": 0.06160876154899597, "memory(GiB)": 21.32, "step": 3780, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.12282753467823149, "grad_norm": 0.5285807847976685, "learning_rate": 9.855794804430036e-06, "loss": 0.060788869857788086, "memory(GiB)": 21.32, "step": 3781, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.955393 }, { "epoch": 0.1228600201409869, "grad_norm": 0.7804397940635681, "learning_rate": 9.855666701449129e-06, "loss": 0.07258729636669159, "memory(GiB)": 21.32, "step": 3782, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.12289250560374232, "grad_norm": 0.6093083620071411, "learning_rate": 9.855538542427363e-06, "loss": 0.06551644951105118, "memory(GiB)": 21.32, "step": 3783, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.12292499106649775, "grad_norm": 0.6311102509498596, "learning_rate": 9.855410327366221e-06, "loss": 0.06450279802083969, "memory(GiB)": 21.32, "step": 3784, "token_acc": 0.9781021897810219, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.12295747652925317, "grad_norm": 0.7151598334312439, "learning_rate": 9.855282056267182e-06, "loss": 0.05674638971686363, "memory(GiB)": 21.32, "step": 3785, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.12298996199200858, "grad_norm": 0.4834437668323517, "learning_rate": 9.855153729131724e-06, "loss": 0.06744175404310226, "memory(GiB)": 21.32, "step": 3786, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.123022447454764, "grad_norm": 0.8318238854408264, "learning_rate": 9.85502534596133e-06, "loss": 0.06392791867256165, "memory(GiB)": 21.32, "step": 3787, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955689 }, { "epoch": 0.12305493291751941, "grad_norm": 1.0660737752914429, "learning_rate": 9.854896906757483e-06, "loss": 0.0872357189655304, "memory(GiB)": 21.32, "step": 3788, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.12308741838027483, "grad_norm": 0.7979212999343872, "learning_rate": 9.854768411521664e-06, "loss": 0.06498106569051743, "memory(GiB)": 21.32, "step": 3789, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.12311990384303025, "grad_norm": 0.6713301539421082, "learning_rate": 9.854639860255356e-06, "loss": 0.06217062473297119, "memory(GiB)": 21.32, "step": 3790, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.12315238930578566, "grad_norm": 0.7271317839622498, "learning_rate": 9.854511252960039e-06, "loss": 0.06518928706645966, "memory(GiB)": 21.32, "step": 3791, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.12318487476854108, "grad_norm": 0.5980890989303589, "learning_rate": 9.854382589637203e-06, "loss": 0.08144866675138474, "memory(GiB)": 21.32, "step": 3792, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.955904 }, { "epoch": 0.1232173602312965, "grad_norm": 0.6521106958389282, "learning_rate": 9.854253870288331e-06, "loss": 0.062332380563020706, "memory(GiB)": 21.32, "step": 3793, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.12324984569405191, "grad_norm": 0.7233281135559082, "learning_rate": 9.854125094914908e-06, "loss": 0.05791942775249481, "memory(GiB)": 21.32, "step": 3794, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.955978 }, { "epoch": 0.12328233115680733, "grad_norm": 0.8390160799026489, "learning_rate": 9.853996263518421e-06, "loss": 0.060975149273872375, "memory(GiB)": 21.32, "step": 3795, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.12331481661956274, "grad_norm": 0.6699288487434387, "learning_rate": 9.853867376100357e-06, "loss": 0.05984360724687576, "memory(GiB)": 21.32, "step": 3796, "token_acc": 0.9675324675324676, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.12334730208231816, "grad_norm": 0.7536753416061401, "learning_rate": 9.853738432662203e-06, "loss": 0.06223250925540924, "memory(GiB)": 21.32, "step": 3797, "token_acc": 0.9726775956284153, "train_speed(iter/s)": 0.956084 }, { "epoch": 0.12337978754507357, "grad_norm": 0.6576429605484009, "learning_rate": 9.853609433205446e-06, "loss": 0.06313008069992065, "memory(GiB)": 21.32, "step": 3798, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.12341227300782899, "grad_norm": 0.7251126766204834, "learning_rate": 9.853480377731576e-06, "loss": 0.06671704351902008, "memory(GiB)": 21.32, "step": 3799, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.956151 }, { "epoch": 0.12344475847058442, "grad_norm": 0.7603579759597778, "learning_rate": 9.853351266242083e-06, "loss": 0.07461005449295044, "memory(GiB)": 21.32, "step": 3800, "token_acc": 0.975, "train_speed(iter/s)": 0.956182 }, { "epoch": 0.12347724393333984, "grad_norm": 0.7579860091209412, "learning_rate": 9.853222098738457e-06, "loss": 0.06926631927490234, "memory(GiB)": 21.32, "step": 3801, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.12350972939609525, "grad_norm": 0.6029766201972961, "learning_rate": 9.853092875222187e-06, "loss": 0.06571121513843536, "memory(GiB)": 21.32, "step": 3802, "token_acc": 0.9681978798586572, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.12354221485885067, "grad_norm": 2.805936098098755, "learning_rate": 9.852963595694767e-06, "loss": 0.06542021036148071, "memory(GiB)": 21.32, "step": 3803, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.12357470032160608, "grad_norm": 0.8022276759147644, "learning_rate": 9.852834260157687e-06, "loss": 0.07250207662582397, "memory(GiB)": 21.32, "step": 3804, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.956317 }, { "epoch": 0.1236071857843615, "grad_norm": 0.5199510455131531, "learning_rate": 9.852704868612442e-06, "loss": 0.0643024742603302, "memory(GiB)": 21.32, "step": 3805, "token_acc": 0.9711934156378601, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.12363967124711692, "grad_norm": 0.6211135387420654, "learning_rate": 9.852575421060521e-06, "loss": 0.06832394003868103, "memory(GiB)": 21.32, "step": 3806, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.12367215670987233, "grad_norm": 1.5672264099121094, "learning_rate": 9.852445917503424e-06, "loss": 0.07513335347175598, "memory(GiB)": 21.32, "step": 3807, "token_acc": 0.952, "train_speed(iter/s)": 0.956423 }, { "epoch": 0.12370464217262775, "grad_norm": 0.6955704092979431, "learning_rate": 9.852316357942641e-06, "loss": 0.06934595853090286, "memory(GiB)": 21.32, "step": 3808, "token_acc": 0.9735449735449735, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.12373712763538317, "grad_norm": 0.6756848692893982, "learning_rate": 9.852186742379669e-06, "loss": 0.07514886558055878, "memory(GiB)": 21.32, "step": 3809, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.12376961309813858, "grad_norm": 0.575677752494812, "learning_rate": 9.852057070816004e-06, "loss": 0.058050088584423065, "memory(GiB)": 21.32, "step": 3810, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.123802098560894, "grad_norm": 0.712910532951355, "learning_rate": 9.851927343253143e-06, "loss": 0.0580720454454422, "memory(GiB)": 21.32, "step": 3811, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.12383458402364941, "grad_norm": 0.7270768284797668, "learning_rate": 9.851797559692581e-06, "loss": 0.07010204344987869, "memory(GiB)": 21.32, "step": 3812, "token_acc": 0.9589552238805971, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.12386706948640483, "grad_norm": 0.8067372441291809, "learning_rate": 9.851667720135818e-06, "loss": 0.05951278656721115, "memory(GiB)": 21.32, "step": 3813, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.956647 }, { "epoch": 0.12389955494916025, "grad_norm": 0.5750523805618286, "learning_rate": 9.851537824584352e-06, "loss": 0.05921877548098564, "memory(GiB)": 21.32, "step": 3814, "token_acc": 0.9659090909090909, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.12393204041191566, "grad_norm": 0.5071945190429688, "learning_rate": 9.851407873039682e-06, "loss": 0.06378540396690369, "memory(GiB)": 21.32, "step": 3815, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.12396452587467109, "grad_norm": 0.5829971432685852, "learning_rate": 9.851277865503307e-06, "loss": 0.06567583978176117, "memory(GiB)": 21.32, "step": 3816, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.12399701133742651, "grad_norm": 0.8907738327980042, "learning_rate": 9.85114780197673e-06, "loss": 0.06590436398983002, "memory(GiB)": 21.32, "step": 3817, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.12402949680018192, "grad_norm": 0.6962164044380188, "learning_rate": 9.851017682461451e-06, "loss": 0.061173513531684875, "memory(GiB)": 21.32, "step": 3818, "token_acc": 0.9710144927536232, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.12406198226293734, "grad_norm": 0.621004045009613, "learning_rate": 9.85088750695897e-06, "loss": 0.06317827105522156, "memory(GiB)": 21.32, "step": 3819, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.12409446772569276, "grad_norm": 0.505072832107544, "learning_rate": 9.850757275470791e-06, "loss": 0.05810628831386566, "memory(GiB)": 21.32, "step": 3820, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.12412695318844817, "grad_norm": 0.6767145395278931, "learning_rate": 9.850626987998418e-06, "loss": 0.06798574328422546, "memory(GiB)": 21.32, "step": 3821, "token_acc": 0.9529914529914529, "train_speed(iter/s)": 0.957023 }, { "epoch": 0.12415943865120359, "grad_norm": 1.6921982765197754, "learning_rate": 9.85049664454335e-06, "loss": 0.06291820108890533, "memory(GiB)": 21.32, "step": 3822, "token_acc": 0.9669117647058824, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.124191924113959, "grad_norm": 0.5318261981010437, "learning_rate": 9.850366245107098e-06, "loss": 0.05875939503312111, "memory(GiB)": 21.32, "step": 3823, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.12422440957671442, "grad_norm": 0.5075482130050659, "learning_rate": 9.850235789691163e-06, "loss": 0.0533284991979599, "memory(GiB)": 21.32, "step": 3824, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.957175 }, { "epoch": 0.12425689503946984, "grad_norm": 0.3702622950077057, "learning_rate": 9.85010527829705e-06, "loss": 0.05501549318432808, "memory(GiB)": 21.32, "step": 3825, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.12428938050222525, "grad_norm": 0.7878305912017822, "learning_rate": 9.849974710926268e-06, "loss": 0.06103391945362091, "memory(GiB)": 21.32, "step": 3826, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.12432186596498067, "grad_norm": 1.0881781578063965, "learning_rate": 9.849844087580322e-06, "loss": 0.07171599566936493, "memory(GiB)": 21.32, "step": 3827, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.957325 }, { "epoch": 0.12435435142773608, "grad_norm": 0.6608173847198486, "learning_rate": 9.84971340826072e-06, "loss": 0.0653381496667862, "memory(GiB)": 21.32, "step": 3828, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.1243868368904915, "grad_norm": 0.686830461025238, "learning_rate": 9.84958267296897e-06, "loss": 0.06099114939570427, "memory(GiB)": 21.32, "step": 3829, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.12441932235324692, "grad_norm": 1.191558599472046, "learning_rate": 9.849451881706582e-06, "loss": 0.06420983374118805, "memory(GiB)": 21.32, "step": 3830, "token_acc": 0.9669117647058824, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.12445180781600233, "grad_norm": 1.4884132146835327, "learning_rate": 9.849321034475064e-06, "loss": 0.06994232535362244, "memory(GiB)": 21.32, "step": 3831, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.957519 }, { "epoch": 0.12448429327875776, "grad_norm": 0.6531712412834167, "learning_rate": 9.849190131275927e-06, "loss": 0.05720897763967514, "memory(GiB)": 21.32, "step": 3832, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.12451677874151318, "grad_norm": 0.45880478620529175, "learning_rate": 9.849059172110683e-06, "loss": 0.05111920088529587, "memory(GiB)": 21.32, "step": 3833, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.1245492642042686, "grad_norm": 0.7755828499794006, "learning_rate": 9.84892815698084e-06, "loss": 0.062024205923080444, "memory(GiB)": 21.32, "step": 3834, "token_acc": 0.975, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.12458174966702401, "grad_norm": 0.8356560468673706, "learning_rate": 9.848797085887912e-06, "loss": 0.06924144923686981, "memory(GiB)": 21.32, "step": 3835, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957713 }, { "epoch": 0.12461423512977943, "grad_norm": 0.5153521299362183, "learning_rate": 9.848665958833412e-06, "loss": 0.05109693482518196, "memory(GiB)": 21.32, "step": 3836, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.12464672059253484, "grad_norm": 0.5369105339050293, "learning_rate": 9.848534775818854e-06, "loss": 0.06701020896434784, "memory(GiB)": 21.32, "step": 3837, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.957806 }, { "epoch": 0.12467920605529026, "grad_norm": 0.9406786561012268, "learning_rate": 9.848403536845752e-06, "loss": 0.05986111983656883, "memory(GiB)": 21.32, "step": 3838, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.12471169151804568, "grad_norm": 0.7083742022514343, "learning_rate": 9.84827224191562e-06, "loss": 0.0612579882144928, "memory(GiB)": 21.32, "step": 3839, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.12474417698080109, "grad_norm": 0.7666445374488831, "learning_rate": 9.848140891029972e-06, "loss": 0.06549811363220215, "memory(GiB)": 21.32, "step": 3840, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957951 }, { "epoch": 0.12477666244355651, "grad_norm": 0.6392012238502502, "learning_rate": 9.848009484190325e-06, "loss": 0.049676813185214996, "memory(GiB)": 21.32, "step": 3841, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.958 }, { "epoch": 0.12480914790631192, "grad_norm": 0.8248607516288757, "learning_rate": 9.847878021398198e-06, "loss": 0.06538963317871094, "memory(GiB)": 21.32, "step": 3842, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95805 }, { "epoch": 0.12484163336906734, "grad_norm": 0.9309902191162109, "learning_rate": 9.847746502655104e-06, "loss": 0.07176592946052551, "memory(GiB)": 21.32, "step": 3843, "token_acc": 0.98828125, "train_speed(iter/s)": 0.958098 }, { "epoch": 0.12487411883182276, "grad_norm": 0.7615693807601929, "learning_rate": 9.847614927962565e-06, "loss": 0.06651806086301804, "memory(GiB)": 21.32, "step": 3844, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.12490660429457817, "grad_norm": 2.1941096782684326, "learning_rate": 9.847483297322097e-06, "loss": 0.0736268013715744, "memory(GiB)": 21.32, "step": 3845, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.958194 }, { "epoch": 0.12493908975733359, "grad_norm": 0.7226533889770508, "learning_rate": 9.847351610735218e-06, "loss": 0.06335647404193878, "memory(GiB)": 21.32, "step": 3846, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.124971575220089, "grad_norm": 0.86026930809021, "learning_rate": 9.847219868203453e-06, "loss": 0.06576845794916153, "memory(GiB)": 21.32, "step": 3847, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.958292 }, { "epoch": 0.12500406068284442, "grad_norm": 0.8925585150718689, "learning_rate": 9.847088069728316e-06, "loss": 0.0749131441116333, "memory(GiB)": 21.32, "step": 3848, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.958332 }, { "epoch": 0.12503654614559984, "grad_norm": 0.7380426526069641, "learning_rate": 9.846956215311332e-06, "loss": 0.06900650262832642, "memory(GiB)": 21.32, "step": 3849, "token_acc": 0.9578059071729957, "train_speed(iter/s)": 0.958363 }, { "epoch": 0.12506903160835525, "grad_norm": 0.6403619647026062, "learning_rate": 9.846824304954022e-06, "loss": 0.0592745803296566, "memory(GiB)": 21.32, "step": 3850, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.958401 }, { "epoch": 0.12510151707111067, "grad_norm": 1.0422892570495605, "learning_rate": 9.846692338657907e-06, "loss": 0.07559347152709961, "memory(GiB)": 21.32, "step": 3851, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.958441 }, { "epoch": 0.12513400253386608, "grad_norm": 0.9885000586509705, "learning_rate": 9.846560316424515e-06, "loss": 0.07576695084571838, "memory(GiB)": 21.32, "step": 3852, "token_acc": 0.9764150943396226, "train_speed(iter/s)": 0.95848 }, { "epoch": 0.1251664879966215, "grad_norm": 0.6416998505592346, "learning_rate": 9.846428238255363e-06, "loss": 0.06606708467006683, "memory(GiB)": 21.32, "step": 3853, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.958515 }, { "epoch": 0.12519897345937692, "grad_norm": 1.3188703060150146, "learning_rate": 9.84629610415198e-06, "loss": 0.06094275042414665, "memory(GiB)": 21.32, "step": 3854, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.958547 }, { "epoch": 0.12523145892213233, "grad_norm": 0.9104871153831482, "learning_rate": 9.84616391411589e-06, "loss": 0.06704768538475037, "memory(GiB)": 21.32, "step": 3855, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.958579 }, { "epoch": 0.12526394438488778, "grad_norm": 0.5884701609611511, "learning_rate": 9.846031668148617e-06, "loss": 0.05877329036593437, "memory(GiB)": 21.32, "step": 3856, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.958618 }, { "epoch": 0.1252964298476432, "grad_norm": 1.2161977291107178, "learning_rate": 9.84589936625169e-06, "loss": 0.07029477506875992, "memory(GiB)": 21.32, "step": 3857, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.958653 }, { "epoch": 0.1253289153103986, "grad_norm": 0.5539444088935852, "learning_rate": 9.845767008426634e-06, "loss": 0.07144667208194733, "memory(GiB)": 21.32, "step": 3858, "token_acc": 0.96, "train_speed(iter/s)": 0.958688 }, { "epoch": 0.12536140077315402, "grad_norm": 0.7231180667877197, "learning_rate": 9.845634594674975e-06, "loss": 0.0674000233411789, "memory(GiB)": 21.32, "step": 3859, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.958719 }, { "epoch": 0.12539388623590944, "grad_norm": 0.7962886095046997, "learning_rate": 9.845502124998246e-06, "loss": 0.0637846365571022, "memory(GiB)": 21.32, "step": 3860, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.958749 }, { "epoch": 0.12542637169866486, "grad_norm": 0.5910663604736328, "learning_rate": 9.845369599397972e-06, "loss": 0.06893609464168549, "memory(GiB)": 21.32, "step": 3861, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.958783 }, { "epoch": 0.12545885716142027, "grad_norm": 0.48282763361930847, "learning_rate": 9.845237017875683e-06, "loss": 0.059991441667079926, "memory(GiB)": 21.32, "step": 3862, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.95881 }, { "epoch": 0.1254913426241757, "grad_norm": 1.3344532251358032, "learning_rate": 9.845104380432911e-06, "loss": 0.06271668523550034, "memory(GiB)": 21.32, "step": 3863, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.958841 }, { "epoch": 0.1255238280869311, "grad_norm": 0.45336559414863586, "learning_rate": 9.844971687071184e-06, "loss": 0.05356275290250778, "memory(GiB)": 21.32, "step": 3864, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.958875 }, { "epoch": 0.12555631354968652, "grad_norm": 0.563084065914154, "learning_rate": 9.844838937792036e-06, "loss": 0.0678747370839119, "memory(GiB)": 21.32, "step": 3865, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.958899 }, { "epoch": 0.12558879901244194, "grad_norm": 0.5810918807983398, "learning_rate": 9.844706132596998e-06, "loss": 0.05383920297026634, "memory(GiB)": 21.32, "step": 3866, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.958933 }, { "epoch": 0.12562128447519735, "grad_norm": 2.240856885910034, "learning_rate": 9.844573271487604e-06, "loss": 0.04798002541065216, "memory(GiB)": 21.32, "step": 3867, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.958974 }, { "epoch": 0.12565376993795277, "grad_norm": 0.7570489645004272, "learning_rate": 9.844440354465385e-06, "loss": 0.06061749532818794, "memory(GiB)": 21.32, "step": 3868, "token_acc": 0.9681978798586572, "train_speed(iter/s)": 0.959013 }, { "epoch": 0.12568625540070819, "grad_norm": 0.528660237789154, "learning_rate": 9.844307381531877e-06, "loss": 0.06739790737628937, "memory(GiB)": 21.32, "step": 3869, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.959054 }, { "epoch": 0.1257187408634636, "grad_norm": 1.4831854104995728, "learning_rate": 9.844174352688614e-06, "loss": 0.056885071098804474, "memory(GiB)": 21.32, "step": 3870, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.959081 }, { "epoch": 0.12575122632621902, "grad_norm": 0.690701425075531, "learning_rate": 9.844041267937131e-06, "loss": 0.05381038039922714, "memory(GiB)": 21.32, "step": 3871, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.959119 }, { "epoch": 0.12578371178897443, "grad_norm": 0.6757046580314636, "learning_rate": 9.843908127278965e-06, "loss": 0.0564022958278656, "memory(GiB)": 21.32, "step": 3872, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.959156 }, { "epoch": 0.12581619725172985, "grad_norm": 0.6845890879631042, "learning_rate": 9.843774930715651e-06, "loss": 0.05757581442594528, "memory(GiB)": 21.32, "step": 3873, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.959198 }, { "epoch": 0.12584868271448527, "grad_norm": 0.5729680061340332, "learning_rate": 9.84364167824873e-06, "loss": 0.0581628791987896, "memory(GiB)": 21.32, "step": 3874, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.95924 }, { "epoch": 0.12588116817724068, "grad_norm": 0.7056832909584045, "learning_rate": 9.843508369879734e-06, "loss": 0.06037003546953201, "memory(GiB)": 21.32, "step": 3875, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.959289 }, { "epoch": 0.1259136536399961, "grad_norm": 0.9128603935241699, "learning_rate": 9.843375005610205e-06, "loss": 0.07339473068714142, "memory(GiB)": 21.32, "step": 3876, "token_acc": 0.9621848739495799, "train_speed(iter/s)": 0.959336 }, { "epoch": 0.12594613910275151, "grad_norm": 0.5977432727813721, "learning_rate": 9.843241585441683e-06, "loss": 0.052465055137872696, "memory(GiB)": 21.32, "step": 3877, "token_acc": 0.975, "train_speed(iter/s)": 0.959382 }, { "epoch": 0.12597862456550693, "grad_norm": 0.6282562017440796, "learning_rate": 9.843108109375707e-06, "loss": 0.05992140993475914, "memory(GiB)": 21.32, "step": 3878, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.959428 }, { "epoch": 0.12601111002826235, "grad_norm": 0.7889771461486816, "learning_rate": 9.842974577413816e-06, "loss": 0.06705091148614883, "memory(GiB)": 21.32, "step": 3879, "token_acc": 0.9769736842105263, "train_speed(iter/s)": 0.959471 }, { "epoch": 0.12604359549101776, "grad_norm": 0.7781386971473694, "learning_rate": 9.842840989557553e-06, "loss": 0.0689271092414856, "memory(GiB)": 21.32, "step": 3880, "token_acc": 0.9622641509433962, "train_speed(iter/s)": 0.95952 }, { "epoch": 0.12607608095377318, "grad_norm": 0.6298375725746155, "learning_rate": 9.84270734580846e-06, "loss": 0.0665719285607338, "memory(GiB)": 21.32, "step": 3881, "token_acc": 0.972, "train_speed(iter/s)": 0.95957 }, { "epoch": 0.1261085664165286, "grad_norm": 0.5815452933311462, "learning_rate": 9.842573646168078e-06, "loss": 0.05680207535624504, "memory(GiB)": 21.32, "step": 3882, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.959619 }, { "epoch": 0.126141051879284, "grad_norm": 0.6437863707542419, "learning_rate": 9.84243989063795e-06, "loss": 0.062349822372198105, "memory(GiB)": 21.32, "step": 3883, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.959668 }, { "epoch": 0.12617353734203943, "grad_norm": 0.5180211067199707, "learning_rate": 9.84230607921962e-06, "loss": 0.06333506107330322, "memory(GiB)": 21.32, "step": 3884, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.959717 }, { "epoch": 0.12620602280479484, "grad_norm": 0.6168135404586792, "learning_rate": 9.842172211914635e-06, "loss": 0.05306867137551308, "memory(GiB)": 21.32, "step": 3885, "token_acc": 0.9719101123595506, "train_speed(iter/s)": 0.959765 }, { "epoch": 0.12623850826755026, "grad_norm": 0.7108771800994873, "learning_rate": 9.842038288724537e-06, "loss": 0.0633646696805954, "memory(GiB)": 21.32, "step": 3886, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.959813 }, { "epoch": 0.12627099373030568, "grad_norm": 0.5118854641914368, "learning_rate": 9.841904309650872e-06, "loss": 0.055759139358997345, "memory(GiB)": 21.32, "step": 3887, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.959862 }, { "epoch": 0.12630347919306112, "grad_norm": 0.6875316500663757, "learning_rate": 9.841770274695186e-06, "loss": 0.06130874156951904, "memory(GiB)": 21.32, "step": 3888, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.959907 }, { "epoch": 0.12633596465581654, "grad_norm": 0.4473199248313904, "learning_rate": 9.841636183859027e-06, "loss": 0.055980853736400604, "memory(GiB)": 21.32, "step": 3889, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.959955 }, { "epoch": 0.12636845011857195, "grad_norm": 0.421581894159317, "learning_rate": 9.841502037143942e-06, "loss": 0.05759625881910324, "memory(GiB)": 21.32, "step": 3890, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.960001 }, { "epoch": 0.12640093558132737, "grad_norm": 0.4745595157146454, "learning_rate": 9.841367834551481e-06, "loss": 0.06329892575740814, "memory(GiB)": 21.32, "step": 3891, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.960051 }, { "epoch": 0.12643342104408278, "grad_norm": 1.1354471445083618, "learning_rate": 9.84123357608319e-06, "loss": 0.0672731101512909, "memory(GiB)": 21.32, "step": 3892, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.960099 }, { "epoch": 0.1264659065068382, "grad_norm": 0.6242989301681519, "learning_rate": 9.841099261740621e-06, "loss": 0.05359157174825668, "memory(GiB)": 21.32, "step": 3893, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.960148 }, { "epoch": 0.12649839196959362, "grad_norm": 0.4735110402107239, "learning_rate": 9.840964891525322e-06, "loss": 0.055924057960510254, "memory(GiB)": 21.32, "step": 3894, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.960194 }, { "epoch": 0.12653087743234903, "grad_norm": 0.4573725461959839, "learning_rate": 9.840830465438844e-06, "loss": 0.05568403750658035, "memory(GiB)": 21.32, "step": 3895, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.960244 }, { "epoch": 0.12656336289510445, "grad_norm": 0.5397756099700928, "learning_rate": 9.84069598348274e-06, "loss": 0.05565917491912842, "memory(GiB)": 21.32, "step": 3896, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.960292 }, { "epoch": 0.12659584835785986, "grad_norm": 0.6376964449882507, "learning_rate": 9.840561445658562e-06, "loss": 0.06170176714658737, "memory(GiB)": 21.32, "step": 3897, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.960342 }, { "epoch": 0.12662833382061528, "grad_norm": 0.6476777195930481, "learning_rate": 9.840426851967862e-06, "loss": 0.056084562093019485, "memory(GiB)": 21.32, "step": 3898, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.960391 }, { "epoch": 0.1266608192833707, "grad_norm": 0.5824583172798157, "learning_rate": 9.840292202412193e-06, "loss": 0.06542612612247467, "memory(GiB)": 21.32, "step": 3899, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.960441 }, { "epoch": 0.1266933047461261, "grad_norm": 0.7972663044929504, "learning_rate": 9.84015749699311e-06, "loss": 0.07354247570037842, "memory(GiB)": 21.32, "step": 3900, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.960488 }, { "epoch": 0.12672579020888153, "grad_norm": 0.584242045879364, "learning_rate": 9.84002273571217e-06, "loss": 0.06340010464191437, "memory(GiB)": 21.32, "step": 3901, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.960537 }, { "epoch": 0.12675827567163694, "grad_norm": 0.6884281635284424, "learning_rate": 9.83988791857092e-06, "loss": 0.06155751273036003, "memory(GiB)": 21.32, "step": 3902, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.960582 }, { "epoch": 0.12679076113439236, "grad_norm": 0.48494747281074524, "learning_rate": 9.839753045570925e-06, "loss": 0.058968789875507355, "memory(GiB)": 21.32, "step": 3903, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.960631 }, { "epoch": 0.12682324659714778, "grad_norm": 0.49781349301338196, "learning_rate": 9.839618116713738e-06, "loss": 0.057974476367235184, "memory(GiB)": 21.32, "step": 3904, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.96068 }, { "epoch": 0.1268557320599032, "grad_norm": 0.5829195976257324, "learning_rate": 9.839483132000916e-06, "loss": 0.07110144197940826, "memory(GiB)": 21.32, "step": 3905, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.960716 }, { "epoch": 0.1268882175226586, "grad_norm": 0.6248511672019958, "learning_rate": 9.839348091434017e-06, "loss": 0.06818962097167969, "memory(GiB)": 21.32, "step": 3906, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.960749 }, { "epoch": 0.12692070298541402, "grad_norm": 0.538590669631958, "learning_rate": 9.839212995014599e-06, "loss": 0.06354432553052902, "memory(GiB)": 21.32, "step": 3907, "token_acc": 0.967032967032967, "train_speed(iter/s)": 0.960787 }, { "epoch": 0.12695318844816944, "grad_norm": 0.5117599964141846, "learning_rate": 9.839077842744225e-06, "loss": 0.06086563318967819, "memory(GiB)": 21.32, "step": 3908, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.960825 }, { "epoch": 0.12698567391092486, "grad_norm": 0.5801865458488464, "learning_rate": 9.83894263462445e-06, "loss": 0.07227368652820587, "memory(GiB)": 21.32, "step": 3909, "token_acc": 0.9735849056603774, "train_speed(iter/s)": 0.960861 }, { "epoch": 0.12701815937368027, "grad_norm": 0.6680653095245361, "learning_rate": 9.838807370656836e-06, "loss": 0.06661295890808105, "memory(GiB)": 21.32, "step": 3910, "token_acc": 0.9735099337748344, "train_speed(iter/s)": 0.9609 }, { "epoch": 0.1270506448364357, "grad_norm": 0.4610544741153717, "learning_rate": 9.838672050842946e-06, "loss": 0.0488235279917717, "memory(GiB)": 21.32, "step": 3911, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.960939 }, { "epoch": 0.1270831302991911, "grad_norm": 0.5993931293487549, "learning_rate": 9.838536675184337e-06, "loss": 0.05744905397295952, "memory(GiB)": 21.32, "step": 3912, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.96097 }, { "epoch": 0.12711561576194652, "grad_norm": 0.641990065574646, "learning_rate": 9.838401243682578e-06, "loss": 0.059439826756715775, "memory(GiB)": 21.32, "step": 3913, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.961006 }, { "epoch": 0.12714810122470194, "grad_norm": 0.8827187418937683, "learning_rate": 9.838265756339228e-06, "loss": 0.06888926774263382, "memory(GiB)": 21.32, "step": 3914, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.961033 }, { "epoch": 0.12718058668745735, "grad_norm": 0.6434441208839417, "learning_rate": 9.838130213155852e-06, "loss": 0.07016220688819885, "memory(GiB)": 21.32, "step": 3915, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.961065 }, { "epoch": 0.12721307215021277, "grad_norm": 0.7212536334991455, "learning_rate": 9.837994614134011e-06, "loss": 0.059179455041885376, "memory(GiB)": 21.32, "step": 3916, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.961097 }, { "epoch": 0.12724555761296819, "grad_norm": 0.6542108654975891, "learning_rate": 9.837858959275275e-06, "loss": 0.06752634048461914, "memory(GiB)": 21.32, "step": 3917, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.961129 }, { "epoch": 0.1272780430757236, "grad_norm": 0.45075172185897827, "learning_rate": 9.837723248581207e-06, "loss": 0.0520968995988369, "memory(GiB)": 21.32, "step": 3918, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.961163 }, { "epoch": 0.12731052853847902, "grad_norm": 0.8000158071517944, "learning_rate": 9.837587482053374e-06, "loss": 0.07208742946386337, "memory(GiB)": 21.32, "step": 3919, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.961194 }, { "epoch": 0.12734301400123446, "grad_norm": 0.5980544090270996, "learning_rate": 9.837451659693343e-06, "loss": 0.05803123861551285, "memory(GiB)": 21.32, "step": 3920, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.961226 }, { "epoch": 0.12737549946398988, "grad_norm": 0.787094235420227, "learning_rate": 9.837315781502681e-06, "loss": 0.06932581961154938, "memory(GiB)": 21.32, "step": 3921, "token_acc": 0.9572649572649573, "train_speed(iter/s)": 0.961259 }, { "epoch": 0.1274079849267453, "grad_norm": 0.7579546570777893, "learning_rate": 9.837179847482956e-06, "loss": 0.06516812741756439, "memory(GiB)": 21.32, "step": 3922, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.961289 }, { "epoch": 0.1274404703895007, "grad_norm": 0.7375808358192444, "learning_rate": 9.837043857635738e-06, "loss": 0.06035845726728439, "memory(GiB)": 21.32, "step": 3923, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.961319 }, { "epoch": 0.12747295585225613, "grad_norm": 0.5414963364601135, "learning_rate": 9.836907811962595e-06, "loss": 0.056771960109472275, "memory(GiB)": 21.32, "step": 3924, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.961352 }, { "epoch": 0.12750544131501154, "grad_norm": 0.5776559710502625, "learning_rate": 9.836771710465098e-06, "loss": 0.06613749265670776, "memory(GiB)": 21.32, "step": 3925, "token_acc": 0.9585253456221198, "train_speed(iter/s)": 0.961389 }, { "epoch": 0.12753792677776696, "grad_norm": 0.6819120049476624, "learning_rate": 9.836635553144818e-06, "loss": 0.0640355572104454, "memory(GiB)": 21.32, "step": 3926, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.961427 }, { "epoch": 0.12757041224052237, "grad_norm": 0.8734679222106934, "learning_rate": 9.836499340003327e-06, "loss": 0.06758563965559006, "memory(GiB)": 21.32, "step": 3927, "token_acc": 0.9608695652173913, "train_speed(iter/s)": 0.961463 }, { "epoch": 0.1276028977032778, "grad_norm": 0.5643404722213745, "learning_rate": 9.836363071042195e-06, "loss": 0.06817968934774399, "memory(GiB)": 21.32, "step": 3928, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.961499 }, { "epoch": 0.1276353831660332, "grad_norm": 0.7374950647354126, "learning_rate": 9.836226746262996e-06, "loss": 0.06412092596292496, "memory(GiB)": 21.32, "step": 3929, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.961537 }, { "epoch": 0.12766786862878862, "grad_norm": 0.6097529530525208, "learning_rate": 9.836090365667303e-06, "loss": 0.07014372944831848, "memory(GiB)": 21.32, "step": 3930, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.961575 }, { "epoch": 0.12770035409154404, "grad_norm": 0.697564423084259, "learning_rate": 9.835953929256692e-06, "loss": 0.07534149289131165, "memory(GiB)": 21.32, "step": 3931, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.961612 }, { "epoch": 0.12773283955429945, "grad_norm": 1.9147554636001587, "learning_rate": 9.835817437032734e-06, "loss": 0.06715935468673706, "memory(GiB)": 21.32, "step": 3932, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.961645 }, { "epoch": 0.12776532501705487, "grad_norm": 0.6398582458496094, "learning_rate": 9.835680888997008e-06, "loss": 0.05662553757429123, "memory(GiB)": 21.32, "step": 3933, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.961682 }, { "epoch": 0.1277978104798103, "grad_norm": 0.5168637633323669, "learning_rate": 9.835544285151087e-06, "loss": 0.06872106343507767, "memory(GiB)": 21.32, "step": 3934, "token_acc": 0.976, "train_speed(iter/s)": 0.961718 }, { "epoch": 0.1278302959425657, "grad_norm": 0.5788394212722778, "learning_rate": 9.835407625496548e-06, "loss": 0.06666259467601776, "memory(GiB)": 21.32, "step": 3935, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.961757 }, { "epoch": 0.12786278140532112, "grad_norm": 0.42688116431236267, "learning_rate": 9.835270910034973e-06, "loss": 0.06390991806983948, "memory(GiB)": 21.32, "step": 3936, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.961796 }, { "epoch": 0.12789526686807653, "grad_norm": 0.6846749186515808, "learning_rate": 9.835134138767932e-06, "loss": 0.06863871216773987, "memory(GiB)": 21.32, "step": 3937, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.961843 }, { "epoch": 0.12792775233083195, "grad_norm": 0.8200029730796814, "learning_rate": 9.83499731169701e-06, "loss": 0.07787041366100311, "memory(GiB)": 21.32, "step": 3938, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.961891 }, { "epoch": 0.12796023779358737, "grad_norm": 0.4858880937099457, "learning_rate": 9.834860428823782e-06, "loss": 0.058595962822437286, "memory(GiB)": 21.32, "step": 3939, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.961939 }, { "epoch": 0.12799272325634278, "grad_norm": 0.5461555123329163, "learning_rate": 9.834723490149832e-06, "loss": 0.05303206294775009, "memory(GiB)": 21.32, "step": 3940, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.961985 }, { "epoch": 0.1280252087190982, "grad_norm": 0.5417190194129944, "learning_rate": 9.834586495676734e-06, "loss": 0.05861726030707359, "memory(GiB)": 21.32, "step": 3941, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.962033 }, { "epoch": 0.12805769418185362, "grad_norm": 0.6582865715026855, "learning_rate": 9.834449445406076e-06, "loss": 0.06730733811855316, "memory(GiB)": 21.32, "step": 3942, "token_acc": 0.947565543071161, "train_speed(iter/s)": 0.962081 }, { "epoch": 0.12809017964460903, "grad_norm": 0.6113579869270325, "learning_rate": 9.834312339339437e-06, "loss": 0.06604388356208801, "memory(GiB)": 21.32, "step": 3943, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 0.962129 }, { "epoch": 0.12812266510736445, "grad_norm": 0.6662088632583618, "learning_rate": 9.834175177478396e-06, "loss": 0.06411789357662201, "memory(GiB)": 21.32, "step": 3944, "token_acc": 0.9686411149825784, "train_speed(iter/s)": 0.962174 }, { "epoch": 0.12815515057011986, "grad_norm": 0.502491295337677, "learning_rate": 9.834037959824541e-06, "loss": 0.058571018278598785, "memory(GiB)": 21.32, "step": 3945, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.962221 }, { "epoch": 0.12818763603287528, "grad_norm": 0.7917541265487671, "learning_rate": 9.833900686379455e-06, "loss": 0.06470920145511627, "memory(GiB)": 21.32, "step": 3946, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.962267 }, { "epoch": 0.1282201214956307, "grad_norm": 0.725243330001831, "learning_rate": 9.83376335714472e-06, "loss": 0.074648916721344, "memory(GiB)": 21.32, "step": 3947, "token_acc": 0.9711191335740073, "train_speed(iter/s)": 0.962314 }, { "epoch": 0.1282526069583861, "grad_norm": 0.509549081325531, "learning_rate": 9.833625972121923e-06, "loss": 0.06404619663953781, "memory(GiB)": 21.32, "step": 3948, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.962362 }, { "epoch": 0.12828509242114153, "grad_norm": 0.5916590690612793, "learning_rate": 9.833488531312647e-06, "loss": 0.0657249167561531, "memory(GiB)": 21.32, "step": 3949, "token_acc": 0.9651162790697675, "train_speed(iter/s)": 0.962409 }, { "epoch": 0.12831757788389694, "grad_norm": 1.0206496715545654, "learning_rate": 9.833351034718482e-06, "loss": 0.06455881893634796, "memory(GiB)": 21.32, "step": 3950, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.962454 }, { "epoch": 0.12835006334665236, "grad_norm": 0.5414291024208069, "learning_rate": 9.833213482341011e-06, "loss": 0.06192699074745178, "memory(GiB)": 21.32, "step": 3951, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.962501 }, { "epoch": 0.1283825488094078, "grad_norm": 0.410780131816864, "learning_rate": 9.833075874181826e-06, "loss": 0.042986366897821426, "memory(GiB)": 21.32, "step": 3952, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.962547 }, { "epoch": 0.12841503427216322, "grad_norm": 0.7622890472412109, "learning_rate": 9.83293821024251e-06, "loss": 0.06660394370555878, "memory(GiB)": 21.32, "step": 3953, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.962594 }, { "epoch": 0.12844751973491864, "grad_norm": 0.6772772669792175, "learning_rate": 9.832800490524654e-06, "loss": 0.06490428745746613, "memory(GiB)": 21.32, "step": 3954, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.962636 }, { "epoch": 0.12848000519767405, "grad_norm": 0.8041636943817139, "learning_rate": 9.83266271502985e-06, "loss": 0.06994014978408813, "memory(GiB)": 21.32, "step": 3955, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.962682 }, { "epoch": 0.12851249066042947, "grad_norm": 0.6559586524963379, "learning_rate": 9.832524883759686e-06, "loss": 0.06776966154575348, "memory(GiB)": 21.32, "step": 3956, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.962729 }, { "epoch": 0.12854497612318488, "grad_norm": 1.9829978942871094, "learning_rate": 9.832386996715752e-06, "loss": 0.0709577426314354, "memory(GiB)": 21.32, "step": 3957, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.962771 }, { "epoch": 0.1285774615859403, "grad_norm": 0.6966639161109924, "learning_rate": 9.83224905389964e-06, "loss": 0.06754587590694427, "memory(GiB)": 21.32, "step": 3958, "token_acc": 0.9735849056603774, "train_speed(iter/s)": 0.962814 }, { "epoch": 0.12860994704869572, "grad_norm": 0.9875180721282959, "learning_rate": 9.832111055312942e-06, "loss": 0.07085921615362167, "memory(GiB)": 21.32, "step": 3959, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.962858 }, { "epoch": 0.12864243251145113, "grad_norm": 0.4467373192310333, "learning_rate": 9.831973000957252e-06, "loss": 0.049206122756004333, "memory(GiB)": 21.32, "step": 3960, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.962904 }, { "epoch": 0.12867491797420655, "grad_norm": 0.6196339130401611, "learning_rate": 9.831834890834162e-06, "loss": 0.05483021214604378, "memory(GiB)": 21.32, "step": 3961, "token_acc": 0.97, "train_speed(iter/s)": 0.962943 }, { "epoch": 0.12870740343696196, "grad_norm": 0.3998045325279236, "learning_rate": 9.831696724945266e-06, "loss": 0.052857883274555206, "memory(GiB)": 21.32, "step": 3962, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.962976 }, { "epoch": 0.12873988889971738, "grad_norm": 0.690514087677002, "learning_rate": 9.831558503292159e-06, "loss": 0.06954728811979294, "memory(GiB)": 21.32, "step": 3963, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.963016 }, { "epoch": 0.1287723743624728, "grad_norm": 0.9966840744018555, "learning_rate": 9.831420225876433e-06, "loss": 0.07385484129190445, "memory(GiB)": 21.32, "step": 3964, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.963053 }, { "epoch": 0.1288048598252282, "grad_norm": 0.8165311813354492, "learning_rate": 9.831281892699692e-06, "loss": 0.07247328013181686, "memory(GiB)": 21.32, "step": 3965, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.96309 }, { "epoch": 0.12883734528798363, "grad_norm": 0.5753512978553772, "learning_rate": 9.831143503763525e-06, "loss": 0.0530271977186203, "memory(GiB)": 21.32, "step": 3966, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.963125 }, { "epoch": 0.12886983075073905, "grad_norm": 0.5433221459388733, "learning_rate": 9.831005059069531e-06, "loss": 0.0635790228843689, "memory(GiB)": 21.32, "step": 3967, "token_acc": 0.9626556016597511, "train_speed(iter/s)": 0.963161 }, { "epoch": 0.12890231621349446, "grad_norm": 0.6544327735900879, "learning_rate": 9.83086655861931e-06, "loss": 0.06335503607988358, "memory(GiB)": 21.32, "step": 3968, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.963199 }, { "epoch": 0.12893480167624988, "grad_norm": 0.5881898999214172, "learning_rate": 9.830728002414458e-06, "loss": 0.06143989413976669, "memory(GiB)": 21.32, "step": 3969, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.963236 }, { "epoch": 0.1289672871390053, "grad_norm": 0.5919159650802612, "learning_rate": 9.830589390456575e-06, "loss": 0.05715550482273102, "memory(GiB)": 21.32, "step": 3970, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.96327 }, { "epoch": 0.1289997726017607, "grad_norm": 2.134036064147949, "learning_rate": 9.830450722747263e-06, "loss": 0.04791945219039917, "memory(GiB)": 21.32, "step": 3971, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.963306 }, { "epoch": 0.12903225806451613, "grad_norm": 0.6422106623649597, "learning_rate": 9.830311999288117e-06, "loss": 0.05566141754388809, "memory(GiB)": 21.32, "step": 3972, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.963341 }, { "epoch": 0.12906474352727154, "grad_norm": 0.7278807163238525, "learning_rate": 9.830173220080746e-06, "loss": 0.06914345920085907, "memory(GiB)": 21.32, "step": 3973, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.96338 }, { "epoch": 0.12909722899002696, "grad_norm": 0.5537446141242981, "learning_rate": 9.830034385126744e-06, "loss": 0.05918820947408676, "memory(GiB)": 21.32, "step": 3974, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.963415 }, { "epoch": 0.12912971445278237, "grad_norm": 0.6186606884002686, "learning_rate": 9.829895494427717e-06, "loss": 0.06697585433721542, "memory(GiB)": 21.32, "step": 3975, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.963448 }, { "epoch": 0.1291621999155378, "grad_norm": 3.3144490718841553, "learning_rate": 9.829756547985267e-06, "loss": 0.054985806345939636, "memory(GiB)": 21.32, "step": 3976, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.963478 }, { "epoch": 0.1291946853782932, "grad_norm": 0.6296839714050293, "learning_rate": 9.829617545800999e-06, "loss": 0.05699142441153526, "memory(GiB)": 21.32, "step": 3977, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.96351 }, { "epoch": 0.12922717084104862, "grad_norm": 0.7807340025901794, "learning_rate": 9.829478487876515e-06, "loss": 0.05157310515642166, "memory(GiB)": 21.32, "step": 3978, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.96354 }, { "epoch": 0.12925965630380404, "grad_norm": 0.7637376189231873, "learning_rate": 9.829339374213421e-06, "loss": 0.06632092595100403, "memory(GiB)": 21.32, "step": 3979, "token_acc": 0.9764309764309764, "train_speed(iter/s)": 0.963572 }, { "epoch": 0.12929214176655945, "grad_norm": 0.5412774682044983, "learning_rate": 9.829200204813325e-06, "loss": 0.07487000524997711, "memory(GiB)": 21.32, "step": 3980, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.963605 }, { "epoch": 0.12932462722931487, "grad_norm": 0.9713160991668701, "learning_rate": 9.829060979677828e-06, "loss": 0.0739632248878479, "memory(GiB)": 21.32, "step": 3981, "token_acc": 0.972027972027972, "train_speed(iter/s)": 0.963641 }, { "epoch": 0.1293571126920703, "grad_norm": 1.101404070854187, "learning_rate": 9.828921698808542e-06, "loss": 0.07077157497406006, "memory(GiB)": 21.32, "step": 3982, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.963678 }, { "epoch": 0.1293895981548257, "grad_norm": 0.7250862121582031, "learning_rate": 9.828782362207072e-06, "loss": 0.04791940376162529, "memory(GiB)": 21.32, "step": 3983, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.963716 }, { "epoch": 0.12942208361758115, "grad_norm": 0.8367909789085388, "learning_rate": 9.828642969875026e-06, "loss": 0.05762515589594841, "memory(GiB)": 21.32, "step": 3984, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.963749 }, { "epoch": 0.12945456908033656, "grad_norm": 0.666826069355011, "learning_rate": 9.828503521814014e-06, "loss": 0.06575585156679153, "memory(GiB)": 21.32, "step": 3985, "token_acc": 0.9649805447470817, "train_speed(iter/s)": 0.963785 }, { "epoch": 0.12948705454309198, "grad_norm": 0.8935553431510925, "learning_rate": 9.828364018025644e-06, "loss": 0.06244710087776184, "memory(GiB)": 21.32, "step": 3986, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.963821 }, { "epoch": 0.1295195400058474, "grad_norm": 0.8411039113998413, "learning_rate": 9.828224458511526e-06, "loss": 0.06382057070732117, "memory(GiB)": 21.32, "step": 3987, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.963858 }, { "epoch": 0.1295520254686028, "grad_norm": 0.8640965223312378, "learning_rate": 9.828084843273274e-06, "loss": 0.05206109955906868, "memory(GiB)": 21.32, "step": 3988, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.963896 }, { "epoch": 0.12958451093135823, "grad_norm": 0.6030497550964355, "learning_rate": 9.827945172312495e-06, "loss": 0.06351669132709503, "memory(GiB)": 21.32, "step": 3989, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.963935 }, { "epoch": 0.12961699639411364, "grad_norm": 1.1215261220932007, "learning_rate": 9.827805445630803e-06, "loss": 0.07866033911705017, "memory(GiB)": 21.32, "step": 3990, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.963976 }, { "epoch": 0.12964948185686906, "grad_norm": 0.8539584279060364, "learning_rate": 9.827665663229808e-06, "loss": 0.06554608047008514, "memory(GiB)": 21.32, "step": 3991, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.964012 }, { "epoch": 0.12968196731962447, "grad_norm": 0.9735661745071411, "learning_rate": 9.827525825111129e-06, "loss": 0.0612436980009079, "memory(GiB)": 21.32, "step": 3992, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.964048 }, { "epoch": 0.1297144527823799, "grad_norm": 2.481658935546875, "learning_rate": 9.827385931276375e-06, "loss": 0.05613558739423752, "memory(GiB)": 21.32, "step": 3993, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.964077 }, { "epoch": 0.1297469382451353, "grad_norm": 0.6560384631156921, "learning_rate": 9.827245981727162e-06, "loss": 0.07220705598592758, "memory(GiB)": 21.32, "step": 3994, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.96411 }, { "epoch": 0.12977942370789072, "grad_norm": 0.7288079261779785, "learning_rate": 9.827105976465106e-06, "loss": 0.06407753378152847, "memory(GiB)": 21.32, "step": 3995, "token_acc": 0.9576271186440678, "train_speed(iter/s)": 0.964146 }, { "epoch": 0.12981190917064614, "grad_norm": 0.7377849817276001, "learning_rate": 9.826965915491822e-06, "loss": 0.06697813421487808, "memory(GiB)": 21.32, "step": 3996, "token_acc": 0.982078853046595, "train_speed(iter/s)": 0.964184 }, { "epoch": 0.12984439463340156, "grad_norm": 0.5504873991012573, "learning_rate": 9.826825798808926e-06, "loss": 0.07126499712467194, "memory(GiB)": 21.32, "step": 3997, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.964222 }, { "epoch": 0.12987688009615697, "grad_norm": 0.5493909120559692, "learning_rate": 9.826685626418037e-06, "loss": 0.06247617304325104, "memory(GiB)": 21.32, "step": 3998, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.964266 }, { "epoch": 0.1299093655589124, "grad_norm": 0.5910508036613464, "learning_rate": 9.82654539832077e-06, "loss": 0.07151110470294952, "memory(GiB)": 21.32, "step": 3999, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.96431 }, { "epoch": 0.1299418510216678, "grad_norm": 0.5968224406242371, "learning_rate": 9.826405114518746e-06, "loss": 0.0663340836763382, "memory(GiB)": 21.32, "step": 4000, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.964358 }, { "epoch": 0.1299418510216678, "eval_loss": 0.06223749741911888, "eval_runtime": 79.7217, "eval_samples_per_second": 124.809, "eval_steps_per_second": 3.901, "eval_token_acc": 0.9756331117849659, "step": 4000 }, { "epoch": 0.12997433648442322, "grad_norm": 1.2177067995071411, "learning_rate": 9.826264775013584e-06, "loss": 0.06104519963264465, "memory(GiB)": 21.32, "step": 4001, "token_acc": 0.9754119693711217, "train_speed(iter/s)": 0.943754 }, { "epoch": 0.13000682194717864, "grad_norm": 0.4961245059967041, "learning_rate": 9.8261243798069e-06, "loss": 0.055456846952438354, "memory(GiB)": 21.32, "step": 4002, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.943792 }, { "epoch": 0.13003930740993405, "grad_norm": 0.9559094309806824, "learning_rate": 9.82598392890032e-06, "loss": 0.06248275190591812, "memory(GiB)": 21.32, "step": 4003, "token_acc": 0.96, "train_speed(iter/s)": 0.943832 }, { "epoch": 0.13007179287268947, "grad_norm": 0.5865750312805176, "learning_rate": 9.82584342229546e-06, "loss": 0.07100705057382584, "memory(GiB)": 21.32, "step": 4004, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.943874 }, { "epoch": 0.13010427833544488, "grad_norm": 0.9916481375694275, "learning_rate": 9.825702859993944e-06, "loss": 0.06179659068584442, "memory(GiB)": 21.32, "step": 4005, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.943925 }, { "epoch": 0.1301367637982003, "grad_norm": 0.5506094098091125, "learning_rate": 9.825562241997395e-06, "loss": 0.0664927288889885, "memory(GiB)": 21.32, "step": 4006, "token_acc": 0.947136563876652, "train_speed(iter/s)": 0.943972 }, { "epoch": 0.13016924926095572, "grad_norm": 0.5424831509590149, "learning_rate": 9.825421568307435e-06, "loss": 0.05478701367974281, "memory(GiB)": 21.32, "step": 4007, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.944019 }, { "epoch": 0.13020173472371113, "grad_norm": 0.6987758278846741, "learning_rate": 9.825280838925686e-06, "loss": 0.06147041916847229, "memory(GiB)": 21.32, "step": 4008, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.944069 }, { "epoch": 0.13023422018646655, "grad_norm": 0.6060187816619873, "learning_rate": 9.825140053853775e-06, "loss": 0.0578155443072319, "memory(GiB)": 21.32, "step": 4009, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.944106 }, { "epoch": 0.13026670564922196, "grad_norm": 0.6002838611602783, "learning_rate": 9.824999213093326e-06, "loss": 0.05902032181620598, "memory(GiB)": 21.32, "step": 4010, "token_acc": 0.975, "train_speed(iter/s)": 0.94415 }, { "epoch": 0.13029919111197738, "grad_norm": 0.6937460899353027, "learning_rate": 9.824858316645962e-06, "loss": 0.0625995397567749, "memory(GiB)": 21.32, "step": 4011, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.944192 }, { "epoch": 0.1303316765747328, "grad_norm": 0.8308246731758118, "learning_rate": 9.824717364513313e-06, "loss": 0.06917157024145126, "memory(GiB)": 21.32, "step": 4012, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.944233 }, { "epoch": 0.1303641620374882, "grad_norm": 2.6810760498046875, "learning_rate": 9.824576356697002e-06, "loss": 0.06693155318498611, "memory(GiB)": 21.32, "step": 4013, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.944271 }, { "epoch": 0.13039664750024363, "grad_norm": 0.9062009453773499, "learning_rate": 9.82443529319866e-06, "loss": 0.08335262537002563, "memory(GiB)": 21.32, "step": 4014, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.944312 }, { "epoch": 0.13042913296299904, "grad_norm": 0.6031440496444702, "learning_rate": 9.824294174019914e-06, "loss": 0.05381445586681366, "memory(GiB)": 21.32, "step": 4015, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.944354 }, { "epoch": 0.1304616184257545, "grad_norm": 0.5486615896224976, "learning_rate": 9.82415299916239e-06, "loss": 0.05732050910592079, "memory(GiB)": 21.32, "step": 4016, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.944395 }, { "epoch": 0.1304941038885099, "grad_norm": 0.7027217149734497, "learning_rate": 9.824011768627722e-06, "loss": 0.06290806084871292, "memory(GiB)": 21.32, "step": 4017, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.944437 }, { "epoch": 0.13052658935126532, "grad_norm": 0.6368317604064941, "learning_rate": 9.823870482417536e-06, "loss": 0.0704304650425911, "memory(GiB)": 21.32, "step": 4018, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.944478 }, { "epoch": 0.13055907481402074, "grad_norm": 0.8237628936767578, "learning_rate": 9.823729140533464e-06, "loss": 0.06647332012653351, "memory(GiB)": 21.32, "step": 4019, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.944519 }, { "epoch": 0.13059156027677615, "grad_norm": 0.7591681480407715, "learning_rate": 9.823587742977138e-06, "loss": 0.0628751814365387, "memory(GiB)": 21.32, "step": 4020, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.944558 }, { "epoch": 0.13062404573953157, "grad_norm": 0.7739301919937134, "learning_rate": 9.823446289750189e-06, "loss": 0.058954741805791855, "memory(GiB)": 21.32, "step": 4021, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.944591 }, { "epoch": 0.13065653120228699, "grad_norm": 0.6180160045623779, "learning_rate": 9.823304780854248e-06, "loss": 0.06150049716234207, "memory(GiB)": 21.32, "step": 4022, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.944628 }, { "epoch": 0.1306890166650424, "grad_norm": 0.5668466091156006, "learning_rate": 9.823163216290952e-06, "loss": 0.04999386519193649, "memory(GiB)": 21.32, "step": 4023, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.944668 }, { "epoch": 0.13072150212779782, "grad_norm": 0.6416101455688477, "learning_rate": 9.823021596061933e-06, "loss": 0.07000652700662613, "memory(GiB)": 21.32, "step": 4024, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.944709 }, { "epoch": 0.13075398759055323, "grad_norm": 1.2255568504333496, "learning_rate": 9.822879920168824e-06, "loss": 0.0660913810133934, "memory(GiB)": 21.32, "step": 4025, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.944743 }, { "epoch": 0.13078647305330865, "grad_norm": 1.239082932472229, "learning_rate": 9.822738188613264e-06, "loss": 0.06560829281806946, "memory(GiB)": 21.32, "step": 4026, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.944784 }, { "epoch": 0.13081895851606407, "grad_norm": 0.49958691000938416, "learning_rate": 9.822596401396883e-06, "loss": 0.05197250097990036, "memory(GiB)": 21.32, "step": 4027, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.944822 }, { "epoch": 0.13085144397881948, "grad_norm": 0.587862491607666, "learning_rate": 9.822454558521323e-06, "loss": 0.05486786738038063, "memory(GiB)": 21.32, "step": 4028, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.944858 }, { "epoch": 0.1308839294415749, "grad_norm": 0.7544484734535217, "learning_rate": 9.822312659988218e-06, "loss": 0.05803448334336281, "memory(GiB)": 21.32, "step": 4029, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.944896 }, { "epoch": 0.1309164149043303, "grad_norm": 0.6313050985336304, "learning_rate": 9.822170705799206e-06, "loss": 0.060129620134830475, "memory(GiB)": 21.32, "step": 4030, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.944875 }, { "epoch": 0.13094890036708573, "grad_norm": 0.5713645815849304, "learning_rate": 9.822028695955926e-06, "loss": 0.052815258502960205, "memory(GiB)": 21.32, "step": 4031, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.94491 }, { "epoch": 0.13098138582984115, "grad_norm": 0.5837159752845764, "learning_rate": 9.821886630460018e-06, "loss": 0.05492262542247772, "memory(GiB)": 21.32, "step": 4032, "token_acc": 0.9630996309963099, "train_speed(iter/s)": 0.944952 }, { "epoch": 0.13101387129259656, "grad_norm": 3.7761330604553223, "learning_rate": 9.821744509313118e-06, "loss": 0.05528370290994644, "memory(GiB)": 21.32, "step": 4033, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.944997 }, { "epoch": 0.13104635675535198, "grad_norm": 0.5080270171165466, "learning_rate": 9.821602332516871e-06, "loss": 0.0554887056350708, "memory(GiB)": 21.32, "step": 4034, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.945047 }, { "epoch": 0.1310788422181074, "grad_norm": 0.7378519773483276, "learning_rate": 9.821460100072914e-06, "loss": 0.0673806369304657, "memory(GiB)": 21.32, "step": 4035, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.945097 }, { "epoch": 0.1311113276808628, "grad_norm": 0.6739824414253235, "learning_rate": 9.82131781198289e-06, "loss": 0.06659664213657379, "memory(GiB)": 21.32, "step": 4036, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.945145 }, { "epoch": 0.13114381314361823, "grad_norm": 0.7272018790245056, "learning_rate": 9.821175468248441e-06, "loss": 0.06478427350521088, "memory(GiB)": 21.32, "step": 4037, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.945193 }, { "epoch": 0.13117629860637364, "grad_norm": 0.542151927947998, "learning_rate": 9.821033068871212e-06, "loss": 0.05955659598112106, "memory(GiB)": 21.32, "step": 4038, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.945242 }, { "epoch": 0.13120878406912906, "grad_norm": 0.9937703609466553, "learning_rate": 9.820890613852843e-06, "loss": 0.05841550976037979, "memory(GiB)": 21.32, "step": 4039, "token_acc": 0.9662921348314607, "train_speed(iter/s)": 0.945291 }, { "epoch": 0.13124126953188447, "grad_norm": 0.7034756541252136, "learning_rate": 9.82074810319498e-06, "loss": 0.07804352045059204, "memory(GiB)": 21.32, "step": 4040, "token_acc": 0.97265625, "train_speed(iter/s)": 0.945341 }, { "epoch": 0.1312737549946399, "grad_norm": 2.802156448364258, "learning_rate": 9.820605536899268e-06, "loss": 0.06501872837543488, "memory(GiB)": 21.32, "step": 4041, "token_acc": 0.9531914893617022, "train_speed(iter/s)": 0.945388 }, { "epoch": 0.1313062404573953, "grad_norm": 0.9655332565307617, "learning_rate": 9.820462914967352e-06, "loss": 0.06564106047153473, "memory(GiB)": 21.32, "step": 4042, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.945427 }, { "epoch": 0.13133872592015072, "grad_norm": 0.5612092614173889, "learning_rate": 9.820320237400878e-06, "loss": 0.05864090844988823, "memory(GiB)": 21.32, "step": 4043, "token_acc": 0.9753521126760564, "train_speed(iter/s)": 0.945466 }, { "epoch": 0.13137121138290614, "grad_norm": 4.440164566040039, "learning_rate": 9.820177504201492e-06, "loss": 0.0545162670314312, "memory(GiB)": 21.32, "step": 4044, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.945506 }, { "epoch": 0.13140369684566156, "grad_norm": 0.5976778864860535, "learning_rate": 9.820034715370843e-06, "loss": 0.05747254937887192, "memory(GiB)": 21.32, "step": 4045, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.945545 }, { "epoch": 0.13143618230841697, "grad_norm": 1.2709100246429443, "learning_rate": 9.819891870910577e-06, "loss": 0.06065132096409798, "memory(GiB)": 21.32, "step": 4046, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.945585 }, { "epoch": 0.1314686677711724, "grad_norm": 0.61599200963974, "learning_rate": 9.819748970822343e-06, "loss": 0.05811529606580734, "memory(GiB)": 21.32, "step": 4047, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.945624 }, { "epoch": 0.13150115323392783, "grad_norm": 0.6413873434066772, "learning_rate": 9.819606015107791e-06, "loss": 0.060012899339199066, "memory(GiB)": 21.32, "step": 4048, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.945661 }, { "epoch": 0.13153363869668325, "grad_norm": 0.7939845323562622, "learning_rate": 9.819463003768572e-06, "loss": 0.06511611491441727, "memory(GiB)": 21.32, "step": 4049, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.945696 }, { "epoch": 0.13156612415943866, "grad_norm": 0.6571639180183411, "learning_rate": 9.819319936806333e-06, "loss": 0.05260607600212097, "memory(GiB)": 21.32, "step": 4050, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.945732 }, { "epoch": 0.13159860962219408, "grad_norm": 0.4524626135826111, "learning_rate": 9.81917681422273e-06, "loss": 0.04254285618662834, "memory(GiB)": 21.32, "step": 4051, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.945766 }, { "epoch": 0.1316310950849495, "grad_norm": 0.730825662612915, "learning_rate": 9.81903363601941e-06, "loss": 0.06847168505191803, "memory(GiB)": 21.32, "step": 4052, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.945802 }, { "epoch": 0.1316635805477049, "grad_norm": 0.6762784123420715, "learning_rate": 9.81889040219803e-06, "loss": 0.05875594541430473, "memory(GiB)": 21.32, "step": 4053, "token_acc": 0.984, "train_speed(iter/s)": 0.945837 }, { "epoch": 0.13169606601046033, "grad_norm": 0.8801295161247253, "learning_rate": 9.818747112760238e-06, "loss": 0.061275873333215714, "memory(GiB)": 21.32, "step": 4054, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.945872 }, { "epoch": 0.13172855147321574, "grad_norm": 0.6952117681503296, "learning_rate": 9.818603767707693e-06, "loss": 0.07536791265010834, "memory(GiB)": 21.32, "step": 4055, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.94591 }, { "epoch": 0.13176103693597116, "grad_norm": 0.8406147360801697, "learning_rate": 9.818460367042046e-06, "loss": 0.06621486693620682, "memory(GiB)": 21.32, "step": 4056, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.945945 }, { "epoch": 0.13179352239872658, "grad_norm": 0.9098861217498779, "learning_rate": 9.818316910764954e-06, "loss": 0.05640788376331329, "memory(GiB)": 21.32, "step": 4057, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.945983 }, { "epoch": 0.131826007861482, "grad_norm": 0.6679198741912842, "learning_rate": 9.81817339887807e-06, "loss": 0.06623752415180206, "memory(GiB)": 21.32, "step": 4058, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.946019 }, { "epoch": 0.1318584933242374, "grad_norm": 1.4901578426361084, "learning_rate": 9.818029831383054e-06, "loss": 0.06595523655414581, "memory(GiB)": 21.32, "step": 4059, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.946056 }, { "epoch": 0.13189097878699282, "grad_norm": 0.4827168583869934, "learning_rate": 9.81788620828156e-06, "loss": 0.04461658000946045, "memory(GiB)": 21.32, "step": 4060, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.946096 }, { "epoch": 0.13192346424974824, "grad_norm": 0.6376897692680359, "learning_rate": 9.817742529575245e-06, "loss": 0.05287937819957733, "memory(GiB)": 21.32, "step": 4061, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.946144 }, { "epoch": 0.13195594971250366, "grad_norm": 0.49900883436203003, "learning_rate": 9.81759879526577e-06, "loss": 0.050787679851055145, "memory(GiB)": 21.32, "step": 4062, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.946192 }, { "epoch": 0.13198843517525907, "grad_norm": 0.8447651863098145, "learning_rate": 9.817455005354794e-06, "loss": 0.07782354205846786, "memory(GiB)": 21.32, "step": 4063, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.946241 }, { "epoch": 0.1320209206380145, "grad_norm": 0.43393510580062866, "learning_rate": 9.817311159843974e-06, "loss": 0.055947139859199524, "memory(GiB)": 21.32, "step": 4064, "token_acc": 0.9627659574468085, "train_speed(iter/s)": 0.946292 }, { "epoch": 0.1320534061007699, "grad_norm": 0.6411410570144653, "learning_rate": 9.817167258734972e-06, "loss": 0.057878755033016205, "memory(GiB)": 21.32, "step": 4065, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.946344 }, { "epoch": 0.13208589156352532, "grad_norm": 0.7716807723045349, "learning_rate": 9.817023302029447e-06, "loss": 0.060068026185035706, "memory(GiB)": 21.32, "step": 4066, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.94639 }, { "epoch": 0.13211837702628074, "grad_norm": 0.6577785015106201, "learning_rate": 9.816879289729061e-06, "loss": 0.059932708740234375, "memory(GiB)": 21.32, "step": 4067, "token_acc": 0.9712918660287081, "train_speed(iter/s)": 0.94644 }, { "epoch": 0.13215086248903615, "grad_norm": 0.9478937387466431, "learning_rate": 9.816735221835479e-06, "loss": 0.07928649336099625, "memory(GiB)": 21.32, "step": 4068, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.94649 }, { "epoch": 0.13218334795179157, "grad_norm": 0.6764311790466309, "learning_rate": 9.816591098350361e-06, "loss": 0.06709852814674377, "memory(GiB)": 21.32, "step": 4069, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.946538 }, { "epoch": 0.13221583341454698, "grad_norm": 0.48187223076820374, "learning_rate": 9.816446919275371e-06, "loss": 0.05880872532725334, "memory(GiB)": 21.32, "step": 4070, "token_acc": 0.97265625, "train_speed(iter/s)": 0.946586 }, { "epoch": 0.1322483188773024, "grad_norm": 0.545112669467926, "learning_rate": 9.81630268461217e-06, "loss": 0.0600069984793663, "memory(GiB)": 21.32, "step": 4071, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.946635 }, { "epoch": 0.13228080434005782, "grad_norm": 0.6817835569381714, "learning_rate": 9.816158394362429e-06, "loss": 0.06618238985538483, "memory(GiB)": 21.32, "step": 4072, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.94668 }, { "epoch": 0.13231328980281323, "grad_norm": 0.7753115892410278, "learning_rate": 9.816014048527809e-06, "loss": 0.06964951753616333, "memory(GiB)": 21.32, "step": 4073, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.946716 }, { "epoch": 0.13234577526556865, "grad_norm": 0.6579821705818176, "learning_rate": 9.815869647109977e-06, "loss": 0.06686091423034668, "memory(GiB)": 21.32, "step": 4074, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.946756 }, { "epoch": 0.13237826072832407, "grad_norm": 0.5937767028808594, "learning_rate": 9.815725190110599e-06, "loss": 0.06059857830405235, "memory(GiB)": 21.32, "step": 4075, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.946798 }, { "epoch": 0.13241074619107948, "grad_norm": 0.5165472030639648, "learning_rate": 9.815580677531343e-06, "loss": 0.05435340851545334, "memory(GiB)": 21.32, "step": 4076, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.946837 }, { "epoch": 0.1324432316538349, "grad_norm": 0.4790644645690918, "learning_rate": 9.815436109373874e-06, "loss": 0.055046938359737396, "memory(GiB)": 21.32, "step": 4077, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.946874 }, { "epoch": 0.1324757171165903, "grad_norm": 0.6570003628730774, "learning_rate": 9.815291485639867e-06, "loss": 0.0703054741024971, "memory(GiB)": 21.32, "step": 4078, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.946912 }, { "epoch": 0.13250820257934573, "grad_norm": 0.5574851632118225, "learning_rate": 9.815146806330984e-06, "loss": 0.06300020962953568, "memory(GiB)": 21.32, "step": 4079, "token_acc": 0.9649805447470817, "train_speed(iter/s)": 0.946953 }, { "epoch": 0.13254068804210117, "grad_norm": 0.7010703086853027, "learning_rate": 9.815002071448898e-06, "loss": 0.059966374188661575, "memory(GiB)": 21.32, "step": 4080, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.946992 }, { "epoch": 0.1325731735048566, "grad_norm": 0.6173675060272217, "learning_rate": 9.81485728099528e-06, "loss": 0.052409540861845016, "memory(GiB)": 21.32, "step": 4081, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.947032 }, { "epoch": 0.132605658967612, "grad_norm": 0.45222166180610657, "learning_rate": 9.814712434971802e-06, "loss": 0.05898817628622055, "memory(GiB)": 21.32, "step": 4082, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.947059 }, { "epoch": 0.13263814443036742, "grad_norm": 0.5880010724067688, "learning_rate": 9.814567533380132e-06, "loss": 0.05090635269880295, "memory(GiB)": 21.32, "step": 4083, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.947096 }, { "epoch": 0.13267062989312284, "grad_norm": 0.7034605145454407, "learning_rate": 9.814422576221944e-06, "loss": 0.062061671167612076, "memory(GiB)": 21.32, "step": 4084, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.947134 }, { "epoch": 0.13270311535587825, "grad_norm": 0.5786192417144775, "learning_rate": 9.814277563498915e-06, "loss": 0.06476034224033356, "memory(GiB)": 21.32, "step": 4085, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.947168 }, { "epoch": 0.13273560081863367, "grad_norm": 0.9078105688095093, "learning_rate": 9.81413249521271e-06, "loss": 0.06936526298522949, "memory(GiB)": 21.32, "step": 4086, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.947203 }, { "epoch": 0.13276808628138909, "grad_norm": 0.5997546911239624, "learning_rate": 9.813987371365013e-06, "loss": 0.06882339715957642, "memory(GiB)": 21.32, "step": 4087, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.947244 }, { "epoch": 0.1328005717441445, "grad_norm": 0.7821134924888611, "learning_rate": 9.813842191957492e-06, "loss": 0.05381617695093155, "memory(GiB)": 21.32, "step": 4088, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.94728 }, { "epoch": 0.13283305720689992, "grad_norm": 0.7586448192596436, "learning_rate": 9.813696956991827e-06, "loss": 0.05895466357469559, "memory(GiB)": 21.32, "step": 4089, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.947315 }, { "epoch": 0.13286554266965533, "grad_norm": 0.79353928565979, "learning_rate": 9.81355166646969e-06, "loss": 0.06991880387067795, "memory(GiB)": 21.32, "step": 4090, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.947346 }, { "epoch": 0.13289802813241075, "grad_norm": 0.6393777132034302, "learning_rate": 9.81340632039276e-06, "loss": 0.0551239512860775, "memory(GiB)": 21.32, "step": 4091, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.947379 }, { "epoch": 0.13293051359516617, "grad_norm": 0.609065592288971, "learning_rate": 9.813260918762716e-06, "loss": 0.06231549009680748, "memory(GiB)": 21.32, "step": 4092, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.947418 }, { "epoch": 0.13296299905792158, "grad_norm": 0.8902440071105957, "learning_rate": 9.813115461581233e-06, "loss": 0.07709912210702896, "memory(GiB)": 21.32, "step": 4093, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.947453 }, { "epoch": 0.132995484520677, "grad_norm": 0.6267181634902954, "learning_rate": 9.812969948849993e-06, "loss": 0.06105104461312294, "memory(GiB)": 21.32, "step": 4094, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.947491 }, { "epoch": 0.13302796998343241, "grad_norm": 0.5256068706512451, "learning_rate": 9.812824380570671e-06, "loss": 0.05347216874361038, "memory(GiB)": 21.32, "step": 4095, "token_acc": 0.995, "train_speed(iter/s)": 0.947524 }, { "epoch": 0.13306045544618783, "grad_norm": 2.232875347137451, "learning_rate": 9.812678756744952e-06, "loss": 0.05668924003839493, "memory(GiB)": 21.32, "step": 4096, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.947572 }, { "epoch": 0.13309294090894325, "grad_norm": 0.5612484216690063, "learning_rate": 9.812533077374515e-06, "loss": 0.054924823343753815, "memory(GiB)": 21.32, "step": 4097, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.94762 }, { "epoch": 0.13312542637169866, "grad_norm": 0.5603103637695312, "learning_rate": 9.812387342461039e-06, "loss": 0.062130995094776154, "memory(GiB)": 21.32, "step": 4098, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.947664 }, { "epoch": 0.13315791183445408, "grad_norm": 0.6720795631408691, "learning_rate": 9.812241552006208e-06, "loss": 0.06634888797998428, "memory(GiB)": 21.32, "step": 4099, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.94771 }, { "epoch": 0.1331903972972095, "grad_norm": 4.743260383605957, "learning_rate": 9.812095706011706e-06, "loss": 0.07074832916259766, "memory(GiB)": 21.32, "step": 4100, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.947753 }, { "epoch": 0.1332228827599649, "grad_norm": 1.6208478212356567, "learning_rate": 9.811949804479212e-06, "loss": 0.05957264453172684, "memory(GiB)": 21.32, "step": 4101, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.947787 }, { "epoch": 0.13325536822272033, "grad_norm": 0.628996729850769, "learning_rate": 9.811803847410415e-06, "loss": 0.060927122831344604, "memory(GiB)": 21.32, "step": 4102, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.947824 }, { "epoch": 0.13328785368547574, "grad_norm": 0.703027069568634, "learning_rate": 9.811657834806996e-06, "loss": 0.05874285474419594, "memory(GiB)": 21.32, "step": 4103, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.947864 }, { "epoch": 0.13332033914823116, "grad_norm": 0.7316915988922119, "learning_rate": 9.811511766670641e-06, "loss": 0.05622557923197746, "memory(GiB)": 21.32, "step": 4104, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.947902 }, { "epoch": 0.13335282461098658, "grad_norm": 0.9161688089370728, "learning_rate": 9.811365643003037e-06, "loss": 0.07118585705757141, "memory(GiB)": 21.32, "step": 4105, "token_acc": 0.9631578947368421, "train_speed(iter/s)": 0.947937 }, { "epoch": 0.133385310073742, "grad_norm": 0.7681829929351807, "learning_rate": 9.81121946380587e-06, "loss": 0.05328986793756485, "memory(GiB)": 21.32, "step": 4106, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.947971 }, { "epoch": 0.1334177955364974, "grad_norm": 0.6986752152442932, "learning_rate": 9.811073229080826e-06, "loss": 0.05938250198960304, "memory(GiB)": 21.32, "step": 4107, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.948008 }, { "epoch": 0.13345028099925282, "grad_norm": 0.7336155772209167, "learning_rate": 9.810926938829593e-06, "loss": 0.052239976823329926, "memory(GiB)": 21.32, "step": 4108, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.948046 }, { "epoch": 0.13348276646200824, "grad_norm": 0.9161537289619446, "learning_rate": 9.81078059305386e-06, "loss": 0.06982561945915222, "memory(GiB)": 21.32, "step": 4109, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.948085 }, { "epoch": 0.13351525192476366, "grad_norm": 1.1451680660247803, "learning_rate": 9.810634191755317e-06, "loss": 0.07509194314479828, "memory(GiB)": 21.32, "step": 4110, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.948122 }, { "epoch": 0.13354773738751907, "grad_norm": 1.5927197933197021, "learning_rate": 9.81048773493565e-06, "loss": 0.07530546188354492, "memory(GiB)": 21.32, "step": 4111, "token_acc": 0.9651567944250871, "train_speed(iter/s)": 0.948156 }, { "epoch": 0.13358022285027452, "grad_norm": 1.1507837772369385, "learning_rate": 9.810341222596554e-06, "loss": 0.0704643726348877, "memory(GiB)": 21.32, "step": 4112, "token_acc": 0.9734848484848485, "train_speed(iter/s)": 0.94819 }, { "epoch": 0.13361270831302993, "grad_norm": 0.9456496238708496, "learning_rate": 9.810194654739717e-06, "loss": 0.058016449213027954, "memory(GiB)": 21.32, "step": 4113, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.948225 }, { "epoch": 0.13364519377578535, "grad_norm": 0.5642791986465454, "learning_rate": 9.81004803136683e-06, "loss": 0.055114418268203735, "memory(GiB)": 21.32, "step": 4114, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.948262 }, { "epoch": 0.13367767923854076, "grad_norm": 0.6108720898628235, "learning_rate": 9.80990135247959e-06, "loss": 0.05648104101419449, "memory(GiB)": 21.32, "step": 4115, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.948299 }, { "epoch": 0.13371016470129618, "grad_norm": 0.5386804342269897, "learning_rate": 9.809754618079684e-06, "loss": 0.05737057328224182, "memory(GiB)": 21.32, "step": 4116, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.948331 }, { "epoch": 0.1337426501640516, "grad_norm": 0.6623250842094421, "learning_rate": 9.809607828168808e-06, "loss": 0.066739521920681, "memory(GiB)": 21.32, "step": 4117, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.948363 }, { "epoch": 0.133775135626807, "grad_norm": 0.8095600605010986, "learning_rate": 9.809460982748656e-06, "loss": 0.07670162618160248, "memory(GiB)": 21.32, "step": 4118, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.9484 }, { "epoch": 0.13380762108956243, "grad_norm": 0.7684486508369446, "learning_rate": 9.809314081820922e-06, "loss": 0.058896467089653015, "memory(GiB)": 21.32, "step": 4119, "token_acc": 0.9711934156378601, "train_speed(iter/s)": 0.94844 }, { "epoch": 0.13384010655231784, "grad_norm": 0.632409393787384, "learning_rate": 9.809167125387304e-06, "loss": 0.061784952878952026, "memory(GiB)": 21.32, "step": 4120, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.948487 }, { "epoch": 0.13387259201507326, "grad_norm": 0.9014778137207031, "learning_rate": 9.809020113449496e-06, "loss": 0.07027935236692429, "memory(GiB)": 21.32, "step": 4121, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.948534 }, { "epoch": 0.13390507747782868, "grad_norm": 0.7204943299293518, "learning_rate": 9.808873046009195e-06, "loss": 0.06162599101662636, "memory(GiB)": 21.32, "step": 4122, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.948577 }, { "epoch": 0.1339375629405841, "grad_norm": 0.4378599524497986, "learning_rate": 9.808725923068098e-06, "loss": 0.04751675948500633, "memory(GiB)": 21.32, "step": 4123, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.948615 }, { "epoch": 0.1339700484033395, "grad_norm": 0.9335348010063171, "learning_rate": 9.808578744627903e-06, "loss": 0.06373586505651474, "memory(GiB)": 21.32, "step": 4124, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.948662 }, { "epoch": 0.13400253386609493, "grad_norm": 0.6205021739006042, "learning_rate": 9.80843151069031e-06, "loss": 0.07003820687532425, "memory(GiB)": 21.32, "step": 4125, "token_acc": 0.9453125, "train_speed(iter/s)": 0.94871 }, { "epoch": 0.13403501932885034, "grad_norm": 1.0409237146377563, "learning_rate": 9.808284221257019e-06, "loss": 0.055384956300258636, "memory(GiB)": 21.32, "step": 4126, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.948758 }, { "epoch": 0.13406750479160576, "grad_norm": 0.5676987767219543, "learning_rate": 9.808136876329725e-06, "loss": 0.05502879619598389, "memory(GiB)": 21.32, "step": 4127, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.948807 }, { "epoch": 0.13409999025436117, "grad_norm": 0.557037353515625, "learning_rate": 9.807989475910135e-06, "loss": 0.05417553335428238, "memory(GiB)": 21.32, "step": 4128, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.94885 }, { "epoch": 0.1341324757171166, "grad_norm": 0.6531370282173157, "learning_rate": 9.807842019999945e-06, "loss": 0.056399162858724594, "memory(GiB)": 21.32, "step": 4129, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.948896 }, { "epoch": 0.134164961179872, "grad_norm": 1.3339040279388428, "learning_rate": 9.80769450860086e-06, "loss": 0.07405776530504227, "memory(GiB)": 21.32, "step": 4130, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.948941 }, { "epoch": 0.13419744664262742, "grad_norm": 0.5961143970489502, "learning_rate": 9.807546941714581e-06, "loss": 0.06078960746526718, "memory(GiB)": 21.32, "step": 4131, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.948979 }, { "epoch": 0.13422993210538284, "grad_norm": 0.61175537109375, "learning_rate": 9.80739931934281e-06, "loss": 0.05798298865556717, "memory(GiB)": 21.32, "step": 4132, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.949024 }, { "epoch": 0.13426241756813825, "grad_norm": 0.7467491626739502, "learning_rate": 9.807251641487254e-06, "loss": 0.05206897854804993, "memory(GiB)": 21.32, "step": 4133, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.949063 }, { "epoch": 0.13429490303089367, "grad_norm": 0.6321237087249756, "learning_rate": 9.807103908149615e-06, "loss": 0.06341283023357391, "memory(GiB)": 21.32, "step": 4134, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.949097 }, { "epoch": 0.13432738849364909, "grad_norm": 0.40986496210098267, "learning_rate": 9.806956119331598e-06, "loss": 0.047793105244636536, "memory(GiB)": 21.32, "step": 4135, "token_acc": 0.9819494584837545, "train_speed(iter/s)": 0.94913 }, { "epoch": 0.1343598739564045, "grad_norm": 0.5763071775436401, "learning_rate": 9.80680827503491e-06, "loss": 0.0558726005256176, "memory(GiB)": 21.32, "step": 4136, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.949164 }, { "epoch": 0.13439235941915992, "grad_norm": 0.5524165630340576, "learning_rate": 9.806660375261257e-06, "loss": 0.06447364389896393, "memory(GiB)": 21.32, "step": 4137, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.949199 }, { "epoch": 0.13442484488191533, "grad_norm": 0.6626469492912292, "learning_rate": 9.806512420012344e-06, "loss": 0.04715108126401901, "memory(GiB)": 21.32, "step": 4138, "token_acc": 0.98046875, "train_speed(iter/s)": 0.949241 }, { "epoch": 0.13445733034467075, "grad_norm": 0.6779024004936218, "learning_rate": 9.806364409289882e-06, "loss": 0.05077143386006355, "memory(GiB)": 21.32, "step": 4139, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.949278 }, { "epoch": 0.13448981580742617, "grad_norm": 0.6098950505256653, "learning_rate": 9.806216343095577e-06, "loss": 0.05760807543992996, "memory(GiB)": 21.32, "step": 4140, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.949317 }, { "epoch": 0.13452230127018158, "grad_norm": 0.6505629420280457, "learning_rate": 9.806068221431138e-06, "loss": 0.053305767476558685, "memory(GiB)": 21.32, "step": 4141, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.949356 }, { "epoch": 0.134554786732937, "grad_norm": 1.2102051973342896, "learning_rate": 9.805920044298277e-06, "loss": 0.06738385558128357, "memory(GiB)": 21.32, "step": 4142, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.949395 }, { "epoch": 0.13458727219569241, "grad_norm": 0.5442905426025391, "learning_rate": 9.8057718116987e-06, "loss": 0.05845081806182861, "memory(GiB)": 21.32, "step": 4143, "token_acc": 0.9825783972125436, "train_speed(iter/s)": 0.949432 }, { "epoch": 0.13461975765844786, "grad_norm": 0.6381579041481018, "learning_rate": 9.805623523634119e-06, "loss": 0.06169351190328598, "memory(GiB)": 21.32, "step": 4144, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.949473 }, { "epoch": 0.13465224312120327, "grad_norm": 0.528182327747345, "learning_rate": 9.805475180106248e-06, "loss": 0.0645243376493454, "memory(GiB)": 21.32, "step": 4145, "token_acc": 0.9587628865979382, "train_speed(iter/s)": 0.949507 }, { "epoch": 0.1346847285839587, "grad_norm": 0.8123673796653748, "learning_rate": 9.805326781116794e-06, "loss": 0.06015757471323013, "memory(GiB)": 21.32, "step": 4146, "token_acc": 0.988, "train_speed(iter/s)": 0.949545 }, { "epoch": 0.1347172140467141, "grad_norm": 0.6158223748207092, "learning_rate": 9.805178326667477e-06, "loss": 0.051883623003959656, "memory(GiB)": 21.32, "step": 4147, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.949585 }, { "epoch": 0.13474969950946952, "grad_norm": 0.5593329071998596, "learning_rate": 9.805029816760006e-06, "loss": 0.057773761451244354, "memory(GiB)": 21.32, "step": 4148, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.94962 }, { "epoch": 0.13478218497222494, "grad_norm": 0.5660132169723511, "learning_rate": 9.804881251396095e-06, "loss": 0.05026933550834656, "memory(GiB)": 21.32, "step": 4149, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.94966 }, { "epoch": 0.13481467043498035, "grad_norm": 1.029645562171936, "learning_rate": 9.804732630577457e-06, "loss": 0.06278513371944427, "memory(GiB)": 21.32, "step": 4150, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.949696 }, { "epoch": 0.13484715589773577, "grad_norm": 0.8197425007820129, "learning_rate": 9.80458395430581e-06, "loss": 0.05031263828277588, "memory(GiB)": 21.32, "step": 4151, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.949732 }, { "epoch": 0.1348796413604912, "grad_norm": 1.041883945465088, "learning_rate": 9.80443522258287e-06, "loss": 0.05762685835361481, "memory(GiB)": 21.32, "step": 4152, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.949763 }, { "epoch": 0.1349121268232466, "grad_norm": 0.8078600764274597, "learning_rate": 9.804286435410355e-06, "loss": 0.0578518882393837, "memory(GiB)": 21.32, "step": 4153, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.949797 }, { "epoch": 0.13494461228600202, "grad_norm": 0.6075725555419922, "learning_rate": 9.804137592789977e-06, "loss": 0.06200636923313141, "memory(GiB)": 21.32, "step": 4154, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.949834 }, { "epoch": 0.13497709774875744, "grad_norm": 0.7968822121620178, "learning_rate": 9.803988694723459e-06, "loss": 0.0545082651078701, "memory(GiB)": 21.32, "step": 4155, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.949873 }, { "epoch": 0.13500958321151285, "grad_norm": 0.6521449685096741, "learning_rate": 9.803839741212516e-06, "loss": 0.06391215324401855, "memory(GiB)": 21.32, "step": 4156, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.949911 }, { "epoch": 0.13504206867426827, "grad_norm": 0.7197595238685608, "learning_rate": 9.80369073225887e-06, "loss": 0.06938100606203079, "memory(GiB)": 21.32, "step": 4157, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.949945 }, { "epoch": 0.13507455413702368, "grad_norm": 0.6719067096710205, "learning_rate": 9.803541667864238e-06, "loss": 0.061839886009693146, "memory(GiB)": 21.32, "step": 4158, "token_acc": 0.9653179190751445, "train_speed(iter/s)": 0.94998 }, { "epoch": 0.1351070395997791, "grad_norm": 0.9893175363540649, "learning_rate": 9.803392548030342e-06, "loss": 0.06541387736797333, "memory(GiB)": 21.32, "step": 4159, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.950017 }, { "epoch": 0.13513952506253452, "grad_norm": 0.67408287525177, "learning_rate": 9.803243372758902e-06, "loss": 0.05498170852661133, "memory(GiB)": 21.32, "step": 4160, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.950054 }, { "epoch": 0.13517201052528993, "grad_norm": 0.817611813545227, "learning_rate": 9.80309414205164e-06, "loss": 0.06600963324308395, "memory(GiB)": 21.32, "step": 4161, "token_acc": 0.9625468164794008, "train_speed(iter/s)": 0.950089 }, { "epoch": 0.13520449598804535, "grad_norm": 0.837730884552002, "learning_rate": 9.802944855910279e-06, "loss": 0.060169387608766556, "memory(GiB)": 21.32, "step": 4162, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.950126 }, { "epoch": 0.13523698145080076, "grad_norm": 0.4430454671382904, "learning_rate": 9.802795514336542e-06, "loss": 0.0514618381857872, "memory(GiB)": 21.32, "step": 4163, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.950159 }, { "epoch": 0.13526946691355618, "grad_norm": 0.7861011624336243, "learning_rate": 9.802646117332153e-06, "loss": 0.05732107535004616, "memory(GiB)": 21.32, "step": 4164, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.950199 }, { "epoch": 0.1353019523763116, "grad_norm": 0.5186358094215393, "learning_rate": 9.802496664898834e-06, "loss": 0.054193928837776184, "memory(GiB)": 21.32, "step": 4165, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.950237 }, { "epoch": 0.135334437839067, "grad_norm": 0.4726791977882385, "learning_rate": 9.802347157038314e-06, "loss": 0.057548969984054565, "memory(GiB)": 21.32, "step": 4166, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.950275 }, { "epoch": 0.13536692330182243, "grad_norm": 0.5623469352722168, "learning_rate": 9.802197593752314e-06, "loss": 0.057199522852897644, "memory(GiB)": 21.32, "step": 4167, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.950312 }, { "epoch": 0.13539940876457784, "grad_norm": 0.7520673871040344, "learning_rate": 9.802047975042562e-06, "loss": 0.05452961474657059, "memory(GiB)": 21.32, "step": 4168, "token_acc": 0.9786476868327402, "train_speed(iter/s)": 0.95035 }, { "epoch": 0.13543189422733326, "grad_norm": 0.5944918990135193, "learning_rate": 9.801898300910785e-06, "loss": 0.05629532411694527, "memory(GiB)": 21.32, "step": 4169, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.950387 }, { "epoch": 0.13546437969008868, "grad_norm": 0.5997318029403687, "learning_rate": 9.801748571358711e-06, "loss": 0.056571051478385925, "memory(GiB)": 21.32, "step": 4170, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.950423 }, { "epoch": 0.1354968651528441, "grad_norm": 0.7246635556221008, "learning_rate": 9.801598786388067e-06, "loss": 0.06305781751871109, "memory(GiB)": 21.32, "step": 4171, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.950461 }, { "epoch": 0.1355293506155995, "grad_norm": 0.5524857044219971, "learning_rate": 9.801448946000582e-06, "loss": 0.05003281310200691, "memory(GiB)": 21.32, "step": 4172, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950497 }, { "epoch": 0.13556183607835492, "grad_norm": 1.2609567642211914, "learning_rate": 9.801299050197986e-06, "loss": 0.06283316016197205, "memory(GiB)": 21.32, "step": 4173, "token_acc": 0.988, "train_speed(iter/s)": 0.950533 }, { "epoch": 0.13559432154111034, "grad_norm": 0.5381391048431396, "learning_rate": 9.801149098982006e-06, "loss": 0.05909240245819092, "memory(GiB)": 21.32, "step": 4174, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.950568 }, { "epoch": 0.13562680700386576, "grad_norm": 0.6064412593841553, "learning_rate": 9.800999092354378e-06, "loss": 0.06896562874317169, "memory(GiB)": 21.32, "step": 4175, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.950606 }, { "epoch": 0.1356592924666212, "grad_norm": 0.9467430114746094, "learning_rate": 9.800849030316829e-06, "loss": 0.058486439287662506, "memory(GiB)": 21.32, "step": 4176, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.950652 }, { "epoch": 0.13569177792937662, "grad_norm": 0.5036640167236328, "learning_rate": 9.800698912871092e-06, "loss": 0.05609777569770813, "memory(GiB)": 21.32, "step": 4177, "token_acc": 0.975, "train_speed(iter/s)": 0.950698 }, { "epoch": 0.13572426339213203, "grad_norm": 0.5221137404441833, "learning_rate": 9.8005487400189e-06, "loss": 0.05815685912966728, "memory(GiB)": 21.32, "step": 4178, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.950742 }, { "epoch": 0.13575674885488745, "grad_norm": 0.5639492273330688, "learning_rate": 9.800398511761987e-06, "loss": 0.05816303938627243, "memory(GiB)": 21.32, "step": 4179, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.950785 }, { "epoch": 0.13578923431764287, "grad_norm": 0.5478221774101257, "learning_rate": 9.800248228102084e-06, "loss": 0.056934624910354614, "memory(GiB)": 21.32, "step": 4180, "token_acc": 0.9671532846715328, "train_speed(iter/s)": 0.95083 }, { "epoch": 0.13582171978039828, "grad_norm": 0.6058955192565918, "learning_rate": 9.80009788904093e-06, "loss": 0.06741799414157867, "memory(GiB)": 21.32, "step": 4181, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.950874 }, { "epoch": 0.1358542052431537, "grad_norm": 0.6598978638648987, "learning_rate": 9.799947494580255e-06, "loss": 0.07035857439041138, "memory(GiB)": 21.32, "step": 4182, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95092 }, { "epoch": 0.1358866907059091, "grad_norm": 0.6393149495124817, "learning_rate": 9.799797044721799e-06, "loss": 0.0675562396645546, "memory(GiB)": 21.32, "step": 4183, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95096 }, { "epoch": 0.13591917616866453, "grad_norm": 0.48655012249946594, "learning_rate": 9.799646539467296e-06, "loss": 0.05405832454562187, "memory(GiB)": 21.32, "step": 4184, "token_acc": 0.9652777777777778, "train_speed(iter/s)": 0.951006 }, { "epoch": 0.13595166163141995, "grad_norm": 0.4900381863117218, "learning_rate": 9.799495978818483e-06, "loss": 0.050449877977371216, "memory(GiB)": 21.32, "step": 4185, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.951051 }, { "epoch": 0.13598414709417536, "grad_norm": 0.9245244264602661, "learning_rate": 9.799345362777099e-06, "loss": 0.06873385608196259, "memory(GiB)": 21.32, "step": 4186, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.951096 }, { "epoch": 0.13601663255693078, "grad_norm": 0.5221527218818665, "learning_rate": 9.799194691344883e-06, "loss": 0.06155112013220787, "memory(GiB)": 21.32, "step": 4187, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.951141 }, { "epoch": 0.1360491180196862, "grad_norm": 0.6185206174850464, "learning_rate": 9.79904396452357e-06, "loss": 0.06339617073535919, "memory(GiB)": 21.32, "step": 4188, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.951187 }, { "epoch": 0.1360816034824416, "grad_norm": 0.5680637359619141, "learning_rate": 9.798893182314902e-06, "loss": 0.0556388720870018, "memory(GiB)": 21.32, "step": 4189, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.951233 }, { "epoch": 0.13611408894519703, "grad_norm": 0.6596800684928894, "learning_rate": 9.79874234472062e-06, "loss": 0.05910976231098175, "memory(GiB)": 21.32, "step": 4190, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.951276 }, { "epoch": 0.13614657440795244, "grad_norm": 1.6801270246505737, "learning_rate": 9.798591451742466e-06, "loss": 0.05474935472011566, "memory(GiB)": 21.32, "step": 4191, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95132 }, { "epoch": 0.13617905987070786, "grad_norm": 0.7523936629295349, "learning_rate": 9.798440503382179e-06, "loss": 0.06783004105091095, "memory(GiB)": 21.32, "step": 4192, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.951364 }, { "epoch": 0.13621154533346327, "grad_norm": 1.0917795896530151, "learning_rate": 9.7982894996415e-06, "loss": 0.052092283964157104, "memory(GiB)": 21.32, "step": 4193, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.95141 }, { "epoch": 0.1362440307962187, "grad_norm": 0.5986097455024719, "learning_rate": 9.798138440522175e-06, "loss": 0.05746886879205704, "memory(GiB)": 21.32, "step": 4194, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.951446 }, { "epoch": 0.1362765162589741, "grad_norm": 0.5745510458946228, "learning_rate": 9.797987326025945e-06, "loss": 0.05583442002534866, "memory(GiB)": 21.32, "step": 4195, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951482 }, { "epoch": 0.13630900172172952, "grad_norm": 0.6331862807273865, "learning_rate": 9.797836156154557e-06, "loss": 0.05688657611608505, "memory(GiB)": 21.32, "step": 4196, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.951519 }, { "epoch": 0.13634148718448494, "grad_norm": 0.5106770992279053, "learning_rate": 9.797684930909752e-06, "loss": 0.0561903640627861, "memory(GiB)": 21.32, "step": 4197, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.951554 }, { "epoch": 0.13637397264724035, "grad_norm": 0.6997653245925903, "learning_rate": 9.797533650293278e-06, "loss": 0.058189988136291504, "memory(GiB)": 21.32, "step": 4198, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.951589 }, { "epoch": 0.13640645810999577, "grad_norm": 0.6168840527534485, "learning_rate": 9.79738231430688e-06, "loss": 0.055789515376091, "memory(GiB)": 21.32, "step": 4199, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.951628 }, { "epoch": 0.1364389435727512, "grad_norm": 0.605257511138916, "learning_rate": 9.797230922952306e-06, "loss": 0.07207921147346497, "memory(GiB)": 21.32, "step": 4200, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.951666 }, { "epoch": 0.1364714290355066, "grad_norm": 0.6169921159744263, "learning_rate": 9.797079476231302e-06, "loss": 0.053095221519470215, "memory(GiB)": 21.32, "step": 4201, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.951703 }, { "epoch": 0.13650391449826202, "grad_norm": 0.8480343222618103, "learning_rate": 9.796927974145614e-06, "loss": 0.06966355443000793, "memory(GiB)": 21.32, "step": 4202, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.95174 }, { "epoch": 0.13653639996101744, "grad_norm": 0.4914763867855072, "learning_rate": 9.796776416696993e-06, "loss": 0.05600271373987198, "memory(GiB)": 21.32, "step": 4203, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.951779 }, { "epoch": 0.13656888542377285, "grad_norm": 0.706759512424469, "learning_rate": 9.796624803887189e-06, "loss": 0.0609847791492939, "memory(GiB)": 21.32, "step": 4204, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.951815 }, { "epoch": 0.13660137088652827, "grad_norm": 0.5068621635437012, "learning_rate": 9.796473135717949e-06, "loss": 0.04857136309146881, "memory(GiB)": 21.32, "step": 4205, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.951851 }, { "epoch": 0.13663385634928368, "grad_norm": 2.806664228439331, "learning_rate": 9.796321412191027e-06, "loss": 0.07426519691944122, "memory(GiB)": 21.32, "step": 4206, "token_acc": 0.9502487562189055, "train_speed(iter/s)": 0.951887 }, { "epoch": 0.1366663418120391, "grad_norm": 0.7902560234069824, "learning_rate": 9.79616963330817e-06, "loss": 0.062115658074617386, "memory(GiB)": 21.32, "step": 4207, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.951924 }, { "epoch": 0.13669882727479454, "grad_norm": 4.24534273147583, "learning_rate": 9.796017799071134e-06, "loss": 0.056531649082899094, "memory(GiB)": 21.32, "step": 4208, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.951962 }, { "epoch": 0.13673131273754996, "grad_norm": 0.717054545879364, "learning_rate": 9.795865909481666e-06, "loss": 0.06690772622823715, "memory(GiB)": 21.32, "step": 4209, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.952 }, { "epoch": 0.13676379820030538, "grad_norm": 0.9476616382598877, "learning_rate": 9.795713964541524e-06, "loss": 0.059026531875133514, "memory(GiB)": 21.32, "step": 4210, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.952038 }, { "epoch": 0.1367962836630608, "grad_norm": 0.5350229144096375, "learning_rate": 9.79556196425246e-06, "loss": 0.056618582457304, "memory(GiB)": 21.32, "step": 4211, "token_acc": 0.9875, "train_speed(iter/s)": 0.952074 }, { "epoch": 0.1368287691258162, "grad_norm": 0.5107378363609314, "learning_rate": 9.795409908616228e-06, "loss": 0.04645279049873352, "memory(GiB)": 21.32, "step": 4212, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.952107 }, { "epoch": 0.13686125458857162, "grad_norm": 0.623792290687561, "learning_rate": 9.79525779763458e-06, "loss": 0.05232446640729904, "memory(GiB)": 21.32, "step": 4213, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.952129 }, { "epoch": 0.13689374005132704, "grad_norm": 0.7658607959747314, "learning_rate": 9.795105631309279e-06, "loss": 0.04981057345867157, "memory(GiB)": 21.32, "step": 4214, "token_acc": 0.96875, "train_speed(iter/s)": 0.952156 }, { "epoch": 0.13692622551408246, "grad_norm": 0.5649881958961487, "learning_rate": 9.794953409642074e-06, "loss": 0.0635688304901123, "memory(GiB)": 21.32, "step": 4215, "token_acc": 0.985663082437276, "train_speed(iter/s)": 0.952188 }, { "epoch": 0.13695871097683787, "grad_norm": 0.8199365139007568, "learning_rate": 9.794801132634725e-06, "loss": 0.06380508840084076, "memory(GiB)": 21.32, "step": 4216, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.952221 }, { "epoch": 0.1369911964395933, "grad_norm": 0.5853008031845093, "learning_rate": 9.79464880028899e-06, "loss": 0.060487210750579834, "memory(GiB)": 21.32, "step": 4217, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.952251 }, { "epoch": 0.1370236819023487, "grad_norm": 1.7256070375442505, "learning_rate": 9.794496412606625e-06, "loss": 0.06028155982494354, "memory(GiB)": 21.32, "step": 4218, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95229 }, { "epoch": 0.13705616736510412, "grad_norm": 0.7389937043190002, "learning_rate": 9.79434396958939e-06, "loss": 0.061094895005226135, "memory(GiB)": 21.32, "step": 4219, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.952325 }, { "epoch": 0.13708865282785954, "grad_norm": 1.6702983379364014, "learning_rate": 9.794191471239046e-06, "loss": 0.06452085077762604, "memory(GiB)": 21.32, "step": 4220, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.952361 }, { "epoch": 0.13712113829061495, "grad_norm": 0.5937883853912354, "learning_rate": 9.79403891755735e-06, "loss": 0.059162914752960205, "memory(GiB)": 21.32, "step": 4221, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.952391 }, { "epoch": 0.13715362375337037, "grad_norm": 0.607615053653717, "learning_rate": 9.793886308546064e-06, "loss": 0.06141573190689087, "memory(GiB)": 21.32, "step": 4222, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.952428 }, { "epoch": 0.13718610921612578, "grad_norm": 0.6458014845848083, "learning_rate": 9.793733644206948e-06, "loss": 0.050547659397125244, "memory(GiB)": 21.32, "step": 4223, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.952465 }, { "epoch": 0.1372185946788812, "grad_norm": 0.6139489412307739, "learning_rate": 9.793580924541769e-06, "loss": 0.054381176829338074, "memory(GiB)": 21.32, "step": 4224, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.952499 }, { "epoch": 0.13725108014163662, "grad_norm": 0.7464586496353149, "learning_rate": 9.793428149552283e-06, "loss": 0.06809220463037491, "memory(GiB)": 21.32, "step": 4225, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.952536 }, { "epoch": 0.13728356560439203, "grad_norm": 0.4787721037864685, "learning_rate": 9.793275319240256e-06, "loss": 0.050739917904138565, "memory(GiB)": 21.32, "step": 4226, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.952569 }, { "epoch": 0.13731605106714745, "grad_norm": 1.2056139707565308, "learning_rate": 9.793122433607453e-06, "loss": 0.06242906674742699, "memory(GiB)": 21.32, "step": 4227, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.952598 }, { "epoch": 0.13734853652990286, "grad_norm": 0.5318625569343567, "learning_rate": 9.792969492655637e-06, "loss": 0.04735743999481201, "memory(GiB)": 21.32, "step": 4228, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.952633 }, { "epoch": 0.13738102199265828, "grad_norm": 0.9752275943756104, "learning_rate": 9.792816496386575e-06, "loss": 0.08143842220306396, "memory(GiB)": 21.32, "step": 4229, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.95267 }, { "epoch": 0.1374135074554137, "grad_norm": 0.5494006872177124, "learning_rate": 9.792663444802031e-06, "loss": 0.05625248700380325, "memory(GiB)": 21.32, "step": 4230, "token_acc": 0.9587155963302753, "train_speed(iter/s)": 0.952707 }, { "epoch": 0.1374459929181691, "grad_norm": 0.4844590425491333, "learning_rate": 9.792510337903771e-06, "loss": 0.058473654091358185, "memory(GiB)": 21.32, "step": 4231, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.952739 }, { "epoch": 0.13747847838092453, "grad_norm": 0.6540321111679077, "learning_rate": 9.792357175693563e-06, "loss": 0.07057135552167892, "memory(GiB)": 21.32, "step": 4232, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.952781 }, { "epoch": 0.13751096384367995, "grad_norm": 0.5674042105674744, "learning_rate": 9.792203958173174e-06, "loss": 0.05628379434347153, "memory(GiB)": 21.32, "step": 4233, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.952822 }, { "epoch": 0.13754344930643536, "grad_norm": 0.7230124473571777, "learning_rate": 9.792050685344373e-06, "loss": 0.06808219850063324, "memory(GiB)": 21.32, "step": 4234, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.952868 }, { "epoch": 0.13757593476919078, "grad_norm": 0.698051929473877, "learning_rate": 9.79189735720893e-06, "loss": 0.07111448049545288, "memory(GiB)": 21.32, "step": 4235, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.952911 }, { "epoch": 0.1376084202319462, "grad_norm": 0.7069398164749146, "learning_rate": 9.791743973768613e-06, "loss": 0.06815466284751892, "memory(GiB)": 21.32, "step": 4236, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.952956 }, { "epoch": 0.1376409056947016, "grad_norm": 0.5888945460319519, "learning_rate": 9.791590535025192e-06, "loss": 0.05963204801082611, "memory(GiB)": 21.32, "step": 4237, "token_acc": 0.9623430962343096, "train_speed(iter/s)": 0.953 }, { "epoch": 0.13767339115745703, "grad_norm": 1.3028411865234375, "learning_rate": 9.79143704098044e-06, "loss": 0.0725565254688263, "memory(GiB)": 21.32, "step": 4238, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.953044 }, { "epoch": 0.13770587662021244, "grad_norm": 0.9908378720283508, "learning_rate": 9.791283491636128e-06, "loss": 0.05924159288406372, "memory(GiB)": 21.32, "step": 4239, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.953088 }, { "epoch": 0.13773836208296789, "grad_norm": 0.5800541043281555, "learning_rate": 9.791129886994025e-06, "loss": 0.06050874665379524, "memory(GiB)": 21.32, "step": 4240, "token_acc": 0.9539748953974896, "train_speed(iter/s)": 0.953132 }, { "epoch": 0.1377708475457233, "grad_norm": 1.7118990421295166, "learning_rate": 9.790976227055907e-06, "loss": 0.049605708569288254, "memory(GiB)": 21.32, "step": 4241, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.953173 }, { "epoch": 0.13780333300847872, "grad_norm": 0.7153936624526978, "learning_rate": 9.790822511823546e-06, "loss": 0.07934834063053131, "memory(GiB)": 21.32, "step": 4242, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.953217 }, { "epoch": 0.13783581847123413, "grad_norm": 1.1444439888000488, "learning_rate": 9.790668741298717e-06, "loss": 0.0636906549334526, "memory(GiB)": 21.32, "step": 4243, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.953253 }, { "epoch": 0.13786830393398955, "grad_norm": 0.7006105780601501, "learning_rate": 9.790514915483196e-06, "loss": 0.06213035434484482, "memory(GiB)": 21.32, "step": 4244, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.953296 }, { "epoch": 0.13790078939674497, "grad_norm": 0.8931058049201965, "learning_rate": 9.790361034378755e-06, "loss": 0.06364314258098602, "memory(GiB)": 21.32, "step": 4245, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.95334 }, { "epoch": 0.13793327485950038, "grad_norm": 0.6276990175247192, "learning_rate": 9.79020709798717e-06, "loss": 0.05383560061454773, "memory(GiB)": 21.32, "step": 4246, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.953383 }, { "epoch": 0.1379657603222558, "grad_norm": 0.952652633190155, "learning_rate": 9.790053106310222e-06, "loss": 0.07806357741355896, "memory(GiB)": 21.32, "step": 4247, "token_acc": 0.9690721649484536, "train_speed(iter/s)": 0.953428 }, { "epoch": 0.13799824578501121, "grad_norm": 0.44369322061538696, "learning_rate": 9.789899059349686e-06, "loss": 0.048132263123989105, "memory(GiB)": 21.32, "step": 4248, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.95347 }, { "epoch": 0.13803073124776663, "grad_norm": 0.5784397125244141, "learning_rate": 9.789744957107338e-06, "loss": 0.05629555881023407, "memory(GiB)": 21.32, "step": 4249, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953516 }, { "epoch": 0.13806321671052205, "grad_norm": 1.0341269969940186, "learning_rate": 9.789590799584958e-06, "loss": 0.06871551275253296, "memory(GiB)": 21.32, "step": 4250, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953558 }, { "epoch": 0.13809570217327746, "grad_norm": 1.1542274951934814, "learning_rate": 9.789436586784327e-06, "loss": 0.06617864966392517, "memory(GiB)": 21.32, "step": 4251, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953601 }, { "epoch": 0.13812818763603288, "grad_norm": 0.45801377296447754, "learning_rate": 9.789282318707221e-06, "loss": 0.05598702281713486, "memory(GiB)": 21.32, "step": 4252, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.953645 }, { "epoch": 0.1381606730987883, "grad_norm": 0.6799213290214539, "learning_rate": 9.789127995355423e-06, "loss": 0.05408862233161926, "memory(GiB)": 21.32, "step": 4253, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.95369 }, { "epoch": 0.1381931585615437, "grad_norm": 1.0012022256851196, "learning_rate": 9.788973616730713e-06, "loss": 0.06152595579624176, "memory(GiB)": 21.32, "step": 4254, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.953732 }, { "epoch": 0.13822564402429913, "grad_norm": 0.7977416515350342, "learning_rate": 9.788819182834874e-06, "loss": 0.06363339722156525, "memory(GiB)": 21.32, "step": 4255, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953773 }, { "epoch": 0.13825812948705454, "grad_norm": 0.4954695403575897, "learning_rate": 9.78866469366969e-06, "loss": 0.054623790085315704, "memory(GiB)": 21.32, "step": 4256, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.953811 }, { "epoch": 0.13829061494980996, "grad_norm": 0.5069836974143982, "learning_rate": 9.788510149236938e-06, "loss": 0.052848126739263535, "memory(GiB)": 21.32, "step": 4257, "token_acc": 0.9618055555555556, "train_speed(iter/s)": 0.95385 }, { "epoch": 0.13832310041256538, "grad_norm": 0.961372435092926, "learning_rate": 9.788355549538408e-06, "loss": 0.06353749334812164, "memory(GiB)": 21.32, "step": 4258, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.95389 }, { "epoch": 0.1383555858753208, "grad_norm": 0.5732054114341736, "learning_rate": 9.78820089457588e-06, "loss": 0.05577440187335014, "memory(GiB)": 21.32, "step": 4259, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.953926 }, { "epoch": 0.1383880713380762, "grad_norm": 2.9673264026641846, "learning_rate": 9.788046184351142e-06, "loss": 0.06630381941795349, "memory(GiB)": 21.32, "step": 4260, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.953964 }, { "epoch": 0.13842055680083162, "grad_norm": 0.8556743860244751, "learning_rate": 9.787891418865978e-06, "loss": 0.05481571704149246, "memory(GiB)": 21.32, "step": 4261, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953995 }, { "epoch": 0.13845304226358704, "grad_norm": 0.5558292865753174, "learning_rate": 9.787736598122173e-06, "loss": 0.05902758613228798, "memory(GiB)": 21.32, "step": 4262, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.954037 }, { "epoch": 0.13848552772634246, "grad_norm": 2.185492753982544, "learning_rate": 9.787581722121518e-06, "loss": 0.06498733907938004, "memory(GiB)": 21.32, "step": 4263, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.13851801318909787, "grad_norm": 0.7805184125900269, "learning_rate": 9.787426790865794e-06, "loss": 0.05846627801656723, "memory(GiB)": 21.32, "step": 4264, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.954113 }, { "epoch": 0.1385504986518533, "grad_norm": 0.7132450938224792, "learning_rate": 9.787271804356794e-06, "loss": 0.0600057989358902, "memory(GiB)": 21.32, "step": 4265, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.954149 }, { "epoch": 0.1385829841146087, "grad_norm": 0.8683077692985535, "learning_rate": 9.787116762596307e-06, "loss": 0.07666453719139099, "memory(GiB)": 21.32, "step": 4266, "token_acc": 0.9456521739130435, "train_speed(iter/s)": 0.954182 }, { "epoch": 0.13861546957736412, "grad_norm": 0.5393328666687012, "learning_rate": 9.78696166558612e-06, "loss": 0.052729569375514984, "memory(GiB)": 21.32, "step": 4267, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.954217 }, { "epoch": 0.13864795504011954, "grad_norm": 0.5881621241569519, "learning_rate": 9.786806513328025e-06, "loss": 0.05642380565404892, "memory(GiB)": 21.32, "step": 4268, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.954256 }, { "epoch": 0.13868044050287495, "grad_norm": 1.3818343877792358, "learning_rate": 9.786651305823809e-06, "loss": 0.06784289330244064, "memory(GiB)": 21.32, "step": 4269, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.954293 }, { "epoch": 0.13871292596563037, "grad_norm": 2.1082682609558105, "learning_rate": 9.786496043075269e-06, "loss": 0.06231377273797989, "memory(GiB)": 21.32, "step": 4270, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.954321 }, { "epoch": 0.13874541142838578, "grad_norm": 0.7729004621505737, "learning_rate": 9.786340725084192e-06, "loss": 0.0597592368721962, "memory(GiB)": 21.32, "step": 4271, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.954352 }, { "epoch": 0.13877789689114123, "grad_norm": 0.7596837282180786, "learning_rate": 9.786185351852372e-06, "loss": 0.0561799518764019, "memory(GiB)": 21.32, "step": 4272, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.954386 }, { "epoch": 0.13881038235389664, "grad_norm": 0.7815048098564148, "learning_rate": 9.786029923381604e-06, "loss": 0.05703071877360344, "memory(GiB)": 21.32, "step": 4273, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.954417 }, { "epoch": 0.13884286781665206, "grad_norm": 0.7104120254516602, "learning_rate": 9.785874439673679e-06, "loss": 0.07553759962320328, "memory(GiB)": 21.32, "step": 4274, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.954442 }, { "epoch": 0.13887535327940748, "grad_norm": 0.5569155812263489, "learning_rate": 9.785718900730394e-06, "loss": 0.05400517210364342, "memory(GiB)": 21.32, "step": 4275, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95447 }, { "epoch": 0.1389078387421629, "grad_norm": 0.6934187412261963, "learning_rate": 9.785563306553543e-06, "loss": 0.05425335094332695, "memory(GiB)": 21.32, "step": 4276, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.9545 }, { "epoch": 0.1389403242049183, "grad_norm": 0.7017362117767334, "learning_rate": 9.785407657144922e-06, "loss": 0.058182865381240845, "memory(GiB)": 21.32, "step": 4277, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.954533 }, { "epoch": 0.13897280966767372, "grad_norm": 0.5459538102149963, "learning_rate": 9.785251952506327e-06, "loss": 0.06507192552089691, "memory(GiB)": 21.32, "step": 4278, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.954564 }, { "epoch": 0.13900529513042914, "grad_norm": 1.0548646450042725, "learning_rate": 9.785096192639556e-06, "loss": 0.06608986854553223, "memory(GiB)": 21.32, "step": 4279, "token_acc": 0.9740740740740741, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.13903778059318456, "grad_norm": 0.6268701553344727, "learning_rate": 9.784940377546404e-06, "loss": 0.06294946372509003, "memory(GiB)": 21.32, "step": 4280, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.954623 }, { "epoch": 0.13907026605593997, "grad_norm": 0.6189805865287781, "learning_rate": 9.784784507228673e-06, "loss": 0.04931361973285675, "memory(GiB)": 21.32, "step": 4281, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.954661 }, { "epoch": 0.1391027515186954, "grad_norm": 0.8892505168914795, "learning_rate": 9.78462858168816e-06, "loss": 0.07223452627658844, "memory(GiB)": 21.32, "step": 4282, "token_acc": 0.9581749049429658, "train_speed(iter/s)": 0.954694 }, { "epoch": 0.1391352369814508, "grad_norm": 0.8160262107849121, "learning_rate": 9.784472600926665e-06, "loss": 0.060829173773527145, "memory(GiB)": 21.32, "step": 4283, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954726 }, { "epoch": 0.13916772244420622, "grad_norm": 0.5431628823280334, "learning_rate": 9.784316564945988e-06, "loss": 0.05352116376161575, "memory(GiB)": 21.32, "step": 4284, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.95476 }, { "epoch": 0.13920020790696164, "grad_norm": 0.6085328459739685, "learning_rate": 9.78416047374793e-06, "loss": 0.06466363370418549, "memory(GiB)": 21.32, "step": 4285, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.13923269336971705, "grad_norm": 0.8347791433334351, "learning_rate": 9.784004327334293e-06, "loss": 0.06113244220614433, "memory(GiB)": 21.32, "step": 4286, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.954829 }, { "epoch": 0.13926517883247247, "grad_norm": 0.5243883728981018, "learning_rate": 9.783848125706878e-06, "loss": 0.057211801409721375, "memory(GiB)": 21.32, "step": 4287, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.13929766429522789, "grad_norm": 0.5950838327407837, "learning_rate": 9.783691868867486e-06, "loss": 0.05807987600564957, "memory(GiB)": 21.32, "step": 4288, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.1393301497579833, "grad_norm": 6.194357872009277, "learning_rate": 9.783535556817927e-06, "loss": 0.06259303539991379, "memory(GiB)": 21.32, "step": 4289, "token_acc": 0.9826388888888888, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.13936263522073872, "grad_norm": 1.0261049270629883, "learning_rate": 9.783379189559998e-06, "loss": 0.06395597755908966, "memory(GiB)": 21.32, "step": 4290, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.954994 }, { "epoch": 0.13939512068349413, "grad_norm": 0.7127183079719543, "learning_rate": 9.783222767095507e-06, "loss": 0.06355550140142441, "memory(GiB)": 21.32, "step": 4291, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955038 }, { "epoch": 0.13942760614624955, "grad_norm": 0.4823792576789856, "learning_rate": 9.78306628942626e-06, "loss": 0.051762618124485016, "memory(GiB)": 21.32, "step": 4292, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.13946009160900497, "grad_norm": 0.7486085891723633, "learning_rate": 9.78290975655406e-06, "loss": 0.058723580092191696, "memory(GiB)": 21.32, "step": 4293, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955122 }, { "epoch": 0.13949257707176038, "grad_norm": 0.446260541677475, "learning_rate": 9.782753168480717e-06, "loss": 0.04593001306056976, "memory(GiB)": 21.32, "step": 4294, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.1395250625345158, "grad_norm": 0.8556032180786133, "learning_rate": 9.782596525208035e-06, "loss": 0.06817217171192169, "memory(GiB)": 21.32, "step": 4295, "token_acc": 0.95703125, "train_speed(iter/s)": 0.95521 }, { "epoch": 0.13955754799727121, "grad_norm": 0.5783877372741699, "learning_rate": 9.782439826737824e-06, "loss": 0.06429721415042877, "memory(GiB)": 21.32, "step": 4296, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.955254 }, { "epoch": 0.13959003346002663, "grad_norm": 1.0961955785751343, "learning_rate": 9.782283073071892e-06, "loss": 0.0697636604309082, "memory(GiB)": 21.32, "step": 4297, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.13962251892278205, "grad_norm": 0.6165375709533691, "learning_rate": 9.782126264212048e-06, "loss": 0.06492499262094498, "memory(GiB)": 21.32, "step": 4298, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.13965500438553746, "grad_norm": 0.6272273063659668, "learning_rate": 9.781969400160103e-06, "loss": 0.05838070437312126, "memory(GiB)": 21.32, "step": 4299, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955383 }, { "epoch": 0.13968748984829288, "grad_norm": 0.5642752051353455, "learning_rate": 9.781812480917865e-06, "loss": 0.058199651539325714, "memory(GiB)": 21.32, "step": 4300, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955426 }, { "epoch": 0.1397199753110483, "grad_norm": 0.5430138111114502, "learning_rate": 9.781655506487147e-06, "loss": 0.04847906529903412, "memory(GiB)": 21.32, "step": 4301, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.1397524607738037, "grad_norm": 0.6171796321868896, "learning_rate": 9.78149847686976e-06, "loss": 0.05802054703235626, "memory(GiB)": 21.32, "step": 4302, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.13978494623655913, "grad_norm": 0.5764819383621216, "learning_rate": 9.781341392067517e-06, "loss": 0.051575180143117905, "memory(GiB)": 21.32, "step": 4303, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.13981743169931457, "grad_norm": 0.7205875515937805, "learning_rate": 9.78118425208223e-06, "loss": 0.0634489580988884, "memory(GiB)": 21.32, "step": 4304, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.955596 }, { "epoch": 0.13984991716207, "grad_norm": 0.6521525979042053, "learning_rate": 9.781027056915714e-06, "loss": 0.050485484302043915, "memory(GiB)": 21.32, "step": 4305, "token_acc": 0.953125, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.1398824026248254, "grad_norm": 0.7417442798614502, "learning_rate": 9.78086980656978e-06, "loss": 0.049374908208847046, "memory(GiB)": 21.32, "step": 4306, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.13991488808758082, "grad_norm": 0.5741403102874756, "learning_rate": 9.780712501046245e-06, "loss": 0.052446942776441574, "memory(GiB)": 21.32, "step": 4307, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.13994737355033623, "grad_norm": 0.7447123527526855, "learning_rate": 9.780555140346926e-06, "loss": 0.07910580188035965, "memory(GiB)": 21.32, "step": 4308, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.955769 }, { "epoch": 0.13997985901309165, "grad_norm": 0.5798418521881104, "learning_rate": 9.780397724473638e-06, "loss": 0.04961980879306793, "memory(GiB)": 21.32, "step": 4309, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.14001234447584707, "grad_norm": 0.6607518196105957, "learning_rate": 9.780240253428197e-06, "loss": 0.05590733140707016, "memory(GiB)": 21.32, "step": 4310, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.95586 }, { "epoch": 0.14004482993860248, "grad_norm": 0.6163740754127502, "learning_rate": 9.780082727212421e-06, "loss": 0.055257923901081085, "memory(GiB)": 21.32, "step": 4311, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.955899 }, { "epoch": 0.1400773154013579, "grad_norm": 0.6279857158660889, "learning_rate": 9.779925145828129e-06, "loss": 0.053466424345970154, "memory(GiB)": 21.32, "step": 4312, "token_acc": 0.9649805447470817, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.14010980086411332, "grad_norm": 0.5627450346946716, "learning_rate": 9.779767509277139e-06, "loss": 0.0525582879781723, "memory(GiB)": 21.32, "step": 4313, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.14014228632686873, "grad_norm": 0.6022169589996338, "learning_rate": 9.77960981756127e-06, "loss": 0.0540994256734848, "memory(GiB)": 21.32, "step": 4314, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956033 }, { "epoch": 0.14017477178962415, "grad_norm": 0.8281437754631042, "learning_rate": 9.779452070682339e-06, "loss": 0.06033102422952652, "memory(GiB)": 21.32, "step": 4315, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.956079 }, { "epoch": 0.14020725725237956, "grad_norm": 0.7626440525054932, "learning_rate": 9.779294268642173e-06, "loss": 0.0584876611828804, "memory(GiB)": 21.32, "step": 4316, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.14023974271513498, "grad_norm": 0.7551617622375488, "learning_rate": 9.779136411442588e-06, "loss": 0.053453411906957626, "memory(GiB)": 21.32, "step": 4317, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.1402722281778904, "grad_norm": 0.8817115426063538, "learning_rate": 9.778978499085408e-06, "loss": 0.06291977316141129, "memory(GiB)": 21.32, "step": 4318, "token_acc": 0.9540229885057471, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.1403047136406458, "grad_norm": 0.834947407245636, "learning_rate": 9.778820531572455e-06, "loss": 0.06516371667385101, "memory(GiB)": 21.32, "step": 4319, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.956228 }, { "epoch": 0.14033719910340123, "grad_norm": 0.8845973014831543, "learning_rate": 9.778662508905554e-06, "loss": 0.05585422366857529, "memory(GiB)": 21.32, "step": 4320, "token_acc": 0.9604743083003953, "train_speed(iter/s)": 0.956264 }, { "epoch": 0.14036968456615664, "grad_norm": 0.46646904945373535, "learning_rate": 9.778504431086526e-06, "loss": 0.055947043001651764, "memory(GiB)": 21.32, "step": 4321, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.14040217002891206, "grad_norm": 0.9496162533760071, "learning_rate": 9.778346298117198e-06, "loss": 0.05888787657022476, "memory(GiB)": 21.32, "step": 4322, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.14043465549166748, "grad_norm": 0.49924764037132263, "learning_rate": 9.778188109999393e-06, "loss": 0.05366344004869461, "memory(GiB)": 21.32, "step": 4323, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.1404671409544229, "grad_norm": 0.5797684192657471, "learning_rate": 9.778029866734937e-06, "loss": 0.058752648532390594, "memory(GiB)": 21.32, "step": 4324, "token_acc": 0.9561752988047809, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.1404996264171783, "grad_norm": 1.3256911039352417, "learning_rate": 9.777871568325657e-06, "loss": 0.05499272048473358, "memory(GiB)": 21.32, "step": 4325, "token_acc": 0.968, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.14053211187993372, "grad_norm": 0.6477448344230652, "learning_rate": 9.777713214773381e-06, "loss": 0.05041642114520073, "memory(GiB)": 21.32, "step": 4326, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.14056459734268914, "grad_norm": 0.5762268900871277, "learning_rate": 9.777554806079934e-06, "loss": 0.0641094371676445, "memory(GiB)": 21.32, "step": 4327, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.14059708280544456, "grad_norm": 0.46915125846862793, "learning_rate": 9.777396342247145e-06, "loss": 0.047579362988471985, "memory(GiB)": 21.32, "step": 4328, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956536 }, { "epoch": 0.14062956826819997, "grad_norm": 0.4581419825553894, "learning_rate": 9.777237823276846e-06, "loss": 0.053069107234478, "memory(GiB)": 21.32, "step": 4329, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.1406620537309554, "grad_norm": 0.7301518321037292, "learning_rate": 9.777079249170861e-06, "loss": 0.0651012435555458, "memory(GiB)": 21.32, "step": 4330, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.1406945391937108, "grad_norm": 0.5119644999504089, "learning_rate": 9.776920619931025e-06, "loss": 0.05151812732219696, "memory(GiB)": 21.32, "step": 4331, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.14072702465646622, "grad_norm": 0.6331052184104919, "learning_rate": 9.776761935559167e-06, "loss": 0.05532459914684296, "memory(GiB)": 21.32, "step": 4332, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.14075951011922164, "grad_norm": 0.9240875244140625, "learning_rate": 9.776603196057118e-06, "loss": 0.05845600366592407, "memory(GiB)": 21.32, "step": 4333, "token_acc": 0.9724137931034482, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.14079199558197705, "grad_norm": 0.7141136527061462, "learning_rate": 9.776444401426712e-06, "loss": 0.05721893161535263, "memory(GiB)": 21.32, "step": 4334, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.956713 }, { "epoch": 0.14082448104473247, "grad_norm": 1.03431236743927, "learning_rate": 9.776285551669777e-06, "loss": 0.06820039451122284, "memory(GiB)": 21.32, "step": 4335, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.95674 }, { "epoch": 0.1408569665074879, "grad_norm": 1.1641210317611694, "learning_rate": 9.776126646788152e-06, "loss": 0.05466848611831665, "memory(GiB)": 21.32, "step": 4336, "token_acc": 0.9781021897810219, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.14088945197024333, "grad_norm": 1.129899501800537, "learning_rate": 9.775967686783667e-06, "loss": 0.05054089426994324, "memory(GiB)": 21.32, "step": 4337, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.14092193743299875, "grad_norm": 0.6579062342643738, "learning_rate": 9.775808671658161e-06, "loss": 0.05138473957777023, "memory(GiB)": 21.32, "step": 4338, "token_acc": 0.984375, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.14095442289575416, "grad_norm": 0.44172823429107666, "learning_rate": 9.775649601413463e-06, "loss": 0.04172505438327789, "memory(GiB)": 21.32, "step": 4339, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.956865 }, { "epoch": 0.14098690835850958, "grad_norm": 0.9101285338401794, "learning_rate": 9.775490476051413e-06, "loss": 0.060595929622650146, "memory(GiB)": 21.32, "step": 4340, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.141019393821265, "grad_norm": 0.6458536386489868, "learning_rate": 9.775331295573847e-06, "loss": 0.06926611065864563, "memory(GiB)": 21.32, "step": 4341, "token_acc": 0.9715447154471545, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.1410518792840204, "grad_norm": 0.5024981498718262, "learning_rate": 9.775172059982604e-06, "loss": 0.05330734699964523, "memory(GiB)": 21.32, "step": 4342, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.14108436474677583, "grad_norm": 0.6380717754364014, "learning_rate": 9.775012769279515e-06, "loss": 0.06511497497558594, "memory(GiB)": 21.32, "step": 4343, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957003 }, { "epoch": 0.14111685020953124, "grad_norm": 0.8971614241600037, "learning_rate": 9.774853423466428e-06, "loss": 0.0657307356595993, "memory(GiB)": 21.32, "step": 4344, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.957039 }, { "epoch": 0.14114933567228666, "grad_norm": 0.5369727611541748, "learning_rate": 9.774694022545174e-06, "loss": 0.06122266501188278, "memory(GiB)": 21.32, "step": 4345, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.14118182113504207, "grad_norm": 0.558293342590332, "learning_rate": 9.774534566517598e-06, "loss": 0.06025276333093643, "memory(GiB)": 21.32, "step": 4346, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.1412143065977975, "grad_norm": 1.24285888671875, "learning_rate": 9.774375055385537e-06, "loss": 0.0811423510313034, "memory(GiB)": 21.32, "step": 4347, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.1412467920605529, "grad_norm": 0.5340167880058289, "learning_rate": 9.774215489150832e-06, "loss": 0.053483910858631134, "memory(GiB)": 21.32, "step": 4348, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957203 }, { "epoch": 0.14127927752330832, "grad_norm": 1.0792739391326904, "learning_rate": 9.774055867815328e-06, "loss": 0.06176767870783806, "memory(GiB)": 21.32, "step": 4349, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.957247 }, { "epoch": 0.14131176298606374, "grad_norm": 0.6350359916687012, "learning_rate": 9.773896191380864e-06, "loss": 0.05065988376736641, "memory(GiB)": 21.32, "step": 4350, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.14134424844881915, "grad_norm": 0.48085924983024597, "learning_rate": 9.773736459849284e-06, "loss": 0.05530316382646561, "memory(GiB)": 21.32, "step": 4351, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.14137673391157457, "grad_norm": 1.5868592262268066, "learning_rate": 9.773576673222432e-06, "loss": 0.06811659038066864, "memory(GiB)": 21.32, "step": 4352, "token_acc": 0.96875, "train_speed(iter/s)": 0.957377 }, { "epoch": 0.14140921937433, "grad_norm": 0.7098670601844788, "learning_rate": 9.77341683150215e-06, "loss": 0.06698150932788849, "memory(GiB)": 21.32, "step": 4353, "token_acc": 0.975, "train_speed(iter/s)": 0.95742 }, { "epoch": 0.1414417048370854, "grad_norm": 0.5617740750312805, "learning_rate": 9.773256934690284e-06, "loss": 0.06409670412540436, "memory(GiB)": 21.32, "step": 4354, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.14147419029984082, "grad_norm": 0.5949400663375854, "learning_rate": 9.773096982788683e-06, "loss": 0.05552312359213829, "memory(GiB)": 21.32, "step": 4355, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.14150667576259623, "grad_norm": 0.5393181443214417, "learning_rate": 9.772936975799188e-06, "loss": 0.05684089660644531, "memory(GiB)": 21.32, "step": 4356, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957549 }, { "epoch": 0.14153916122535165, "grad_norm": 0.6047013998031616, "learning_rate": 9.772776913723648e-06, "loss": 0.06159021705389023, "memory(GiB)": 21.32, "step": 4357, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.14157164668810707, "grad_norm": 0.5067864656448364, "learning_rate": 9.77261679656391e-06, "loss": 0.05307105556130409, "memory(GiB)": 21.32, "step": 4358, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.14160413215086248, "grad_norm": 0.7210085988044739, "learning_rate": 9.77245662432182e-06, "loss": 0.058185674250125885, "memory(GiB)": 21.32, "step": 4359, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.957675 }, { "epoch": 0.1416366176136179, "grad_norm": 0.6000082492828369, "learning_rate": 9.77229639699923e-06, "loss": 0.05613398551940918, "memory(GiB)": 21.32, "step": 4360, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.14166910307637332, "grad_norm": 0.5790772438049316, "learning_rate": 9.772136114597987e-06, "loss": 0.0477481335401535, "memory(GiB)": 21.32, "step": 4361, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.14170158853912873, "grad_norm": 0.9221476316452026, "learning_rate": 9.771975777119943e-06, "loss": 0.05265653133392334, "memory(GiB)": 21.32, "step": 4362, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.957804 }, { "epoch": 0.14173407400188415, "grad_norm": 0.8164222836494446, "learning_rate": 9.771815384566946e-06, "loss": 0.06536564230918884, "memory(GiB)": 21.32, "step": 4363, "token_acc": 0.9646017699115044, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.14176655946463956, "grad_norm": 0.5815199017524719, "learning_rate": 9.771654936940847e-06, "loss": 0.058667588979005814, "memory(GiB)": 21.32, "step": 4364, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957887 }, { "epoch": 0.14179904492739498, "grad_norm": 0.6653274893760681, "learning_rate": 9.7714944342435e-06, "loss": 0.0572628453373909, "memory(GiB)": 21.32, "step": 4365, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.1418315303901504, "grad_norm": 0.8454200029373169, "learning_rate": 9.771333876476755e-06, "loss": 0.056603699922561646, "memory(GiB)": 21.32, "step": 4366, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.1418640158529058, "grad_norm": 0.7680887579917908, "learning_rate": 9.77117326364247e-06, "loss": 0.052678242325782776, "memory(GiB)": 21.32, "step": 4367, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.14189650131566126, "grad_norm": 1.2593982219696045, "learning_rate": 9.771012595742493e-06, "loss": 0.07319328188896179, "memory(GiB)": 21.32, "step": 4368, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.958057 }, { "epoch": 0.14192898677841667, "grad_norm": 0.5937494039535522, "learning_rate": 9.770851872778679e-06, "loss": 0.06626056879758835, "memory(GiB)": 21.32, "step": 4369, "token_acc": 0.9653679653679653, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.1419614722411721, "grad_norm": 0.5732915997505188, "learning_rate": 9.770691094752886e-06, "loss": 0.058388277888298035, "memory(GiB)": 21.32, "step": 4370, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.958142 }, { "epoch": 0.1419939577039275, "grad_norm": 0.7961816191673279, "learning_rate": 9.770530261666968e-06, "loss": 0.07853130251169205, "memory(GiB)": 21.32, "step": 4371, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.14202644316668292, "grad_norm": 0.6052657961845398, "learning_rate": 9.770369373522782e-06, "loss": 0.05800487846136093, "memory(GiB)": 21.32, "step": 4372, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.14205892862943834, "grad_norm": 0.5511470437049866, "learning_rate": 9.770208430322184e-06, "loss": 0.062343306839466095, "memory(GiB)": 21.32, "step": 4373, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.958271 }, { "epoch": 0.14209141409219375, "grad_norm": 0.5927812457084656, "learning_rate": 9.77004743206703e-06, "loss": 0.06205625832080841, "memory(GiB)": 21.32, "step": 4374, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.14212389955494917, "grad_norm": 0.5329314470291138, "learning_rate": 9.769886378759181e-06, "loss": 0.05541117861866951, "memory(GiB)": 21.32, "step": 4375, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.958355 }, { "epoch": 0.14215638501770458, "grad_norm": 0.5350173711776733, "learning_rate": 9.769725270400493e-06, "loss": 0.06649065017700195, "memory(GiB)": 21.32, "step": 4376, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.958397 }, { "epoch": 0.14218887048046, "grad_norm": 0.4391539692878723, "learning_rate": 9.769564106992828e-06, "loss": 0.05805263668298721, "memory(GiB)": 21.32, "step": 4377, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.95844 }, { "epoch": 0.14222135594321542, "grad_norm": 0.595306396484375, "learning_rate": 9.769402888538045e-06, "loss": 0.059255216270685196, "memory(GiB)": 21.32, "step": 4378, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.958482 }, { "epoch": 0.14225384140597083, "grad_norm": 0.44901910424232483, "learning_rate": 9.769241615038004e-06, "loss": 0.05977620929479599, "memory(GiB)": 21.32, "step": 4379, "token_acc": 0.9605911330049262, "train_speed(iter/s)": 0.958519 }, { "epoch": 0.14228632686872625, "grad_norm": 0.5810257792472839, "learning_rate": 9.769080286494567e-06, "loss": 0.05463884770870209, "memory(GiB)": 21.32, "step": 4380, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958555 }, { "epoch": 0.14231881233148166, "grad_norm": 0.44035816192626953, "learning_rate": 9.768918902909596e-06, "loss": 0.06116650998592377, "memory(GiB)": 21.32, "step": 4381, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.95859 }, { "epoch": 0.14235129779423708, "grad_norm": 3.8191535472869873, "learning_rate": 9.768757464284955e-06, "loss": 0.04938902705907822, "memory(GiB)": 21.32, "step": 4382, "token_acc": 0.996, "train_speed(iter/s)": 0.958625 }, { "epoch": 0.1423837832569925, "grad_norm": 0.5787312984466553, "learning_rate": 9.768595970622503e-06, "loss": 0.05925522744655609, "memory(GiB)": 21.32, "step": 4383, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.958657 }, { "epoch": 0.1424162687197479, "grad_norm": 0.5476557016372681, "learning_rate": 9.768434421924108e-06, "loss": 0.0569155178964138, "memory(GiB)": 21.32, "step": 4384, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.958688 }, { "epoch": 0.14244875418250333, "grad_norm": 0.8852964043617249, "learning_rate": 9.768272818191632e-06, "loss": 0.06749974191188812, "memory(GiB)": 21.32, "step": 4385, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.958721 }, { "epoch": 0.14248123964525874, "grad_norm": 0.7458301782608032, "learning_rate": 9.768111159426943e-06, "loss": 0.05119391903281212, "memory(GiB)": 21.32, "step": 4386, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.958753 }, { "epoch": 0.14251372510801416, "grad_norm": 0.8223235607147217, "learning_rate": 9.767949445631903e-06, "loss": 0.0580531507730484, "memory(GiB)": 21.32, "step": 4387, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.958783 }, { "epoch": 0.14254621057076958, "grad_norm": 0.47530460357666016, "learning_rate": 9.767787676808382e-06, "loss": 0.046211980283260345, "memory(GiB)": 21.32, "step": 4388, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.958812 }, { "epoch": 0.142578696033525, "grad_norm": 0.8635022044181824, "learning_rate": 9.767625852958244e-06, "loss": 0.05131666362285614, "memory(GiB)": 21.32, "step": 4389, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.958843 }, { "epoch": 0.1426111814962804, "grad_norm": 1.1850149631500244, "learning_rate": 9.767463974083359e-06, "loss": 0.06843090057373047, "memory(GiB)": 21.32, "step": 4390, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.95887 }, { "epoch": 0.14264366695903583, "grad_norm": 0.6000810861587524, "learning_rate": 9.767302040185594e-06, "loss": 0.05018138140439987, "memory(GiB)": 21.32, "step": 4391, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.958902 }, { "epoch": 0.14267615242179124, "grad_norm": 2.352445125579834, "learning_rate": 9.767140051266818e-06, "loss": 0.05467965453863144, "memory(GiB)": 21.32, "step": 4392, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.958932 }, { "epoch": 0.14270863788454666, "grad_norm": 0.5465676188468933, "learning_rate": 9.7669780073289e-06, "loss": 0.058276012539863586, "memory(GiB)": 21.32, "step": 4393, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.958963 }, { "epoch": 0.14274112334730207, "grad_norm": 4.258617401123047, "learning_rate": 9.76681590837371e-06, "loss": 0.04815717041492462, "memory(GiB)": 21.32, "step": 4394, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.958995 }, { "epoch": 0.1427736088100575, "grad_norm": 1.3198466300964355, "learning_rate": 9.766653754403122e-06, "loss": 0.07368842512369156, "memory(GiB)": 21.32, "step": 4395, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.959028 }, { "epoch": 0.1428060942728129, "grad_norm": 0.9390040636062622, "learning_rate": 9.766491545419005e-06, "loss": 0.05322352424263954, "memory(GiB)": 21.32, "step": 4396, "token_acc": 0.9833887043189369, "train_speed(iter/s)": 0.959059 }, { "epoch": 0.14283857973556832, "grad_norm": 0.5074341893196106, "learning_rate": 9.76632928142323e-06, "loss": 0.05704106390476227, "memory(GiB)": 21.32, "step": 4397, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.959085 }, { "epoch": 0.14287106519832374, "grad_norm": 0.6497300863265991, "learning_rate": 9.766166962417673e-06, "loss": 0.05984806269407272, "memory(GiB)": 21.32, "step": 4398, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.959115 }, { "epoch": 0.14290355066107915, "grad_norm": 0.707838237285614, "learning_rate": 9.766004588404204e-06, "loss": 0.06402723491191864, "memory(GiB)": 21.32, "step": 4399, "token_acc": 0.966824644549763, "train_speed(iter/s)": 0.959145 }, { "epoch": 0.1429360361238346, "grad_norm": 0.7716525793075562, "learning_rate": 9.7658421593847e-06, "loss": 0.0603286512196064, "memory(GiB)": 21.32, "step": 4400, "token_acc": 0.984, "train_speed(iter/s)": 0.959179 }, { "epoch": 0.14296852158659, "grad_norm": 0.6999965310096741, "learning_rate": 9.765679675361032e-06, "loss": 0.059618305414915085, "memory(GiB)": 21.32, "step": 4401, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.95921 }, { "epoch": 0.14300100704934543, "grad_norm": 0.6815775036811829, "learning_rate": 9.765517136335079e-06, "loss": 0.06445399671792984, "memory(GiB)": 21.32, "step": 4402, "token_acc": 0.9785714285714285, "train_speed(iter/s)": 0.959248 }, { "epoch": 0.14303349251210085, "grad_norm": 0.6805891394615173, "learning_rate": 9.765354542308717e-06, "loss": 0.05399207025766373, "memory(GiB)": 21.32, "step": 4403, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.959288 }, { "epoch": 0.14306597797485626, "grad_norm": 0.5501779913902283, "learning_rate": 9.765191893283816e-06, "loss": 0.049144402146339417, "memory(GiB)": 21.32, "step": 4404, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.959329 }, { "epoch": 0.14309846343761168, "grad_norm": 0.8082146048545837, "learning_rate": 9.765029189262261e-06, "loss": 0.055179156363010406, "memory(GiB)": 21.32, "step": 4405, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.959369 }, { "epoch": 0.1431309489003671, "grad_norm": 0.7910711765289307, "learning_rate": 9.764866430245927e-06, "loss": 0.06221120059490204, "memory(GiB)": 21.32, "step": 4406, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.959412 }, { "epoch": 0.1431634343631225, "grad_norm": 0.8487344980239868, "learning_rate": 9.764703616236693e-06, "loss": 0.06809139251708984, "memory(GiB)": 21.32, "step": 4407, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.959455 }, { "epoch": 0.14319591982587793, "grad_norm": 1.1374224424362183, "learning_rate": 9.764540747236437e-06, "loss": 0.06687802076339722, "memory(GiB)": 21.32, "step": 4408, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.959495 }, { "epoch": 0.14322840528863334, "grad_norm": 0.872978687286377, "learning_rate": 9.764377823247039e-06, "loss": 0.05335981771349907, "memory(GiB)": 21.32, "step": 4409, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.959538 }, { "epoch": 0.14326089075138876, "grad_norm": 0.5031494498252869, "learning_rate": 9.764214844270379e-06, "loss": 0.04633628576993942, "memory(GiB)": 21.32, "step": 4410, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.959581 }, { "epoch": 0.14329337621414417, "grad_norm": 0.4556180238723755, "learning_rate": 9.76405181030834e-06, "loss": 0.045570798218250275, "memory(GiB)": 21.32, "step": 4411, "token_acc": 0.9558011049723757, "train_speed(iter/s)": 0.959623 }, { "epoch": 0.1433258616768996, "grad_norm": 0.5357363224029541, "learning_rate": 9.763888721362801e-06, "loss": 0.0530925989151001, "memory(GiB)": 21.32, "step": 4412, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.959664 }, { "epoch": 0.143358347139655, "grad_norm": 0.6603878736495972, "learning_rate": 9.763725577435646e-06, "loss": 0.06635265052318573, "memory(GiB)": 21.32, "step": 4413, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.959707 }, { "epoch": 0.14339083260241042, "grad_norm": 0.6950467228889465, "learning_rate": 9.763562378528757e-06, "loss": 0.05377023667097092, "memory(GiB)": 21.32, "step": 4414, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.95975 }, { "epoch": 0.14342331806516584, "grad_norm": 0.5735098719596863, "learning_rate": 9.763399124644018e-06, "loss": 0.06080194190144539, "memory(GiB)": 21.32, "step": 4415, "token_acc": 0.975, "train_speed(iter/s)": 0.959791 }, { "epoch": 0.14345580352792126, "grad_norm": 0.44950351119041443, "learning_rate": 9.763235815783312e-06, "loss": 0.056243896484375, "memory(GiB)": 21.32, "step": 4416, "token_acc": 0.98828125, "train_speed(iter/s)": 0.959833 }, { "epoch": 0.14348828899067667, "grad_norm": 0.7556532025337219, "learning_rate": 9.763072451948525e-06, "loss": 0.05722148343920708, "memory(GiB)": 21.32, "step": 4417, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.959875 }, { "epoch": 0.1435207744534321, "grad_norm": 0.5756471753120422, "learning_rate": 9.762909033141544e-06, "loss": 0.05998045951128006, "memory(GiB)": 21.32, "step": 4418, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.959918 }, { "epoch": 0.1435532599161875, "grad_norm": 1.9339717626571655, "learning_rate": 9.762745559364253e-06, "loss": 0.06044863536953926, "memory(GiB)": 21.32, "step": 4419, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.95996 }, { "epoch": 0.14358574537894292, "grad_norm": 0.6024218797683716, "learning_rate": 9.762582030618536e-06, "loss": 0.06140966713428497, "memory(GiB)": 21.32, "step": 4420, "token_acc": 0.9675675675675676, "train_speed(iter/s)": 0.960002 }, { "epoch": 0.14361823084169834, "grad_norm": 0.8193471431732178, "learning_rate": 9.762418446906288e-06, "loss": 0.05219583213329315, "memory(GiB)": 21.32, "step": 4421, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.960042 }, { "epoch": 0.14365071630445375, "grad_norm": 0.5776461362838745, "learning_rate": 9.76225480822939e-06, "loss": 0.06282515823841095, "memory(GiB)": 21.32, "step": 4422, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.960084 }, { "epoch": 0.14368320176720917, "grad_norm": 0.5598179697990417, "learning_rate": 9.762091114589732e-06, "loss": 0.053061969578266144, "memory(GiB)": 21.32, "step": 4423, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.960124 }, { "epoch": 0.14371568722996458, "grad_norm": 0.6593965291976929, "learning_rate": 9.761927365989205e-06, "loss": 0.06191331520676613, "memory(GiB)": 21.32, "step": 4424, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.960164 }, { "epoch": 0.14374817269272, "grad_norm": 0.43135568499565125, "learning_rate": 9.7617635624297e-06, "loss": 0.04643142968416214, "memory(GiB)": 21.32, "step": 4425, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.960205 }, { "epoch": 0.14378065815547542, "grad_norm": 0.5447593331336975, "learning_rate": 9.761599703913103e-06, "loss": 0.05004419758915901, "memory(GiB)": 21.32, "step": 4426, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.960244 }, { "epoch": 0.14381314361823083, "grad_norm": 4.085960388183594, "learning_rate": 9.761435790441309e-06, "loss": 0.06942611187696457, "memory(GiB)": 21.32, "step": 4427, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.960286 }, { "epoch": 0.14384562908098625, "grad_norm": 1.0195759534835815, "learning_rate": 9.761271822016207e-06, "loss": 0.06253817677497864, "memory(GiB)": 21.32, "step": 4428, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.960326 }, { "epoch": 0.14387811454374166, "grad_norm": 0.7577086091041565, "learning_rate": 9.761107798639693e-06, "loss": 0.06146135926246643, "memory(GiB)": 21.32, "step": 4429, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.960368 }, { "epoch": 0.14391060000649708, "grad_norm": 0.7267196178436279, "learning_rate": 9.760943720313658e-06, "loss": 0.0638982355594635, "memory(GiB)": 21.32, "step": 4430, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.960409 }, { "epoch": 0.1439430854692525, "grad_norm": 0.7962375283241272, "learning_rate": 9.760779587039995e-06, "loss": 0.0596989206969738, "memory(GiB)": 21.32, "step": 4431, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.960451 }, { "epoch": 0.14397557093200794, "grad_norm": 0.9597454071044922, "learning_rate": 9.760615398820598e-06, "loss": 0.06590934097766876, "memory(GiB)": 21.32, "step": 4432, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.960492 }, { "epoch": 0.14400805639476336, "grad_norm": 0.5912533402442932, "learning_rate": 9.760451155657364e-06, "loss": 0.056211747229099274, "memory(GiB)": 21.32, "step": 4433, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.960533 }, { "epoch": 0.14404054185751877, "grad_norm": 0.54862380027771, "learning_rate": 9.760286857552187e-06, "loss": 0.05489075183868408, "memory(GiB)": 21.32, "step": 4434, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.960574 }, { "epoch": 0.1440730273202742, "grad_norm": 0.6695893406867981, "learning_rate": 9.760122504506964e-06, "loss": 0.05340373143553734, "memory(GiB)": 21.32, "step": 4435, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.960616 }, { "epoch": 0.1441055127830296, "grad_norm": 0.7486035823822021, "learning_rate": 9.759958096523592e-06, "loss": 0.05733289569616318, "memory(GiB)": 21.32, "step": 4436, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.960659 }, { "epoch": 0.14413799824578502, "grad_norm": 0.48033803701400757, "learning_rate": 9.759793633603967e-06, "loss": 0.05775948986411095, "memory(GiB)": 21.32, "step": 4437, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.960698 }, { "epoch": 0.14417048370854044, "grad_norm": 0.4662723243236542, "learning_rate": 9.75962911574999e-06, "loss": 0.055944569408893585, "memory(GiB)": 21.32, "step": 4438, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.960739 }, { "epoch": 0.14420296917129585, "grad_norm": 0.646121084690094, "learning_rate": 9.759464542963556e-06, "loss": 0.051942046731710434, "memory(GiB)": 21.32, "step": 4439, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.960779 }, { "epoch": 0.14423545463405127, "grad_norm": 0.9083511829376221, "learning_rate": 9.759299915246568e-06, "loss": 0.05081109702587128, "memory(GiB)": 21.32, "step": 4440, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.960816 }, { "epoch": 0.14426794009680668, "grad_norm": 0.6151030659675598, "learning_rate": 9.759135232600923e-06, "loss": 0.07006849348545074, "memory(GiB)": 21.32, "step": 4441, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.960838 }, { "epoch": 0.1443004255595621, "grad_norm": 0.518592894077301, "learning_rate": 9.758970495028523e-06, "loss": 0.053130023181438446, "memory(GiB)": 21.32, "step": 4442, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.960868 }, { "epoch": 0.14433291102231752, "grad_norm": 0.8186823129653931, "learning_rate": 9.75880570253127e-06, "loss": 0.06367461383342743, "memory(GiB)": 21.32, "step": 4443, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.960899 }, { "epoch": 0.14436539648507293, "grad_norm": 0.8937147855758667, "learning_rate": 9.758640855111066e-06, "loss": 0.06092110276222229, "memory(GiB)": 21.32, "step": 4444, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.960928 }, { "epoch": 0.14439788194782835, "grad_norm": 0.5580012798309326, "learning_rate": 9.75847595276981e-06, "loss": 0.06000484526157379, "memory(GiB)": 21.32, "step": 4445, "token_acc": 0.984375, "train_speed(iter/s)": 0.960959 }, { "epoch": 0.14443036741058377, "grad_norm": 2.1761863231658936, "learning_rate": 9.75831099550941e-06, "loss": 0.05628804862499237, "memory(GiB)": 21.32, "step": 4446, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.960988 }, { "epoch": 0.14446285287333918, "grad_norm": 0.9367890357971191, "learning_rate": 9.758145983331769e-06, "loss": 0.06616724282503128, "memory(GiB)": 21.32, "step": 4447, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.96102 }, { "epoch": 0.1444953383360946, "grad_norm": 0.9439199566841125, "learning_rate": 9.757980916238789e-06, "loss": 0.06343701481819153, "memory(GiB)": 21.32, "step": 4448, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.96105 }, { "epoch": 0.14452782379885, "grad_norm": 0.5104261040687561, "learning_rate": 9.757815794232375e-06, "loss": 0.0577603280544281, "memory(GiB)": 21.32, "step": 4449, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.961078 }, { "epoch": 0.14456030926160543, "grad_norm": 1.0843923091888428, "learning_rate": 9.757650617314435e-06, "loss": 0.060950350016355515, "memory(GiB)": 21.32, "step": 4450, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.961107 }, { "epoch": 0.14459279472436085, "grad_norm": 0.44995924830436707, "learning_rate": 9.757485385486876e-06, "loss": 0.053821466863155365, "memory(GiB)": 21.32, "step": 4451, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.961135 }, { "epoch": 0.14462528018711626, "grad_norm": 1.87899649143219, "learning_rate": 9.7573200987516e-06, "loss": 0.07001234591007233, "memory(GiB)": 21.32, "step": 4452, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.961165 }, { "epoch": 0.14465776564987168, "grad_norm": 0.7655074596405029, "learning_rate": 9.757154757110523e-06, "loss": 0.06152322143316269, "memory(GiB)": 21.32, "step": 4453, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.961189 }, { "epoch": 0.1446902511126271, "grad_norm": 0.7387377023696899, "learning_rate": 9.756989360565546e-06, "loss": 0.07229049503803253, "memory(GiB)": 21.32, "step": 4454, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.961214 }, { "epoch": 0.1447227365753825, "grad_norm": 0.6116542220115662, "learning_rate": 9.75682390911858e-06, "loss": 0.05625954270362854, "memory(GiB)": 21.32, "step": 4455, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.961243 }, { "epoch": 0.14475522203813793, "grad_norm": 0.4022144079208374, "learning_rate": 9.756658402771533e-06, "loss": 0.04603710025548935, "memory(GiB)": 21.32, "step": 4456, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.96127 }, { "epoch": 0.14478770750089334, "grad_norm": 0.7686424255371094, "learning_rate": 9.756492841526319e-06, "loss": 0.07288310676813126, "memory(GiB)": 21.32, "step": 4457, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.961297 }, { "epoch": 0.14482019296364876, "grad_norm": 0.6083788871765137, "learning_rate": 9.756327225384848e-06, "loss": 0.05953410267829895, "memory(GiB)": 21.32, "step": 4458, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.961325 }, { "epoch": 0.14485267842640417, "grad_norm": 0.4705493152141571, "learning_rate": 9.756161554349028e-06, "loss": 0.05282323807477951, "memory(GiB)": 21.32, "step": 4459, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.961352 }, { "epoch": 0.1448851638891596, "grad_norm": 2.3317112922668457, "learning_rate": 9.755995828420773e-06, "loss": 0.07342415302991867, "memory(GiB)": 21.32, "step": 4460, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.961383 }, { "epoch": 0.144917649351915, "grad_norm": 0.49835652112960815, "learning_rate": 9.755830047601997e-06, "loss": 0.05964206904172897, "memory(GiB)": 21.32, "step": 4461, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.961416 }, { "epoch": 0.14495013481467042, "grad_norm": 0.3832082152366638, "learning_rate": 9.755664211894613e-06, "loss": 0.04710475355386734, "memory(GiB)": 21.32, "step": 4462, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.961452 }, { "epoch": 0.14498262027742584, "grad_norm": 0.39885902404785156, "learning_rate": 9.755498321300532e-06, "loss": 0.0479290597140789, "memory(GiB)": 21.32, "step": 4463, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.961488 }, { "epoch": 0.14501510574018128, "grad_norm": 0.9157858490943909, "learning_rate": 9.755332375821672e-06, "loss": 0.05959121882915497, "memory(GiB)": 21.32, "step": 4464, "token_acc": 0.9710144927536232, "train_speed(iter/s)": 0.96153 }, { "epoch": 0.1450475912029367, "grad_norm": 0.7102551460266113, "learning_rate": 9.755166375459947e-06, "loss": 0.07013912498950958, "memory(GiB)": 21.32, "step": 4465, "token_acc": 0.9591836734693877, "train_speed(iter/s)": 0.961571 }, { "epoch": 0.14508007666569211, "grad_norm": 0.5008271336555481, "learning_rate": 9.755000320217274e-06, "loss": 0.051290541887283325, "memory(GiB)": 21.32, "step": 4466, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.961613 }, { "epoch": 0.14511256212844753, "grad_norm": 0.9246290326118469, "learning_rate": 9.754834210095567e-06, "loss": 0.060440532863140106, "memory(GiB)": 21.32, "step": 4467, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.961654 }, { "epoch": 0.14514504759120295, "grad_norm": 0.5908827781677246, "learning_rate": 9.754668045096745e-06, "loss": 0.05855047330260277, "memory(GiB)": 21.32, "step": 4468, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.961694 }, { "epoch": 0.14517753305395836, "grad_norm": 0.6497912406921387, "learning_rate": 9.754501825222725e-06, "loss": 0.06887250393629074, "memory(GiB)": 21.32, "step": 4469, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.961734 }, { "epoch": 0.14521001851671378, "grad_norm": 0.7936842441558838, "learning_rate": 9.754335550475426e-06, "loss": 0.058563411235809326, "memory(GiB)": 21.32, "step": 4470, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.961775 }, { "epoch": 0.1452425039794692, "grad_norm": 0.5157751441001892, "learning_rate": 9.754169220856766e-06, "loss": 0.055574316531419754, "memory(GiB)": 21.32, "step": 4471, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.961817 }, { "epoch": 0.1452749894422246, "grad_norm": 0.5156868696212769, "learning_rate": 9.754002836368668e-06, "loss": 0.05035550892353058, "memory(GiB)": 21.32, "step": 4472, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.961856 }, { "epoch": 0.14530747490498003, "grad_norm": 0.6904841065406799, "learning_rate": 9.753836397013046e-06, "loss": 0.0515897236764431, "memory(GiB)": 21.32, "step": 4473, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.961895 }, { "epoch": 0.14533996036773544, "grad_norm": 0.6417278051376343, "learning_rate": 9.753669902791826e-06, "loss": 0.05987038463354111, "memory(GiB)": 21.32, "step": 4474, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.961934 }, { "epoch": 0.14537244583049086, "grad_norm": 0.47360366582870483, "learning_rate": 9.75350335370693e-06, "loss": 0.049786683171987534, "memory(GiB)": 21.32, "step": 4475, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.961975 }, { "epoch": 0.14540493129324628, "grad_norm": 0.7435443997383118, "learning_rate": 9.753336749760277e-06, "loss": 0.0616341307759285, "memory(GiB)": 21.32, "step": 4476, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.962014 }, { "epoch": 0.1454374167560017, "grad_norm": 0.47338154911994934, "learning_rate": 9.753170090953791e-06, "loss": 0.04443441331386566, "memory(GiB)": 21.32, "step": 4477, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.962055 }, { "epoch": 0.1454699022187571, "grad_norm": 0.43640318512916565, "learning_rate": 9.753003377289396e-06, "loss": 0.03924606740474701, "memory(GiB)": 21.32, "step": 4478, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.962096 }, { "epoch": 0.14550238768151252, "grad_norm": 0.6212339401245117, "learning_rate": 9.752836608769016e-06, "loss": 0.05642015486955643, "memory(GiB)": 21.32, "step": 4479, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.96213 }, { "epoch": 0.14553487314426794, "grad_norm": 0.5078133940696716, "learning_rate": 9.752669785394576e-06, "loss": 0.04923992604017258, "memory(GiB)": 21.32, "step": 4480, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.962171 }, { "epoch": 0.14556735860702336, "grad_norm": 0.8561190962791443, "learning_rate": 9.752502907168e-06, "loss": 0.07750870287418365, "memory(GiB)": 21.32, "step": 4481, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.962212 }, { "epoch": 0.14559984406977877, "grad_norm": 0.7094525694847107, "learning_rate": 9.752335974091215e-06, "loss": 0.05985617637634277, "memory(GiB)": 21.32, "step": 4482, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.962253 }, { "epoch": 0.1456323295325342, "grad_norm": 0.9795828461647034, "learning_rate": 9.752168986166148e-06, "loss": 0.056423790752887726, "memory(GiB)": 21.32, "step": 4483, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.962294 }, { "epoch": 0.1456648149952896, "grad_norm": 0.909597635269165, "learning_rate": 9.752001943394724e-06, "loss": 0.05853547900915146, "memory(GiB)": 21.32, "step": 4484, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.962335 }, { "epoch": 0.14569730045804502, "grad_norm": 0.5643275380134583, "learning_rate": 9.751834845778874e-06, "loss": 0.046373385936021805, "memory(GiB)": 21.32, "step": 4485, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.962374 }, { "epoch": 0.14572978592080044, "grad_norm": 2.6245250701904297, "learning_rate": 9.751667693320526e-06, "loss": 0.0449635349214077, "memory(GiB)": 21.32, "step": 4486, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.962414 }, { "epoch": 0.14576227138355585, "grad_norm": 0.6494564414024353, "learning_rate": 9.751500486021608e-06, "loss": 0.06157520040869713, "memory(GiB)": 21.32, "step": 4487, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.962455 }, { "epoch": 0.14579475684631127, "grad_norm": 0.6961585283279419, "learning_rate": 9.75133322388405e-06, "loss": 0.054926156997680664, "memory(GiB)": 21.32, "step": 4488, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.962494 }, { "epoch": 0.14582724230906668, "grad_norm": 0.6396620273590088, "learning_rate": 9.75116590690978e-06, "loss": 0.05854715406894684, "memory(GiB)": 21.32, "step": 4489, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.962536 }, { "epoch": 0.1458597277718221, "grad_norm": 1.3879332542419434, "learning_rate": 9.750998535100735e-06, "loss": 0.06291855126619339, "memory(GiB)": 21.32, "step": 4490, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.962577 }, { "epoch": 0.14589221323457752, "grad_norm": 0.8933193683624268, "learning_rate": 9.750831108458843e-06, "loss": 0.0701855719089508, "memory(GiB)": 21.32, "step": 4491, "token_acc": 0.964824120603015, "train_speed(iter/s)": 0.962617 }, { "epoch": 0.14592469869733293, "grad_norm": 0.6178949475288391, "learning_rate": 9.750663626986037e-06, "loss": 0.05882949009537697, "memory(GiB)": 21.32, "step": 4492, "token_acc": 0.985, "train_speed(iter/s)": 0.962656 }, { "epoch": 0.14595718416008835, "grad_norm": 0.6185585856437683, "learning_rate": 9.750496090684247e-06, "loss": 0.05767345800995827, "memory(GiB)": 21.32, "step": 4493, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.962696 }, { "epoch": 0.14598966962284377, "grad_norm": 0.5594213008880615, "learning_rate": 9.750328499555411e-06, "loss": 0.05197874456644058, "memory(GiB)": 21.32, "step": 4494, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.962737 }, { "epoch": 0.14602215508559918, "grad_norm": 0.7054147124290466, "learning_rate": 9.750160853601463e-06, "loss": 0.06254474073648453, "memory(GiB)": 21.32, "step": 4495, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.962778 }, { "epoch": 0.14605464054835463, "grad_norm": 0.6336023211479187, "learning_rate": 9.749993152824333e-06, "loss": 0.06341231614351273, "memory(GiB)": 21.32, "step": 4496, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.962819 }, { "epoch": 0.14608712601111004, "grad_norm": 0.5919415950775146, "learning_rate": 9.749825397225961e-06, "loss": 0.05565517023205757, "memory(GiB)": 21.32, "step": 4497, "token_acc": 0.97, "train_speed(iter/s)": 0.962856 }, { "epoch": 0.14611961147386546, "grad_norm": 0.8020881414413452, "learning_rate": 9.749657586808283e-06, "loss": 0.06734761595726013, "memory(GiB)": 21.32, "step": 4498, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.962882 }, { "epoch": 0.14615209693662087, "grad_norm": 0.7323843240737915, "learning_rate": 9.749489721573233e-06, "loss": 0.05273500084877014, "memory(GiB)": 21.32, "step": 4499, "token_acc": 0.984, "train_speed(iter/s)": 0.962916 }, { "epoch": 0.1461845823993763, "grad_norm": 0.5419434905052185, "learning_rate": 9.749321801522752e-06, "loss": 0.044980183243751526, "memory(GiB)": 21.32, "step": 4500, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.962945 }, { "epoch": 0.1461845823993763, "eval_loss": 0.05612359941005707, "eval_runtime": 81.0533, "eval_samples_per_second": 122.759, "eval_steps_per_second": 3.837, "eval_token_acc": 0.9782021118313006, "step": 4500 }, { "epoch": 0.1462170678621317, "grad_norm": 0.7136062383651733, "learning_rate": 9.749153826658775e-06, "loss": 0.04680967330932617, "memory(GiB)": 21.32, "step": 4501, "token_acc": 0.9779882402927591, "train_speed(iter/s)": 0.94464 }, { "epoch": 0.14624955332488712, "grad_norm": 0.4239550530910492, "learning_rate": 9.74898579698324e-06, "loss": 0.04190007597208023, "memory(GiB)": 21.32, "step": 4502, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.94468 }, { "epoch": 0.14628203878764254, "grad_norm": 1.9825392961502075, "learning_rate": 9.748817712498091e-06, "loss": 0.05984107404947281, "memory(GiB)": 21.32, "step": 4503, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.944722 }, { "epoch": 0.14631452425039795, "grad_norm": 0.7015084028244019, "learning_rate": 9.748649573205265e-06, "loss": 0.05809212476015091, "memory(GiB)": 21.32, "step": 4504, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.944764 }, { "epoch": 0.14634700971315337, "grad_norm": 0.9946176409721375, "learning_rate": 9.7484813791067e-06, "loss": 0.05451070889830589, "memory(GiB)": 21.32, "step": 4505, "token_acc": 0.9785714285714285, "train_speed(iter/s)": 0.944806 }, { "epoch": 0.14637949517590879, "grad_norm": 0.5909541249275208, "learning_rate": 9.748313130204341e-06, "loss": 0.05851360782980919, "memory(GiB)": 21.32, "step": 4506, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.944849 }, { "epoch": 0.1464119806386642, "grad_norm": 0.7835503816604614, "learning_rate": 9.74814482650013e-06, "loss": 0.06553925573825836, "memory(GiB)": 21.32, "step": 4507, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.944892 }, { "epoch": 0.14644446610141962, "grad_norm": 0.7621182203292847, "learning_rate": 9.747976467996005e-06, "loss": 0.05569734424352646, "memory(GiB)": 21.32, "step": 4508, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.944933 }, { "epoch": 0.14647695156417503, "grad_norm": 0.5448245406150818, "learning_rate": 9.747808054693914e-06, "loss": 0.042417608201503754, "memory(GiB)": 21.32, "step": 4509, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.944975 }, { "epoch": 0.14650943702693045, "grad_norm": 0.46192604303359985, "learning_rate": 9.747639586595798e-06, "loss": 0.058158859610557556, "memory(GiB)": 21.32, "step": 4510, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.945018 }, { "epoch": 0.14654192248968587, "grad_norm": 0.6377177834510803, "learning_rate": 9.747471063703601e-06, "loss": 0.05505307763814926, "memory(GiB)": 21.32, "step": 4511, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.945059 }, { "epoch": 0.14657440795244128, "grad_norm": 0.798511803150177, "learning_rate": 9.747302486019271e-06, "loss": 0.04342428594827652, "memory(GiB)": 21.32, "step": 4512, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.945096 }, { "epoch": 0.1466068934151967, "grad_norm": 0.5410677790641785, "learning_rate": 9.74713385354475e-06, "loss": 0.05298943817615509, "memory(GiB)": 21.32, "step": 4513, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.945131 }, { "epoch": 0.14663937887795211, "grad_norm": 0.48428207635879517, "learning_rate": 9.746965166281988e-06, "loss": 0.044814370572566986, "memory(GiB)": 21.32, "step": 4514, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.945166 }, { "epoch": 0.14667186434070753, "grad_norm": 0.5931298136711121, "learning_rate": 9.74679642423293e-06, "loss": 0.05899787321686745, "memory(GiB)": 21.32, "step": 4515, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.945201 }, { "epoch": 0.14670434980346295, "grad_norm": 0.6362523436546326, "learning_rate": 9.746627627399521e-06, "loss": 0.06425119191408157, "memory(GiB)": 21.32, "step": 4516, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.945239 }, { "epoch": 0.14673683526621836, "grad_norm": 0.6364129781723022, "learning_rate": 9.746458775783711e-06, "loss": 0.052284903824329376, "memory(GiB)": 21.32, "step": 4517, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.945276 }, { "epoch": 0.14676932072897378, "grad_norm": 0.6678799986839294, "learning_rate": 9.746289869387451e-06, "loss": 0.059458471834659576, "memory(GiB)": 21.32, "step": 4518, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.945309 }, { "epoch": 0.1468018061917292, "grad_norm": 0.7194470167160034, "learning_rate": 9.746120908212688e-06, "loss": 0.057977207005023956, "memory(GiB)": 21.32, "step": 4519, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.94534 }, { "epoch": 0.1468342916544846, "grad_norm": 0.5685372352600098, "learning_rate": 9.745951892261373e-06, "loss": 0.04932338744401932, "memory(GiB)": 21.32, "step": 4520, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.945377 }, { "epoch": 0.14686677711724003, "grad_norm": 0.5594493746757507, "learning_rate": 9.745782821535456e-06, "loss": 0.054967254400253296, "memory(GiB)": 21.32, "step": 4521, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.945413 }, { "epoch": 0.14689926257999544, "grad_norm": 0.701741635799408, "learning_rate": 9.74561369603689e-06, "loss": 0.049640096724033356, "memory(GiB)": 21.32, "step": 4522, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.94545 }, { "epoch": 0.14693174804275086, "grad_norm": 0.7726379632949829, "learning_rate": 9.745444515767623e-06, "loss": 0.06412798166275024, "memory(GiB)": 21.32, "step": 4523, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.945482 }, { "epoch": 0.14696423350550628, "grad_norm": 0.7120031118392944, "learning_rate": 9.745275280729612e-06, "loss": 0.0649457722902298, "memory(GiB)": 21.32, "step": 4524, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.945516 }, { "epoch": 0.1469967189682617, "grad_norm": 0.691040575504303, "learning_rate": 9.745105990924808e-06, "loss": 0.07059963047504425, "memory(GiB)": 21.32, "step": 4525, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.945549 }, { "epoch": 0.1470292044310171, "grad_norm": 0.7901013493537903, "learning_rate": 9.744936646355164e-06, "loss": 0.06183718517422676, "memory(GiB)": 21.32, "step": 4526, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.945584 }, { "epoch": 0.14706168989377252, "grad_norm": 0.746654212474823, "learning_rate": 9.744767247022638e-06, "loss": 0.061946652829647064, "memory(GiB)": 21.32, "step": 4527, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.945619 }, { "epoch": 0.14709417535652797, "grad_norm": 0.8309980034828186, "learning_rate": 9.74459779292918e-06, "loss": 0.05906692147254944, "memory(GiB)": 21.32, "step": 4528, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.945656 }, { "epoch": 0.14712666081928338, "grad_norm": 0.9583883285522461, "learning_rate": 9.74442828407675e-06, "loss": 0.06648287177085876, "memory(GiB)": 21.32, "step": 4529, "token_acc": 0.9704797047970479, "train_speed(iter/s)": 0.945685 }, { "epoch": 0.1471591462820388, "grad_norm": 0.6043282747268677, "learning_rate": 9.744258720467302e-06, "loss": 0.0541866235435009, "memory(GiB)": 21.32, "step": 4530, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.945716 }, { "epoch": 0.14719163174479422, "grad_norm": 0.7281263470649719, "learning_rate": 9.744089102102793e-06, "loss": 0.05112812668085098, "memory(GiB)": 21.32, "step": 4531, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.94575 }, { "epoch": 0.14722411720754963, "grad_norm": 0.5750952959060669, "learning_rate": 9.743919428985183e-06, "loss": 0.0561508946120739, "memory(GiB)": 21.32, "step": 4532, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.945781 }, { "epoch": 0.14725660267030505, "grad_norm": 0.5455847382545471, "learning_rate": 9.743749701116428e-06, "loss": 0.05076592415571213, "memory(GiB)": 21.32, "step": 4533, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.945806 }, { "epoch": 0.14728908813306046, "grad_norm": 0.8366001844406128, "learning_rate": 9.743579918498488e-06, "loss": 0.06578455865383148, "memory(GiB)": 21.32, "step": 4534, "token_acc": 0.9613733905579399, "train_speed(iter/s)": 0.945837 }, { "epoch": 0.14732157359581588, "grad_norm": 0.7673832774162292, "learning_rate": 9.743410081133322e-06, "loss": 0.05817517265677452, "memory(GiB)": 21.32, "step": 4535, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.945868 }, { "epoch": 0.1473540590585713, "grad_norm": 1.6631072759628296, "learning_rate": 9.74324018902289e-06, "loss": 0.06332100927829742, "memory(GiB)": 21.32, "step": 4536, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.9459 }, { "epoch": 0.1473865445213267, "grad_norm": 0.8766958713531494, "learning_rate": 9.743070242169152e-06, "loss": 0.06618191301822662, "memory(GiB)": 21.32, "step": 4537, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.94593 }, { "epoch": 0.14741902998408213, "grad_norm": 0.6648831963539124, "learning_rate": 9.742900240574072e-06, "loss": 0.04494836926460266, "memory(GiB)": 21.32, "step": 4538, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.945956 }, { "epoch": 0.14745151544683754, "grad_norm": 0.7028542160987854, "learning_rate": 9.74273018423961e-06, "loss": 0.06389366835355759, "memory(GiB)": 21.32, "step": 4539, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.945989 }, { "epoch": 0.14748400090959296, "grad_norm": 0.6803328394889832, "learning_rate": 9.742560073167729e-06, "loss": 0.0688110813498497, "memory(GiB)": 21.32, "step": 4540, "token_acc": 0.9632352941176471, "train_speed(iter/s)": 0.946023 }, { "epoch": 0.14751648637234838, "grad_norm": 0.6814244389533997, "learning_rate": 9.742389907360392e-06, "loss": 0.058224767446517944, "memory(GiB)": 21.32, "step": 4541, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.946058 }, { "epoch": 0.1475489718351038, "grad_norm": 0.6544855237007141, "learning_rate": 9.742219686819562e-06, "loss": 0.060699861496686935, "memory(GiB)": 21.32, "step": 4542, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.946093 }, { "epoch": 0.1475814572978592, "grad_norm": 0.5863162875175476, "learning_rate": 9.742049411547207e-06, "loss": 0.04679074138402939, "memory(GiB)": 21.32, "step": 4543, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.946126 }, { "epoch": 0.14761394276061462, "grad_norm": 0.8375580906867981, "learning_rate": 9.741879081545292e-06, "loss": 0.07172968238592148, "memory(GiB)": 21.32, "step": 4544, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.946159 }, { "epoch": 0.14764642822337004, "grad_norm": 0.8009956479072571, "learning_rate": 9.741708696815776e-06, "loss": 0.05496230348944664, "memory(GiB)": 21.32, "step": 4545, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.946195 }, { "epoch": 0.14767891368612546, "grad_norm": 0.9290202856063843, "learning_rate": 9.741538257360634e-06, "loss": 0.07203318178653717, "memory(GiB)": 21.32, "step": 4546, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.946231 }, { "epoch": 0.14771139914888087, "grad_norm": 0.6667773723602295, "learning_rate": 9.74136776318183e-06, "loss": 0.04979211091995239, "memory(GiB)": 21.32, "step": 4547, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.946263 }, { "epoch": 0.1477438846116363, "grad_norm": 0.7258325219154358, "learning_rate": 9.741197214281329e-06, "loss": 0.05390186235308647, "memory(GiB)": 21.32, "step": 4548, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.9463 }, { "epoch": 0.1477763700743917, "grad_norm": 0.7732789516448975, "learning_rate": 9.741026610661103e-06, "loss": 0.05567794293165207, "memory(GiB)": 21.32, "step": 4549, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.94634 }, { "epoch": 0.14780885553714712, "grad_norm": 0.8038988709449768, "learning_rate": 9.740855952323119e-06, "loss": 0.055354274809360504, "memory(GiB)": 21.32, "step": 4550, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.946382 }, { "epoch": 0.14784134099990254, "grad_norm": 0.8337764143943787, "learning_rate": 9.740685239269347e-06, "loss": 0.05786772444844246, "memory(GiB)": 21.32, "step": 4551, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.946424 }, { "epoch": 0.14787382646265795, "grad_norm": 0.6871162056922913, "learning_rate": 9.740514471501758e-06, "loss": 0.06024274230003357, "memory(GiB)": 21.32, "step": 4552, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.946466 }, { "epoch": 0.14790631192541337, "grad_norm": 0.8201154470443726, "learning_rate": 9.740343649022324e-06, "loss": 0.06800313293933868, "memory(GiB)": 21.32, "step": 4553, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.946508 }, { "epoch": 0.14793879738816879, "grad_norm": 0.950910210609436, "learning_rate": 9.740172771833012e-06, "loss": 0.059990353882312775, "memory(GiB)": 21.32, "step": 4554, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.94655 }, { "epoch": 0.1479712828509242, "grad_norm": 0.7705160975456238, "learning_rate": 9.740001839935798e-06, "loss": 0.05865487828850746, "memory(GiB)": 21.32, "step": 4555, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.946592 }, { "epoch": 0.14800376831367962, "grad_norm": 0.7006075382232666, "learning_rate": 9.739830853332655e-06, "loss": 0.05908583477139473, "memory(GiB)": 21.32, "step": 4556, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.946633 }, { "epoch": 0.14803625377643503, "grad_norm": 0.4781530499458313, "learning_rate": 9.739659812025554e-06, "loss": 0.05230000242590904, "memory(GiB)": 21.32, "step": 4557, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.946677 }, { "epoch": 0.14806873923919045, "grad_norm": 0.7836469411849976, "learning_rate": 9.73948871601647e-06, "loss": 0.04411586374044418, "memory(GiB)": 21.32, "step": 4558, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.946719 }, { "epoch": 0.14810122470194587, "grad_norm": 0.7056674957275391, "learning_rate": 9.739317565307378e-06, "loss": 0.056293338537216187, "memory(GiB)": 21.32, "step": 4559, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.94676 }, { "epoch": 0.1481337101647013, "grad_norm": 0.6372364163398743, "learning_rate": 9.739146359900256e-06, "loss": 0.05047188699245453, "memory(GiB)": 21.32, "step": 4560, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.946799 }, { "epoch": 0.14816619562745673, "grad_norm": 0.582676112651825, "learning_rate": 9.738975099797074e-06, "loss": 0.054388053715229034, "memory(GiB)": 21.32, "step": 4561, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.946838 }, { "epoch": 0.14819868109021214, "grad_norm": 0.4026592969894409, "learning_rate": 9.738803784999812e-06, "loss": 0.042848195880651474, "memory(GiB)": 21.32, "step": 4562, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.946878 }, { "epoch": 0.14823116655296756, "grad_norm": 3.28289794921875, "learning_rate": 9.73863241551045e-06, "loss": 0.040318287909030914, "memory(GiB)": 21.32, "step": 4563, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.946918 }, { "epoch": 0.14826365201572297, "grad_norm": 0.6805185079574585, "learning_rate": 9.73846099133096e-06, "loss": 0.05985337123274803, "memory(GiB)": 21.32, "step": 4564, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.946961 }, { "epoch": 0.1482961374784784, "grad_norm": 0.6359854936599731, "learning_rate": 9.738289512463323e-06, "loss": 0.058995284140110016, "memory(GiB)": 21.32, "step": 4565, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.947004 }, { "epoch": 0.1483286229412338, "grad_norm": 0.8272042274475098, "learning_rate": 9.73811797890952e-06, "loss": 0.05606473609805107, "memory(GiB)": 21.32, "step": 4566, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.947044 }, { "epoch": 0.14836110840398922, "grad_norm": 0.6647907495498657, "learning_rate": 9.737946390671529e-06, "loss": 0.05514238402247429, "memory(GiB)": 21.32, "step": 4567, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.947086 }, { "epoch": 0.14839359386674464, "grad_norm": 0.559891939163208, "learning_rate": 9.73777474775133e-06, "loss": 0.05661257728934288, "memory(GiB)": 21.32, "step": 4568, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.947128 }, { "epoch": 0.14842607932950005, "grad_norm": 0.8841661214828491, "learning_rate": 9.737603050150903e-06, "loss": 0.06197240948677063, "memory(GiB)": 21.32, "step": 4569, "token_acc": 0.9519230769230769, "train_speed(iter/s)": 0.947171 }, { "epoch": 0.14845856479225547, "grad_norm": 0.5751022100448608, "learning_rate": 9.737431297872234e-06, "loss": 0.04743124172091484, "memory(GiB)": 21.32, "step": 4570, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.947214 }, { "epoch": 0.1484910502550109, "grad_norm": 0.6415714025497437, "learning_rate": 9.7372594909173e-06, "loss": 0.05522461608052254, "memory(GiB)": 21.32, "step": 4571, "token_acc": 0.9716312056737588, "train_speed(iter/s)": 0.947254 }, { "epoch": 0.1485235357177663, "grad_norm": 0.7470473051071167, "learning_rate": 9.737087629288086e-06, "loss": 0.0626462921500206, "memory(GiB)": 21.32, "step": 4572, "token_acc": 0.9424778761061947, "train_speed(iter/s)": 0.947295 }, { "epoch": 0.14855602118052172, "grad_norm": 2.421142578125, "learning_rate": 9.73691571298658e-06, "loss": 0.056979529559612274, "memory(GiB)": 21.32, "step": 4573, "token_acc": 0.976, "train_speed(iter/s)": 0.947337 }, { "epoch": 0.14858850664327714, "grad_norm": 1.1022121906280518, "learning_rate": 9.736743742014755e-06, "loss": 0.05762451887130737, "memory(GiB)": 21.32, "step": 4574, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.947379 }, { "epoch": 0.14862099210603255, "grad_norm": 0.5926300287246704, "learning_rate": 9.736571716374608e-06, "loss": 0.04968633875250816, "memory(GiB)": 21.32, "step": 4575, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.94742 }, { "epoch": 0.14865347756878797, "grad_norm": 0.5214698910713196, "learning_rate": 9.736399636068117e-06, "loss": 0.058427851647138596, "memory(GiB)": 21.32, "step": 4576, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.947396 }, { "epoch": 0.14868596303154338, "grad_norm": 0.585661768913269, "learning_rate": 9.73622750109727e-06, "loss": 0.06612260639667511, "memory(GiB)": 21.32, "step": 4577, "token_acc": 0.9644128113879004, "train_speed(iter/s)": 0.947436 }, { "epoch": 0.1487184484942988, "grad_norm": 0.4599693715572357, "learning_rate": 9.736055311464054e-06, "loss": 0.050155702978372574, "memory(GiB)": 21.32, "step": 4578, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.947473 }, { "epoch": 0.14875093395705422, "grad_norm": 0.9683472514152527, "learning_rate": 9.735883067170456e-06, "loss": 0.055134259164333344, "memory(GiB)": 21.32, "step": 4579, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.947509 }, { "epoch": 0.14878341941980963, "grad_norm": 0.8982798457145691, "learning_rate": 9.735710768218464e-06, "loss": 0.05404006689786911, "memory(GiB)": 21.32, "step": 4580, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.947545 }, { "epoch": 0.14881590488256505, "grad_norm": 0.4156169593334198, "learning_rate": 9.735538414610067e-06, "loss": 0.055344343185424805, "memory(GiB)": 21.32, "step": 4581, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.947579 }, { "epoch": 0.14884839034532046, "grad_norm": 0.46670040488243103, "learning_rate": 9.735366006347254e-06, "loss": 0.04766812175512314, "memory(GiB)": 21.32, "step": 4582, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.947612 }, { "epoch": 0.14888087580807588, "grad_norm": 0.7919887900352478, "learning_rate": 9.735193543432013e-06, "loss": 0.048743411898612976, "memory(GiB)": 21.32, "step": 4583, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.947647 }, { "epoch": 0.1489133612708313, "grad_norm": 0.4225557744503021, "learning_rate": 9.735021025866336e-06, "loss": 0.040065571665763855, "memory(GiB)": 21.32, "step": 4584, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.947683 }, { "epoch": 0.1489458467335867, "grad_norm": 0.8179677724838257, "learning_rate": 9.734848453652215e-06, "loss": 0.06393057107925415, "memory(GiB)": 21.32, "step": 4585, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.947719 }, { "epoch": 0.14897833219634213, "grad_norm": 0.44981756806373596, "learning_rate": 9.734675826791642e-06, "loss": 0.05345620959997177, "memory(GiB)": 21.32, "step": 4586, "token_acc": 0.9556451612903226, "train_speed(iter/s)": 0.947754 }, { "epoch": 0.14901081765909754, "grad_norm": 0.5255261659622192, "learning_rate": 9.734503145286604e-06, "loss": 0.04183373600244522, "memory(GiB)": 21.32, "step": 4587, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.947789 }, { "epoch": 0.14904330312185296, "grad_norm": 3.072585344314575, "learning_rate": 9.734330409139102e-06, "loss": 0.06956721097230911, "memory(GiB)": 21.32, "step": 4588, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.94782 }, { "epoch": 0.14907578858460838, "grad_norm": 0.8979510068893433, "learning_rate": 9.734157618351124e-06, "loss": 0.048496313393116, "memory(GiB)": 21.32, "step": 4589, "token_acc": 0.9855595667870036, "train_speed(iter/s)": 0.94785 }, { "epoch": 0.1491082740473638, "grad_norm": 0.7976132035255432, "learning_rate": 9.733984772924667e-06, "loss": 0.062077876180410385, "memory(GiB)": 21.32, "step": 4590, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.947878 }, { "epoch": 0.1491407595101192, "grad_norm": 0.5217005014419556, "learning_rate": 9.733811872861724e-06, "loss": 0.04171020910143852, "memory(GiB)": 21.32, "step": 4591, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.947911 }, { "epoch": 0.14917324497287465, "grad_norm": 0.8834112286567688, "learning_rate": 9.733638918164292e-06, "loss": 0.06601975858211517, "memory(GiB)": 21.32, "step": 4592, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.94794 }, { "epoch": 0.14920573043563007, "grad_norm": 0.5319540500640869, "learning_rate": 9.733465908834365e-06, "loss": 0.04456306993961334, "memory(GiB)": 21.32, "step": 4593, "token_acc": 0.9826989619377162, "train_speed(iter/s)": 0.947972 }, { "epoch": 0.14923821589838548, "grad_norm": 0.5927124619483948, "learning_rate": 9.733292844873944e-06, "loss": 0.05569283664226532, "memory(GiB)": 21.32, "step": 4594, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.948004 }, { "epoch": 0.1492707013611409, "grad_norm": 0.5934754610061646, "learning_rate": 9.733119726285021e-06, "loss": 0.05176208168268204, "memory(GiB)": 21.32, "step": 4595, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.948035 }, { "epoch": 0.14930318682389632, "grad_norm": 0.9462413191795349, "learning_rate": 9.732946553069596e-06, "loss": 0.057412635535001755, "memory(GiB)": 21.32, "step": 4596, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.948064 }, { "epoch": 0.14933567228665173, "grad_norm": 0.5867743492126465, "learning_rate": 9.73277332522967e-06, "loss": 0.05006532371044159, "memory(GiB)": 21.32, "step": 4597, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.948088 }, { "epoch": 0.14936815774940715, "grad_norm": 0.9252062439918518, "learning_rate": 9.73260004276724e-06, "loss": 0.06161656975746155, "memory(GiB)": 21.32, "step": 4598, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.948116 }, { "epoch": 0.14940064321216256, "grad_norm": 0.4718562960624695, "learning_rate": 9.732426705684307e-06, "loss": 0.048917338252067566, "memory(GiB)": 21.32, "step": 4599, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.948148 }, { "epoch": 0.14943312867491798, "grad_norm": 1.1281206607818604, "learning_rate": 9.73225331398287e-06, "loss": 0.048322126269340515, "memory(GiB)": 21.32, "step": 4600, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.948169 }, { "epoch": 0.1494656141376734, "grad_norm": 0.5586303472518921, "learning_rate": 9.732079867664932e-06, "loss": 0.05013272911310196, "memory(GiB)": 21.32, "step": 4601, "token_acc": 0.96875, "train_speed(iter/s)": 0.948199 }, { "epoch": 0.1494980996004288, "grad_norm": 0.4888332784175873, "learning_rate": 9.731906366732493e-06, "loss": 0.04886971786618233, "memory(GiB)": 21.32, "step": 4602, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.948232 }, { "epoch": 0.14953058506318423, "grad_norm": 0.8752078413963318, "learning_rate": 9.731732811187558e-06, "loss": 0.05448603630065918, "memory(GiB)": 21.32, "step": 4603, "token_acc": 0.9613899613899614, "train_speed(iter/s)": 0.948265 }, { "epoch": 0.14956307052593965, "grad_norm": 0.7615851759910583, "learning_rate": 9.731559201032126e-06, "loss": 0.06473356485366821, "memory(GiB)": 21.32, "step": 4604, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.948301 }, { "epoch": 0.14959555598869506, "grad_norm": 0.687341570854187, "learning_rate": 9.731385536268206e-06, "loss": 0.05043128877878189, "memory(GiB)": 21.32, "step": 4605, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.948337 }, { "epoch": 0.14962804145145048, "grad_norm": 0.6904283761978149, "learning_rate": 9.731211816897796e-06, "loss": 0.061034366488456726, "memory(GiB)": 21.32, "step": 4606, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.948371 }, { "epoch": 0.1496605269142059, "grad_norm": 0.44226524233818054, "learning_rate": 9.731038042922906e-06, "loss": 0.04863379895687103, "memory(GiB)": 21.32, "step": 4607, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.948414 }, { "epoch": 0.1496930123769613, "grad_norm": 0.9485113620758057, "learning_rate": 9.730864214345541e-06, "loss": 0.069552481174469, "memory(GiB)": 21.32, "step": 4608, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.948456 }, { "epoch": 0.14972549783971673, "grad_norm": 2.0663704872131348, "learning_rate": 9.730690331167706e-06, "loss": 0.05273790284991264, "memory(GiB)": 21.32, "step": 4609, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.948497 }, { "epoch": 0.14975798330247214, "grad_norm": 0.9278132915496826, "learning_rate": 9.730516393391408e-06, "loss": 0.06167708337306976, "memory(GiB)": 21.32, "step": 4610, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.948539 }, { "epoch": 0.14979046876522756, "grad_norm": 0.595829963684082, "learning_rate": 9.730342401018652e-06, "loss": 0.053699761629104614, "memory(GiB)": 21.32, "step": 4611, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.948582 }, { "epoch": 0.14982295422798297, "grad_norm": 0.5146580338478088, "learning_rate": 9.730168354051452e-06, "loss": 0.052934445440769196, "memory(GiB)": 21.32, "step": 4612, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.948606 }, { "epoch": 0.1498554396907384, "grad_norm": 1.1393945217132568, "learning_rate": 9.729994252491813e-06, "loss": 0.08181952685117722, "memory(GiB)": 21.32, "step": 4613, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.948646 }, { "epoch": 0.1498879251534938, "grad_norm": 0.6699438095092773, "learning_rate": 9.729820096341742e-06, "loss": 0.050792425870895386, "memory(GiB)": 21.32, "step": 4614, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.948688 }, { "epoch": 0.14992041061624922, "grad_norm": 0.6391803026199341, "learning_rate": 9.729645885603254e-06, "loss": 0.05741351842880249, "memory(GiB)": 21.32, "step": 4615, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.948729 }, { "epoch": 0.14995289607900464, "grad_norm": 0.6715768575668335, "learning_rate": 9.729471620278358e-06, "loss": 0.0436357781291008, "memory(GiB)": 21.32, "step": 4616, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.948771 }, { "epoch": 0.14998538154176005, "grad_norm": 0.7193104028701782, "learning_rate": 9.729297300369063e-06, "loss": 0.05712207406759262, "memory(GiB)": 21.32, "step": 4617, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.948813 }, { "epoch": 0.15001786700451547, "grad_norm": 0.5013284683227539, "learning_rate": 9.729122925877383e-06, "loss": 0.050153948366642, "memory(GiB)": 21.32, "step": 4618, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.948854 }, { "epoch": 0.1500503524672709, "grad_norm": 0.9375681281089783, "learning_rate": 9.72894849680533e-06, "loss": 0.056296974420547485, "memory(GiB)": 21.32, "step": 4619, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.948897 }, { "epoch": 0.1500828379300263, "grad_norm": 0.573019802570343, "learning_rate": 9.728774013154916e-06, "loss": 0.058839425444602966, "memory(GiB)": 21.32, "step": 4620, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.948939 }, { "epoch": 0.15011532339278172, "grad_norm": 0.5517528057098389, "learning_rate": 9.728599474928157e-06, "loss": 0.05461331456899643, "memory(GiB)": 21.32, "step": 4621, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.948981 }, { "epoch": 0.15014780885553713, "grad_norm": 0.5524119734764099, "learning_rate": 9.728424882127066e-06, "loss": 0.05054571107029915, "memory(GiB)": 21.32, "step": 4622, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.949023 }, { "epoch": 0.15018029431829255, "grad_norm": 0.6672487854957581, "learning_rate": 9.72825023475366e-06, "loss": 0.055444564670324326, "memory(GiB)": 21.32, "step": 4623, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.949064 }, { "epoch": 0.150212779781048, "grad_norm": 1.1172579526901245, "learning_rate": 9.72807553280995e-06, "loss": 0.06925934553146362, "memory(GiB)": 21.32, "step": 4624, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.949106 }, { "epoch": 0.1502452652438034, "grad_norm": 0.681891143321991, "learning_rate": 9.727900776297956e-06, "loss": 0.06305479258298874, "memory(GiB)": 21.32, "step": 4625, "token_acc": 0.9700598802395209, "train_speed(iter/s)": 0.949148 }, { "epoch": 0.15027775070655883, "grad_norm": 0.5380549430847168, "learning_rate": 9.727725965219697e-06, "loss": 0.04612176865339279, "memory(GiB)": 21.32, "step": 4626, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.94919 }, { "epoch": 0.15031023616931424, "grad_norm": 0.8997308611869812, "learning_rate": 9.727551099577186e-06, "loss": 0.059544485062360764, "memory(GiB)": 21.32, "step": 4627, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.949232 }, { "epoch": 0.15034272163206966, "grad_norm": 1.188033938407898, "learning_rate": 9.727376179372442e-06, "loss": 0.050204209983348846, "memory(GiB)": 21.32, "step": 4628, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.949272 }, { "epoch": 0.15037520709482508, "grad_norm": 0.5291170477867126, "learning_rate": 9.727201204607486e-06, "loss": 0.047589004039764404, "memory(GiB)": 21.32, "step": 4629, "token_acc": 0.9716981132075472, "train_speed(iter/s)": 0.949314 }, { "epoch": 0.1504076925575805, "grad_norm": 0.5007078051567078, "learning_rate": 9.727026175284335e-06, "loss": 0.05677228420972824, "memory(GiB)": 21.32, "step": 4630, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.949355 }, { "epoch": 0.1504401780203359, "grad_norm": 0.6275635957717896, "learning_rate": 9.726851091405012e-06, "loss": 0.061548907309770584, "memory(GiB)": 21.32, "step": 4631, "token_acc": 0.9713114754098361, "train_speed(iter/s)": 0.949395 }, { "epoch": 0.15047266348309132, "grad_norm": 0.5838510394096375, "learning_rate": 9.726675952971534e-06, "loss": 0.046522121876478195, "memory(GiB)": 21.32, "step": 4632, "token_acc": 0.9635036496350365, "train_speed(iter/s)": 0.949427 }, { "epoch": 0.15050514894584674, "grad_norm": 0.5445491075515747, "learning_rate": 9.726500759985927e-06, "loss": 0.05210801959037781, "memory(GiB)": 21.32, "step": 4633, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.949469 }, { "epoch": 0.15053763440860216, "grad_norm": 0.5151516199111938, "learning_rate": 9.726325512450207e-06, "loss": 0.05621422827243805, "memory(GiB)": 21.32, "step": 4634, "token_acc": 0.9663461538461539, "train_speed(iter/s)": 0.949508 }, { "epoch": 0.15057011987135757, "grad_norm": 0.6413209438323975, "learning_rate": 9.726150210366402e-06, "loss": 0.05523654446005821, "memory(GiB)": 21.32, "step": 4635, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.949549 }, { "epoch": 0.150602605334113, "grad_norm": 0.5667359828948975, "learning_rate": 9.725974853736532e-06, "loss": 0.055448271334171295, "memory(GiB)": 21.32, "step": 4636, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.949589 }, { "epoch": 0.1506350907968684, "grad_norm": 0.5319929718971252, "learning_rate": 9.725799442562623e-06, "loss": 0.054345302283763885, "memory(GiB)": 21.32, "step": 4637, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.94963 }, { "epoch": 0.15066757625962382, "grad_norm": 0.5717031955718994, "learning_rate": 9.725623976846698e-06, "loss": 0.051648274064064026, "memory(GiB)": 21.32, "step": 4638, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.949671 }, { "epoch": 0.15070006172237924, "grad_norm": 2.156599998474121, "learning_rate": 9.725448456590782e-06, "loss": 0.0655207633972168, "memory(GiB)": 21.32, "step": 4639, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.949711 }, { "epoch": 0.15073254718513465, "grad_norm": 0.5847827792167664, "learning_rate": 9.7252728817969e-06, "loss": 0.04409339278936386, "memory(GiB)": 21.32, "step": 4640, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.949752 }, { "epoch": 0.15076503264789007, "grad_norm": 0.6406701803207397, "learning_rate": 9.725097252467084e-06, "loss": 0.05749237537384033, "memory(GiB)": 21.32, "step": 4641, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.949793 }, { "epoch": 0.15079751811064548, "grad_norm": 1.1819045543670654, "learning_rate": 9.724921568603354e-06, "loss": 0.041111186146736145, "memory(GiB)": 21.32, "step": 4642, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.949833 }, { "epoch": 0.1508300035734009, "grad_norm": 0.6133162975311279, "learning_rate": 9.724745830207739e-06, "loss": 0.06028332933783531, "memory(GiB)": 21.32, "step": 4643, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.949866 }, { "epoch": 0.15086248903615632, "grad_norm": 0.47971653938293457, "learning_rate": 9.72457003728227e-06, "loss": 0.049484215676784515, "memory(GiB)": 21.32, "step": 4644, "token_acc": 0.98, "train_speed(iter/s)": 0.949898 }, { "epoch": 0.15089497449891173, "grad_norm": 0.92463219165802, "learning_rate": 9.724394189828974e-06, "loss": 0.05664902552962303, "memory(GiB)": 21.32, "step": 4645, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.949923 }, { "epoch": 0.15092745996166715, "grad_norm": 1.8477168083190918, "learning_rate": 9.72421828784988e-06, "loss": 0.05818749964237213, "memory(GiB)": 21.32, "step": 4646, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.94995 }, { "epoch": 0.15095994542442256, "grad_norm": 0.48012325167655945, "learning_rate": 9.72404233134702e-06, "loss": 0.04643896967172623, "memory(GiB)": 21.32, "step": 4647, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.949979 }, { "epoch": 0.15099243088717798, "grad_norm": 0.5161570906639099, "learning_rate": 9.723866320322423e-06, "loss": 0.0552520826458931, "memory(GiB)": 21.32, "step": 4648, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.950009 }, { "epoch": 0.1510249163499334, "grad_norm": 1.7814947366714478, "learning_rate": 9.723690254778123e-06, "loss": 0.05059444159269333, "memory(GiB)": 21.32, "step": 4649, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.95004 }, { "epoch": 0.1510574018126888, "grad_norm": 0.49717649817466736, "learning_rate": 9.72351413471615e-06, "loss": 0.045909568667411804, "memory(GiB)": 21.32, "step": 4650, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.950067 }, { "epoch": 0.15108988727544423, "grad_norm": 0.5661123394966125, "learning_rate": 9.723337960138536e-06, "loss": 0.05480697751045227, "memory(GiB)": 21.32, "step": 4651, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.950097 }, { "epoch": 0.15112237273819965, "grad_norm": 0.49119842052459717, "learning_rate": 9.723161731047315e-06, "loss": 0.05844728276133537, "memory(GiB)": 21.32, "step": 4652, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.950121 }, { "epoch": 0.15115485820095506, "grad_norm": 0.7347963452339172, "learning_rate": 9.722985447444521e-06, "loss": 0.0548703558743, "memory(GiB)": 21.32, "step": 4653, "token_acc": 0.97265625, "train_speed(iter/s)": 0.950152 }, { "epoch": 0.15118734366371048, "grad_norm": 0.5267266631126404, "learning_rate": 9.72280910933219e-06, "loss": 0.05292639508843422, "memory(GiB)": 21.32, "step": 4654, "token_acc": 1.0, "train_speed(iter/s)": 0.950182 }, { "epoch": 0.1512198291264659, "grad_norm": 0.5101274847984314, "learning_rate": 9.722632716712354e-06, "loss": 0.062276147305965424, "memory(GiB)": 21.32, "step": 4655, "token_acc": 0.9621212121212122, "train_speed(iter/s)": 0.95021 }, { "epoch": 0.15125231458922134, "grad_norm": 2.7468018531799316, "learning_rate": 9.722456269587052e-06, "loss": 0.059192441403865814, "memory(GiB)": 21.32, "step": 4656, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950241 }, { "epoch": 0.15128480005197675, "grad_norm": 0.616375744342804, "learning_rate": 9.722279767958318e-06, "loss": 0.057269252836704254, "memory(GiB)": 21.32, "step": 4657, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.950267 }, { "epoch": 0.15131728551473217, "grad_norm": 0.6283907890319824, "learning_rate": 9.722103211828191e-06, "loss": 0.059767961502075195, "memory(GiB)": 21.32, "step": 4658, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.950295 }, { "epoch": 0.15134977097748759, "grad_norm": 1.289000391960144, "learning_rate": 9.721926601198707e-06, "loss": 0.0588199757039547, "memory(GiB)": 21.32, "step": 4659, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.950326 }, { "epoch": 0.151382256440243, "grad_norm": 0.866977870464325, "learning_rate": 9.721749936071907e-06, "loss": 0.0710022896528244, "memory(GiB)": 21.32, "step": 4660, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.950357 }, { "epoch": 0.15141474190299842, "grad_norm": 1.636707067489624, "learning_rate": 9.721573216449827e-06, "loss": 0.06044132634997368, "memory(GiB)": 21.32, "step": 4661, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.950382 }, { "epoch": 0.15144722736575383, "grad_norm": 0.6696438789367676, "learning_rate": 9.721396442334507e-06, "loss": 0.06261496245861053, "memory(GiB)": 21.32, "step": 4662, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.950411 }, { "epoch": 0.15147971282850925, "grad_norm": 0.5413240194320679, "learning_rate": 9.721219613727988e-06, "loss": 0.05373605340719223, "memory(GiB)": 21.32, "step": 4663, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.950441 }, { "epoch": 0.15151219829126467, "grad_norm": 0.5462260842323303, "learning_rate": 9.721042730632311e-06, "loss": 0.05753472447395325, "memory(GiB)": 21.32, "step": 4664, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.950476 }, { "epoch": 0.15154468375402008, "grad_norm": 0.43851038813591003, "learning_rate": 9.720865793049518e-06, "loss": 0.05118364840745926, "memory(GiB)": 21.32, "step": 4665, "token_acc": 0.9700598802395209, "train_speed(iter/s)": 0.950511 }, { "epoch": 0.1515771692167755, "grad_norm": 0.7982562780380249, "learning_rate": 9.720688800981648e-06, "loss": 0.05247431993484497, "memory(GiB)": 21.32, "step": 4666, "token_acc": 0.975, "train_speed(iter/s)": 0.950551 }, { "epoch": 0.15160965467953091, "grad_norm": 0.6110360622406006, "learning_rate": 9.720511754430748e-06, "loss": 0.06562283635139465, "memory(GiB)": 21.32, "step": 4667, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.950591 }, { "epoch": 0.15164214014228633, "grad_norm": 0.5627602934837341, "learning_rate": 9.72033465339886e-06, "loss": 0.054010190069675446, "memory(GiB)": 21.32, "step": 4668, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.950632 }, { "epoch": 0.15167462560504175, "grad_norm": 1.350394606590271, "learning_rate": 9.720157497888026e-06, "loss": 0.04996946454048157, "memory(GiB)": 21.32, "step": 4669, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.950673 }, { "epoch": 0.15170711106779716, "grad_norm": 1.1321853399276733, "learning_rate": 9.719980287900292e-06, "loss": 0.055321529507637024, "memory(GiB)": 21.32, "step": 4670, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.950712 }, { "epoch": 0.15173959653055258, "grad_norm": 1.255426049232483, "learning_rate": 9.719803023437704e-06, "loss": 0.06675191223621368, "memory(GiB)": 21.32, "step": 4671, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.950752 }, { "epoch": 0.151772081993308, "grad_norm": 0.8323760628700256, "learning_rate": 9.719625704502308e-06, "loss": 0.05318558216094971, "memory(GiB)": 21.32, "step": 4672, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.950792 }, { "epoch": 0.1518045674560634, "grad_norm": 0.5950945019721985, "learning_rate": 9.719448331096146e-06, "loss": 0.05645424872636795, "memory(GiB)": 21.32, "step": 4673, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.950833 }, { "epoch": 0.15183705291881883, "grad_norm": 0.8478565812110901, "learning_rate": 9.719270903221272e-06, "loss": 0.06417308002710342, "memory(GiB)": 21.32, "step": 4674, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.950871 }, { "epoch": 0.15186953838157424, "grad_norm": 0.5334762930870056, "learning_rate": 9.719093420879729e-06, "loss": 0.049464188516139984, "memory(GiB)": 21.32, "step": 4675, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.95091 }, { "epoch": 0.15190202384432966, "grad_norm": 0.660057544708252, "learning_rate": 9.718915884073568e-06, "loss": 0.056398697197437286, "memory(GiB)": 21.32, "step": 4676, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.950948 }, { "epoch": 0.15193450930708507, "grad_norm": 0.6104487180709839, "learning_rate": 9.718738292804834e-06, "loss": 0.059255629777908325, "memory(GiB)": 21.32, "step": 4677, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.950989 }, { "epoch": 0.1519669947698405, "grad_norm": 0.7642553448677063, "learning_rate": 9.718560647075583e-06, "loss": 0.07114158570766449, "memory(GiB)": 21.32, "step": 4678, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.951028 }, { "epoch": 0.1519994802325959, "grad_norm": 0.728610098361969, "learning_rate": 9.718382946887858e-06, "loss": 0.057795584201812744, "memory(GiB)": 21.32, "step": 4679, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.951068 }, { "epoch": 0.15203196569535132, "grad_norm": 0.4954202473163605, "learning_rate": 9.718205192243717e-06, "loss": 0.05148964747786522, "memory(GiB)": 21.32, "step": 4680, "token_acc": 0.984, "train_speed(iter/s)": 0.951109 }, { "epoch": 0.15206445115810674, "grad_norm": 0.5871523022651672, "learning_rate": 9.718027383145208e-06, "loss": 0.04226385056972504, "memory(GiB)": 21.32, "step": 4681, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.951142 }, { "epoch": 0.15209693662086216, "grad_norm": 0.581771969795227, "learning_rate": 9.717849519594382e-06, "loss": 0.05496039614081383, "memory(GiB)": 21.32, "step": 4682, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.95118 }, { "epoch": 0.15212942208361757, "grad_norm": 0.7295685410499573, "learning_rate": 9.717671601593293e-06, "loss": 0.04739639535546303, "memory(GiB)": 21.32, "step": 4683, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.951221 }, { "epoch": 0.152161907546373, "grad_norm": 0.6698864698410034, "learning_rate": 9.717493629143995e-06, "loss": 0.051964253187179565, "memory(GiB)": 21.32, "step": 4684, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951261 }, { "epoch": 0.1521943930091284, "grad_norm": 0.6112594604492188, "learning_rate": 9.71731560224854e-06, "loss": 0.06270700693130493, "memory(GiB)": 21.32, "step": 4685, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.951301 }, { "epoch": 0.15222687847188382, "grad_norm": 0.683914303779602, "learning_rate": 9.717137520908986e-06, "loss": 0.06882181763648987, "memory(GiB)": 21.32, "step": 4686, "token_acc": 0.9711934156378601, "train_speed(iter/s)": 0.951339 }, { "epoch": 0.15225936393463924, "grad_norm": 0.3749248683452606, "learning_rate": 9.716959385127387e-06, "loss": 0.04223598539829254, "memory(GiB)": 21.32, "step": 4687, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.951376 }, { "epoch": 0.15229184939739468, "grad_norm": 0.5998100638389587, "learning_rate": 9.716781194905798e-06, "loss": 0.05931294709444046, "memory(GiB)": 21.32, "step": 4688, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.951414 }, { "epoch": 0.1523243348601501, "grad_norm": 0.5173124074935913, "learning_rate": 9.716602950246275e-06, "loss": 0.04834914207458496, "memory(GiB)": 21.32, "step": 4689, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.951452 }, { "epoch": 0.1523568203229055, "grad_norm": 0.5679554343223572, "learning_rate": 9.716424651150877e-06, "loss": 0.046347979456186295, "memory(GiB)": 21.32, "step": 4690, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.951492 }, { "epoch": 0.15238930578566093, "grad_norm": 0.5163838863372803, "learning_rate": 9.716246297621661e-06, "loss": 0.05329260975122452, "memory(GiB)": 21.32, "step": 4691, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.951534 }, { "epoch": 0.15242179124841634, "grad_norm": 0.48602184653282166, "learning_rate": 9.716067889660685e-06, "loss": 0.04907005652785301, "memory(GiB)": 21.32, "step": 4692, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.951572 }, { "epoch": 0.15245427671117176, "grad_norm": 0.5287532210350037, "learning_rate": 9.715889427270008e-06, "loss": 0.05084513872861862, "memory(GiB)": 21.32, "step": 4693, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.951611 }, { "epoch": 0.15248676217392718, "grad_norm": 1.3568403720855713, "learning_rate": 9.715710910451691e-06, "loss": 0.06125970184803009, "memory(GiB)": 21.32, "step": 4694, "token_acc": 0.98, "train_speed(iter/s)": 0.951651 }, { "epoch": 0.1525192476366826, "grad_norm": 0.787685751914978, "learning_rate": 9.715532339207794e-06, "loss": 0.06072087585926056, "memory(GiB)": 21.32, "step": 4695, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.951688 }, { "epoch": 0.152551733099438, "grad_norm": 0.6626702547073364, "learning_rate": 9.715353713540377e-06, "loss": 0.05623093992471695, "memory(GiB)": 21.32, "step": 4696, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.951728 }, { "epoch": 0.15258421856219342, "grad_norm": 0.6042718887329102, "learning_rate": 9.715175033451501e-06, "loss": 0.06307631731033325, "memory(GiB)": 21.32, "step": 4697, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.951767 }, { "epoch": 0.15261670402494884, "grad_norm": 0.86577308177948, "learning_rate": 9.71499629894323e-06, "loss": 0.06625012308359146, "memory(GiB)": 21.32, "step": 4698, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.951806 }, { "epoch": 0.15264918948770426, "grad_norm": 1.0956404209136963, "learning_rate": 9.714817510017627e-06, "loss": 0.05779511481523514, "memory(GiB)": 21.32, "step": 4699, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.951845 }, { "epoch": 0.15268167495045967, "grad_norm": 1.2169432640075684, "learning_rate": 9.714638666676752e-06, "loss": 0.05692864954471588, "memory(GiB)": 21.32, "step": 4700, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.951885 }, { "epoch": 0.1527141604132151, "grad_norm": 0.5956279635429382, "learning_rate": 9.714459768922673e-06, "loss": 0.04672235995531082, "memory(GiB)": 21.32, "step": 4701, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.951923 }, { "epoch": 0.1527466458759705, "grad_norm": 0.5878296494483948, "learning_rate": 9.714280816757453e-06, "loss": 0.05103975534439087, "memory(GiB)": 21.32, "step": 4702, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951949 }, { "epoch": 0.15277913133872592, "grad_norm": 0.6564220190048218, "learning_rate": 9.71410181018316e-06, "loss": 0.053012557327747345, "memory(GiB)": 21.32, "step": 4703, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951979 }, { "epoch": 0.15281161680148134, "grad_norm": 0.4961411654949188, "learning_rate": 9.713922749201857e-06, "loss": 0.048363879323005676, "memory(GiB)": 21.32, "step": 4704, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.952009 }, { "epoch": 0.15284410226423675, "grad_norm": 0.7610440850257874, "learning_rate": 9.713743633815609e-06, "loss": 0.06168484687805176, "memory(GiB)": 21.32, "step": 4705, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.952038 }, { "epoch": 0.15287658772699217, "grad_norm": 0.779643714427948, "learning_rate": 9.713564464026486e-06, "loss": 0.06195897236466408, "memory(GiB)": 21.32, "step": 4706, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.952065 }, { "epoch": 0.15290907318974759, "grad_norm": 0.7476825714111328, "learning_rate": 9.713385239836557e-06, "loss": 0.054807618260383606, "memory(GiB)": 21.32, "step": 4707, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.952092 }, { "epoch": 0.152941558652503, "grad_norm": 0.7373765110969543, "learning_rate": 9.713205961247887e-06, "loss": 0.054412342607975006, "memory(GiB)": 21.32, "step": 4708, "token_acc": 1.0, "train_speed(iter/s)": 0.95212 }, { "epoch": 0.15297404411525842, "grad_norm": 0.8631183505058289, "learning_rate": 9.713026628262548e-06, "loss": 0.06594419479370117, "memory(GiB)": 21.32, "step": 4709, "token_acc": 0.9541284403669725, "train_speed(iter/s)": 0.952148 }, { "epoch": 0.15300652957801383, "grad_norm": 0.6248902678489685, "learning_rate": 9.712847240882607e-06, "loss": 0.05202937498688698, "memory(GiB)": 21.32, "step": 4710, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.952178 }, { "epoch": 0.15303901504076925, "grad_norm": 0.6046490669250488, "learning_rate": 9.712667799110138e-06, "loss": 0.0573892742395401, "memory(GiB)": 21.32, "step": 4711, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.952205 }, { "epoch": 0.15307150050352467, "grad_norm": 0.4733867943286896, "learning_rate": 9.71248830294721e-06, "loss": 0.05076809227466583, "memory(GiB)": 21.32, "step": 4712, "token_acc": 0.9704797047970479, "train_speed(iter/s)": 0.952235 }, { "epoch": 0.15310398596628008, "grad_norm": 0.6492281556129456, "learning_rate": 9.712308752395894e-06, "loss": 0.06508563458919525, "memory(GiB)": 21.32, "step": 4713, "token_acc": 0.9664179104477612, "train_speed(iter/s)": 0.952266 }, { "epoch": 0.1531364714290355, "grad_norm": 1.0672866106033325, "learning_rate": 9.712129147458264e-06, "loss": 0.05772818624973297, "memory(GiB)": 21.32, "step": 4714, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.952295 }, { "epoch": 0.15316895689179091, "grad_norm": 0.589061975479126, "learning_rate": 9.71194948813639e-06, "loss": 0.06051019951701164, "memory(GiB)": 21.32, "step": 4715, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.952324 }, { "epoch": 0.15320144235454633, "grad_norm": 0.47362494468688965, "learning_rate": 9.71176977443235e-06, "loss": 0.04679173603653908, "memory(GiB)": 21.32, "step": 4716, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.952351 }, { "epoch": 0.15323392781730175, "grad_norm": 1.0381602048873901, "learning_rate": 9.711590006348213e-06, "loss": 0.055816300213336945, "memory(GiB)": 21.32, "step": 4717, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.952376 }, { "epoch": 0.15326641328005716, "grad_norm": 0.6318153142929077, "learning_rate": 9.711410183886056e-06, "loss": 0.05322832986712456, "memory(GiB)": 21.32, "step": 4718, "token_acc": 0.972, "train_speed(iter/s)": 0.952405 }, { "epoch": 0.15329889874281258, "grad_norm": 0.5929223299026489, "learning_rate": 9.711230307047956e-06, "loss": 0.05797490477561951, "memory(GiB)": 21.32, "step": 4719, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.952432 }, { "epoch": 0.15333138420556802, "grad_norm": 0.5149093866348267, "learning_rate": 9.711050375835987e-06, "loss": 0.059342674911022186, "memory(GiB)": 21.32, "step": 4720, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.952464 }, { "epoch": 0.15336386966832344, "grad_norm": 0.43895846605300903, "learning_rate": 9.710870390252228e-06, "loss": 0.04220487177371979, "memory(GiB)": 21.32, "step": 4721, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.952495 }, { "epoch": 0.15339635513107885, "grad_norm": 1.014393925666809, "learning_rate": 9.710690350298752e-06, "loss": 0.06207898631691933, "memory(GiB)": 21.32, "step": 4722, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.952522 }, { "epoch": 0.15342884059383427, "grad_norm": 0.5599179863929749, "learning_rate": 9.71051025597764e-06, "loss": 0.052597641944885254, "memory(GiB)": 21.32, "step": 4723, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.952553 }, { "epoch": 0.1534613260565897, "grad_norm": 0.6372623443603516, "learning_rate": 9.710330107290971e-06, "loss": 0.060818661004304886, "memory(GiB)": 21.32, "step": 4724, "token_acc": 0.9715447154471545, "train_speed(iter/s)": 0.952589 }, { "epoch": 0.1534938115193451, "grad_norm": 0.6188109517097473, "learning_rate": 9.710149904240822e-06, "loss": 0.06322621554136276, "memory(GiB)": 21.32, "step": 4725, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.952623 }, { "epoch": 0.15352629698210052, "grad_norm": 0.5728854537010193, "learning_rate": 9.709969646829274e-06, "loss": 0.054843492805957794, "memory(GiB)": 21.32, "step": 4726, "token_acc": 0.9792387543252595, "train_speed(iter/s)": 0.952657 }, { "epoch": 0.15355878244485593, "grad_norm": 0.5121046900749207, "learning_rate": 9.709789335058409e-06, "loss": 0.0531495176255703, "memory(GiB)": 21.32, "step": 4727, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.952695 }, { "epoch": 0.15359126790761135, "grad_norm": 0.5088936686515808, "learning_rate": 9.709608968930304e-06, "loss": 0.056991856545209885, "memory(GiB)": 21.32, "step": 4728, "token_acc": 0.958904109589041, "train_speed(iter/s)": 0.952734 }, { "epoch": 0.15362375337036677, "grad_norm": 0.45487120747566223, "learning_rate": 9.709428548447044e-06, "loss": 0.0471249595284462, "memory(GiB)": 21.32, "step": 4729, "token_acc": 0.98046875, "train_speed(iter/s)": 0.952775 }, { "epoch": 0.15365623883312218, "grad_norm": 0.5462626814842224, "learning_rate": 9.70924807361071e-06, "loss": 0.05291501805186272, "memory(GiB)": 21.32, "step": 4730, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.952816 }, { "epoch": 0.1536887242958776, "grad_norm": 0.49000057578086853, "learning_rate": 9.709067544423386e-06, "loss": 0.05164371058344841, "memory(GiB)": 21.32, "step": 4731, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952855 }, { "epoch": 0.15372120975863302, "grad_norm": 0.8496527671813965, "learning_rate": 9.708886960887153e-06, "loss": 0.05950644612312317, "memory(GiB)": 21.32, "step": 4732, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.952895 }, { "epoch": 0.15375369522138843, "grad_norm": 0.5771409273147583, "learning_rate": 9.7087063230041e-06, "loss": 0.06185833364725113, "memory(GiB)": 21.32, "step": 4733, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.952935 }, { "epoch": 0.15378618068414385, "grad_norm": 0.6656102538108826, "learning_rate": 9.708525630776307e-06, "loss": 0.054300930351018906, "memory(GiB)": 21.32, "step": 4734, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.952974 }, { "epoch": 0.15381866614689926, "grad_norm": 0.5434351563453674, "learning_rate": 9.70834488420586e-06, "loss": 0.055901553481817245, "memory(GiB)": 21.32, "step": 4735, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.953012 }, { "epoch": 0.15385115160965468, "grad_norm": 0.540436327457428, "learning_rate": 9.70816408329485e-06, "loss": 0.058649685233831406, "memory(GiB)": 21.32, "step": 4736, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.953052 }, { "epoch": 0.1538836370724101, "grad_norm": 0.4750373661518097, "learning_rate": 9.707983228045357e-06, "loss": 0.060665313154459, "memory(GiB)": 21.32, "step": 4737, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.953091 }, { "epoch": 0.1539161225351655, "grad_norm": 0.3840964436531067, "learning_rate": 9.707802318459471e-06, "loss": 0.04689827561378479, "memory(GiB)": 21.32, "step": 4738, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.953131 }, { "epoch": 0.15394860799792093, "grad_norm": 0.47590991854667664, "learning_rate": 9.707621354539282e-06, "loss": 0.04601508378982544, "memory(GiB)": 21.32, "step": 4739, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.953169 }, { "epoch": 0.15398109346067634, "grad_norm": 0.5204786658287048, "learning_rate": 9.707440336286875e-06, "loss": 0.04943046346306801, "memory(GiB)": 21.32, "step": 4740, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.953209 }, { "epoch": 0.15401357892343176, "grad_norm": 0.6290670037269592, "learning_rate": 9.707259263704342e-06, "loss": 0.04508380591869354, "memory(GiB)": 21.32, "step": 4741, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.953133 }, { "epoch": 0.15404606438618718, "grad_norm": 0.7476668953895569, "learning_rate": 9.70707813679377e-06, "loss": 0.05278179794549942, "memory(GiB)": 21.32, "step": 4742, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953173 }, { "epoch": 0.1540785498489426, "grad_norm": 0.5431687235832214, "learning_rate": 9.706896955557254e-06, "loss": 0.05242827534675598, "memory(GiB)": 21.32, "step": 4743, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.95321 }, { "epoch": 0.154111035311698, "grad_norm": 0.4743403494358063, "learning_rate": 9.70671571999688e-06, "loss": 0.04327840730547905, "memory(GiB)": 21.32, "step": 4744, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.953248 }, { "epoch": 0.15414352077445342, "grad_norm": 1.3411647081375122, "learning_rate": 9.706534430114744e-06, "loss": 0.056884974241256714, "memory(GiB)": 21.32, "step": 4745, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.953288 }, { "epoch": 0.15417600623720884, "grad_norm": 0.6578449606895447, "learning_rate": 9.706353085912935e-06, "loss": 0.048336610198020935, "memory(GiB)": 21.32, "step": 4746, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953327 }, { "epoch": 0.15420849169996426, "grad_norm": 0.5375484824180603, "learning_rate": 9.706171687393547e-06, "loss": 0.04825974628329277, "memory(GiB)": 21.32, "step": 4747, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.953367 }, { "epoch": 0.15424097716271967, "grad_norm": 0.6540800333023071, "learning_rate": 9.705990234558674e-06, "loss": 0.05265164375305176, "memory(GiB)": 21.32, "step": 4748, "token_acc": 0.9567567567567568, "train_speed(iter/s)": 0.953406 }, { "epoch": 0.1542734626254751, "grad_norm": 0.5405299067497253, "learning_rate": 9.70580872741041e-06, "loss": 0.05799011141061783, "memory(GiB)": 21.32, "step": 4749, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.953445 }, { "epoch": 0.1543059480882305, "grad_norm": 0.5592373609542847, "learning_rate": 9.70562716595085e-06, "loss": 0.05466732382774353, "memory(GiB)": 21.32, "step": 4750, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.953484 }, { "epoch": 0.15433843355098592, "grad_norm": 0.6963627338409424, "learning_rate": 9.705445550182089e-06, "loss": 0.0527595579624176, "memory(GiB)": 21.32, "step": 4751, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.953523 }, { "epoch": 0.15437091901374136, "grad_norm": 2.5320630073547363, "learning_rate": 9.705263880106223e-06, "loss": 0.04485442489385605, "memory(GiB)": 21.32, "step": 4752, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953562 }, { "epoch": 0.15440340447649678, "grad_norm": 0.8720292448997498, "learning_rate": 9.705082155725351e-06, "loss": 0.05601578950881958, "memory(GiB)": 21.32, "step": 4753, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953602 }, { "epoch": 0.1544358899392522, "grad_norm": 0.5308024287223816, "learning_rate": 9.704900377041567e-06, "loss": 0.04942673444747925, "memory(GiB)": 21.32, "step": 4754, "token_acc": 0.9651162790697675, "train_speed(iter/s)": 0.953641 }, { "epoch": 0.1544683754020076, "grad_norm": 0.5626509189605713, "learning_rate": 9.704718544056971e-06, "loss": 0.04562855511903763, "memory(GiB)": 21.32, "step": 4755, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.953681 }, { "epoch": 0.15450086086476303, "grad_norm": 0.7474195957183838, "learning_rate": 9.704536656773662e-06, "loss": 0.05988215655088425, "memory(GiB)": 21.32, "step": 4756, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.953718 }, { "epoch": 0.15453334632751844, "grad_norm": 0.5318437218666077, "learning_rate": 9.704354715193735e-06, "loss": 0.0488114207983017, "memory(GiB)": 21.32, "step": 4757, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.953755 }, { "epoch": 0.15456583179027386, "grad_norm": 1.159225583076477, "learning_rate": 9.704172719319296e-06, "loss": 0.05830018222332001, "memory(GiB)": 21.32, "step": 4758, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953784 }, { "epoch": 0.15459831725302928, "grad_norm": 0.5054308176040649, "learning_rate": 9.703990669152444e-06, "loss": 0.05153132975101471, "memory(GiB)": 21.32, "step": 4759, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.953817 }, { "epoch": 0.1546308027157847, "grad_norm": 0.6102319359779358, "learning_rate": 9.703808564695275e-06, "loss": 0.05426442623138428, "memory(GiB)": 21.32, "step": 4760, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.953849 }, { "epoch": 0.1546632881785401, "grad_norm": 0.7665672302246094, "learning_rate": 9.703626405949895e-06, "loss": 0.05865348502993584, "memory(GiB)": 21.32, "step": 4761, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95385 }, { "epoch": 0.15469577364129553, "grad_norm": 1.6003671884536743, "learning_rate": 9.703444192918408e-06, "loss": 0.05424337461590767, "memory(GiB)": 21.32, "step": 4762, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.953841 }, { "epoch": 0.15472825910405094, "grad_norm": 0.592542290687561, "learning_rate": 9.703261925602914e-06, "loss": 0.052638985216617584, "memory(GiB)": 21.32, "step": 4763, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953872 }, { "epoch": 0.15476074456680636, "grad_norm": 0.6688556671142578, "learning_rate": 9.703079604005516e-06, "loss": 0.0590410977602005, "memory(GiB)": 21.32, "step": 4764, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.15479323002956177, "grad_norm": 1.1966108083724976, "learning_rate": 9.70289722812832e-06, "loss": 0.06255199760198593, "memory(GiB)": 21.32, "step": 4765, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.9539 }, { "epoch": 0.1548257154923172, "grad_norm": 0.7685288786888123, "learning_rate": 9.702714797973432e-06, "loss": 0.048814013600349426, "memory(GiB)": 21.32, "step": 4766, "token_acc": 0.973384030418251, "train_speed(iter/s)": 0.953928 }, { "epoch": 0.1548582009550726, "grad_norm": 0.64286869764328, "learning_rate": 9.702532313542954e-06, "loss": 0.06363876909017563, "memory(GiB)": 21.32, "step": 4767, "token_acc": 0.9614035087719298, "train_speed(iter/s)": 0.953898 }, { "epoch": 0.15489068641782802, "grad_norm": 0.8420268893241882, "learning_rate": 9.702349774838995e-06, "loss": 0.05642367899417877, "memory(GiB)": 21.32, "step": 4768, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.953882 }, { "epoch": 0.15492317188058344, "grad_norm": 0.6494995355606079, "learning_rate": 9.70216718186366e-06, "loss": 0.04991994798183441, "memory(GiB)": 21.32, "step": 4769, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.953876 }, { "epoch": 0.15495565734333885, "grad_norm": 0.6884688138961792, "learning_rate": 9.701984534619057e-06, "loss": 0.05635320395231247, "memory(GiB)": 21.32, "step": 4770, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.15498814280609427, "grad_norm": 0.5544692873954773, "learning_rate": 9.701801833107296e-06, "loss": 0.05270954594016075, "memory(GiB)": 21.32, "step": 4771, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.9539 }, { "epoch": 0.1550206282688497, "grad_norm": 0.6773896813392639, "learning_rate": 9.701619077330482e-06, "loss": 0.05142209678888321, "memory(GiB)": 21.32, "step": 4772, "token_acc": 0.96, "train_speed(iter/s)": 0.953923 }, { "epoch": 0.1550531137316051, "grad_norm": 0.5903716087341309, "learning_rate": 9.701436267290727e-06, "loss": 0.05648373067378998, "memory(GiB)": 21.32, "step": 4773, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.15508559919436052, "grad_norm": 1.1951076984405518, "learning_rate": 9.701253402990139e-06, "loss": 0.051128409802913666, "memory(GiB)": 21.32, "step": 4774, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.953979 }, { "epoch": 0.15511808465711593, "grad_norm": 0.6466168761253357, "learning_rate": 9.70107048443083e-06, "loss": 0.060601409524679184, "memory(GiB)": 21.32, "step": 4775, "token_acc": 0.9617021276595744, "train_speed(iter/s)": 0.953982 }, { "epoch": 0.15515057011987135, "grad_norm": 0.4432385563850403, "learning_rate": 9.70088751161491e-06, "loss": 0.05043146386742592, "memory(GiB)": 21.32, "step": 4776, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.953986 }, { "epoch": 0.15518305558262677, "grad_norm": 0.5590561628341675, "learning_rate": 9.700704484544492e-06, "loss": 0.05303054302930832, "memory(GiB)": 21.32, "step": 4777, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.954014 }, { "epoch": 0.15521554104538218, "grad_norm": 0.5823749899864197, "learning_rate": 9.700521403221688e-06, "loss": 0.054934967309236526, "memory(GiB)": 21.32, "step": 4778, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.954049 }, { "epoch": 0.1552480265081376, "grad_norm": 0.5358397364616394, "learning_rate": 9.70033826764861e-06, "loss": 0.050574079155921936, "memory(GiB)": 21.32, "step": 4779, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.954079 }, { "epoch": 0.15528051197089301, "grad_norm": 0.5791143774986267, "learning_rate": 9.700155077827372e-06, "loss": 0.048547692596912384, "memory(GiB)": 21.32, "step": 4780, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954048 }, { "epoch": 0.15531299743364843, "grad_norm": 0.5032944679260254, "learning_rate": 9.699971833760088e-06, "loss": 0.04961078613996506, "memory(GiB)": 21.32, "step": 4781, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953999 }, { "epoch": 0.15534548289640385, "grad_norm": 0.8993285298347473, "learning_rate": 9.699788535448874e-06, "loss": 0.060779910534620285, "memory(GiB)": 21.32, "step": 4782, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.15537796835915926, "grad_norm": 1.6961419582366943, "learning_rate": 9.699605182895845e-06, "loss": 0.05957459285855293, "memory(GiB)": 21.32, "step": 4783, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.953971 }, { "epoch": 0.1554104538219147, "grad_norm": 0.6898117661476135, "learning_rate": 9.699421776103118e-06, "loss": 0.049495577812194824, "memory(GiB)": 21.32, "step": 4784, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.95401 }, { "epoch": 0.15544293928467012, "grad_norm": 0.4628354609012604, "learning_rate": 9.699238315072808e-06, "loss": 0.04804676026105881, "memory(GiB)": 21.32, "step": 4785, "token_acc": 0.97, "train_speed(iter/s)": 0.954048 }, { "epoch": 0.15547542474742554, "grad_norm": 0.6341139078140259, "learning_rate": 9.699054799807031e-06, "loss": 0.06094809249043465, "memory(GiB)": 21.32, "step": 4786, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.954052 }, { "epoch": 0.15550791021018096, "grad_norm": 0.8110477328300476, "learning_rate": 9.69887123030791e-06, "loss": 0.05238134413957596, "memory(GiB)": 21.32, "step": 4787, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953943 }, { "epoch": 0.15554039567293637, "grad_norm": 0.7646924257278442, "learning_rate": 9.69868760657756e-06, "loss": 0.0714312195777893, "memory(GiB)": 21.32, "step": 4788, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.95396 }, { "epoch": 0.1555728811356918, "grad_norm": 1.5835630893707275, "learning_rate": 9.6985039286181e-06, "loss": 0.05889853835105896, "memory(GiB)": 21.32, "step": 4789, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.953958 }, { "epoch": 0.1556053665984472, "grad_norm": 0.6062791347503662, "learning_rate": 9.698320196431651e-06, "loss": 0.05289778858423233, "memory(GiB)": 21.32, "step": 4790, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.953998 }, { "epoch": 0.15563785206120262, "grad_norm": 0.547218382358551, "learning_rate": 9.698136410020337e-06, "loss": 0.05000966787338257, "memory(GiB)": 21.32, "step": 4791, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.954035 }, { "epoch": 0.15567033752395804, "grad_norm": 0.580767035484314, "learning_rate": 9.697952569386272e-06, "loss": 0.046898357570171356, "memory(GiB)": 21.32, "step": 4792, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954063 }, { "epoch": 0.15570282298671345, "grad_norm": 0.643735408782959, "learning_rate": 9.697768674531582e-06, "loss": 0.06306441128253937, "memory(GiB)": 21.32, "step": 4793, "token_acc": 0.984375, "train_speed(iter/s)": 0.954102 }, { "epoch": 0.15573530844946887, "grad_norm": 0.49995726346969604, "learning_rate": 9.69758472545839e-06, "loss": 0.05648227408528328, "memory(GiB)": 21.32, "step": 4794, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.954119 }, { "epoch": 0.15576779391222428, "grad_norm": 0.4725765883922577, "learning_rate": 9.697400722168817e-06, "loss": 0.048959117382764816, "memory(GiB)": 21.32, "step": 4795, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954121 }, { "epoch": 0.1558002793749797, "grad_norm": 0.5744512677192688, "learning_rate": 9.697216664664988e-06, "loss": 0.05087071657180786, "memory(GiB)": 21.32, "step": 4796, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.954104 }, { "epoch": 0.15583276483773512, "grad_norm": 0.5111412405967712, "learning_rate": 9.697032552949027e-06, "loss": 0.053359925746917725, "memory(GiB)": 21.32, "step": 4797, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.954144 }, { "epoch": 0.15586525030049053, "grad_norm": 0.49398937821388245, "learning_rate": 9.696848387023057e-06, "loss": 0.044581249356269836, "memory(GiB)": 21.32, "step": 4798, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.954183 }, { "epoch": 0.15589773576324595, "grad_norm": 1.564141035079956, "learning_rate": 9.696664166889206e-06, "loss": 0.05986044555902481, "memory(GiB)": 21.32, "step": 4799, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.954133 }, { "epoch": 0.15593022122600136, "grad_norm": 0.583346962928772, "learning_rate": 9.696479892549598e-06, "loss": 0.055254947394132614, "memory(GiB)": 21.32, "step": 4800, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.954058 }, { "epoch": 0.15596270668875678, "grad_norm": 0.6358925104141235, "learning_rate": 9.696295564006363e-06, "loss": 0.05058299005031586, "memory(GiB)": 21.32, "step": 4801, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.954044 }, { "epoch": 0.1559951921515122, "grad_norm": 1.6590524911880493, "learning_rate": 9.696111181261626e-06, "loss": 0.03926141560077667, "memory(GiB)": 21.32, "step": 4802, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.954044 }, { "epoch": 0.1560276776142676, "grad_norm": 0.7190966606140137, "learning_rate": 9.695926744317515e-06, "loss": 0.06998658180236816, "memory(GiB)": 21.32, "step": 4803, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.954081 }, { "epoch": 0.15606016307702303, "grad_norm": 0.5071934461593628, "learning_rate": 9.695742253176158e-06, "loss": 0.04793476313352585, "memory(GiB)": 21.32, "step": 4804, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.954116 }, { "epoch": 0.15609264853977844, "grad_norm": 0.5049934387207031, "learning_rate": 9.695557707839687e-06, "loss": 0.04931352287530899, "memory(GiB)": 21.32, "step": 4805, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.954143 }, { "epoch": 0.15612513400253386, "grad_norm": 0.5619686841964722, "learning_rate": 9.69537310831023e-06, "loss": 0.05287661403417587, "memory(GiB)": 21.32, "step": 4806, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954171 }, { "epoch": 0.15615761946528928, "grad_norm": 0.72574383020401, "learning_rate": 9.695188454589918e-06, "loss": 0.050960052758455276, "memory(GiB)": 21.32, "step": 4807, "token_acc": 0.9875, "train_speed(iter/s)": 0.954185 }, { "epoch": 0.1561901049280447, "grad_norm": 0.5445675849914551, "learning_rate": 9.695003746680883e-06, "loss": 0.051872000098228455, "memory(GiB)": 21.32, "step": 4808, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.954189 }, { "epoch": 0.1562225903908001, "grad_norm": 0.6622538566589355, "learning_rate": 9.694818984585253e-06, "loss": 0.05078553035855293, "memory(GiB)": 21.32, "step": 4809, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.954145 }, { "epoch": 0.15625507585355553, "grad_norm": 0.7227534651756287, "learning_rate": 9.694634168305164e-06, "loss": 0.07114491611719131, "memory(GiB)": 21.32, "step": 4810, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.954175 }, { "epoch": 0.15628756131631094, "grad_norm": 0.5420400500297546, "learning_rate": 9.69444929784275e-06, "loss": 0.0451253205537796, "memory(GiB)": 21.32, "step": 4811, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954202 }, { "epoch": 0.15632004677906636, "grad_norm": 0.4658185839653015, "learning_rate": 9.69426437320014e-06, "loss": 0.04479184374213219, "memory(GiB)": 21.32, "step": 4812, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954234 }, { "epoch": 0.15635253224182177, "grad_norm": 0.5416107773780823, "learning_rate": 9.694079394379474e-06, "loss": 0.054628387093544006, "memory(GiB)": 21.32, "step": 4813, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954263 }, { "epoch": 0.1563850177045772, "grad_norm": 0.487845778465271, "learning_rate": 9.69389436138288e-06, "loss": 0.0527103953063488, "memory(GiB)": 21.32, "step": 4814, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95424 }, { "epoch": 0.1564175031673326, "grad_norm": 0.6715942025184631, "learning_rate": 9.693709274212502e-06, "loss": 0.04716915637254715, "memory(GiB)": 21.32, "step": 4815, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.954248 }, { "epoch": 0.15644998863008805, "grad_norm": 0.5321464538574219, "learning_rate": 9.693524132870469e-06, "loss": 0.04481664299964905, "memory(GiB)": 21.32, "step": 4816, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.954221 }, { "epoch": 0.15648247409284347, "grad_norm": 0.6000412702560425, "learning_rate": 9.693338937358921e-06, "loss": 0.06335508078336716, "memory(GiB)": 21.32, "step": 4817, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.15651495955559888, "grad_norm": 1.2325419187545776, "learning_rate": 9.693153687679996e-06, "loss": 0.04843907058238983, "memory(GiB)": 21.32, "step": 4818, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.954281 }, { "epoch": 0.1565474450183543, "grad_norm": 1.210487961769104, "learning_rate": 9.692968383835829e-06, "loss": 0.055530354380607605, "memory(GiB)": 21.32, "step": 4819, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.95431 }, { "epoch": 0.1565799304811097, "grad_norm": 0.6077627539634705, "learning_rate": 9.692783025828562e-06, "loss": 0.048587508499622345, "memory(GiB)": 21.32, "step": 4820, "token_acc": 0.98, "train_speed(iter/s)": 0.954289 }, { "epoch": 0.15661241594386513, "grad_norm": 0.6556729674339294, "learning_rate": 9.692597613660332e-06, "loss": 0.05386623367667198, "memory(GiB)": 21.32, "step": 4821, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954226 }, { "epoch": 0.15664490140662055, "grad_norm": 1.0815811157226562, "learning_rate": 9.692412147333279e-06, "loss": 0.06441518664360046, "memory(GiB)": 21.32, "step": 4822, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.954194 }, { "epoch": 0.15667738686937596, "grad_norm": 0.5854069590568542, "learning_rate": 9.692226626849546e-06, "loss": 0.04878479987382889, "memory(GiB)": 21.32, "step": 4823, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.954177 }, { "epoch": 0.15670987233213138, "grad_norm": 0.622368335723877, "learning_rate": 9.692041052211272e-06, "loss": 0.055822450667619705, "memory(GiB)": 21.32, "step": 4824, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.954207 }, { "epoch": 0.1567423577948868, "grad_norm": 0.5781641006469727, "learning_rate": 9.691855423420598e-06, "loss": 0.05573990195989609, "memory(GiB)": 21.32, "step": 4825, "token_acc": 0.9601990049751243, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.1567748432576422, "grad_norm": 0.5828713774681091, "learning_rate": 9.691669740479668e-06, "loss": 0.05044978857040405, "memory(GiB)": 21.32, "step": 4826, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.954263 }, { "epoch": 0.15680732872039763, "grad_norm": 0.4715459942817688, "learning_rate": 9.691484003390625e-06, "loss": 0.05017800256609917, "memory(GiB)": 21.32, "step": 4827, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954285 }, { "epoch": 0.15683981418315304, "grad_norm": 0.624207079410553, "learning_rate": 9.691298212155612e-06, "loss": 0.052769724279642105, "memory(GiB)": 21.32, "step": 4828, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.954294 }, { "epoch": 0.15687229964590846, "grad_norm": 0.5535358190536499, "learning_rate": 9.691112366776773e-06, "loss": 0.04932231828570366, "memory(GiB)": 21.32, "step": 4829, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954326 }, { "epoch": 0.15690478510866387, "grad_norm": 0.6692954301834106, "learning_rate": 9.690926467256254e-06, "loss": 0.04510130733251572, "memory(GiB)": 21.32, "step": 4830, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.954357 }, { "epoch": 0.1569372705714193, "grad_norm": 0.9870562553405762, "learning_rate": 9.690740513596201e-06, "loss": 0.06704402714967728, "memory(GiB)": 21.32, "step": 4831, "token_acc": 0.972, "train_speed(iter/s)": 0.954371 }, { "epoch": 0.1569697560341747, "grad_norm": 0.472750723361969, "learning_rate": 9.690554505798758e-06, "loss": 0.057320382446050644, "memory(GiB)": 21.32, "step": 4832, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954399 }, { "epoch": 0.15700224149693012, "grad_norm": 0.5932995676994324, "learning_rate": 9.690368443866071e-06, "loss": 0.04365364834666252, "memory(GiB)": 21.32, "step": 4833, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.954385 }, { "epoch": 0.15703472695968554, "grad_norm": 0.9654434323310852, "learning_rate": 9.690182327800292e-06, "loss": 0.061863526701927185, "memory(GiB)": 21.32, "step": 4834, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954415 }, { "epoch": 0.15706721242244095, "grad_norm": 0.9628705978393555, "learning_rate": 9.689996157603566e-06, "loss": 0.05859149992465973, "memory(GiB)": 21.32, "step": 4835, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.954429 }, { "epoch": 0.15709969788519637, "grad_norm": 0.8577505946159363, "learning_rate": 9.689809933278043e-06, "loss": 0.04798199608922005, "memory(GiB)": 21.32, "step": 4836, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.1571321833479518, "grad_norm": 0.5761193633079529, "learning_rate": 9.689623654825868e-06, "loss": 0.04577130079269409, "memory(GiB)": 21.32, "step": 4837, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.954461 }, { "epoch": 0.1571646688107072, "grad_norm": 0.7665113806724548, "learning_rate": 9.689437322249196e-06, "loss": 0.06548866629600525, "memory(GiB)": 21.32, "step": 4838, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.954499 }, { "epoch": 0.15719715427346262, "grad_norm": 0.767124354839325, "learning_rate": 9.689250935550176e-06, "loss": 0.0527096651494503, "memory(GiB)": 21.32, "step": 4839, "token_acc": 0.995, "train_speed(iter/s)": 0.954539 }, { "epoch": 0.15722963973621804, "grad_norm": 0.6718502044677734, "learning_rate": 9.689064494730958e-06, "loss": 0.06450101733207703, "memory(GiB)": 21.32, "step": 4840, "token_acc": 0.9770992366412213, "train_speed(iter/s)": 0.954561 }, { "epoch": 0.15726212519897345, "grad_norm": 0.49786844849586487, "learning_rate": 9.688877999793696e-06, "loss": 0.049770455807447433, "memory(GiB)": 21.32, "step": 4841, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.954573 }, { "epoch": 0.15729461066172887, "grad_norm": 0.7023258209228516, "learning_rate": 9.688691450740539e-06, "loss": 0.050674691796302795, "memory(GiB)": 21.32, "step": 4842, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.95461 }, { "epoch": 0.15732709612448428, "grad_norm": 0.5273648500442505, "learning_rate": 9.688504847573644e-06, "loss": 0.048971451818943024, "memory(GiB)": 21.32, "step": 4843, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954627 }, { "epoch": 0.1573595815872397, "grad_norm": 0.6389950513839722, "learning_rate": 9.688318190295162e-06, "loss": 0.044316262006759644, "memory(GiB)": 21.32, "step": 4844, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954653 }, { "epoch": 0.15739206704999512, "grad_norm": 0.5901270508766174, "learning_rate": 9.688131478907247e-06, "loss": 0.05906029790639877, "memory(GiB)": 21.32, "step": 4845, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.15742455251275053, "grad_norm": 1.025711178779602, "learning_rate": 9.687944713412056e-06, "loss": 0.05861017107963562, "memory(GiB)": 21.32, "step": 4846, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.954732 }, { "epoch": 0.15745703797550595, "grad_norm": 1.3288239240646362, "learning_rate": 9.687757893811742e-06, "loss": 0.05503934621810913, "memory(GiB)": 21.32, "step": 4847, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.1574895234382614, "grad_norm": 0.8238200545310974, "learning_rate": 9.687571020108463e-06, "loss": 0.045876745134592056, "memory(GiB)": 21.32, "step": 4848, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.1575220089010168, "grad_norm": 0.8537881374359131, "learning_rate": 9.687384092304378e-06, "loss": 0.057070955634117126, "memory(GiB)": 21.32, "step": 4849, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.15755449436377222, "grad_norm": 0.6636582016944885, "learning_rate": 9.687197110401639e-06, "loss": 0.05122967064380646, "memory(GiB)": 21.32, "step": 4850, "token_acc": 1.0, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.15758697982652764, "grad_norm": 0.6042117476463318, "learning_rate": 9.687010074402406e-06, "loss": 0.04851771891117096, "memory(GiB)": 21.32, "step": 4851, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.954874 }, { "epoch": 0.15761946528928306, "grad_norm": 0.5357813239097595, "learning_rate": 9.68682298430884e-06, "loss": 0.04952127858996391, "memory(GiB)": 21.32, "step": 4852, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.954913 }, { "epoch": 0.15765195075203847, "grad_norm": 0.533920168876648, "learning_rate": 9.686635840123097e-06, "loss": 0.06307493150234222, "memory(GiB)": 21.32, "step": 4853, "token_acc": 0.9817073170731707, "train_speed(iter/s)": 0.95495 }, { "epoch": 0.1576844362147939, "grad_norm": 0.5933316349983215, "learning_rate": 9.686448641847341e-06, "loss": 0.05314646661281586, "memory(GiB)": 21.32, "step": 4854, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.1577169216775493, "grad_norm": 0.6192899346351624, "learning_rate": 9.686261389483729e-06, "loss": 0.054823312908411026, "memory(GiB)": 21.32, "step": 4855, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.955024 }, { "epoch": 0.15774940714030472, "grad_norm": 0.47685396671295166, "learning_rate": 9.686074083034422e-06, "loss": 0.0566214919090271, "memory(GiB)": 21.32, "step": 4856, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.955062 }, { "epoch": 0.15778189260306014, "grad_norm": 0.5347495079040527, "learning_rate": 9.685886722501582e-06, "loss": 0.05088813602924347, "memory(GiB)": 21.32, "step": 4857, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.9551 }, { "epoch": 0.15781437806581555, "grad_norm": 0.6399091482162476, "learning_rate": 9.685699307887376e-06, "loss": 0.06424366682767868, "memory(GiB)": 21.32, "step": 4858, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.955134 }, { "epoch": 0.15784686352857097, "grad_norm": 0.5674922466278076, "learning_rate": 9.685511839193961e-06, "loss": 0.04797423630952835, "memory(GiB)": 21.32, "step": 4859, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.15787934899132638, "grad_norm": 0.5764384269714355, "learning_rate": 9.685324316423504e-06, "loss": 0.0663754791021347, "memory(GiB)": 21.32, "step": 4860, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.1579118344540818, "grad_norm": 0.40186136960983276, "learning_rate": 9.685136739578166e-06, "loss": 0.053593095391988754, "memory(GiB)": 21.32, "step": 4861, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.955226 }, { "epoch": 0.15794431991683722, "grad_norm": 7.491494178771973, "learning_rate": 9.684949108660117e-06, "loss": 0.046526141464710236, "memory(GiB)": 21.32, "step": 4862, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955258 }, { "epoch": 0.15797680537959263, "grad_norm": 0.7312510013580322, "learning_rate": 9.684761423671519e-06, "loss": 0.0524740144610405, "memory(GiB)": 21.32, "step": 4863, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.955288 }, { "epoch": 0.15800929084234805, "grad_norm": 0.557911217212677, "learning_rate": 9.684573684614537e-06, "loss": 0.053778208792209625, "memory(GiB)": 21.32, "step": 4864, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.955317 }, { "epoch": 0.15804177630510347, "grad_norm": 0.711979329586029, "learning_rate": 9.68438589149134e-06, "loss": 0.06467388570308685, "memory(GiB)": 21.32, "step": 4865, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.15807426176785888, "grad_norm": 0.6357980966567993, "learning_rate": 9.684198044304096e-06, "loss": 0.055440355092287064, "memory(GiB)": 21.32, "step": 4866, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.1581067472306143, "grad_norm": 0.8023667335510254, "learning_rate": 9.68401014305497e-06, "loss": 0.06077846139669418, "memory(GiB)": 21.32, "step": 4867, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.1581392326933697, "grad_norm": 1.6288121938705444, "learning_rate": 9.683822187746134e-06, "loss": 0.05014022812247276, "memory(GiB)": 21.32, "step": 4868, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.955442 }, { "epoch": 0.15817171815612513, "grad_norm": 0.7207190990447998, "learning_rate": 9.683634178379758e-06, "loss": 0.055377811193466187, "memory(GiB)": 21.32, "step": 4869, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.15820420361888055, "grad_norm": 0.9148514866828918, "learning_rate": 9.683446114958006e-06, "loss": 0.06528735160827637, "memory(GiB)": 21.32, "step": 4870, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.955501 }, { "epoch": 0.15823668908163596, "grad_norm": 0.6196292638778687, "learning_rate": 9.683257997483055e-06, "loss": 0.051776379346847534, "memory(GiB)": 21.32, "step": 4871, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.955533 }, { "epoch": 0.15826917454439138, "grad_norm": 0.5643796324729919, "learning_rate": 9.68306982595707e-06, "loss": 0.05858234316110611, "memory(GiB)": 21.32, "step": 4872, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.955561 }, { "epoch": 0.1583016600071468, "grad_norm": 0.5060351490974426, "learning_rate": 9.68288160038223e-06, "loss": 0.04989486560225487, "memory(GiB)": 21.32, "step": 4873, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.1583341454699022, "grad_norm": 1.9024633169174194, "learning_rate": 9.682693320760702e-06, "loss": 0.061384417116642, "memory(GiB)": 21.32, "step": 4874, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.15836663093265763, "grad_norm": 0.7221800684928894, "learning_rate": 9.682504987094658e-06, "loss": 0.05961574614048004, "memory(GiB)": 21.32, "step": 4875, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95565 }, { "epoch": 0.15839911639541304, "grad_norm": 0.7751821279525757, "learning_rate": 9.682316599386276e-06, "loss": 0.0531039834022522, "memory(GiB)": 21.32, "step": 4876, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.15843160185816846, "grad_norm": 0.5285816788673401, "learning_rate": 9.682128157637729e-06, "loss": 0.05026647076010704, "memory(GiB)": 21.32, "step": 4877, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.15846408732092387, "grad_norm": 0.6759269833564758, "learning_rate": 9.68193966185119e-06, "loss": 0.05253922566771507, "memory(GiB)": 21.32, "step": 4878, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.1584965727836793, "grad_norm": 0.5885172486305237, "learning_rate": 9.681751112028837e-06, "loss": 0.05466775596141815, "memory(GiB)": 21.32, "step": 4879, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.15852905824643473, "grad_norm": 0.501334011554718, "learning_rate": 9.681562508172843e-06, "loss": 0.060912638902664185, "memory(GiB)": 21.32, "step": 4880, "token_acc": 0.95703125, "train_speed(iter/s)": 0.955796 }, { "epoch": 0.15856154370919015, "grad_norm": 0.43226760625839233, "learning_rate": 9.681373850285387e-06, "loss": 0.04174322634935379, "memory(GiB)": 21.32, "step": 4881, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.15859402917194557, "grad_norm": 0.42541953921318054, "learning_rate": 9.681185138368646e-06, "loss": 0.04787624254822731, "memory(GiB)": 21.32, "step": 4882, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.15862651463470098, "grad_norm": 0.431246817111969, "learning_rate": 9.680996372424798e-06, "loss": 0.047491200268268585, "memory(GiB)": 21.32, "step": 4883, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.1586590000974564, "grad_norm": 0.645620584487915, "learning_rate": 9.680807552456021e-06, "loss": 0.04302207753062248, "memory(GiB)": 21.32, "step": 4884, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.15869148556021181, "grad_norm": 0.43783360719680786, "learning_rate": 9.680618678464496e-06, "loss": 0.04434419423341751, "memory(GiB)": 21.32, "step": 4885, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955946 }, { "epoch": 0.15872397102296723, "grad_norm": 0.6614811420440674, "learning_rate": 9.6804297504524e-06, "loss": 0.0469856783747673, "memory(GiB)": 21.32, "step": 4886, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.15875645648572265, "grad_norm": 0.8600104451179504, "learning_rate": 9.680240768421915e-06, "loss": 0.06583763659000397, "memory(GiB)": 21.32, "step": 4887, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.15878894194847806, "grad_norm": 0.6162987947463989, "learning_rate": 9.680051732375223e-06, "loss": 0.04960402101278305, "memory(GiB)": 21.32, "step": 4888, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.956036 }, { "epoch": 0.15882142741123348, "grad_norm": 0.5434457659721375, "learning_rate": 9.679862642314506e-06, "loss": 0.045063965022563934, "memory(GiB)": 21.32, "step": 4889, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.1588539128739889, "grad_norm": 0.5668768882751465, "learning_rate": 9.679673498241943e-06, "loss": 0.05459994077682495, "memory(GiB)": 21.32, "step": 4890, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.1588863983367443, "grad_norm": 0.5167062282562256, "learning_rate": 9.679484300159722e-06, "loss": 0.04862864315509796, "memory(GiB)": 21.32, "step": 4891, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.15891888379949973, "grad_norm": 0.5038713812828064, "learning_rate": 9.679295048070021e-06, "loss": 0.050102349370718, "memory(GiB)": 21.32, "step": 4892, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.15895136926225514, "grad_norm": 0.49151599407196045, "learning_rate": 9.679105741975028e-06, "loss": 0.04269074648618698, "memory(GiB)": 21.32, "step": 4893, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.15898385472501056, "grad_norm": 0.4622083008289337, "learning_rate": 9.678916381876929e-06, "loss": 0.042935144156217575, "memory(GiB)": 21.32, "step": 4894, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.15901634018776598, "grad_norm": 0.7788839936256409, "learning_rate": 9.678726967777904e-06, "loss": 0.0478515587747097, "memory(GiB)": 21.32, "step": 4895, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.956245 }, { "epoch": 0.1590488256505214, "grad_norm": 0.4936582148075104, "learning_rate": 9.678537499680143e-06, "loss": 0.042841456830501556, "memory(GiB)": 21.32, "step": 4896, "token_acc": 0.9792746113989638, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.1590813111132768, "grad_norm": 0.7209219336509705, "learning_rate": 9.678347977585834e-06, "loss": 0.055982556194067, "memory(GiB)": 21.32, "step": 4897, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956307 }, { "epoch": 0.15911379657603222, "grad_norm": 0.8251437544822693, "learning_rate": 9.678158401497161e-06, "loss": 0.04372759163379669, "memory(GiB)": 21.32, "step": 4898, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.15914628203878764, "grad_norm": 1.1905169486999512, "learning_rate": 9.677968771416312e-06, "loss": 0.05354965850710869, "memory(GiB)": 21.32, "step": 4899, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.956383 }, { "epoch": 0.15917876750154306, "grad_norm": 0.6594751477241516, "learning_rate": 9.67777908734548e-06, "loss": 0.05815383791923523, "memory(GiB)": 21.32, "step": 4900, "token_acc": 0.9732441471571907, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.15921125296429847, "grad_norm": 0.8377710580825806, "learning_rate": 9.67758934928685e-06, "loss": 0.05455920100212097, "memory(GiB)": 21.32, "step": 4901, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.1592437384270539, "grad_norm": 0.6970252990722656, "learning_rate": 9.677399557242612e-06, "loss": 0.051527105271816254, "memory(GiB)": 21.32, "step": 4902, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.1592762238898093, "grad_norm": 0.770925760269165, "learning_rate": 9.677209711214959e-06, "loss": 0.05753634124994278, "memory(GiB)": 21.32, "step": 4903, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.15930870935256472, "grad_norm": 0.6167296767234802, "learning_rate": 9.67701981120608e-06, "loss": 0.0548708513379097, "memory(GiB)": 21.32, "step": 4904, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.15934119481532014, "grad_norm": 0.830176830291748, "learning_rate": 9.676829857218169e-06, "loss": 0.06379415094852448, "memory(GiB)": 21.32, "step": 4905, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.15937368027807555, "grad_norm": 0.7487866282463074, "learning_rate": 9.676639849253413e-06, "loss": 0.07616949081420898, "memory(GiB)": 21.32, "step": 4906, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.956641 }, { "epoch": 0.15940616574083097, "grad_norm": 0.6201157569885254, "learning_rate": 9.676449787314011e-06, "loss": 0.04917033389210701, "memory(GiB)": 21.32, "step": 4907, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956681 }, { "epoch": 0.15943865120358638, "grad_norm": 0.5050973892211914, "learning_rate": 9.676259671402153e-06, "loss": 0.043894559144973755, "memory(GiB)": 21.32, "step": 4908, "token_acc": 0.9744525547445255, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.1594711366663418, "grad_norm": 0.4735868275165558, "learning_rate": 9.676069501520035e-06, "loss": 0.052346520125865936, "memory(GiB)": 21.32, "step": 4909, "token_acc": 0.9658703071672355, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.15950362212909722, "grad_norm": 0.8176990151405334, "learning_rate": 9.675879277669852e-06, "loss": 0.05880395695567131, "memory(GiB)": 21.32, "step": 4910, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.15953610759185263, "grad_norm": 0.8526270389556885, "learning_rate": 9.675688999853796e-06, "loss": 0.05994991958141327, "memory(GiB)": 21.32, "step": 4911, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.15956859305460808, "grad_norm": 0.7454816699028015, "learning_rate": 9.675498668074068e-06, "loss": 0.06583397835493088, "memory(GiB)": 21.32, "step": 4912, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.1596010785173635, "grad_norm": 0.4767856001853943, "learning_rate": 9.675308282332863e-06, "loss": 0.05377686768770218, "memory(GiB)": 21.32, "step": 4913, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956905 }, { "epoch": 0.1596335639801189, "grad_norm": 0.523222804069519, "learning_rate": 9.675117842632375e-06, "loss": 0.03756474331021309, "memory(GiB)": 21.32, "step": 4914, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956941 }, { "epoch": 0.15966604944287432, "grad_norm": 0.6254949569702148, "learning_rate": 9.674927348974804e-06, "loss": 0.05631755292415619, "memory(GiB)": 21.32, "step": 4915, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.15969853490562974, "grad_norm": 0.3853067457675934, "learning_rate": 9.674736801362353e-06, "loss": 0.05375014990568161, "memory(GiB)": 21.32, "step": 4916, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.15973102036838516, "grad_norm": 0.5144400596618652, "learning_rate": 9.674546199797215e-06, "loss": 0.04725659266114235, "memory(GiB)": 21.32, "step": 4917, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.957032 }, { "epoch": 0.15976350583114057, "grad_norm": 0.8807289600372314, "learning_rate": 9.674355544281592e-06, "loss": 0.0649023950099945, "memory(GiB)": 21.32, "step": 4918, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.159795991293896, "grad_norm": 0.6129546761512756, "learning_rate": 9.674164834817685e-06, "loss": 0.05251114070415497, "memory(GiB)": 21.32, "step": 4919, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.1598284767566514, "grad_norm": 0.5330528020858765, "learning_rate": 9.673974071407693e-06, "loss": 0.04918317496776581, "memory(GiB)": 21.32, "step": 4920, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.95712 }, { "epoch": 0.15986096221940682, "grad_norm": 0.45490762591362, "learning_rate": 9.67378325405382e-06, "loss": 0.04972538352012634, "memory(GiB)": 21.32, "step": 4921, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.957152 }, { "epoch": 0.15989344768216224, "grad_norm": 0.7430524230003357, "learning_rate": 9.673592382758268e-06, "loss": 0.05588570609688759, "memory(GiB)": 21.32, "step": 4922, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.957182 }, { "epoch": 0.15992593314491765, "grad_norm": 0.4993874728679657, "learning_rate": 9.67340145752324e-06, "loss": 0.047682829201221466, "memory(GiB)": 21.32, "step": 4923, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.15995841860767307, "grad_norm": 0.7436881065368652, "learning_rate": 9.673210478350937e-06, "loss": 0.05722877383232117, "memory(GiB)": 21.32, "step": 4924, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.15999090407042849, "grad_norm": 0.3849920928478241, "learning_rate": 9.673019445243566e-06, "loss": 0.03842185065150261, "memory(GiB)": 21.32, "step": 4925, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.1600233895331839, "grad_norm": 0.43661803007125854, "learning_rate": 9.67282835820333e-06, "loss": 0.03842293471097946, "memory(GiB)": 21.32, "step": 4926, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.16005587499593932, "grad_norm": 1.2949306964874268, "learning_rate": 9.672637217232437e-06, "loss": 0.06151188164949417, "memory(GiB)": 21.32, "step": 4927, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.16008836045869473, "grad_norm": 1.9609200954437256, "learning_rate": 9.672446022333089e-06, "loss": 0.05067518353462219, "memory(GiB)": 21.32, "step": 4928, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.957358 }, { "epoch": 0.16012084592145015, "grad_norm": 0.5518316626548767, "learning_rate": 9.672254773507497e-06, "loss": 0.04661712422966957, "memory(GiB)": 21.32, "step": 4929, "token_acc": 0.984, "train_speed(iter/s)": 0.957387 }, { "epoch": 0.16015333138420557, "grad_norm": 0.48441869020462036, "learning_rate": 9.672063470757865e-06, "loss": 0.05471116304397583, "memory(GiB)": 21.32, "step": 4930, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.16018581684696098, "grad_norm": 0.5001062154769897, "learning_rate": 9.671872114086403e-06, "loss": 0.050955966114997864, "memory(GiB)": 21.32, "step": 4931, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.95745 }, { "epoch": 0.1602183023097164, "grad_norm": 0.7535614371299744, "learning_rate": 9.671680703495318e-06, "loss": 0.05444319546222687, "memory(GiB)": 21.32, "step": 4932, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.16025078777247181, "grad_norm": 0.5032421946525574, "learning_rate": 9.67148923898682e-06, "loss": 0.05130693316459656, "memory(GiB)": 21.32, "step": 4933, "token_acc": 0.988, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.16028327323522723, "grad_norm": 0.4565315842628479, "learning_rate": 9.671297720563117e-06, "loss": 0.04614730924367905, "memory(GiB)": 21.32, "step": 4934, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.957553 }, { "epoch": 0.16031575869798265, "grad_norm": 8.425970077514648, "learning_rate": 9.671106148226422e-06, "loss": 0.06711971759796143, "memory(GiB)": 21.32, "step": 4935, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.16034824416073806, "grad_norm": 0.6700245141983032, "learning_rate": 9.670914521978945e-06, "loss": 0.05645805597305298, "memory(GiB)": 21.32, "step": 4936, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.16038072962349348, "grad_norm": 0.571328341960907, "learning_rate": 9.670722841822897e-06, "loss": 0.04421965032815933, "memory(GiB)": 21.32, "step": 4937, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.1604132150862489, "grad_norm": 0.5907523036003113, "learning_rate": 9.67053110776049e-06, "loss": 0.06056300178170204, "memory(GiB)": 21.32, "step": 4938, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.1604457005490043, "grad_norm": 0.5452256798744202, "learning_rate": 9.67033931979394e-06, "loss": 0.04243545979261398, "memory(GiB)": 21.32, "step": 4939, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.957708 }, { "epoch": 0.16047818601175973, "grad_norm": 0.4943418800830841, "learning_rate": 9.670147477925455e-06, "loss": 0.04848215728998184, "memory(GiB)": 21.32, "step": 4940, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.16051067147451514, "grad_norm": 1.898698329925537, "learning_rate": 9.669955582157253e-06, "loss": 0.048732757568359375, "memory(GiB)": 21.32, "step": 4941, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.16054315693727056, "grad_norm": 0.5175462961196899, "learning_rate": 9.669763632491549e-06, "loss": 0.055908747017383575, "memory(GiB)": 21.32, "step": 4942, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.16057564240002598, "grad_norm": 1.9173709154129028, "learning_rate": 9.669571628930556e-06, "loss": 0.054720908403396606, "memory(GiB)": 21.32, "step": 4943, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.16060812786278142, "grad_norm": 0.777847945690155, "learning_rate": 9.669379571476492e-06, "loss": 0.056523215025663376, "memory(GiB)": 21.32, "step": 4944, "token_acc": 0.9869281045751634, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.16064061332553684, "grad_norm": 0.5717207193374634, "learning_rate": 9.669187460131573e-06, "loss": 0.04224050045013428, "memory(GiB)": 21.32, "step": 4945, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957878 }, { "epoch": 0.16067309878829225, "grad_norm": 0.6582722663879395, "learning_rate": 9.668995294898015e-06, "loss": 0.05297023430466652, "memory(GiB)": 21.32, "step": 4946, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.957905 }, { "epoch": 0.16070558425104767, "grad_norm": 1.0144636631011963, "learning_rate": 9.668803075778039e-06, "loss": 0.047555673867464066, "memory(GiB)": 21.32, "step": 4947, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957934 }, { "epoch": 0.16073806971380308, "grad_norm": 0.6600848436355591, "learning_rate": 9.66861080277386e-06, "loss": 0.06385411322116852, "memory(GiB)": 21.32, "step": 4948, "token_acc": 0.9797297297297297, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.1607705551765585, "grad_norm": 0.47983837127685547, "learning_rate": 9.668418475887697e-06, "loss": 0.047981902956962585, "memory(GiB)": 21.32, "step": 4949, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957991 }, { "epoch": 0.16080304063931392, "grad_norm": 0.4621768593788147, "learning_rate": 9.668226095121774e-06, "loss": 0.05389166623353958, "memory(GiB)": 21.32, "step": 4950, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.16083552610206933, "grad_norm": 2.5734548568725586, "learning_rate": 9.668033660478306e-06, "loss": 0.06929698586463928, "memory(GiB)": 21.32, "step": 4951, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.16086801156482475, "grad_norm": 0.8082602024078369, "learning_rate": 9.667841171959518e-06, "loss": 0.059807226061820984, "memory(GiB)": 21.32, "step": 4952, "token_acc": 0.9520958083832335, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.16090049702758016, "grad_norm": 0.8390992283821106, "learning_rate": 9.66764862956763e-06, "loss": 0.054242320358753204, "memory(GiB)": 21.32, "step": 4953, "token_acc": 0.976, "train_speed(iter/s)": 0.958098 }, { "epoch": 0.16093298249033558, "grad_norm": 0.7777469158172607, "learning_rate": 9.667456033304863e-06, "loss": 0.06660536676645279, "memory(GiB)": 21.32, "step": 4954, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.160965467953091, "grad_norm": 0.4971384108066559, "learning_rate": 9.667263383173441e-06, "loss": 0.051472850143909454, "memory(GiB)": 21.32, "step": 4955, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.958154 }, { "epoch": 0.1609979534158464, "grad_norm": 0.9729949831962585, "learning_rate": 9.667070679175587e-06, "loss": 0.07255126535892487, "memory(GiB)": 21.32, "step": 4956, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.16103043887860183, "grad_norm": 0.7147031426429749, "learning_rate": 9.666877921313528e-06, "loss": 0.048207055777311325, "memory(GiB)": 21.32, "step": 4957, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.16106292434135724, "grad_norm": 0.663605272769928, "learning_rate": 9.666685109589484e-06, "loss": 0.04802375286817551, "memory(GiB)": 21.32, "step": 4958, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.958247 }, { "epoch": 0.16109540980411266, "grad_norm": 0.6728910803794861, "learning_rate": 9.666492244005684e-06, "loss": 0.04513943940401077, "memory(GiB)": 21.32, "step": 4959, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.16112789526686808, "grad_norm": 0.6512954235076904, "learning_rate": 9.666299324564352e-06, "loss": 0.05039369314908981, "memory(GiB)": 21.32, "step": 4960, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.1611603807296235, "grad_norm": 0.48447567224502563, "learning_rate": 9.666106351267714e-06, "loss": 0.05821796506643295, "memory(GiB)": 21.32, "step": 4961, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.958353 }, { "epoch": 0.1611928661923789, "grad_norm": 1.0161057710647583, "learning_rate": 9.665913324118e-06, "loss": 0.05304481089115143, "memory(GiB)": 21.32, "step": 4962, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.958391 }, { "epoch": 0.16122535165513432, "grad_norm": 0.5965996980667114, "learning_rate": 9.665720243117434e-06, "loss": 0.06008416414260864, "memory(GiB)": 21.32, "step": 4963, "token_acc": 0.957983193277311, "train_speed(iter/s)": 0.958427 }, { "epoch": 0.16125783711788974, "grad_norm": 0.8238949775695801, "learning_rate": 9.665527108268247e-06, "loss": 0.05966504290699959, "memory(GiB)": 21.32, "step": 4964, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.958465 }, { "epoch": 0.16129032258064516, "grad_norm": 1.2579320669174194, "learning_rate": 9.665333919572667e-06, "loss": 0.07051052153110504, "memory(GiB)": 21.32, "step": 4965, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.958502 }, { "epoch": 0.16132280804340057, "grad_norm": 0.7714832425117493, "learning_rate": 9.665140677032924e-06, "loss": 0.05917072296142578, "memory(GiB)": 21.32, "step": 4966, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.161355293506156, "grad_norm": 0.4804989993572235, "learning_rate": 9.664947380651248e-06, "loss": 0.05127808451652527, "memory(GiB)": 21.32, "step": 4967, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.958577 }, { "epoch": 0.1613877789689114, "grad_norm": 0.4657905399799347, "learning_rate": 9.664754030429872e-06, "loss": 0.05844596400856972, "memory(GiB)": 21.32, "step": 4968, "token_acc": 0.9863945578231292, "train_speed(iter/s)": 0.958614 }, { "epoch": 0.16142026443166682, "grad_norm": 0.6045503616333008, "learning_rate": 9.664560626371025e-06, "loss": 0.054556164890527725, "memory(GiB)": 21.32, "step": 4969, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.958651 }, { "epoch": 0.16145274989442224, "grad_norm": 0.5636172890663147, "learning_rate": 9.664367168476938e-06, "loss": 0.05154181271791458, "memory(GiB)": 21.32, "step": 4970, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.958681 }, { "epoch": 0.16148523535717765, "grad_norm": 0.5196545124053955, "learning_rate": 9.664173656749846e-06, "loss": 0.042122192680835724, "memory(GiB)": 21.32, "step": 4971, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.958705 }, { "epoch": 0.16151772081993307, "grad_norm": 0.6552345156669617, "learning_rate": 9.663980091191985e-06, "loss": 0.05140606313943863, "memory(GiB)": 21.32, "step": 4972, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.958736 }, { "epoch": 0.16155020628268849, "grad_norm": 0.43395885825157166, "learning_rate": 9.663786471805584e-06, "loss": 0.04853469133377075, "memory(GiB)": 21.32, "step": 4973, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.958766 }, { "epoch": 0.1615826917454439, "grad_norm": 0.4130382537841797, "learning_rate": 9.66359279859288e-06, "loss": 0.04097297787666321, "memory(GiB)": 21.32, "step": 4974, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.958797 }, { "epoch": 0.16161517720819932, "grad_norm": 0.937619149684906, "learning_rate": 9.663399071556107e-06, "loss": 0.06059752404689789, "memory(GiB)": 21.32, "step": 4975, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.958827 }, { "epoch": 0.16164766267095476, "grad_norm": 0.7197327613830566, "learning_rate": 9.663205290697502e-06, "loss": 0.053143374621868134, "memory(GiB)": 21.32, "step": 4976, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.958855 }, { "epoch": 0.16168014813371018, "grad_norm": 0.6807003021240234, "learning_rate": 9.663011456019302e-06, "loss": 0.04600784182548523, "memory(GiB)": 21.32, "step": 4977, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.958878 }, { "epoch": 0.1617126335964656, "grad_norm": 0.5526070594787598, "learning_rate": 9.662817567523742e-06, "loss": 0.04902731999754906, "memory(GiB)": 21.32, "step": 4978, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.958906 }, { "epoch": 0.161745119059221, "grad_norm": 0.6207295656204224, "learning_rate": 9.662623625213063e-06, "loss": 0.055131010711193085, "memory(GiB)": 21.32, "step": 4979, "token_acc": 0.964, "train_speed(iter/s)": 0.958937 }, { "epoch": 0.16177760452197643, "grad_norm": 0.5605867505073547, "learning_rate": 9.6624296290895e-06, "loss": 0.0457691065967083, "memory(GiB)": 21.32, "step": 4980, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.958967 }, { "epoch": 0.16181008998473184, "grad_norm": 0.7337735891342163, "learning_rate": 9.662235579155294e-06, "loss": 0.06827257573604584, "memory(GiB)": 21.32, "step": 4981, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.958998 }, { "epoch": 0.16184257544748726, "grad_norm": 0.5867293477058411, "learning_rate": 9.662041475412684e-06, "loss": 0.060006171464920044, "memory(GiB)": 21.32, "step": 4982, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.959027 }, { "epoch": 0.16187506091024267, "grad_norm": 0.5023043155670166, "learning_rate": 9.66184731786391e-06, "loss": 0.052680015563964844, "memory(GiB)": 21.32, "step": 4983, "token_acc": 0.9712918660287081, "train_speed(iter/s)": 0.959057 }, { "epoch": 0.1619075463729981, "grad_norm": 0.5734416246414185, "learning_rate": 9.661653106511212e-06, "loss": 0.04747860133647919, "memory(GiB)": 21.32, "step": 4984, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.959086 }, { "epoch": 0.1619400318357535, "grad_norm": 2.730673313140869, "learning_rate": 9.661458841356834e-06, "loss": 0.060306571424007416, "memory(GiB)": 21.32, "step": 4985, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.959116 }, { "epoch": 0.16197251729850892, "grad_norm": 0.867421567440033, "learning_rate": 9.661264522403017e-06, "loss": 0.04327307641506195, "memory(GiB)": 21.32, "step": 4986, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.959146 }, { "epoch": 0.16200500276126434, "grad_norm": 0.4459421634674072, "learning_rate": 9.661070149652002e-06, "loss": 0.0465824119746685, "memory(GiB)": 21.32, "step": 4987, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.959179 }, { "epoch": 0.16203748822401975, "grad_norm": 0.5718124508857727, "learning_rate": 9.660875723106033e-06, "loss": 0.05909329652786255, "memory(GiB)": 21.32, "step": 4988, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95921 }, { "epoch": 0.16206997368677517, "grad_norm": 0.45988383889198303, "learning_rate": 9.660681242767356e-06, "loss": 0.058517806231975555, "memory(GiB)": 21.32, "step": 4989, "token_acc": 0.965, "train_speed(iter/s)": 0.959243 }, { "epoch": 0.1621024591495306, "grad_norm": 0.6080044507980347, "learning_rate": 9.660486708638214e-06, "loss": 0.046557821333408356, "memory(GiB)": 21.32, "step": 4990, "token_acc": 0.995, "train_speed(iter/s)": 0.959279 }, { "epoch": 0.162134944612286, "grad_norm": 0.6684180498123169, "learning_rate": 9.660292120720851e-06, "loss": 0.057427261024713516, "memory(GiB)": 21.32, "step": 4991, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.959317 }, { "epoch": 0.16216743007504142, "grad_norm": 0.5337368845939636, "learning_rate": 9.660097479017516e-06, "loss": 0.05788695439696312, "memory(GiB)": 21.32, "step": 4992, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.959353 }, { "epoch": 0.16219991553779683, "grad_norm": 0.46028387546539307, "learning_rate": 9.659902783530453e-06, "loss": 0.04586376994848251, "memory(GiB)": 21.32, "step": 4993, "token_acc": 0.9809160305343512, "train_speed(iter/s)": 0.959391 }, { "epoch": 0.16223240100055225, "grad_norm": 0.5919697880744934, "learning_rate": 9.659708034261909e-06, "loss": 0.04885338246822357, "memory(GiB)": 21.32, "step": 4994, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.959427 }, { "epoch": 0.16226488646330767, "grad_norm": 0.5340306758880615, "learning_rate": 9.659513231214133e-06, "loss": 0.04774843528866768, "memory(GiB)": 21.32, "step": 4995, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.959464 }, { "epoch": 0.16229737192606308, "grad_norm": 0.7106966972351074, "learning_rate": 9.65931837438937e-06, "loss": 0.053710851818323135, "memory(GiB)": 21.32, "step": 4996, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.959494 }, { "epoch": 0.1623298573888185, "grad_norm": 0.56330406665802, "learning_rate": 9.659123463789875e-06, "loss": 0.040936511009931564, "memory(GiB)": 21.32, "step": 4997, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.959524 }, { "epoch": 0.16236234285157392, "grad_norm": 0.5857616662979126, "learning_rate": 9.658928499417891e-06, "loss": 0.04002565145492554, "memory(GiB)": 21.32, "step": 4998, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.959553 }, { "epoch": 0.16239482831432933, "grad_norm": 12.200408935546875, "learning_rate": 9.658733481275672e-06, "loss": 0.06329667568206787, "memory(GiB)": 21.32, "step": 4999, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.959582 }, { "epoch": 0.16242731377708475, "grad_norm": 0.5917731523513794, "learning_rate": 9.658538409365468e-06, "loss": 0.05028115212917328, "memory(GiB)": 21.32, "step": 5000, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.959612 }, { "epoch": 0.16242731377708475, "eval_loss": 0.057892151176929474, "eval_runtime": 80.2755, "eval_samples_per_second": 123.948, "eval_steps_per_second": 3.874, "eval_token_acc": 0.9781094424909519, "step": 5000 }, { "epoch": 0.16245979923984016, "grad_norm": 0.8525911569595337, "learning_rate": 9.658343283689531e-06, "loss": 0.047660160809755325, "memory(GiB)": 21.32, "step": 5001, "token_acc": 0.9776986788965706, "train_speed(iter/s)": 0.94327 }, { "epoch": 0.16249228470259558, "grad_norm": 1.1501944065093994, "learning_rate": 9.658148104250111e-06, "loss": 0.054516054689884186, "memory(GiB)": 21.32, "step": 5002, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.943301 }, { "epoch": 0.162524770165351, "grad_norm": 0.8759548664093018, "learning_rate": 9.657952871049464e-06, "loss": 0.060533393174409866, "memory(GiB)": 21.32, "step": 5003, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.943333 }, { "epoch": 0.1625572556281064, "grad_norm": 0.6291260719299316, "learning_rate": 9.65775758408984e-06, "loss": 0.05470500886440277, "memory(GiB)": 21.32, "step": 5004, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.943366 }, { "epoch": 0.16258974109086183, "grad_norm": 0.7867599129676819, "learning_rate": 9.657562243373492e-06, "loss": 0.05486171692609787, "memory(GiB)": 21.32, "step": 5005, "token_acc": 0.9842931937172775, "train_speed(iter/s)": 0.943395 }, { "epoch": 0.16262222655361724, "grad_norm": 0.6733798384666443, "learning_rate": 9.65736684890268e-06, "loss": 0.05043363571166992, "memory(GiB)": 21.32, "step": 5006, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.94342 }, { "epoch": 0.16265471201637266, "grad_norm": 0.6926588416099548, "learning_rate": 9.657171400679654e-06, "loss": 0.050142332911491394, "memory(GiB)": 21.32, "step": 5007, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.943448 }, { "epoch": 0.1626871974791281, "grad_norm": 0.6421159505844116, "learning_rate": 9.65697589870667e-06, "loss": 0.04491041228175163, "memory(GiB)": 21.32, "step": 5008, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.943474 }, { "epoch": 0.16271968294188352, "grad_norm": 0.7961001396179199, "learning_rate": 9.656780342985988e-06, "loss": 0.054900676012039185, "memory(GiB)": 21.32, "step": 5009, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.943503 }, { "epoch": 0.16275216840463894, "grad_norm": 1.1338027715682983, "learning_rate": 9.656584733519861e-06, "loss": 0.06617487967014313, "memory(GiB)": 21.32, "step": 5010, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.943528 }, { "epoch": 0.16278465386739435, "grad_norm": 0.5075665712356567, "learning_rate": 9.65638907031055e-06, "loss": 0.051757387816905975, "memory(GiB)": 21.32, "step": 5011, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.943551 }, { "epoch": 0.16281713933014977, "grad_norm": 0.5218102335929871, "learning_rate": 9.65619335336031e-06, "loss": 0.05192890763282776, "memory(GiB)": 21.32, "step": 5012, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.943579 }, { "epoch": 0.16284962479290518, "grad_norm": 0.5478705763816833, "learning_rate": 9.655997582671402e-06, "loss": 0.04513115435838699, "memory(GiB)": 21.32, "step": 5013, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.943608 }, { "epoch": 0.1628821102556606, "grad_norm": 0.48307204246520996, "learning_rate": 9.655801758246084e-06, "loss": 0.05117914825677872, "memory(GiB)": 21.32, "step": 5014, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.94364 }, { "epoch": 0.16291459571841602, "grad_norm": 0.5956507921218872, "learning_rate": 9.655605880086617e-06, "loss": 0.05579748377203941, "memory(GiB)": 21.32, "step": 5015, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.943672 }, { "epoch": 0.16294708118117143, "grad_norm": 0.44949662685394287, "learning_rate": 9.655409948195263e-06, "loss": 0.04916563630104065, "memory(GiB)": 21.32, "step": 5016, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.943705 }, { "epoch": 0.16297956664392685, "grad_norm": 0.4704037010669708, "learning_rate": 9.65521396257428e-06, "loss": 0.04445979744195938, "memory(GiB)": 21.32, "step": 5017, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.943736 }, { "epoch": 0.16301205210668226, "grad_norm": 0.34341105818748474, "learning_rate": 9.655017923225934e-06, "loss": 0.04276309907436371, "memory(GiB)": 21.32, "step": 5018, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.943766 }, { "epoch": 0.16304453756943768, "grad_norm": 0.5584226250648499, "learning_rate": 9.654821830152484e-06, "loss": 0.050592903047800064, "memory(GiB)": 21.32, "step": 5019, "token_acc": 0.9740740740740741, "train_speed(iter/s)": 0.943799 }, { "epoch": 0.1630770230321931, "grad_norm": 0.6024951934814453, "learning_rate": 9.654625683356194e-06, "loss": 0.043431755155324936, "memory(GiB)": 21.32, "step": 5020, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.94383 }, { "epoch": 0.1631095084949485, "grad_norm": 0.6829084157943726, "learning_rate": 9.654429482839327e-06, "loss": 0.05278506129980087, "memory(GiB)": 21.32, "step": 5021, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.943864 }, { "epoch": 0.16314199395770393, "grad_norm": 2.577451229095459, "learning_rate": 9.65423322860415e-06, "loss": 0.058783382177352905, "memory(GiB)": 21.32, "step": 5022, "token_acc": 0.945273631840796, "train_speed(iter/s)": 0.943896 }, { "epoch": 0.16317447942045935, "grad_norm": 0.6429078578948975, "learning_rate": 9.654036920652929e-06, "loss": 0.04739446938037872, "memory(GiB)": 21.32, "step": 5023, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.943928 }, { "epoch": 0.16320696488321476, "grad_norm": 0.7575926184654236, "learning_rate": 9.653840558987926e-06, "loss": 0.044791653752326965, "memory(GiB)": 21.32, "step": 5024, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.943958 }, { "epoch": 0.16323945034597018, "grad_norm": 0.5309860110282898, "learning_rate": 9.653644143611408e-06, "loss": 0.04731272906064987, "memory(GiB)": 21.32, "step": 5025, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.943985 }, { "epoch": 0.1632719358087256, "grad_norm": 0.5360579490661621, "learning_rate": 9.653447674525643e-06, "loss": 0.04758206382393837, "memory(GiB)": 21.32, "step": 5026, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.944016 }, { "epoch": 0.163304421271481, "grad_norm": 0.5711662769317627, "learning_rate": 9.653251151732897e-06, "loss": 0.05395542457699776, "memory(GiB)": 21.32, "step": 5027, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.944049 }, { "epoch": 0.16333690673423643, "grad_norm": 0.7685291171073914, "learning_rate": 9.653054575235441e-06, "loss": 0.0469193235039711, "memory(GiB)": 21.32, "step": 5028, "token_acc": 0.9938271604938271, "train_speed(iter/s)": 0.944078 }, { "epoch": 0.16336939219699184, "grad_norm": 0.5067219734191895, "learning_rate": 9.652857945035541e-06, "loss": 0.046920523047447205, "memory(GiB)": 21.32, "step": 5029, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.944111 }, { "epoch": 0.16340187765974726, "grad_norm": 0.6830836534500122, "learning_rate": 9.652661261135467e-06, "loss": 0.045592304319143295, "memory(GiB)": 21.32, "step": 5030, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.944139 }, { "epoch": 0.16343436312250267, "grad_norm": 1.4345085620880127, "learning_rate": 9.652464523537491e-06, "loss": 0.047680385410785675, "memory(GiB)": 21.32, "step": 5031, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.944167 }, { "epoch": 0.1634668485852581, "grad_norm": 0.568907618522644, "learning_rate": 9.652267732243881e-06, "loss": 0.053826551884412766, "memory(GiB)": 21.32, "step": 5032, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.944205 }, { "epoch": 0.1634993340480135, "grad_norm": 0.6213602423667908, "learning_rate": 9.65207088725691e-06, "loss": 0.05379679799079895, "memory(GiB)": 21.32, "step": 5033, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.944243 }, { "epoch": 0.16353181951076892, "grad_norm": 0.39417529106140137, "learning_rate": 9.651873988578849e-06, "loss": 0.05395588278770447, "memory(GiB)": 21.32, "step": 5034, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.944281 }, { "epoch": 0.16356430497352434, "grad_norm": 0.6976223587989807, "learning_rate": 9.651677036211968e-06, "loss": 0.052201345562934875, "memory(GiB)": 21.32, "step": 5035, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.944319 }, { "epoch": 0.16359679043627975, "grad_norm": 0.622255802154541, "learning_rate": 9.651480030158545e-06, "loss": 0.04622715711593628, "memory(GiB)": 21.32, "step": 5036, "token_acc": 0.9802631578947368, "train_speed(iter/s)": 0.944356 }, { "epoch": 0.16362927589903517, "grad_norm": 0.564414381980896, "learning_rate": 9.651282970420853e-06, "loss": 0.04445574805140495, "memory(GiB)": 21.32, "step": 5037, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.944395 }, { "epoch": 0.1636617613617906, "grad_norm": 0.6439695954322815, "learning_rate": 9.651085857001162e-06, "loss": 0.0566377267241478, "memory(GiB)": 21.32, "step": 5038, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.944431 }, { "epoch": 0.163694246824546, "grad_norm": 0.7778895497322083, "learning_rate": 9.650888689901749e-06, "loss": 0.060163892805576324, "memory(GiB)": 21.32, "step": 5039, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.944418 }, { "epoch": 0.16372673228730145, "grad_norm": 0.6060009002685547, "learning_rate": 9.65069146912489e-06, "loss": 0.06408379971981049, "memory(GiB)": 21.32, "step": 5040, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.944455 }, { "epoch": 0.16375921775005686, "grad_norm": 0.46537259221076965, "learning_rate": 9.650494194672864e-06, "loss": 0.050619661808013916, "memory(GiB)": 21.32, "step": 5041, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.944492 }, { "epoch": 0.16379170321281228, "grad_norm": 0.9073120951652527, "learning_rate": 9.650296866547943e-06, "loss": 0.06055482476949692, "memory(GiB)": 21.32, "step": 5042, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.94453 }, { "epoch": 0.1638241886755677, "grad_norm": 0.6571362018585205, "learning_rate": 9.650099484752409e-06, "loss": 0.054791562259197235, "memory(GiB)": 21.32, "step": 5043, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.944567 }, { "epoch": 0.1638566741383231, "grad_norm": 1.0217381715774536, "learning_rate": 9.649902049288537e-06, "loss": 0.0467374362051487, "memory(GiB)": 21.32, "step": 5044, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.944606 }, { "epoch": 0.16388915960107853, "grad_norm": 0.7448587417602539, "learning_rate": 9.649704560158605e-06, "loss": 0.051054418087005615, "memory(GiB)": 21.32, "step": 5045, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.944644 }, { "epoch": 0.16392164506383394, "grad_norm": 0.49226438999176025, "learning_rate": 9.649507017364894e-06, "loss": 0.048679474741220474, "memory(GiB)": 21.32, "step": 5046, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.944681 }, { "epoch": 0.16395413052658936, "grad_norm": 0.6188730001449585, "learning_rate": 9.649309420909685e-06, "loss": 0.0481262281537056, "memory(GiB)": 21.32, "step": 5047, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.944717 }, { "epoch": 0.16398661598934478, "grad_norm": 0.6186234354972839, "learning_rate": 9.649111770795254e-06, "loss": 0.05931000038981438, "memory(GiB)": 21.32, "step": 5048, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.944755 }, { "epoch": 0.1640191014521002, "grad_norm": 0.5772006511688232, "learning_rate": 9.648914067023888e-06, "loss": 0.05911391228437424, "memory(GiB)": 21.32, "step": 5049, "token_acc": 0.984, "train_speed(iter/s)": 0.944793 }, { "epoch": 0.1640515869148556, "grad_norm": 0.5653462409973145, "learning_rate": 9.648716309597866e-06, "loss": 0.048159461468458176, "memory(GiB)": 21.32, "step": 5050, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.944832 }, { "epoch": 0.16408407237761102, "grad_norm": 0.8619272112846375, "learning_rate": 9.648518498519468e-06, "loss": 0.04978591948747635, "memory(GiB)": 21.32, "step": 5051, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.944869 }, { "epoch": 0.16411655784036644, "grad_norm": 0.6488171815872192, "learning_rate": 9.648320633790982e-06, "loss": 0.05541759729385376, "memory(GiB)": 21.32, "step": 5052, "token_acc": 0.9644670050761421, "train_speed(iter/s)": 0.944901 }, { "epoch": 0.16414904330312186, "grad_norm": 1.0455796718597412, "learning_rate": 9.648122715414687e-06, "loss": 0.041874151676893234, "memory(GiB)": 21.32, "step": 5053, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.944932 }, { "epoch": 0.16418152876587727, "grad_norm": 0.5784266591072083, "learning_rate": 9.647924743392871e-06, "loss": 0.058499373495578766, "memory(GiB)": 21.32, "step": 5054, "token_acc": 0.9896551724137931, "train_speed(iter/s)": 0.944964 }, { "epoch": 0.1642140142286327, "grad_norm": 0.5622340440750122, "learning_rate": 9.647726717727815e-06, "loss": 0.04795412719249725, "memory(GiB)": 21.32, "step": 5055, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.944994 }, { "epoch": 0.1642464996913881, "grad_norm": 0.9867557883262634, "learning_rate": 9.647528638421807e-06, "loss": 0.04802151396870613, "memory(GiB)": 21.32, "step": 5056, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.945025 }, { "epoch": 0.16427898515414352, "grad_norm": 0.54239422082901, "learning_rate": 9.647330505477134e-06, "loss": 0.0422431118786335, "memory(GiB)": 21.32, "step": 5057, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.945056 }, { "epoch": 0.16431147061689894, "grad_norm": 1.158975601196289, "learning_rate": 9.64713231889608e-06, "loss": 0.05737016350030899, "memory(GiB)": 21.32, "step": 5058, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.945089 }, { "epoch": 0.16434395607965435, "grad_norm": 0.5337629914283752, "learning_rate": 9.646934078680935e-06, "loss": 0.04519630968570709, "memory(GiB)": 21.32, "step": 5059, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.94512 }, { "epoch": 0.16437644154240977, "grad_norm": 0.5925673246383667, "learning_rate": 9.646735784833984e-06, "loss": 0.06396812200546265, "memory(GiB)": 21.32, "step": 5060, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.945151 }, { "epoch": 0.16440892700516518, "grad_norm": 0.41864311695098877, "learning_rate": 9.64653743735752e-06, "loss": 0.04243157058954239, "memory(GiB)": 21.32, "step": 5061, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.945178 }, { "epoch": 0.1644414124679206, "grad_norm": 0.6790570616722107, "learning_rate": 9.646339036253826e-06, "loss": 0.04785147309303284, "memory(GiB)": 21.32, "step": 5062, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.945207 }, { "epoch": 0.16447389793067602, "grad_norm": 0.6562475562095642, "learning_rate": 9.646140581525198e-06, "loss": 0.05557432025671005, "memory(GiB)": 21.32, "step": 5063, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.945237 }, { "epoch": 0.16450638339343143, "grad_norm": 0.5874664187431335, "learning_rate": 9.645942073173925e-06, "loss": 0.03450505807995796, "memory(GiB)": 21.32, "step": 5064, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.945268 }, { "epoch": 0.16453886885618685, "grad_norm": 0.6291543841362, "learning_rate": 9.645743511202292e-06, "loss": 0.048547931015491486, "memory(GiB)": 21.32, "step": 5065, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.945298 }, { "epoch": 0.16457135431894226, "grad_norm": 0.6393155455589294, "learning_rate": 9.645544895612599e-06, "loss": 0.04983007535338402, "memory(GiB)": 21.32, "step": 5066, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.945329 }, { "epoch": 0.16460383978169768, "grad_norm": 0.6801211833953857, "learning_rate": 9.645346226407133e-06, "loss": 0.05838168412446976, "memory(GiB)": 21.32, "step": 5067, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.945361 }, { "epoch": 0.1646363252444531, "grad_norm": 0.6216561794281006, "learning_rate": 9.64514750358819e-06, "loss": 0.04966295510530472, "memory(GiB)": 21.32, "step": 5068, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.945391 }, { "epoch": 0.1646688107072085, "grad_norm": 0.7016139030456543, "learning_rate": 9.64494872715806e-06, "loss": 0.05223802104592323, "memory(GiB)": 21.32, "step": 5069, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.945424 }, { "epoch": 0.16470129616996393, "grad_norm": 0.8202245831489563, "learning_rate": 9.64474989711904e-06, "loss": 0.05183953046798706, "memory(GiB)": 21.32, "step": 5070, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.94544 }, { "epoch": 0.16473378163271934, "grad_norm": 1.1477755308151245, "learning_rate": 9.644551013473426e-06, "loss": 0.05219151824712753, "memory(GiB)": 21.32, "step": 5071, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.945467 }, { "epoch": 0.1647662670954748, "grad_norm": 0.6659194827079773, "learning_rate": 9.64435207622351e-06, "loss": 0.052841149270534515, "memory(GiB)": 21.32, "step": 5072, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.945497 }, { "epoch": 0.1647987525582302, "grad_norm": 0.5845001935958862, "learning_rate": 9.64415308537159e-06, "loss": 0.061086419969797134, "memory(GiB)": 21.32, "step": 5073, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.945528 }, { "epoch": 0.16483123802098562, "grad_norm": 0.5200133323669434, "learning_rate": 9.64395404091996e-06, "loss": 0.04603438451886177, "memory(GiB)": 21.32, "step": 5074, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.94556 }, { "epoch": 0.16486372348374104, "grad_norm": 0.5441588759422302, "learning_rate": 9.643754942870922e-06, "loss": 0.052195340394973755, "memory(GiB)": 21.32, "step": 5075, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.94559 }, { "epoch": 0.16489620894649645, "grad_norm": 0.5220092535018921, "learning_rate": 9.64355579122677e-06, "loss": 0.04815807193517685, "memory(GiB)": 21.32, "step": 5076, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.945618 }, { "epoch": 0.16492869440925187, "grad_norm": 0.8721487522125244, "learning_rate": 9.643356585989803e-06, "loss": 0.04681375250220299, "memory(GiB)": 21.32, "step": 5077, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.945645 }, { "epoch": 0.16496117987200729, "grad_norm": 0.9531168341636658, "learning_rate": 9.643157327162322e-06, "loss": 0.05299535393714905, "memory(GiB)": 21.32, "step": 5078, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.945676 }, { "epoch": 0.1649936653347627, "grad_norm": 0.45949944853782654, "learning_rate": 9.642958014746624e-06, "loss": 0.05462668836116791, "memory(GiB)": 21.32, "step": 5079, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.945706 }, { "epoch": 0.16502615079751812, "grad_norm": 0.6115596890449524, "learning_rate": 9.642758648745011e-06, "loss": 0.06052626296877861, "memory(GiB)": 21.32, "step": 5080, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.945734 }, { "epoch": 0.16505863626027353, "grad_norm": 0.655717134475708, "learning_rate": 9.642559229159784e-06, "loss": 0.05620448663830757, "memory(GiB)": 21.32, "step": 5081, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.945764 }, { "epoch": 0.16509112172302895, "grad_norm": 0.554165244102478, "learning_rate": 9.642359755993245e-06, "loss": 0.046765465289354324, "memory(GiB)": 21.32, "step": 5082, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.945795 }, { "epoch": 0.16512360718578437, "grad_norm": 0.6477575302124023, "learning_rate": 9.642160229247695e-06, "loss": 0.053227923810482025, "memory(GiB)": 21.32, "step": 5083, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.945826 }, { "epoch": 0.16515609264853978, "grad_norm": 0.5035938620567322, "learning_rate": 9.641960648925437e-06, "loss": 0.043517254292964935, "memory(GiB)": 21.32, "step": 5084, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.945859 }, { "epoch": 0.1651885781112952, "grad_norm": 0.5906373262405396, "learning_rate": 9.641761015028775e-06, "loss": 0.05043443292379379, "memory(GiB)": 21.32, "step": 5085, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.945892 }, { "epoch": 0.16522106357405061, "grad_norm": 0.47773227095603943, "learning_rate": 9.641561327560012e-06, "loss": 0.04328840970993042, "memory(GiB)": 21.32, "step": 5086, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.945925 }, { "epoch": 0.16525354903680603, "grad_norm": 0.60568767786026, "learning_rate": 9.641361586521455e-06, "loss": 0.06468160450458527, "memory(GiB)": 21.32, "step": 5087, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.945954 }, { "epoch": 0.16528603449956145, "grad_norm": 0.3839889168739319, "learning_rate": 9.641161791915406e-06, "loss": 0.04973636195063591, "memory(GiB)": 21.32, "step": 5088, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.945982 }, { "epoch": 0.16531851996231686, "grad_norm": 0.7586643695831299, "learning_rate": 9.640961943744172e-06, "loss": 0.06385733187198639, "memory(GiB)": 21.32, "step": 5089, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.946008 }, { "epoch": 0.16535100542507228, "grad_norm": 14.90857219696045, "learning_rate": 9.640762042010061e-06, "loss": 0.06357075273990631, "memory(GiB)": 21.32, "step": 5090, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.946038 }, { "epoch": 0.1653834908878277, "grad_norm": 0.36929163336753845, "learning_rate": 9.64056208671538e-06, "loss": 0.044947169721126556, "memory(GiB)": 21.32, "step": 5091, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.946069 }, { "epoch": 0.1654159763505831, "grad_norm": 2.6512298583984375, "learning_rate": 9.640362077862436e-06, "loss": 0.05989230424165726, "memory(GiB)": 21.32, "step": 5092, "token_acc": 0.9764150943396226, "train_speed(iter/s)": 0.946102 }, { "epoch": 0.16544846181333853, "grad_norm": 0.664646565914154, "learning_rate": 9.640162015453536e-06, "loss": 0.045523472130298615, "memory(GiB)": 21.32, "step": 5093, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.946135 }, { "epoch": 0.16548094727609394, "grad_norm": 0.5559518337249756, "learning_rate": 9.639961899490991e-06, "loss": 0.049387820065021515, "memory(GiB)": 21.32, "step": 5094, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.946169 }, { "epoch": 0.16551343273884936, "grad_norm": 0.7592869400978088, "learning_rate": 9.63976172997711e-06, "loss": 0.04973042011260986, "memory(GiB)": 21.32, "step": 5095, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.946208 }, { "epoch": 0.16554591820160477, "grad_norm": 0.5038393139839172, "learning_rate": 9.639561506914203e-06, "loss": 0.045849792659282684, "memory(GiB)": 21.32, "step": 5096, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.946243 }, { "epoch": 0.1655784036643602, "grad_norm": 0.47309622168540955, "learning_rate": 9.63936123030458e-06, "loss": 0.04095899313688278, "memory(GiB)": 21.32, "step": 5097, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.94628 }, { "epoch": 0.1656108891271156, "grad_norm": 0.5315437912940979, "learning_rate": 9.639160900150554e-06, "loss": 0.042831920087337494, "memory(GiB)": 21.32, "step": 5098, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.946318 }, { "epoch": 0.16564337458987102, "grad_norm": 0.5760030746459961, "learning_rate": 9.638960516454437e-06, "loss": 0.04896008223295212, "memory(GiB)": 21.32, "step": 5099, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.946356 }, { "epoch": 0.16567586005262644, "grad_norm": 0.7733873128890991, "learning_rate": 9.638760079218542e-06, "loss": 0.05866921693086624, "memory(GiB)": 21.32, "step": 5100, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.946391 }, { "epoch": 0.16570834551538186, "grad_norm": 0.6455259919166565, "learning_rate": 9.63855958844518e-06, "loss": 0.0520796999335289, "memory(GiB)": 21.32, "step": 5101, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.946429 }, { "epoch": 0.16574083097813727, "grad_norm": 0.6560297608375549, "learning_rate": 9.638359044136666e-06, "loss": 0.04628302901983261, "memory(GiB)": 21.32, "step": 5102, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.946467 }, { "epoch": 0.1657733164408927, "grad_norm": 0.5078476071357727, "learning_rate": 9.638158446295318e-06, "loss": 0.05502677708864212, "memory(GiB)": 21.32, "step": 5103, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.946505 }, { "epoch": 0.16580580190364813, "grad_norm": 0.6788594722747803, "learning_rate": 9.637957794923445e-06, "loss": 0.0651123970746994, "memory(GiB)": 21.32, "step": 5104, "token_acc": 0.9625, "train_speed(iter/s)": 0.946542 }, { "epoch": 0.16583828736640355, "grad_norm": 1.0140882730484009, "learning_rate": 9.637757090023368e-06, "loss": 0.06614868342876434, "memory(GiB)": 21.32, "step": 5105, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.946579 }, { "epoch": 0.16587077282915896, "grad_norm": 0.8057443499565125, "learning_rate": 9.6375563315974e-06, "loss": 0.06137183681130409, "memory(GiB)": 21.32, "step": 5106, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.946617 }, { "epoch": 0.16590325829191438, "grad_norm": 0.6506890058517456, "learning_rate": 9.63735551964786e-06, "loss": 0.04322833940386772, "memory(GiB)": 21.32, "step": 5107, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.946654 }, { "epoch": 0.1659357437546698, "grad_norm": 0.5067635178565979, "learning_rate": 9.637154654177063e-06, "loss": 0.050213057547807693, "memory(GiB)": 21.32, "step": 5108, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.94669 }, { "epoch": 0.1659682292174252, "grad_norm": 0.5284448266029358, "learning_rate": 9.636953735187331e-06, "loss": 0.051913149654865265, "memory(GiB)": 21.32, "step": 5109, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.946728 }, { "epoch": 0.16600071468018063, "grad_norm": 0.5228046774864197, "learning_rate": 9.636752762680982e-06, "loss": 0.05556875467300415, "memory(GiB)": 21.32, "step": 5110, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.946761 }, { "epoch": 0.16603320014293604, "grad_norm": 0.9024496078491211, "learning_rate": 9.636551736660334e-06, "loss": 0.05585004389286041, "memory(GiB)": 21.32, "step": 5111, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.946787 }, { "epoch": 0.16606568560569146, "grad_norm": 0.39867424964904785, "learning_rate": 9.636350657127706e-06, "loss": 0.04308561235666275, "memory(GiB)": 21.32, "step": 5112, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.94682 }, { "epoch": 0.16609817106844688, "grad_norm": 0.7269496321678162, "learning_rate": 9.636149524085424e-06, "loss": 0.055158331990242004, "memory(GiB)": 21.32, "step": 5113, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.94685 }, { "epoch": 0.1661306565312023, "grad_norm": 0.7546184659004211, "learning_rate": 9.635948337535802e-06, "loss": 0.05330462381243706, "memory(GiB)": 21.32, "step": 5114, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.946881 }, { "epoch": 0.1661631419939577, "grad_norm": 0.6139071583747864, "learning_rate": 9.635747097481168e-06, "loss": 0.05092015862464905, "memory(GiB)": 21.32, "step": 5115, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.94691 }, { "epoch": 0.16619562745671312, "grad_norm": 1.100319504737854, "learning_rate": 9.635545803923843e-06, "loss": 0.05238119512796402, "memory(GiB)": 21.32, "step": 5116, "token_acc": 0.988, "train_speed(iter/s)": 0.946937 }, { "epoch": 0.16622811291946854, "grad_norm": 0.5606961846351624, "learning_rate": 9.635344456866147e-06, "loss": 0.04874324053525925, "memory(GiB)": 21.32, "step": 5117, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.946965 }, { "epoch": 0.16626059838222396, "grad_norm": 0.5220843553543091, "learning_rate": 9.635143056310408e-06, "loss": 0.04890109598636627, "memory(GiB)": 21.32, "step": 5118, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.946994 }, { "epoch": 0.16629308384497937, "grad_norm": 0.6734864711761475, "learning_rate": 9.634941602258949e-06, "loss": 0.05609199404716492, "memory(GiB)": 21.32, "step": 5119, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.947024 }, { "epoch": 0.1663255693077348, "grad_norm": 0.8328126668930054, "learning_rate": 9.634740094714093e-06, "loss": 0.043559860438108444, "memory(GiB)": 21.32, "step": 5120, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.947055 }, { "epoch": 0.1663580547704902, "grad_norm": 0.69295334815979, "learning_rate": 9.63453853367817e-06, "loss": 0.0631750226020813, "memory(GiB)": 21.32, "step": 5121, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.947085 }, { "epoch": 0.16639054023324562, "grad_norm": 0.6905889511108398, "learning_rate": 9.634336919153502e-06, "loss": 0.062247999012470245, "memory(GiB)": 21.32, "step": 5122, "token_acc": 0.9618320610687023, "train_speed(iter/s)": 0.947115 }, { "epoch": 0.16642302569600104, "grad_norm": 0.5090658068656921, "learning_rate": 9.634135251142418e-06, "loss": 0.04432203993201256, "memory(GiB)": 21.32, "step": 5123, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.947146 }, { "epoch": 0.16645551115875645, "grad_norm": 0.5307292938232422, "learning_rate": 9.633933529647245e-06, "loss": 0.05799155309796333, "memory(GiB)": 21.32, "step": 5124, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.947176 }, { "epoch": 0.16648799662151187, "grad_norm": 0.6729259490966797, "learning_rate": 9.633731754670312e-06, "loss": 0.04508725181221962, "memory(GiB)": 21.32, "step": 5125, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.947202 }, { "epoch": 0.16652048208426729, "grad_norm": 0.8571934700012207, "learning_rate": 9.633529926213947e-06, "loss": 0.05704602971673012, "memory(GiB)": 21.32, "step": 5126, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.947232 }, { "epoch": 0.1665529675470227, "grad_norm": 0.5765273571014404, "learning_rate": 9.633328044280478e-06, "loss": 0.05100339651107788, "memory(GiB)": 21.32, "step": 5127, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.947264 }, { "epoch": 0.16658545300977812, "grad_norm": 0.6076290011405945, "learning_rate": 9.633126108872239e-06, "loss": 0.05061780661344528, "memory(GiB)": 21.32, "step": 5128, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.947292 }, { "epoch": 0.16661793847253353, "grad_norm": 1.1142128705978394, "learning_rate": 9.632924119991556e-06, "loss": 0.05992499738931656, "memory(GiB)": 21.32, "step": 5129, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.947326 }, { "epoch": 0.16665042393528895, "grad_norm": 0.6270645260810852, "learning_rate": 9.632722077640763e-06, "loss": 0.05170720815658569, "memory(GiB)": 21.32, "step": 5130, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.947364 }, { "epoch": 0.16668290939804437, "grad_norm": 0.48357781767845154, "learning_rate": 9.632519981822191e-06, "loss": 0.04818137362599373, "memory(GiB)": 21.32, "step": 5131, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.947398 }, { "epoch": 0.16671539486079978, "grad_norm": 0.5278130173683167, "learning_rate": 9.632317832538171e-06, "loss": 0.04711860045790672, "memory(GiB)": 21.32, "step": 5132, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.947432 }, { "epoch": 0.1667478803235552, "grad_norm": 1.1604552268981934, "learning_rate": 9.63211562979104e-06, "loss": 0.05688123032450676, "memory(GiB)": 21.32, "step": 5133, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.947465 }, { "epoch": 0.1667803657863106, "grad_norm": 0.4806995093822479, "learning_rate": 9.631913373583128e-06, "loss": 0.04501432180404663, "memory(GiB)": 21.32, "step": 5134, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.947496 }, { "epoch": 0.16681285124906603, "grad_norm": 0.6807375550270081, "learning_rate": 9.63171106391677e-06, "loss": 0.05579793453216553, "memory(GiB)": 21.32, "step": 5135, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.947526 }, { "epoch": 0.16684533671182147, "grad_norm": 0.6704477071762085, "learning_rate": 9.631508700794302e-06, "loss": 0.05854341387748718, "memory(GiB)": 21.32, "step": 5136, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.947558 }, { "epoch": 0.1668778221745769, "grad_norm": 0.5761864185333252, "learning_rate": 9.63130628421806e-06, "loss": 0.05422014370560646, "memory(GiB)": 21.32, "step": 5137, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.947588 }, { "epoch": 0.1669103076373323, "grad_norm": 0.49179089069366455, "learning_rate": 9.63110381419038e-06, "loss": 0.047958120703697205, "memory(GiB)": 21.32, "step": 5138, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.947619 }, { "epoch": 0.16694279310008772, "grad_norm": 0.9290023446083069, "learning_rate": 9.630901290713597e-06, "loss": 0.06182471290230751, "memory(GiB)": 21.32, "step": 5139, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.947649 }, { "epoch": 0.16697527856284314, "grad_norm": 1.347072958946228, "learning_rate": 9.630698713790049e-06, "loss": 0.05661071464419365, "memory(GiB)": 21.32, "step": 5140, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.947678 }, { "epoch": 0.16700776402559855, "grad_norm": 0.9619228839874268, "learning_rate": 9.630496083422074e-06, "loss": 0.05758528411388397, "memory(GiB)": 21.32, "step": 5141, "token_acc": 0.9719626168224299, "train_speed(iter/s)": 0.947708 }, { "epoch": 0.16704024948835397, "grad_norm": 0.502480685710907, "learning_rate": 9.630293399612012e-06, "loss": 0.04059722274541855, "memory(GiB)": 21.32, "step": 5142, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.947738 }, { "epoch": 0.1670727349511094, "grad_norm": 0.6690759658813477, "learning_rate": 9.630090662362201e-06, "loss": 0.056036222726106644, "memory(GiB)": 21.32, "step": 5143, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.947766 }, { "epoch": 0.1671052204138648, "grad_norm": 0.5684738159179688, "learning_rate": 9.62988787167498e-06, "loss": 0.0546485111117363, "memory(GiB)": 21.32, "step": 5144, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.947793 }, { "epoch": 0.16713770587662022, "grad_norm": 0.9541622400283813, "learning_rate": 9.629685027552694e-06, "loss": 0.05355461686849594, "memory(GiB)": 21.32, "step": 5145, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.947822 }, { "epoch": 0.16717019133937563, "grad_norm": 0.6543473601341248, "learning_rate": 9.629482129997676e-06, "loss": 0.054560378193855286, "memory(GiB)": 21.32, "step": 5146, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.947854 }, { "epoch": 0.16720267680213105, "grad_norm": 0.8820799589157104, "learning_rate": 9.629279179012275e-06, "loss": 0.04422498866915703, "memory(GiB)": 21.32, "step": 5147, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.947881 }, { "epoch": 0.16723516226488647, "grad_norm": 0.39664050936698914, "learning_rate": 9.62907617459883e-06, "loss": 0.04712077975273132, "memory(GiB)": 21.32, "step": 5148, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.947912 }, { "epoch": 0.16726764772764188, "grad_norm": 0.588827908039093, "learning_rate": 9.628873116759687e-06, "loss": 0.06140810623764992, "memory(GiB)": 21.32, "step": 5149, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.947943 }, { "epoch": 0.1673001331903973, "grad_norm": 0.919097363948822, "learning_rate": 9.628670005497184e-06, "loss": 0.0458323135972023, "memory(GiB)": 21.32, "step": 5150, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.947973 }, { "epoch": 0.16733261865315271, "grad_norm": 0.5034312009811401, "learning_rate": 9.62846684081367e-06, "loss": 0.04961695149540901, "memory(GiB)": 21.32, "step": 5151, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.948005 }, { "epoch": 0.16736510411590813, "grad_norm": 0.5353419780731201, "learning_rate": 9.628263622711487e-06, "loss": 0.051764942705631256, "memory(GiB)": 21.32, "step": 5152, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.948028 }, { "epoch": 0.16739758957866355, "grad_norm": 0.7625626921653748, "learning_rate": 9.628060351192983e-06, "loss": 0.05873510241508484, "memory(GiB)": 21.32, "step": 5153, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.948056 }, { "epoch": 0.16743007504141896, "grad_norm": 0.5744953155517578, "learning_rate": 9.627857026260502e-06, "loss": 0.04643397033214569, "memory(GiB)": 21.32, "step": 5154, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.948088 }, { "epoch": 0.16746256050417438, "grad_norm": 0.9350873231887817, "learning_rate": 9.62765364791639e-06, "loss": 0.05202572047710419, "memory(GiB)": 21.32, "step": 5155, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.948118 }, { "epoch": 0.1674950459669298, "grad_norm": 0.6724746227264404, "learning_rate": 9.627450216162998e-06, "loss": 0.049333974719047546, "memory(GiB)": 21.32, "step": 5156, "token_acc": 0.9725085910652921, "train_speed(iter/s)": 0.948149 }, { "epoch": 0.1675275314296852, "grad_norm": 0.45378774404525757, "learning_rate": 9.62724673100267e-06, "loss": 0.05329357087612152, "memory(GiB)": 21.32, "step": 5157, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.948183 }, { "epoch": 0.16756001689244063, "grad_norm": 0.6236662268638611, "learning_rate": 9.627043192437756e-06, "loss": 0.049147848039865494, "memory(GiB)": 21.32, "step": 5158, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.94822 }, { "epoch": 0.16759250235519604, "grad_norm": 1.0864720344543457, "learning_rate": 9.626839600470605e-06, "loss": 0.05108397454023361, "memory(GiB)": 21.32, "step": 5159, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.948257 }, { "epoch": 0.16762498781795146, "grad_norm": 0.6368234753608704, "learning_rate": 9.626635955103566e-06, "loss": 0.05769503116607666, "memory(GiB)": 21.32, "step": 5160, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.948295 }, { "epoch": 0.16765747328070688, "grad_norm": 0.5994953513145447, "learning_rate": 9.626432256338991e-06, "loss": 0.0396328866481781, "memory(GiB)": 21.32, "step": 5161, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.948329 }, { "epoch": 0.1676899587434623, "grad_norm": 0.5372447967529297, "learning_rate": 9.62622850417923e-06, "loss": 0.05162063613533974, "memory(GiB)": 21.32, "step": 5162, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.948365 }, { "epoch": 0.1677224442062177, "grad_norm": 0.7377714514732361, "learning_rate": 9.626024698626635e-06, "loss": 0.05521998554468155, "memory(GiB)": 21.32, "step": 5163, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.948401 }, { "epoch": 0.16775492966897312, "grad_norm": 1.10442054271698, "learning_rate": 9.625820839683557e-06, "loss": 0.06627698242664337, "memory(GiB)": 21.32, "step": 5164, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.948438 }, { "epoch": 0.16778741513172854, "grad_norm": 0.6395230889320374, "learning_rate": 9.625616927352349e-06, "loss": 0.059462063014507294, "memory(GiB)": 21.32, "step": 5165, "token_acc": 0.9661654135338346, "train_speed(iter/s)": 0.948473 }, { "epoch": 0.16781990059448396, "grad_norm": 0.5334858298301697, "learning_rate": 9.625412961635365e-06, "loss": 0.04703826457262039, "memory(GiB)": 21.32, "step": 5166, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.948507 }, { "epoch": 0.16785238605723937, "grad_norm": 0.5072698593139648, "learning_rate": 9.62520894253496e-06, "loss": 0.05356404557824135, "memory(GiB)": 21.32, "step": 5167, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.948535 }, { "epoch": 0.16788487151999482, "grad_norm": 0.5314221382141113, "learning_rate": 9.625004870053488e-06, "loss": 0.05664481967687607, "memory(GiB)": 21.32, "step": 5168, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.948565 }, { "epoch": 0.16791735698275023, "grad_norm": 0.5609354376792908, "learning_rate": 9.624800744193303e-06, "loss": 0.057697929441928864, "memory(GiB)": 21.32, "step": 5169, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.948595 }, { "epoch": 0.16794984244550565, "grad_norm": 0.5801573395729065, "learning_rate": 9.624596564956763e-06, "loss": 0.05185365676879883, "memory(GiB)": 21.32, "step": 5170, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.948625 }, { "epoch": 0.16798232790826106, "grad_norm": 0.6456145644187927, "learning_rate": 9.624392332346222e-06, "loss": 0.05146685987710953, "memory(GiB)": 21.32, "step": 5171, "token_acc": 0.9921875, "train_speed(iter/s)": 0.948653 }, { "epoch": 0.16801481337101648, "grad_norm": 0.6967915296554565, "learning_rate": 9.624188046364038e-06, "loss": 0.07260699570178986, "memory(GiB)": 21.32, "step": 5172, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.948683 }, { "epoch": 0.1680472988337719, "grad_norm": 0.548653244972229, "learning_rate": 9.62398370701257e-06, "loss": 0.04702429473400116, "memory(GiB)": 21.32, "step": 5173, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.948713 }, { "epoch": 0.1680797842965273, "grad_norm": 0.5430309772491455, "learning_rate": 9.623779314294176e-06, "loss": 0.05145341902971268, "memory(GiB)": 21.32, "step": 5174, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.948743 }, { "epoch": 0.16811226975928273, "grad_norm": 0.5878474116325378, "learning_rate": 9.623574868211215e-06, "loss": 0.05864425376057625, "memory(GiB)": 21.32, "step": 5175, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.948773 }, { "epoch": 0.16814475522203814, "grad_norm": 0.5621622204780579, "learning_rate": 9.623370368766042e-06, "loss": 0.0606408454477787, "memory(GiB)": 21.32, "step": 5176, "token_acc": 0.9596412556053812, "train_speed(iter/s)": 0.948804 }, { "epoch": 0.16817724068479356, "grad_norm": 1.291114091873169, "learning_rate": 9.623165815961026e-06, "loss": 0.05715213716030121, "memory(GiB)": 21.32, "step": 5177, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.948835 }, { "epoch": 0.16820972614754898, "grad_norm": 0.5737693309783936, "learning_rate": 9.62296120979852e-06, "loss": 0.04863917827606201, "memory(GiB)": 21.32, "step": 5178, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.948864 }, { "epoch": 0.1682422116103044, "grad_norm": 0.693028450012207, "learning_rate": 9.62275655028089e-06, "loss": 0.05346982926130295, "memory(GiB)": 21.32, "step": 5179, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.948895 }, { "epoch": 0.1682746970730598, "grad_norm": 0.40686365962028503, "learning_rate": 9.622551837410496e-06, "loss": 0.042878158390522, "memory(GiB)": 21.32, "step": 5180, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.948924 }, { "epoch": 0.16830718253581523, "grad_norm": 1.2315601110458374, "learning_rate": 9.622347071189702e-06, "loss": 0.056212007999420166, "memory(GiB)": 21.32, "step": 5181, "token_acc": 0.955719557195572, "train_speed(iter/s)": 0.948953 }, { "epoch": 0.16833966799857064, "grad_norm": 0.4626254737377167, "learning_rate": 9.62214225162087e-06, "loss": 0.03966100141406059, "memory(GiB)": 21.32, "step": 5182, "token_acc": 0.9792746113989638, "train_speed(iter/s)": 0.94898 }, { "epoch": 0.16837215346132606, "grad_norm": 0.6584303379058838, "learning_rate": 9.621937378706365e-06, "loss": 0.049488864839076996, "memory(GiB)": 21.32, "step": 5183, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.949013 }, { "epoch": 0.16840463892408147, "grad_norm": 0.5953715443611145, "learning_rate": 9.62173245244855e-06, "loss": 0.052999526262283325, "memory(GiB)": 21.32, "step": 5184, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.949041 }, { "epoch": 0.1684371243868369, "grad_norm": 0.40730029344558716, "learning_rate": 9.621527472849792e-06, "loss": 0.04417521879076958, "memory(GiB)": 21.32, "step": 5185, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.949072 }, { "epoch": 0.1684696098495923, "grad_norm": 0.5906920433044434, "learning_rate": 9.621322439912456e-06, "loss": 0.045864544808864594, "memory(GiB)": 21.32, "step": 5186, "token_acc": 0.98, "train_speed(iter/s)": 0.949109 }, { "epoch": 0.16850209531234772, "grad_norm": 0.44683849811553955, "learning_rate": 9.621117353638905e-06, "loss": 0.03737116605043411, "memory(GiB)": 21.32, "step": 5187, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.949148 }, { "epoch": 0.16853458077510314, "grad_norm": 0.4912704825401306, "learning_rate": 9.620912214031511e-06, "loss": 0.04639352858066559, "memory(GiB)": 21.32, "step": 5188, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.949186 }, { "epoch": 0.16856706623785855, "grad_norm": 0.5309862494468689, "learning_rate": 9.62070702109264e-06, "loss": 0.05826164782047272, "memory(GiB)": 21.32, "step": 5189, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.949223 }, { "epoch": 0.16859955170061397, "grad_norm": 0.5750771760940552, "learning_rate": 9.62050177482466e-06, "loss": 0.04352584853768349, "memory(GiB)": 21.32, "step": 5190, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.94926 }, { "epoch": 0.16863203716336939, "grad_norm": 0.6471695899963379, "learning_rate": 9.620296475229938e-06, "loss": 0.052484892308712006, "memory(GiB)": 21.32, "step": 5191, "token_acc": 0.9770992366412213, "train_speed(iter/s)": 0.949298 }, { "epoch": 0.1686645226261248, "grad_norm": 1.2389944791793823, "learning_rate": 9.620091122310846e-06, "loss": 0.04703470319509506, "memory(GiB)": 21.32, "step": 5192, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.949337 }, { "epoch": 0.16869700808888022, "grad_norm": 0.6417199969291687, "learning_rate": 9.619885716069754e-06, "loss": 0.044772278517484665, "memory(GiB)": 21.32, "step": 5193, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.949374 }, { "epoch": 0.16872949355163563, "grad_norm": 0.46566975116729736, "learning_rate": 9.619680256509031e-06, "loss": 0.049092039465904236, "memory(GiB)": 21.32, "step": 5194, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.949411 }, { "epoch": 0.16876197901439105, "grad_norm": 0.8318068385124207, "learning_rate": 9.61947474363105e-06, "loss": 0.056804485619068146, "memory(GiB)": 21.32, "step": 5195, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.949441 }, { "epoch": 0.16879446447714647, "grad_norm": 0.7407357096672058, "learning_rate": 9.619269177438181e-06, "loss": 0.045301102101802826, "memory(GiB)": 21.32, "step": 5196, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.949472 }, { "epoch": 0.16882694993990188, "grad_norm": 1.15459144115448, "learning_rate": 9.619063557932797e-06, "loss": 0.06326986849308014, "memory(GiB)": 21.32, "step": 5197, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.949502 }, { "epoch": 0.1688594354026573, "grad_norm": 0.6995338797569275, "learning_rate": 9.618857885117272e-06, "loss": 0.05269501358270645, "memory(GiB)": 21.32, "step": 5198, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.949533 }, { "epoch": 0.16889192086541271, "grad_norm": 0.8546050190925598, "learning_rate": 9.61865215899398e-06, "loss": 0.052471332252025604, "memory(GiB)": 21.32, "step": 5199, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.949562 }, { "epoch": 0.16892440632816816, "grad_norm": 1.1153844594955444, "learning_rate": 9.618446379565294e-06, "loss": 0.059483110904693604, "memory(GiB)": 21.32, "step": 5200, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.949595 }, { "epoch": 0.16895689179092357, "grad_norm": 0.5143856406211853, "learning_rate": 9.61824054683359e-06, "loss": 0.04650349169969559, "memory(GiB)": 21.32, "step": 5201, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.949626 }, { "epoch": 0.168989377253679, "grad_norm": 0.673704206943512, "learning_rate": 9.618034660801242e-06, "loss": 0.04780735820531845, "memory(GiB)": 21.32, "step": 5202, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.949655 }, { "epoch": 0.1690218627164344, "grad_norm": 0.5626240372657776, "learning_rate": 9.61782872147063e-06, "loss": 0.0520685575902462, "memory(GiB)": 21.32, "step": 5203, "token_acc": 0.9731800766283525, "train_speed(iter/s)": 0.949686 }, { "epoch": 0.16905434817918982, "grad_norm": 0.5252870917320251, "learning_rate": 9.617622728844125e-06, "loss": 0.051150061190128326, "memory(GiB)": 21.32, "step": 5204, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.949716 }, { "epoch": 0.16908683364194524, "grad_norm": 0.73478764295578, "learning_rate": 9.61741668292411e-06, "loss": 0.04507000371813774, "memory(GiB)": 21.32, "step": 5205, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.949743 }, { "epoch": 0.16911931910470065, "grad_norm": 0.9184777736663818, "learning_rate": 9.617210583712961e-06, "loss": 0.06026371568441391, "memory(GiB)": 21.32, "step": 5206, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.949771 }, { "epoch": 0.16915180456745607, "grad_norm": 0.5512567162513733, "learning_rate": 9.617004431213054e-06, "loss": 0.061287350952625275, "memory(GiB)": 21.32, "step": 5207, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.949804 }, { "epoch": 0.1691842900302115, "grad_norm": 0.5702096819877625, "learning_rate": 9.616798225426774e-06, "loss": 0.0448877215385437, "memory(GiB)": 21.32, "step": 5208, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.94983 }, { "epoch": 0.1692167754929669, "grad_norm": 0.6519201993942261, "learning_rate": 9.616591966356494e-06, "loss": 0.0469404011964798, "memory(GiB)": 21.32, "step": 5209, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.949862 }, { "epoch": 0.16924926095572232, "grad_norm": 0.6292917728424072, "learning_rate": 9.6163856540046e-06, "loss": 0.05670281499624252, "memory(GiB)": 21.32, "step": 5210, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.949895 }, { "epoch": 0.16928174641847774, "grad_norm": 1.1935516595840454, "learning_rate": 9.61617928837347e-06, "loss": 0.04654049873352051, "memory(GiB)": 21.32, "step": 5211, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.949927 }, { "epoch": 0.16931423188123315, "grad_norm": 0.678791344165802, "learning_rate": 9.615972869465489e-06, "loss": 0.05428915098309517, "memory(GiB)": 21.32, "step": 5212, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.94996 }, { "epoch": 0.16934671734398857, "grad_norm": 0.5083909034729004, "learning_rate": 9.615766397283036e-06, "loss": 0.05188349261879921, "memory(GiB)": 21.32, "step": 5213, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.949986 }, { "epoch": 0.16937920280674398, "grad_norm": 0.5932045578956604, "learning_rate": 9.615559871828494e-06, "loss": 0.05217519402503967, "memory(GiB)": 21.32, "step": 5214, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.950015 }, { "epoch": 0.1694116882694994, "grad_norm": 0.5562509894371033, "learning_rate": 9.61535329310425e-06, "loss": 0.045773424208164215, "memory(GiB)": 21.32, "step": 5215, "token_acc": 0.99, "train_speed(iter/s)": 0.950045 }, { "epoch": 0.16944417373225482, "grad_norm": 0.49526113271713257, "learning_rate": 9.615146661112684e-06, "loss": 0.051779359579086304, "memory(GiB)": 21.32, "step": 5216, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.950073 }, { "epoch": 0.16947665919501023, "grad_norm": 0.6322610974311829, "learning_rate": 9.614939975856184e-06, "loss": 0.05621630698442459, "memory(GiB)": 21.32, "step": 5217, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.950102 }, { "epoch": 0.16950914465776565, "grad_norm": 0.878024697303772, "learning_rate": 9.614733237337134e-06, "loss": 0.05597177892923355, "memory(GiB)": 21.32, "step": 5218, "token_acc": 0.9479166666666666, "train_speed(iter/s)": 0.950136 }, { "epoch": 0.16954163012052106, "grad_norm": 0.5382876992225647, "learning_rate": 9.614526445557921e-06, "loss": 0.04141112416982651, "memory(GiB)": 21.32, "step": 5219, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.950171 }, { "epoch": 0.16957411558327648, "grad_norm": 0.44830337166786194, "learning_rate": 9.614319600520929e-06, "loss": 0.04678652808070183, "memory(GiB)": 21.32, "step": 5220, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.950209 }, { "epoch": 0.1696066010460319, "grad_norm": 0.5917049050331116, "learning_rate": 9.614112702228548e-06, "loss": 0.050038449466228485, "memory(GiB)": 21.32, "step": 5221, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.950245 }, { "epoch": 0.1696390865087873, "grad_norm": 0.515903115272522, "learning_rate": 9.613905750683166e-06, "loss": 0.056787025183439255, "memory(GiB)": 21.32, "step": 5222, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.950283 }, { "epoch": 0.16967157197154273, "grad_norm": 0.6201292276382446, "learning_rate": 9.61369874588717e-06, "loss": 0.04118610918521881, "memory(GiB)": 21.32, "step": 5223, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.950316 }, { "epoch": 0.16970405743429814, "grad_norm": 0.4924517869949341, "learning_rate": 9.61349168784295e-06, "loss": 0.04933813959360123, "memory(GiB)": 21.32, "step": 5224, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.950353 }, { "epoch": 0.16973654289705356, "grad_norm": 0.6873788833618164, "learning_rate": 9.613284576552896e-06, "loss": 0.054399505257606506, "memory(GiB)": 21.32, "step": 5225, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.950385 }, { "epoch": 0.16976902835980898, "grad_norm": 0.4990396201610565, "learning_rate": 9.613077412019395e-06, "loss": 0.04660631716251373, "memory(GiB)": 21.32, "step": 5226, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.950414 }, { "epoch": 0.1698015138225644, "grad_norm": 0.6001128554344177, "learning_rate": 9.612870194244842e-06, "loss": 0.03874274715781212, "memory(GiB)": 21.32, "step": 5227, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.950446 }, { "epoch": 0.1698339992853198, "grad_norm": 0.6117668151855469, "learning_rate": 9.612662923231628e-06, "loss": 0.05140792205929756, "memory(GiB)": 21.32, "step": 5228, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.950477 }, { "epoch": 0.16986648474807522, "grad_norm": 0.6625932455062866, "learning_rate": 9.612455598982144e-06, "loss": 0.060637082904577255, "memory(GiB)": 21.32, "step": 5229, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.950504 }, { "epoch": 0.16989897021083064, "grad_norm": 0.7384286522865295, "learning_rate": 9.612248221498783e-06, "loss": 0.0561058409512043, "memory(GiB)": 21.32, "step": 5230, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.950531 }, { "epoch": 0.16993145567358606, "grad_norm": 1.103185772895813, "learning_rate": 9.612040790783939e-06, "loss": 0.05948536843061447, "memory(GiB)": 21.32, "step": 5231, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.950558 }, { "epoch": 0.1699639411363415, "grad_norm": 0.5712451338768005, "learning_rate": 9.611833306840005e-06, "loss": 0.0526394248008728, "memory(GiB)": 21.32, "step": 5232, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.950585 }, { "epoch": 0.16999642659909692, "grad_norm": 0.5372475981712341, "learning_rate": 9.611625769669377e-06, "loss": 0.039371557533741, "memory(GiB)": 21.32, "step": 5233, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950614 }, { "epoch": 0.17002891206185233, "grad_norm": 0.47964319586753845, "learning_rate": 9.611418179274449e-06, "loss": 0.04384056478738785, "memory(GiB)": 21.32, "step": 5234, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.950642 }, { "epoch": 0.17006139752460775, "grad_norm": 0.49610990285873413, "learning_rate": 9.611210535657618e-06, "loss": 0.04290825501084328, "memory(GiB)": 21.32, "step": 5235, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.950671 }, { "epoch": 0.17009388298736317, "grad_norm": 0.4420200288295746, "learning_rate": 9.61100283882128e-06, "loss": 0.045581307262182236, "memory(GiB)": 21.32, "step": 5236, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.950699 }, { "epoch": 0.17012636845011858, "grad_norm": 0.632324755191803, "learning_rate": 9.61079508876783e-06, "loss": 0.05784718692302704, "memory(GiB)": 21.32, "step": 5237, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.950728 }, { "epoch": 0.170158853912874, "grad_norm": 0.5942092537879944, "learning_rate": 9.61058728549967e-06, "loss": 0.05575300008058548, "memory(GiB)": 21.32, "step": 5238, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.950758 }, { "epoch": 0.1701913393756294, "grad_norm": 0.6933079957962036, "learning_rate": 9.610379429019196e-06, "loss": 0.05328287184238434, "memory(GiB)": 21.32, "step": 5239, "token_acc": 0.972, "train_speed(iter/s)": 0.950786 }, { "epoch": 0.17022382483838483, "grad_norm": 0.4754623770713806, "learning_rate": 9.610171519328806e-06, "loss": 0.04539719223976135, "memory(GiB)": 21.32, "step": 5240, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.950813 }, { "epoch": 0.17025631030114025, "grad_norm": 0.6794747114181519, "learning_rate": 9.609963556430902e-06, "loss": 0.04609914869070053, "memory(GiB)": 21.32, "step": 5241, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95084 }, { "epoch": 0.17028879576389566, "grad_norm": 0.7881958484649658, "learning_rate": 9.609755540327881e-06, "loss": 0.05909479781985283, "memory(GiB)": 21.32, "step": 5242, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.950869 }, { "epoch": 0.17032128122665108, "grad_norm": 0.5915894508361816, "learning_rate": 9.609547471022144e-06, "loss": 0.0419827438890934, "memory(GiB)": 21.32, "step": 5243, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.950898 }, { "epoch": 0.1703537666894065, "grad_norm": 0.8672627806663513, "learning_rate": 9.609339348516099e-06, "loss": 0.05596066638827324, "memory(GiB)": 21.32, "step": 5244, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.950931 }, { "epoch": 0.1703862521521619, "grad_norm": 0.7582015991210938, "learning_rate": 9.609131172812139e-06, "loss": 0.05479152873158455, "memory(GiB)": 21.32, "step": 5245, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.950968 }, { "epoch": 0.17041873761491733, "grad_norm": 1.736437439918518, "learning_rate": 9.608922943912672e-06, "loss": 0.056060709059238434, "memory(GiB)": 21.32, "step": 5246, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.951006 }, { "epoch": 0.17045122307767274, "grad_norm": 0.46267735958099365, "learning_rate": 9.608714661820098e-06, "loss": 0.046793751418590546, "memory(GiB)": 21.32, "step": 5247, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.951042 }, { "epoch": 0.17048370854042816, "grad_norm": 0.7049344778060913, "learning_rate": 9.608506326536823e-06, "loss": 0.05660189688205719, "memory(GiB)": 21.32, "step": 5248, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.95108 }, { "epoch": 0.17051619400318357, "grad_norm": 0.5407516360282898, "learning_rate": 9.608297938065253e-06, "loss": 0.04692409560084343, "memory(GiB)": 21.32, "step": 5249, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.951115 }, { "epoch": 0.170548679465939, "grad_norm": 0.5463194847106934, "learning_rate": 9.60808949640779e-06, "loss": 0.04454636946320534, "memory(GiB)": 21.32, "step": 5250, "token_acc": 1.0, "train_speed(iter/s)": 0.951151 }, { "epoch": 0.1705811649286944, "grad_norm": 0.5722894072532654, "learning_rate": 9.607881001566841e-06, "loss": 0.057171061635017395, "memory(GiB)": 21.32, "step": 5251, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 0.951189 }, { "epoch": 0.17061365039144982, "grad_norm": 0.631576657295227, "learning_rate": 9.607672453544811e-06, "loss": 0.04678687825798988, "memory(GiB)": 21.32, "step": 5252, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.951226 }, { "epoch": 0.17064613585420524, "grad_norm": 0.5410923361778259, "learning_rate": 9.60746385234411e-06, "loss": 0.052385225892066956, "memory(GiB)": 21.32, "step": 5253, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951262 }, { "epoch": 0.17067862131696065, "grad_norm": 0.5066540837287903, "learning_rate": 9.607255197967142e-06, "loss": 0.04829216003417969, "memory(GiB)": 21.32, "step": 5254, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.951297 }, { "epoch": 0.17071110677971607, "grad_norm": 0.794114351272583, "learning_rate": 9.607046490416319e-06, "loss": 0.056726064532995224, "memory(GiB)": 21.32, "step": 5255, "token_acc": 0.9641255605381166, "train_speed(iter/s)": 0.951331 }, { "epoch": 0.1707435922424715, "grad_norm": 0.7767817974090576, "learning_rate": 9.606837729694044e-06, "loss": 0.054990626871585846, "memory(GiB)": 21.32, "step": 5256, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.951363 }, { "epoch": 0.1707760777052269, "grad_norm": 0.5790910720825195, "learning_rate": 9.606628915802734e-06, "loss": 0.05728374421596527, "memory(GiB)": 21.32, "step": 5257, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.951393 }, { "epoch": 0.17080856316798232, "grad_norm": 1.269139051437378, "learning_rate": 9.606420048744793e-06, "loss": 0.05246042087674141, "memory(GiB)": 21.32, "step": 5258, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.951423 }, { "epoch": 0.17084104863073774, "grad_norm": 0.48883751034736633, "learning_rate": 9.606211128522633e-06, "loss": 0.049155257642269135, "memory(GiB)": 21.32, "step": 5259, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.951455 }, { "epoch": 0.17087353409349315, "grad_norm": 0.4717438220977783, "learning_rate": 9.606002155138666e-06, "loss": 0.05096956342458725, "memory(GiB)": 21.32, "step": 5260, "token_acc": 0.97265625, "train_speed(iter/s)": 0.951486 }, { "epoch": 0.17090601955624857, "grad_norm": 4.182275295257568, "learning_rate": 9.605793128595305e-06, "loss": 0.05030548572540283, "memory(GiB)": 21.32, "step": 5261, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.951519 }, { "epoch": 0.17093850501900398, "grad_norm": 0.5356042385101318, "learning_rate": 9.605584048894958e-06, "loss": 0.04998916760087013, "memory(GiB)": 21.32, "step": 5262, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.951552 }, { "epoch": 0.1709709904817594, "grad_norm": 0.6202136278152466, "learning_rate": 9.605374916040044e-06, "loss": 0.051719386130571365, "memory(GiB)": 21.32, "step": 5263, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.951581 }, { "epoch": 0.17100347594451484, "grad_norm": 0.6921667456626892, "learning_rate": 9.605165730032972e-06, "loss": 0.057970527559518814, "memory(GiB)": 21.32, "step": 5264, "token_acc": 0.9507389162561576, "train_speed(iter/s)": 0.95161 }, { "epoch": 0.17103596140727026, "grad_norm": 0.6058445572853088, "learning_rate": 9.604956490876159e-06, "loss": 0.04794226959347725, "memory(GiB)": 21.32, "step": 5265, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.951641 }, { "epoch": 0.17106844687002568, "grad_norm": 0.7156678438186646, "learning_rate": 9.604747198572017e-06, "loss": 0.06250278651714325, "memory(GiB)": 21.32, "step": 5266, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.95167 }, { "epoch": 0.1711009323327811, "grad_norm": 0.5600883364677429, "learning_rate": 9.604537853122966e-06, "loss": 0.0434735044836998, "memory(GiB)": 21.32, "step": 5267, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.951698 }, { "epoch": 0.1711334177955365, "grad_norm": 0.8721869587898254, "learning_rate": 9.604328454531417e-06, "loss": 0.04868040233850479, "memory(GiB)": 21.32, "step": 5268, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.951729 }, { "epoch": 0.17116590325829192, "grad_norm": 0.5642196536064148, "learning_rate": 9.604119002799792e-06, "loss": 0.049961768090724945, "memory(GiB)": 21.32, "step": 5269, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.951759 }, { "epoch": 0.17119838872104734, "grad_norm": 0.4726604223251343, "learning_rate": 9.603909497930504e-06, "loss": 0.04581371694803238, "memory(GiB)": 21.32, "step": 5270, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.951791 }, { "epoch": 0.17123087418380276, "grad_norm": 0.949324369430542, "learning_rate": 9.603699939925973e-06, "loss": 0.05890609323978424, "memory(GiB)": 21.32, "step": 5271, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.951821 }, { "epoch": 0.17126335964655817, "grad_norm": 0.7536708116531372, "learning_rate": 9.603490328788617e-06, "loss": 0.06016848236322403, "memory(GiB)": 21.32, "step": 5272, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.951853 }, { "epoch": 0.1712958451093136, "grad_norm": 1.8837532997131348, "learning_rate": 9.603280664520856e-06, "loss": 0.05375329405069351, "memory(GiB)": 21.32, "step": 5273, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.951884 }, { "epoch": 0.171328330572069, "grad_norm": 0.5891990065574646, "learning_rate": 9.603070947125109e-06, "loss": 0.05203241854906082, "memory(GiB)": 21.32, "step": 5274, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.951912 }, { "epoch": 0.17136081603482442, "grad_norm": 0.4621915817260742, "learning_rate": 9.602861176603795e-06, "loss": 0.03890793398022652, "memory(GiB)": 21.32, "step": 5275, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.951939 }, { "epoch": 0.17139330149757984, "grad_norm": 0.4678691029548645, "learning_rate": 9.602651352959339e-06, "loss": 0.045841045677661896, "memory(GiB)": 21.32, "step": 5276, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.951968 }, { "epoch": 0.17142578696033525, "grad_norm": 0.4567483067512512, "learning_rate": 9.60244147619416e-06, "loss": 0.04622909426689148, "memory(GiB)": 21.32, "step": 5277, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.952 }, { "epoch": 0.17145827242309067, "grad_norm": 0.6468676328659058, "learning_rate": 9.602231546310678e-06, "loss": 0.056893955916166306, "memory(GiB)": 21.32, "step": 5278, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.952033 }, { "epoch": 0.17149075788584608, "grad_norm": 0.7950630784034729, "learning_rate": 9.602021563311321e-06, "loss": 0.05876114219427109, "memory(GiB)": 21.32, "step": 5279, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.952065 }, { "epoch": 0.1715232433486015, "grad_norm": 0.5415801405906677, "learning_rate": 9.601811527198507e-06, "loss": 0.05383171886205673, "memory(GiB)": 21.32, "step": 5280, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.952102 }, { "epoch": 0.17155572881135692, "grad_norm": 0.7855006456375122, "learning_rate": 9.601601437974666e-06, "loss": 0.047972775995731354, "memory(GiB)": 21.32, "step": 5281, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.952137 }, { "epoch": 0.17158821427411233, "grad_norm": 0.6673925518989563, "learning_rate": 9.60139129564222e-06, "loss": 0.05146578699350357, "memory(GiB)": 21.32, "step": 5282, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.952163 }, { "epoch": 0.17162069973686775, "grad_norm": 0.6575397849082947, "learning_rate": 9.601181100203591e-06, "loss": 0.053696051239967346, "memory(GiB)": 21.32, "step": 5283, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.95219 }, { "epoch": 0.17165318519962317, "grad_norm": 0.4268294870853424, "learning_rate": 9.60097085166121e-06, "loss": 0.041082609444856644, "memory(GiB)": 21.32, "step": 5284, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.952221 }, { "epoch": 0.17168567066237858, "grad_norm": 0.7846624851226807, "learning_rate": 9.600760550017502e-06, "loss": 0.055260539054870605, "memory(GiB)": 21.32, "step": 5285, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.952249 }, { "epoch": 0.171718156125134, "grad_norm": 0.628609299659729, "learning_rate": 9.600550195274892e-06, "loss": 0.05158889666199684, "memory(GiB)": 21.32, "step": 5286, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.952274 }, { "epoch": 0.1717506415878894, "grad_norm": 0.47119462490081787, "learning_rate": 9.600339787435811e-06, "loss": 0.046403735876083374, "memory(GiB)": 21.32, "step": 5287, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.952303 }, { "epoch": 0.17178312705064483, "grad_norm": 0.5036003589630127, "learning_rate": 9.600129326502684e-06, "loss": 0.04853042587637901, "memory(GiB)": 21.32, "step": 5288, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.95233 }, { "epoch": 0.17181561251340025, "grad_norm": 0.5444614887237549, "learning_rate": 9.599918812477944e-06, "loss": 0.04145622253417969, "memory(GiB)": 21.32, "step": 5289, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.952356 }, { "epoch": 0.17184809797615566, "grad_norm": 2.1802051067352295, "learning_rate": 9.599708245364017e-06, "loss": 0.03492384031414986, "memory(GiB)": 21.32, "step": 5290, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952385 }, { "epoch": 0.17188058343891108, "grad_norm": 0.55597323179245, "learning_rate": 9.599497625163335e-06, "loss": 0.05602303147315979, "memory(GiB)": 21.32, "step": 5291, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.952415 }, { "epoch": 0.1719130689016665, "grad_norm": 0.5501190423965454, "learning_rate": 9.599286951878328e-06, "loss": 0.0560937374830246, "memory(GiB)": 21.32, "step": 5292, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.952443 }, { "epoch": 0.1719455543644219, "grad_norm": 1.1784158945083618, "learning_rate": 9.599076225511428e-06, "loss": 0.05725117027759552, "memory(GiB)": 21.32, "step": 5293, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.952471 }, { "epoch": 0.17197803982717733, "grad_norm": 0.5009267330169678, "learning_rate": 9.598865446065066e-06, "loss": 0.0453941784799099, "memory(GiB)": 21.32, "step": 5294, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.9525 }, { "epoch": 0.17201052528993274, "grad_norm": 0.7356869578361511, "learning_rate": 9.598654613541677e-06, "loss": 0.05932673066854477, "memory(GiB)": 21.32, "step": 5295, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.952528 }, { "epoch": 0.17204301075268819, "grad_norm": 0.44314512610435486, "learning_rate": 9.598443727943693e-06, "loss": 0.045623648911714554, "memory(GiB)": 21.32, "step": 5296, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952557 }, { "epoch": 0.1720754962154436, "grad_norm": 0.39481133222579956, "learning_rate": 9.59823278927355e-06, "loss": 0.045020584017038345, "memory(GiB)": 21.32, "step": 5297, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.952584 }, { "epoch": 0.17210798167819902, "grad_norm": 0.3956128656864166, "learning_rate": 9.598021797533676e-06, "loss": 0.03857492655515671, "memory(GiB)": 21.32, "step": 5298, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.952614 }, { "epoch": 0.17214046714095443, "grad_norm": 0.5990660786628723, "learning_rate": 9.597810752726514e-06, "loss": 0.0576360709965229, "memory(GiB)": 21.32, "step": 5299, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.952643 }, { "epoch": 0.17217295260370985, "grad_norm": 0.556184709072113, "learning_rate": 9.597599654854495e-06, "loss": 0.05459237098693848, "memory(GiB)": 21.32, "step": 5300, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.952673 }, { "epoch": 0.17220543806646527, "grad_norm": 0.5739903450012207, "learning_rate": 9.597388503920056e-06, "loss": 0.05193186178803444, "memory(GiB)": 21.32, "step": 5301, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.952707 }, { "epoch": 0.17223792352922068, "grad_norm": 0.4920365512371063, "learning_rate": 9.597177299925634e-06, "loss": 0.052621833980083466, "memory(GiB)": 21.32, "step": 5302, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.95274 }, { "epoch": 0.1722704089919761, "grad_norm": 0.8866967558860779, "learning_rate": 9.59696604287367e-06, "loss": 0.052438776940107346, "memory(GiB)": 21.32, "step": 5303, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.952776 }, { "epoch": 0.17230289445473151, "grad_norm": 0.5328235030174255, "learning_rate": 9.596754732766599e-06, "loss": 0.041859984397888184, "memory(GiB)": 21.32, "step": 5304, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.952812 }, { "epoch": 0.17233537991748693, "grad_norm": 0.588113009929657, "learning_rate": 9.596543369606856e-06, "loss": 0.06100521981716156, "memory(GiB)": 21.32, "step": 5305, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.952847 }, { "epoch": 0.17236786538024235, "grad_norm": 0.7435816526412964, "learning_rate": 9.596331953396888e-06, "loss": 0.06318987905979156, "memory(GiB)": 21.32, "step": 5306, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.952881 }, { "epoch": 0.17240035084299776, "grad_norm": 0.6367384195327759, "learning_rate": 9.59612048413913e-06, "loss": 0.055945634841918945, "memory(GiB)": 21.32, "step": 5307, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.952916 }, { "epoch": 0.17243283630575318, "grad_norm": 0.5105273127555847, "learning_rate": 9.595908961836026e-06, "loss": 0.04981241747736931, "memory(GiB)": 21.32, "step": 5308, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95295 }, { "epoch": 0.1724653217685086, "grad_norm": 0.6557404398918152, "learning_rate": 9.595697386490014e-06, "loss": 0.044789284467697144, "memory(GiB)": 21.32, "step": 5309, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.952986 }, { "epoch": 0.172497807231264, "grad_norm": 0.8162631392478943, "learning_rate": 9.595485758103538e-06, "loss": 0.04142364487051964, "memory(GiB)": 21.32, "step": 5310, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.953022 }, { "epoch": 0.17253029269401943, "grad_norm": 0.6087738275527954, "learning_rate": 9.59527407667904e-06, "loss": 0.04665375500917435, "memory(GiB)": 21.32, "step": 5311, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.953058 }, { "epoch": 0.17256277815677484, "grad_norm": 0.48680970072746277, "learning_rate": 9.595062342218961e-06, "loss": 0.05226703733205795, "memory(GiB)": 21.32, "step": 5312, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.953093 }, { "epoch": 0.17259526361953026, "grad_norm": 1.2139490842819214, "learning_rate": 9.594850554725748e-06, "loss": 0.05694327503442764, "memory(GiB)": 21.32, "step": 5313, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.953128 }, { "epoch": 0.17262774908228568, "grad_norm": 0.4082801640033722, "learning_rate": 9.594638714201844e-06, "loss": 0.04772557318210602, "memory(GiB)": 21.32, "step": 5314, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953163 }, { "epoch": 0.1726602345450411, "grad_norm": 0.6832410097122192, "learning_rate": 9.594426820649694e-06, "loss": 0.05854659527540207, "memory(GiB)": 21.32, "step": 5315, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.953198 }, { "epoch": 0.1726927200077965, "grad_norm": 0.4756820797920227, "learning_rate": 9.594214874071742e-06, "loss": 0.04736151546239853, "memory(GiB)": 21.32, "step": 5316, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.953232 }, { "epoch": 0.17272520547055192, "grad_norm": 0.45894795656204224, "learning_rate": 9.594002874470435e-06, "loss": 0.05112360417842865, "memory(GiB)": 21.32, "step": 5317, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.953262 }, { "epoch": 0.17275769093330734, "grad_norm": 0.4570547938346863, "learning_rate": 9.593790821848224e-06, "loss": 0.0407552495598793, "memory(GiB)": 21.32, "step": 5318, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.953292 }, { "epoch": 0.17279017639606276, "grad_norm": 0.6707956790924072, "learning_rate": 9.593578716207549e-06, "loss": 0.052665576338768005, "memory(GiB)": 21.32, "step": 5319, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.953322 }, { "epoch": 0.17282266185881817, "grad_norm": 1.1062955856323242, "learning_rate": 9.593366557550863e-06, "loss": 0.0630912110209465, "memory(GiB)": 21.32, "step": 5320, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.953351 }, { "epoch": 0.1728551473215736, "grad_norm": 0.6158899068832397, "learning_rate": 9.593154345880612e-06, "loss": 0.04529542848467827, "memory(GiB)": 21.32, "step": 5321, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.95338 }, { "epoch": 0.172887632784329, "grad_norm": 0.837913990020752, "learning_rate": 9.592942081199247e-06, "loss": 0.0555400624871254, "memory(GiB)": 21.32, "step": 5322, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.953407 }, { "epoch": 0.17292011824708442, "grad_norm": 1.0637545585632324, "learning_rate": 9.592729763509219e-06, "loss": 0.056970130652189255, "memory(GiB)": 21.32, "step": 5323, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.953437 }, { "epoch": 0.17295260370983984, "grad_norm": 0.800389289855957, "learning_rate": 9.592517392812974e-06, "loss": 0.047099556773900986, "memory(GiB)": 21.32, "step": 5324, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.953464 }, { "epoch": 0.17298508917259525, "grad_norm": 0.7029381990432739, "learning_rate": 9.592304969112967e-06, "loss": 0.04486243426799774, "memory(GiB)": 21.32, "step": 5325, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.953493 }, { "epoch": 0.17301757463535067, "grad_norm": 0.7647907733917236, "learning_rate": 9.592092492411649e-06, "loss": 0.06746167689561844, "memory(GiB)": 21.32, "step": 5326, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.953525 }, { "epoch": 0.17305006009810608, "grad_norm": 0.825046956539154, "learning_rate": 9.59187996271147e-06, "loss": 0.06997640430927277, "memory(GiB)": 21.32, "step": 5327, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.953552 }, { "epoch": 0.17308254556086153, "grad_norm": 1.2744742631912231, "learning_rate": 9.591667380014884e-06, "loss": 0.03786998987197876, "memory(GiB)": 21.32, "step": 5328, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953581 }, { "epoch": 0.17311503102361694, "grad_norm": 0.6694220900535583, "learning_rate": 9.591454744324346e-06, "loss": 0.05214950814843178, "memory(GiB)": 21.32, "step": 5329, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.953608 }, { "epoch": 0.17314751648637236, "grad_norm": 0.7446768283843994, "learning_rate": 9.59124205564231e-06, "loss": 0.04809962213039398, "memory(GiB)": 21.32, "step": 5330, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.953639 }, { "epoch": 0.17318000194912778, "grad_norm": 0.430070698261261, "learning_rate": 9.591029313971228e-06, "loss": 0.04275856912136078, "memory(GiB)": 21.32, "step": 5331, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.95367 }, { "epoch": 0.1732124874118832, "grad_norm": 1.4249473810195923, "learning_rate": 9.590816519313559e-06, "loss": 0.045897744596004486, "memory(GiB)": 21.32, "step": 5332, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.9537 }, { "epoch": 0.1732449728746386, "grad_norm": 0.8430657982826233, "learning_rate": 9.590603671671754e-06, "loss": 0.04893837496638298, "memory(GiB)": 21.32, "step": 5333, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.95373 }, { "epoch": 0.17327745833739402, "grad_norm": 1.569139003753662, "learning_rate": 9.590390771048274e-06, "loss": 0.05492851883172989, "memory(GiB)": 21.32, "step": 5334, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953758 }, { "epoch": 0.17330994380014944, "grad_norm": 0.6935003399848938, "learning_rate": 9.590177817445577e-06, "loss": 0.05061114951968193, "memory(GiB)": 21.32, "step": 5335, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.953786 }, { "epoch": 0.17334242926290486, "grad_norm": 0.479761004447937, "learning_rate": 9.589964810866117e-06, "loss": 0.0446314662694931, "memory(GiB)": 21.32, "step": 5336, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.953809 }, { "epoch": 0.17337491472566027, "grad_norm": 0.5614415407180786, "learning_rate": 9.589751751312353e-06, "loss": 0.04654403775930405, "memory(GiB)": 21.32, "step": 5337, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.953835 }, { "epoch": 0.1734074001884157, "grad_norm": 0.6064469814300537, "learning_rate": 9.589538638786744e-06, "loss": 0.04292892664670944, "memory(GiB)": 21.32, "step": 5338, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.1734398856511711, "grad_norm": 0.5966576933860779, "learning_rate": 9.589325473291753e-06, "loss": 0.0484144389629364, "memory(GiB)": 21.32, "step": 5339, "token_acc": 0.9588477366255144, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.17347237111392652, "grad_norm": 0.5184637308120728, "learning_rate": 9.589112254829836e-06, "loss": 0.0420822948217392, "memory(GiB)": 21.32, "step": 5340, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.17350485657668194, "grad_norm": 0.6973565816879272, "learning_rate": 9.588898983403456e-06, "loss": 0.04950959235429764, "memory(GiB)": 21.32, "step": 5341, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.953937 }, { "epoch": 0.17353734203943735, "grad_norm": 1.0217002630233765, "learning_rate": 9.588685659015073e-06, "loss": 0.047746822237968445, "memory(GiB)": 21.32, "step": 5342, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953965 }, { "epoch": 0.17356982750219277, "grad_norm": 0.6394270062446594, "learning_rate": 9.588472281667152e-06, "loss": 0.054206572473049164, "memory(GiB)": 21.32, "step": 5343, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.953994 }, { "epoch": 0.17360231296494819, "grad_norm": 0.6364153623580933, "learning_rate": 9.58825885136215e-06, "loss": 0.04335642606019974, "memory(GiB)": 21.32, "step": 5344, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.954022 }, { "epoch": 0.1736347984277036, "grad_norm": 0.5515760779380798, "learning_rate": 9.588045368102538e-06, "loss": 0.04761205613613129, "memory(GiB)": 21.32, "step": 5345, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.954049 }, { "epoch": 0.17366728389045902, "grad_norm": 0.7125726938247681, "learning_rate": 9.587831831890774e-06, "loss": 0.05489664897322655, "memory(GiB)": 21.32, "step": 5346, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954077 }, { "epoch": 0.17369976935321443, "grad_norm": 0.44642627239227295, "learning_rate": 9.587618242729324e-06, "loss": 0.04259086400270462, "memory(GiB)": 21.32, "step": 5347, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.954105 }, { "epoch": 0.17373225481596985, "grad_norm": 0.6495240330696106, "learning_rate": 9.587404600620653e-06, "loss": 0.0490470826625824, "memory(GiB)": 21.32, "step": 5348, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954133 }, { "epoch": 0.17376474027872527, "grad_norm": 0.5946733951568604, "learning_rate": 9.587190905567228e-06, "loss": 0.047225043177604675, "memory(GiB)": 21.32, "step": 5349, "token_acc": 0.9554455445544554, "train_speed(iter/s)": 0.954161 }, { "epoch": 0.17379722574148068, "grad_norm": 0.6164100170135498, "learning_rate": 9.586977157571515e-06, "loss": 0.04726816341280937, "memory(GiB)": 21.32, "step": 5350, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95419 }, { "epoch": 0.1738297112042361, "grad_norm": 0.6385416984558105, "learning_rate": 9.58676335663598e-06, "loss": 0.048712871968746185, "memory(GiB)": 21.32, "step": 5351, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954216 }, { "epoch": 0.17386219666699151, "grad_norm": 0.6543712019920349, "learning_rate": 9.58654950276309e-06, "loss": 0.05793282762169838, "memory(GiB)": 21.32, "step": 5352, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.954244 }, { "epoch": 0.17389468212974693, "grad_norm": 0.640718400478363, "learning_rate": 9.586335595955315e-06, "loss": 0.04435846954584122, "memory(GiB)": 21.32, "step": 5353, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.954272 }, { "epoch": 0.17392716759250235, "grad_norm": 0.6687915325164795, "learning_rate": 9.586121636215122e-06, "loss": 0.05376143008470535, "memory(GiB)": 21.32, "step": 5354, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954299 }, { "epoch": 0.17395965305525776, "grad_norm": 0.7278915643692017, "learning_rate": 9.585907623544981e-06, "loss": 0.04889225587248802, "memory(GiB)": 21.32, "step": 5355, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.954326 }, { "epoch": 0.17399213851801318, "grad_norm": 0.5234107971191406, "learning_rate": 9.585693557947361e-06, "loss": 0.037373222410678864, "memory(GiB)": 21.32, "step": 5356, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954352 }, { "epoch": 0.1740246239807686, "grad_norm": 0.4962775409221649, "learning_rate": 9.585479439424737e-06, "loss": 0.03717590868473053, "memory(GiB)": 21.32, "step": 5357, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.954383 }, { "epoch": 0.174057109443524, "grad_norm": 0.5099882483482361, "learning_rate": 9.585265267979575e-06, "loss": 0.041046783328056335, "memory(GiB)": 21.32, "step": 5358, "token_acc": 0.98, "train_speed(iter/s)": 0.954417 }, { "epoch": 0.17408959490627943, "grad_norm": 0.6224220991134644, "learning_rate": 9.58505104361435e-06, "loss": 0.05183814465999603, "memory(GiB)": 21.32, "step": 5359, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954453 }, { "epoch": 0.17412208036903487, "grad_norm": 0.5128534436225891, "learning_rate": 9.584836766331533e-06, "loss": 0.04746516793966293, "memory(GiB)": 21.32, "step": 5360, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954487 }, { "epoch": 0.1741545658317903, "grad_norm": 0.830423891544342, "learning_rate": 9.584622436133597e-06, "loss": 0.040073223412036896, "memory(GiB)": 21.32, "step": 5361, "token_acc": 0.9784172661870504, "train_speed(iter/s)": 0.95452 }, { "epoch": 0.1741870512945457, "grad_norm": 0.49648815393447876, "learning_rate": 9.584408053023015e-06, "loss": 0.05113857984542847, "memory(GiB)": 21.32, "step": 5362, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.954556 }, { "epoch": 0.17421953675730112, "grad_norm": 1.950323462486267, "learning_rate": 9.584193617002265e-06, "loss": 0.049384042620658875, "memory(GiB)": 21.32, "step": 5363, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.95459 }, { "epoch": 0.17425202222005653, "grad_norm": 0.44820278882980347, "learning_rate": 9.583979128073815e-06, "loss": 0.04238293319940567, "memory(GiB)": 21.32, "step": 5364, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.954626 }, { "epoch": 0.17428450768281195, "grad_norm": 0.8467549681663513, "learning_rate": 9.583764586240147e-06, "loss": 0.04948146641254425, "memory(GiB)": 21.32, "step": 5365, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954659 }, { "epoch": 0.17431699314556737, "grad_norm": 0.5737037658691406, "learning_rate": 9.583549991503735e-06, "loss": 0.04931312054395676, "memory(GiB)": 21.32, "step": 5366, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.17434947860832278, "grad_norm": 0.806333601474762, "learning_rate": 9.583335343867054e-06, "loss": 0.06534059345722198, "memory(GiB)": 21.32, "step": 5367, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.954728 }, { "epoch": 0.1743819640710782, "grad_norm": 0.5638206601142883, "learning_rate": 9.583120643332585e-06, "loss": 0.05097069591283798, "memory(GiB)": 21.32, "step": 5368, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.17441444953383362, "grad_norm": 0.5550837516784668, "learning_rate": 9.582905889902802e-06, "loss": 0.04919401928782463, "memory(GiB)": 21.32, "step": 5369, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.17444693499658903, "grad_norm": 0.6553030610084534, "learning_rate": 9.582691083580186e-06, "loss": 0.04005177691578865, "memory(GiB)": 21.32, "step": 5370, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.17447942045934445, "grad_norm": 0.6300380825996399, "learning_rate": 9.582476224367215e-06, "loss": 0.05341199040412903, "memory(GiB)": 21.32, "step": 5371, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.17451190592209986, "grad_norm": 0.8087165951728821, "learning_rate": 9.582261312266368e-06, "loss": 0.053241975605487823, "memory(GiB)": 21.32, "step": 5372, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.954899 }, { "epoch": 0.17454439138485528, "grad_norm": 0.6787441372871399, "learning_rate": 9.582046347280129e-06, "loss": 0.05201973393559456, "memory(GiB)": 21.32, "step": 5373, "token_acc": 1.0, "train_speed(iter/s)": 0.954935 }, { "epoch": 0.1745768768476107, "grad_norm": 0.560401201248169, "learning_rate": 9.581831329410974e-06, "loss": 0.05289725214242935, "memory(GiB)": 21.32, "step": 5374, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.954968 }, { "epoch": 0.1746093623103661, "grad_norm": 0.6529064178466797, "learning_rate": 9.581616258661389e-06, "loss": 0.04828383028507233, "memory(GiB)": 21.32, "step": 5375, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.955002 }, { "epoch": 0.17464184777312153, "grad_norm": 0.7119717597961426, "learning_rate": 9.581401135033852e-06, "loss": 0.056015536189079285, "memory(GiB)": 21.32, "step": 5376, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.955037 }, { "epoch": 0.17467433323587694, "grad_norm": 0.6089851260185242, "learning_rate": 9.58118595853085e-06, "loss": 0.0510486401617527, "memory(GiB)": 21.32, "step": 5377, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955071 }, { "epoch": 0.17470681869863236, "grad_norm": 0.4993434250354767, "learning_rate": 9.580970729154863e-06, "loss": 0.04378470033407211, "memory(GiB)": 21.32, "step": 5378, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.17473930416138778, "grad_norm": 0.5678163170814514, "learning_rate": 9.580755446908378e-06, "loss": 0.051163047552108765, "memory(GiB)": 21.32, "step": 5379, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.955133 }, { "epoch": 0.1747717896241432, "grad_norm": 0.5049397349357605, "learning_rate": 9.580540111793877e-06, "loss": 0.06141024827957153, "memory(GiB)": 21.32, "step": 5380, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.955162 }, { "epoch": 0.1748042750868986, "grad_norm": 0.437919020652771, "learning_rate": 9.580324723813847e-06, "loss": 0.04146764427423477, "memory(GiB)": 21.32, "step": 5381, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.955191 }, { "epoch": 0.17483676054965402, "grad_norm": 0.7375171184539795, "learning_rate": 9.580109282970773e-06, "loss": 0.04850482568144798, "memory(GiB)": 21.32, "step": 5382, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.17486924601240944, "grad_norm": 0.7277054190635681, "learning_rate": 9.579893789267142e-06, "loss": 0.06372535228729248, "memory(GiB)": 21.32, "step": 5383, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95525 }, { "epoch": 0.17490173147516486, "grad_norm": 1.3129186630249023, "learning_rate": 9.579678242705442e-06, "loss": 0.055282752960920334, "memory(GiB)": 21.32, "step": 5384, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.955281 }, { "epoch": 0.17493421693792027, "grad_norm": 0.4959641098976135, "learning_rate": 9.579462643288157e-06, "loss": 0.055202145129442215, "memory(GiB)": 21.32, "step": 5385, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955311 }, { "epoch": 0.1749667024006757, "grad_norm": 0.916515588760376, "learning_rate": 9.57924699101778e-06, "loss": 0.055876899510622025, "memory(GiB)": 21.32, "step": 5386, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.1749991878634311, "grad_norm": 0.508048415184021, "learning_rate": 9.579031285896797e-06, "loss": 0.04543072730302811, "memory(GiB)": 21.32, "step": 5387, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955368 }, { "epoch": 0.17503167332618652, "grad_norm": 0.6449616551399231, "learning_rate": 9.578815527927699e-06, "loss": 0.04803627356886864, "memory(GiB)": 21.32, "step": 5388, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.955396 }, { "epoch": 0.17506415878894194, "grad_norm": 0.586266815662384, "learning_rate": 9.578599717112973e-06, "loss": 0.04077649489045143, "memory(GiB)": 21.32, "step": 5389, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.955424 }, { "epoch": 0.17509664425169735, "grad_norm": 0.9226654171943665, "learning_rate": 9.578383853455115e-06, "loss": 0.04935397952795029, "memory(GiB)": 21.32, "step": 5390, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.17512912971445277, "grad_norm": 1.2212715148925781, "learning_rate": 9.578167936956612e-06, "loss": 0.04695673659443855, "memory(GiB)": 21.32, "step": 5391, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.955483 }, { "epoch": 0.1751616151772082, "grad_norm": 1.0201525688171387, "learning_rate": 9.577951967619959e-06, "loss": 0.04214078560471535, "memory(GiB)": 21.32, "step": 5392, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.17519410063996363, "grad_norm": 0.46461302042007446, "learning_rate": 9.577735945447644e-06, "loss": 0.043918050825595856, "memory(GiB)": 21.32, "step": 5393, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.955539 }, { "epoch": 0.17522658610271905, "grad_norm": 0.45524507761001587, "learning_rate": 9.577519870442166e-06, "loss": 0.04576937109231949, "memory(GiB)": 21.32, "step": 5394, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.17525907156547446, "grad_norm": 0.8848928809165955, "learning_rate": 9.577303742606015e-06, "loss": 0.05421354994177818, "memory(GiB)": 21.32, "step": 5395, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.955598 }, { "epoch": 0.17529155702822988, "grad_norm": 2.1610660552978516, "learning_rate": 9.577087561941687e-06, "loss": 0.06460047513246536, "memory(GiB)": 21.32, "step": 5396, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.1753240424909853, "grad_norm": 0.5897084474563599, "learning_rate": 9.576871328451676e-06, "loss": 0.04638957977294922, "memory(GiB)": 21.32, "step": 5397, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.1753565279537407, "grad_norm": 0.48504993319511414, "learning_rate": 9.57665504213848e-06, "loss": 0.03912104666233063, "memory(GiB)": 21.32, "step": 5398, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.17538901341649613, "grad_norm": 0.5775259137153625, "learning_rate": 9.576438703004589e-06, "loss": 0.06762202084064484, "memory(GiB)": 21.32, "step": 5399, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.955697 }, { "epoch": 0.17542149887925154, "grad_norm": 1.438058614730835, "learning_rate": 9.576222311052506e-06, "loss": 0.05192912369966507, "memory(GiB)": 21.32, "step": 5400, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.17545398434200696, "grad_norm": 0.689702033996582, "learning_rate": 9.576005866284727e-06, "loss": 0.05472628399729729, "memory(GiB)": 21.32, "step": 5401, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.17548646980476237, "grad_norm": 0.9073580503463745, "learning_rate": 9.57578936870375e-06, "loss": 0.05704500526189804, "memory(GiB)": 21.32, "step": 5402, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.955777 }, { "epoch": 0.1755189552675178, "grad_norm": 2.117018938064575, "learning_rate": 9.575572818312073e-06, "loss": 0.05125918611884117, "memory(GiB)": 21.32, "step": 5403, "token_acc": 0.9611307420494699, "train_speed(iter/s)": 0.955805 }, { "epoch": 0.1755514407302732, "grad_norm": 0.5391597151756287, "learning_rate": 9.575356215112197e-06, "loss": 0.04360133409500122, "memory(GiB)": 21.32, "step": 5404, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.17558392619302862, "grad_norm": 0.7797080874443054, "learning_rate": 9.575139559106619e-06, "loss": 0.0513867512345314, "memory(GiB)": 21.32, "step": 5405, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955864 }, { "epoch": 0.17561641165578404, "grad_norm": 0.5468177795410156, "learning_rate": 9.574922850297841e-06, "loss": 0.0545535571873188, "memory(GiB)": 21.32, "step": 5406, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.17564889711853945, "grad_norm": 0.5783888697624207, "learning_rate": 9.574706088688364e-06, "loss": 0.049290597438812256, "memory(GiB)": 21.32, "step": 5407, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.17568138258129487, "grad_norm": 0.5743180513381958, "learning_rate": 9.57448927428069e-06, "loss": 0.06145231053233147, "memory(GiB)": 21.32, "step": 5408, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.1757138680440503, "grad_norm": 1.3389798402786255, "learning_rate": 9.57427240707732e-06, "loss": 0.04966697096824646, "memory(GiB)": 21.32, "step": 5409, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.1757463535068057, "grad_norm": 0.49917179346084595, "learning_rate": 9.57405548708076e-06, "loss": 0.04873505234718323, "memory(GiB)": 21.32, "step": 5410, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955998 }, { "epoch": 0.17577883896956112, "grad_norm": 10.867733001708984, "learning_rate": 9.573838514293512e-06, "loss": 0.059536125510931015, "memory(GiB)": 21.32, "step": 5411, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.956024 }, { "epoch": 0.17581132443231653, "grad_norm": 0.9874882102012634, "learning_rate": 9.573621488718078e-06, "loss": 0.0411374568939209, "memory(GiB)": 21.32, "step": 5412, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.17584380989507195, "grad_norm": 0.47783133387565613, "learning_rate": 9.573404410356966e-06, "loss": 0.039830103516578674, "memory(GiB)": 21.32, "step": 5413, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956079 }, { "epoch": 0.17587629535782737, "grad_norm": 0.5256479978561401, "learning_rate": 9.57318727921268e-06, "loss": 0.04700092971324921, "memory(GiB)": 21.32, "step": 5414, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.17590878082058278, "grad_norm": 0.550624430179596, "learning_rate": 9.572970095287723e-06, "loss": 0.0471823588013649, "memory(GiB)": 21.32, "step": 5415, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.1759412662833382, "grad_norm": 0.5590159893035889, "learning_rate": 9.572752858584608e-06, "loss": 0.04957294464111328, "memory(GiB)": 21.32, "step": 5416, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.956169 }, { "epoch": 0.17597375174609362, "grad_norm": 0.8624454736709595, "learning_rate": 9.572535569105836e-06, "loss": 0.050158653408288956, "memory(GiB)": 21.32, "step": 5417, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956202 }, { "epoch": 0.17600623720884903, "grad_norm": 0.5447553992271423, "learning_rate": 9.57231822685392e-06, "loss": 0.05823466181755066, "memory(GiB)": 21.32, "step": 5418, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.956234 }, { "epoch": 0.17603872267160445, "grad_norm": 0.9335525631904602, "learning_rate": 9.572100831831363e-06, "loss": 0.05178981274366379, "memory(GiB)": 21.32, "step": 5419, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.17607120813435986, "grad_norm": 0.4544282853603363, "learning_rate": 9.57188338404068e-06, "loss": 0.037510063499212265, "memory(GiB)": 21.32, "step": 5420, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.17610369359711528, "grad_norm": 0.5798670649528503, "learning_rate": 9.571665883484376e-06, "loss": 0.04475477710366249, "memory(GiB)": 21.32, "step": 5421, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.956337 }, { "epoch": 0.1761361790598707, "grad_norm": 0.8891417980194092, "learning_rate": 9.571448330164963e-06, "loss": 0.049494415521621704, "memory(GiB)": 21.32, "step": 5422, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.1761686645226261, "grad_norm": 0.5964856147766113, "learning_rate": 9.571230724084952e-06, "loss": 0.048803284764289856, "memory(GiB)": 21.32, "step": 5423, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.17620114998538156, "grad_norm": 0.7615438103675842, "learning_rate": 9.571013065246854e-06, "loss": 0.05088250711560249, "memory(GiB)": 21.32, "step": 5424, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.17623363544813697, "grad_norm": 0.8808130621910095, "learning_rate": 9.570795353653182e-06, "loss": 0.047580454498529434, "memory(GiB)": 21.32, "step": 5425, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.1762661209108924, "grad_norm": 0.6609249711036682, "learning_rate": 9.570577589306447e-06, "loss": 0.05559030920267105, "memory(GiB)": 21.32, "step": 5426, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.956504 }, { "epoch": 0.1762986063736478, "grad_norm": 1.1246157884597778, "learning_rate": 9.570359772209162e-06, "loss": 0.048765819519758224, "memory(GiB)": 21.32, "step": 5427, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.956539 }, { "epoch": 0.17633109183640322, "grad_norm": 0.8913799524307251, "learning_rate": 9.570141902363844e-06, "loss": 0.06113750487565994, "memory(GiB)": 21.32, "step": 5428, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.17636357729915864, "grad_norm": 0.840919017791748, "learning_rate": 9.569923979773004e-06, "loss": 0.06400449573993683, "memory(GiB)": 21.32, "step": 5429, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.17639606276191405, "grad_norm": 0.6359969973564148, "learning_rate": 9.56970600443916e-06, "loss": 0.04876329004764557, "memory(GiB)": 21.32, "step": 5430, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.17642854822466947, "grad_norm": 0.47912245988845825, "learning_rate": 9.569487976364826e-06, "loss": 0.04527467489242554, "memory(GiB)": 21.32, "step": 5431, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956668 }, { "epoch": 0.17646103368742488, "grad_norm": 0.6208007335662842, "learning_rate": 9.56926989555252e-06, "loss": 0.050072357058525085, "memory(GiB)": 21.32, "step": 5432, "token_acc": 0.9844559585492227, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.1764935191501803, "grad_norm": 0.4213707447052002, "learning_rate": 9.569051762004755e-06, "loss": 0.03796087205410004, "memory(GiB)": 21.32, "step": 5433, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956736 }, { "epoch": 0.17652600461293572, "grad_norm": 0.5567861199378967, "learning_rate": 9.568833575724053e-06, "loss": 0.048552900552749634, "memory(GiB)": 21.32, "step": 5434, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956769 }, { "epoch": 0.17655849007569113, "grad_norm": 0.44521448016166687, "learning_rate": 9.568615336712931e-06, "loss": 0.04878518730401993, "memory(GiB)": 21.32, "step": 5435, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.17659097553844655, "grad_norm": 0.5498873591423035, "learning_rate": 9.568397044973905e-06, "loss": 0.04020019620656967, "memory(GiB)": 21.32, "step": 5436, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956837 }, { "epoch": 0.17662346100120196, "grad_norm": 0.558578610420227, "learning_rate": 9.5681787005095e-06, "loss": 0.05133311450481415, "memory(GiB)": 21.32, "step": 5437, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.17665594646395738, "grad_norm": 0.42064982652664185, "learning_rate": 9.56796030332223e-06, "loss": 0.03578809276223183, "memory(GiB)": 21.32, "step": 5438, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.1766884319267128, "grad_norm": 1.0555672645568848, "learning_rate": 9.567741853414618e-06, "loss": 0.0549427792429924, "memory(GiB)": 21.32, "step": 5439, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.1767209173894682, "grad_norm": 0.44427719712257385, "learning_rate": 9.567523350789187e-06, "loss": 0.0385073646903038, "memory(GiB)": 21.32, "step": 5440, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.17675340285222363, "grad_norm": 0.8115236759185791, "learning_rate": 9.567304795448456e-06, "loss": 0.05580168589949608, "memory(GiB)": 21.32, "step": 5441, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.956993 }, { "epoch": 0.17678588831497904, "grad_norm": 0.5605626702308655, "learning_rate": 9.567086187394949e-06, "loss": 0.04573346674442291, "memory(GiB)": 21.32, "step": 5442, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.17681837377773446, "grad_norm": 0.5105929970741272, "learning_rate": 9.566867526631188e-06, "loss": 0.0501282662153244, "memory(GiB)": 21.32, "step": 5443, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95705 }, { "epoch": 0.17685085924048988, "grad_norm": 0.475475013256073, "learning_rate": 9.566648813159699e-06, "loss": 0.04139181226491928, "memory(GiB)": 21.32, "step": 5444, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.1768833447032453, "grad_norm": 0.6925101280212402, "learning_rate": 9.566430046983002e-06, "loss": 0.05740252137184143, "memory(GiB)": 21.32, "step": 5445, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.1769158301660007, "grad_norm": 0.7374382019042969, "learning_rate": 9.566211228103626e-06, "loss": 0.04720798507332802, "memory(GiB)": 21.32, "step": 5446, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.17694831562875613, "grad_norm": 0.4721672832965851, "learning_rate": 9.565992356524093e-06, "loss": 0.036229003220796585, "memory(GiB)": 21.32, "step": 5447, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.17698080109151154, "grad_norm": 0.5690261125564575, "learning_rate": 9.565773432246934e-06, "loss": 0.060411885380744934, "memory(GiB)": 21.32, "step": 5448, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.957189 }, { "epoch": 0.17701328655426696, "grad_norm": 0.6318260431289673, "learning_rate": 9.56555445527467e-06, "loss": 0.05111104995012283, "memory(GiB)": 21.32, "step": 5449, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.17704577201702237, "grad_norm": 0.5494396686553955, "learning_rate": 9.565335425609831e-06, "loss": 0.043818432837724686, "memory(GiB)": 21.32, "step": 5450, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.1770782574797778, "grad_norm": 0.940909206867218, "learning_rate": 9.565116343254948e-06, "loss": 0.04883427172899246, "memory(GiB)": 21.32, "step": 5451, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.1771107429425332, "grad_norm": 0.46862637996673584, "learning_rate": 9.564897208212542e-06, "loss": 0.04240211099386215, "memory(GiB)": 21.32, "step": 5452, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.17714322840528862, "grad_norm": 0.5293715596199036, "learning_rate": 9.564678020485148e-06, "loss": 0.04930632561445236, "memory(GiB)": 21.32, "step": 5453, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957324 }, { "epoch": 0.17717571386804404, "grad_norm": 0.4270143508911133, "learning_rate": 9.564458780075296e-06, "loss": 0.05009406805038452, "memory(GiB)": 21.32, "step": 5454, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957355 }, { "epoch": 0.17720819933079945, "grad_norm": 1.8117799758911133, "learning_rate": 9.564239486985512e-06, "loss": 0.043212324380874634, "memory(GiB)": 21.32, "step": 5455, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.957383 }, { "epoch": 0.1772406847935549, "grad_norm": 0.6766228079795837, "learning_rate": 9.564020141218329e-06, "loss": 0.054734859615564346, "memory(GiB)": 21.32, "step": 5456, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.17727317025631031, "grad_norm": 0.8075779676437378, "learning_rate": 9.56380074277628e-06, "loss": 0.05240882933139801, "memory(GiB)": 21.32, "step": 5457, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95743 }, { "epoch": 0.17730565571906573, "grad_norm": 0.5860188603401184, "learning_rate": 9.563581291661896e-06, "loss": 0.05254390090703964, "memory(GiB)": 21.32, "step": 5458, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.17733814118182115, "grad_norm": 0.6113929152488708, "learning_rate": 9.56336178787771e-06, "loss": 0.04549545794725418, "memory(GiB)": 21.32, "step": 5459, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.957473 }, { "epoch": 0.17737062664457656, "grad_norm": 0.47617387771606445, "learning_rate": 9.563142231426255e-06, "loss": 0.04333629831671715, "memory(GiB)": 21.32, "step": 5460, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.17740311210733198, "grad_norm": 0.5207883715629578, "learning_rate": 9.562922622310065e-06, "loss": 0.047327831387519836, "memory(GiB)": 21.32, "step": 5461, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.1774355975700874, "grad_norm": 1.5012484788894653, "learning_rate": 9.562702960531674e-06, "loss": 0.04326429218053818, "memory(GiB)": 21.32, "step": 5462, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957543 }, { "epoch": 0.1774680830328428, "grad_norm": 0.9642659425735474, "learning_rate": 9.562483246093617e-06, "loss": 0.0587296187877655, "memory(GiB)": 21.32, "step": 5463, "token_acc": 0.975, "train_speed(iter/s)": 0.957568 }, { "epoch": 0.17750056849559823, "grad_norm": 0.4562518894672394, "learning_rate": 9.562263478998433e-06, "loss": 0.040225084871053696, "memory(GiB)": 21.32, "step": 5464, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.17753305395835364, "grad_norm": 0.5689381957054138, "learning_rate": 9.562043659248655e-06, "loss": 0.04792993888258934, "memory(GiB)": 21.32, "step": 5465, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.17756553942110906, "grad_norm": 0.746797502040863, "learning_rate": 9.561823786846823e-06, "loss": 0.051127538084983826, "memory(GiB)": 21.32, "step": 5466, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957651 }, { "epoch": 0.17759802488386447, "grad_norm": 1.2026515007019043, "learning_rate": 9.56160386179547e-06, "loss": 0.04364052787423134, "memory(GiB)": 21.32, "step": 5467, "token_acc": 0.96484375, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.1776305103466199, "grad_norm": 1.6513911485671997, "learning_rate": 9.561383884097137e-06, "loss": 0.059817396104335785, "memory(GiB)": 21.32, "step": 5468, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.1776629958093753, "grad_norm": 1.0348414182662964, "learning_rate": 9.561163853754365e-06, "loss": 0.06690190732479095, "memory(GiB)": 21.32, "step": 5469, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.17769548127213072, "grad_norm": 0.6246737241744995, "learning_rate": 9.56094377076969e-06, "loss": 0.043329041451215744, "memory(GiB)": 21.32, "step": 5470, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.17772796673488614, "grad_norm": 0.413289874792099, "learning_rate": 9.560723635145652e-06, "loss": 0.043550267815589905, "memory(GiB)": 21.32, "step": 5471, "token_acc": 0.9731182795698925, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.17776045219764156, "grad_norm": 0.4819362759590149, "learning_rate": 9.560503446884794e-06, "loss": 0.05352948233485222, "memory(GiB)": 21.32, "step": 5472, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.957818 }, { "epoch": 0.17779293766039697, "grad_norm": 0.4632899761199951, "learning_rate": 9.560283205989657e-06, "loss": 0.04668208211660385, "memory(GiB)": 21.32, "step": 5473, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.1778254231231524, "grad_norm": 0.5601339936256409, "learning_rate": 9.560062912462783e-06, "loss": 0.05077354609966278, "memory(GiB)": 21.32, "step": 5474, "token_acc": 0.992, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.1778579085859078, "grad_norm": 0.6572972536087036, "learning_rate": 9.559842566306712e-06, "loss": 0.0443539172410965, "memory(GiB)": 21.32, "step": 5475, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.17789039404866322, "grad_norm": 0.486946165561676, "learning_rate": 9.559622167523987e-06, "loss": 0.03977467119693756, "memory(GiB)": 21.32, "step": 5476, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.17792287951141864, "grad_norm": 0.6092568039894104, "learning_rate": 9.559401716117156e-06, "loss": 0.04361707717180252, "memory(GiB)": 21.32, "step": 5477, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.957972 }, { "epoch": 0.17795536497417405, "grad_norm": 0.657128632068634, "learning_rate": 9.559181212088759e-06, "loss": 0.047473691403865814, "memory(GiB)": 21.32, "step": 5478, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.17798785043692947, "grad_norm": 0.47662848234176636, "learning_rate": 9.558960655441344e-06, "loss": 0.04832232743501663, "memory(GiB)": 21.32, "step": 5479, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.958033 }, { "epoch": 0.17802033589968488, "grad_norm": 0.5409576296806335, "learning_rate": 9.558740046177455e-06, "loss": 0.049629226326942444, "memory(GiB)": 21.32, "step": 5480, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.1780528213624403, "grad_norm": 0.5105838775634766, "learning_rate": 9.558519384299636e-06, "loss": 0.054668959230184555, "memory(GiB)": 21.32, "step": 5481, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.17808530682519572, "grad_norm": 0.5045130848884583, "learning_rate": 9.558298669810437e-06, "loss": 0.04952003061771393, "memory(GiB)": 21.32, "step": 5482, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.958135 }, { "epoch": 0.17811779228795113, "grad_norm": 0.5479463338851929, "learning_rate": 9.558077902712405e-06, "loss": 0.04571423679590225, "memory(GiB)": 21.32, "step": 5483, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.958169 }, { "epoch": 0.17815027775070655, "grad_norm": 0.44941017031669617, "learning_rate": 9.557857083008088e-06, "loss": 0.047500986605882645, "memory(GiB)": 21.32, "step": 5484, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958203 }, { "epoch": 0.17818276321346196, "grad_norm": 0.7004697918891907, "learning_rate": 9.557636210700032e-06, "loss": 0.045687608420848846, "memory(GiB)": 21.32, "step": 5485, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.958237 }, { "epoch": 0.17821524867621738, "grad_norm": 0.7008213400840759, "learning_rate": 9.55741528579079e-06, "loss": 0.04970961809158325, "memory(GiB)": 21.32, "step": 5486, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958271 }, { "epoch": 0.1782477341389728, "grad_norm": 0.7686441540718079, "learning_rate": 9.557194308282907e-06, "loss": 0.07254131883382797, "memory(GiB)": 21.32, "step": 5487, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.958302 }, { "epoch": 0.17828021960172824, "grad_norm": 0.4298306405544281, "learning_rate": 9.556973278178938e-06, "loss": 0.03731464967131615, "memory(GiB)": 21.32, "step": 5488, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.17831270506448366, "grad_norm": 0.5213123559951782, "learning_rate": 9.556752195481432e-06, "loss": 0.041040025651454926, "memory(GiB)": 21.32, "step": 5489, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958368 }, { "epoch": 0.17834519052723907, "grad_norm": 0.7549512386322021, "learning_rate": 9.55653106019294e-06, "loss": 0.04984460026025772, "memory(GiB)": 21.32, "step": 5490, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958401 }, { "epoch": 0.1783776759899945, "grad_norm": 1.2126119136810303, "learning_rate": 9.556309872316015e-06, "loss": 0.05927051603794098, "memory(GiB)": 21.32, "step": 5491, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.958435 }, { "epoch": 0.1784101614527499, "grad_norm": 0.620374858379364, "learning_rate": 9.55608863185321e-06, "loss": 0.05343930423259735, "memory(GiB)": 21.32, "step": 5492, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.958469 }, { "epoch": 0.17844264691550532, "grad_norm": 0.555295467376709, "learning_rate": 9.555867338807076e-06, "loss": 0.05208265781402588, "memory(GiB)": 21.32, "step": 5493, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.958501 }, { "epoch": 0.17847513237826074, "grad_norm": 0.5682765245437622, "learning_rate": 9.555645993180171e-06, "loss": 0.04900737479329109, "memory(GiB)": 21.32, "step": 5494, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.958534 }, { "epoch": 0.17850761784101615, "grad_norm": 0.7039627432823181, "learning_rate": 9.555424594975049e-06, "loss": 0.05307187885046005, "memory(GiB)": 21.32, "step": 5495, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.958566 }, { "epoch": 0.17854010330377157, "grad_norm": 1.094005823135376, "learning_rate": 9.55520314419426e-06, "loss": 0.048790134489536285, "memory(GiB)": 21.32, "step": 5496, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.958598 }, { "epoch": 0.17857258876652699, "grad_norm": 0.973930835723877, "learning_rate": 9.554981640840368e-06, "loss": 0.039731405675411224, "memory(GiB)": 21.32, "step": 5497, "token_acc": 0.9831081081081081, "train_speed(iter/s)": 0.95863 }, { "epoch": 0.1786050742292824, "grad_norm": 0.7350093722343445, "learning_rate": 9.554760084915923e-06, "loss": 0.050158627331256866, "memory(GiB)": 21.32, "step": 5498, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.958664 }, { "epoch": 0.17863755969203782, "grad_norm": 0.5736029148101807, "learning_rate": 9.554538476423483e-06, "loss": 0.04309118539094925, "memory(GiB)": 21.32, "step": 5499, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958697 }, { "epoch": 0.17867004515479323, "grad_norm": 0.7639553546905518, "learning_rate": 9.554316815365609e-06, "loss": 0.04436875134706497, "memory(GiB)": 21.32, "step": 5500, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95873 }, { "epoch": 0.17867004515479323, "eval_loss": 0.04979908838868141, "eval_runtime": 80.818, "eval_samples_per_second": 123.116, "eval_steps_per_second": 3.848, "eval_token_acc": 0.980700751822926, "step": 5500 }, { "epoch": 0.17870253061754865, "grad_norm": 0.6763248443603516, "learning_rate": 9.554095101744858e-06, "loss": 0.06037206947803497, "memory(GiB)": 21.32, "step": 5501, "token_acc": 0.9802003261122758, "train_speed(iter/s)": 0.943768 }, { "epoch": 0.17873501608030407, "grad_norm": 0.640137255191803, "learning_rate": 9.553873335563786e-06, "loss": 0.05424119159579277, "memory(GiB)": 21.32, "step": 5502, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.943795 }, { "epoch": 0.17876750154305948, "grad_norm": 0.508781909942627, "learning_rate": 9.553651516824955e-06, "loss": 0.044632233679294586, "memory(GiB)": 21.32, "step": 5503, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.943781 }, { "epoch": 0.1787999870058149, "grad_norm": 0.4190255105495453, "learning_rate": 9.553429645530924e-06, "loss": 0.04763173684477806, "memory(GiB)": 21.32, "step": 5504, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.943813 }, { "epoch": 0.1788324724685703, "grad_norm": 0.43602508306503296, "learning_rate": 9.553207721684256e-06, "loss": 0.046064116060733795, "memory(GiB)": 21.32, "step": 5505, "token_acc": 0.9716312056737588, "train_speed(iter/s)": 0.94385 }, { "epoch": 0.17886495793132573, "grad_norm": 0.9405718445777893, "learning_rate": 9.55298574528751e-06, "loss": 0.055479180067777634, "memory(GiB)": 21.32, "step": 5506, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.943889 }, { "epoch": 0.17889744339408115, "grad_norm": 0.5715457797050476, "learning_rate": 9.552763716343248e-06, "loss": 0.04851388931274414, "memory(GiB)": 21.32, "step": 5507, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.943925 }, { "epoch": 0.17892992885683656, "grad_norm": 0.450880765914917, "learning_rate": 9.552541634854032e-06, "loss": 0.04646803438663483, "memory(GiB)": 21.32, "step": 5508, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.943961 }, { "epoch": 0.17896241431959198, "grad_norm": 0.5837967991828918, "learning_rate": 9.552319500822427e-06, "loss": 0.05422662943601608, "memory(GiB)": 21.32, "step": 5509, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.943996 }, { "epoch": 0.1789948997823474, "grad_norm": 0.6104220151901245, "learning_rate": 9.552097314250997e-06, "loss": 0.045394185930490494, "memory(GiB)": 21.32, "step": 5510, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.944033 }, { "epoch": 0.1790273852451028, "grad_norm": 0.6898179054260254, "learning_rate": 9.551875075142303e-06, "loss": 0.044218000024557114, "memory(GiB)": 21.32, "step": 5511, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.944069 }, { "epoch": 0.17905987070785823, "grad_norm": 0.5175495743751526, "learning_rate": 9.551652783498914e-06, "loss": 0.04651903733611107, "memory(GiB)": 21.32, "step": 5512, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.944104 }, { "epoch": 0.17909235617061364, "grad_norm": 0.49213191866874695, "learning_rate": 9.551430439323393e-06, "loss": 0.048468612134456635, "memory(GiB)": 21.32, "step": 5513, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.944138 }, { "epoch": 0.17912484163336906, "grad_norm": 0.4462263584136963, "learning_rate": 9.551208042618308e-06, "loss": 0.0432332307100296, "memory(GiB)": 21.32, "step": 5514, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.944171 }, { "epoch": 0.17915732709612447, "grad_norm": 0.5555830597877502, "learning_rate": 9.550985593386222e-06, "loss": 0.05399766191840172, "memory(GiB)": 21.32, "step": 5515, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.944201 }, { "epoch": 0.1791898125588799, "grad_norm": 1.0435408353805542, "learning_rate": 9.550763091629707e-06, "loss": 0.05021996051073074, "memory(GiB)": 21.32, "step": 5516, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.944233 }, { "epoch": 0.1792222980216353, "grad_norm": 0.6257801055908203, "learning_rate": 9.550540537351328e-06, "loss": 0.04389909654855728, "memory(GiB)": 21.32, "step": 5517, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.944264 }, { "epoch": 0.17925478348439072, "grad_norm": 0.6907732486724854, "learning_rate": 9.550317930553654e-06, "loss": 0.051225773990154266, "memory(GiB)": 21.32, "step": 5518, "token_acc": 1.0, "train_speed(iter/s)": 0.944297 }, { "epoch": 0.17928726894714614, "grad_norm": 0.5889489650726318, "learning_rate": 9.550095271239255e-06, "loss": 0.04858005419373512, "memory(GiB)": 21.32, "step": 5519, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.944327 }, { "epoch": 0.17931975440990158, "grad_norm": 0.640302300453186, "learning_rate": 9.5498725594107e-06, "loss": 0.041364964097738266, "memory(GiB)": 21.32, "step": 5520, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.944358 }, { "epoch": 0.179352239872657, "grad_norm": 0.49078652262687683, "learning_rate": 9.54964979507056e-06, "loss": 0.05058795213699341, "memory(GiB)": 21.32, "step": 5521, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.944387 }, { "epoch": 0.17938472533541241, "grad_norm": 1.1793218851089478, "learning_rate": 9.549426978221407e-06, "loss": 0.06455029547214508, "memory(GiB)": 21.32, "step": 5522, "token_acc": 0.9625, "train_speed(iter/s)": 0.944419 }, { "epoch": 0.17941721079816783, "grad_norm": 0.9473907351493835, "learning_rate": 9.54920410886581e-06, "loss": 0.05467454716563225, "memory(GiB)": 21.32, "step": 5523, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.94445 }, { "epoch": 0.17944969626092325, "grad_norm": 0.7861692309379578, "learning_rate": 9.548981187006343e-06, "loss": 0.060713786631822586, "memory(GiB)": 21.32, "step": 5524, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.944481 }, { "epoch": 0.17948218172367866, "grad_norm": 1.226402997970581, "learning_rate": 9.54875821264558e-06, "loss": 0.051056064665317535, "memory(GiB)": 21.32, "step": 5525, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.944509 }, { "epoch": 0.17951466718643408, "grad_norm": 0.6119012832641602, "learning_rate": 9.54853518578609e-06, "loss": 0.044534508138895035, "memory(GiB)": 21.32, "step": 5526, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.944537 }, { "epoch": 0.1795471526491895, "grad_norm": 0.6931470036506653, "learning_rate": 9.548312106430451e-06, "loss": 0.057478711009025574, "memory(GiB)": 21.32, "step": 5527, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.944566 }, { "epoch": 0.1795796381119449, "grad_norm": 0.7229650020599365, "learning_rate": 9.548088974581238e-06, "loss": 0.053978536278009415, "memory(GiB)": 21.32, "step": 5528, "token_acc": 0.98, "train_speed(iter/s)": 0.944596 }, { "epoch": 0.17961212357470033, "grad_norm": 0.5595168471336365, "learning_rate": 9.547865790241023e-06, "loss": 0.051014162600040436, "memory(GiB)": 21.32, "step": 5529, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.944628 }, { "epoch": 0.17964460903745574, "grad_norm": 0.6386619806289673, "learning_rate": 9.547642553412384e-06, "loss": 0.05444616824388504, "memory(GiB)": 21.32, "step": 5530, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.944658 }, { "epoch": 0.17967709450021116, "grad_norm": 0.44979262351989746, "learning_rate": 9.547419264097897e-06, "loss": 0.04443035274744034, "memory(GiB)": 21.32, "step": 5531, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.944691 }, { "epoch": 0.17970957996296658, "grad_norm": 0.5336095094680786, "learning_rate": 9.54719592230014e-06, "loss": 0.057461872696876526, "memory(GiB)": 21.32, "step": 5532, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.94472 }, { "epoch": 0.179742065425722, "grad_norm": 0.47618359327316284, "learning_rate": 9.546972528021688e-06, "loss": 0.052212148904800415, "memory(GiB)": 21.32, "step": 5533, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.944748 }, { "epoch": 0.1797745508884774, "grad_norm": 0.47854769229888916, "learning_rate": 9.546749081265124e-06, "loss": 0.05115789175033569, "memory(GiB)": 21.32, "step": 5534, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.944779 }, { "epoch": 0.17980703635123282, "grad_norm": 0.44355812668800354, "learning_rate": 9.54652558203302e-06, "loss": 0.04786492884159088, "memory(GiB)": 21.32, "step": 5535, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.94481 }, { "epoch": 0.17983952181398824, "grad_norm": 0.5804710984230042, "learning_rate": 9.546302030327961e-06, "loss": 0.05452702194452286, "memory(GiB)": 21.32, "step": 5536, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.944837 }, { "epoch": 0.17987200727674366, "grad_norm": 0.41863566637039185, "learning_rate": 9.546078426152526e-06, "loss": 0.05105917155742645, "memory(GiB)": 21.32, "step": 5537, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.944867 }, { "epoch": 0.17990449273949907, "grad_norm": 0.7442181706428528, "learning_rate": 9.545854769509296e-06, "loss": 0.047445543110370636, "memory(GiB)": 21.32, "step": 5538, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.944901 }, { "epoch": 0.1799369782022545, "grad_norm": 0.40164363384246826, "learning_rate": 9.54563106040085e-06, "loss": 0.040365103632211685, "memory(GiB)": 21.32, "step": 5539, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.944936 }, { "epoch": 0.1799694636650099, "grad_norm": 0.8092543482780457, "learning_rate": 9.54540729882977e-06, "loss": 0.04561102017760277, "memory(GiB)": 21.32, "step": 5540, "token_acc": 0.9765625, "train_speed(iter/s)": 0.944974 }, { "epoch": 0.18000194912776532, "grad_norm": 0.508914589881897, "learning_rate": 9.545183484798643e-06, "loss": 0.042647264897823334, "memory(GiB)": 21.32, "step": 5541, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.945014 }, { "epoch": 0.18003443459052074, "grad_norm": 0.5611844062805176, "learning_rate": 9.544959618310047e-06, "loss": 0.04927791655063629, "memory(GiB)": 21.32, "step": 5542, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.945048 }, { "epoch": 0.18006692005327615, "grad_norm": 0.4747607707977295, "learning_rate": 9.544735699366567e-06, "loss": 0.0484301783144474, "memory(GiB)": 21.32, "step": 5543, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.945074 }, { "epoch": 0.18009940551603157, "grad_norm": 0.4336468577384949, "learning_rate": 9.54451172797079e-06, "loss": 0.03884008526802063, "memory(GiB)": 21.32, "step": 5544, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.945105 }, { "epoch": 0.18013189097878698, "grad_norm": 0.6249163150787354, "learning_rate": 9.544287704125298e-06, "loss": 0.03754331171512604, "memory(GiB)": 21.32, "step": 5545, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.945136 }, { "epoch": 0.1801643764415424, "grad_norm": 0.4601600468158722, "learning_rate": 9.544063627832678e-06, "loss": 0.04644519090652466, "memory(GiB)": 21.32, "step": 5546, "token_acc": 0.96875, "train_speed(iter/s)": 0.945163 }, { "epoch": 0.18019686190429782, "grad_norm": 0.7414612770080566, "learning_rate": 9.543839499095516e-06, "loss": 0.05460454151034355, "memory(GiB)": 21.32, "step": 5547, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.945191 }, { "epoch": 0.18022934736705323, "grad_norm": 0.7875619530677795, "learning_rate": 9.543615317916397e-06, "loss": 0.05272814631462097, "memory(GiB)": 21.32, "step": 5548, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.945219 }, { "epoch": 0.18026183282980865, "grad_norm": 0.7836050391197205, "learning_rate": 9.54339108429791e-06, "loss": 0.04506046324968338, "memory(GiB)": 21.32, "step": 5549, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.945246 }, { "epoch": 0.18029431829256407, "grad_norm": 0.5343102216720581, "learning_rate": 9.543166798242642e-06, "loss": 0.0423288457095623, "memory(GiB)": 21.32, "step": 5550, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.945275 }, { "epoch": 0.18032680375531948, "grad_norm": 1.4749997854232788, "learning_rate": 9.542942459753183e-06, "loss": 0.05610063299536705, "memory(GiB)": 21.32, "step": 5551, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.945305 }, { "epoch": 0.18035928921807493, "grad_norm": 1.4183255434036255, "learning_rate": 9.542718068832122e-06, "loss": 0.061612747609615326, "memory(GiB)": 21.32, "step": 5552, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.945334 }, { "epoch": 0.18039177468083034, "grad_norm": 0.5362997055053711, "learning_rate": 9.542493625482047e-06, "loss": 0.04382624477148056, "memory(GiB)": 21.32, "step": 5553, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.945362 }, { "epoch": 0.18042426014358576, "grad_norm": 0.6779689788818359, "learning_rate": 9.54226912970555e-06, "loss": 0.047861866652965546, "memory(GiB)": 21.32, "step": 5554, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.94539 }, { "epoch": 0.18045674560634117, "grad_norm": 0.5142966508865356, "learning_rate": 9.542044581505221e-06, "loss": 0.0431893989443779, "memory(GiB)": 21.32, "step": 5555, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.945417 }, { "epoch": 0.1804892310690966, "grad_norm": 0.4932722747325897, "learning_rate": 9.541819980883653e-06, "loss": 0.04859808087348938, "memory(GiB)": 21.32, "step": 5556, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.945448 }, { "epoch": 0.180521716531852, "grad_norm": 0.602178156375885, "learning_rate": 9.541595327843436e-06, "loss": 0.0568288117647171, "memory(GiB)": 21.32, "step": 5557, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.945478 }, { "epoch": 0.18055420199460742, "grad_norm": 0.8752809762954712, "learning_rate": 9.541370622387166e-06, "loss": 0.054205335676670074, "memory(GiB)": 21.32, "step": 5558, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.945504 }, { "epoch": 0.18058668745736284, "grad_norm": 0.6178296208381653, "learning_rate": 9.541145864517434e-06, "loss": 0.046291567385196686, "memory(GiB)": 21.32, "step": 5559, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.945531 }, { "epoch": 0.18061917292011825, "grad_norm": 0.5455431342124939, "learning_rate": 9.540921054236834e-06, "loss": 0.055659037083387375, "memory(GiB)": 21.32, "step": 5560, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.945563 }, { "epoch": 0.18065165838287367, "grad_norm": 0.6407615542411804, "learning_rate": 9.54069619154796e-06, "loss": 0.04150736331939697, "memory(GiB)": 21.32, "step": 5561, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.945594 }, { "epoch": 0.18068414384562909, "grad_norm": 0.6852107048034668, "learning_rate": 9.540471276453409e-06, "loss": 0.06176484376192093, "memory(GiB)": 21.32, "step": 5562, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.945629 }, { "epoch": 0.1807166293083845, "grad_norm": 0.44884517788887024, "learning_rate": 9.540246308955777e-06, "loss": 0.04593341425061226, "memory(GiB)": 21.32, "step": 5563, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.945667 }, { "epoch": 0.18074911477113992, "grad_norm": 0.6977038979530334, "learning_rate": 9.54002128905766e-06, "loss": 0.04940800741314888, "memory(GiB)": 21.32, "step": 5564, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.945705 }, { "epoch": 0.18078160023389533, "grad_norm": 0.4864440858364105, "learning_rate": 9.539796216761652e-06, "loss": 0.045333750545978546, "memory(GiB)": 21.32, "step": 5565, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.945743 }, { "epoch": 0.18081408569665075, "grad_norm": 0.4280482232570648, "learning_rate": 9.539571092070355e-06, "loss": 0.04270271956920624, "memory(GiB)": 21.32, "step": 5566, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.945782 }, { "epoch": 0.18084657115940617, "grad_norm": 0.4488168954849243, "learning_rate": 9.539345914986365e-06, "loss": 0.048031456768512726, "memory(GiB)": 21.32, "step": 5567, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.945816 }, { "epoch": 0.18087905662216158, "grad_norm": 0.7604149580001831, "learning_rate": 9.539120685512282e-06, "loss": 0.03471602499485016, "memory(GiB)": 21.32, "step": 5568, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.945853 }, { "epoch": 0.180911542084917, "grad_norm": 0.5890878438949585, "learning_rate": 9.538895403650702e-06, "loss": 0.04908999428153038, "memory(GiB)": 21.32, "step": 5569, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.945892 }, { "epoch": 0.18094402754767241, "grad_norm": 0.5043565630912781, "learning_rate": 9.538670069404232e-06, "loss": 0.042037107050418854, "memory(GiB)": 21.32, "step": 5570, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.945931 }, { "epoch": 0.18097651301042783, "grad_norm": 0.539093554019928, "learning_rate": 9.538444682775466e-06, "loss": 0.051167260855436325, "memory(GiB)": 21.32, "step": 5571, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.945966 }, { "epoch": 0.18100899847318325, "grad_norm": 0.6719629764556885, "learning_rate": 9.538219243767009e-06, "loss": 0.061654917895793915, "memory(GiB)": 21.32, "step": 5572, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.946003 }, { "epoch": 0.18104148393593866, "grad_norm": 0.5029552578926086, "learning_rate": 9.53799375238146e-06, "loss": 0.04399479553103447, "memory(GiB)": 21.32, "step": 5573, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.946038 }, { "epoch": 0.18107396939869408, "grad_norm": 0.517166256904602, "learning_rate": 9.537768208621424e-06, "loss": 0.050071537494659424, "memory(GiB)": 21.32, "step": 5574, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.946076 }, { "epoch": 0.1811064548614495, "grad_norm": 1.108880639076233, "learning_rate": 9.537542612489503e-06, "loss": 0.05119995027780533, "memory(GiB)": 21.32, "step": 5575, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.946115 }, { "epoch": 0.1811389403242049, "grad_norm": 2.161350727081299, "learning_rate": 9.5373169639883e-06, "loss": 0.06779615581035614, "memory(GiB)": 21.32, "step": 5576, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.946151 }, { "epoch": 0.18117142578696033, "grad_norm": 0.4684154689311981, "learning_rate": 9.537091263120422e-06, "loss": 0.05109420418739319, "memory(GiB)": 21.32, "step": 5577, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.946187 }, { "epoch": 0.18120391124971574, "grad_norm": 0.6095579266548157, "learning_rate": 9.536865509888468e-06, "loss": 0.0487636961042881, "memory(GiB)": 21.32, "step": 5578, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.946221 }, { "epoch": 0.18123639671247116, "grad_norm": 0.6737330555915833, "learning_rate": 9.536639704295052e-06, "loss": 0.052090443670749664, "memory(GiB)": 21.32, "step": 5579, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.946252 }, { "epoch": 0.18126888217522658, "grad_norm": 0.4427946209907532, "learning_rate": 9.536413846342774e-06, "loss": 0.0437491200864315, "memory(GiB)": 21.32, "step": 5580, "token_acc": 0.979381443298969, "train_speed(iter/s)": 0.946283 }, { "epoch": 0.181301367637982, "grad_norm": 1.5901224613189697, "learning_rate": 9.536187936034241e-06, "loss": 0.04960144683718681, "memory(GiB)": 21.32, "step": 5581, "token_acc": 0.975, "train_speed(iter/s)": 0.946314 }, { "epoch": 0.1813338531007374, "grad_norm": 0.5823389887809753, "learning_rate": 9.535961973372063e-06, "loss": 0.04915575683116913, "memory(GiB)": 21.32, "step": 5582, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.946343 }, { "epoch": 0.18136633856349282, "grad_norm": 0.7025527954101562, "learning_rate": 9.535735958358847e-06, "loss": 0.04719473794102669, "memory(GiB)": 21.32, "step": 5583, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.946373 }, { "epoch": 0.18139882402624827, "grad_norm": 0.4170111119747162, "learning_rate": 9.535509890997198e-06, "loss": 0.03942772001028061, "memory(GiB)": 21.32, "step": 5584, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.946402 }, { "epoch": 0.18143130948900368, "grad_norm": 0.4550517201423645, "learning_rate": 9.53528377128973e-06, "loss": 0.03867495059967041, "memory(GiB)": 21.32, "step": 5585, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.946432 }, { "epoch": 0.1814637949517591, "grad_norm": 0.5726080536842346, "learning_rate": 9.535057599239051e-06, "loss": 0.04278290271759033, "memory(GiB)": 21.32, "step": 5586, "token_acc": 0.993127147766323, "train_speed(iter/s)": 0.946461 }, { "epoch": 0.18149628041451452, "grad_norm": 0.4403502643108368, "learning_rate": 9.534831374847772e-06, "loss": 0.045424021780490875, "memory(GiB)": 21.32, "step": 5587, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.946491 }, { "epoch": 0.18152876587726993, "grad_norm": 0.7367465496063232, "learning_rate": 9.534605098118501e-06, "loss": 0.06935270130634308, "memory(GiB)": 21.32, "step": 5588, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.946521 }, { "epoch": 0.18156125134002535, "grad_norm": 0.5417289733886719, "learning_rate": 9.534378769053854e-06, "loss": 0.05802515149116516, "memory(GiB)": 21.32, "step": 5589, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.94655 }, { "epoch": 0.18159373680278076, "grad_norm": 0.5677935481071472, "learning_rate": 9.534152387656439e-06, "loss": 0.04364216327667236, "memory(GiB)": 21.32, "step": 5590, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.946578 }, { "epoch": 0.18162622226553618, "grad_norm": 0.8648574352264404, "learning_rate": 9.533925953928871e-06, "loss": 0.051365673542022705, "memory(GiB)": 21.32, "step": 5591, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.946607 }, { "epoch": 0.1816587077282916, "grad_norm": 0.6116130352020264, "learning_rate": 9.533699467873763e-06, "loss": 0.047617167234420776, "memory(GiB)": 21.32, "step": 5592, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.946634 }, { "epoch": 0.181691193191047, "grad_norm": 0.48183006048202515, "learning_rate": 9.533472929493729e-06, "loss": 0.046882715076208115, "memory(GiB)": 21.32, "step": 5593, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.946664 }, { "epoch": 0.18172367865380243, "grad_norm": 0.5193851590156555, "learning_rate": 9.533246338791383e-06, "loss": 0.04243381693959236, "memory(GiB)": 21.32, "step": 5594, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.946694 }, { "epoch": 0.18175616411655784, "grad_norm": 0.4888897240161896, "learning_rate": 9.533019695769341e-06, "loss": 0.04201560467481613, "memory(GiB)": 21.32, "step": 5595, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.946717 }, { "epoch": 0.18178864957931326, "grad_norm": 0.7044562101364136, "learning_rate": 9.532793000430217e-06, "loss": 0.05131392180919647, "memory(GiB)": 21.32, "step": 5596, "token_acc": 0.9568627450980393, "train_speed(iter/s)": 0.946741 }, { "epoch": 0.18182113504206868, "grad_norm": 0.6258715391159058, "learning_rate": 9.53256625277663e-06, "loss": 0.053016651421785355, "memory(GiB)": 21.32, "step": 5597, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.946765 }, { "epoch": 0.1818536205048241, "grad_norm": 0.5028017163276672, "learning_rate": 9.532339452811195e-06, "loss": 0.04089063033461571, "memory(GiB)": 21.32, "step": 5598, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.946797 }, { "epoch": 0.1818861059675795, "grad_norm": 0.5535662174224854, "learning_rate": 9.53211260053653e-06, "loss": 0.04440632089972496, "memory(GiB)": 21.32, "step": 5599, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.946827 }, { "epoch": 0.18191859143033492, "grad_norm": 0.7617502212524414, "learning_rate": 9.531885695955255e-06, "loss": 0.050687652081251144, "memory(GiB)": 21.32, "step": 5600, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.946856 }, { "epoch": 0.18195107689309034, "grad_norm": 0.506112813949585, "learning_rate": 9.531658739069984e-06, "loss": 0.037858814001083374, "memory(GiB)": 21.32, "step": 5601, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.946887 }, { "epoch": 0.18198356235584576, "grad_norm": 0.8575102090835571, "learning_rate": 9.531431729883342e-06, "loss": 0.045146048069000244, "memory(GiB)": 21.32, "step": 5602, "token_acc": 0.9825783972125436, "train_speed(iter/s)": 0.946911 }, { "epoch": 0.18201604781860117, "grad_norm": 0.6220797300338745, "learning_rate": 9.531204668397946e-06, "loss": 0.048499614000320435, "memory(GiB)": 21.32, "step": 5603, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.946938 }, { "epoch": 0.1820485332813566, "grad_norm": 0.5243116021156311, "learning_rate": 9.530977554616416e-06, "loss": 0.04528284817934036, "memory(GiB)": 21.32, "step": 5604, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.946965 }, { "epoch": 0.182081018744112, "grad_norm": 0.6826877593994141, "learning_rate": 9.530750388541377e-06, "loss": 0.05619678646326065, "memory(GiB)": 21.32, "step": 5605, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.946994 }, { "epoch": 0.18211350420686742, "grad_norm": 0.4418773949146271, "learning_rate": 9.530523170175446e-06, "loss": 0.03751295804977417, "memory(GiB)": 21.32, "step": 5606, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.947019 }, { "epoch": 0.18214598966962284, "grad_norm": 0.5659803748130798, "learning_rate": 9.530295899521247e-06, "loss": 0.035735394805669785, "memory(GiB)": 21.32, "step": 5607, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.947045 }, { "epoch": 0.18217847513237825, "grad_norm": 1.215031385421753, "learning_rate": 9.530068576581403e-06, "loss": 0.05611872673034668, "memory(GiB)": 21.32, "step": 5608, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.94707 }, { "epoch": 0.18221096059513367, "grad_norm": 0.7680397629737854, "learning_rate": 9.529841201358541e-06, "loss": 0.05873177945613861, "memory(GiB)": 21.32, "step": 5609, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.947096 }, { "epoch": 0.18224344605788909, "grad_norm": 0.5229512453079224, "learning_rate": 9.52961377385528e-06, "loss": 0.04483434557914734, "memory(GiB)": 21.32, "step": 5610, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.947124 }, { "epoch": 0.1822759315206445, "grad_norm": 0.5145289897918701, "learning_rate": 9.52938629407425e-06, "loss": 0.04344108700752258, "memory(GiB)": 21.32, "step": 5611, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.947152 }, { "epoch": 0.18230841698339992, "grad_norm": 1.027489423751831, "learning_rate": 9.52915876201807e-06, "loss": 0.05267016589641571, "memory(GiB)": 21.32, "step": 5612, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.94718 }, { "epoch": 0.18234090244615533, "grad_norm": 0.5708869099617004, "learning_rate": 9.528931177689372e-06, "loss": 0.047273196280002594, "memory(GiB)": 21.32, "step": 5613, "token_acc": 0.9820359281437125, "train_speed(iter/s)": 0.947208 }, { "epoch": 0.18237338790891075, "grad_norm": 0.7242037057876587, "learning_rate": 9.528703541090778e-06, "loss": 0.053982146084308624, "memory(GiB)": 21.32, "step": 5614, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.947237 }, { "epoch": 0.18240587337166617, "grad_norm": 0.7329053282737732, "learning_rate": 9.528475852224919e-06, "loss": 0.04621787369251251, "memory(GiB)": 21.32, "step": 5615, "token_acc": 0.9742268041237113, "train_speed(iter/s)": 0.947266 }, { "epoch": 0.1824383588344216, "grad_norm": 1.625744104385376, "learning_rate": 9.528248111094422e-06, "loss": 0.05773549899458885, "memory(GiB)": 21.32, "step": 5616, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.947294 }, { "epoch": 0.18247084429717703, "grad_norm": 1.2825331687927246, "learning_rate": 9.528020317701913e-06, "loss": 0.05329606309533119, "memory(GiB)": 21.32, "step": 5617, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.94732 }, { "epoch": 0.18250332975993244, "grad_norm": 0.5697309970855713, "learning_rate": 9.527792472050024e-06, "loss": 0.04571528732776642, "memory(GiB)": 21.32, "step": 5618, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.947347 }, { "epoch": 0.18253581522268786, "grad_norm": 0.5780524611473083, "learning_rate": 9.527564574141383e-06, "loss": 0.04266803711652756, "memory(GiB)": 21.32, "step": 5619, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.947377 }, { "epoch": 0.18256830068544327, "grad_norm": 0.6720764636993408, "learning_rate": 9.527336623978621e-06, "loss": 0.05660008639097214, "memory(GiB)": 21.32, "step": 5620, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.947407 }, { "epoch": 0.1826007861481987, "grad_norm": 1.8223499059677124, "learning_rate": 9.527108621564367e-06, "loss": 0.0531214103102684, "memory(GiB)": 21.32, "step": 5621, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.947442 }, { "epoch": 0.1826332716109541, "grad_norm": 0.6135018467903137, "learning_rate": 9.526880566901256e-06, "loss": 0.05035489797592163, "memory(GiB)": 21.32, "step": 5622, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.947478 }, { "epoch": 0.18266575707370952, "grad_norm": 0.8209260106086731, "learning_rate": 9.526652459991916e-06, "loss": 0.045168206095695496, "memory(GiB)": 21.32, "step": 5623, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.947514 }, { "epoch": 0.18269824253646494, "grad_norm": 0.5395873188972473, "learning_rate": 9.526424300838982e-06, "loss": 0.04770159721374512, "memory(GiB)": 21.32, "step": 5624, "token_acc": 0.9675090252707581, "train_speed(iter/s)": 0.947551 }, { "epoch": 0.18273072799922035, "grad_norm": 0.6326977610588074, "learning_rate": 9.526196089445088e-06, "loss": 0.04653320461511612, "memory(GiB)": 21.32, "step": 5625, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.947587 }, { "epoch": 0.18276321346197577, "grad_norm": 0.511998176574707, "learning_rate": 9.525967825812866e-06, "loss": 0.038818344473838806, "memory(GiB)": 21.32, "step": 5626, "token_acc": 0.975, "train_speed(iter/s)": 0.947625 }, { "epoch": 0.1827956989247312, "grad_norm": 0.7065762877464294, "learning_rate": 9.525739509944952e-06, "loss": 0.05722369998693466, "memory(GiB)": 21.32, "step": 5627, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.947662 }, { "epoch": 0.1828281843874866, "grad_norm": 0.7538459897041321, "learning_rate": 9.525511141843979e-06, "loss": 0.05083181709051132, "memory(GiB)": 21.32, "step": 5628, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.9477 }, { "epoch": 0.18286066985024202, "grad_norm": 0.5551832914352417, "learning_rate": 9.525282721512586e-06, "loss": 0.050788164138793945, "memory(GiB)": 21.32, "step": 5629, "token_acc": 0.9862068965517241, "train_speed(iter/s)": 0.947735 }, { "epoch": 0.18289315531299744, "grad_norm": 1.6521540880203247, "learning_rate": 9.525054248953406e-06, "loss": 0.05680118873715401, "memory(GiB)": 21.32, "step": 5630, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.947772 }, { "epoch": 0.18292564077575285, "grad_norm": 0.6526950597763062, "learning_rate": 9.524825724169078e-06, "loss": 0.051997411996126175, "memory(GiB)": 21.32, "step": 5631, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.94781 }, { "epoch": 0.18295812623850827, "grad_norm": 0.7370790839195251, "learning_rate": 9.524597147162237e-06, "loss": 0.05180540680885315, "memory(GiB)": 21.32, "step": 5632, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.947847 }, { "epoch": 0.18299061170126368, "grad_norm": 0.5325363278388977, "learning_rate": 9.524368517935523e-06, "loss": 0.046821705996990204, "memory(GiB)": 21.32, "step": 5633, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.947884 }, { "epoch": 0.1830230971640191, "grad_norm": 1.3367278575897217, "learning_rate": 9.524139836491576e-06, "loss": 0.058510031551122665, "memory(GiB)": 21.32, "step": 5634, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.94792 }, { "epoch": 0.18305558262677452, "grad_norm": 0.47561368346214294, "learning_rate": 9.523911102833033e-06, "loss": 0.051045630127191544, "memory(GiB)": 21.32, "step": 5635, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.947954 }, { "epoch": 0.18308806808952993, "grad_norm": 0.6282348036766052, "learning_rate": 9.523682316962533e-06, "loss": 0.04275558516383171, "memory(GiB)": 21.32, "step": 5636, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.947991 }, { "epoch": 0.18312055355228535, "grad_norm": 0.5536911487579346, "learning_rate": 9.523453478882719e-06, "loss": 0.04227316379547119, "memory(GiB)": 21.32, "step": 5637, "token_acc": 0.9781818181818182, "train_speed(iter/s)": 0.948028 }, { "epoch": 0.18315303901504076, "grad_norm": 0.6148461699485779, "learning_rate": 9.523224588596232e-06, "loss": 0.0482511967420578, "memory(GiB)": 21.32, "step": 5638, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.948066 }, { "epoch": 0.18318552447779618, "grad_norm": 0.5502208471298218, "learning_rate": 9.522995646105712e-06, "loss": 0.040936633944511414, "memory(GiB)": 21.32, "step": 5639, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.948103 }, { "epoch": 0.1832180099405516, "grad_norm": 0.5854775309562683, "learning_rate": 9.522766651413802e-06, "loss": 0.0414976105093956, "memory(GiB)": 21.32, "step": 5640, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.948133 }, { "epoch": 0.183250495403307, "grad_norm": 1.0948697328567505, "learning_rate": 9.522537604523146e-06, "loss": 0.05007605999708176, "memory(GiB)": 21.32, "step": 5641, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.948162 }, { "epoch": 0.18328298086606243, "grad_norm": 0.6116783618927002, "learning_rate": 9.522308505436385e-06, "loss": 0.04673394560813904, "memory(GiB)": 21.32, "step": 5642, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.94819 }, { "epoch": 0.18331546632881784, "grad_norm": 0.6633144021034241, "learning_rate": 9.522079354156167e-06, "loss": 0.05370563268661499, "memory(GiB)": 21.32, "step": 5643, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.94822 }, { "epoch": 0.18334795179157326, "grad_norm": 0.9834058284759521, "learning_rate": 9.521850150685133e-06, "loss": 0.040904656052589417, "memory(GiB)": 21.32, "step": 5644, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.948251 }, { "epoch": 0.18338043725432868, "grad_norm": 0.6569995880126953, "learning_rate": 9.521620895025929e-06, "loss": 0.05124947428703308, "memory(GiB)": 21.32, "step": 5645, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.948283 }, { "epoch": 0.1834129227170841, "grad_norm": 0.8122283816337585, "learning_rate": 9.521391587181202e-06, "loss": 0.04757612198591232, "memory(GiB)": 21.32, "step": 5646, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.948313 }, { "epoch": 0.1834454081798395, "grad_norm": 0.9550414681434631, "learning_rate": 9.521162227153598e-06, "loss": 0.04432179033756256, "memory(GiB)": 21.32, "step": 5647, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.948342 }, { "epoch": 0.18347789364259495, "grad_norm": 0.9319267272949219, "learning_rate": 9.520932814945765e-06, "loss": 0.05591211095452309, "memory(GiB)": 21.32, "step": 5648, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.948368 }, { "epoch": 0.18351037910535037, "grad_norm": 0.49861016869544983, "learning_rate": 9.52070335056035e-06, "loss": 0.04549471288919449, "memory(GiB)": 21.32, "step": 5649, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.948398 }, { "epoch": 0.18354286456810578, "grad_norm": 0.7132825255393982, "learning_rate": 9.520473834000001e-06, "loss": 0.05387861281633377, "memory(GiB)": 21.32, "step": 5650, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.948425 }, { "epoch": 0.1835753500308612, "grad_norm": 0.6457433700561523, "learning_rate": 9.520244265267366e-06, "loss": 0.04606470465660095, "memory(GiB)": 21.32, "step": 5651, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.948454 }, { "epoch": 0.18360783549361662, "grad_norm": 0.5534877777099609, "learning_rate": 9.520014644365097e-06, "loss": 0.04715367406606674, "memory(GiB)": 21.32, "step": 5652, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.948482 }, { "epoch": 0.18364032095637203, "grad_norm": 0.540250837802887, "learning_rate": 9.519784971295843e-06, "loss": 0.0440816804766655, "memory(GiB)": 21.32, "step": 5653, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.948512 }, { "epoch": 0.18367280641912745, "grad_norm": 0.5215848088264465, "learning_rate": 9.519555246062256e-06, "loss": 0.045077793300151825, "memory(GiB)": 21.32, "step": 5654, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.948539 }, { "epoch": 0.18370529188188287, "grad_norm": 0.5472655296325684, "learning_rate": 9.519325468666983e-06, "loss": 0.04027315229177475, "memory(GiB)": 21.32, "step": 5655, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.948571 }, { "epoch": 0.18373777734463828, "grad_norm": 0.7010100483894348, "learning_rate": 9.51909563911268e-06, "loss": 0.04391206055879593, "memory(GiB)": 21.32, "step": 5656, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.948601 }, { "epoch": 0.1837702628073937, "grad_norm": 0.5873709321022034, "learning_rate": 9.518865757401998e-06, "loss": 0.05212259292602539, "memory(GiB)": 21.32, "step": 5657, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.948629 }, { "epoch": 0.1838027482701491, "grad_norm": 0.4872598946094513, "learning_rate": 9.518635823537591e-06, "loss": 0.051113128662109375, "memory(GiB)": 21.32, "step": 5658, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.948658 }, { "epoch": 0.18383523373290453, "grad_norm": 0.8695952296257019, "learning_rate": 9.518405837522113e-06, "loss": 0.05359712243080139, "memory(GiB)": 21.32, "step": 5659, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.948682 }, { "epoch": 0.18386771919565995, "grad_norm": 1.1836333274841309, "learning_rate": 9.518175799358218e-06, "loss": 0.06021801382303238, "memory(GiB)": 21.32, "step": 5660, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.948708 }, { "epoch": 0.18390020465841536, "grad_norm": 0.7389202117919922, "learning_rate": 9.51794570904856e-06, "loss": 0.04942610114812851, "memory(GiB)": 21.32, "step": 5661, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.948735 }, { "epoch": 0.18393269012117078, "grad_norm": 1.4373507499694824, "learning_rate": 9.517715566595795e-06, "loss": 0.04963549226522446, "memory(GiB)": 21.32, "step": 5662, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.948761 }, { "epoch": 0.1839651755839262, "grad_norm": 0.6248690485954285, "learning_rate": 9.517485372002578e-06, "loss": 0.053565509617328644, "memory(GiB)": 21.32, "step": 5663, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.948787 }, { "epoch": 0.1839976610466816, "grad_norm": 0.5246898531913757, "learning_rate": 9.51725512527157e-06, "loss": 0.04214731231331825, "memory(GiB)": 21.32, "step": 5664, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.948814 }, { "epoch": 0.18403014650943703, "grad_norm": 0.631948709487915, "learning_rate": 9.517024826405423e-06, "loss": 0.04488306865096092, "memory(GiB)": 21.32, "step": 5665, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.94884 }, { "epoch": 0.18406263197219244, "grad_norm": 0.5559665560722351, "learning_rate": 9.516794475406798e-06, "loss": 0.0460776723921299, "memory(GiB)": 21.32, "step": 5666, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.948866 }, { "epoch": 0.18409511743494786, "grad_norm": 0.48509278893470764, "learning_rate": 9.516564072278353e-06, "loss": 0.04836944863200188, "memory(GiB)": 21.32, "step": 5667, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.948896 }, { "epoch": 0.18412760289770327, "grad_norm": 0.7288261651992798, "learning_rate": 9.516333617022746e-06, "loss": 0.045557379722595215, "memory(GiB)": 21.32, "step": 5668, "token_acc": 0.9607843137254902, "train_speed(iter/s)": 0.948921 }, { "epoch": 0.1841600883604587, "grad_norm": 0.500644862651825, "learning_rate": 9.51610310964264e-06, "loss": 0.04517623782157898, "memory(GiB)": 21.32, "step": 5669, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.948946 }, { "epoch": 0.1841925738232141, "grad_norm": 0.5414703488349915, "learning_rate": 9.515872550140691e-06, "loss": 0.032294709235429764, "memory(GiB)": 21.32, "step": 5670, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.948976 }, { "epoch": 0.18422505928596952, "grad_norm": 0.533638060092926, "learning_rate": 9.515641938519564e-06, "loss": 0.04406903684139252, "memory(GiB)": 21.32, "step": 5671, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.949003 }, { "epoch": 0.18425754474872494, "grad_norm": 0.4531387686729431, "learning_rate": 9.515411274781917e-06, "loss": 0.04269509017467499, "memory(GiB)": 21.32, "step": 5672, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949031 }, { "epoch": 0.18429003021148035, "grad_norm": 0.8333897590637207, "learning_rate": 9.515180558930415e-06, "loss": 0.0448477603495121, "memory(GiB)": 21.32, "step": 5673, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.949058 }, { "epoch": 0.18432251567423577, "grad_norm": 0.7208719849586487, "learning_rate": 9.51494979096772e-06, "loss": 0.04145531356334686, "memory(GiB)": 21.32, "step": 5674, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.949086 }, { "epoch": 0.1843550011369912, "grad_norm": 0.6236658692359924, "learning_rate": 9.514718970896494e-06, "loss": 0.05459102243185043, "memory(GiB)": 21.32, "step": 5675, "token_acc": 1.0, "train_speed(iter/s)": 0.949112 }, { "epoch": 0.1843874865997466, "grad_norm": 0.5809131860733032, "learning_rate": 9.514488098719404e-06, "loss": 0.040722399950027466, "memory(GiB)": 21.32, "step": 5676, "token_acc": 0.98828125, "train_speed(iter/s)": 0.949142 }, { "epoch": 0.18441997206250202, "grad_norm": 0.7981724739074707, "learning_rate": 9.514257174439109e-06, "loss": 0.049215272068977356, "memory(GiB)": 21.32, "step": 5677, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.94917 }, { "epoch": 0.18445245752525743, "grad_norm": 0.5855000019073486, "learning_rate": 9.51402619805828e-06, "loss": 0.0469876229763031, "memory(GiB)": 21.32, "step": 5678, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.949195 }, { "epoch": 0.18448494298801285, "grad_norm": 0.6619122624397278, "learning_rate": 9.51379516957958e-06, "loss": 0.05416196584701538, "memory(GiB)": 21.32, "step": 5679, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.949223 }, { "epoch": 0.1845174284507683, "grad_norm": 0.7215926647186279, "learning_rate": 9.513564089005676e-06, "loss": 0.03780422359704971, "memory(GiB)": 21.32, "step": 5680, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949251 }, { "epoch": 0.1845499139135237, "grad_norm": 0.6634221076965332, "learning_rate": 9.513332956339236e-06, "loss": 0.04976789653301239, "memory(GiB)": 21.32, "step": 5681, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.949287 }, { "epoch": 0.18458239937627913, "grad_norm": 0.6739145517349243, "learning_rate": 9.513101771582925e-06, "loss": 0.04702997952699661, "memory(GiB)": 21.32, "step": 5682, "token_acc": 0.98, "train_speed(iter/s)": 0.949321 }, { "epoch": 0.18461488483903454, "grad_norm": 0.5885410308837891, "learning_rate": 9.512870534739414e-06, "loss": 0.04372364655137062, "memory(GiB)": 21.32, "step": 5683, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.949358 }, { "epoch": 0.18464737030178996, "grad_norm": 0.7768591642379761, "learning_rate": 9.51263924581137e-06, "loss": 0.05476788058876991, "memory(GiB)": 21.32, "step": 5684, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.949396 }, { "epoch": 0.18467985576454538, "grad_norm": 0.5204651355743408, "learning_rate": 9.512407904801461e-06, "loss": 0.03612038493156433, "memory(GiB)": 21.32, "step": 5685, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.949433 }, { "epoch": 0.1847123412273008, "grad_norm": 0.716895341873169, "learning_rate": 9.51217651171236e-06, "loss": 0.05394916981458664, "memory(GiB)": 21.32, "step": 5686, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.94947 }, { "epoch": 0.1847448266900562, "grad_norm": 0.4882161021232605, "learning_rate": 9.511945066546738e-06, "loss": 0.03845605254173279, "memory(GiB)": 21.32, "step": 5687, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.949506 }, { "epoch": 0.18477731215281162, "grad_norm": 0.5777793526649475, "learning_rate": 9.51171356930726e-06, "loss": 0.039463456720113754, "memory(GiB)": 21.32, "step": 5688, "token_acc": 0.9834437086092715, "train_speed(iter/s)": 0.94954 }, { "epoch": 0.18480979761556704, "grad_norm": 0.7280208468437195, "learning_rate": 9.511482019996608e-06, "loss": 0.050178490579128265, "memory(GiB)": 21.32, "step": 5689, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.949578 }, { "epoch": 0.18484228307832246, "grad_norm": 0.6400025486946106, "learning_rate": 9.511250418617447e-06, "loss": 0.048286065459251404, "memory(GiB)": 21.32, "step": 5690, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.949615 }, { "epoch": 0.18487476854107787, "grad_norm": 0.7263449430465698, "learning_rate": 9.51101876517245e-06, "loss": 0.05247897654771805, "memory(GiB)": 21.32, "step": 5691, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.949653 }, { "epoch": 0.1849072540038333, "grad_norm": 0.5156423449516296, "learning_rate": 9.510787059664293e-06, "loss": 0.0497027225792408, "memory(GiB)": 21.32, "step": 5692, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.94969 }, { "epoch": 0.1849397394665887, "grad_norm": 0.4390751123428345, "learning_rate": 9.51055530209565e-06, "loss": 0.04307369887828827, "memory(GiB)": 21.32, "step": 5693, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.949726 }, { "epoch": 0.18497222492934412, "grad_norm": 0.7399478554725647, "learning_rate": 9.510323492469193e-06, "loss": 0.04205811768770218, "memory(GiB)": 21.32, "step": 5694, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.949763 }, { "epoch": 0.18500471039209954, "grad_norm": 0.3852192163467407, "learning_rate": 9.510091630787602e-06, "loss": 0.03372332081198692, "memory(GiB)": 21.32, "step": 5695, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.9498 }, { "epoch": 0.18503719585485495, "grad_norm": 0.6241400241851807, "learning_rate": 9.50985971705355e-06, "loss": 0.052180562168359756, "memory(GiB)": 21.32, "step": 5696, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.949836 }, { "epoch": 0.18506968131761037, "grad_norm": 0.8034911751747131, "learning_rate": 9.509627751269715e-06, "loss": 0.05773601680994034, "memory(GiB)": 21.32, "step": 5697, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.949873 }, { "epoch": 0.18510216678036578, "grad_norm": 1.0525727272033691, "learning_rate": 9.509395733438774e-06, "loss": 0.04124489426612854, "memory(GiB)": 21.32, "step": 5698, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949909 }, { "epoch": 0.1851346522431212, "grad_norm": 0.4686989188194275, "learning_rate": 9.509163663563403e-06, "loss": 0.04822416231036186, "memory(GiB)": 21.32, "step": 5699, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.949944 }, { "epoch": 0.18516713770587662, "grad_norm": 0.5872669816017151, "learning_rate": 9.508931541646282e-06, "loss": 0.04724224656820297, "memory(GiB)": 21.32, "step": 5700, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.949981 }, { "epoch": 0.18519962316863203, "grad_norm": 0.42066285014152527, "learning_rate": 9.508699367690089e-06, "loss": 0.035481639206409454, "memory(GiB)": 21.32, "step": 5701, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.950016 }, { "epoch": 0.18523210863138745, "grad_norm": 0.6609864234924316, "learning_rate": 9.508467141697504e-06, "loss": 0.043247442692518234, "memory(GiB)": 21.32, "step": 5702, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.950045 }, { "epoch": 0.18526459409414286, "grad_norm": 0.5534284710884094, "learning_rate": 9.508234863671208e-06, "loss": 0.057882025837898254, "memory(GiB)": 21.32, "step": 5703, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.950076 }, { "epoch": 0.18529707955689828, "grad_norm": 0.5714617371559143, "learning_rate": 9.508002533613882e-06, "loss": 0.04827093705534935, "memory(GiB)": 21.32, "step": 5704, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.950104 }, { "epoch": 0.1853295650196537, "grad_norm": 0.5558289885520935, "learning_rate": 9.507770151528206e-06, "loss": 0.05067916959524155, "memory(GiB)": 21.32, "step": 5705, "token_acc": 0.972, "train_speed(iter/s)": 0.950131 }, { "epoch": 0.1853620504824091, "grad_norm": 0.5779975056648254, "learning_rate": 9.507537717416862e-06, "loss": 0.04569725692272186, "memory(GiB)": 21.32, "step": 5706, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.950161 }, { "epoch": 0.18539453594516453, "grad_norm": 1.0290755033493042, "learning_rate": 9.507305231282534e-06, "loss": 0.04368065297603607, "memory(GiB)": 21.32, "step": 5707, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95019 }, { "epoch": 0.18542702140791995, "grad_norm": 0.6184890270233154, "learning_rate": 9.507072693127904e-06, "loss": 0.04396742209792137, "memory(GiB)": 21.32, "step": 5708, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.95022 }, { "epoch": 0.18545950687067536, "grad_norm": 1.2675386667251587, "learning_rate": 9.506840102955657e-06, "loss": 0.051894817501306534, "memory(GiB)": 21.32, "step": 5709, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.950251 }, { "epoch": 0.18549199233343078, "grad_norm": 0.5018244981765747, "learning_rate": 9.506607460768475e-06, "loss": 0.03597772866487503, "memory(GiB)": 21.32, "step": 5710, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.950281 }, { "epoch": 0.1855244777961862, "grad_norm": 0.5526238083839417, "learning_rate": 9.506374766569047e-06, "loss": 0.044811807572841644, "memory(GiB)": 21.32, "step": 5711, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.950309 }, { "epoch": 0.18555696325894164, "grad_norm": 0.681364893913269, "learning_rate": 9.506142020360053e-06, "loss": 0.059934720396995544, "memory(GiB)": 21.32, "step": 5712, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.950335 }, { "epoch": 0.18558944872169705, "grad_norm": 0.43720194697380066, "learning_rate": 9.505909222144185e-06, "loss": 0.04157067835330963, "memory(GiB)": 21.32, "step": 5713, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.950363 }, { "epoch": 0.18562193418445247, "grad_norm": 0.49509087204933167, "learning_rate": 9.505676371924128e-06, "loss": 0.040775127708911896, "memory(GiB)": 21.32, "step": 5714, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.950392 }, { "epoch": 0.18565441964720789, "grad_norm": 0.561684787273407, "learning_rate": 9.505443469702566e-06, "loss": 0.047355201095342636, "memory(GiB)": 21.32, "step": 5715, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.950424 }, { "epoch": 0.1856869051099633, "grad_norm": 0.6666832566261292, "learning_rate": 9.50521051548219e-06, "loss": 0.06447823345661163, "memory(GiB)": 21.32, "step": 5716, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.950453 }, { "epoch": 0.18571939057271872, "grad_norm": 0.599016547203064, "learning_rate": 9.50497750926569e-06, "loss": 0.053640469908714294, "memory(GiB)": 21.32, "step": 5717, "token_acc": 0.97265625, "train_speed(iter/s)": 0.950484 }, { "epoch": 0.18575187603547413, "grad_norm": 0.5388544201850891, "learning_rate": 9.504744451055752e-06, "loss": 0.03929273039102554, "memory(GiB)": 21.32, "step": 5718, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.950512 }, { "epoch": 0.18578436149822955, "grad_norm": 0.7292189598083496, "learning_rate": 9.504511340855069e-06, "loss": 0.0492730438709259, "memory(GiB)": 21.32, "step": 5719, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.95054 }, { "epoch": 0.18581684696098497, "grad_norm": 0.5687437057495117, "learning_rate": 9.504278178666328e-06, "loss": 0.04411476105451584, "memory(GiB)": 21.32, "step": 5720, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.950565 }, { "epoch": 0.18584933242374038, "grad_norm": 0.720668613910675, "learning_rate": 9.504044964492222e-06, "loss": 0.05994315445423126, "memory(GiB)": 21.32, "step": 5721, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.950585 }, { "epoch": 0.1858818178864958, "grad_norm": 0.9193874001502991, "learning_rate": 9.503811698335441e-06, "loss": 0.06387743353843689, "memory(GiB)": 21.32, "step": 5722, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.950607 }, { "epoch": 0.18591430334925121, "grad_norm": 0.6988903284072876, "learning_rate": 9.50357838019868e-06, "loss": 0.043551430106163025, "memory(GiB)": 21.32, "step": 5723, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.950633 }, { "epoch": 0.18594678881200663, "grad_norm": 0.6740153431892395, "learning_rate": 9.503345010084631e-06, "loss": 0.045733630657196045, "memory(GiB)": 21.32, "step": 5724, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.950658 }, { "epoch": 0.18597927427476205, "grad_norm": 0.459894061088562, "learning_rate": 9.503111587995985e-06, "loss": 0.057452812790870667, "memory(GiB)": 21.32, "step": 5725, "token_acc": 0.988, "train_speed(iter/s)": 0.950683 }, { "epoch": 0.18601175973751746, "grad_norm": 0.48171132802963257, "learning_rate": 9.502878113935438e-06, "loss": 0.05254293233156204, "memory(GiB)": 21.32, "step": 5726, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.950712 }, { "epoch": 0.18604424520027288, "grad_norm": 0.42740994691848755, "learning_rate": 9.502644587905685e-06, "loss": 0.04495738446712494, "memory(GiB)": 21.32, "step": 5727, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.950742 }, { "epoch": 0.1860767306630283, "grad_norm": 0.7094056606292725, "learning_rate": 9.502411009909419e-06, "loss": 0.054689910262823105, "memory(GiB)": 21.32, "step": 5728, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.950766 }, { "epoch": 0.1861092161257837, "grad_norm": 0.48327288031578064, "learning_rate": 9.502177379949338e-06, "loss": 0.04937766492366791, "memory(GiB)": 21.32, "step": 5729, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.95079 }, { "epoch": 0.18614170158853913, "grad_norm": 0.5265134572982788, "learning_rate": 9.501943698028138e-06, "loss": 0.04165145009756088, "memory(GiB)": 21.32, "step": 5730, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.950819 }, { "epoch": 0.18617418705129454, "grad_norm": 0.4549776315689087, "learning_rate": 9.501709964148516e-06, "loss": 0.0416736826300621, "memory(GiB)": 21.32, "step": 5731, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.950846 }, { "epoch": 0.18620667251404996, "grad_norm": 0.5599362850189209, "learning_rate": 9.501476178313168e-06, "loss": 0.050736941397190094, "memory(GiB)": 21.32, "step": 5732, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.950873 }, { "epoch": 0.18623915797680538, "grad_norm": 0.6120661497116089, "learning_rate": 9.501242340524795e-06, "loss": 0.045781172811985016, "memory(GiB)": 21.32, "step": 5733, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.9509 }, { "epoch": 0.1862716434395608, "grad_norm": 0.5134264230728149, "learning_rate": 9.501008450786094e-06, "loss": 0.053628064692020416, "memory(GiB)": 21.32, "step": 5734, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.950928 }, { "epoch": 0.1863041289023162, "grad_norm": 0.7504315972328186, "learning_rate": 9.500774509099762e-06, "loss": 0.04347904771566391, "memory(GiB)": 21.32, "step": 5735, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.950952 }, { "epoch": 0.18633661436507162, "grad_norm": 0.5274182558059692, "learning_rate": 9.500540515468505e-06, "loss": 0.03830541670322418, "memory(GiB)": 21.32, "step": 5736, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95098 }, { "epoch": 0.18636909982782704, "grad_norm": 0.4891243278980255, "learning_rate": 9.500306469895018e-06, "loss": 0.050081122666597366, "memory(GiB)": 21.32, "step": 5737, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.951008 }, { "epoch": 0.18640158529058246, "grad_norm": 0.4786054790019989, "learning_rate": 9.500072372382005e-06, "loss": 0.0417165532708168, "memory(GiB)": 21.32, "step": 5738, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.951033 }, { "epoch": 0.18643407075333787, "grad_norm": 0.4640042781829834, "learning_rate": 9.499838222932169e-06, "loss": 0.045580171048641205, "memory(GiB)": 21.32, "step": 5739, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.951059 }, { "epoch": 0.1864665562160933, "grad_norm": 0.541854202747345, "learning_rate": 9.499604021548207e-06, "loss": 0.05264914035797119, "memory(GiB)": 21.32, "step": 5740, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.951091 }, { "epoch": 0.1864990416788487, "grad_norm": 0.6163755655288696, "learning_rate": 9.49936976823283e-06, "loss": 0.04489797353744507, "memory(GiB)": 21.32, "step": 5741, "token_acc": 0.9656862745098039, "train_speed(iter/s)": 0.951127 }, { "epoch": 0.18653152714160412, "grad_norm": 0.48068490624427795, "learning_rate": 9.499135462988735e-06, "loss": 0.047830261290073395, "memory(GiB)": 21.32, "step": 5742, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.951161 }, { "epoch": 0.18656401260435954, "grad_norm": 0.4267329275608063, "learning_rate": 9.498901105818628e-06, "loss": 0.045018669217824936, "memory(GiB)": 21.32, "step": 5743, "token_acc": 0.98828125, "train_speed(iter/s)": 0.951198 }, { "epoch": 0.18659649806711498, "grad_norm": 0.5612175464630127, "learning_rate": 9.498666696725216e-06, "loss": 0.05211707577109337, "memory(GiB)": 21.32, "step": 5744, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.951232 }, { "epoch": 0.1866289835298704, "grad_norm": 0.711140513420105, "learning_rate": 9.4984322357112e-06, "loss": 0.04701272398233414, "memory(GiB)": 21.32, "step": 5745, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.1866614689926258, "grad_norm": 0.7044839859008789, "learning_rate": 9.49819772277929e-06, "loss": 0.057645607739686966, "memory(GiB)": 21.32, "step": 5746, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.951301 }, { "epoch": 0.18669395445538123, "grad_norm": 0.6547828316688538, "learning_rate": 9.497963157932192e-06, "loss": 0.059056058526039124, "memory(GiB)": 21.32, "step": 5747, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.951336 }, { "epoch": 0.18672643991813664, "grad_norm": 0.6776278018951416, "learning_rate": 9.49772854117261e-06, "loss": 0.04608798027038574, "memory(GiB)": 21.32, "step": 5748, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.951371 }, { "epoch": 0.18675892538089206, "grad_norm": 0.6068955063819885, "learning_rate": 9.497493872503256e-06, "loss": 0.04345201700925827, "memory(GiB)": 21.32, "step": 5749, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.951406 }, { "epoch": 0.18679141084364748, "grad_norm": 0.6922162175178528, "learning_rate": 9.497259151926837e-06, "loss": 0.048439182341098785, "memory(GiB)": 21.32, "step": 5750, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95144 }, { "epoch": 0.1868238963064029, "grad_norm": 0.6352069973945618, "learning_rate": 9.49702437944606e-06, "loss": 0.04311135783791542, "memory(GiB)": 21.32, "step": 5751, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.951474 }, { "epoch": 0.1868563817691583, "grad_norm": 0.622092604637146, "learning_rate": 9.496789555063637e-06, "loss": 0.04572094976902008, "memory(GiB)": 21.32, "step": 5752, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.951506 }, { "epoch": 0.18688886723191372, "grad_norm": 0.4320179522037506, "learning_rate": 9.496554678782277e-06, "loss": 0.041093602776527405, "memory(GiB)": 21.32, "step": 5753, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.951542 }, { "epoch": 0.18692135269466914, "grad_norm": 1.042983055114746, "learning_rate": 9.496319750604692e-06, "loss": 0.04001271352171898, "memory(GiB)": 21.32, "step": 5754, "token_acc": 0.975, "train_speed(iter/s)": 0.951579 }, { "epoch": 0.18695383815742456, "grad_norm": 0.59864741563797, "learning_rate": 9.496084770533592e-06, "loss": 0.04686370864510536, "memory(GiB)": 21.32, "step": 5755, "token_acc": 0.9731543624161074, "train_speed(iter/s)": 0.951609 }, { "epoch": 0.18698632362017997, "grad_norm": 0.5195028781890869, "learning_rate": 9.49584973857169e-06, "loss": 0.04692612215876579, "memory(GiB)": 21.32, "step": 5756, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.951646 }, { "epoch": 0.1870188090829354, "grad_norm": 0.6011926531791687, "learning_rate": 9.495614654721697e-06, "loss": 0.04731353744864464, "memory(GiB)": 21.32, "step": 5757, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.951679 }, { "epoch": 0.1870512945456908, "grad_norm": 0.547225296497345, "learning_rate": 9.495379518986328e-06, "loss": 0.044425468891859055, "memory(GiB)": 21.32, "step": 5758, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.951715 }, { "epoch": 0.18708378000844622, "grad_norm": 0.6990426182746887, "learning_rate": 9.495144331368295e-06, "loss": 0.046091385185718536, "memory(GiB)": 21.32, "step": 5759, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.951753 }, { "epoch": 0.18711626547120164, "grad_norm": 0.9071236252784729, "learning_rate": 9.494909091870314e-06, "loss": 0.053964823484420776, "memory(GiB)": 21.32, "step": 5760, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.951788 }, { "epoch": 0.18714875093395705, "grad_norm": 0.7731192111968994, "learning_rate": 9.4946738004951e-06, "loss": 0.030876653268933296, "memory(GiB)": 21.32, "step": 5761, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951825 }, { "epoch": 0.18718123639671247, "grad_norm": 0.4566187858581543, "learning_rate": 9.494438457245367e-06, "loss": 0.04571918398141861, "memory(GiB)": 21.32, "step": 5762, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.951861 }, { "epoch": 0.18721372185946789, "grad_norm": 0.55436772108078, "learning_rate": 9.494203062123832e-06, "loss": 0.041825294494628906, "memory(GiB)": 21.32, "step": 5763, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.951896 }, { "epoch": 0.1872462073222233, "grad_norm": 1.0969194173812866, "learning_rate": 9.493967615133213e-06, "loss": 0.04455551132559776, "memory(GiB)": 21.32, "step": 5764, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951923 }, { "epoch": 0.18727869278497872, "grad_norm": 0.7016893029212952, "learning_rate": 9.493732116276224e-06, "loss": 0.05252934992313385, "memory(GiB)": 21.32, "step": 5765, "token_acc": 0.9597069597069597, "train_speed(iter/s)": 0.951953 }, { "epoch": 0.18731117824773413, "grad_norm": 0.8786470293998718, "learning_rate": 9.493496565555588e-06, "loss": 0.04323260113596916, "memory(GiB)": 21.32, "step": 5766, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951981 }, { "epoch": 0.18734366371048955, "grad_norm": 0.4554087221622467, "learning_rate": 9.49326096297402e-06, "loss": 0.036495961248874664, "memory(GiB)": 21.32, "step": 5767, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.952009 }, { "epoch": 0.18737614917324497, "grad_norm": 0.5199366211891174, "learning_rate": 9.49302530853424e-06, "loss": 0.03731516748666763, "memory(GiB)": 21.32, "step": 5768, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.952036 }, { "epoch": 0.18740863463600038, "grad_norm": 0.4478982090950012, "learning_rate": 9.492789602238967e-06, "loss": 0.04090385138988495, "memory(GiB)": 21.32, "step": 5769, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.952063 }, { "epoch": 0.1874411200987558, "grad_norm": 0.5413258671760559, "learning_rate": 9.492553844090922e-06, "loss": 0.043301746249198914, "memory(GiB)": 21.32, "step": 5770, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.952091 }, { "epoch": 0.18747360556151121, "grad_norm": 0.4014348089694977, "learning_rate": 9.492318034092827e-06, "loss": 0.034350618720054626, "memory(GiB)": 21.32, "step": 5771, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.952119 }, { "epoch": 0.18750609102426663, "grad_norm": 0.5955945253372192, "learning_rate": 9.492082172247402e-06, "loss": 0.04740327596664429, "memory(GiB)": 21.32, "step": 5772, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.952147 }, { "epoch": 0.18753857648702205, "grad_norm": 0.7934629321098328, "learning_rate": 9.491846258557368e-06, "loss": 0.05428662896156311, "memory(GiB)": 21.32, "step": 5773, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.952175 }, { "epoch": 0.18757106194977746, "grad_norm": 0.8816479444503784, "learning_rate": 9.491610293025452e-06, "loss": 0.059869758784770966, "memory(GiB)": 21.32, "step": 5774, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.952201 }, { "epoch": 0.18760354741253288, "grad_norm": 0.8033193349838257, "learning_rate": 9.491374275654375e-06, "loss": 0.048614561557769775, "memory(GiB)": 21.32, "step": 5775, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.952227 }, { "epoch": 0.18763603287528832, "grad_norm": 0.555846095085144, "learning_rate": 9.491138206446861e-06, "loss": 0.04835645854473114, "memory(GiB)": 21.32, "step": 5776, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952255 }, { "epoch": 0.18766851833804374, "grad_norm": 0.6470622420310974, "learning_rate": 9.490902085405634e-06, "loss": 0.05495047569274902, "memory(GiB)": 21.32, "step": 5777, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.952281 }, { "epoch": 0.18770100380079915, "grad_norm": 1.1591414213180542, "learning_rate": 9.490665912533419e-06, "loss": 0.051463618874549866, "memory(GiB)": 21.32, "step": 5778, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95231 }, { "epoch": 0.18773348926355457, "grad_norm": 0.7416744232177734, "learning_rate": 9.490429687832941e-06, "loss": 0.0464295968413353, "memory(GiB)": 21.32, "step": 5779, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.952339 }, { "epoch": 0.18776597472631, "grad_norm": 0.6075328588485718, "learning_rate": 9.49019341130693e-06, "loss": 0.04912327975034714, "memory(GiB)": 21.32, "step": 5780, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.952365 }, { "epoch": 0.1877984601890654, "grad_norm": 0.578350841999054, "learning_rate": 9.48995708295811e-06, "loss": 0.04607103765010834, "memory(GiB)": 21.32, "step": 5781, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.952386 }, { "epoch": 0.18783094565182082, "grad_norm": 0.7433334589004517, "learning_rate": 9.48972070278921e-06, "loss": 0.053584642708301544, "memory(GiB)": 21.32, "step": 5782, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.952407 }, { "epoch": 0.18786343111457623, "grad_norm": 0.8592597246170044, "learning_rate": 9.489484270802956e-06, "loss": 0.05492842197418213, "memory(GiB)": 21.32, "step": 5783, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.952428 }, { "epoch": 0.18789591657733165, "grad_norm": 0.6461151838302612, "learning_rate": 9.489247787002076e-06, "loss": 0.054956987500190735, "memory(GiB)": 21.32, "step": 5784, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.952453 }, { "epoch": 0.18792840204008707, "grad_norm": 0.5757251977920532, "learning_rate": 9.489011251389304e-06, "loss": 0.05129000172019005, "memory(GiB)": 21.32, "step": 5785, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.952477 }, { "epoch": 0.18796088750284248, "grad_norm": 0.3841495215892792, "learning_rate": 9.488774663967367e-06, "loss": 0.03955947235226631, "memory(GiB)": 21.32, "step": 5786, "token_acc": 1.0, "train_speed(iter/s)": 0.9525 }, { "epoch": 0.1879933729655979, "grad_norm": 0.5102863311767578, "learning_rate": 9.488538024738996e-06, "loss": 0.047393832355737686, "memory(GiB)": 21.32, "step": 5787, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.952518 }, { "epoch": 0.18802585842835332, "grad_norm": 0.45492470264434814, "learning_rate": 9.488301333706922e-06, "loss": 0.04241315275430679, "memory(GiB)": 21.32, "step": 5788, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.952544 }, { "epoch": 0.18805834389110873, "grad_norm": 0.46867209672927856, "learning_rate": 9.488064590873875e-06, "loss": 0.03774338960647583, "memory(GiB)": 21.32, "step": 5789, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.952571 }, { "epoch": 0.18809082935386415, "grad_norm": 0.4898974597454071, "learning_rate": 9.487827796242592e-06, "loss": 0.05262710899114609, "memory(GiB)": 21.32, "step": 5790, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.9526 }, { "epoch": 0.18812331481661956, "grad_norm": 0.49679628014564514, "learning_rate": 9.4875909498158e-06, "loss": 0.04249080270528793, "memory(GiB)": 21.32, "step": 5791, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.952628 }, { "epoch": 0.18815580027937498, "grad_norm": 0.9830203056335449, "learning_rate": 9.487354051596239e-06, "loss": 0.060577694326639175, "memory(GiB)": 21.32, "step": 5792, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.952655 }, { "epoch": 0.1881882857421304, "grad_norm": 0.561741054058075, "learning_rate": 9.487117101586636e-06, "loss": 0.03745993971824646, "memory(GiB)": 21.32, "step": 5793, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.952679 }, { "epoch": 0.1882207712048858, "grad_norm": 0.5865980982780457, "learning_rate": 9.48688009978973e-06, "loss": 0.04129820317029953, "memory(GiB)": 21.32, "step": 5794, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.952708 }, { "epoch": 0.18825325666764123, "grad_norm": 0.4966242015361786, "learning_rate": 9.486643046208257e-06, "loss": 0.04726746678352356, "memory(GiB)": 21.32, "step": 5795, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952735 }, { "epoch": 0.18828574213039664, "grad_norm": 0.6649475693702698, "learning_rate": 9.486405940844951e-06, "loss": 0.0415620431303978, "memory(GiB)": 21.32, "step": 5796, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.952762 }, { "epoch": 0.18831822759315206, "grad_norm": 1.0145286321640015, "learning_rate": 9.486168783702548e-06, "loss": 0.046831805258989334, "memory(GiB)": 21.32, "step": 5797, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952788 }, { "epoch": 0.18835071305590748, "grad_norm": 0.7691040635108948, "learning_rate": 9.485931574783788e-06, "loss": 0.05335913226008415, "memory(GiB)": 21.32, "step": 5798, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.952815 }, { "epoch": 0.1883831985186629, "grad_norm": 0.888069212436676, "learning_rate": 9.485694314091406e-06, "loss": 0.042926158756017685, "memory(GiB)": 21.32, "step": 5799, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.952843 }, { "epoch": 0.1884156839814183, "grad_norm": 2.4556357860565186, "learning_rate": 9.48545700162814e-06, "loss": 0.05079638957977295, "memory(GiB)": 21.32, "step": 5800, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.952877 }, { "epoch": 0.18844816944417372, "grad_norm": 0.6407943367958069, "learning_rate": 9.485219637396732e-06, "loss": 0.04004526138305664, "memory(GiB)": 21.32, "step": 5801, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.952913 }, { "epoch": 0.18848065490692914, "grad_norm": 0.5902001261711121, "learning_rate": 9.484982221399919e-06, "loss": 0.049330271780490875, "memory(GiB)": 21.32, "step": 5802, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.952949 }, { "epoch": 0.18851314036968456, "grad_norm": 0.6821351647377014, "learning_rate": 9.48474475364044e-06, "loss": 0.042319588363170624, "memory(GiB)": 21.32, "step": 5803, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952983 }, { "epoch": 0.18854562583243997, "grad_norm": 0.7001079320907593, "learning_rate": 9.48450723412104e-06, "loss": 0.04095941781997681, "memory(GiB)": 21.32, "step": 5804, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.953016 }, { "epoch": 0.1885781112951954, "grad_norm": 0.4971589148044586, "learning_rate": 9.484269662844456e-06, "loss": 0.045612089335918427, "memory(GiB)": 21.32, "step": 5805, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.953049 }, { "epoch": 0.1886105967579508, "grad_norm": 0.46374014019966125, "learning_rate": 9.484032039813432e-06, "loss": 0.0406646654009819, "memory(GiB)": 21.32, "step": 5806, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.953086 }, { "epoch": 0.18864308222070622, "grad_norm": 0.8797656893730164, "learning_rate": 9.48379436503071e-06, "loss": 0.05883839353919029, "memory(GiB)": 21.32, "step": 5807, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.95312 }, { "epoch": 0.18867556768346166, "grad_norm": 0.5149481892585754, "learning_rate": 9.483556638499033e-06, "loss": 0.04145772010087967, "memory(GiB)": 21.32, "step": 5808, "token_acc": 0.984, "train_speed(iter/s)": 0.953155 }, { "epoch": 0.18870805314621708, "grad_norm": 0.4035540819168091, "learning_rate": 9.483318860221144e-06, "loss": 0.035110700875520706, "memory(GiB)": 21.32, "step": 5809, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953189 }, { "epoch": 0.1887405386089725, "grad_norm": 0.8195948600769043, "learning_rate": 9.48308103019979e-06, "loss": 0.048529576510190964, "memory(GiB)": 21.32, "step": 5810, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.953226 }, { "epoch": 0.1887730240717279, "grad_norm": 0.44866663217544556, "learning_rate": 9.482843148437712e-06, "loss": 0.04155447334051132, "memory(GiB)": 21.32, "step": 5811, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95326 }, { "epoch": 0.18880550953448333, "grad_norm": 0.807105302810669, "learning_rate": 9.48260521493766e-06, "loss": 0.051833152770996094, "memory(GiB)": 21.32, "step": 5812, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953294 }, { "epoch": 0.18883799499723875, "grad_norm": 0.6548435091972351, "learning_rate": 9.482367229702375e-06, "loss": 0.0464446097612381, "memory(GiB)": 21.32, "step": 5813, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.95333 }, { "epoch": 0.18887048045999416, "grad_norm": 0.6541356444358826, "learning_rate": 9.482129192734607e-06, "loss": 0.04406440258026123, "memory(GiB)": 21.32, "step": 5814, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.953364 }, { "epoch": 0.18890296592274958, "grad_norm": 0.48539653420448303, "learning_rate": 9.481891104037101e-06, "loss": 0.03232968598604202, "memory(GiB)": 21.32, "step": 5815, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953398 }, { "epoch": 0.188935451385505, "grad_norm": 0.5536389350891113, "learning_rate": 9.481652963612609e-06, "loss": 0.0411432608962059, "memory(GiB)": 21.32, "step": 5816, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.953433 }, { "epoch": 0.1889679368482604, "grad_norm": 0.7421273589134216, "learning_rate": 9.481414771463875e-06, "loss": 0.04383175075054169, "memory(GiB)": 21.32, "step": 5817, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953469 }, { "epoch": 0.18900042231101583, "grad_norm": 1.34373939037323, "learning_rate": 9.48117652759365e-06, "loss": 0.04409132897853851, "memory(GiB)": 21.32, "step": 5818, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.953504 }, { "epoch": 0.18903290777377124, "grad_norm": 1.5483086109161377, "learning_rate": 9.480938232004684e-06, "loss": 0.05831605941057205, "memory(GiB)": 21.32, "step": 5819, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.953538 }, { "epoch": 0.18906539323652666, "grad_norm": 0.6680203080177307, "learning_rate": 9.480699884699726e-06, "loss": 0.04726763814687729, "memory(GiB)": 21.32, "step": 5820, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953575 }, { "epoch": 0.18909787869928207, "grad_norm": 0.7973527312278748, "learning_rate": 9.480461485681528e-06, "loss": 0.05418279021978378, "memory(GiB)": 21.32, "step": 5821, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.953608 }, { "epoch": 0.1891303641620375, "grad_norm": 1.3674625158309937, "learning_rate": 9.48022303495284e-06, "loss": 0.05657441169023514, "memory(GiB)": 21.32, "step": 5822, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953642 }, { "epoch": 0.1891628496247929, "grad_norm": 0.576871395111084, "learning_rate": 9.479984532516417e-06, "loss": 0.05811760947108269, "memory(GiB)": 21.32, "step": 5823, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.953676 }, { "epoch": 0.18919533508754832, "grad_norm": 1.0971910953521729, "learning_rate": 9.479745978375008e-06, "loss": 0.04053322225809097, "memory(GiB)": 21.32, "step": 5824, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.953712 }, { "epoch": 0.18922782055030374, "grad_norm": 0.695626974105835, "learning_rate": 9.479507372531369e-06, "loss": 0.05700775608420372, "memory(GiB)": 21.32, "step": 5825, "token_acc": 0.956, "train_speed(iter/s)": 0.953747 }, { "epoch": 0.18926030601305915, "grad_norm": 0.47558024525642395, "learning_rate": 9.479268714988253e-06, "loss": 0.029468411579728127, "memory(GiB)": 21.32, "step": 5826, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.953775 }, { "epoch": 0.18929279147581457, "grad_norm": 0.6227778792381287, "learning_rate": 9.479030005748412e-06, "loss": 0.05140943080186844, "memory(GiB)": 21.32, "step": 5827, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.953803 }, { "epoch": 0.18932527693857, "grad_norm": 0.6122519373893738, "learning_rate": 9.478791244814604e-06, "loss": 0.049068812280893326, "memory(GiB)": 21.32, "step": 5828, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.953829 }, { "epoch": 0.1893577624013254, "grad_norm": 0.6987624764442444, "learning_rate": 9.478552432189584e-06, "loss": 0.03732144087553024, "memory(GiB)": 21.32, "step": 5829, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.953857 }, { "epoch": 0.18939024786408082, "grad_norm": 0.4857792258262634, "learning_rate": 9.478313567876108e-06, "loss": 0.04214077070355415, "memory(GiB)": 21.32, "step": 5830, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.953884 }, { "epoch": 0.18942273332683623, "grad_norm": 0.7529966831207275, "learning_rate": 9.478074651876934e-06, "loss": 0.05550704896450043, "memory(GiB)": 21.32, "step": 5831, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.95391 }, { "epoch": 0.18945521878959165, "grad_norm": 0.742716372013092, "learning_rate": 9.477835684194815e-06, "loss": 0.0600990355014801, "memory(GiB)": 21.32, "step": 5832, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.953937 }, { "epoch": 0.18948770425234707, "grad_norm": 0.537074863910675, "learning_rate": 9.477596664832516e-06, "loss": 0.037136711180210114, "memory(GiB)": 21.32, "step": 5833, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.953962 }, { "epoch": 0.18952018971510248, "grad_norm": 1.1952236890792847, "learning_rate": 9.47735759379279e-06, "loss": 0.04539337009191513, "memory(GiB)": 21.32, "step": 5834, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953989 }, { "epoch": 0.1895526751778579, "grad_norm": 0.5785682797431946, "learning_rate": 9.477118471078399e-06, "loss": 0.04191511869430542, "memory(GiB)": 21.32, "step": 5835, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.954019 }, { "epoch": 0.18958516064061331, "grad_norm": 0.5584440231323242, "learning_rate": 9.4768792966921e-06, "loss": 0.038925059139728546, "memory(GiB)": 21.32, "step": 5836, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954046 }, { "epoch": 0.18961764610336873, "grad_norm": 0.5351327061653137, "learning_rate": 9.476640070636657e-06, "loss": 0.04624275118112564, "memory(GiB)": 21.32, "step": 5837, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.18965013156612415, "grad_norm": 0.6346611976623535, "learning_rate": 9.476400792914828e-06, "loss": 0.0408078096807003, "memory(GiB)": 21.32, "step": 5838, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.954103 }, { "epoch": 0.18968261702887956, "grad_norm": 0.7779136896133423, "learning_rate": 9.476161463529376e-06, "loss": 0.05113273859024048, "memory(GiB)": 21.32, "step": 5839, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.95413 }, { "epoch": 0.189715102491635, "grad_norm": 0.9498361349105835, "learning_rate": 9.475922082483063e-06, "loss": 0.05630582943558693, "memory(GiB)": 21.32, "step": 5840, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954153 }, { "epoch": 0.18974758795439042, "grad_norm": 0.5837224721908569, "learning_rate": 9.475682649778653e-06, "loss": 0.03315211459994316, "memory(GiB)": 21.32, "step": 5841, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.954177 }, { "epoch": 0.18978007341714584, "grad_norm": 2.9422731399536133, "learning_rate": 9.475443165418906e-06, "loss": 0.044822171330451965, "memory(GiB)": 21.32, "step": 5842, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.954203 }, { "epoch": 0.18981255887990126, "grad_norm": 0.6983242034912109, "learning_rate": 9.475203629406588e-06, "loss": 0.04212741553783417, "memory(GiB)": 21.32, "step": 5843, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954229 }, { "epoch": 0.18984504434265667, "grad_norm": 0.584841787815094, "learning_rate": 9.474964041744465e-06, "loss": 0.04346618801355362, "memory(GiB)": 21.32, "step": 5844, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.95425 }, { "epoch": 0.1898775298054121, "grad_norm": 0.8979945778846741, "learning_rate": 9.4747244024353e-06, "loss": 0.0473153181374073, "memory(GiB)": 21.32, "step": 5845, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.954271 }, { "epoch": 0.1899100152681675, "grad_norm": 0.6046831011772156, "learning_rate": 9.474484711481862e-06, "loss": 0.044392671436071396, "memory(GiB)": 21.32, "step": 5846, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.954293 }, { "epoch": 0.18994250073092292, "grad_norm": 0.43857541680336, "learning_rate": 9.474244968886912e-06, "loss": 0.04743655025959015, "memory(GiB)": 21.32, "step": 5847, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954315 }, { "epoch": 0.18997498619367834, "grad_norm": 0.7185441851615906, "learning_rate": 9.47400517465322e-06, "loss": 0.03830336406826973, "memory(GiB)": 21.32, "step": 5848, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95434 }, { "epoch": 0.19000747165643375, "grad_norm": 0.5548007488250732, "learning_rate": 9.473765328783555e-06, "loss": 0.04361026734113693, "memory(GiB)": 21.32, "step": 5849, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954363 }, { "epoch": 0.19003995711918917, "grad_norm": 0.8768612146377563, "learning_rate": 9.473525431280683e-06, "loss": 0.056061044335365295, "memory(GiB)": 21.32, "step": 5850, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.954388 }, { "epoch": 0.19007244258194458, "grad_norm": 1.273486614227295, "learning_rate": 9.473285482147371e-06, "loss": 0.04872459918260574, "memory(GiB)": 21.32, "step": 5851, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.1901049280447, "grad_norm": 0.6562582850456238, "learning_rate": 9.473045481386393e-06, "loss": 0.04582471400499344, "memory(GiB)": 21.32, "step": 5852, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.954444 }, { "epoch": 0.19013741350745542, "grad_norm": 0.6298514604568481, "learning_rate": 9.472805429000518e-06, "loss": 0.05544604733586311, "memory(GiB)": 21.32, "step": 5853, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.95447 }, { "epoch": 0.19016989897021083, "grad_norm": 1.1706355810165405, "learning_rate": 9.472565324992514e-06, "loss": 0.04646557569503784, "memory(GiB)": 21.32, "step": 5854, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.954499 }, { "epoch": 0.19020238443296625, "grad_norm": 0.5740514397621155, "learning_rate": 9.47232516936515e-06, "loss": 0.03601185232400894, "memory(GiB)": 21.32, "step": 5855, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.954527 }, { "epoch": 0.19023486989572166, "grad_norm": 0.6155999302864075, "learning_rate": 9.472084962121204e-06, "loss": 0.04055505245923996, "memory(GiB)": 21.32, "step": 5856, "token_acc": 0.9669811320754716, "train_speed(iter/s)": 0.954556 }, { "epoch": 0.19026735535847708, "grad_norm": 0.5684044361114502, "learning_rate": 9.471844703263445e-06, "loss": 0.05004052817821503, "memory(GiB)": 21.32, "step": 5857, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.954578 }, { "epoch": 0.1902998408212325, "grad_norm": 0.4643251299858093, "learning_rate": 9.471604392794645e-06, "loss": 0.04409152641892433, "memory(GiB)": 21.32, "step": 5858, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954605 }, { "epoch": 0.1903323262839879, "grad_norm": 1.029479742050171, "learning_rate": 9.471364030717579e-06, "loss": 0.043901365250349045, "memory(GiB)": 21.32, "step": 5859, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.954636 }, { "epoch": 0.19036481174674333, "grad_norm": 0.5141513347625732, "learning_rate": 9.471123617035021e-06, "loss": 0.04150740057229996, "memory(GiB)": 21.32, "step": 5860, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.954671 }, { "epoch": 0.19039729720949874, "grad_norm": 1.4926594495773315, "learning_rate": 9.470883151749745e-06, "loss": 0.03917033225297928, "memory(GiB)": 21.32, "step": 5861, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.954706 }, { "epoch": 0.19042978267225416, "grad_norm": 0.5719978213310242, "learning_rate": 9.470642634864528e-06, "loss": 0.047299571335315704, "memory(GiB)": 21.32, "step": 5862, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.95474 }, { "epoch": 0.19046226813500958, "grad_norm": 0.7598872184753418, "learning_rate": 9.470402066382142e-06, "loss": 0.049053847789764404, "memory(GiB)": 21.32, "step": 5863, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.954773 }, { "epoch": 0.190494753597765, "grad_norm": 0.5990457534790039, "learning_rate": 9.470161446305368e-06, "loss": 0.04584914445877075, "memory(GiB)": 21.32, "step": 5864, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.1905272390605204, "grad_norm": 0.40239909291267395, "learning_rate": 9.469920774636982e-06, "loss": 0.045750692486763, "memory(GiB)": 21.32, "step": 5865, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.954841 }, { "epoch": 0.19055972452327583, "grad_norm": 12.493990898132324, "learning_rate": 9.469680051379759e-06, "loss": 0.06377995759248734, "memory(GiB)": 21.32, "step": 5866, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.954877 }, { "epoch": 0.19059220998603124, "grad_norm": 2.569634199142456, "learning_rate": 9.46943927653648e-06, "loss": 0.047110699117183685, "memory(GiB)": 21.32, "step": 5867, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.954911 }, { "epoch": 0.19062469544878666, "grad_norm": 0.5379703640937805, "learning_rate": 9.469198450109922e-06, "loss": 0.04391159489750862, "memory(GiB)": 21.32, "step": 5868, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.954943 }, { "epoch": 0.19065718091154207, "grad_norm": 0.6146577596664429, "learning_rate": 9.468957572102866e-06, "loss": 0.046478889882564545, "memory(GiB)": 21.32, "step": 5869, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.1906896663742975, "grad_norm": 1.4482550621032715, "learning_rate": 9.468716642518092e-06, "loss": 0.04954744875431061, "memory(GiB)": 21.32, "step": 5870, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.955012 }, { "epoch": 0.1907221518370529, "grad_norm": 0.4923593997955322, "learning_rate": 9.468475661358378e-06, "loss": 0.034282658249139786, "memory(GiB)": 21.32, "step": 5871, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955047 }, { "epoch": 0.19075463729980835, "grad_norm": 0.5806888937950134, "learning_rate": 9.46823462862651e-06, "loss": 0.04150393605232239, "memory(GiB)": 21.32, "step": 5872, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.955081 }, { "epoch": 0.19078712276256377, "grad_norm": 0.6574488282203674, "learning_rate": 9.467993544325267e-06, "loss": 0.039806656539440155, "memory(GiB)": 21.32, "step": 5873, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955116 }, { "epoch": 0.19081960822531918, "grad_norm": 0.49349573254585266, "learning_rate": 9.467752408457429e-06, "loss": 0.04888102412223816, "memory(GiB)": 21.32, "step": 5874, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.1908520936880746, "grad_norm": 0.7107898592948914, "learning_rate": 9.467511221025784e-06, "loss": 0.04472462832927704, "memory(GiB)": 21.32, "step": 5875, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.19088457915083, "grad_norm": 0.5697142481803894, "learning_rate": 9.467269982033113e-06, "loss": 0.04425269737839699, "memory(GiB)": 21.32, "step": 5876, "token_acc": 0.9630872483221476, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.19091706461358543, "grad_norm": 0.5557473301887512, "learning_rate": 9.4670286914822e-06, "loss": 0.04917501285672188, "memory(GiB)": 21.32, "step": 5877, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955251 }, { "epoch": 0.19094955007634085, "grad_norm": 0.5307846665382385, "learning_rate": 9.46678734937583e-06, "loss": 0.04122302681207657, "memory(GiB)": 21.32, "step": 5878, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.19098203553909626, "grad_norm": 0.665793240070343, "learning_rate": 9.46654595571679e-06, "loss": 0.049559589475393295, "memory(GiB)": 21.32, "step": 5879, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955319 }, { "epoch": 0.19101452100185168, "grad_norm": 0.5080304145812988, "learning_rate": 9.466304510507862e-06, "loss": 0.04550100862979889, "memory(GiB)": 21.32, "step": 5880, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.1910470064646071, "grad_norm": 0.5971795320510864, "learning_rate": 9.466063013751839e-06, "loss": 0.04249711334705353, "memory(GiB)": 21.32, "step": 5881, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.1910794919273625, "grad_norm": 0.574702262878418, "learning_rate": 9.465821465451502e-06, "loss": 0.044176869094371796, "memory(GiB)": 21.32, "step": 5882, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.955424 }, { "epoch": 0.19111197739011793, "grad_norm": 0.39148572087287903, "learning_rate": 9.465579865609642e-06, "loss": 0.03587118536233902, "memory(GiB)": 21.32, "step": 5883, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.19114446285287334, "grad_norm": 0.5558246374130249, "learning_rate": 9.465338214229047e-06, "loss": 0.04385092109441757, "memory(GiB)": 21.32, "step": 5884, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.19117694831562876, "grad_norm": 0.5966486930847168, "learning_rate": 9.465096511312503e-06, "loss": 0.035497698932886124, "memory(GiB)": 21.32, "step": 5885, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.955531 }, { "epoch": 0.19120943377838417, "grad_norm": 0.5650927424430847, "learning_rate": 9.464854756862806e-06, "loss": 0.03581242263317108, "memory(GiB)": 21.32, "step": 5886, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.1912419192411396, "grad_norm": 1.0319085121154785, "learning_rate": 9.464612950882741e-06, "loss": 0.054565731436014175, "memory(GiB)": 21.32, "step": 5887, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.955599 }, { "epoch": 0.191274404703895, "grad_norm": 0.7496288418769836, "learning_rate": 9.4643710933751e-06, "loss": 0.053632959723472595, "memory(GiB)": 21.32, "step": 5888, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.19130689016665042, "grad_norm": 0.5510858297348022, "learning_rate": 9.464129184342675e-06, "loss": 0.03860790655016899, "memory(GiB)": 21.32, "step": 5889, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.19133937562940584, "grad_norm": 0.5627225041389465, "learning_rate": 9.463887223788257e-06, "loss": 0.0495116263628006, "memory(GiB)": 21.32, "step": 5890, "token_acc": 0.98046875, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.19137186109216126, "grad_norm": 0.5625784993171692, "learning_rate": 9.463645211714638e-06, "loss": 0.05346065014600754, "memory(GiB)": 21.32, "step": 5891, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.955711 }, { "epoch": 0.19140434655491667, "grad_norm": 0.7086191773414612, "learning_rate": 9.463403148124614e-06, "loss": 0.044897060841321945, "memory(GiB)": 21.32, "step": 5892, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.955739 }, { "epoch": 0.1914368320176721, "grad_norm": 0.8061764240264893, "learning_rate": 9.463161033020976e-06, "loss": 0.0584242045879364, "memory(GiB)": 21.32, "step": 5893, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.1914693174804275, "grad_norm": 0.5883035063743591, "learning_rate": 9.462918866406518e-06, "loss": 0.04132721573114395, "memory(GiB)": 21.32, "step": 5894, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.19150180294318292, "grad_norm": 0.5609215497970581, "learning_rate": 9.462676648284038e-06, "loss": 0.043015457689762115, "memory(GiB)": 21.32, "step": 5895, "token_acc": 0.984, "train_speed(iter/s)": 0.955817 }, { "epoch": 0.19153428840593834, "grad_norm": 0.6659226417541504, "learning_rate": 9.462434378656329e-06, "loss": 0.04545211046934128, "memory(GiB)": 21.32, "step": 5896, "token_acc": 0.952191235059761, "train_speed(iter/s)": 0.955843 }, { "epoch": 0.19156677386869375, "grad_norm": 0.4928906261920929, "learning_rate": 9.462192057526189e-06, "loss": 0.042412206530570984, "memory(GiB)": 21.32, "step": 5897, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.19159925933144917, "grad_norm": 1.1928701400756836, "learning_rate": 9.461949684896412e-06, "loss": 0.044889312237501144, "memory(GiB)": 21.32, "step": 5898, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.19163174479420458, "grad_norm": 0.5628401041030884, "learning_rate": 9.461707260769796e-06, "loss": 0.05076698958873749, "memory(GiB)": 21.32, "step": 5899, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.19166423025696, "grad_norm": 0.7026461958885193, "learning_rate": 9.461464785149141e-06, "loss": 0.052927855402231216, "memory(GiB)": 21.32, "step": 5900, "token_acc": 0.984375, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.19169671571971542, "grad_norm": 0.4671378433704376, "learning_rate": 9.461222258037244e-06, "loss": 0.0390111543238163, "memory(GiB)": 21.32, "step": 5901, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.19172920118247083, "grad_norm": 0.6005792617797852, "learning_rate": 9.460979679436905e-06, "loss": 0.04531487822532654, "memory(GiB)": 21.32, "step": 5902, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.19176168664522625, "grad_norm": 0.48837393522262573, "learning_rate": 9.460737049350921e-06, "loss": 0.031107822433114052, "memory(GiB)": 21.32, "step": 5903, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.1917941721079817, "grad_norm": 0.8074917793273926, "learning_rate": 9.460494367782096e-06, "loss": 0.04332583397626877, "memory(GiB)": 21.32, "step": 5904, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.1918266575707371, "grad_norm": 0.8034325242042542, "learning_rate": 9.46025163473323e-06, "loss": 0.059671565890312195, "memory(GiB)": 21.32, "step": 5905, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.19185914303349252, "grad_norm": 0.581407904624939, "learning_rate": 9.460008850207121e-06, "loss": 0.04274478554725647, "memory(GiB)": 21.32, "step": 5906, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.19189162849624794, "grad_norm": 0.8023191094398499, "learning_rate": 9.459766014206575e-06, "loss": 0.04814809560775757, "memory(GiB)": 21.32, "step": 5907, "token_acc": 1.0, "train_speed(iter/s)": 0.956093 }, { "epoch": 0.19192411395900336, "grad_norm": 0.8270050883293152, "learning_rate": 9.459523126734394e-06, "loss": 0.04963334649801254, "memory(GiB)": 21.32, "step": 5908, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.19195659942175877, "grad_norm": 1.9172754287719727, "learning_rate": 9.459280187793379e-06, "loss": 0.04742131382226944, "memory(GiB)": 21.32, "step": 5909, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.1919890848845142, "grad_norm": 0.6267459988594055, "learning_rate": 9.459037197386336e-06, "loss": 0.06066654622554779, "memory(GiB)": 21.32, "step": 5910, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.956161 }, { "epoch": 0.1920215703472696, "grad_norm": 0.6172986626625061, "learning_rate": 9.45879415551607e-06, "loss": 0.045745983719825745, "memory(GiB)": 21.32, "step": 5911, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956184 }, { "epoch": 0.19205405581002502, "grad_norm": 0.5119460821151733, "learning_rate": 9.458551062185382e-06, "loss": 0.039007388055324554, "memory(GiB)": 21.32, "step": 5912, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.19208654127278044, "grad_norm": 0.5711960792541504, "learning_rate": 9.458307917397082e-06, "loss": 0.03960704430937767, "memory(GiB)": 21.32, "step": 5913, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.956237 }, { "epoch": 0.19211902673553585, "grad_norm": 0.6498565077781677, "learning_rate": 9.458064721153975e-06, "loss": 0.043619122356176376, "memory(GiB)": 21.32, "step": 5914, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.19215151219829127, "grad_norm": 0.6408010721206665, "learning_rate": 9.457821473458866e-06, "loss": 0.05336839705705643, "memory(GiB)": 21.32, "step": 5915, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.19218399766104668, "grad_norm": 0.5895570516586304, "learning_rate": 9.457578174314565e-06, "loss": 0.04204259812831879, "memory(GiB)": 21.32, "step": 5916, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.1922164831238021, "grad_norm": 0.6327896118164062, "learning_rate": 9.457334823723878e-06, "loss": 0.04691283404827118, "memory(GiB)": 21.32, "step": 5917, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.19224896858655752, "grad_norm": 0.43383368849754333, "learning_rate": 9.457091421689615e-06, "loss": 0.0351511612534523, "memory(GiB)": 21.32, "step": 5918, "token_acc": 0.9744525547445255, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.19228145404931293, "grad_norm": 0.5811368227005005, "learning_rate": 9.456847968214586e-06, "loss": 0.038545940071344376, "memory(GiB)": 21.32, "step": 5919, "token_acc": 0.996, "train_speed(iter/s)": 0.956398 }, { "epoch": 0.19231393951206835, "grad_norm": 0.44588014483451843, "learning_rate": 9.456604463301597e-06, "loss": 0.037677668035030365, "memory(GiB)": 21.32, "step": 5920, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.19234642497482377, "grad_norm": 0.5002908706665039, "learning_rate": 9.456360906953463e-06, "loss": 0.0417666882276535, "memory(GiB)": 21.32, "step": 5921, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.956464 }, { "epoch": 0.19237891043757918, "grad_norm": 0.5197639465332031, "learning_rate": 9.456117299172991e-06, "loss": 0.03887665271759033, "memory(GiB)": 21.32, "step": 5922, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.956499 }, { "epoch": 0.1924113959003346, "grad_norm": 0.6164962649345398, "learning_rate": 9.455873639962995e-06, "loss": 0.04916652664542198, "memory(GiB)": 21.32, "step": 5923, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956533 }, { "epoch": 0.19244388136309, "grad_norm": 0.7328988909721375, "learning_rate": 9.455629929326287e-06, "loss": 0.044076647609472275, "memory(GiB)": 21.32, "step": 5924, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.956565 }, { "epoch": 0.19247636682584543, "grad_norm": 0.48796236515045166, "learning_rate": 9.455386167265677e-06, "loss": 0.03915741294622421, "memory(GiB)": 21.32, "step": 5925, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.9566 }, { "epoch": 0.19250885228860085, "grad_norm": 0.5246307849884033, "learning_rate": 9.455142353783983e-06, "loss": 0.04565640911459923, "memory(GiB)": 21.32, "step": 5926, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.19254133775135626, "grad_norm": 2.57401704788208, "learning_rate": 9.454898488884015e-06, "loss": 0.0650845468044281, "memory(GiB)": 21.32, "step": 5927, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.19257382321411168, "grad_norm": 0.5811452269554138, "learning_rate": 9.454654572568589e-06, "loss": 0.04736894369125366, "memory(GiB)": 21.32, "step": 5928, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.1926063086768671, "grad_norm": 1.8157302141189575, "learning_rate": 9.45441060484052e-06, "loss": 0.04892968013882637, "memory(GiB)": 21.32, "step": 5929, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.1926387941396225, "grad_norm": 0.688025951385498, "learning_rate": 9.454166585702626e-06, "loss": 0.04417119547724724, "memory(GiB)": 21.32, "step": 5930, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.19267127960237793, "grad_norm": 1.0049338340759277, "learning_rate": 9.453922515157718e-06, "loss": 0.0445638969540596, "memory(GiB)": 21.32, "step": 5931, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.19270376506513334, "grad_norm": 0.7859216332435608, "learning_rate": 9.453678393208619e-06, "loss": 0.044450219720602036, "memory(GiB)": 21.32, "step": 5932, "token_acc": 0.975, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.19273625052788876, "grad_norm": 0.7681913375854492, "learning_rate": 9.453434219858141e-06, "loss": 0.04373089596629143, "memory(GiB)": 21.32, "step": 5933, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956869 }, { "epoch": 0.19276873599064417, "grad_norm": 0.5058401226997375, "learning_rate": 9.453189995109107e-06, "loss": 0.03367748111486435, "memory(GiB)": 21.32, "step": 5934, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.1928012214533996, "grad_norm": 0.7086091041564941, "learning_rate": 9.45294571896433e-06, "loss": 0.041069161146879196, "memory(GiB)": 21.32, "step": 5935, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.19283370691615503, "grad_norm": 0.48852255940437317, "learning_rate": 9.452701391426635e-06, "loss": 0.039621174335479736, "memory(GiB)": 21.32, "step": 5936, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.19286619237891045, "grad_norm": 2.8815739154815674, "learning_rate": 9.452457012498839e-06, "loss": 0.052776090800762177, "memory(GiB)": 21.32, "step": 5937, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.19289867784166587, "grad_norm": 0.6543229818344116, "learning_rate": 9.452212582183765e-06, "loss": 0.04574809968471527, "memory(GiB)": 21.32, "step": 5938, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.19293116330442128, "grad_norm": 0.7350518703460693, "learning_rate": 9.451968100484228e-06, "loss": 0.0486595518887043, "memory(GiB)": 21.32, "step": 5939, "token_acc": 0.9786476868327402, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.1929636487671767, "grad_norm": 0.7053218483924866, "learning_rate": 9.451723567403056e-06, "loss": 0.04690675064921379, "memory(GiB)": 21.32, "step": 5940, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.19299613422993211, "grad_norm": 0.5672371983528137, "learning_rate": 9.451478982943067e-06, "loss": 0.04431603103876114, "memory(GiB)": 21.32, "step": 5941, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.957142 }, { "epoch": 0.19302861969268753, "grad_norm": 0.5458453297615051, "learning_rate": 9.451234347107087e-06, "loss": 0.03914587199687958, "memory(GiB)": 21.32, "step": 5942, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.19306110515544295, "grad_norm": 0.673854649066925, "learning_rate": 9.450989659897938e-06, "loss": 0.046743523329496384, "memory(GiB)": 21.32, "step": 5943, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.19309359061819836, "grad_norm": 0.9409865736961365, "learning_rate": 9.450744921318443e-06, "loss": 0.05269225686788559, "memory(GiB)": 21.32, "step": 5944, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.19312607608095378, "grad_norm": 0.5600282549858093, "learning_rate": 9.45050013137143e-06, "loss": 0.04584294930100441, "memory(GiB)": 21.32, "step": 5945, "token_acc": 0.984, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.1931585615437092, "grad_norm": 0.8332111835479736, "learning_rate": 9.45025529005972e-06, "loss": 0.050717659294605255, "memory(GiB)": 21.32, "step": 5946, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.957306 }, { "epoch": 0.1931910470064646, "grad_norm": 0.5160880088806152, "learning_rate": 9.45001039738614e-06, "loss": 0.050143592059612274, "memory(GiB)": 21.32, "step": 5947, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.957341 }, { "epoch": 0.19322353246922003, "grad_norm": 0.6994000673294067, "learning_rate": 9.449765453353519e-06, "loss": 0.05256613716483116, "memory(GiB)": 21.32, "step": 5948, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.19325601793197544, "grad_norm": 0.45911020040512085, "learning_rate": 9.44952045796468e-06, "loss": 0.044305913150310516, "memory(GiB)": 21.32, "step": 5949, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.19328850339473086, "grad_norm": 0.5180705189704895, "learning_rate": 9.449275411222454e-06, "loss": 0.03818492591381073, "memory(GiB)": 21.32, "step": 5950, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.19332098885748628, "grad_norm": 0.646438479423523, "learning_rate": 9.449030313129669e-06, "loss": 0.04268548637628555, "memory(GiB)": 21.32, "step": 5951, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.1933534743202417, "grad_norm": 0.5479587912559509, "learning_rate": 9.44878516368915e-06, "loss": 0.03876142203807831, "memory(GiB)": 21.32, "step": 5952, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957493 }, { "epoch": 0.1933859597829971, "grad_norm": 0.4734667241573334, "learning_rate": 9.448539962903728e-06, "loss": 0.04275984317064285, "memory(GiB)": 21.32, "step": 5953, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.19341844524575252, "grad_norm": 0.4775310456752777, "learning_rate": 9.448294710776238e-06, "loss": 0.032312750816345215, "memory(GiB)": 21.32, "step": 5954, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.19345093070850794, "grad_norm": 1.0173836946487427, "learning_rate": 9.448049407309503e-06, "loss": 0.046519435942173004, "memory(GiB)": 21.32, "step": 5955, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.19348341617126336, "grad_norm": 0.7318805456161499, "learning_rate": 9.44780405250636e-06, "loss": 0.0565192736685276, "memory(GiB)": 21.32, "step": 5956, "token_acc": 0.9785714285714285, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.19351590163401877, "grad_norm": 0.9808566570281982, "learning_rate": 9.447558646369635e-06, "loss": 0.062054574489593506, "memory(GiB)": 21.32, "step": 5957, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.1935483870967742, "grad_norm": 0.5178576707839966, "learning_rate": 9.447313188902166e-06, "loss": 0.044467996805906296, "memory(GiB)": 21.32, "step": 5958, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957651 }, { "epoch": 0.1935808725595296, "grad_norm": 0.47347134351730347, "learning_rate": 9.447067680106783e-06, "loss": 0.039733849465847015, "memory(GiB)": 21.32, "step": 5959, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.19361335802228502, "grad_norm": 0.7832266688346863, "learning_rate": 9.44682211998632e-06, "loss": 0.04395353049039841, "memory(GiB)": 21.32, "step": 5960, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.19364584348504044, "grad_norm": 0.6145427823066711, "learning_rate": 9.44657650854361e-06, "loss": 0.04784882068634033, "memory(GiB)": 21.32, "step": 5961, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.19367832894779585, "grad_norm": 0.5379934906959534, "learning_rate": 9.44633084578149e-06, "loss": 0.043107546865940094, "memory(GiB)": 21.32, "step": 5962, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.19371081441055127, "grad_norm": 0.47962749004364014, "learning_rate": 9.446085131702793e-06, "loss": 0.043223775923252106, "memory(GiB)": 21.32, "step": 5963, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.19374329987330668, "grad_norm": 0.681981086730957, "learning_rate": 9.445839366310357e-06, "loss": 0.05572976544499397, "memory(GiB)": 21.32, "step": 5964, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.957787 }, { "epoch": 0.1937757853360621, "grad_norm": 0.5293195247650146, "learning_rate": 9.445593549607017e-06, "loss": 0.036998242139816284, "memory(GiB)": 21.32, "step": 5965, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957808 }, { "epoch": 0.19380827079881752, "grad_norm": 0.45130911469459534, "learning_rate": 9.44534768159561e-06, "loss": 0.047546595335006714, "memory(GiB)": 21.32, "step": 5966, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957831 }, { "epoch": 0.19384075626157293, "grad_norm": 0.5359448194503784, "learning_rate": 9.445101762278974e-06, "loss": 0.039605364203453064, "memory(GiB)": 21.32, "step": 5967, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.19387324172432838, "grad_norm": 0.37154528498649597, "learning_rate": 9.444855791659947e-06, "loss": 0.040228135883808136, "memory(GiB)": 21.32, "step": 5968, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957873 }, { "epoch": 0.1939057271870838, "grad_norm": 0.6166698336601257, "learning_rate": 9.44460976974137e-06, "loss": 0.03973335400223732, "memory(GiB)": 21.32, "step": 5969, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957895 }, { "epoch": 0.1939382126498392, "grad_norm": 0.4442439675331116, "learning_rate": 9.444363696526078e-06, "loss": 0.03985193371772766, "memory(GiB)": 21.32, "step": 5970, "token_acc": 0.9672727272727273, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.19397069811259463, "grad_norm": 0.5361957550048828, "learning_rate": 9.444117572016912e-06, "loss": 0.04152847081422806, "memory(GiB)": 21.32, "step": 5971, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.95794 }, { "epoch": 0.19400318357535004, "grad_norm": 1.832442283630371, "learning_rate": 9.443871396216719e-06, "loss": 0.058147937059402466, "memory(GiB)": 21.32, "step": 5972, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.19403566903810546, "grad_norm": 0.712609052658081, "learning_rate": 9.443625169128331e-06, "loss": 0.03867567330598831, "memory(GiB)": 21.32, "step": 5973, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.19406815450086087, "grad_norm": 0.9828521609306335, "learning_rate": 9.443378890754597e-06, "loss": 0.03484807163476944, "memory(GiB)": 21.32, "step": 5974, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.1941006399636163, "grad_norm": 0.5429394245147705, "learning_rate": 9.443132561098357e-06, "loss": 0.04432116821408272, "memory(GiB)": 21.32, "step": 5975, "token_acc": 0.98, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.1941331254263717, "grad_norm": 0.6783804297447205, "learning_rate": 9.44288618016245e-06, "loss": 0.04475713148713112, "memory(GiB)": 21.32, "step": 5976, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.958061 }, { "epoch": 0.19416561088912712, "grad_norm": 0.6457435488700867, "learning_rate": 9.442639747949725e-06, "loss": 0.0498405322432518, "memory(GiB)": 21.32, "step": 5977, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.19419809635188254, "grad_norm": 0.5586986541748047, "learning_rate": 9.442393264463023e-06, "loss": 0.04051273688673973, "memory(GiB)": 21.32, "step": 5978, "token_acc": 1.0, "train_speed(iter/s)": 0.958116 }, { "epoch": 0.19423058181463795, "grad_norm": 0.48495396971702576, "learning_rate": 9.442146729705192e-06, "loss": 0.045951537787914276, "memory(GiB)": 21.32, "step": 5979, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.958148 }, { "epoch": 0.19426306727739337, "grad_norm": 0.5322040915489197, "learning_rate": 9.441900143679075e-06, "loss": 0.03509605675935745, "memory(GiB)": 21.32, "step": 5980, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.19429555274014879, "grad_norm": 0.7388648390769958, "learning_rate": 9.441653506387515e-06, "loss": 0.0419243648648262, "memory(GiB)": 21.32, "step": 5981, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.1943280382029042, "grad_norm": 1.469011902809143, "learning_rate": 9.441406817833366e-06, "loss": 0.054709501564502716, "memory(GiB)": 21.32, "step": 5982, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.19436052366565962, "grad_norm": 0.7657395601272583, "learning_rate": 9.441160078019469e-06, "loss": 0.03820664435625076, "memory(GiB)": 21.32, "step": 5983, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.958284 }, { "epoch": 0.19439300912841503, "grad_norm": 0.5086712837219238, "learning_rate": 9.440913286948672e-06, "loss": 0.0322582945227623, "memory(GiB)": 21.32, "step": 5984, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.19442549459117045, "grad_norm": 0.904101550579071, "learning_rate": 9.440666444623828e-06, "loss": 0.03589332476258278, "memory(GiB)": 21.32, "step": 5985, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958351 }, { "epoch": 0.19445798005392587, "grad_norm": 2.257938861846924, "learning_rate": 9.44041955104778e-06, "loss": 0.04230423644185066, "memory(GiB)": 21.32, "step": 5986, "token_acc": 0.985, "train_speed(iter/s)": 0.958382 }, { "epoch": 0.19449046551668128, "grad_norm": 0.694205641746521, "learning_rate": 9.440172606223381e-06, "loss": 0.04876658320426941, "memory(GiB)": 21.32, "step": 5987, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.958414 }, { "epoch": 0.1945229509794367, "grad_norm": 0.5901832580566406, "learning_rate": 9.439925610153481e-06, "loss": 0.05025016516447067, "memory(GiB)": 21.32, "step": 5988, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.958446 }, { "epoch": 0.19455543644219211, "grad_norm": 0.5602704882621765, "learning_rate": 9.439678562840929e-06, "loss": 0.0541667714715004, "memory(GiB)": 21.32, "step": 5989, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.958478 }, { "epoch": 0.19458792190494753, "grad_norm": 0.6796683073043823, "learning_rate": 9.439431464288577e-06, "loss": 0.04740755632519722, "memory(GiB)": 21.32, "step": 5990, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.19462040736770295, "grad_norm": 0.498627632856369, "learning_rate": 9.439184314499278e-06, "loss": 0.03462045639753342, "memory(GiB)": 21.32, "step": 5991, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958542 }, { "epoch": 0.19465289283045836, "grad_norm": 0.7448375225067139, "learning_rate": 9.438937113475884e-06, "loss": 0.05785248428583145, "memory(GiB)": 21.32, "step": 5992, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.958575 }, { "epoch": 0.19468537829321378, "grad_norm": 0.5394715666770935, "learning_rate": 9.438689861221247e-06, "loss": 0.041837822645902634, "memory(GiB)": 21.32, "step": 5993, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.1947178637559692, "grad_norm": 0.5967114567756653, "learning_rate": 9.438442557738218e-06, "loss": 0.036457277834415436, "memory(GiB)": 21.32, "step": 5994, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.958642 }, { "epoch": 0.1947503492187246, "grad_norm": 0.8087006211280823, "learning_rate": 9.438195203029659e-06, "loss": 0.05582939833402634, "memory(GiB)": 21.32, "step": 5995, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958675 }, { "epoch": 0.19478283468148003, "grad_norm": 0.4757239520549774, "learning_rate": 9.437947797098417e-06, "loss": 0.04029424488544464, "memory(GiB)": 21.32, "step": 5996, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958709 }, { "epoch": 0.19481532014423544, "grad_norm": 0.48042911291122437, "learning_rate": 9.437700339947352e-06, "loss": 0.038466595113277435, "memory(GiB)": 21.32, "step": 5997, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958741 }, { "epoch": 0.19484780560699086, "grad_norm": 2.5957672595977783, "learning_rate": 9.437452831579318e-06, "loss": 0.04446692764759064, "memory(GiB)": 21.32, "step": 5998, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.958775 }, { "epoch": 0.19488029106974628, "grad_norm": 0.7851005792617798, "learning_rate": 9.437205271997172e-06, "loss": 0.05799577012658119, "memory(GiB)": 21.32, "step": 5999, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.958807 }, { "epoch": 0.19491277653250172, "grad_norm": 0.5213297605514526, "learning_rate": 9.436957661203772e-06, "loss": 0.04430133476853371, "memory(GiB)": 21.32, "step": 6000, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.95884 }, { "epoch": 0.19491277653250172, "eval_loss": 0.04548031464219093, "eval_runtime": 79.5739, "eval_samples_per_second": 125.041, "eval_steps_per_second": 3.908, "eval_token_acc": 0.9823413423668779, "step": 6000 }, { "epoch": 0.19494526199525714, "grad_norm": 2.3823623657226562, "learning_rate": 9.436709999201974e-06, "loss": 0.04120529443025589, "memory(GiB)": 21.32, "step": 6001, "token_acc": 0.9819130291910838, "train_speed(iter/s)": 0.945234 }, { "epoch": 0.19497774745801255, "grad_norm": 0.7203847169876099, "learning_rate": 9.436462285994638e-06, "loss": 0.055309779942035675, "memory(GiB)": 21.32, "step": 6002, "token_acc": 1.0, "train_speed(iter/s)": 0.945259 }, { "epoch": 0.19501023292076797, "grad_norm": 0.5794790983200073, "learning_rate": 9.436214521584623e-06, "loss": 0.040861912071704865, "memory(GiB)": 21.32, "step": 6003, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.945282 }, { "epoch": 0.19504271838352338, "grad_norm": 0.4672043025493622, "learning_rate": 9.435966705974786e-06, "loss": 0.043019428849220276, "memory(GiB)": 21.32, "step": 6004, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.945303 }, { "epoch": 0.1950752038462788, "grad_norm": 0.4683574140071869, "learning_rate": 9.435718839167992e-06, "loss": 0.0448441356420517, "memory(GiB)": 21.32, "step": 6005, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.945326 }, { "epoch": 0.19510768930903422, "grad_norm": 0.6901604533195496, "learning_rate": 9.435470921167096e-06, "loss": 0.04182007163763046, "memory(GiB)": 21.32, "step": 6006, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.945351 }, { "epoch": 0.19514017477178963, "grad_norm": 0.5903410315513611, "learning_rate": 9.435222951974962e-06, "loss": 0.05064056068658829, "memory(GiB)": 21.32, "step": 6007, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.945378 }, { "epoch": 0.19517266023454505, "grad_norm": 0.7854433655738831, "learning_rate": 9.434974931594451e-06, "loss": 0.05068027228116989, "memory(GiB)": 21.32, "step": 6008, "token_acc": 1.0, "train_speed(iter/s)": 0.945407 }, { "epoch": 0.19520514569730046, "grad_norm": 0.6563274264335632, "learning_rate": 9.434726860028429e-06, "loss": 0.05497819185256958, "memory(GiB)": 21.32, "step": 6009, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.945432 }, { "epoch": 0.19523763116005588, "grad_norm": 0.5937215685844421, "learning_rate": 9.434478737279755e-06, "loss": 0.041845548897981644, "memory(GiB)": 21.32, "step": 6010, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.945459 }, { "epoch": 0.1952701166228113, "grad_norm": 3.817223310470581, "learning_rate": 9.434230563351294e-06, "loss": 0.045368388295173645, "memory(GiB)": 21.32, "step": 6011, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.945485 }, { "epoch": 0.1953026020855667, "grad_norm": 0.490779310464859, "learning_rate": 9.433982338245911e-06, "loss": 0.04316338896751404, "memory(GiB)": 21.32, "step": 6012, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.945514 }, { "epoch": 0.19533508754832213, "grad_norm": 0.47735831141471863, "learning_rate": 9.433734061966471e-06, "loss": 0.039967842400074005, "memory(GiB)": 21.32, "step": 6013, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.945543 }, { "epoch": 0.19536757301107754, "grad_norm": 1.5173767805099487, "learning_rate": 9.433485734515838e-06, "loss": 0.05047298222780228, "memory(GiB)": 21.32, "step": 6014, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.94557 }, { "epoch": 0.19540005847383296, "grad_norm": 0.5585666298866272, "learning_rate": 9.433237355896877e-06, "loss": 0.04031422734260559, "memory(GiB)": 21.32, "step": 6015, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.945603 }, { "epoch": 0.19543254393658838, "grad_norm": 0.525291383266449, "learning_rate": 9.432988926112458e-06, "loss": 0.046949248760938644, "memory(GiB)": 21.32, "step": 6016, "token_acc": 0.9720930232558139, "train_speed(iter/s)": 0.945638 }, { "epoch": 0.1954650293993438, "grad_norm": 0.6326757669448853, "learning_rate": 9.432740445165445e-06, "loss": 0.04189398139715195, "memory(GiB)": 21.32, "step": 6017, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.945674 }, { "epoch": 0.1954975148620992, "grad_norm": 0.7188867330551147, "learning_rate": 9.432491913058708e-06, "loss": 0.050736624747514725, "memory(GiB)": 21.32, "step": 6018, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.945708 }, { "epoch": 0.19553000032485462, "grad_norm": 0.8756279945373535, "learning_rate": 9.432243329795116e-06, "loss": 0.056495580822229385, "memory(GiB)": 21.32, "step": 6019, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.945743 }, { "epoch": 0.19556248578761004, "grad_norm": 0.3907088041305542, "learning_rate": 9.431994695377535e-06, "loss": 0.041019294410943985, "memory(GiB)": 21.32, "step": 6020, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.945777 }, { "epoch": 0.19559497125036546, "grad_norm": 0.4956514239311218, "learning_rate": 9.431746009808837e-06, "loss": 0.05086666718125343, "memory(GiB)": 21.32, "step": 6021, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.945811 }, { "epoch": 0.19562745671312087, "grad_norm": 0.7405316233634949, "learning_rate": 9.431497273091892e-06, "loss": 0.044995229691267014, "memory(GiB)": 21.32, "step": 6022, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.945846 }, { "epoch": 0.1956599421758763, "grad_norm": 0.5110084414482117, "learning_rate": 9.43124848522957e-06, "loss": 0.03579073026776314, "memory(GiB)": 21.32, "step": 6023, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.945881 }, { "epoch": 0.1956924276386317, "grad_norm": 0.5254899263381958, "learning_rate": 9.430999646224741e-06, "loss": 0.04752935841679573, "memory(GiB)": 21.32, "step": 6024, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.945917 }, { "epoch": 0.19572491310138712, "grad_norm": 0.5834025144577026, "learning_rate": 9.430750756080278e-06, "loss": 0.04470260441303253, "memory(GiB)": 21.32, "step": 6025, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.945951 }, { "epoch": 0.19575739856414254, "grad_norm": 0.5739982724189758, "learning_rate": 9.430501814799058e-06, "loss": 0.045597534626722336, "memory(GiB)": 21.32, "step": 6026, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.945986 }, { "epoch": 0.19578988402689795, "grad_norm": 1.1314280033111572, "learning_rate": 9.430252822383947e-06, "loss": 0.05049644783139229, "memory(GiB)": 21.32, "step": 6027, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.946022 }, { "epoch": 0.19582236948965337, "grad_norm": 0.5068221092224121, "learning_rate": 9.430003778837823e-06, "loss": 0.03412172198295593, "memory(GiB)": 21.32, "step": 6028, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.946057 }, { "epoch": 0.19585485495240879, "grad_norm": 0.5876809358596802, "learning_rate": 9.429754684163556e-06, "loss": 0.04257696866989136, "memory(GiB)": 21.32, "step": 6029, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.946092 }, { "epoch": 0.1958873404151642, "grad_norm": 0.6942270398139954, "learning_rate": 9.429505538364026e-06, "loss": 0.040714576840400696, "memory(GiB)": 21.32, "step": 6030, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.946127 }, { "epoch": 0.19591982587791962, "grad_norm": 0.5241595506668091, "learning_rate": 9.429256341442108e-06, "loss": 0.032431796193122864, "memory(GiB)": 21.32, "step": 6031, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.946162 }, { "epoch": 0.19595231134067506, "grad_norm": 1.1419278383255005, "learning_rate": 9.429007093400675e-06, "loss": 0.060844164341688156, "memory(GiB)": 21.32, "step": 6032, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.946196 }, { "epoch": 0.19598479680343048, "grad_norm": 1.0645675659179688, "learning_rate": 9.428757794242604e-06, "loss": 0.053785718977451324, "memory(GiB)": 21.32, "step": 6033, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.946228 }, { "epoch": 0.1960172822661859, "grad_norm": 0.4686960279941559, "learning_rate": 9.428508443970776e-06, "loss": 0.04266215115785599, "memory(GiB)": 21.32, "step": 6034, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.946263 }, { "epoch": 0.1960497677289413, "grad_norm": 0.7191898226737976, "learning_rate": 9.428259042588063e-06, "loss": 0.048934295773506165, "memory(GiB)": 21.32, "step": 6035, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.946298 }, { "epoch": 0.19608225319169673, "grad_norm": 0.7242434620857239, "learning_rate": 9.42800959009735e-06, "loss": 0.0384201779961586, "memory(GiB)": 21.32, "step": 6036, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.94633 }, { "epoch": 0.19611473865445214, "grad_norm": 0.5662316679954529, "learning_rate": 9.427760086501512e-06, "loss": 0.044061191380023956, "memory(GiB)": 21.32, "step": 6037, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.946364 }, { "epoch": 0.19614722411720756, "grad_norm": 0.4464554786682129, "learning_rate": 9.427510531803427e-06, "loss": 0.036000803112983704, "memory(GiB)": 21.32, "step": 6038, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.946399 }, { "epoch": 0.19617970957996297, "grad_norm": 0.6495367884635925, "learning_rate": 9.42726092600598e-06, "loss": 0.04276907444000244, "memory(GiB)": 21.32, "step": 6039, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.946433 }, { "epoch": 0.1962121950427184, "grad_norm": 0.5680199861526489, "learning_rate": 9.427011269112047e-06, "loss": 0.048506490886211395, "memory(GiB)": 21.32, "step": 6040, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.946468 }, { "epoch": 0.1962446805054738, "grad_norm": 0.6191424131393433, "learning_rate": 9.426761561124514e-06, "loss": 0.04257795959711075, "memory(GiB)": 21.32, "step": 6041, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.946502 }, { "epoch": 0.19627716596822922, "grad_norm": 0.8863160014152527, "learning_rate": 9.426511802046259e-06, "loss": 0.036705952137708664, "memory(GiB)": 21.32, "step": 6042, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.946538 }, { "epoch": 0.19630965143098464, "grad_norm": 0.6130456924438477, "learning_rate": 9.426261991880167e-06, "loss": 0.04600189998745918, "memory(GiB)": 21.32, "step": 6043, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.946574 }, { "epoch": 0.19634213689374005, "grad_norm": 0.42921993136405945, "learning_rate": 9.426012130629118e-06, "loss": 0.03777046501636505, "memory(GiB)": 21.32, "step": 6044, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.946608 }, { "epoch": 0.19637462235649547, "grad_norm": 0.5144205689430237, "learning_rate": 9.425762218296001e-06, "loss": 0.05199568718671799, "memory(GiB)": 21.32, "step": 6045, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.94664 }, { "epoch": 0.1964071078192509, "grad_norm": 0.4578991234302521, "learning_rate": 9.425512254883697e-06, "loss": 0.029474083334207535, "memory(GiB)": 21.32, "step": 6046, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.94667 }, { "epoch": 0.1964395932820063, "grad_norm": 1.720446228981018, "learning_rate": 9.42526224039509e-06, "loss": 0.03974632918834686, "memory(GiB)": 21.32, "step": 6047, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.946697 }, { "epoch": 0.19647207874476172, "grad_norm": 0.5567111372947693, "learning_rate": 9.425012174833065e-06, "loss": 0.033093735575675964, "memory(GiB)": 21.32, "step": 6048, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.946724 }, { "epoch": 0.19650456420751714, "grad_norm": 0.5602275729179382, "learning_rate": 9.424762058200514e-06, "loss": 0.03835400938987732, "memory(GiB)": 21.32, "step": 6049, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.946705 }, { "epoch": 0.19653704967027255, "grad_norm": 0.5167268514633179, "learning_rate": 9.424511890500316e-06, "loss": 0.039226654917001724, "memory(GiB)": 21.32, "step": 6050, "token_acc": 0.9828767123287672, "train_speed(iter/s)": 0.946728 }, { "epoch": 0.19656953513302797, "grad_norm": 0.7181341052055359, "learning_rate": 9.424261671735364e-06, "loss": 0.044126540422439575, "memory(GiB)": 21.32, "step": 6051, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.946753 }, { "epoch": 0.19660202059578338, "grad_norm": 0.5071418881416321, "learning_rate": 9.424011401908545e-06, "loss": 0.04320064187049866, "memory(GiB)": 21.32, "step": 6052, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.946781 }, { "epoch": 0.1966345060585388, "grad_norm": 0.736484706401825, "learning_rate": 9.423761081022741e-06, "loss": 0.04933849722146988, "memory(GiB)": 21.32, "step": 6053, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.946808 }, { "epoch": 0.19666699152129422, "grad_norm": 0.5089824795722961, "learning_rate": 9.42351070908085e-06, "loss": 0.03850853443145752, "memory(GiB)": 21.32, "step": 6054, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.946833 }, { "epoch": 0.19669947698404963, "grad_norm": 1.3391692638397217, "learning_rate": 9.423260286085756e-06, "loss": 0.045328155159950256, "memory(GiB)": 21.32, "step": 6055, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.946858 }, { "epoch": 0.19673196244680505, "grad_norm": 0.8695306777954102, "learning_rate": 9.423009812040352e-06, "loss": 0.042804040014743805, "memory(GiB)": 21.32, "step": 6056, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.946881 }, { "epoch": 0.19676444790956046, "grad_norm": 0.826373279094696, "learning_rate": 9.422759286947527e-06, "loss": 0.053895339369773865, "memory(GiB)": 21.32, "step": 6057, "token_acc": 0.975, "train_speed(iter/s)": 0.946906 }, { "epoch": 0.19679693337231588, "grad_norm": 0.5981917381286621, "learning_rate": 9.422508710810175e-06, "loss": 0.04697920009493828, "memory(GiB)": 21.32, "step": 6058, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.946931 }, { "epoch": 0.1968294188350713, "grad_norm": 0.5718504190444946, "learning_rate": 9.422258083631184e-06, "loss": 0.04869017004966736, "memory(GiB)": 21.32, "step": 6059, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.946955 }, { "epoch": 0.1968619042978267, "grad_norm": 1.2270302772521973, "learning_rate": 9.42200740541345e-06, "loss": 0.0465189591050148, "memory(GiB)": 21.32, "step": 6060, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.946979 }, { "epoch": 0.19689438976058213, "grad_norm": 0.6154909133911133, "learning_rate": 9.421756676159864e-06, "loss": 0.057057350873947144, "memory(GiB)": 21.32, "step": 6061, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.947005 }, { "epoch": 0.19692687522333754, "grad_norm": 0.5846442580223083, "learning_rate": 9.421505895873321e-06, "loss": 0.0416664183139801, "memory(GiB)": 21.32, "step": 6062, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.947029 }, { "epoch": 0.19695936068609296, "grad_norm": 0.4771806299686432, "learning_rate": 9.421255064556715e-06, "loss": 0.04180390015244484, "memory(GiB)": 21.32, "step": 6063, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.947049 }, { "epoch": 0.1969918461488484, "grad_norm": 0.5322031378746033, "learning_rate": 9.421004182212941e-06, "loss": 0.04900101199746132, "memory(GiB)": 21.32, "step": 6064, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.947075 }, { "epoch": 0.19702433161160382, "grad_norm": 0.6751949787139893, "learning_rate": 9.420753248844895e-06, "loss": 0.05650202929973602, "memory(GiB)": 21.32, "step": 6065, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.947096 }, { "epoch": 0.19705681707435924, "grad_norm": 0.34392863512039185, "learning_rate": 9.42050226445547e-06, "loss": 0.029837416484951973, "memory(GiB)": 21.32, "step": 6066, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.947123 }, { "epoch": 0.19708930253711465, "grad_norm": 0.5585479736328125, "learning_rate": 9.420251229047566e-06, "loss": 0.04670953005552292, "memory(GiB)": 21.32, "step": 6067, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.947147 }, { "epoch": 0.19712178799987007, "grad_norm": 0.6657507419586182, "learning_rate": 9.420000142624081e-06, "loss": 0.04813329875469208, "memory(GiB)": 21.32, "step": 6068, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.947175 }, { "epoch": 0.19715427346262548, "grad_norm": 0.4208855628967285, "learning_rate": 9.41974900518791e-06, "loss": 0.04147069901227951, "memory(GiB)": 21.32, "step": 6069, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.947101 }, { "epoch": 0.1971867589253809, "grad_norm": 0.5229220390319824, "learning_rate": 9.419497816741953e-06, "loss": 0.03396078944206238, "memory(GiB)": 21.32, "step": 6070, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.947129 }, { "epoch": 0.19721924438813632, "grad_norm": 0.6204413771629333, "learning_rate": 9.41924657728911e-06, "loss": 0.04978279024362564, "memory(GiB)": 21.32, "step": 6071, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.947156 }, { "epoch": 0.19725172985089173, "grad_norm": 0.8043028116226196, "learning_rate": 9.41899528683228e-06, "loss": 0.04169324040412903, "memory(GiB)": 21.32, "step": 6072, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.94718 }, { "epoch": 0.19728421531364715, "grad_norm": 1.208403468132019, "learning_rate": 9.41874394537436e-06, "loss": 0.04432584345340729, "memory(GiB)": 21.32, "step": 6073, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.947203 }, { "epoch": 0.19731670077640256, "grad_norm": 0.5458878874778748, "learning_rate": 9.418492552918255e-06, "loss": 0.0431562140583992, "memory(GiB)": 21.32, "step": 6074, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.94723 }, { "epoch": 0.19734918623915798, "grad_norm": 0.5306212902069092, "learning_rate": 9.418241109466867e-06, "loss": 0.03987304866313934, "memory(GiB)": 21.32, "step": 6075, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.947259 }, { "epoch": 0.1973816717019134, "grad_norm": 1.120905876159668, "learning_rate": 9.417989615023093e-06, "loss": 0.041821807622909546, "memory(GiB)": 21.32, "step": 6076, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.947286 }, { "epoch": 0.1974141571646688, "grad_norm": 0.7132804989814758, "learning_rate": 9.417738069589839e-06, "loss": 0.04216846823692322, "memory(GiB)": 21.32, "step": 6077, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.947316 }, { "epoch": 0.19744664262742423, "grad_norm": 0.5041176676750183, "learning_rate": 9.417486473170008e-06, "loss": 0.039779722690582275, "memory(GiB)": 21.32, "step": 6078, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.947348 }, { "epoch": 0.19747912809017965, "grad_norm": 0.8936723470687866, "learning_rate": 9.417234825766503e-06, "loss": 0.04709905385971069, "memory(GiB)": 21.32, "step": 6079, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.947381 }, { "epoch": 0.19751161355293506, "grad_norm": 0.777076244354248, "learning_rate": 9.41698312738223e-06, "loss": 0.035471588373184204, "memory(GiB)": 21.32, "step": 6080, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.947414 }, { "epoch": 0.19754409901569048, "grad_norm": 0.7125402688980103, "learning_rate": 9.416731378020092e-06, "loss": 0.037984803318977356, "memory(GiB)": 21.32, "step": 6081, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.947448 }, { "epoch": 0.1975765844784459, "grad_norm": 0.5904854536056519, "learning_rate": 9.416479577682995e-06, "loss": 0.054847024381160736, "memory(GiB)": 21.32, "step": 6082, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.947482 }, { "epoch": 0.1976090699412013, "grad_norm": 0.5188193917274475, "learning_rate": 9.416227726373846e-06, "loss": 0.03836812078952789, "memory(GiB)": 21.32, "step": 6083, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.947517 }, { "epoch": 0.19764155540395673, "grad_norm": 0.40159979462623596, "learning_rate": 9.415975824095552e-06, "loss": 0.04457708075642586, "memory(GiB)": 21.32, "step": 6084, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.947551 }, { "epoch": 0.19767404086671214, "grad_norm": 0.7807753086090088, "learning_rate": 9.415723870851017e-06, "loss": 0.05744198337197304, "memory(GiB)": 21.32, "step": 6085, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.947585 }, { "epoch": 0.19770652632946756, "grad_norm": 0.6902093291282654, "learning_rate": 9.415471866643151e-06, "loss": 0.03937728330492973, "memory(GiB)": 21.32, "step": 6086, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.947619 }, { "epoch": 0.19773901179222297, "grad_norm": 0.7699567675590515, "learning_rate": 9.415219811474865e-06, "loss": 0.05374043062329292, "memory(GiB)": 21.32, "step": 6087, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.947653 }, { "epoch": 0.1977714972549784, "grad_norm": 0.6862236857414246, "learning_rate": 9.414967705349065e-06, "loss": 0.05669436231255531, "memory(GiB)": 21.32, "step": 6088, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.947686 }, { "epoch": 0.1978039827177338, "grad_norm": 0.5352177619934082, "learning_rate": 9.414715548268661e-06, "loss": 0.033716004341840744, "memory(GiB)": 21.32, "step": 6089, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.947721 }, { "epoch": 0.19783646818048922, "grad_norm": 0.5115655064582825, "learning_rate": 9.414463340236563e-06, "loss": 0.04487181082367897, "memory(GiB)": 21.32, "step": 6090, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.947753 }, { "epoch": 0.19786895364324464, "grad_norm": 1.0936156511306763, "learning_rate": 9.414211081255685e-06, "loss": 0.04545879736542702, "memory(GiB)": 21.32, "step": 6091, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.947789 }, { "epoch": 0.19790143910600005, "grad_norm": 0.5851143002510071, "learning_rate": 9.413958771328932e-06, "loss": 0.0434870608150959, "memory(GiB)": 21.32, "step": 6092, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.947823 }, { "epoch": 0.19793392456875547, "grad_norm": 0.4409302771091461, "learning_rate": 9.413706410459223e-06, "loss": 0.03742207586765289, "memory(GiB)": 21.32, "step": 6093, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.947858 }, { "epoch": 0.1979664100315109, "grad_norm": 0.44413942098617554, "learning_rate": 9.413453998649465e-06, "loss": 0.04472256451845169, "memory(GiB)": 21.32, "step": 6094, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.947893 }, { "epoch": 0.1979988954942663, "grad_norm": 0.7253164649009705, "learning_rate": 9.413201535902575e-06, "loss": 0.04754065349698067, "memory(GiB)": 21.32, "step": 6095, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.947927 }, { "epoch": 0.19803138095702175, "grad_norm": 0.475616991519928, "learning_rate": 9.412949022221466e-06, "loss": 0.048775386065244675, "memory(GiB)": 21.32, "step": 6096, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.94796 }, { "epoch": 0.19806386641977716, "grad_norm": 0.5059094429016113, "learning_rate": 9.41269645760905e-06, "loss": 0.03404372185468674, "memory(GiB)": 21.32, "step": 6097, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.947994 }, { "epoch": 0.19809635188253258, "grad_norm": 0.45724549889564514, "learning_rate": 9.412443842068244e-06, "loss": 0.048080235719680786, "memory(GiB)": 21.32, "step": 6098, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.948028 }, { "epoch": 0.198128837345288, "grad_norm": 0.5844466090202332, "learning_rate": 9.412191175601963e-06, "loss": 0.046339601278305054, "memory(GiB)": 21.32, "step": 6099, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.948063 }, { "epoch": 0.1981613228080434, "grad_norm": 0.5046796798706055, "learning_rate": 9.411938458213123e-06, "loss": 0.04281878471374512, "memory(GiB)": 21.32, "step": 6100, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.948095 }, { "epoch": 0.19819380827079883, "grad_norm": 0.45862650871276855, "learning_rate": 9.411685689904641e-06, "loss": 0.04072035104036331, "memory(GiB)": 21.32, "step": 6101, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.948131 }, { "epoch": 0.19822629373355424, "grad_norm": 0.637406051158905, "learning_rate": 9.411432870679436e-06, "loss": 0.056888800114393234, "memory(GiB)": 21.32, "step": 6102, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.948162 }, { "epoch": 0.19825877919630966, "grad_norm": 0.7068948149681091, "learning_rate": 9.411180000540423e-06, "loss": 0.049490563571453094, "memory(GiB)": 21.32, "step": 6103, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.948197 }, { "epoch": 0.19829126465906508, "grad_norm": 0.5973048806190491, "learning_rate": 9.410927079490522e-06, "loss": 0.051714517176151276, "memory(GiB)": 21.32, "step": 6104, "token_acc": 0.9609756097560975, "train_speed(iter/s)": 0.948231 }, { "epoch": 0.1983237501218205, "grad_norm": 0.6124444007873535, "learning_rate": 9.410674107532649e-06, "loss": 0.047927774488925934, "memory(GiB)": 21.32, "step": 6105, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.948258 }, { "epoch": 0.1983562355845759, "grad_norm": 0.4954785406589508, "learning_rate": 9.410421084669728e-06, "loss": 0.04601318761706352, "memory(GiB)": 21.32, "step": 6106, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.948284 }, { "epoch": 0.19838872104733132, "grad_norm": 0.46645426750183105, "learning_rate": 9.41016801090468e-06, "loss": 0.043783675879240036, "memory(GiB)": 21.32, "step": 6107, "token_acc": 0.9755244755244755, "train_speed(iter/s)": 0.948311 }, { "epoch": 0.19842120651008674, "grad_norm": 0.5486024618148804, "learning_rate": 9.40991488624042e-06, "loss": 0.046852439641952515, "memory(GiB)": 21.32, "step": 6108, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.948336 }, { "epoch": 0.19845369197284216, "grad_norm": 1.179418921470642, "learning_rate": 9.409661710679871e-06, "loss": 0.05360257998108864, "memory(GiB)": 21.32, "step": 6109, "token_acc": 0.9672897196261683, "train_speed(iter/s)": 0.948363 }, { "epoch": 0.19848617743559757, "grad_norm": 0.4793359935283661, "learning_rate": 9.40940848422596e-06, "loss": 0.040843427181243896, "memory(GiB)": 21.32, "step": 6110, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.948388 }, { "epoch": 0.198518662898353, "grad_norm": 0.5559746623039246, "learning_rate": 9.409155206881605e-06, "loss": 0.04838058352470398, "memory(GiB)": 21.32, "step": 6111, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.948417 }, { "epoch": 0.1985511483611084, "grad_norm": 0.7140052914619446, "learning_rate": 9.408901878649731e-06, "loss": 0.05380471050739288, "memory(GiB)": 21.32, "step": 6112, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.94844 }, { "epoch": 0.19858363382386382, "grad_norm": 0.5997152328491211, "learning_rate": 9.40864849953326e-06, "loss": 0.046775415539741516, "memory(GiB)": 21.32, "step": 6113, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.948464 }, { "epoch": 0.19861611928661924, "grad_norm": 0.3898853063583374, "learning_rate": 9.40839506953512e-06, "loss": 0.03346990793943405, "memory(GiB)": 21.32, "step": 6114, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.948491 }, { "epoch": 0.19864860474937465, "grad_norm": 0.45807507634162903, "learning_rate": 9.408141588658231e-06, "loss": 0.0342295840382576, "memory(GiB)": 21.32, "step": 6115, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.94852 }, { "epoch": 0.19868109021213007, "grad_norm": 0.4856702983379364, "learning_rate": 9.407888056905519e-06, "loss": 0.047614119946956635, "memory(GiB)": 21.32, "step": 6116, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.948545 }, { "epoch": 0.19871357567488548, "grad_norm": 0.5538395047187805, "learning_rate": 9.407634474279915e-06, "loss": 0.05606510117650032, "memory(GiB)": 21.32, "step": 6117, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.948572 }, { "epoch": 0.1987460611376409, "grad_norm": 0.4697124660015106, "learning_rate": 9.407380840784342e-06, "loss": 0.033078208565711975, "memory(GiB)": 21.32, "step": 6118, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.948596 }, { "epoch": 0.19877854660039632, "grad_norm": 0.5919265151023865, "learning_rate": 9.407127156421727e-06, "loss": 0.0486215204000473, "memory(GiB)": 21.32, "step": 6119, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.948617 }, { "epoch": 0.19881103206315173, "grad_norm": 0.6466934084892273, "learning_rate": 9.406873421195e-06, "loss": 0.05417414754629135, "memory(GiB)": 21.32, "step": 6120, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.94864 }, { "epoch": 0.19884351752590715, "grad_norm": 0.4523782432079315, "learning_rate": 9.406619635107087e-06, "loss": 0.03999520465731621, "memory(GiB)": 21.32, "step": 6121, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.948663 }, { "epoch": 0.19887600298866256, "grad_norm": 0.5853298902511597, "learning_rate": 9.40636579816092e-06, "loss": 0.044298119843006134, "memory(GiB)": 21.32, "step": 6122, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.948685 }, { "epoch": 0.19890848845141798, "grad_norm": 0.6784891486167908, "learning_rate": 9.406111910359424e-06, "loss": 0.04786008223891258, "memory(GiB)": 21.32, "step": 6123, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.948708 }, { "epoch": 0.1989409739141734, "grad_norm": 0.5446167588233948, "learning_rate": 9.405857971705535e-06, "loss": 0.04627154767513275, "memory(GiB)": 21.32, "step": 6124, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.948732 }, { "epoch": 0.1989734593769288, "grad_norm": 0.6231908798217773, "learning_rate": 9.405603982202179e-06, "loss": 0.05090290680527687, "memory(GiB)": 21.32, "step": 6125, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.948758 }, { "epoch": 0.19900594483968423, "grad_norm": 0.49684828519821167, "learning_rate": 9.405349941852289e-06, "loss": 0.038004420697689056, "memory(GiB)": 21.32, "step": 6126, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.948786 }, { "epoch": 0.19903843030243965, "grad_norm": 0.6416242718696594, "learning_rate": 9.405095850658798e-06, "loss": 0.040227338671684265, "memory(GiB)": 21.32, "step": 6127, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.948811 }, { "epoch": 0.1990709157651951, "grad_norm": 0.789954662322998, "learning_rate": 9.404841708624637e-06, "loss": 0.060007162392139435, "memory(GiB)": 21.32, "step": 6128, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.948835 }, { "epoch": 0.1991034012279505, "grad_norm": 0.8054434061050415, "learning_rate": 9.404587515752742e-06, "loss": 0.04499749466776848, "memory(GiB)": 21.32, "step": 6129, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.948859 }, { "epoch": 0.19913588669070592, "grad_norm": 0.7954849600791931, "learning_rate": 9.404333272046042e-06, "loss": 0.043391935527324677, "memory(GiB)": 21.32, "step": 6130, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.948885 }, { "epoch": 0.19916837215346134, "grad_norm": 0.3710673153400421, "learning_rate": 9.404078977507473e-06, "loss": 0.0381820909678936, "memory(GiB)": 21.32, "step": 6131, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.948912 }, { "epoch": 0.19920085761621675, "grad_norm": 0.5227330327033997, "learning_rate": 9.403824632139974e-06, "loss": 0.04075206071138382, "memory(GiB)": 21.32, "step": 6132, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.948938 }, { "epoch": 0.19923334307897217, "grad_norm": 0.46152031421661377, "learning_rate": 9.403570235946476e-06, "loss": 0.037526100873947144, "memory(GiB)": 21.32, "step": 6133, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.948965 }, { "epoch": 0.19926582854172759, "grad_norm": 0.5161942839622498, "learning_rate": 9.403315788929917e-06, "loss": 0.04008379206061363, "memory(GiB)": 21.32, "step": 6134, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.948992 }, { "epoch": 0.199298314004483, "grad_norm": 0.5212801694869995, "learning_rate": 9.403061291093231e-06, "loss": 0.030943430960178375, "memory(GiB)": 21.32, "step": 6135, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.949018 }, { "epoch": 0.19933079946723842, "grad_norm": 0.950615406036377, "learning_rate": 9.40280674243936e-06, "loss": 0.038452159613370895, "memory(GiB)": 21.32, "step": 6136, "token_acc": 1.0, "train_speed(iter/s)": 0.949044 }, { "epoch": 0.19936328492999383, "grad_norm": 0.4552544057369232, "learning_rate": 9.402552142971236e-06, "loss": 0.032526854425668716, "memory(GiB)": 21.32, "step": 6137, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.949067 }, { "epoch": 0.19939577039274925, "grad_norm": 0.669402539730072, "learning_rate": 9.402297492691803e-06, "loss": 0.04385121539235115, "memory(GiB)": 21.32, "step": 6138, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.949093 }, { "epoch": 0.19942825585550467, "grad_norm": 0.8530267477035522, "learning_rate": 9.402042791603996e-06, "loss": 0.04437354952096939, "memory(GiB)": 21.32, "step": 6139, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.949119 }, { "epoch": 0.19946074131826008, "grad_norm": 1.6235734224319458, "learning_rate": 9.401788039710755e-06, "loss": 0.03846141695976257, "memory(GiB)": 21.32, "step": 6140, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.949144 }, { "epoch": 0.1994932267810155, "grad_norm": 0.8432925343513489, "learning_rate": 9.401533237015023e-06, "loss": 0.036606285721063614, "memory(GiB)": 21.32, "step": 6141, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.94917 }, { "epoch": 0.19952571224377091, "grad_norm": 0.6361426115036011, "learning_rate": 9.40127838351974e-06, "loss": 0.04030417278409004, "memory(GiB)": 21.32, "step": 6142, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.949199 }, { "epoch": 0.19955819770652633, "grad_norm": 0.7116118669509888, "learning_rate": 9.401023479227845e-06, "loss": 0.05009078234434128, "memory(GiB)": 21.32, "step": 6143, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.949232 }, { "epoch": 0.19959068316928175, "grad_norm": 0.5930513143539429, "learning_rate": 9.40076852414228e-06, "loss": 0.03980458527803421, "memory(GiB)": 21.32, "step": 6144, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.949264 }, { "epoch": 0.19962316863203716, "grad_norm": 0.5881969928741455, "learning_rate": 9.400513518265992e-06, "loss": 0.04700668156147003, "memory(GiB)": 21.32, "step": 6145, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.949299 }, { "epoch": 0.19965565409479258, "grad_norm": 0.5978735089302063, "learning_rate": 9.400258461601918e-06, "loss": 0.04101788252592087, "memory(GiB)": 21.32, "step": 6146, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.949332 }, { "epoch": 0.199688139557548, "grad_norm": 1.0061701536178589, "learning_rate": 9.400003354153006e-06, "loss": 0.05516299232840538, "memory(GiB)": 21.32, "step": 6147, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.949365 }, { "epoch": 0.1997206250203034, "grad_norm": 0.49841418862342834, "learning_rate": 9.3997481959222e-06, "loss": 0.04121089726686478, "memory(GiB)": 21.32, "step": 6148, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.949397 }, { "epoch": 0.19975311048305883, "grad_norm": 0.5064183473587036, "learning_rate": 9.399492986912443e-06, "loss": 0.03771591559052467, "memory(GiB)": 21.32, "step": 6149, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.949431 }, { "epoch": 0.19978559594581424, "grad_norm": 0.49948540329933167, "learning_rate": 9.39923772712668e-06, "loss": 0.04509164020419121, "memory(GiB)": 21.32, "step": 6150, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.949465 }, { "epoch": 0.19981808140856966, "grad_norm": 0.670595109462738, "learning_rate": 9.39898241656786e-06, "loss": 0.04650956392288208, "memory(GiB)": 21.32, "step": 6151, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.949497 }, { "epoch": 0.19985056687132507, "grad_norm": 0.5419623851776123, "learning_rate": 9.398727055238926e-06, "loss": 0.04423630237579346, "memory(GiB)": 21.32, "step": 6152, "token_acc": 1.0, "train_speed(iter/s)": 0.949531 }, { "epoch": 0.1998830523340805, "grad_norm": 0.5549430251121521, "learning_rate": 9.398471643142827e-06, "loss": 0.04875504970550537, "memory(GiB)": 21.32, "step": 6153, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.949563 }, { "epoch": 0.1999155377968359, "grad_norm": 0.5377567410469055, "learning_rate": 9.398216180282514e-06, "loss": 0.04483906924724579, "memory(GiB)": 21.32, "step": 6154, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.949595 }, { "epoch": 0.19994802325959132, "grad_norm": 0.5686526298522949, "learning_rate": 9.39796066666093e-06, "loss": 0.04324321076273918, "memory(GiB)": 21.32, "step": 6155, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.949629 }, { "epoch": 0.19998050872234674, "grad_norm": 0.5042718648910522, "learning_rate": 9.397705102281024e-06, "loss": 0.039486296474933624, "memory(GiB)": 21.32, "step": 6156, "token_acc": 0.9811912225705329, "train_speed(iter/s)": 0.949661 }, { "epoch": 0.20001299418510216, "grad_norm": 0.6034063696861267, "learning_rate": 9.397449487145751e-06, "loss": 0.04608619958162308, "memory(GiB)": 21.32, "step": 6157, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.949696 }, { "epoch": 0.20004547964785757, "grad_norm": 0.7499189376831055, "learning_rate": 9.397193821258058e-06, "loss": 0.05009043216705322, "memory(GiB)": 21.32, "step": 6158, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.949728 }, { "epoch": 0.200077965110613, "grad_norm": 0.6041127443313599, "learning_rate": 9.396938104620894e-06, "loss": 0.04287215694785118, "memory(GiB)": 21.32, "step": 6159, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.949761 }, { "epoch": 0.20011045057336843, "grad_norm": 0.638333261013031, "learning_rate": 9.396682337237213e-06, "loss": 0.046971239149570465, "memory(GiB)": 21.32, "step": 6160, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.949794 }, { "epoch": 0.20014293603612385, "grad_norm": 0.57512366771698, "learning_rate": 9.396426519109966e-06, "loss": 0.041010886430740356, "memory(GiB)": 21.32, "step": 6161, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.949828 }, { "epoch": 0.20017542149887926, "grad_norm": 0.6028775572776794, "learning_rate": 9.396170650242104e-06, "loss": 0.04319213703274727, "memory(GiB)": 21.32, "step": 6162, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.949862 }, { "epoch": 0.20020790696163468, "grad_norm": 0.5119014382362366, "learning_rate": 9.395914730636583e-06, "loss": 0.03822549432516098, "memory(GiB)": 21.32, "step": 6163, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.949896 }, { "epoch": 0.2002403924243901, "grad_norm": 0.5197210311889648, "learning_rate": 9.395658760296354e-06, "loss": 0.04974143207073212, "memory(GiB)": 21.32, "step": 6164, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.949925 }, { "epoch": 0.2002728778871455, "grad_norm": 0.6589265465736389, "learning_rate": 9.395402739224373e-06, "loss": 0.048658452928066254, "memory(GiB)": 21.32, "step": 6165, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.94995 }, { "epoch": 0.20030536334990093, "grad_norm": 0.4484517276287079, "learning_rate": 9.395146667423594e-06, "loss": 0.04151508957147598, "memory(GiB)": 21.32, "step": 6166, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.949975 }, { "epoch": 0.20033784881265634, "grad_norm": 0.5750933289527893, "learning_rate": 9.394890544896972e-06, "loss": 0.038474924862384796, "memory(GiB)": 21.32, "step": 6167, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.950002 }, { "epoch": 0.20037033427541176, "grad_norm": 0.7665316462516785, "learning_rate": 9.394634371647464e-06, "loss": 0.048827797174453735, "memory(GiB)": 21.32, "step": 6168, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.950029 }, { "epoch": 0.20040281973816718, "grad_norm": 0.6029632687568665, "learning_rate": 9.394378147678025e-06, "loss": 0.04432053864002228, "memory(GiB)": 21.32, "step": 6169, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.950054 }, { "epoch": 0.2004353052009226, "grad_norm": 0.6429193019866943, "learning_rate": 9.394121872991615e-06, "loss": 0.04763185232877731, "memory(GiB)": 21.32, "step": 6170, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.950079 }, { "epoch": 0.200467790663678, "grad_norm": 0.5195643305778503, "learning_rate": 9.393865547591189e-06, "loss": 0.03552508354187012, "memory(GiB)": 21.32, "step": 6171, "token_acc": 1.0, "train_speed(iter/s)": 0.950104 }, { "epoch": 0.20050027612643342, "grad_norm": 0.4918544590473175, "learning_rate": 9.393609171479707e-06, "loss": 0.039316363632678986, "memory(GiB)": 21.32, "step": 6172, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.950128 }, { "epoch": 0.20053276158918884, "grad_norm": 0.6194105744361877, "learning_rate": 9.393352744660128e-06, "loss": 0.03955921530723572, "memory(GiB)": 21.32, "step": 6173, "token_acc": 0.9810126582278481, "train_speed(iter/s)": 0.950153 }, { "epoch": 0.20056524705194426, "grad_norm": 0.4460882246494293, "learning_rate": 9.393096267135408e-06, "loss": 0.03746978938579559, "memory(GiB)": 21.32, "step": 6174, "token_acc": 0.9875, "train_speed(iter/s)": 0.950178 }, { "epoch": 0.20059773251469967, "grad_norm": 0.47562292218208313, "learning_rate": 9.392839738908513e-06, "loss": 0.0450238361954689, "memory(GiB)": 21.32, "step": 6175, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.950201 }, { "epoch": 0.2006302179774551, "grad_norm": 0.6866028904914856, "learning_rate": 9.392583159982398e-06, "loss": 0.03943878039717674, "memory(GiB)": 21.32, "step": 6176, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.950226 }, { "epoch": 0.2006627034402105, "grad_norm": 0.494311660528183, "learning_rate": 9.392326530360028e-06, "loss": 0.04279470443725586, "memory(GiB)": 21.32, "step": 6177, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.950249 }, { "epoch": 0.20069518890296592, "grad_norm": 0.44746044278144836, "learning_rate": 9.392069850044362e-06, "loss": 0.037029772996902466, "memory(GiB)": 21.32, "step": 6178, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.950275 }, { "epoch": 0.20072767436572134, "grad_norm": 0.4807482957839966, "learning_rate": 9.391813119038366e-06, "loss": 0.038953304290771484, "memory(GiB)": 21.32, "step": 6179, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.950301 }, { "epoch": 0.20076015982847675, "grad_norm": 0.6079766750335693, "learning_rate": 9.391556337345003e-06, "loss": 0.038339436054229736, "memory(GiB)": 21.32, "step": 6180, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.950327 }, { "epoch": 0.20079264529123217, "grad_norm": 0.5909979939460754, "learning_rate": 9.391299504967231e-06, "loss": 0.036706771701574326, "memory(GiB)": 21.32, "step": 6181, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.95035 }, { "epoch": 0.20082513075398759, "grad_norm": 0.4867231249809265, "learning_rate": 9.39104262190802e-06, "loss": 0.041077714413404465, "memory(GiB)": 21.32, "step": 6182, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.950373 }, { "epoch": 0.200857616216743, "grad_norm": 0.4986148774623871, "learning_rate": 9.390785688170331e-06, "loss": 0.04215402156114578, "memory(GiB)": 21.32, "step": 6183, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.950397 }, { "epoch": 0.20089010167949842, "grad_norm": 0.8894439935684204, "learning_rate": 9.390528703757134e-06, "loss": 0.04256560653448105, "memory(GiB)": 21.32, "step": 6184, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.950422 }, { "epoch": 0.20092258714225383, "grad_norm": 0.5023501515388489, "learning_rate": 9.39027166867139e-06, "loss": 0.03753603249788284, "memory(GiB)": 21.32, "step": 6185, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95045 }, { "epoch": 0.20095507260500925, "grad_norm": 0.5273312330245972, "learning_rate": 9.390014582916067e-06, "loss": 0.04995378106832504, "memory(GiB)": 21.32, "step": 6186, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.950478 }, { "epoch": 0.20098755806776467, "grad_norm": 0.46128448843955994, "learning_rate": 9.389757446494131e-06, "loss": 0.04349863529205322, "memory(GiB)": 21.32, "step": 6187, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.950503 }, { "epoch": 0.20102004353052008, "grad_norm": 0.6284477710723877, "learning_rate": 9.389500259408555e-06, "loss": 0.04861962050199509, "memory(GiB)": 21.32, "step": 6188, "token_acc": 0.9875, "train_speed(iter/s)": 0.950527 }, { "epoch": 0.2010525289932755, "grad_norm": 0.7444373965263367, "learning_rate": 9.389243021662303e-06, "loss": 0.05485178157687187, "memory(GiB)": 21.32, "step": 6189, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.95055 }, { "epoch": 0.2010850144560309, "grad_norm": 0.5646063089370728, "learning_rate": 9.388985733258343e-06, "loss": 0.04657568782567978, "memory(GiB)": 21.32, "step": 6190, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.950576 }, { "epoch": 0.20111749991878633, "grad_norm": 0.616741955280304, "learning_rate": 9.388728394199647e-06, "loss": 0.0429762527346611, "memory(GiB)": 21.32, "step": 6191, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.950602 }, { "epoch": 0.20114998538154177, "grad_norm": 0.4024827480316162, "learning_rate": 9.388471004489184e-06, "loss": 0.0371837317943573, "memory(GiB)": 21.32, "step": 6192, "token_acc": 0.96415770609319, "train_speed(iter/s)": 0.950629 }, { "epoch": 0.2011824708442972, "grad_norm": 0.4369083344936371, "learning_rate": 9.388213564129923e-06, "loss": 0.04157131910324097, "memory(GiB)": 21.32, "step": 6193, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.950655 }, { "epoch": 0.2012149563070526, "grad_norm": 11.43652629852295, "learning_rate": 9.387956073124837e-06, "loss": 0.043554503470659256, "memory(GiB)": 21.32, "step": 6194, "token_acc": 1.0, "train_speed(iter/s)": 0.95068 }, { "epoch": 0.20124744176980802, "grad_norm": 0.465452641248703, "learning_rate": 9.387698531476899e-06, "loss": 0.03959083557128906, "memory(GiB)": 21.32, "step": 6195, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.950708 }, { "epoch": 0.20127992723256344, "grad_norm": 0.403774231672287, "learning_rate": 9.38744093918908e-06, "loss": 0.0382300540804863, "memory(GiB)": 21.32, "step": 6196, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.950736 }, { "epoch": 0.20131241269531885, "grad_norm": 0.882256269454956, "learning_rate": 9.387183296264351e-06, "loss": 0.05924943462014198, "memory(GiB)": 21.32, "step": 6197, "token_acc": 0.9695652173913043, "train_speed(iter/s)": 0.950762 }, { "epoch": 0.20134489815807427, "grad_norm": 0.6957715749740601, "learning_rate": 9.38692560270569e-06, "loss": 0.04398350417613983, "memory(GiB)": 21.32, "step": 6198, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.950788 }, { "epoch": 0.2013773836208297, "grad_norm": 0.525568425655365, "learning_rate": 9.386667858516067e-06, "loss": 0.03856214880943298, "memory(GiB)": 21.32, "step": 6199, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.950809 }, { "epoch": 0.2014098690835851, "grad_norm": 0.4727969765663147, "learning_rate": 9.38641006369846e-06, "loss": 0.03937660902738571, "memory(GiB)": 21.32, "step": 6200, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.950833 }, { "epoch": 0.20144235454634052, "grad_norm": 0.6452562808990479, "learning_rate": 9.386152218255839e-06, "loss": 0.05013487860560417, "memory(GiB)": 21.32, "step": 6201, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.950859 }, { "epoch": 0.20147484000909593, "grad_norm": 0.428693562746048, "learning_rate": 9.385894322191185e-06, "loss": 0.039617329835891724, "memory(GiB)": 21.32, "step": 6202, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.950884 }, { "epoch": 0.20150732547185135, "grad_norm": 0.615815281867981, "learning_rate": 9.385636375507473e-06, "loss": 0.05126524716615677, "memory(GiB)": 21.32, "step": 6203, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.950912 }, { "epoch": 0.20153981093460677, "grad_norm": 0.9398365616798401, "learning_rate": 9.385378378207682e-06, "loss": 0.040003057569265366, "memory(GiB)": 21.32, "step": 6204, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.950942 }, { "epoch": 0.20157229639736218, "grad_norm": 0.7900395393371582, "learning_rate": 9.385120330294786e-06, "loss": 0.061272576451301575, "memory(GiB)": 21.32, "step": 6205, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.950976 }, { "epoch": 0.2016047818601176, "grad_norm": 0.49904686212539673, "learning_rate": 9.384862231771764e-06, "loss": 0.03927481919527054, "memory(GiB)": 21.32, "step": 6206, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.951008 }, { "epoch": 0.20163726732287302, "grad_norm": 0.5948041081428528, "learning_rate": 9.384604082641597e-06, "loss": 0.0418492928147316, "memory(GiB)": 21.32, "step": 6207, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.951042 }, { "epoch": 0.20166975278562843, "grad_norm": 0.4646502137184143, "learning_rate": 9.384345882907262e-06, "loss": 0.03730155527591705, "memory(GiB)": 21.32, "step": 6208, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.951074 }, { "epoch": 0.20170223824838385, "grad_norm": 0.5159799456596375, "learning_rate": 9.38408763257174e-06, "loss": 0.04695894569158554, "memory(GiB)": 21.32, "step": 6209, "token_acc": 0.992, "train_speed(iter/s)": 0.951106 }, { "epoch": 0.20173472371113926, "grad_norm": 0.5243057608604431, "learning_rate": 9.383829331638011e-06, "loss": 0.037100084125995636, "memory(GiB)": 21.32, "step": 6210, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.951138 }, { "epoch": 0.20176720917389468, "grad_norm": 0.5968567132949829, "learning_rate": 9.383570980109058e-06, "loss": 0.03974924609065056, "memory(GiB)": 21.32, "step": 6211, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.951171 }, { "epoch": 0.2017996946366501, "grad_norm": 0.6594251990318298, "learning_rate": 9.383312577987861e-06, "loss": 0.04458273574709892, "memory(GiB)": 21.32, "step": 6212, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.951199 }, { "epoch": 0.2018321800994055, "grad_norm": 0.473726361989975, "learning_rate": 9.383054125277402e-06, "loss": 0.033293455839157104, "memory(GiB)": 21.32, "step": 6213, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.951233 }, { "epoch": 0.20186466556216093, "grad_norm": 0.5392098426818848, "learning_rate": 9.382795621980665e-06, "loss": 0.03209864720702171, "memory(GiB)": 21.32, "step": 6214, "token_acc": 0.9809160305343512, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.20189715102491634, "grad_norm": 0.63885098695755, "learning_rate": 9.382537068100634e-06, "loss": 0.0448903888463974, "memory(GiB)": 21.32, "step": 6215, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.951295 }, { "epoch": 0.20192963648767176, "grad_norm": 0.601873517036438, "learning_rate": 9.38227846364029e-06, "loss": 0.03964708745479584, "memory(GiB)": 21.32, "step": 6216, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.951324 }, { "epoch": 0.20196212195042718, "grad_norm": 0.6216142177581787, "learning_rate": 9.382019808602622e-06, "loss": 0.04088033735752106, "memory(GiB)": 21.32, "step": 6217, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.951356 }, { "epoch": 0.2019946074131826, "grad_norm": 0.6725082397460938, "learning_rate": 9.381761102990611e-06, "loss": 0.04217282682657242, "memory(GiB)": 21.32, "step": 6218, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.951389 }, { "epoch": 0.202027092875938, "grad_norm": 0.6971205472946167, "learning_rate": 9.381502346807246e-06, "loss": 0.05248140171170235, "memory(GiB)": 21.32, "step": 6219, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.951422 }, { "epoch": 0.20205957833869342, "grad_norm": 0.59047931432724, "learning_rate": 9.381243540055512e-06, "loss": 0.03903544694185257, "memory(GiB)": 21.32, "step": 6220, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.951455 }, { "epoch": 0.20209206380144884, "grad_norm": 1.2211661338806152, "learning_rate": 9.380984682738396e-06, "loss": 0.04434981942176819, "memory(GiB)": 21.32, "step": 6221, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.951489 }, { "epoch": 0.20212454926420426, "grad_norm": 0.7681014537811279, "learning_rate": 9.380725774858886e-06, "loss": 0.047106318175792694, "memory(GiB)": 21.32, "step": 6222, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.951522 }, { "epoch": 0.20215703472695967, "grad_norm": 2.1170449256896973, "learning_rate": 9.380466816419968e-06, "loss": 0.04381120204925537, "memory(GiB)": 21.32, "step": 6223, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.951551 }, { "epoch": 0.20218952018971512, "grad_norm": 0.5489096641540527, "learning_rate": 9.380207807424634e-06, "loss": 0.04371961951255798, "memory(GiB)": 21.32, "step": 6224, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.951575 }, { "epoch": 0.20222200565247053, "grad_norm": 0.9836611747741699, "learning_rate": 9.379948747875872e-06, "loss": 0.03140588849782944, "memory(GiB)": 21.32, "step": 6225, "token_acc": 0.9817073170731707, "train_speed(iter/s)": 0.951599 }, { "epoch": 0.20225449111522595, "grad_norm": 0.968681812286377, "learning_rate": 9.379689637776672e-06, "loss": 0.05591814965009689, "memory(GiB)": 21.32, "step": 6226, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.951627 }, { "epoch": 0.20228697657798136, "grad_norm": 1.0216642618179321, "learning_rate": 9.379430477130024e-06, "loss": 0.033160798251628876, "memory(GiB)": 21.32, "step": 6227, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.951654 }, { "epoch": 0.20231946204073678, "grad_norm": 0.6232711672782898, "learning_rate": 9.379171265938918e-06, "loss": 0.0470992811024189, "memory(GiB)": 21.32, "step": 6228, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.951677 }, { "epoch": 0.2023519475034922, "grad_norm": 0.8421478867530823, "learning_rate": 9.378912004206349e-06, "loss": 0.04187764227390289, "memory(GiB)": 21.32, "step": 6229, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.951704 }, { "epoch": 0.2023844329662476, "grad_norm": 0.6030786633491516, "learning_rate": 9.378652691935307e-06, "loss": 0.0456923246383667, "memory(GiB)": 21.32, "step": 6230, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.951731 }, { "epoch": 0.20241691842900303, "grad_norm": 0.4780420660972595, "learning_rate": 9.378393329128783e-06, "loss": 0.033846259117126465, "memory(GiB)": 21.32, "step": 6231, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.951753 }, { "epoch": 0.20244940389175844, "grad_norm": 0.7037742733955383, "learning_rate": 9.378133915789774e-06, "loss": 0.042932458221912384, "memory(GiB)": 21.32, "step": 6232, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.951777 }, { "epoch": 0.20248188935451386, "grad_norm": 0.5046917796134949, "learning_rate": 9.377874451921272e-06, "loss": 0.042615510523319244, "memory(GiB)": 21.32, "step": 6233, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951802 }, { "epoch": 0.20251437481726928, "grad_norm": 0.4656340479850769, "learning_rate": 9.377614937526271e-06, "loss": 0.041717156767845154, "memory(GiB)": 21.32, "step": 6234, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.951827 }, { "epoch": 0.2025468602800247, "grad_norm": 0.9275957345962524, "learning_rate": 9.377355372607769e-06, "loss": 0.06481868773698807, "memory(GiB)": 21.32, "step": 6235, "token_acc": 0.9675675675675676, "train_speed(iter/s)": 0.95185 }, { "epoch": 0.2025793457427801, "grad_norm": 0.693354070186615, "learning_rate": 9.377095757168757e-06, "loss": 0.05773818492889404, "memory(GiB)": 21.32, "step": 6236, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.951875 }, { "epoch": 0.20261183120553553, "grad_norm": 0.3686501383781433, "learning_rate": 9.376836091212236e-06, "loss": 0.03679044544696808, "memory(GiB)": 21.32, "step": 6237, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.951898 }, { "epoch": 0.20264431666829094, "grad_norm": 0.5219729542732239, "learning_rate": 9.3765763747412e-06, "loss": 0.047561369836330414, "memory(GiB)": 21.32, "step": 6238, "token_acc": 0.9798657718120806, "train_speed(iter/s)": 0.951924 }, { "epoch": 0.20267680213104636, "grad_norm": 0.5712352991104126, "learning_rate": 9.37631660775865e-06, "loss": 0.04127344489097595, "memory(GiB)": 21.32, "step": 6239, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.95195 }, { "epoch": 0.20270928759380177, "grad_norm": 0.5580271482467651, "learning_rate": 9.376056790267579e-06, "loss": 0.04693355783820152, "memory(GiB)": 21.32, "step": 6240, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.951977 }, { "epoch": 0.2027417730565572, "grad_norm": 0.522024929523468, "learning_rate": 9.375796922270988e-06, "loss": 0.03761421889066696, "memory(GiB)": 21.32, "step": 6241, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.951998 }, { "epoch": 0.2027742585193126, "grad_norm": 0.6659374833106995, "learning_rate": 9.375537003771876e-06, "loss": 0.036192819476127625, "memory(GiB)": 21.32, "step": 6242, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.952019 }, { "epoch": 0.20280674398206802, "grad_norm": 0.8480783700942993, "learning_rate": 9.375277034773244e-06, "loss": 0.04682307690382004, "memory(GiB)": 21.32, "step": 6243, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.952042 }, { "epoch": 0.20283922944482344, "grad_norm": 0.5032973289489746, "learning_rate": 9.375017015278092e-06, "loss": 0.05026683211326599, "memory(GiB)": 21.32, "step": 6244, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.952067 }, { "epoch": 0.20287171490757885, "grad_norm": 0.48203006386756897, "learning_rate": 9.37475694528942e-06, "loss": 0.0361366868019104, "memory(GiB)": 21.32, "step": 6245, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.952091 }, { "epoch": 0.20290420037033427, "grad_norm": 0.45262449979782104, "learning_rate": 9.37449682481023e-06, "loss": 0.045289475470781326, "memory(GiB)": 21.32, "step": 6246, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.952116 }, { "epoch": 0.20293668583308969, "grad_norm": 0.5191052556037903, "learning_rate": 9.374236653843525e-06, "loss": 0.03691261261701584, "memory(GiB)": 21.32, "step": 6247, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952142 }, { "epoch": 0.2029691712958451, "grad_norm": 0.6143032908439636, "learning_rate": 9.373976432392306e-06, "loss": 0.03728489950299263, "memory(GiB)": 21.32, "step": 6248, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.952167 }, { "epoch": 0.20300165675860052, "grad_norm": 0.4420914947986603, "learning_rate": 9.373716160459577e-06, "loss": 0.03866322338581085, "memory(GiB)": 21.32, "step": 6249, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.952191 }, { "epoch": 0.20303414222135593, "grad_norm": 0.5451327562332153, "learning_rate": 9.373455838048341e-06, "loss": 0.050841376185417175, "memory(GiB)": 21.32, "step": 6250, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952217 }, { "epoch": 0.20306662768411135, "grad_norm": 0.6641612648963928, "learning_rate": 9.373195465161606e-06, "loss": 0.04667629674077034, "memory(GiB)": 21.32, "step": 6251, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.952241 }, { "epoch": 0.20309911314686677, "grad_norm": 0.6555724143981934, "learning_rate": 9.372935041802374e-06, "loss": 0.050256434828042984, "memory(GiB)": 21.32, "step": 6252, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.952262 }, { "epoch": 0.20313159860962218, "grad_norm": 0.6618660092353821, "learning_rate": 9.372674567973651e-06, "loss": 0.046323012560606, "memory(GiB)": 21.32, "step": 6253, "token_acc": 0.9598214285714286, "train_speed(iter/s)": 0.952286 }, { "epoch": 0.2031640840723776, "grad_norm": 0.7173216938972473, "learning_rate": 9.372414043678443e-06, "loss": 0.04065518081188202, "memory(GiB)": 21.32, "step": 6254, "token_acc": 0.972, "train_speed(iter/s)": 0.952312 }, { "epoch": 0.20319656953513301, "grad_norm": 0.7788598537445068, "learning_rate": 9.372153468919758e-06, "loss": 0.04602250084280968, "memory(GiB)": 21.32, "step": 6255, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952336 }, { "epoch": 0.20322905499788846, "grad_norm": 0.6851760745048523, "learning_rate": 9.371892843700603e-06, "loss": 0.0496046207845211, "memory(GiB)": 21.32, "step": 6256, "token_acc": 0.975, "train_speed(iter/s)": 0.952362 }, { "epoch": 0.20326154046064387, "grad_norm": 0.6346796154975891, "learning_rate": 9.371632168023985e-06, "loss": 0.04274607449769974, "memory(GiB)": 21.32, "step": 6257, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.952389 }, { "epoch": 0.2032940259233993, "grad_norm": 0.8637452125549316, "learning_rate": 9.371371441892913e-06, "loss": 0.03455084562301636, "memory(GiB)": 21.32, "step": 6258, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.952412 }, { "epoch": 0.2033265113861547, "grad_norm": 0.49307480454444885, "learning_rate": 9.371110665310397e-06, "loss": 0.036056071519851685, "memory(GiB)": 21.32, "step": 6259, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.952437 }, { "epoch": 0.20335899684891012, "grad_norm": 1.0906870365142822, "learning_rate": 9.370849838279445e-06, "loss": 0.04676785320043564, "memory(GiB)": 21.32, "step": 6260, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952461 }, { "epoch": 0.20339148231166554, "grad_norm": 0.8584520816802979, "learning_rate": 9.370588960803069e-06, "loss": 0.04696305841207504, "memory(GiB)": 21.32, "step": 6261, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.95248 }, { "epoch": 0.20342396777442096, "grad_norm": 0.4585702121257782, "learning_rate": 9.37032803288428e-06, "loss": 0.033362481743097305, "memory(GiB)": 21.32, "step": 6262, "token_acc": 0.9875, "train_speed(iter/s)": 0.952504 }, { "epoch": 0.20345645323717637, "grad_norm": 0.8217877149581909, "learning_rate": 9.370067054526086e-06, "loss": 0.04292280972003937, "memory(GiB)": 21.32, "step": 6263, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.952531 }, { "epoch": 0.2034889386999318, "grad_norm": 0.5214129686355591, "learning_rate": 9.369806025731503e-06, "loss": 0.0385764054954052, "memory(GiB)": 21.32, "step": 6264, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.952555 }, { "epoch": 0.2035214241626872, "grad_norm": 0.4995960295200348, "learning_rate": 9.369544946503542e-06, "loss": 0.03732974827289581, "memory(GiB)": 21.32, "step": 6265, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.952579 }, { "epoch": 0.20355390962544262, "grad_norm": 0.6717448234558105, "learning_rate": 9.369283816845217e-06, "loss": 0.03221764415502548, "memory(GiB)": 21.32, "step": 6266, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.952612 }, { "epoch": 0.20358639508819804, "grad_norm": 0.5902312994003296, "learning_rate": 9.369022636759542e-06, "loss": 0.045048121362924576, "memory(GiB)": 21.32, "step": 6267, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.952644 }, { "epoch": 0.20361888055095345, "grad_norm": 0.7394847869873047, "learning_rate": 9.36876140624953e-06, "loss": 0.0411406084895134, "memory(GiB)": 21.32, "step": 6268, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.952676 }, { "epoch": 0.20365136601370887, "grad_norm": 0.45926547050476074, "learning_rate": 9.368500125318194e-06, "loss": 0.04448210075497627, "memory(GiB)": 21.32, "step": 6269, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.952709 }, { "epoch": 0.20368385147646428, "grad_norm": 0.5000710487365723, "learning_rate": 9.368238793968556e-06, "loss": 0.03979334980249405, "memory(GiB)": 21.32, "step": 6270, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.952742 }, { "epoch": 0.2037163369392197, "grad_norm": 0.503614604473114, "learning_rate": 9.367977412203626e-06, "loss": 0.035193875432014465, "memory(GiB)": 21.32, "step": 6271, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.952775 }, { "epoch": 0.20374882240197512, "grad_norm": 0.45674049854278564, "learning_rate": 9.367715980026423e-06, "loss": 0.042436886578798294, "memory(GiB)": 21.32, "step": 6272, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.952807 }, { "epoch": 0.20378130786473053, "grad_norm": 0.6561258435249329, "learning_rate": 9.367454497439965e-06, "loss": 0.03845548257231712, "memory(GiB)": 21.32, "step": 6273, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.95284 }, { "epoch": 0.20381379332748595, "grad_norm": 0.5467458367347717, "learning_rate": 9.367192964447269e-06, "loss": 0.03915267437696457, "memory(GiB)": 21.32, "step": 6274, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.952872 }, { "epoch": 0.20384627879024136, "grad_norm": 0.3887276351451874, "learning_rate": 9.366931381051352e-06, "loss": 0.03405389562249184, "memory(GiB)": 21.32, "step": 6275, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.952903 }, { "epoch": 0.20387876425299678, "grad_norm": 0.48439621925354004, "learning_rate": 9.366669747255237e-06, "loss": 0.049115777015686035, "memory(GiB)": 21.32, "step": 6276, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.952936 }, { "epoch": 0.2039112497157522, "grad_norm": 0.5735440254211426, "learning_rate": 9.36640806306194e-06, "loss": 0.040272172540426254, "memory(GiB)": 21.32, "step": 6277, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.952969 }, { "epoch": 0.2039437351785076, "grad_norm": 0.6598528623580933, "learning_rate": 9.366146328474482e-06, "loss": 0.037245430052280426, "memory(GiB)": 21.32, "step": 6278, "token_acc": 0.9864406779661017, "train_speed(iter/s)": 0.953001 }, { "epoch": 0.20397622064126303, "grad_norm": 0.6251087784767151, "learning_rate": 9.365884543495883e-06, "loss": 0.04268500953912735, "memory(GiB)": 21.32, "step": 6279, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.953034 }, { "epoch": 0.20400870610401844, "grad_norm": 0.6046212315559387, "learning_rate": 9.365622708129167e-06, "loss": 0.02420804649591446, "memory(GiB)": 21.32, "step": 6280, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953065 }, { "epoch": 0.20404119156677386, "grad_norm": 0.6492069363594055, "learning_rate": 9.365360822377354e-06, "loss": 0.04922935739159584, "memory(GiB)": 21.32, "step": 6281, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.953097 }, { "epoch": 0.20407367702952928, "grad_norm": 0.6527352333068848, "learning_rate": 9.365098886243467e-06, "loss": 0.04413517192006111, "memory(GiB)": 21.32, "step": 6282, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.953129 }, { "epoch": 0.2041061624922847, "grad_norm": 0.45304474234580994, "learning_rate": 9.364836899730527e-06, "loss": 0.038646150380373, "memory(GiB)": 21.32, "step": 6283, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953156 }, { "epoch": 0.2041386479550401, "grad_norm": 0.4211592376232147, "learning_rate": 9.364574862841563e-06, "loss": 0.03438996151089668, "memory(GiB)": 21.32, "step": 6284, "token_acc": 0.98046875, "train_speed(iter/s)": 0.953179 }, { "epoch": 0.20417113341779553, "grad_norm": 1.1719162464141846, "learning_rate": 9.364312775579593e-06, "loss": 0.04702066257596016, "memory(GiB)": 21.32, "step": 6285, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.953207 }, { "epoch": 0.20420361888055094, "grad_norm": 0.47335997223854065, "learning_rate": 9.364050637947645e-06, "loss": 0.03520243614912033, "memory(GiB)": 21.32, "step": 6286, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.95323 }, { "epoch": 0.20423610434330636, "grad_norm": 0.5744709372520447, "learning_rate": 9.363788449948745e-06, "loss": 0.0437634214758873, "memory(GiB)": 21.32, "step": 6287, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.953251 }, { "epoch": 0.2042685898060618, "grad_norm": 0.46883925795555115, "learning_rate": 9.363526211585919e-06, "loss": 0.04508630186319351, "memory(GiB)": 21.32, "step": 6288, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95328 }, { "epoch": 0.20430107526881722, "grad_norm": 0.5446876883506775, "learning_rate": 9.36326392286219e-06, "loss": 0.04114075005054474, "memory(GiB)": 21.32, "step": 6289, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.953307 }, { "epoch": 0.20433356073157263, "grad_norm": 0.4209229052066803, "learning_rate": 9.363001583780589e-06, "loss": 0.041975706815719604, "memory(GiB)": 21.32, "step": 6290, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.953332 }, { "epoch": 0.20436604619432805, "grad_norm": 0.5696906447410583, "learning_rate": 9.362739194344143e-06, "loss": 0.03336164355278015, "memory(GiB)": 21.32, "step": 6291, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953358 }, { "epoch": 0.20439853165708347, "grad_norm": 0.5438201427459717, "learning_rate": 9.36247675455588e-06, "loss": 0.042868904769420624, "memory(GiB)": 21.32, "step": 6292, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.953383 }, { "epoch": 0.20443101711983888, "grad_norm": 1.023015022277832, "learning_rate": 9.362214264418829e-06, "loss": 0.040726497769355774, "memory(GiB)": 21.32, "step": 6293, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.953408 }, { "epoch": 0.2044635025825943, "grad_norm": 0.5789351463317871, "learning_rate": 9.36195172393602e-06, "loss": 0.03506834805011749, "memory(GiB)": 21.32, "step": 6294, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.953435 }, { "epoch": 0.2044959880453497, "grad_norm": 0.40201041102409363, "learning_rate": 9.36168913311048e-06, "loss": 0.03742247074842453, "memory(GiB)": 21.32, "step": 6295, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953462 }, { "epoch": 0.20452847350810513, "grad_norm": 0.6824280023574829, "learning_rate": 9.361426491945244e-06, "loss": 0.04931701719760895, "memory(GiB)": 21.32, "step": 6296, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.953489 }, { "epoch": 0.20456095897086055, "grad_norm": 0.6119515895843506, "learning_rate": 9.36116380044334e-06, "loss": 0.045030735433101654, "memory(GiB)": 21.32, "step": 6297, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.953514 }, { "epoch": 0.20459344443361596, "grad_norm": 0.6909216642379761, "learning_rate": 9.3609010586078e-06, "loss": 0.051013506948947906, "memory(GiB)": 21.32, "step": 6298, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.953541 }, { "epoch": 0.20462592989637138, "grad_norm": 2.66754150390625, "learning_rate": 9.360638266441661e-06, "loss": 0.044989973306655884, "memory(GiB)": 21.32, "step": 6299, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953563 }, { "epoch": 0.2046584153591268, "grad_norm": 1.0177264213562012, "learning_rate": 9.36037542394795e-06, "loss": 0.038925252854824066, "memory(GiB)": 21.32, "step": 6300, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.2046909008218822, "grad_norm": 0.5260764956474304, "learning_rate": 9.360112531129704e-06, "loss": 0.02930460311472416, "memory(GiB)": 21.32, "step": 6301, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953612 }, { "epoch": 0.20472338628463763, "grad_norm": 0.5833768248558044, "learning_rate": 9.359849587989955e-06, "loss": 0.046184610575437546, "memory(GiB)": 21.32, "step": 6302, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.953638 }, { "epoch": 0.20475587174739304, "grad_norm": 0.5538716316223145, "learning_rate": 9.359586594531739e-06, "loss": 0.04107218608260155, "memory(GiB)": 21.32, "step": 6303, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95367 }, { "epoch": 0.20478835721014846, "grad_norm": 0.5033968091011047, "learning_rate": 9.359323550758092e-06, "loss": 0.035943299531936646, "memory(GiB)": 21.32, "step": 6304, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.953695 }, { "epoch": 0.20482084267290387, "grad_norm": 0.6436828970909119, "learning_rate": 9.359060456672047e-06, "loss": 0.043505482375621796, "memory(GiB)": 21.32, "step": 6305, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953721 }, { "epoch": 0.2048533281356593, "grad_norm": 0.504753053188324, "learning_rate": 9.358797312276644e-06, "loss": 0.04112116992473602, "memory(GiB)": 21.32, "step": 6306, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.953747 }, { "epoch": 0.2048858135984147, "grad_norm": 0.49397239089012146, "learning_rate": 9.358534117574919e-06, "loss": 0.03974474221467972, "memory(GiB)": 21.32, "step": 6307, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.953773 }, { "epoch": 0.20491829906117012, "grad_norm": 3.2043955326080322, "learning_rate": 9.358270872569908e-06, "loss": 0.04023024067282677, "memory(GiB)": 21.32, "step": 6308, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.953798 }, { "epoch": 0.20495078452392554, "grad_norm": 0.4894058406352997, "learning_rate": 9.358007577264651e-06, "loss": 0.04317154362797737, "memory(GiB)": 21.32, "step": 6309, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.953822 }, { "epoch": 0.20498326998668095, "grad_norm": 0.5740326642990112, "learning_rate": 9.357744231662185e-06, "loss": 0.047704558819532394, "memory(GiB)": 21.32, "step": 6310, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953846 }, { "epoch": 0.20501575544943637, "grad_norm": 0.48960694670677185, "learning_rate": 9.35748083576555e-06, "loss": 0.039912499487400055, "memory(GiB)": 21.32, "step": 6311, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.953872 }, { "epoch": 0.2050482409121918, "grad_norm": 0.5039865970611572, "learning_rate": 9.357217389577789e-06, "loss": 0.04757658392190933, "memory(GiB)": 21.32, "step": 6312, "token_acc": 0.9826589595375722, "train_speed(iter/s)": 0.953896 }, { "epoch": 0.2050807263749472, "grad_norm": 0.6941320896148682, "learning_rate": 9.35695389310194e-06, "loss": 0.04695020616054535, "memory(GiB)": 21.32, "step": 6313, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.20511321183770262, "grad_norm": 0.6095650792121887, "learning_rate": 9.356690346341041e-06, "loss": 0.04424591362476349, "memory(GiB)": 21.32, "step": 6314, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953945 }, { "epoch": 0.20514569730045804, "grad_norm": 0.9486714601516724, "learning_rate": 9.356426749298137e-06, "loss": 0.04387495294213295, "memory(GiB)": 21.32, "step": 6315, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.953969 }, { "epoch": 0.20517818276321345, "grad_norm": 0.4307842552661896, "learning_rate": 9.356163101976273e-06, "loss": 0.03960122913122177, "memory(GiB)": 21.32, "step": 6316, "token_acc": 0.988, "train_speed(iter/s)": 0.953994 }, { "epoch": 0.20521066822596887, "grad_norm": 0.7444605231285095, "learning_rate": 9.355899404378486e-06, "loss": 0.04009994864463806, "memory(GiB)": 21.32, "step": 6317, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95402 }, { "epoch": 0.20524315368872428, "grad_norm": 0.4586770534515381, "learning_rate": 9.355635656507822e-06, "loss": 0.04059834033250809, "memory(GiB)": 21.32, "step": 6318, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.954046 }, { "epoch": 0.2052756391514797, "grad_norm": 0.5992458462715149, "learning_rate": 9.355371858367326e-06, "loss": 0.04142111912369728, "memory(GiB)": 21.32, "step": 6319, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.20530812461423514, "grad_norm": 0.6812293529510498, "learning_rate": 9.355108009960042e-06, "loss": 0.05432652309536934, "memory(GiB)": 21.32, "step": 6320, "token_acc": 0.982078853046595, "train_speed(iter/s)": 0.954099 }, { "epoch": 0.20534061007699056, "grad_norm": 1.736791729927063, "learning_rate": 9.354844111289014e-06, "loss": 0.040987949818372726, "memory(GiB)": 21.32, "step": 6321, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954124 }, { "epoch": 0.20537309553974598, "grad_norm": 2.1185994148254395, "learning_rate": 9.35458016235729e-06, "loss": 0.04730278253555298, "memory(GiB)": 21.32, "step": 6322, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.954143 }, { "epoch": 0.2054055810025014, "grad_norm": 0.48589709401130676, "learning_rate": 9.354316163167914e-06, "loss": 0.044306278228759766, "memory(GiB)": 21.32, "step": 6323, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.954165 }, { "epoch": 0.2054380664652568, "grad_norm": 1.6690346002578735, "learning_rate": 9.354052113723936e-06, "loss": 0.048522479832172394, "memory(GiB)": 21.32, "step": 6324, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954189 }, { "epoch": 0.20547055192801222, "grad_norm": 0.5772044062614441, "learning_rate": 9.3537880140284e-06, "loss": 0.04378633201122284, "memory(GiB)": 21.32, "step": 6325, "token_acc": 0.9867109634551495, "train_speed(iter/s)": 0.954216 }, { "epoch": 0.20550303739076764, "grad_norm": 0.634513258934021, "learning_rate": 9.353523864084353e-06, "loss": 0.0420980378985405, "memory(GiB)": 21.32, "step": 6326, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.954241 }, { "epoch": 0.20553552285352306, "grad_norm": 0.511182427406311, "learning_rate": 9.35325966389485e-06, "loss": 0.04495452344417572, "memory(GiB)": 21.32, "step": 6327, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954266 }, { "epoch": 0.20556800831627847, "grad_norm": 0.6570940613746643, "learning_rate": 9.352995413462933e-06, "loss": 0.049905404448509216, "memory(GiB)": 21.32, "step": 6328, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954294 }, { "epoch": 0.2056004937790339, "grad_norm": 0.5169395804405212, "learning_rate": 9.352731112791657e-06, "loss": 0.03838333487510681, "memory(GiB)": 21.32, "step": 6329, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954328 }, { "epoch": 0.2056329792417893, "grad_norm": 0.8826668858528137, "learning_rate": 9.35246676188407e-06, "loss": 0.053762078285217285, "memory(GiB)": 21.32, "step": 6330, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.95436 }, { "epoch": 0.20566546470454472, "grad_norm": 0.7273550033569336, "learning_rate": 9.352202360743223e-06, "loss": 0.04993853345513344, "memory(GiB)": 21.32, "step": 6331, "token_acc": 0.9601593625498008, "train_speed(iter/s)": 0.954394 }, { "epoch": 0.20569795016730014, "grad_norm": 0.42821088433265686, "learning_rate": 9.351937909372169e-06, "loss": 0.039509423077106476, "memory(GiB)": 21.32, "step": 6332, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.20573043563005555, "grad_norm": 1.9711277484893799, "learning_rate": 9.35167340777396e-06, "loss": 0.04111170768737793, "memory(GiB)": 21.32, "step": 6333, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954458 }, { "epoch": 0.20576292109281097, "grad_norm": 1.4506584405899048, "learning_rate": 9.351408855951645e-06, "loss": 0.04673367738723755, "memory(GiB)": 21.32, "step": 6334, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.954491 }, { "epoch": 0.20579540655556638, "grad_norm": 0.6504762768745422, "learning_rate": 9.351144253908282e-06, "loss": 0.03992946073412895, "memory(GiB)": 21.32, "step": 6335, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954519 }, { "epoch": 0.2058278920183218, "grad_norm": 0.9324586391448975, "learning_rate": 9.350879601646923e-06, "loss": 0.041936516761779785, "memory(GiB)": 21.32, "step": 6336, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954552 }, { "epoch": 0.20586037748107722, "grad_norm": 0.6770215630531311, "learning_rate": 9.350614899170622e-06, "loss": 0.04371272400021553, "memory(GiB)": 21.32, "step": 6337, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.954584 }, { "epoch": 0.20589286294383263, "grad_norm": 1.1319226026535034, "learning_rate": 9.350350146482435e-06, "loss": 0.03269553929567337, "memory(GiB)": 21.32, "step": 6338, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.954616 }, { "epoch": 0.20592534840658805, "grad_norm": 0.5430607795715332, "learning_rate": 9.350085343585416e-06, "loss": 0.041336625814437866, "memory(GiB)": 21.32, "step": 6339, "token_acc": 0.9885714285714285, "train_speed(iter/s)": 0.954646 }, { "epoch": 0.20595783386934347, "grad_norm": 0.8419530391693115, "learning_rate": 9.349820490482622e-06, "loss": 0.052002355456352234, "memory(GiB)": 21.32, "step": 6340, "token_acc": 0.988, "train_speed(iter/s)": 0.954679 }, { "epoch": 0.20599031933209888, "grad_norm": 0.5424681305885315, "learning_rate": 9.34955558717711e-06, "loss": 0.04077550768852234, "memory(GiB)": 21.32, "step": 6341, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.954712 }, { "epoch": 0.2060228047948543, "grad_norm": 0.45908451080322266, "learning_rate": 9.349290633671937e-06, "loss": 0.03821945935487747, "memory(GiB)": 21.32, "step": 6342, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95474 }, { "epoch": 0.2060552902576097, "grad_norm": 0.6188878417015076, "learning_rate": 9.349025629970163e-06, "loss": 0.0347091481089592, "memory(GiB)": 21.32, "step": 6343, "token_acc": 0.9770491803278688, "train_speed(iter/s)": 0.95476 }, { "epoch": 0.20608777572036513, "grad_norm": 0.6828511953353882, "learning_rate": 9.348760576074844e-06, "loss": 0.05843246355652809, "memory(GiB)": 21.32, "step": 6344, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.954784 }, { "epoch": 0.20612026118312055, "grad_norm": 0.6950938701629639, "learning_rate": 9.348495471989039e-06, "loss": 0.03967725858092308, "memory(GiB)": 21.32, "step": 6345, "token_acc": 0.975, "train_speed(iter/s)": 0.954802 }, { "epoch": 0.20615274664587596, "grad_norm": 0.7394180297851562, "learning_rate": 9.34823031771581e-06, "loss": 0.05050714313983917, "memory(GiB)": 21.32, "step": 6346, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.954827 }, { "epoch": 0.20618523210863138, "grad_norm": 0.5916367769241333, "learning_rate": 9.347965113258212e-06, "loss": 0.036021556705236435, "memory(GiB)": 21.32, "step": 6347, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954852 }, { "epoch": 0.2062177175713868, "grad_norm": 0.6264121532440186, "learning_rate": 9.347699858619312e-06, "loss": 0.048874951899051666, "memory(GiB)": 21.32, "step": 6348, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954879 }, { "epoch": 0.2062502030341422, "grad_norm": 0.5704671144485474, "learning_rate": 9.34743455380217e-06, "loss": 0.03758027404546738, "memory(GiB)": 21.32, "step": 6349, "token_acc": 1.0, "train_speed(iter/s)": 0.954901 }, { "epoch": 0.20628268849689763, "grad_norm": 0.41274121403694153, "learning_rate": 9.347169198809845e-06, "loss": 0.03503936156630516, "memory(GiB)": 21.32, "step": 6350, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954926 }, { "epoch": 0.20631517395965304, "grad_norm": 0.8006893396377563, "learning_rate": 9.346903793645403e-06, "loss": 0.044272709637880325, "memory(GiB)": 21.32, "step": 6351, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.20634765942240849, "grad_norm": 0.5632657408714294, "learning_rate": 9.346638338311903e-06, "loss": 0.050747327506542206, "memory(GiB)": 21.32, "step": 6352, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.2063801448851639, "grad_norm": 0.7430873513221741, "learning_rate": 9.346372832812414e-06, "loss": 0.05126018077135086, "memory(GiB)": 21.32, "step": 6353, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.954999 }, { "epoch": 0.20641263034791932, "grad_norm": 0.5476124286651611, "learning_rate": 9.346107277149996e-06, "loss": 0.0429215244948864, "memory(GiB)": 21.32, "step": 6354, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955024 }, { "epoch": 0.20644511581067473, "grad_norm": 0.7096750140190125, "learning_rate": 9.345841671327716e-06, "loss": 0.05103163421154022, "memory(GiB)": 21.32, "step": 6355, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.20647760127343015, "grad_norm": 0.6182129979133606, "learning_rate": 9.345576015348637e-06, "loss": 0.05137263238430023, "memory(GiB)": 21.32, "step": 6356, "token_acc": 0.9658536585365853, "train_speed(iter/s)": 0.955072 }, { "epoch": 0.20651008673618557, "grad_norm": 0.5245370268821716, "learning_rate": 9.345310309215828e-06, "loss": 0.03948380425572395, "memory(GiB)": 21.32, "step": 6357, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.20654257219894098, "grad_norm": 0.455236554145813, "learning_rate": 9.345044552932356e-06, "loss": 0.030853066593408585, "memory(GiB)": 21.32, "step": 6358, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.95512 }, { "epoch": 0.2065750576616964, "grad_norm": 0.7570352554321289, "learning_rate": 9.344778746501284e-06, "loss": 0.04281877726316452, "memory(GiB)": 21.32, "step": 6359, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.20660754312445181, "grad_norm": 0.7314091920852661, "learning_rate": 9.344512889925684e-06, "loss": 0.04430490359663963, "memory(GiB)": 21.32, "step": 6360, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955171 }, { "epoch": 0.20664002858720723, "grad_norm": 0.3629571795463562, "learning_rate": 9.344246983208622e-06, "loss": 0.027828235179185867, "memory(GiB)": 21.32, "step": 6361, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.20667251404996265, "grad_norm": 0.5572319030761719, "learning_rate": 9.343981026353169e-06, "loss": 0.04269729182124138, "memory(GiB)": 21.32, "step": 6362, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.20670499951271806, "grad_norm": 0.4843314588069916, "learning_rate": 9.343715019362392e-06, "loss": 0.03208556026220322, "memory(GiB)": 21.32, "step": 6363, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.20673748497547348, "grad_norm": 0.4693012833595276, "learning_rate": 9.343448962239361e-06, "loss": 0.041460052132606506, "memory(GiB)": 21.32, "step": 6364, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.2067699704382289, "grad_norm": 0.6936850547790527, "learning_rate": 9.343182854987149e-06, "loss": 0.05022449791431427, "memory(GiB)": 21.32, "step": 6365, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.2068024559009843, "grad_norm": 0.441486120223999, "learning_rate": 9.342916697608828e-06, "loss": 0.037071436643600464, "memory(GiB)": 21.32, "step": 6366, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.20683494136373973, "grad_norm": 0.4321296811103821, "learning_rate": 9.342650490107466e-06, "loss": 0.038393646478652954, "memory(GiB)": 21.32, "step": 6367, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.20686742682649514, "grad_norm": 0.5097222924232483, "learning_rate": 9.342384232486138e-06, "loss": 0.05198749899864197, "memory(GiB)": 21.32, "step": 6368, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.20689991228925056, "grad_norm": 0.4700731635093689, "learning_rate": 9.342117924747915e-06, "loss": 0.034762218594551086, "memory(GiB)": 21.32, "step": 6369, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95542 }, { "epoch": 0.20693239775200598, "grad_norm": 0.4625698924064636, "learning_rate": 9.341851566895872e-06, "loss": 0.032895248383283615, "memory(GiB)": 21.32, "step": 6370, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.955442 }, { "epoch": 0.2069648832147614, "grad_norm": 0.4807303249835968, "learning_rate": 9.341585158933082e-06, "loss": 0.04678227752447128, "memory(GiB)": 21.32, "step": 6371, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.2069973686775168, "grad_norm": 0.448188453912735, "learning_rate": 9.341318700862622e-06, "loss": 0.037989191710948944, "memory(GiB)": 21.32, "step": 6372, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.955493 }, { "epoch": 0.20702985414027222, "grad_norm": 0.5356308221817017, "learning_rate": 9.341052192687568e-06, "loss": 0.04447147995233536, "memory(GiB)": 21.32, "step": 6373, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.955518 }, { "epoch": 0.20706233960302764, "grad_norm": 0.427937775850296, "learning_rate": 9.340785634410992e-06, "loss": 0.03381287306547165, "memory(GiB)": 21.32, "step": 6374, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.20709482506578306, "grad_norm": 0.5319185256958008, "learning_rate": 9.34051902603597e-06, "loss": 0.040766749531030655, "memory(GiB)": 21.32, "step": 6375, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.20712731052853847, "grad_norm": 0.5882797241210938, "learning_rate": 9.340252367565585e-06, "loss": 0.0441596582531929, "memory(GiB)": 21.32, "step": 6376, "token_acc": 0.9660377358490566, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.2071597959912939, "grad_norm": 0.5636200308799744, "learning_rate": 9.339985659002908e-06, "loss": 0.03986286371946335, "memory(GiB)": 21.32, "step": 6377, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.955612 }, { "epoch": 0.2071922814540493, "grad_norm": 0.4366548955440521, "learning_rate": 9.33971890035102e-06, "loss": 0.03194937855005264, "memory(GiB)": 21.32, "step": 6378, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.20722476691680472, "grad_norm": 0.6820768713951111, "learning_rate": 9.339452091613001e-06, "loss": 0.03225138783454895, "memory(GiB)": 21.32, "step": 6379, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955663 }, { "epoch": 0.20725725237956014, "grad_norm": 0.6146630644798279, "learning_rate": 9.33918523279193e-06, "loss": 0.03551679849624634, "memory(GiB)": 21.32, "step": 6380, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.20728973784231555, "grad_norm": 0.58941650390625, "learning_rate": 9.338918323890885e-06, "loss": 0.04471465200185776, "memory(GiB)": 21.32, "step": 6381, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955713 }, { "epoch": 0.20732222330507097, "grad_norm": 0.7558193802833557, "learning_rate": 9.338651364912946e-06, "loss": 0.04126589372754097, "memory(GiB)": 21.32, "step": 6382, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.20735470876782638, "grad_norm": 0.3311019241809845, "learning_rate": 9.338384355861196e-06, "loss": 0.02501026727259159, "memory(GiB)": 21.32, "step": 6383, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955765 }, { "epoch": 0.20738719423058183, "grad_norm": 0.5997790098190308, "learning_rate": 9.338117296738717e-06, "loss": 0.03992892801761627, "memory(GiB)": 21.32, "step": 6384, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955786 }, { "epoch": 0.20741967969333724, "grad_norm": 0.7629850506782532, "learning_rate": 9.337850187548591e-06, "loss": 0.03241961821913719, "memory(GiB)": 21.32, "step": 6385, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.95581 }, { "epoch": 0.20745216515609266, "grad_norm": 1.1521867513656616, "learning_rate": 9.3375830282939e-06, "loss": 0.0616278201341629, "memory(GiB)": 21.32, "step": 6386, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.20748465061884808, "grad_norm": 0.910926878452301, "learning_rate": 9.337315818977726e-06, "loss": 0.051124803721904755, "memory(GiB)": 21.32, "step": 6387, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.2075171360816035, "grad_norm": 0.9612460732460022, "learning_rate": 9.337048559603157e-06, "loss": 0.05600584298372269, "memory(GiB)": 21.32, "step": 6388, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.2075496215443589, "grad_norm": 0.8058083653450012, "learning_rate": 9.336781250173273e-06, "loss": 0.04189014434814453, "memory(GiB)": 21.32, "step": 6389, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.20758210700711432, "grad_norm": 0.5815333127975464, "learning_rate": 9.336513890691162e-06, "loss": 0.03451883792877197, "memory(GiB)": 21.32, "step": 6390, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.20761459246986974, "grad_norm": 0.5838140249252319, "learning_rate": 9.336246481159908e-06, "loss": 0.04273693636059761, "memory(GiB)": 21.32, "step": 6391, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.20764707793262516, "grad_norm": 0.6629461646080017, "learning_rate": 9.335979021582597e-06, "loss": 0.046632081270217896, "memory(GiB)": 21.32, "step": 6392, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.20767956339538057, "grad_norm": 0.6643999218940735, "learning_rate": 9.33571151196232e-06, "loss": 0.04347366839647293, "memory(GiB)": 21.32, "step": 6393, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.207712048858136, "grad_norm": 0.499706894159317, "learning_rate": 9.33544395230216e-06, "loss": 0.040156688541173935, "memory(GiB)": 21.32, "step": 6394, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.2077445343208914, "grad_norm": 0.4835924208164215, "learning_rate": 9.335176342605206e-06, "loss": 0.03049497678875923, "memory(GiB)": 21.32, "step": 6395, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.20777701978364682, "grad_norm": 0.36273446679115295, "learning_rate": 9.334908682874546e-06, "loss": 0.026732653379440308, "memory(GiB)": 21.32, "step": 6396, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.95613 }, { "epoch": 0.20780950524640224, "grad_norm": 0.5922853350639343, "learning_rate": 9.334640973113271e-06, "loss": 0.052286915481090546, "memory(GiB)": 21.32, "step": 6397, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.20784199070915765, "grad_norm": 1.508374810218811, "learning_rate": 9.33437321332447e-06, "loss": 0.054775774478912354, "memory(GiB)": 21.32, "step": 6398, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.20787447617191307, "grad_norm": 0.6669032573699951, "learning_rate": 9.334105403511234e-06, "loss": 0.05021314695477486, "memory(GiB)": 21.32, "step": 6399, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.20790696163466849, "grad_norm": 0.5260224342346191, "learning_rate": 9.333837543676652e-06, "loss": 0.047946006059646606, "memory(GiB)": 21.32, "step": 6400, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.956248 }, { "epoch": 0.2079394470974239, "grad_norm": 0.595764696598053, "learning_rate": 9.333569633823816e-06, "loss": 0.04680458828806877, "memory(GiB)": 21.32, "step": 6401, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.20797193256017932, "grad_norm": 0.5240511894226074, "learning_rate": 9.333301673955819e-06, "loss": 0.049382783472537994, "memory(GiB)": 21.32, "step": 6402, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956298 }, { "epoch": 0.20800441802293473, "grad_norm": 0.3382582366466522, "learning_rate": 9.33303366407575e-06, "loss": 0.03158661723136902, "memory(GiB)": 21.32, "step": 6403, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.20803690348569015, "grad_norm": 0.48867034912109375, "learning_rate": 9.332765604186708e-06, "loss": 0.03570722043514252, "memory(GiB)": 21.32, "step": 6404, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.20806938894844557, "grad_norm": 0.44380030035972595, "learning_rate": 9.332497494291786e-06, "loss": 0.03492268919944763, "memory(GiB)": 21.32, "step": 6405, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.20810187441120098, "grad_norm": 0.4501776397228241, "learning_rate": 9.332229334394074e-06, "loss": 0.03546672686934471, "memory(GiB)": 21.32, "step": 6406, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.2081343598739564, "grad_norm": 0.45541906356811523, "learning_rate": 9.33196112449667e-06, "loss": 0.03841998800635338, "memory(GiB)": 21.32, "step": 6407, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.20816684533671181, "grad_norm": 0.6097151637077332, "learning_rate": 9.331692864602668e-06, "loss": 0.049945563077926636, "memory(GiB)": 21.32, "step": 6408, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956445 }, { "epoch": 0.20819933079946723, "grad_norm": 0.47475665807724, "learning_rate": 9.331424554715166e-06, "loss": 0.04205986484885216, "memory(GiB)": 21.32, "step": 6409, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.20823181626222265, "grad_norm": 0.6469448208808899, "learning_rate": 9.331156194837259e-06, "loss": 0.053435854613780975, "memory(GiB)": 21.32, "step": 6410, "token_acc": 0.98046875, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.20826430172497806, "grad_norm": 0.3894121050834656, "learning_rate": 9.330887784972044e-06, "loss": 0.03661070764064789, "memory(GiB)": 21.32, "step": 6411, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956516 }, { "epoch": 0.20829678718773348, "grad_norm": 0.4710025191307068, "learning_rate": 9.330619325122621e-06, "loss": 0.042803503572940826, "memory(GiB)": 21.32, "step": 6412, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.2083292726504889, "grad_norm": 0.7526729702949524, "learning_rate": 9.330350815292086e-06, "loss": 0.03449130803346634, "memory(GiB)": 21.32, "step": 6413, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.2083617581132443, "grad_norm": 0.7705169916152954, "learning_rate": 9.33008225548354e-06, "loss": 0.05120764672756195, "memory(GiB)": 21.32, "step": 6414, "token_acc": 0.966183574879227, "train_speed(iter/s)": 0.956586 }, { "epoch": 0.20839424357599973, "grad_norm": 0.620137631893158, "learning_rate": 9.32981364570008e-06, "loss": 0.045809801667928696, "memory(GiB)": 21.32, "step": 6415, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956609 }, { "epoch": 0.20842672903875517, "grad_norm": 0.5077242255210876, "learning_rate": 9.329544985944807e-06, "loss": 0.02776212804019451, "memory(GiB)": 21.32, "step": 6416, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.2084592145015106, "grad_norm": 0.45083552598953247, "learning_rate": 9.329276276220824e-06, "loss": 0.033178023993968964, "memory(GiB)": 21.32, "step": 6417, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956654 }, { "epoch": 0.208491699964266, "grad_norm": 1.0338220596313477, "learning_rate": 9.329007516531229e-06, "loss": 0.04346241056919098, "memory(GiB)": 21.32, "step": 6418, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.20852418542702142, "grad_norm": 0.6308667659759521, "learning_rate": 9.328738706879128e-06, "loss": 0.049791909754276276, "memory(GiB)": 21.32, "step": 6419, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.20855667088977684, "grad_norm": 0.6026980876922607, "learning_rate": 9.328469847267618e-06, "loss": 0.03872721269726753, "memory(GiB)": 21.32, "step": 6420, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956721 }, { "epoch": 0.20858915635253225, "grad_norm": 0.5687324404716492, "learning_rate": 9.328200937699806e-06, "loss": 0.03773856163024902, "memory(GiB)": 21.32, "step": 6421, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.20862164181528767, "grad_norm": 0.679711103439331, "learning_rate": 9.327931978178795e-06, "loss": 0.037252821028232574, "memory(GiB)": 21.32, "step": 6422, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.956782 }, { "epoch": 0.20865412727804308, "grad_norm": 0.7104858160018921, "learning_rate": 9.327662968707687e-06, "loss": 0.03573719039559364, "memory(GiB)": 21.32, "step": 6423, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.2086866127407985, "grad_norm": 0.49244406819343567, "learning_rate": 9.32739390928959e-06, "loss": 0.040566254407167435, "memory(GiB)": 21.32, "step": 6424, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.20871909820355392, "grad_norm": 0.6700202226638794, "learning_rate": 9.327124799927605e-06, "loss": 0.026106534525752068, "memory(GiB)": 21.32, "step": 6425, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.20875158366630933, "grad_norm": 0.4943089485168457, "learning_rate": 9.326855640624842e-06, "loss": 0.03674931824207306, "memory(GiB)": 21.32, "step": 6426, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.20878406912906475, "grad_norm": 0.5677785277366638, "learning_rate": 9.326586431384405e-06, "loss": 0.04204709455370903, "memory(GiB)": 21.32, "step": 6427, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.956935 }, { "epoch": 0.20881655459182016, "grad_norm": 0.6072689890861511, "learning_rate": 9.326317172209403e-06, "loss": 0.04462587088346481, "memory(GiB)": 21.32, "step": 6428, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.20884904005457558, "grad_norm": 0.8501468300819397, "learning_rate": 9.326047863102943e-06, "loss": 0.035703226923942566, "memory(GiB)": 21.32, "step": 6429, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.208881525517331, "grad_norm": 0.8400242328643799, "learning_rate": 9.325778504068132e-06, "loss": 0.04560079425573349, "memory(GiB)": 21.32, "step": 6430, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.2089140109800864, "grad_norm": 0.409259170293808, "learning_rate": 9.32550909510808e-06, "loss": 0.033658064901828766, "memory(GiB)": 21.32, "step": 6431, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.957032 }, { "epoch": 0.20894649644284183, "grad_norm": 0.5530904531478882, "learning_rate": 9.325239636225895e-06, "loss": 0.04370016232132912, "memory(GiB)": 21.32, "step": 6432, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.20897898190559724, "grad_norm": 0.7052095532417297, "learning_rate": 9.324970127424688e-06, "loss": 0.045251645147800446, "memory(GiB)": 21.32, "step": 6433, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957081 }, { "epoch": 0.20901146736835266, "grad_norm": 0.5426783561706543, "learning_rate": 9.324700568707567e-06, "loss": 0.04888772964477539, "memory(GiB)": 21.32, "step": 6434, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.20904395283110808, "grad_norm": 1.720935583114624, "learning_rate": 9.324430960077648e-06, "loss": 0.04341460391879082, "memory(GiB)": 21.32, "step": 6435, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957129 }, { "epoch": 0.2090764382938635, "grad_norm": 0.5592260956764221, "learning_rate": 9.32416130153804e-06, "loss": 0.05160686373710632, "memory(GiB)": 21.32, "step": 6436, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.2091089237566189, "grad_norm": 0.5701900720596313, "learning_rate": 9.323891593091854e-06, "loss": 0.042805254459381104, "memory(GiB)": 21.32, "step": 6437, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.20914140921937432, "grad_norm": 0.6044749021530151, "learning_rate": 9.323621834742205e-06, "loss": 0.03393080085515976, "memory(GiB)": 21.32, "step": 6438, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.20917389468212974, "grad_norm": 0.5502933859825134, "learning_rate": 9.323352026492206e-06, "loss": 0.04433765262365341, "memory(GiB)": 21.32, "step": 6439, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.20920638014488516, "grad_norm": 0.4543640911579132, "learning_rate": 9.323082168344967e-06, "loss": 0.04372997581958771, "memory(GiB)": 21.32, "step": 6440, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.20923886560764057, "grad_norm": 0.7195932269096375, "learning_rate": 9.322812260303607e-06, "loss": 0.04393577575683594, "memory(GiB)": 21.32, "step": 6441, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.209271351070396, "grad_norm": 0.5029066801071167, "learning_rate": 9.32254230237124e-06, "loss": 0.040948670357465744, "memory(GiB)": 21.32, "step": 6442, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957299 }, { "epoch": 0.2093038365331514, "grad_norm": 0.6079750061035156, "learning_rate": 9.322272294550983e-06, "loss": 0.039363183081150055, "memory(GiB)": 21.32, "step": 6443, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.20933632199590682, "grad_norm": 0.5695775747299194, "learning_rate": 9.32200223684595e-06, "loss": 0.044356998056173325, "memory(GiB)": 21.32, "step": 6444, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.20936880745866224, "grad_norm": 0.4010311961174011, "learning_rate": 9.321732129259259e-06, "loss": 0.03666640818119049, "memory(GiB)": 21.32, "step": 6445, "token_acc": 0.992, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.20940129292141765, "grad_norm": 0.6654497981071472, "learning_rate": 9.321461971794026e-06, "loss": 0.038329191505908966, "memory(GiB)": 21.32, "step": 6446, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.957391 }, { "epoch": 0.20943377838417307, "grad_norm": 1.263267159461975, "learning_rate": 9.32119176445337e-06, "loss": 0.035559020936489105, "memory(GiB)": 21.32, "step": 6447, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.2094662638469285, "grad_norm": 0.48123809695243835, "learning_rate": 9.32092150724041e-06, "loss": 0.03808929771184921, "memory(GiB)": 21.32, "step": 6448, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.957439 }, { "epoch": 0.20949874930968393, "grad_norm": 1.1328667402267456, "learning_rate": 9.320651200158264e-06, "loss": 0.05955473333597183, "memory(GiB)": 21.32, "step": 6449, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957461 }, { "epoch": 0.20953123477243935, "grad_norm": 0.44655874371528625, "learning_rate": 9.320380843210056e-06, "loss": 0.041933152824640274, "memory(GiB)": 21.32, "step": 6450, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.20956372023519476, "grad_norm": 0.5103712677955627, "learning_rate": 9.320110436398898e-06, "loss": 0.03735063970088959, "memory(GiB)": 21.32, "step": 6451, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957517 }, { "epoch": 0.20959620569795018, "grad_norm": 0.5054192543029785, "learning_rate": 9.319839979727918e-06, "loss": 0.0378849022090435, "memory(GiB)": 21.32, "step": 6452, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.2096286911607056, "grad_norm": 0.7181800007820129, "learning_rate": 9.319569473200235e-06, "loss": 0.04674673080444336, "memory(GiB)": 21.32, "step": 6453, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.957563 }, { "epoch": 0.209661176623461, "grad_norm": 0.6300449967384338, "learning_rate": 9.319298916818969e-06, "loss": 0.045245490968227386, "memory(GiB)": 21.32, "step": 6454, "token_acc": 0.9541666666666667, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.20969366208621643, "grad_norm": 0.6664807200431824, "learning_rate": 9.319028310587246e-06, "loss": 0.03453724831342697, "memory(GiB)": 21.32, "step": 6455, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.20972614754897184, "grad_norm": 0.6499580144882202, "learning_rate": 9.318757654508186e-06, "loss": 0.05554073303937912, "memory(GiB)": 21.32, "step": 6456, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.20975863301172726, "grad_norm": 0.5956165790557861, "learning_rate": 9.318486948584916e-06, "loss": 0.04580654203891754, "memory(GiB)": 21.32, "step": 6457, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.20979111847448267, "grad_norm": 3.368190288543701, "learning_rate": 9.31821619282056e-06, "loss": 0.06584671884775162, "memory(GiB)": 21.32, "step": 6458, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.2098236039372381, "grad_norm": 0.6130002737045288, "learning_rate": 9.31794538721824e-06, "loss": 0.04655502736568451, "memory(GiB)": 21.32, "step": 6459, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.2098560893999935, "grad_norm": 0.44171079993247986, "learning_rate": 9.317674531781082e-06, "loss": 0.03439263999462128, "memory(GiB)": 21.32, "step": 6460, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.95778 }, { "epoch": 0.20988857486274892, "grad_norm": 0.4551573693752289, "learning_rate": 9.317403626512214e-06, "loss": 0.03276832401752472, "memory(GiB)": 21.32, "step": 6461, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.20992106032550434, "grad_norm": 0.5651737451553345, "learning_rate": 9.317132671414762e-06, "loss": 0.037528712302446365, "memory(GiB)": 21.32, "step": 6462, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.957829 }, { "epoch": 0.20995354578825975, "grad_norm": 0.9848909378051758, "learning_rate": 9.316861666491853e-06, "loss": 0.04840388149023056, "memory(GiB)": 21.32, "step": 6463, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.20998603125101517, "grad_norm": 2.828397035598755, "learning_rate": 9.316590611746614e-06, "loss": 0.0494396910071373, "memory(GiB)": 21.32, "step": 6464, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.2100185167137706, "grad_norm": 0.4312264323234558, "learning_rate": 9.316319507182175e-06, "loss": 0.031444184482097626, "memory(GiB)": 21.32, "step": 6465, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.957905 }, { "epoch": 0.210051002176526, "grad_norm": 0.7569388747215271, "learning_rate": 9.316048352801664e-06, "loss": 0.046448372304439545, "memory(GiB)": 21.32, "step": 6466, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.957927 }, { "epoch": 0.21008348763928142, "grad_norm": 0.49894025921821594, "learning_rate": 9.315777148608209e-06, "loss": 0.04526567459106445, "memory(GiB)": 21.32, "step": 6467, "token_acc": 0.9732824427480916, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.21011597310203683, "grad_norm": 0.4861547350883484, "learning_rate": 9.315505894604941e-06, "loss": 0.03910117596387863, "memory(GiB)": 21.32, "step": 6468, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.957974 }, { "epoch": 0.21014845856479225, "grad_norm": 0.4864198565483093, "learning_rate": 9.315234590794993e-06, "loss": 0.041967298835515976, "memory(GiB)": 21.32, "step": 6469, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.21018094402754767, "grad_norm": 0.624305009841919, "learning_rate": 9.314963237181494e-06, "loss": 0.043116554617881775, "memory(GiB)": 21.32, "step": 6470, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.21021342949030308, "grad_norm": 0.48255711793899536, "learning_rate": 9.314691833767576e-06, "loss": 0.04342499002814293, "memory(GiB)": 21.32, "step": 6471, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.2102459149530585, "grad_norm": 0.6404876708984375, "learning_rate": 9.31442038055637e-06, "loss": 0.047147236764431, "memory(GiB)": 21.32, "step": 6472, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.958069 }, { "epoch": 0.21027840041581392, "grad_norm": 0.6529620289802551, "learning_rate": 9.31414887755101e-06, "loss": 0.04402459040284157, "memory(GiB)": 21.32, "step": 6473, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.21031088587856933, "grad_norm": 0.5072553157806396, "learning_rate": 9.31387732475463e-06, "loss": 0.041288044303655624, "memory(GiB)": 21.32, "step": 6474, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.21034337134132475, "grad_norm": 0.4351748824119568, "learning_rate": 9.313605722170367e-06, "loss": 0.03844587504863739, "memory(GiB)": 21.32, "step": 6475, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.958141 }, { "epoch": 0.21037585680408016, "grad_norm": 0.5949146151542664, "learning_rate": 9.31333406980135e-06, "loss": 0.04522759094834328, "memory(GiB)": 21.32, "step": 6476, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.21040834226683558, "grad_norm": 0.7752320170402527, "learning_rate": 9.313062367650715e-06, "loss": 0.033955685794353485, "memory(GiB)": 21.32, "step": 6477, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.210440827729591, "grad_norm": 1.345375657081604, "learning_rate": 9.312790615721603e-06, "loss": 0.03878181800246239, "memory(GiB)": 21.32, "step": 6478, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.2104733131923464, "grad_norm": 0.7877755165100098, "learning_rate": 9.312518814017147e-06, "loss": 0.0451577827334404, "memory(GiB)": 21.32, "step": 6479, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.958233 }, { "epoch": 0.21050579865510186, "grad_norm": 0.3956318497657776, "learning_rate": 9.312246962540482e-06, "loss": 0.038621023297309875, "memory(GiB)": 21.32, "step": 6480, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.21053828411785727, "grad_norm": 0.310753732919693, "learning_rate": 9.31197506129475e-06, "loss": 0.027503466233611107, "memory(GiB)": 21.32, "step": 6481, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958292 }, { "epoch": 0.2105707695806127, "grad_norm": 0.5314180254936218, "learning_rate": 9.311703110283084e-06, "loss": 0.03917543217539787, "memory(GiB)": 21.32, "step": 6482, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.2106032550433681, "grad_norm": 0.6031609177589417, "learning_rate": 9.311431109508626e-06, "loss": 0.0479956790804863, "memory(GiB)": 21.32, "step": 6483, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.958356 }, { "epoch": 0.21063574050612352, "grad_norm": 0.6559855341911316, "learning_rate": 9.311159058974514e-06, "loss": 0.04375412315130234, "memory(GiB)": 21.32, "step": 6484, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958384 }, { "epoch": 0.21066822596887894, "grad_norm": 0.5487485527992249, "learning_rate": 9.31088695868389e-06, "loss": 0.03516180440783501, "memory(GiB)": 21.32, "step": 6485, "token_acc": 0.9940119760479041, "train_speed(iter/s)": 0.958415 }, { "epoch": 0.21070071143163435, "grad_norm": 0.4610353410243988, "learning_rate": 9.31061480863989e-06, "loss": 0.046975743025541306, "memory(GiB)": 21.32, "step": 6486, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.958447 }, { "epoch": 0.21073319689438977, "grad_norm": 4.036047458648682, "learning_rate": 9.310342608845659e-06, "loss": 0.05838673561811447, "memory(GiB)": 21.32, "step": 6487, "token_acc": 0.972318339100346, "train_speed(iter/s)": 0.958478 }, { "epoch": 0.21076568235714518, "grad_norm": 0.3859861195087433, "learning_rate": 9.310070359304338e-06, "loss": 0.03818418085575104, "memory(GiB)": 21.32, "step": 6488, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.2107981678199006, "grad_norm": 0.6963129043579102, "learning_rate": 9.309798060019069e-06, "loss": 0.042945608496665955, "memory(GiB)": 21.32, "step": 6489, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.958534 }, { "epoch": 0.21083065328265602, "grad_norm": 0.6044189929962158, "learning_rate": 9.309525710992991e-06, "loss": 0.04389278590679169, "memory(GiB)": 21.32, "step": 6490, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958558 }, { "epoch": 0.21086313874541143, "grad_norm": 0.6576371788978577, "learning_rate": 9.309253312229252e-06, "loss": 0.04634085297584534, "memory(GiB)": 21.32, "step": 6491, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.958584 }, { "epoch": 0.21089562420816685, "grad_norm": 0.6997806429862976, "learning_rate": 9.308980863730996e-06, "loss": 0.04385647922754288, "memory(GiB)": 21.32, "step": 6492, "token_acc": 0.9631336405529954, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.21092810967092226, "grad_norm": 0.453811377286911, "learning_rate": 9.308708365501364e-06, "loss": 0.04151798039674759, "memory(GiB)": 21.32, "step": 6493, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958634 }, { "epoch": 0.21096059513367768, "grad_norm": 0.5630860924720764, "learning_rate": 9.308435817543502e-06, "loss": 0.037977442145347595, "memory(GiB)": 21.32, "step": 6494, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.958656 }, { "epoch": 0.2109930805964331, "grad_norm": 0.38511228561401367, "learning_rate": 9.308163219860558e-06, "loss": 0.029781268909573555, "memory(GiB)": 21.32, "step": 6495, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.958682 }, { "epoch": 0.2110255660591885, "grad_norm": 0.6047680974006653, "learning_rate": 9.307890572455673e-06, "loss": 0.03512313589453697, "memory(GiB)": 21.32, "step": 6496, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958705 }, { "epoch": 0.21105805152194393, "grad_norm": 0.5547815561294556, "learning_rate": 9.307617875332e-06, "loss": 0.04015915095806122, "memory(GiB)": 21.32, "step": 6497, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95873 }, { "epoch": 0.21109053698469935, "grad_norm": 0.477536141872406, "learning_rate": 9.307345128492682e-06, "loss": 0.03841637447476387, "memory(GiB)": 21.32, "step": 6498, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.958756 }, { "epoch": 0.21112302244745476, "grad_norm": 0.4714328348636627, "learning_rate": 9.30707233194087e-06, "loss": 0.03971457481384277, "memory(GiB)": 21.32, "step": 6499, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958777 }, { "epoch": 0.21115550791021018, "grad_norm": 0.5407509207725525, "learning_rate": 9.306799485679708e-06, "loss": 0.04262978583574295, "memory(GiB)": 21.32, "step": 6500, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958795 }, { "epoch": 0.21115550791021018, "eval_loss": 0.04178513586521149, "eval_runtime": 80.0457, "eval_samples_per_second": 124.304, "eval_steps_per_second": 3.885, "eval_token_acc": 0.9838497932958881, "step": 6500 }, { "epoch": 0.2111879933729656, "grad_norm": 0.5523679256439209, "learning_rate": 9.306526589712348e-06, "loss": 0.04042523726820946, "memory(GiB)": 21.32, "step": 6501, "token_acc": 0.9837779330565717, "train_speed(iter/s)": 0.946119 }, { "epoch": 0.211220478835721, "grad_norm": 0.41965290904045105, "learning_rate": 9.306253644041941e-06, "loss": 0.03356419503688812, "memory(GiB)": 21.32, "step": 6502, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.946142 }, { "epoch": 0.21125296429847643, "grad_norm": 0.4372810423374176, "learning_rate": 9.305980648671633e-06, "loss": 0.03667932748794556, "memory(GiB)": 21.32, "step": 6503, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.946166 }, { "epoch": 0.21128544976123184, "grad_norm": 0.614568829536438, "learning_rate": 9.30570760360458e-06, "loss": 0.0515793040394783, "memory(GiB)": 21.32, "step": 6504, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.94619 }, { "epoch": 0.21131793522398726, "grad_norm": 0.6227648258209229, "learning_rate": 9.305434508843927e-06, "loss": 0.03408513963222504, "memory(GiB)": 21.32, "step": 6505, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.946213 }, { "epoch": 0.21135042068674267, "grad_norm": 0.5654377341270447, "learning_rate": 9.305161364392832e-06, "loss": 0.04585355520248413, "memory(GiB)": 21.32, "step": 6506, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.946236 }, { "epoch": 0.2113829061494981, "grad_norm": 0.6420932412147522, "learning_rate": 9.304888170254442e-06, "loss": 0.035883791744709015, "memory(GiB)": 21.32, "step": 6507, "token_acc": 1.0, "train_speed(iter/s)": 0.946257 }, { "epoch": 0.2114153916122535, "grad_norm": 0.570214033126831, "learning_rate": 9.304614926431916e-06, "loss": 0.040317654609680176, "memory(GiB)": 21.32, "step": 6508, "token_acc": 0.99, "train_speed(iter/s)": 0.946282 }, { "epoch": 0.21144787707500892, "grad_norm": 0.6058654189109802, "learning_rate": 9.304341632928403e-06, "loss": 0.045703254640102386, "memory(GiB)": 21.32, "step": 6509, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.946307 }, { "epoch": 0.21148036253776434, "grad_norm": 0.5339811444282532, "learning_rate": 9.304068289747058e-06, "loss": 0.03719954192638397, "memory(GiB)": 21.32, "step": 6510, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.946332 }, { "epoch": 0.21151284800051975, "grad_norm": 0.7927766442298889, "learning_rate": 9.303794896891035e-06, "loss": 0.047429159283638, "memory(GiB)": 21.32, "step": 6511, "token_acc": 0.98, "train_speed(iter/s)": 0.946359 }, { "epoch": 0.2115453334632752, "grad_norm": 0.6250688433647156, "learning_rate": 9.303521454363492e-06, "loss": 0.05112138390541077, "memory(GiB)": 21.32, "step": 6512, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.946352 }, { "epoch": 0.21157781892603061, "grad_norm": 0.4869990348815918, "learning_rate": 9.303247962167582e-06, "loss": 0.03407426178455353, "memory(GiB)": 21.32, "step": 6513, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.946378 }, { "epoch": 0.21161030438878603, "grad_norm": 1.546460509300232, "learning_rate": 9.302974420306465e-06, "loss": 0.03490807116031647, "memory(GiB)": 21.32, "step": 6514, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.946402 }, { "epoch": 0.21164278985154145, "grad_norm": 0.8566735982894897, "learning_rate": 9.302700828783297e-06, "loss": 0.04102851822972298, "memory(GiB)": 21.32, "step": 6515, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.946424 }, { "epoch": 0.21167527531429686, "grad_norm": 0.4773387908935547, "learning_rate": 9.302427187601233e-06, "loss": 0.03552645444869995, "memory(GiB)": 21.32, "step": 6516, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.946447 }, { "epoch": 0.21170776077705228, "grad_norm": 0.46431612968444824, "learning_rate": 9.302153496763431e-06, "loss": 0.035348184406757355, "memory(GiB)": 21.32, "step": 6517, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.946473 }, { "epoch": 0.2117402462398077, "grad_norm": 0.4569014012813568, "learning_rate": 9.301879756273053e-06, "loss": 0.04314135015010834, "memory(GiB)": 21.32, "step": 6518, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.946498 }, { "epoch": 0.2117727317025631, "grad_norm": 0.5988984107971191, "learning_rate": 9.301605966133257e-06, "loss": 0.04606587067246437, "memory(GiB)": 21.32, "step": 6519, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.946525 }, { "epoch": 0.21180521716531853, "grad_norm": 0.44125574827194214, "learning_rate": 9.301332126347203e-06, "loss": 0.043071210384368896, "memory(GiB)": 21.32, "step": 6520, "token_acc": 0.988, "train_speed(iter/s)": 0.946555 }, { "epoch": 0.21183770262807394, "grad_norm": 0.34688600897789, "learning_rate": 9.301058236918051e-06, "loss": 0.03261294960975647, "memory(GiB)": 21.32, "step": 6521, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.946587 }, { "epoch": 0.21187018809082936, "grad_norm": 0.8936774134635925, "learning_rate": 9.300784297848962e-06, "loss": 0.04232548177242279, "memory(GiB)": 21.32, "step": 6522, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.946619 }, { "epoch": 0.21190267355358477, "grad_norm": 0.5508658289909363, "learning_rate": 9.300510309143098e-06, "loss": 0.044659413397312164, "memory(GiB)": 21.32, "step": 6523, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.946652 }, { "epoch": 0.2119351590163402, "grad_norm": 0.4497024416923523, "learning_rate": 9.300236270803619e-06, "loss": 0.03437959775328636, "memory(GiB)": 21.32, "step": 6524, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.946684 }, { "epoch": 0.2119676444790956, "grad_norm": 0.5577453374862671, "learning_rate": 9.299962182833693e-06, "loss": 0.03526609018445015, "memory(GiB)": 21.32, "step": 6525, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.946717 }, { "epoch": 0.21200012994185102, "grad_norm": 0.8197460174560547, "learning_rate": 9.29968804523648e-06, "loss": 0.041554197669029236, "memory(GiB)": 21.32, "step": 6526, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.946747 }, { "epoch": 0.21203261540460644, "grad_norm": 0.539230465888977, "learning_rate": 9.299413858015141e-06, "loss": 0.04461897537112236, "memory(GiB)": 21.32, "step": 6527, "token_acc": 0.9875, "train_speed(iter/s)": 0.94678 }, { "epoch": 0.21206510086736186, "grad_norm": 0.6286599636077881, "learning_rate": 9.299139621172846e-06, "loss": 0.035549409687519073, "memory(GiB)": 21.32, "step": 6528, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.946812 }, { "epoch": 0.21209758633011727, "grad_norm": 6.561964511871338, "learning_rate": 9.298865334712758e-06, "loss": 0.04574614018201828, "memory(GiB)": 21.32, "step": 6529, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.946846 }, { "epoch": 0.2121300717928727, "grad_norm": 0.7980237603187561, "learning_rate": 9.298590998638042e-06, "loss": 0.06082356721162796, "memory(GiB)": 21.32, "step": 6530, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.946877 }, { "epoch": 0.2121625572556281, "grad_norm": 0.5302019715309143, "learning_rate": 9.298316612951863e-06, "loss": 0.03545337915420532, "memory(GiB)": 21.32, "step": 6531, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.94691 }, { "epoch": 0.21219504271838352, "grad_norm": 0.5298119187355042, "learning_rate": 9.29804217765739e-06, "loss": 0.04049040749669075, "memory(GiB)": 21.32, "step": 6532, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.946942 }, { "epoch": 0.21222752818113894, "grad_norm": 0.6258095502853394, "learning_rate": 9.29776769275779e-06, "loss": 0.036768827587366104, "memory(GiB)": 21.32, "step": 6533, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.946976 }, { "epoch": 0.21226001364389435, "grad_norm": 0.616729736328125, "learning_rate": 9.29749315825623e-06, "loss": 0.03620663657784462, "memory(GiB)": 21.32, "step": 6534, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.947009 }, { "epoch": 0.21229249910664977, "grad_norm": 0.6069076657295227, "learning_rate": 9.29721857415588e-06, "loss": 0.03779591992497444, "memory(GiB)": 21.32, "step": 6535, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.94704 }, { "epoch": 0.21232498456940518, "grad_norm": 0.778045654296875, "learning_rate": 9.296943940459907e-06, "loss": 0.03980812430381775, "memory(GiB)": 21.32, "step": 6536, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.947071 }, { "epoch": 0.2123574700321606, "grad_norm": 0.45752254128456116, "learning_rate": 9.296669257171482e-06, "loss": 0.03464774787425995, "memory(GiB)": 21.32, "step": 6537, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.947101 }, { "epoch": 0.21238995549491602, "grad_norm": 0.49520033597946167, "learning_rate": 9.296394524293775e-06, "loss": 0.031003978103399277, "memory(GiB)": 21.32, "step": 6538, "token_acc": 0.9781021897810219, "train_speed(iter/s)": 0.947133 }, { "epoch": 0.21242244095767143, "grad_norm": 0.6349917650222778, "learning_rate": 9.296119741829957e-06, "loss": 0.04079292714595795, "memory(GiB)": 21.32, "step": 6539, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.947165 }, { "epoch": 0.21245492642042685, "grad_norm": 0.6965273022651672, "learning_rate": 9.295844909783198e-06, "loss": 0.04914907366037369, "memory(GiB)": 21.32, "step": 6540, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.947198 }, { "epoch": 0.21248741188318226, "grad_norm": 0.566968560218811, "learning_rate": 9.295570028156674e-06, "loss": 0.04023038595914841, "memory(GiB)": 21.32, "step": 6541, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.94723 }, { "epoch": 0.21251989734593768, "grad_norm": 0.602830171585083, "learning_rate": 9.29529509695355e-06, "loss": 0.04367901384830475, "memory(GiB)": 21.32, "step": 6542, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.947263 }, { "epoch": 0.2125523828086931, "grad_norm": 0.7213795185089111, "learning_rate": 9.295020116177006e-06, "loss": 0.04931378364562988, "memory(GiB)": 21.32, "step": 6543, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.947296 }, { "epoch": 0.21258486827144854, "grad_norm": 0.988652765750885, "learning_rate": 9.294745085830214e-06, "loss": 0.04272625595331192, "memory(GiB)": 21.32, "step": 6544, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.947327 }, { "epoch": 0.21261735373420396, "grad_norm": 0.7887037396430969, "learning_rate": 9.294470005916346e-06, "loss": 0.046301718801259995, "memory(GiB)": 21.32, "step": 6545, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.94736 }, { "epoch": 0.21264983919695937, "grad_norm": 0.4039277136325836, "learning_rate": 9.294194876438579e-06, "loss": 0.035520877689123154, "memory(GiB)": 21.32, "step": 6546, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.947393 }, { "epoch": 0.2126823246597148, "grad_norm": 0.8688421249389648, "learning_rate": 9.293919697400087e-06, "loss": 0.06449158489704132, "memory(GiB)": 21.32, "step": 6547, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.947425 }, { "epoch": 0.2127148101224702, "grad_norm": 0.7866066694259644, "learning_rate": 9.293644468804046e-06, "loss": 0.04885552451014519, "memory(GiB)": 21.32, "step": 6548, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.947452 }, { "epoch": 0.21274729558522562, "grad_norm": 0.607852041721344, "learning_rate": 9.293369190653633e-06, "loss": 0.03854681923985481, "memory(GiB)": 21.32, "step": 6549, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.947476 }, { "epoch": 0.21277978104798104, "grad_norm": 0.5538027882575989, "learning_rate": 9.293093862952027e-06, "loss": 0.04133490100502968, "memory(GiB)": 21.32, "step": 6550, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.947503 }, { "epoch": 0.21281226651073645, "grad_norm": 0.5153956413269043, "learning_rate": 9.292818485702401e-06, "loss": 0.03790300339460373, "memory(GiB)": 21.32, "step": 6551, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.947531 }, { "epoch": 0.21284475197349187, "grad_norm": 0.34306055307388306, "learning_rate": 9.292543058907939e-06, "loss": 0.031820159405469894, "memory(GiB)": 21.32, "step": 6552, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.947556 }, { "epoch": 0.21287723743624729, "grad_norm": 0.3630790412425995, "learning_rate": 9.292267582571814e-06, "loss": 0.03998483717441559, "memory(GiB)": 21.32, "step": 6553, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.947582 }, { "epoch": 0.2129097228990027, "grad_norm": 0.5324056148529053, "learning_rate": 9.29199205669721e-06, "loss": 0.03269678354263306, "memory(GiB)": 21.32, "step": 6554, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.947609 }, { "epoch": 0.21294220836175812, "grad_norm": 0.5575662851333618, "learning_rate": 9.291716481287304e-06, "loss": 0.03419968858361244, "memory(GiB)": 21.32, "step": 6555, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.947634 }, { "epoch": 0.21297469382451353, "grad_norm": 0.5891647934913635, "learning_rate": 9.291440856345277e-06, "loss": 0.04872273653745651, "memory(GiB)": 21.32, "step": 6556, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.947657 }, { "epoch": 0.21300717928726895, "grad_norm": 0.5386520028114319, "learning_rate": 9.291165181874312e-06, "loss": 0.03532428294420242, "memory(GiB)": 21.32, "step": 6557, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.947684 }, { "epoch": 0.21303966475002437, "grad_norm": 0.35012030601501465, "learning_rate": 9.29088945787759e-06, "loss": 0.03189387917518616, "memory(GiB)": 21.32, "step": 6558, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.947711 }, { "epoch": 0.21307215021277978, "grad_norm": 0.5798262357711792, "learning_rate": 9.290613684358289e-06, "loss": 0.04187886416912079, "memory(GiB)": 21.32, "step": 6559, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.947738 }, { "epoch": 0.2131046356755352, "grad_norm": 2.599851369857788, "learning_rate": 9.290337861319599e-06, "loss": 0.050419025123119354, "memory(GiB)": 21.32, "step": 6560, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.947762 }, { "epoch": 0.21313712113829061, "grad_norm": 1.1324330568313599, "learning_rate": 9.290061988764698e-06, "loss": 0.056512005627155304, "memory(GiB)": 21.32, "step": 6561, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.947786 }, { "epoch": 0.21316960660104603, "grad_norm": 0.7361568808555603, "learning_rate": 9.28978606669677e-06, "loss": 0.04156485199928284, "memory(GiB)": 21.32, "step": 6562, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.947808 }, { "epoch": 0.21320209206380145, "grad_norm": 0.6040014028549194, "learning_rate": 9.289510095119004e-06, "loss": 0.05158821865916252, "memory(GiB)": 21.32, "step": 6563, "token_acc": 0.984, "train_speed(iter/s)": 0.947831 }, { "epoch": 0.21323457752655686, "grad_norm": 0.40955474972724915, "learning_rate": 9.289234074034581e-06, "loss": 0.028545545414090157, "memory(GiB)": 21.32, "step": 6564, "token_acc": 1.0, "train_speed(iter/s)": 0.947855 }, { "epoch": 0.21326706298931228, "grad_norm": 0.6147710084915161, "learning_rate": 9.28895800344669e-06, "loss": 0.04106727987527847, "memory(GiB)": 21.32, "step": 6565, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.947878 }, { "epoch": 0.2132995484520677, "grad_norm": 0.493684321641922, "learning_rate": 9.288681883358512e-06, "loss": 0.03738943487405777, "memory(GiB)": 21.32, "step": 6566, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.947901 }, { "epoch": 0.2133320339148231, "grad_norm": 0.6569750308990479, "learning_rate": 9.288405713773236e-06, "loss": 0.035590287297964096, "memory(GiB)": 21.32, "step": 6567, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.947923 }, { "epoch": 0.21336451937757853, "grad_norm": 0.4353874921798706, "learning_rate": 9.288129494694054e-06, "loss": 0.033620256930589676, "memory(GiB)": 21.32, "step": 6568, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.947945 }, { "epoch": 0.21339700484033394, "grad_norm": 0.5498348474502563, "learning_rate": 9.287853226124147e-06, "loss": 0.04709043353796005, "memory(GiB)": 21.32, "step": 6569, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.947969 }, { "epoch": 0.21342949030308936, "grad_norm": 0.6007739305496216, "learning_rate": 9.287576908066708e-06, "loss": 0.03936169669032097, "memory(GiB)": 21.32, "step": 6570, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.947991 }, { "epoch": 0.21346197576584477, "grad_norm": 0.56387859582901, "learning_rate": 9.287300540524924e-06, "loss": 0.036441851407289505, "memory(GiB)": 21.32, "step": 6571, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.948014 }, { "epoch": 0.2134944612286002, "grad_norm": 0.45690909028053284, "learning_rate": 9.287024123501986e-06, "loss": 0.03302496299147606, "memory(GiB)": 21.32, "step": 6572, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.948039 }, { "epoch": 0.2135269466913556, "grad_norm": 0.639450192451477, "learning_rate": 9.286747657001082e-06, "loss": 0.04060225933790207, "memory(GiB)": 21.32, "step": 6573, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.948061 }, { "epoch": 0.21355943215411102, "grad_norm": 0.5840580463409424, "learning_rate": 9.286471141025408e-06, "loss": 0.034091707319021225, "memory(GiB)": 21.32, "step": 6574, "token_acc": 0.9746192893401016, "train_speed(iter/s)": 0.948089 }, { "epoch": 0.21359191761686644, "grad_norm": 0.6026338338851929, "learning_rate": 9.286194575578149e-06, "loss": 0.04085644334554672, "memory(GiB)": 21.32, "step": 6575, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.948115 }, { "epoch": 0.21362440307962188, "grad_norm": 1.3729689121246338, "learning_rate": 9.2859179606625e-06, "loss": 0.04218316823244095, "memory(GiB)": 21.32, "step": 6576, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.948139 }, { "epoch": 0.2136568885423773, "grad_norm": 0.583469569683075, "learning_rate": 9.285641296281654e-06, "loss": 0.051493607461452484, "memory(GiB)": 21.32, "step": 6577, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.948165 }, { "epoch": 0.21368937400513272, "grad_norm": 0.6559811234474182, "learning_rate": 9.285364582438803e-06, "loss": 0.03735524043440819, "memory(GiB)": 21.32, "step": 6578, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.948186 }, { "epoch": 0.21372185946788813, "grad_norm": 0.6716380715370178, "learning_rate": 9.28508781913714e-06, "loss": 0.04613664001226425, "memory(GiB)": 21.32, "step": 6579, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.948209 }, { "epoch": 0.21375434493064355, "grad_norm": 0.7147231101989746, "learning_rate": 9.284811006379862e-06, "loss": 0.04242943227291107, "memory(GiB)": 21.32, "step": 6580, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.948235 }, { "epoch": 0.21378683039339896, "grad_norm": 0.45058268308639526, "learning_rate": 9.28453414417016e-06, "loss": 0.03318433836102486, "memory(GiB)": 21.32, "step": 6581, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.948258 }, { "epoch": 0.21381931585615438, "grad_norm": 0.4515543282032013, "learning_rate": 9.284257232511231e-06, "loss": 0.03510824218392372, "memory(GiB)": 21.32, "step": 6582, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.948284 }, { "epoch": 0.2138518013189098, "grad_norm": 0.7149311304092407, "learning_rate": 9.283980271406271e-06, "loss": 0.05058936029672623, "memory(GiB)": 21.32, "step": 6583, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.948308 }, { "epoch": 0.2138842867816652, "grad_norm": 0.5866330862045288, "learning_rate": 9.283703260858479e-06, "loss": 0.04247455298900604, "memory(GiB)": 21.32, "step": 6584, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.948333 }, { "epoch": 0.21391677224442063, "grad_norm": 0.36828508973121643, "learning_rate": 9.283426200871049e-06, "loss": 0.03438066691160202, "memory(GiB)": 21.32, "step": 6585, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.948366 }, { "epoch": 0.21394925770717604, "grad_norm": 0.4110386371612549, "learning_rate": 9.28314909144718e-06, "loss": 0.032171450555324554, "memory(GiB)": 21.32, "step": 6586, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.948399 }, { "epoch": 0.21398174316993146, "grad_norm": 0.49013012647628784, "learning_rate": 9.282871932590068e-06, "loss": 0.03806686028838158, "memory(GiB)": 21.32, "step": 6587, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.948429 }, { "epoch": 0.21401422863268688, "grad_norm": 0.5984716415405273, "learning_rate": 9.282594724302913e-06, "loss": 0.04357558488845825, "memory(GiB)": 21.32, "step": 6588, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.948462 }, { "epoch": 0.2140467140954423, "grad_norm": 0.6179372668266296, "learning_rate": 9.282317466588916e-06, "loss": 0.04412771388888359, "memory(GiB)": 21.32, "step": 6589, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.948494 }, { "epoch": 0.2140791995581977, "grad_norm": 0.5526688098907471, "learning_rate": 9.282040159451277e-06, "loss": 0.04385238140821457, "memory(GiB)": 21.32, "step": 6590, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.948525 }, { "epoch": 0.21411168502095312, "grad_norm": 0.3677193522453308, "learning_rate": 9.281762802893192e-06, "loss": 0.033541593700647354, "memory(GiB)": 21.32, "step": 6591, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.948555 }, { "epoch": 0.21414417048370854, "grad_norm": 0.4951185882091522, "learning_rate": 9.281485396917869e-06, "loss": 0.039249297231435776, "memory(GiB)": 21.32, "step": 6592, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.948585 }, { "epoch": 0.21417665594646396, "grad_norm": 0.7793512344360352, "learning_rate": 9.281207941528503e-06, "loss": 0.05549512431025505, "memory(GiB)": 21.32, "step": 6593, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.948617 }, { "epoch": 0.21420914140921937, "grad_norm": 0.7610906362533569, "learning_rate": 9.280930436728299e-06, "loss": 0.0512646846473217, "memory(GiB)": 21.32, "step": 6594, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.948649 }, { "epoch": 0.2142416268719748, "grad_norm": 0.7846055626869202, "learning_rate": 9.280652882520463e-06, "loss": 0.05215269327163696, "memory(GiB)": 21.32, "step": 6595, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.948679 }, { "epoch": 0.2142741123347302, "grad_norm": 0.6410812735557556, "learning_rate": 9.280375278908192e-06, "loss": 0.04909912124276161, "memory(GiB)": 21.32, "step": 6596, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.948711 }, { "epoch": 0.21430659779748562, "grad_norm": 0.506256103515625, "learning_rate": 9.280097625894693e-06, "loss": 0.03996367007493973, "memory(GiB)": 21.32, "step": 6597, "token_acc": 0.9638009049773756, "train_speed(iter/s)": 0.948743 }, { "epoch": 0.21433908326024104, "grad_norm": 0.6552006602287292, "learning_rate": 9.279819923483172e-06, "loss": 0.03854607790708542, "memory(GiB)": 21.32, "step": 6598, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.948774 }, { "epoch": 0.21437156872299645, "grad_norm": 0.6691238284111023, "learning_rate": 9.279542171676832e-06, "loss": 0.044327475130558014, "memory(GiB)": 21.32, "step": 6599, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.948807 }, { "epoch": 0.21440405418575187, "grad_norm": 0.6153085827827454, "learning_rate": 9.27926437047888e-06, "loss": 0.039677731692790985, "memory(GiB)": 21.32, "step": 6600, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.948839 }, { "epoch": 0.21443653964850728, "grad_norm": 0.540547251701355, "learning_rate": 9.278986519892522e-06, "loss": 0.03997992351651192, "memory(GiB)": 21.32, "step": 6601, "token_acc": 0.984375, "train_speed(iter/s)": 0.948871 }, { "epoch": 0.2144690251112627, "grad_norm": 0.5164638161659241, "learning_rate": 9.278708619920963e-06, "loss": 0.0403282456099987, "memory(GiB)": 21.32, "step": 6602, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.948903 }, { "epoch": 0.21450151057401812, "grad_norm": 0.6191339492797852, "learning_rate": 9.278430670567412e-06, "loss": 0.04411441832780838, "memory(GiB)": 21.32, "step": 6603, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.948933 }, { "epoch": 0.21453399603677353, "grad_norm": 0.6130065321922302, "learning_rate": 9.278152671835077e-06, "loss": 0.04067118465900421, "memory(GiB)": 21.32, "step": 6604, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.948966 }, { "epoch": 0.21456648149952895, "grad_norm": 0.5702325701713562, "learning_rate": 9.277874623727165e-06, "loss": 0.04308575391769409, "memory(GiB)": 21.32, "step": 6605, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.948998 }, { "epoch": 0.21459896696228437, "grad_norm": 0.7641838788986206, "learning_rate": 9.277596526246886e-06, "loss": 0.045761093497276306, "memory(GiB)": 21.32, "step": 6606, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.94903 }, { "epoch": 0.21463145242503978, "grad_norm": 0.8878281712532043, "learning_rate": 9.277318379397449e-06, "loss": 0.059408072382211685, "memory(GiB)": 21.32, "step": 6607, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.949061 }, { "epoch": 0.21466393788779523, "grad_norm": 0.5513394474983215, "learning_rate": 9.277040183182066e-06, "loss": 0.03299528732895851, "memory(GiB)": 21.32, "step": 6608, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.949086 }, { "epoch": 0.21469642335055064, "grad_norm": 0.750438392162323, "learning_rate": 9.276761937603947e-06, "loss": 0.03471033275127411, "memory(GiB)": 21.32, "step": 6609, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.949109 }, { "epoch": 0.21472890881330606, "grad_norm": 0.7596109509468079, "learning_rate": 9.276483642666303e-06, "loss": 0.04606902599334717, "memory(GiB)": 21.32, "step": 6610, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.949134 }, { "epoch": 0.21476139427606147, "grad_norm": 0.6044157147407532, "learning_rate": 9.276205298372344e-06, "loss": 0.04199330136179924, "memory(GiB)": 21.32, "step": 6611, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.949161 }, { "epoch": 0.2147938797388169, "grad_norm": 0.625157356262207, "learning_rate": 9.275926904725286e-06, "loss": 0.036043088883161545, "memory(GiB)": 21.32, "step": 6612, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.949185 }, { "epoch": 0.2148263652015723, "grad_norm": 0.727195680141449, "learning_rate": 9.27564846172834e-06, "loss": 0.04658690467476845, "memory(GiB)": 21.32, "step": 6613, "token_acc": 0.97265625, "train_speed(iter/s)": 0.949208 }, { "epoch": 0.21485885066432772, "grad_norm": 0.5052375793457031, "learning_rate": 9.275369969384718e-06, "loss": 0.0416138581931591, "memory(GiB)": 21.32, "step": 6614, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.949233 }, { "epoch": 0.21489133612708314, "grad_norm": 1.6513888835906982, "learning_rate": 9.275091427697637e-06, "loss": 0.050859928131103516, "memory(GiB)": 21.32, "step": 6615, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.949258 }, { "epoch": 0.21492382158983855, "grad_norm": 0.672474205493927, "learning_rate": 9.274812836670313e-06, "loss": 0.04687957838177681, "memory(GiB)": 21.32, "step": 6616, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.949282 }, { "epoch": 0.21495630705259397, "grad_norm": 0.46751028299331665, "learning_rate": 9.274534196305956e-06, "loss": 0.03835656866431236, "memory(GiB)": 21.32, "step": 6617, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.949306 }, { "epoch": 0.2149887925153494, "grad_norm": 0.5163957476615906, "learning_rate": 9.274255506607787e-06, "loss": 0.04632934555411339, "memory(GiB)": 21.32, "step": 6618, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.949331 }, { "epoch": 0.2150212779781048, "grad_norm": 0.6983668208122253, "learning_rate": 9.27397676757902e-06, "loss": 0.04439745098352432, "memory(GiB)": 21.32, "step": 6619, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.949355 }, { "epoch": 0.21505376344086022, "grad_norm": 0.6241552233695984, "learning_rate": 9.273697979222872e-06, "loss": 0.045541513711214066, "memory(GiB)": 21.32, "step": 6620, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.949379 }, { "epoch": 0.21508624890361563, "grad_norm": 0.45883503556251526, "learning_rate": 9.273419141542562e-06, "loss": 0.05516570061445236, "memory(GiB)": 21.32, "step": 6621, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.949402 }, { "epoch": 0.21511873436637105, "grad_norm": 0.4947258234024048, "learning_rate": 9.273140254541306e-06, "loss": 0.049097515642642975, "memory(GiB)": 21.32, "step": 6622, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.949427 }, { "epoch": 0.21515121982912647, "grad_norm": 0.47587287425994873, "learning_rate": 9.272861318222324e-06, "loss": 0.03829776123166084, "memory(GiB)": 21.32, "step": 6623, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.949449 }, { "epoch": 0.21518370529188188, "grad_norm": 0.4435332119464874, "learning_rate": 9.272582332588836e-06, "loss": 0.04084444046020508, "memory(GiB)": 21.32, "step": 6624, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.949471 }, { "epoch": 0.2152161907546373, "grad_norm": 0.3908199071884155, "learning_rate": 9.27230329764406e-06, "loss": 0.037505410611629486, "memory(GiB)": 21.32, "step": 6625, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.949493 }, { "epoch": 0.21524867621739271, "grad_norm": 0.5482227802276611, "learning_rate": 9.272024213391217e-06, "loss": 0.03815946727991104, "memory(GiB)": 21.32, "step": 6626, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.949515 }, { "epoch": 0.21528116168014813, "grad_norm": 0.5132521986961365, "learning_rate": 9.27174507983353e-06, "loss": 0.040021494030952454, "memory(GiB)": 21.32, "step": 6627, "token_acc": 0.99, "train_speed(iter/s)": 0.949537 }, { "epoch": 0.21531364714290355, "grad_norm": 0.36651888489723206, "learning_rate": 9.271465896974218e-06, "loss": 0.028724215924739838, "memory(GiB)": 21.32, "step": 6628, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.949562 }, { "epoch": 0.21534613260565896, "grad_norm": 0.7890663743019104, "learning_rate": 9.271186664816508e-06, "loss": 0.041197605431079865, "memory(GiB)": 21.32, "step": 6629, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.949587 }, { "epoch": 0.21537861806841438, "grad_norm": 0.658133327960968, "learning_rate": 9.270907383363614e-06, "loss": 0.04322308674454689, "memory(GiB)": 21.32, "step": 6630, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.949611 }, { "epoch": 0.2154111035311698, "grad_norm": 0.6116445064544678, "learning_rate": 9.270628052618766e-06, "loss": 0.03859646990895271, "memory(GiB)": 21.32, "step": 6631, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.949636 }, { "epoch": 0.2154435889939252, "grad_norm": 0.7978073954582214, "learning_rate": 9.270348672585188e-06, "loss": 0.04054870828986168, "memory(GiB)": 21.32, "step": 6632, "token_acc": 0.995, "train_speed(iter/s)": 0.94966 }, { "epoch": 0.21547607445668063, "grad_norm": 0.45551249384880066, "learning_rate": 9.2700692432661e-06, "loss": 0.035023417323827744, "memory(GiB)": 21.32, "step": 6633, "token_acc": 0.984375, "train_speed(iter/s)": 0.949685 }, { "epoch": 0.21550855991943604, "grad_norm": 0.49076974391937256, "learning_rate": 9.26978976466473e-06, "loss": 0.049445152282714844, "memory(GiB)": 21.32, "step": 6634, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.949707 }, { "epoch": 0.21554104538219146, "grad_norm": 0.41718751192092896, "learning_rate": 9.269510236784304e-06, "loss": 0.0340605229139328, "memory(GiB)": 21.32, "step": 6635, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.949731 }, { "epoch": 0.21557353084494688, "grad_norm": 0.8372877836227417, "learning_rate": 9.269230659628046e-06, "loss": 0.03804274648427963, "memory(GiB)": 21.32, "step": 6636, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.949755 }, { "epoch": 0.2156060163077023, "grad_norm": 0.5211980938911438, "learning_rate": 9.268951033199185e-06, "loss": 0.039762627333402634, "memory(GiB)": 21.32, "step": 6637, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.94978 }, { "epoch": 0.2156385017704577, "grad_norm": 0.719308078289032, "learning_rate": 9.268671357500947e-06, "loss": 0.044170837849378586, "memory(GiB)": 21.32, "step": 6638, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.949805 }, { "epoch": 0.21567098723321312, "grad_norm": 0.7767859101295471, "learning_rate": 9.268391632536559e-06, "loss": 0.03995078429579735, "memory(GiB)": 21.32, "step": 6639, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.949831 }, { "epoch": 0.21570347269596857, "grad_norm": 0.4162515103816986, "learning_rate": 9.26811185830925e-06, "loss": 0.03671058639883995, "memory(GiB)": 21.32, "step": 6640, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.949855 }, { "epoch": 0.21573595815872398, "grad_norm": 0.9149301052093506, "learning_rate": 9.26783203482225e-06, "loss": 0.04869336634874344, "memory(GiB)": 21.32, "step": 6641, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.949879 }, { "epoch": 0.2157684436214794, "grad_norm": 0.5963478088378906, "learning_rate": 9.267552162078788e-06, "loss": 0.04277946799993515, "memory(GiB)": 21.32, "step": 6642, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.949904 }, { "epoch": 0.21580092908423482, "grad_norm": 0.4630486071109772, "learning_rate": 9.267272240082094e-06, "loss": 0.034260399639606476, "memory(GiB)": 21.32, "step": 6643, "token_acc": 0.9818840579710145, "train_speed(iter/s)": 0.949921 }, { "epoch": 0.21583341454699023, "grad_norm": 0.33862051367759705, "learning_rate": 9.266992268835399e-06, "loss": 0.030502913519740105, "memory(GiB)": 21.32, "step": 6644, "token_acc": 1.0, "train_speed(iter/s)": 0.949945 }, { "epoch": 0.21586590000974565, "grad_norm": 0.5629262924194336, "learning_rate": 9.266712248341934e-06, "loss": 0.04294624179601669, "memory(GiB)": 21.32, "step": 6645, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.94997 }, { "epoch": 0.21589838547250106, "grad_norm": 0.4357757270336151, "learning_rate": 9.26643217860493e-06, "loss": 0.037938736379146576, "memory(GiB)": 21.32, "step": 6646, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.949994 }, { "epoch": 0.21593087093525648, "grad_norm": 0.8936022520065308, "learning_rate": 9.26615205962762e-06, "loss": 0.04041299968957901, "memory(GiB)": 21.32, "step": 6647, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.95002 }, { "epoch": 0.2159633563980119, "grad_norm": 0.5288628339767456, "learning_rate": 9.265871891413236e-06, "loss": 0.03457134962081909, "memory(GiB)": 21.32, "step": 6648, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.950049 }, { "epoch": 0.2159958418607673, "grad_norm": 0.5121409296989441, "learning_rate": 9.265591673965012e-06, "loss": 0.03332475572824478, "memory(GiB)": 21.32, "step": 6649, "token_acc": 0.9819277108433735, "train_speed(iter/s)": 0.95008 }, { "epoch": 0.21602832732352273, "grad_norm": 0.8432798981666565, "learning_rate": 9.265311407286187e-06, "loss": 0.046739742159843445, "memory(GiB)": 21.32, "step": 6650, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.950112 }, { "epoch": 0.21606081278627814, "grad_norm": 0.6155910491943359, "learning_rate": 9.265031091379988e-06, "loss": 0.04303000122308731, "memory(GiB)": 21.32, "step": 6651, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.950143 }, { "epoch": 0.21609329824903356, "grad_norm": 0.7237335443496704, "learning_rate": 9.264750726249654e-06, "loss": 0.03623189032077789, "memory(GiB)": 21.32, "step": 6652, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.950174 }, { "epoch": 0.21612578371178898, "grad_norm": 0.6611757874488831, "learning_rate": 9.26447031189842e-06, "loss": 0.0403386615216732, "memory(GiB)": 21.32, "step": 6653, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.950205 }, { "epoch": 0.2161582691745444, "grad_norm": 0.6227015256881714, "learning_rate": 9.264189848329523e-06, "loss": 0.049618832767009735, "memory(GiB)": 21.32, "step": 6654, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.950237 }, { "epoch": 0.2161907546372998, "grad_norm": 0.4378766417503357, "learning_rate": 9.263909335546199e-06, "loss": 0.027917319908738136, "memory(GiB)": 21.32, "step": 6655, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.950267 }, { "epoch": 0.21622324010005523, "grad_norm": 0.4191419184207916, "learning_rate": 9.263628773551684e-06, "loss": 0.034032244235277176, "memory(GiB)": 21.32, "step": 6656, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.950298 }, { "epoch": 0.21625572556281064, "grad_norm": 0.5721901059150696, "learning_rate": 9.263348162349222e-06, "loss": 0.044669412076473236, "memory(GiB)": 21.32, "step": 6657, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.950329 }, { "epoch": 0.21628821102556606, "grad_norm": 0.6206333637237549, "learning_rate": 9.263067501942046e-06, "loss": 0.04697020351886749, "memory(GiB)": 21.32, "step": 6658, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.950359 }, { "epoch": 0.21632069648832147, "grad_norm": 0.548205554485321, "learning_rate": 9.262786792333396e-06, "loss": 0.04545481130480766, "memory(GiB)": 21.32, "step": 6659, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.950388 }, { "epoch": 0.2163531819510769, "grad_norm": 0.4826980233192444, "learning_rate": 9.262506033526516e-06, "loss": 0.03792412951588631, "memory(GiB)": 21.32, "step": 6660, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.950415 }, { "epoch": 0.2163856674138323, "grad_norm": 0.48058804869651794, "learning_rate": 9.262225225524639e-06, "loss": 0.04020250216126442, "memory(GiB)": 21.32, "step": 6661, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.950444 }, { "epoch": 0.21641815287658772, "grad_norm": 2.066664218902588, "learning_rate": 9.261944368331012e-06, "loss": 0.04343220591545105, "memory(GiB)": 21.32, "step": 6662, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.950472 }, { "epoch": 0.21645063833934314, "grad_norm": 0.9961550235748291, "learning_rate": 9.261663461948873e-06, "loss": 0.040397804230451584, "memory(GiB)": 21.32, "step": 6663, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.950501 }, { "epoch": 0.21648312380209855, "grad_norm": 1.128719687461853, "learning_rate": 9.261382506381466e-06, "loss": 0.04868665710091591, "memory(GiB)": 21.32, "step": 6664, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.95053 }, { "epoch": 0.21651560926485397, "grad_norm": 0.4553324282169342, "learning_rate": 9.261101501632034e-06, "loss": 0.048384662717580795, "memory(GiB)": 21.32, "step": 6665, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.950558 }, { "epoch": 0.21654809472760939, "grad_norm": 0.39123937487602234, "learning_rate": 9.260820447703817e-06, "loss": 0.0342881977558136, "memory(GiB)": 21.32, "step": 6666, "token_acc": 0.985, "train_speed(iter/s)": 0.950585 }, { "epoch": 0.2165805801903648, "grad_norm": 0.5732800364494324, "learning_rate": 9.260539344600063e-06, "loss": 0.04079627990722656, "memory(GiB)": 21.32, "step": 6667, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.95061 }, { "epoch": 0.21661306565312022, "grad_norm": 0.6032876372337341, "learning_rate": 9.260258192324015e-06, "loss": 0.043669477105140686, "memory(GiB)": 21.32, "step": 6668, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.950634 }, { "epoch": 0.21664555111587563, "grad_norm": 0.4186912477016449, "learning_rate": 9.259976990878915e-06, "loss": 0.04457179456949234, "memory(GiB)": 21.32, "step": 6669, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.950659 }, { "epoch": 0.21667803657863105, "grad_norm": 0.5607894659042358, "learning_rate": 9.25969574026801e-06, "loss": 0.04301324114203453, "memory(GiB)": 21.32, "step": 6670, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950684 }, { "epoch": 0.21671052204138647, "grad_norm": 1.403131365776062, "learning_rate": 9.25941444049455e-06, "loss": 0.034686021506786346, "memory(GiB)": 21.32, "step": 6671, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.950708 }, { "epoch": 0.2167430075041419, "grad_norm": 0.4239455461502075, "learning_rate": 9.259133091561774e-06, "loss": 0.04037202522158623, "memory(GiB)": 21.32, "step": 6672, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.950732 }, { "epoch": 0.21677549296689733, "grad_norm": 0.62933349609375, "learning_rate": 9.258851693472934e-06, "loss": 0.044989533722400665, "memory(GiB)": 21.32, "step": 6673, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.950758 }, { "epoch": 0.21680797842965274, "grad_norm": 0.6566511392593384, "learning_rate": 9.25857024623128e-06, "loss": 0.04255549982190132, "memory(GiB)": 21.32, "step": 6674, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.950778 }, { "epoch": 0.21684046389240816, "grad_norm": 1.0955349206924438, "learning_rate": 9.258288749840055e-06, "loss": 0.05291708558797836, "memory(GiB)": 21.32, "step": 6675, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.950802 }, { "epoch": 0.21687294935516357, "grad_norm": 0.7093071937561035, "learning_rate": 9.258007204302508e-06, "loss": 0.04331614822149277, "memory(GiB)": 21.32, "step": 6676, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.950828 }, { "epoch": 0.216905434817919, "grad_norm": 0.5117325782775879, "learning_rate": 9.257725609621894e-06, "loss": 0.039670199155807495, "memory(GiB)": 21.32, "step": 6677, "token_acc": 1.0, "train_speed(iter/s)": 0.950853 }, { "epoch": 0.2169379202806744, "grad_norm": 0.4456256330013275, "learning_rate": 9.257443965801457e-06, "loss": 0.036510977894067764, "memory(GiB)": 21.32, "step": 6678, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.950872 }, { "epoch": 0.21697040574342982, "grad_norm": 0.4565444588661194, "learning_rate": 9.257162272844452e-06, "loss": 0.03905278071761131, "memory(GiB)": 21.32, "step": 6679, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.950895 }, { "epoch": 0.21700289120618524, "grad_norm": 0.6552014946937561, "learning_rate": 9.256880530754126e-06, "loss": 0.04437589645385742, "memory(GiB)": 21.32, "step": 6680, "token_acc": 0.9694656488549618, "train_speed(iter/s)": 0.950916 }, { "epoch": 0.21703537666894065, "grad_norm": 0.7179420590400696, "learning_rate": 9.256598739533736e-06, "loss": 0.036060795187950134, "memory(GiB)": 21.32, "step": 6681, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95094 }, { "epoch": 0.21706786213169607, "grad_norm": 0.5853731036186218, "learning_rate": 9.256316899186528e-06, "loss": 0.03460258990526199, "memory(GiB)": 21.32, "step": 6682, "token_acc": 1.0, "train_speed(iter/s)": 0.950965 }, { "epoch": 0.2171003475944515, "grad_norm": 0.6752462983131409, "learning_rate": 9.256035009715757e-06, "loss": 0.04233195632696152, "memory(GiB)": 21.32, "step": 6683, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.950989 }, { "epoch": 0.2171328330572069, "grad_norm": 0.398868590593338, "learning_rate": 9.25575307112468e-06, "loss": 0.033509597182273865, "memory(GiB)": 21.32, "step": 6684, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951011 }, { "epoch": 0.21716531851996232, "grad_norm": 0.9374428987503052, "learning_rate": 9.255471083416547e-06, "loss": 0.043980613350868225, "memory(GiB)": 21.32, "step": 6685, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.951033 }, { "epoch": 0.21719780398271774, "grad_norm": 0.5620390176773071, "learning_rate": 9.255189046594616e-06, "loss": 0.04661904275417328, "memory(GiB)": 21.32, "step": 6686, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951055 }, { "epoch": 0.21723028944547315, "grad_norm": 0.40614908933639526, "learning_rate": 9.254906960662136e-06, "loss": 0.030317947268486023, "memory(GiB)": 21.32, "step": 6687, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.951078 }, { "epoch": 0.21726277490822857, "grad_norm": 0.5394797325134277, "learning_rate": 9.254624825622367e-06, "loss": 0.0450250580906868, "memory(GiB)": 21.32, "step": 6688, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.951101 }, { "epoch": 0.21729526037098398, "grad_norm": 0.5643765926361084, "learning_rate": 9.254342641478566e-06, "loss": 0.04506421089172363, "memory(GiB)": 21.32, "step": 6689, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951124 }, { "epoch": 0.2173277458337394, "grad_norm": 0.5258843302726746, "learning_rate": 9.254060408233989e-06, "loss": 0.04103697091341019, "memory(GiB)": 21.32, "step": 6690, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95115 }, { "epoch": 0.21736023129649482, "grad_norm": 0.41422250866889954, "learning_rate": 9.25377812589189e-06, "loss": 0.03847876191139221, "memory(GiB)": 21.32, "step": 6691, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.951175 }, { "epoch": 0.21739271675925023, "grad_norm": 0.5199591517448425, "learning_rate": 9.253495794455532e-06, "loss": 0.04610525816679001, "memory(GiB)": 21.32, "step": 6692, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.951198 }, { "epoch": 0.21742520222200565, "grad_norm": 0.4960651397705078, "learning_rate": 9.253213413928171e-06, "loss": 0.04279481619596481, "memory(GiB)": 21.32, "step": 6693, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.951222 }, { "epoch": 0.21745768768476106, "grad_norm": 0.7601115107536316, "learning_rate": 9.252930984313066e-06, "loss": 0.03345132991671562, "memory(GiB)": 21.32, "step": 6694, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.951245 }, { "epoch": 0.21749017314751648, "grad_norm": 0.44136151671409607, "learning_rate": 9.252648505613477e-06, "loss": 0.03640534728765488, "memory(GiB)": 21.32, "step": 6695, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.2175226586102719, "grad_norm": 0.46282532811164856, "learning_rate": 9.252365977832662e-06, "loss": 0.03441447392106056, "memory(GiB)": 21.32, "step": 6696, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.951289 }, { "epoch": 0.2175551440730273, "grad_norm": 0.464094340801239, "learning_rate": 9.252083400973885e-06, "loss": 0.024788234382867813, "memory(GiB)": 21.32, "step": 6697, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.951312 }, { "epoch": 0.21758762953578273, "grad_norm": 0.5975486040115356, "learning_rate": 9.251800775040407e-06, "loss": 0.03685789555311203, "memory(GiB)": 21.32, "step": 6698, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951337 }, { "epoch": 0.21762011499853814, "grad_norm": 0.6802288889884949, "learning_rate": 9.251518100035488e-06, "loss": 0.0337531752884388, "memory(GiB)": 21.32, "step": 6699, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95136 }, { "epoch": 0.21765260046129356, "grad_norm": 0.5211631655693054, "learning_rate": 9.251235375962392e-06, "loss": 0.035459212958812714, "memory(GiB)": 21.32, "step": 6700, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.951384 }, { "epoch": 0.21768508592404898, "grad_norm": 0.6225220561027527, "learning_rate": 9.25095260282438e-06, "loss": 0.04065503925085068, "memory(GiB)": 21.32, "step": 6701, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.951407 }, { "epoch": 0.2177175713868044, "grad_norm": 4.192636013031006, "learning_rate": 9.250669780624718e-06, "loss": 0.03360949084162712, "memory(GiB)": 21.32, "step": 6702, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.951432 }, { "epoch": 0.2177500568495598, "grad_norm": 1.329179286956787, "learning_rate": 9.250386909366671e-06, "loss": 0.04865805804729462, "memory(GiB)": 21.32, "step": 6703, "token_acc": 0.9641434262948207, "train_speed(iter/s)": 0.951458 }, { "epoch": 0.21778254231231525, "grad_norm": 0.5201743245124817, "learning_rate": 9.250103989053497e-06, "loss": 0.03757324442267418, "memory(GiB)": 21.32, "step": 6704, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.951476 }, { "epoch": 0.21781502777507067, "grad_norm": 0.49580177664756775, "learning_rate": 9.249821019688471e-06, "loss": 0.03669178858399391, "memory(GiB)": 21.32, "step": 6705, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.951497 }, { "epoch": 0.21784751323782608, "grad_norm": 0.5214641690254211, "learning_rate": 9.249538001274851e-06, "loss": 0.040437448769807816, "memory(GiB)": 21.32, "step": 6706, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951519 }, { "epoch": 0.2178799987005815, "grad_norm": 0.7259652614593506, "learning_rate": 9.249254933815908e-06, "loss": 0.0405501052737236, "memory(GiB)": 21.32, "step": 6707, "token_acc": 0.961352657004831, "train_speed(iter/s)": 0.951542 }, { "epoch": 0.21791248416333692, "grad_norm": 0.5713953375816345, "learning_rate": 9.248971817314906e-06, "loss": 0.04309486597776413, "memory(GiB)": 21.32, "step": 6708, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.951565 }, { "epoch": 0.21794496962609233, "grad_norm": 0.6188633441925049, "learning_rate": 9.248688651775114e-06, "loss": 0.04766479879617691, "memory(GiB)": 21.32, "step": 6709, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.95159 }, { "epoch": 0.21797745508884775, "grad_norm": 0.7073264718055725, "learning_rate": 9.248405437199801e-06, "loss": 0.03838439658284187, "memory(GiB)": 21.32, "step": 6710, "token_acc": 0.98046875, "train_speed(iter/s)": 0.951616 }, { "epoch": 0.21800994055160317, "grad_norm": 0.7176084518432617, "learning_rate": 9.248122173592234e-06, "loss": 0.04588144272565842, "memory(GiB)": 21.32, "step": 6711, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.951643 }, { "epoch": 0.21804242601435858, "grad_norm": 0.5646724700927734, "learning_rate": 9.247838860955682e-06, "loss": 0.035786017775535583, "memory(GiB)": 21.32, "step": 6712, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.951671 }, { "epoch": 0.218074911477114, "grad_norm": 0.8192888498306274, "learning_rate": 9.247555499293416e-06, "loss": 0.04840364307165146, "memory(GiB)": 21.32, "step": 6713, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.9517 }, { "epoch": 0.2181073969398694, "grad_norm": 0.9039208889007568, "learning_rate": 9.247272088608708e-06, "loss": 0.040046460926532745, "memory(GiB)": 21.32, "step": 6714, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.951729 }, { "epoch": 0.21813988240262483, "grad_norm": 1.8150557279586792, "learning_rate": 9.246988628904825e-06, "loss": 0.04460349678993225, "memory(GiB)": 21.32, "step": 6715, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.951759 }, { "epoch": 0.21817236786538025, "grad_norm": 0.7053580284118652, "learning_rate": 9.246705120185042e-06, "loss": 0.04160089045763016, "memory(GiB)": 21.32, "step": 6716, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.951789 }, { "epoch": 0.21820485332813566, "grad_norm": 0.8361645936965942, "learning_rate": 9.24642156245263e-06, "loss": 0.03611144423484802, "memory(GiB)": 21.32, "step": 6717, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.951818 }, { "epoch": 0.21823733879089108, "grad_norm": 0.6470143795013428, "learning_rate": 9.246137955710858e-06, "loss": 0.03324688598513603, "memory(GiB)": 21.32, "step": 6718, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951848 }, { "epoch": 0.2182698242536465, "grad_norm": 0.5636917948722839, "learning_rate": 9.245854299963005e-06, "loss": 0.03936094418168068, "memory(GiB)": 21.32, "step": 6719, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.951877 }, { "epoch": 0.2183023097164019, "grad_norm": 0.5133227109909058, "learning_rate": 9.24557059521234e-06, "loss": 0.03523632511496544, "memory(GiB)": 21.32, "step": 6720, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.951904 }, { "epoch": 0.21833479517915733, "grad_norm": 0.5883939266204834, "learning_rate": 9.245286841462141e-06, "loss": 0.051422182470560074, "memory(GiB)": 21.32, "step": 6721, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.951935 }, { "epoch": 0.21836728064191274, "grad_norm": 0.7926406860351562, "learning_rate": 9.245003038715682e-06, "loss": 0.0468873530626297, "memory(GiB)": 21.32, "step": 6722, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.951965 }, { "epoch": 0.21839976610466816, "grad_norm": 0.6433503031730652, "learning_rate": 9.244719186976237e-06, "loss": 0.03610615432262421, "memory(GiB)": 21.32, "step": 6723, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.951995 }, { "epoch": 0.21843225156742357, "grad_norm": 0.7231613993644714, "learning_rate": 9.244435286247085e-06, "loss": 0.04894913733005524, "memory(GiB)": 21.32, "step": 6724, "token_acc": 0.984, "train_speed(iter/s)": 0.952024 }, { "epoch": 0.218464737030179, "grad_norm": 0.6023004651069641, "learning_rate": 9.244151336531497e-06, "loss": 0.03932074457406998, "memory(GiB)": 21.32, "step": 6725, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.952055 }, { "epoch": 0.2184972224929344, "grad_norm": 0.48046445846557617, "learning_rate": 9.243867337832755e-06, "loss": 0.0412907600402832, "memory(GiB)": 21.32, "step": 6726, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952079 }, { "epoch": 0.21852970795568982, "grad_norm": 0.538688600063324, "learning_rate": 9.243583290154134e-06, "loss": 0.03200354427099228, "memory(GiB)": 21.32, "step": 6727, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.952093 }, { "epoch": 0.21856219341844524, "grad_norm": 0.736412525177002, "learning_rate": 9.243299193498916e-06, "loss": 0.04513702914118767, "memory(GiB)": 21.32, "step": 6728, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.952114 }, { "epoch": 0.21859467888120065, "grad_norm": 0.6906259059906006, "learning_rate": 9.243015047870376e-06, "loss": 0.050001971423625946, "memory(GiB)": 21.32, "step": 6729, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952136 }, { "epoch": 0.21862716434395607, "grad_norm": 0.4206908941268921, "learning_rate": 9.242730853271796e-06, "loss": 0.03867892175912857, "memory(GiB)": 21.32, "step": 6730, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952159 }, { "epoch": 0.2186596498067115, "grad_norm": 0.5140112042427063, "learning_rate": 9.242446609706454e-06, "loss": 0.038309961557388306, "memory(GiB)": 21.32, "step": 6731, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.952184 }, { "epoch": 0.2186921352694669, "grad_norm": 0.5334054231643677, "learning_rate": 9.242162317177632e-06, "loss": 0.0371074452996254, "memory(GiB)": 21.32, "step": 6732, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.952206 }, { "epoch": 0.21872462073222232, "grad_norm": 0.4680531322956085, "learning_rate": 9.241877975688611e-06, "loss": 0.031141042709350586, "memory(GiB)": 21.32, "step": 6733, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.952229 }, { "epoch": 0.21875710619497774, "grad_norm": 0.5391191840171814, "learning_rate": 9.241593585242672e-06, "loss": 0.04738978296518326, "memory(GiB)": 21.32, "step": 6734, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.95225 }, { "epoch": 0.21878959165773315, "grad_norm": 0.5922471880912781, "learning_rate": 9.241309145843095e-06, "loss": 0.04697873443365097, "memory(GiB)": 21.32, "step": 6735, "token_acc": 0.965, "train_speed(iter/s)": 0.95227 }, { "epoch": 0.2188220771204886, "grad_norm": 0.4841843545436859, "learning_rate": 9.24102465749317e-06, "loss": 0.04045914113521576, "memory(GiB)": 21.32, "step": 6736, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.952295 }, { "epoch": 0.218854562583244, "grad_norm": 1.6871626377105713, "learning_rate": 9.240740120196173e-06, "loss": 0.03955404460430145, "memory(GiB)": 21.32, "step": 6737, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.952318 }, { "epoch": 0.21888704804599943, "grad_norm": 0.47313952445983887, "learning_rate": 9.24045553395539e-06, "loss": 0.04299348592758179, "memory(GiB)": 21.32, "step": 6738, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.952342 }, { "epoch": 0.21891953350875484, "grad_norm": 0.5167127847671509, "learning_rate": 9.240170898774105e-06, "loss": 0.03997476026415825, "memory(GiB)": 21.32, "step": 6739, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.952366 }, { "epoch": 0.21895201897151026, "grad_norm": 0.3993234634399414, "learning_rate": 9.239886214655607e-06, "loss": 0.042041242122650146, "memory(GiB)": 21.32, "step": 6740, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.952384 }, { "epoch": 0.21898450443426568, "grad_norm": 0.6463128924369812, "learning_rate": 9.239601481603177e-06, "loss": 0.03794528543949127, "memory(GiB)": 21.32, "step": 6741, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.952404 }, { "epoch": 0.2190169898970211, "grad_norm": 0.556955873966217, "learning_rate": 9.239316699620104e-06, "loss": 0.04884042590856552, "memory(GiB)": 21.32, "step": 6742, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.952429 }, { "epoch": 0.2190494753597765, "grad_norm": 0.42380964756011963, "learning_rate": 9.239031868709673e-06, "loss": 0.04070238023996353, "memory(GiB)": 21.32, "step": 6743, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.952451 }, { "epoch": 0.21908196082253192, "grad_norm": 0.432850182056427, "learning_rate": 9.238746988875173e-06, "loss": 0.035160187631845474, "memory(GiB)": 21.32, "step": 6744, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.952475 }, { "epoch": 0.21911444628528734, "grad_norm": 0.4495824873447418, "learning_rate": 9.23846206011989e-06, "loss": 0.03323080390691757, "memory(GiB)": 21.32, "step": 6745, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.952498 }, { "epoch": 0.21914693174804276, "grad_norm": 0.3893994092941284, "learning_rate": 9.238177082447115e-06, "loss": 0.04440976679325104, "memory(GiB)": 21.32, "step": 6746, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.952528 }, { "epoch": 0.21917941721079817, "grad_norm": 0.45015498995780945, "learning_rate": 9.237892055860135e-06, "loss": 0.03875688463449478, "memory(GiB)": 21.32, "step": 6747, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952558 }, { "epoch": 0.2192119026735536, "grad_norm": 0.4671669602394104, "learning_rate": 9.23760698036224e-06, "loss": 0.040424786508083344, "memory(GiB)": 21.32, "step": 6748, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.952588 }, { "epoch": 0.219244388136309, "grad_norm": 0.5133130550384521, "learning_rate": 9.237321855956718e-06, "loss": 0.034197475761175156, "memory(GiB)": 21.32, "step": 6749, "token_acc": 1.0, "train_speed(iter/s)": 0.952617 }, { "epoch": 0.21927687359906442, "grad_norm": 0.5278525948524475, "learning_rate": 9.237036682646864e-06, "loss": 0.0420849546790123, "memory(GiB)": 21.32, "step": 6750, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95264 }, { "epoch": 0.21930935906181984, "grad_norm": 0.5096864700317383, "learning_rate": 9.236751460435967e-06, "loss": 0.04028651490807533, "memory(GiB)": 21.32, "step": 6751, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95264 }, { "epoch": 0.21934184452457525, "grad_norm": 0.45834779739379883, "learning_rate": 9.23646618932732e-06, "loss": 0.03784916177392006, "memory(GiB)": 21.32, "step": 6752, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.952665 }, { "epoch": 0.21937432998733067, "grad_norm": 0.5849021077156067, "learning_rate": 9.236180869324213e-06, "loss": 0.03523506969213486, "memory(GiB)": 21.32, "step": 6753, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.952687 }, { "epoch": 0.21940681545008608, "grad_norm": 1.8016657829284668, "learning_rate": 9.23589550042994e-06, "loss": 0.03962070494890213, "memory(GiB)": 21.32, "step": 6754, "token_acc": 0.9720670391061452, "train_speed(iter/s)": 0.952711 }, { "epoch": 0.2194393009128415, "grad_norm": 0.44774699211120605, "learning_rate": 9.235610082647797e-06, "loss": 0.03567143529653549, "memory(GiB)": 21.32, "step": 6755, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.952733 }, { "epoch": 0.21947178637559692, "grad_norm": 0.5379160046577454, "learning_rate": 9.235324615981074e-06, "loss": 0.03769978880882263, "memory(GiB)": 21.32, "step": 6756, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.952756 }, { "epoch": 0.21950427183835233, "grad_norm": 0.5031079053878784, "learning_rate": 9.235039100433068e-06, "loss": 0.027386479079723358, "memory(GiB)": 21.32, "step": 6757, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95278 }, { "epoch": 0.21953675730110775, "grad_norm": 0.6707330942153931, "learning_rate": 9.234753536007075e-06, "loss": 0.03611181676387787, "memory(GiB)": 21.32, "step": 6758, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.952803 }, { "epoch": 0.21956924276386316, "grad_norm": 0.5742661952972412, "learning_rate": 9.234467922706389e-06, "loss": 0.037349313497543335, "memory(GiB)": 21.32, "step": 6759, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.952823 }, { "epoch": 0.21960172822661858, "grad_norm": 0.44445711374282837, "learning_rate": 9.234182260534308e-06, "loss": 0.03159206360578537, "memory(GiB)": 21.32, "step": 6760, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.952846 }, { "epoch": 0.219634213689374, "grad_norm": 0.4548211991786957, "learning_rate": 9.233896549494128e-06, "loss": 0.03413110226392746, "memory(GiB)": 21.32, "step": 6761, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.952868 }, { "epoch": 0.2196666991521294, "grad_norm": 0.4621327519416809, "learning_rate": 9.233610789589145e-06, "loss": 0.03060944750905037, "memory(GiB)": 21.32, "step": 6762, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.952892 }, { "epoch": 0.21969918461488483, "grad_norm": 0.47868022322654724, "learning_rate": 9.23332498082266e-06, "loss": 0.035766907036304474, "memory(GiB)": 21.32, "step": 6763, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.952915 }, { "epoch": 0.21973167007764025, "grad_norm": 0.7089102268218994, "learning_rate": 9.233039123197969e-06, "loss": 0.04849595949053764, "memory(GiB)": 21.32, "step": 6764, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.952939 }, { "epoch": 0.21976415554039566, "grad_norm": 0.45830851793289185, "learning_rate": 9.232753216718373e-06, "loss": 0.031131872907280922, "memory(GiB)": 21.32, "step": 6765, "token_acc": 0.984, "train_speed(iter/s)": 0.952965 }, { "epoch": 0.21979664100315108, "grad_norm": 0.6240020990371704, "learning_rate": 9.232467261387171e-06, "loss": 0.03453803062438965, "memory(GiB)": 21.32, "step": 6766, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.952989 }, { "epoch": 0.2198291264659065, "grad_norm": 0.7778204083442688, "learning_rate": 9.232181257207664e-06, "loss": 0.04379180818796158, "memory(GiB)": 21.32, "step": 6767, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.953013 }, { "epoch": 0.21986161192866194, "grad_norm": 0.5095418095588684, "learning_rate": 9.231895204183152e-06, "loss": 0.034314095973968506, "memory(GiB)": 21.32, "step": 6768, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953032 }, { "epoch": 0.21989409739141735, "grad_norm": 0.6644372940063477, "learning_rate": 9.231609102316936e-06, "loss": 0.045774269849061966, "memory(GiB)": 21.32, "step": 6769, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.953055 }, { "epoch": 0.21992658285417277, "grad_norm": 1.175563097000122, "learning_rate": 9.231322951612318e-06, "loss": 0.043201837688684464, "memory(GiB)": 21.32, "step": 6770, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953079 }, { "epoch": 0.21995906831692819, "grad_norm": 0.8283205628395081, "learning_rate": 9.231036752072602e-06, "loss": 0.038065243512392044, "memory(GiB)": 21.32, "step": 6771, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953102 }, { "epoch": 0.2199915537796836, "grad_norm": 0.5691311955451965, "learning_rate": 9.230750503701091e-06, "loss": 0.03820043057203293, "memory(GiB)": 21.32, "step": 6772, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.953127 }, { "epoch": 0.22002403924243902, "grad_norm": 0.6385272145271301, "learning_rate": 9.230464206501089e-06, "loss": 0.040021538734436035, "memory(GiB)": 21.32, "step": 6773, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.953155 }, { "epoch": 0.22005652470519443, "grad_norm": 0.7474076151847839, "learning_rate": 9.230177860475898e-06, "loss": 0.046059489250183105, "memory(GiB)": 21.32, "step": 6774, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953155 }, { "epoch": 0.22008901016794985, "grad_norm": 2.141493797302246, "learning_rate": 9.229891465628824e-06, "loss": 0.053035393357276917, "memory(GiB)": 21.32, "step": 6775, "token_acc": 0.98046875, "train_speed(iter/s)": 0.953185 }, { "epoch": 0.22012149563070527, "grad_norm": 0.8010231256484985, "learning_rate": 9.229605021963172e-06, "loss": 0.040058672428131104, "memory(GiB)": 21.32, "step": 6776, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.953215 }, { "epoch": 0.22015398109346068, "grad_norm": 0.6237381100654602, "learning_rate": 9.22931852948225e-06, "loss": 0.04559943452477455, "memory(GiB)": 21.32, "step": 6777, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.953243 }, { "epoch": 0.2201864665562161, "grad_norm": 0.5148797035217285, "learning_rate": 9.229031988189362e-06, "loss": 0.03848298639059067, "memory(GiB)": 21.32, "step": 6778, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953267 }, { "epoch": 0.22021895201897151, "grad_norm": 0.42831313610076904, "learning_rate": 9.228745398087818e-06, "loss": 0.03965369984507561, "memory(GiB)": 21.32, "step": 6779, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.953294 }, { "epoch": 0.22025143748172693, "grad_norm": 1.6883914470672607, "learning_rate": 9.228458759180921e-06, "loss": 0.03863703832030296, "memory(GiB)": 21.32, "step": 6780, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.953323 }, { "epoch": 0.22028392294448235, "grad_norm": 2.8923239707946777, "learning_rate": 9.228172071471982e-06, "loss": 0.0422748439013958, "memory(GiB)": 21.32, "step": 6781, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.953352 }, { "epoch": 0.22031640840723776, "grad_norm": 0.5873656272888184, "learning_rate": 9.227885334964311e-06, "loss": 0.03583648055791855, "memory(GiB)": 21.32, "step": 6782, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.953381 }, { "epoch": 0.22034889386999318, "grad_norm": 0.6562833189964294, "learning_rate": 9.227598549661215e-06, "loss": 0.038201719522476196, "memory(GiB)": 21.32, "step": 6783, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.953411 }, { "epoch": 0.2203813793327486, "grad_norm": 0.5538501143455505, "learning_rate": 9.227311715566006e-06, "loss": 0.03809560835361481, "memory(GiB)": 21.32, "step": 6784, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953441 }, { "epoch": 0.220413864795504, "grad_norm": 0.5634772777557373, "learning_rate": 9.227024832681992e-06, "loss": 0.03980186954140663, "memory(GiB)": 21.32, "step": 6785, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.953465 }, { "epoch": 0.22044635025825943, "grad_norm": 0.8473009467124939, "learning_rate": 9.226737901012486e-06, "loss": 0.04104533791542053, "memory(GiB)": 21.32, "step": 6786, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.953487 }, { "epoch": 0.22047883572101484, "grad_norm": 0.42874330282211304, "learning_rate": 9.226450920560798e-06, "loss": 0.03760620206594467, "memory(GiB)": 21.32, "step": 6787, "token_acc": 0.9765625, "train_speed(iter/s)": 0.953508 }, { "epoch": 0.22051132118377026, "grad_norm": 0.958811342716217, "learning_rate": 9.226163891330241e-06, "loss": 0.05235380306839943, "memory(GiB)": 21.32, "step": 6788, "token_acc": 0.9581589958158996, "train_speed(iter/s)": 0.953531 }, { "epoch": 0.22054380664652568, "grad_norm": 1.0333971977233887, "learning_rate": 9.225876813324128e-06, "loss": 0.06261269003152847, "memory(GiB)": 21.32, "step": 6789, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.953551 }, { "epoch": 0.2205762921092811, "grad_norm": 2.514030694961548, "learning_rate": 9.225589686545772e-06, "loss": 0.04227428883314133, "memory(GiB)": 21.32, "step": 6790, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.953575 }, { "epoch": 0.2206087775720365, "grad_norm": 0.6518080234527588, "learning_rate": 9.225302510998489e-06, "loss": 0.04150104522705078, "memory(GiB)": 21.32, "step": 6791, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953599 }, { "epoch": 0.22064126303479192, "grad_norm": 0.5231893658638, "learning_rate": 9.225015286685587e-06, "loss": 0.048126887530088425, "memory(GiB)": 21.32, "step": 6792, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.953622 }, { "epoch": 0.22067374849754734, "grad_norm": 0.4629021883010864, "learning_rate": 9.224728013610388e-06, "loss": 0.03675165772438049, "memory(GiB)": 21.32, "step": 6793, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95364 }, { "epoch": 0.22070623396030276, "grad_norm": 0.5266225337982178, "learning_rate": 9.224440691776205e-06, "loss": 0.046285733580589294, "memory(GiB)": 21.32, "step": 6794, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.953664 }, { "epoch": 0.22073871942305817, "grad_norm": 0.5586864948272705, "learning_rate": 9.22415332118635e-06, "loss": 0.044817931950092316, "memory(GiB)": 21.32, "step": 6795, "token_acc": 0.9740740740740741, "train_speed(iter/s)": 0.953686 }, { "epoch": 0.2207712048858136, "grad_norm": 0.48613104224205017, "learning_rate": 9.223865901844147e-06, "loss": 0.03332066163420677, "memory(GiB)": 21.32, "step": 6796, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.953708 }, { "epoch": 0.220803690348569, "grad_norm": 0.7091331481933594, "learning_rate": 9.223578433752908e-06, "loss": 0.03733747825026512, "memory(GiB)": 21.32, "step": 6797, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953727 }, { "epoch": 0.22083617581132442, "grad_norm": 0.43953150510787964, "learning_rate": 9.223290916915953e-06, "loss": 0.037288520485162735, "memory(GiB)": 21.32, "step": 6798, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953748 }, { "epoch": 0.22086866127407984, "grad_norm": 0.4265197813510895, "learning_rate": 9.2230033513366e-06, "loss": 0.037778861820697784, "memory(GiB)": 21.32, "step": 6799, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.953771 }, { "epoch": 0.22090114673683528, "grad_norm": 0.49584537744522095, "learning_rate": 9.222715737018167e-06, "loss": 0.029453396797180176, "memory(GiB)": 21.32, "step": 6800, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.953795 }, { "epoch": 0.2209336321995907, "grad_norm": 0.5725142955780029, "learning_rate": 9.222428073963974e-06, "loss": 0.046209439635276794, "memory(GiB)": 21.32, "step": 6801, "token_acc": 0.9691780821917808, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.2209661176623461, "grad_norm": 0.5253798961639404, "learning_rate": 9.22214036217734e-06, "loss": 0.04720441997051239, "memory(GiB)": 21.32, "step": 6802, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.953841 }, { "epoch": 0.22099860312510153, "grad_norm": 0.5981359481811523, "learning_rate": 9.221852601661588e-06, "loss": 0.03728420287370682, "memory(GiB)": 21.32, "step": 6803, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953863 }, { "epoch": 0.22103108858785694, "grad_norm": 0.5287158489227295, "learning_rate": 9.221564792420037e-06, "loss": 0.036485299468040466, "memory(GiB)": 21.32, "step": 6804, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.953887 }, { "epoch": 0.22106357405061236, "grad_norm": 0.6692900061607361, "learning_rate": 9.22127693445601e-06, "loss": 0.05405043438076973, "memory(GiB)": 21.32, "step": 6805, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.953917 }, { "epoch": 0.22109605951336778, "grad_norm": 0.4357486963272095, "learning_rate": 9.220989027772827e-06, "loss": 0.02886025421321392, "memory(GiB)": 21.32, "step": 6806, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953947 }, { "epoch": 0.2211285449761232, "grad_norm": 0.44746920466423035, "learning_rate": 9.220701072373814e-06, "loss": 0.034566961228847504, "memory(GiB)": 21.32, "step": 6807, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.953978 }, { "epoch": 0.2211610304388786, "grad_norm": 0.45899564027786255, "learning_rate": 9.220413068262293e-06, "loss": 0.030198965221643448, "memory(GiB)": 21.32, "step": 6808, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95401 }, { "epoch": 0.22119351590163402, "grad_norm": 0.5347120761871338, "learning_rate": 9.220125015441587e-06, "loss": 0.03622451424598694, "memory(GiB)": 21.32, "step": 6809, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.95404 }, { "epoch": 0.22122600136438944, "grad_norm": 0.5645947456359863, "learning_rate": 9.21983691391502e-06, "loss": 0.03890739381313324, "memory(GiB)": 21.32, "step": 6810, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954068 }, { "epoch": 0.22125848682714486, "grad_norm": 0.6498442888259888, "learning_rate": 9.219548763685922e-06, "loss": 0.035134654492139816, "memory(GiB)": 21.32, "step": 6811, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.954092 }, { "epoch": 0.22129097228990027, "grad_norm": 1.317662000656128, "learning_rate": 9.219260564757612e-06, "loss": 0.03908946365118027, "memory(GiB)": 21.32, "step": 6812, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.954114 }, { "epoch": 0.2213234577526557, "grad_norm": 0.5416918992996216, "learning_rate": 9.21897231713342e-06, "loss": 0.031053565442562103, "memory(GiB)": 21.32, "step": 6813, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954137 }, { "epoch": 0.2213559432154111, "grad_norm": 0.9149413108825684, "learning_rate": 9.218684020816673e-06, "loss": 0.040501222014427185, "memory(GiB)": 21.32, "step": 6814, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95416 }, { "epoch": 0.22138842867816652, "grad_norm": 0.472273588180542, "learning_rate": 9.218395675810698e-06, "loss": 0.04167530685663223, "memory(GiB)": 21.32, "step": 6815, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954183 }, { "epoch": 0.22142091414092194, "grad_norm": 0.3624408543109894, "learning_rate": 9.21810728211882e-06, "loss": 0.03093140758574009, "memory(GiB)": 21.32, "step": 6816, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954206 }, { "epoch": 0.22145339960367735, "grad_norm": 0.5777069926261902, "learning_rate": 9.217818839744372e-06, "loss": 0.041469622403383255, "memory(GiB)": 21.32, "step": 6817, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95423 }, { "epoch": 0.22148588506643277, "grad_norm": 0.45819661021232605, "learning_rate": 9.217530348690679e-06, "loss": 0.026230905205011368, "memory(GiB)": 21.32, "step": 6818, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.954253 }, { "epoch": 0.22151837052918819, "grad_norm": 0.44240039587020874, "learning_rate": 9.217241808961072e-06, "loss": 0.03558426350355148, "memory(GiB)": 21.32, "step": 6819, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954277 }, { "epoch": 0.2215508559919436, "grad_norm": 0.4871354401111603, "learning_rate": 9.216953220558882e-06, "loss": 0.03971567377448082, "memory(GiB)": 21.32, "step": 6820, "token_acc": 0.9663865546218487, "train_speed(iter/s)": 0.954298 }, { "epoch": 0.22158334145469902, "grad_norm": 0.6073246598243713, "learning_rate": 9.216664583487439e-06, "loss": 0.034185897558927536, "memory(GiB)": 21.32, "step": 6821, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.95432 }, { "epoch": 0.22161582691745443, "grad_norm": 0.5806357264518738, "learning_rate": 9.216375897750074e-06, "loss": 0.043468452990055084, "memory(GiB)": 21.32, "step": 6822, "token_acc": 1.0, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.22164831238020985, "grad_norm": 0.5238623023033142, "learning_rate": 9.21608716335012e-06, "loss": 0.03854939341545105, "memory(GiB)": 21.32, "step": 6823, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954367 }, { "epoch": 0.22168079784296527, "grad_norm": 0.6385864019393921, "learning_rate": 9.215798380290907e-06, "loss": 0.05122615769505501, "memory(GiB)": 21.32, "step": 6824, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.95439 }, { "epoch": 0.22171328330572068, "grad_norm": 0.6034185290336609, "learning_rate": 9.215509548575772e-06, "loss": 0.040568891912698746, "memory(GiB)": 21.32, "step": 6825, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.2217457687684761, "grad_norm": 0.6651878952980042, "learning_rate": 9.215220668208045e-06, "loss": 0.0394272543489933, "memory(GiB)": 21.32, "step": 6826, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954438 }, { "epoch": 0.22177825423123151, "grad_norm": 0.6915130019187927, "learning_rate": 9.21493173919106e-06, "loss": 0.03908613696694374, "memory(GiB)": 21.32, "step": 6827, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954463 }, { "epoch": 0.22181073969398693, "grad_norm": 0.6567236185073853, "learning_rate": 9.214642761528154e-06, "loss": 0.039942286908626556, "memory(GiB)": 21.32, "step": 6828, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.954488 }, { "epoch": 0.22184322515674235, "grad_norm": 0.9520503878593445, "learning_rate": 9.21435373522266e-06, "loss": 0.042385078966617584, "memory(GiB)": 21.32, "step": 6829, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.954509 }, { "epoch": 0.22187571061949776, "grad_norm": 0.6764935851097107, "learning_rate": 9.214064660277915e-06, "loss": 0.03966984152793884, "memory(GiB)": 21.32, "step": 6830, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954529 }, { "epoch": 0.22190819608225318, "grad_norm": 0.5886715650558472, "learning_rate": 9.213775536697253e-06, "loss": 0.04207945615053177, "memory(GiB)": 21.32, "step": 6831, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954552 }, { "epoch": 0.22194068154500862, "grad_norm": 0.7542985677719116, "learning_rate": 9.213486364484016e-06, "loss": 0.041198696941137314, "memory(GiB)": 21.32, "step": 6832, "token_acc": 0.9845360824742269, "train_speed(iter/s)": 0.954574 }, { "epoch": 0.22197316700776404, "grad_norm": 0.49366989731788635, "learning_rate": 9.213197143641536e-06, "loss": 0.04747484624385834, "memory(GiB)": 21.32, "step": 6833, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.22200565247051945, "grad_norm": 0.6612887382507324, "learning_rate": 9.212907874173155e-06, "loss": 0.04269440472126007, "memory(GiB)": 21.32, "step": 6834, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.954623 }, { "epoch": 0.22203813793327487, "grad_norm": 0.5041739344596863, "learning_rate": 9.212618556082208e-06, "loss": 0.042031820863485336, "memory(GiB)": 21.32, "step": 6835, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.954652 }, { "epoch": 0.2220706233960303, "grad_norm": 0.44056200981140137, "learning_rate": 9.212329189372036e-06, "loss": 0.04039064049720764, "memory(GiB)": 21.32, "step": 6836, "token_acc": 0.988, "train_speed(iter/s)": 0.954681 }, { "epoch": 0.2221031088587857, "grad_norm": 0.7441498637199402, "learning_rate": 9.21203977404598e-06, "loss": 0.04342471808195114, "memory(GiB)": 21.32, "step": 6837, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.954711 }, { "epoch": 0.22213559432154112, "grad_norm": 0.68592768907547, "learning_rate": 9.211750310107378e-06, "loss": 0.05156596004962921, "memory(GiB)": 21.32, "step": 6838, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.954741 }, { "epoch": 0.22216807978429653, "grad_norm": 0.4516960382461548, "learning_rate": 9.211460797559571e-06, "loss": 0.038208942860364914, "memory(GiB)": 21.32, "step": 6839, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.95477 }, { "epoch": 0.22220056524705195, "grad_norm": 0.42675647139549255, "learning_rate": 9.211171236405903e-06, "loss": 0.03099202550947666, "memory(GiB)": 21.32, "step": 6840, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.22223305070980737, "grad_norm": 0.4513397216796875, "learning_rate": 9.210881626649711e-06, "loss": 0.042830973863601685, "memory(GiB)": 21.32, "step": 6841, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.22226553617256278, "grad_norm": 0.8010939359664917, "learning_rate": 9.210591968294343e-06, "loss": 0.043192025274038315, "memory(GiB)": 21.32, "step": 6842, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.954861 }, { "epoch": 0.2222980216353182, "grad_norm": 0.4557095468044281, "learning_rate": 9.210302261343137e-06, "loss": 0.03175657242536545, "memory(GiB)": 21.32, "step": 6843, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.95489 }, { "epoch": 0.22233050709807362, "grad_norm": 0.5301356315612793, "learning_rate": 9.210012505799439e-06, "loss": 0.044435106217861176, "memory(GiB)": 21.32, "step": 6844, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.954913 }, { "epoch": 0.22236299256082903, "grad_norm": 0.7065622210502625, "learning_rate": 9.209722701666594e-06, "loss": 0.048023972660303116, "memory(GiB)": 21.32, "step": 6845, "token_acc": 0.9784172661870504, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.22239547802358445, "grad_norm": 0.38370004296302795, "learning_rate": 9.209432848947948e-06, "loss": 0.025300312787294388, "memory(GiB)": 21.32, "step": 6846, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.22242796348633986, "grad_norm": 0.38604477047920227, "learning_rate": 9.209142947646842e-06, "loss": 0.03432980552315712, "memory(GiB)": 21.32, "step": 6847, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.954982 }, { "epoch": 0.22246044894909528, "grad_norm": 0.5937352180480957, "learning_rate": 9.208852997766624e-06, "loss": 0.0429137647151947, "memory(GiB)": 21.32, "step": 6848, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.955001 }, { "epoch": 0.2224929344118507, "grad_norm": 0.5081093311309814, "learning_rate": 9.208562999310641e-06, "loss": 0.04335267096757889, "memory(GiB)": 21.32, "step": 6849, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.955023 }, { "epoch": 0.2225254198746061, "grad_norm": 1.4012833833694458, "learning_rate": 9.20827295228224e-06, "loss": 0.042737338691949844, "memory(GiB)": 21.32, "step": 6850, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.22255790533736153, "grad_norm": 0.569677472114563, "learning_rate": 9.207982856684765e-06, "loss": 0.038673631846904755, "memory(GiB)": 21.32, "step": 6851, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.22259039080011694, "grad_norm": 0.3674671947956085, "learning_rate": 9.20769271252157e-06, "loss": 0.027642689645290375, "memory(GiB)": 21.32, "step": 6852, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.22262287626287236, "grad_norm": 0.46712052822113037, "learning_rate": 9.207402519796e-06, "loss": 0.040840089321136475, "memory(GiB)": 21.32, "step": 6853, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.95511 }, { "epoch": 0.22265536172562778, "grad_norm": 0.4978291690349579, "learning_rate": 9.207112278511406e-06, "loss": 0.04046662896871567, "memory(GiB)": 21.32, "step": 6854, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955134 }, { "epoch": 0.2226878471883832, "grad_norm": 0.6565234661102295, "learning_rate": 9.206821988671136e-06, "loss": 0.04572463408112526, "memory(GiB)": 21.32, "step": 6855, "token_acc": 0.98046875, "train_speed(iter/s)": 0.955154 }, { "epoch": 0.2227203326511386, "grad_norm": 0.5518842339515686, "learning_rate": 9.20653165027854e-06, "loss": 0.04691489040851593, "memory(GiB)": 21.32, "step": 6856, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955172 }, { "epoch": 0.22275281811389402, "grad_norm": 1.6825666427612305, "learning_rate": 9.206241263336969e-06, "loss": 0.04040706157684326, "memory(GiB)": 21.32, "step": 6857, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955193 }, { "epoch": 0.22278530357664944, "grad_norm": 0.4968593716621399, "learning_rate": 9.205950827849777e-06, "loss": 0.0397280752658844, "memory(GiB)": 21.32, "step": 6858, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955215 }, { "epoch": 0.22281778903940486, "grad_norm": 0.6611095070838928, "learning_rate": 9.205660343820314e-06, "loss": 0.038546666502952576, "memory(GiB)": 21.32, "step": 6859, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.22285027450216027, "grad_norm": 0.4608403444290161, "learning_rate": 9.205369811251931e-06, "loss": 0.03477504104375839, "memory(GiB)": 21.32, "step": 6860, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.2228827599649157, "grad_norm": 0.486939013004303, "learning_rate": 9.205079230147986e-06, "loss": 0.030201997607946396, "memory(GiB)": 21.32, "step": 6861, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.955279 }, { "epoch": 0.2229152454276711, "grad_norm": 0.5828614234924316, "learning_rate": 9.204788600511829e-06, "loss": 0.041278377175331116, "memory(GiB)": 21.32, "step": 6862, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.22294773089042652, "grad_norm": 0.438539981842041, "learning_rate": 9.204497922346814e-06, "loss": 0.0383361391723156, "memory(GiB)": 21.32, "step": 6863, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.22298021635318196, "grad_norm": 0.5895116925239563, "learning_rate": 9.204207195656298e-06, "loss": 0.036298394203186035, "memory(GiB)": 21.32, "step": 6864, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.22301270181593738, "grad_norm": 0.5763790011405945, "learning_rate": 9.203916420443635e-06, "loss": 0.034619830548763275, "memory(GiB)": 21.32, "step": 6865, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.2230451872786928, "grad_norm": 0.5284741520881653, "learning_rate": 9.20362559671218e-06, "loss": 0.04942904785275459, "memory(GiB)": 21.32, "step": 6866, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.2230776727414482, "grad_norm": 0.5467756390571594, "learning_rate": 9.203334724465291e-06, "loss": 0.044107258319854736, "memory(GiB)": 21.32, "step": 6867, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955439 }, { "epoch": 0.22311015820420363, "grad_norm": 0.44048380851745605, "learning_rate": 9.203043803706323e-06, "loss": 0.02566370740532875, "memory(GiB)": 21.32, "step": 6868, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955468 }, { "epoch": 0.22314264366695905, "grad_norm": 0.601611316204071, "learning_rate": 9.20275283443864e-06, "loss": 0.039609573781490326, "memory(GiB)": 21.32, "step": 6869, "token_acc": 0.975, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.22317512912971446, "grad_norm": 0.486135333776474, "learning_rate": 9.202461816665592e-06, "loss": 0.03930822014808655, "memory(GiB)": 21.32, "step": 6870, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.22320761459246988, "grad_norm": 0.5585822463035583, "learning_rate": 9.202170750390543e-06, "loss": 0.037224095314741135, "memory(GiB)": 21.32, "step": 6871, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955559 }, { "epoch": 0.2232401000552253, "grad_norm": 0.7634139657020569, "learning_rate": 9.201879635616849e-06, "loss": 0.042107611894607544, "memory(GiB)": 21.32, "step": 6872, "token_acc": 1.0, "train_speed(iter/s)": 0.955584 }, { "epoch": 0.2232725855179807, "grad_norm": 0.5184874534606934, "learning_rate": 9.201588472347871e-06, "loss": 0.04313591122627258, "memory(GiB)": 21.32, "step": 6873, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.22330507098073613, "grad_norm": 0.42864763736724854, "learning_rate": 9.20129726058697e-06, "loss": 0.040192268788814545, "memory(GiB)": 21.32, "step": 6874, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955632 }, { "epoch": 0.22333755644349154, "grad_norm": 0.4898657500743866, "learning_rate": 9.201006000337507e-06, "loss": 0.0355503112077713, "memory(GiB)": 21.32, "step": 6875, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.22337004190624696, "grad_norm": 0.5694822072982788, "learning_rate": 9.200714691602843e-06, "loss": 0.037556614726781845, "memory(GiB)": 21.32, "step": 6876, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.95568 }, { "epoch": 0.22340252736900237, "grad_norm": 0.5313537120819092, "learning_rate": 9.20042333438634e-06, "loss": 0.034109219908714294, "memory(GiB)": 21.32, "step": 6877, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.2234350128317578, "grad_norm": 1.3479493856430054, "learning_rate": 9.200131928691362e-06, "loss": 0.05436377972364426, "memory(GiB)": 21.32, "step": 6878, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.955727 }, { "epoch": 0.2234674982945132, "grad_norm": 0.492507666349411, "learning_rate": 9.199840474521271e-06, "loss": 0.043617717921733856, "memory(GiB)": 21.32, "step": 6879, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.22349998375726862, "grad_norm": 3.1237802505493164, "learning_rate": 9.19954897187943e-06, "loss": 0.04353690892457962, "memory(GiB)": 21.32, "step": 6880, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.22353246922002404, "grad_norm": 0.6612919569015503, "learning_rate": 9.199257420769206e-06, "loss": 0.04632916301488876, "memory(GiB)": 21.32, "step": 6881, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.22356495468277945, "grad_norm": 0.5824491381645203, "learning_rate": 9.198965821193959e-06, "loss": 0.0518399216234684, "memory(GiB)": 21.32, "step": 6882, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.22359744014553487, "grad_norm": 0.41035953164100647, "learning_rate": 9.198674173157061e-06, "loss": 0.03926451876759529, "memory(GiB)": 21.32, "step": 6883, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.2236299256082903, "grad_norm": 0.7407870888710022, "learning_rate": 9.198382476661873e-06, "loss": 0.04543831944465637, "memory(GiB)": 21.32, "step": 6884, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.955863 }, { "epoch": 0.2236624110710457, "grad_norm": 1.835480809211731, "learning_rate": 9.198090731711765e-06, "loss": 0.04838986694812775, "memory(GiB)": 21.32, "step": 6885, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.22369489653380112, "grad_norm": 0.7896131873130798, "learning_rate": 9.197798938310098e-06, "loss": 0.042767345905303955, "memory(GiB)": 21.32, "step": 6886, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.22372738199655653, "grad_norm": 0.6678773760795593, "learning_rate": 9.197507096460248e-06, "loss": 0.036196257919073105, "memory(GiB)": 21.32, "step": 6887, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.22375986745931195, "grad_norm": 0.5569577813148499, "learning_rate": 9.197215206165578e-06, "loss": 0.042026497423648834, "memory(GiB)": 21.32, "step": 6888, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955958 }, { "epoch": 0.22379235292206737, "grad_norm": 0.38718631863594055, "learning_rate": 9.196923267429458e-06, "loss": 0.03767158463597298, "memory(GiB)": 21.32, "step": 6889, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.22382483838482278, "grad_norm": 0.9567207098007202, "learning_rate": 9.196631280255258e-06, "loss": 0.04749409854412079, "memory(GiB)": 21.32, "step": 6890, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.956004 }, { "epoch": 0.2238573238475782, "grad_norm": 2.88162899017334, "learning_rate": 9.196339244646346e-06, "loss": 0.044775668531656265, "memory(GiB)": 21.32, "step": 6891, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.22388980931033362, "grad_norm": 0.4554292559623718, "learning_rate": 9.196047160606095e-06, "loss": 0.030749602243304253, "memory(GiB)": 21.32, "step": 6892, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.22392229477308903, "grad_norm": 0.5557021498680115, "learning_rate": 9.195755028137873e-06, "loss": 0.040019333362579346, "memory(GiB)": 21.32, "step": 6893, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.22395478023584445, "grad_norm": 0.5328534245491028, "learning_rate": 9.195462847245056e-06, "loss": 0.037316929548978806, "memory(GiB)": 21.32, "step": 6894, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.22398726569859986, "grad_norm": 0.4633011817932129, "learning_rate": 9.195170617931012e-06, "loss": 0.04062773659825325, "memory(GiB)": 21.32, "step": 6895, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.2240197511613553, "grad_norm": 0.31390905380249023, "learning_rate": 9.194878340199117e-06, "loss": 0.03187011554837227, "memory(GiB)": 21.32, "step": 6896, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.22405223662411072, "grad_norm": 0.5665116906166077, "learning_rate": 9.19458601405274e-06, "loss": 0.040633849799633026, "memory(GiB)": 21.32, "step": 6897, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.22408472208686614, "grad_norm": 0.34802043437957764, "learning_rate": 9.194293639495259e-06, "loss": 0.029706820845603943, "memory(GiB)": 21.32, "step": 6898, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956198 }, { "epoch": 0.22411720754962156, "grad_norm": 0.4680689573287964, "learning_rate": 9.194001216530048e-06, "loss": 0.042591363191604614, "memory(GiB)": 21.32, "step": 6899, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.22414969301237697, "grad_norm": 0.3955630362033844, "learning_rate": 9.19370874516048e-06, "loss": 0.04718029126524925, "memory(GiB)": 21.32, "step": 6900, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.2241821784751324, "grad_norm": 0.4686935245990753, "learning_rate": 9.193416225389929e-06, "loss": 0.041081540286540985, "memory(GiB)": 21.32, "step": 6901, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.2242146639378878, "grad_norm": 0.6194155216217041, "learning_rate": 9.193123657221776e-06, "loss": 0.04169292747974396, "memory(GiB)": 21.32, "step": 6902, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.22424714940064322, "grad_norm": 0.6867320537567139, "learning_rate": 9.192831040659393e-06, "loss": 0.04041260480880737, "memory(GiB)": 21.32, "step": 6903, "token_acc": 1.0, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.22427963486339864, "grad_norm": 0.5196608304977417, "learning_rate": 9.19253837570616e-06, "loss": 0.0358499139547348, "memory(GiB)": 21.32, "step": 6904, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956355 }, { "epoch": 0.22431212032615405, "grad_norm": 0.5866206884384155, "learning_rate": 9.192245662365455e-06, "loss": 0.050448376685380936, "memory(GiB)": 21.32, "step": 6905, "token_acc": 0.9817518248175182, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.22434460578890947, "grad_norm": 0.7093198895454407, "learning_rate": 9.191952900640653e-06, "loss": 0.041538409888744354, "memory(GiB)": 21.32, "step": 6906, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.22437709125166488, "grad_norm": 0.8750446438789368, "learning_rate": 9.191660090535138e-06, "loss": 0.04191358759999275, "memory(GiB)": 21.32, "step": 6907, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.2244095767144203, "grad_norm": 0.4986878037452698, "learning_rate": 9.191367232052283e-06, "loss": 0.03208521008491516, "memory(GiB)": 21.32, "step": 6908, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.956443 }, { "epoch": 0.22444206217717572, "grad_norm": 0.5041106343269348, "learning_rate": 9.191074325195472e-06, "loss": 0.03670162707567215, "memory(GiB)": 21.32, "step": 6909, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.22447454763993113, "grad_norm": 0.7265029549598694, "learning_rate": 9.190781369968087e-06, "loss": 0.04657146707177162, "memory(GiB)": 21.32, "step": 6910, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.956487 }, { "epoch": 0.22450703310268655, "grad_norm": 0.6658052802085876, "learning_rate": 9.190488366373507e-06, "loss": 0.04711936041712761, "memory(GiB)": 21.32, "step": 6911, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956507 }, { "epoch": 0.22453951856544196, "grad_norm": 0.6884859204292297, "learning_rate": 9.190195314415113e-06, "loss": 0.04349025338888168, "memory(GiB)": 21.32, "step": 6912, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.22457200402819738, "grad_norm": 0.631253719329834, "learning_rate": 9.189902214096287e-06, "loss": 0.036271411925554276, "memory(GiB)": 21.32, "step": 6913, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.2246044894909528, "grad_norm": 0.6286633610725403, "learning_rate": 9.189609065420414e-06, "loss": 0.04381762444972992, "memory(GiB)": 21.32, "step": 6914, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.2246369749537082, "grad_norm": 2.3368241786956787, "learning_rate": 9.189315868390876e-06, "loss": 0.033331602811813354, "memory(GiB)": 21.32, "step": 6915, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.22466946041646363, "grad_norm": 0.6417152285575867, "learning_rate": 9.189022623011056e-06, "loss": 0.04090191796422005, "memory(GiB)": 21.32, "step": 6916, "token_acc": 0.968609865470852, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.22470194587921904, "grad_norm": 0.7077170014381409, "learning_rate": 9.18872932928434e-06, "loss": 0.052774179726839066, "memory(GiB)": 21.32, "step": 6917, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956631 }, { "epoch": 0.22473443134197446, "grad_norm": 0.709394097328186, "learning_rate": 9.188435987214112e-06, "loss": 0.05086198449134827, "memory(GiB)": 21.32, "step": 6918, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.22476691680472988, "grad_norm": 0.567815363407135, "learning_rate": 9.188142596803757e-06, "loss": 0.03157541900873184, "memory(GiB)": 21.32, "step": 6919, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.2247994022674853, "grad_norm": 0.45599469542503357, "learning_rate": 9.187849158056662e-06, "loss": 0.03513094410300255, "memory(GiB)": 21.32, "step": 6920, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.9567 }, { "epoch": 0.2248318877302407, "grad_norm": 0.35743841528892517, "learning_rate": 9.187555670976216e-06, "loss": 0.03342919796705246, "memory(GiB)": 21.32, "step": 6921, "token_acc": 0.9820143884892086, "train_speed(iter/s)": 0.956721 }, { "epoch": 0.22486437319299613, "grad_norm": 0.511292576789856, "learning_rate": 9.187262135565802e-06, "loss": 0.03490546718239784, "memory(GiB)": 21.32, "step": 6922, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956745 }, { "epoch": 0.22489685865575154, "grad_norm": 0.4870375990867615, "learning_rate": 9.186968551828812e-06, "loss": 0.037394456565380096, "memory(GiB)": 21.32, "step": 6923, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956774 }, { "epoch": 0.22492934411850696, "grad_norm": 0.5020365715026855, "learning_rate": 9.18667491976863e-06, "loss": 0.04727259278297424, "memory(GiB)": 21.32, "step": 6924, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.956804 }, { "epoch": 0.22496182958126237, "grad_norm": 0.3523258566856384, "learning_rate": 9.186381239388648e-06, "loss": 0.0301528237760067, "memory(GiB)": 21.32, "step": 6925, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.2249943150440178, "grad_norm": 0.36199960112571716, "learning_rate": 9.186087510692254e-06, "loss": 0.03344167023897171, "memory(GiB)": 21.32, "step": 6926, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.2250268005067732, "grad_norm": 0.5034199357032776, "learning_rate": 9.18579373368284e-06, "loss": 0.04548514634370804, "memory(GiB)": 21.32, "step": 6927, "token_acc": 0.9819494584837545, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.22505928596952865, "grad_norm": 0.3965759873390198, "learning_rate": 9.18549990836379e-06, "loss": 0.03681791201233864, "memory(GiB)": 21.32, "step": 6928, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.22509177143228407, "grad_norm": 0.4792283773422241, "learning_rate": 9.185206034738505e-06, "loss": 0.03492628410458565, "memory(GiB)": 21.32, "step": 6929, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956949 }, { "epoch": 0.22512425689503948, "grad_norm": 0.3352391719818115, "learning_rate": 9.18491211281037e-06, "loss": 0.02897719293832779, "memory(GiB)": 21.32, "step": 6930, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.2251567423577949, "grad_norm": 0.9332466125488281, "learning_rate": 9.18461814258278e-06, "loss": 0.038273923099040985, "memory(GiB)": 21.32, "step": 6931, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.957006 }, { "epoch": 0.22518922782055031, "grad_norm": 0.33915144205093384, "learning_rate": 9.184324124059127e-06, "loss": 0.02429855987429619, "memory(GiB)": 21.32, "step": 6932, "token_acc": 1.0, "train_speed(iter/s)": 0.957034 }, { "epoch": 0.22522171328330573, "grad_norm": 0.7991592884063721, "learning_rate": 9.184030057242803e-06, "loss": 0.04970591515302658, "memory(GiB)": 21.32, "step": 6933, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.22525419874606115, "grad_norm": 0.34602853655815125, "learning_rate": 9.183735942137204e-06, "loss": 0.028883110731840134, "memory(GiB)": 21.32, "step": 6934, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.22528668420881656, "grad_norm": 0.6393256187438965, "learning_rate": 9.183441778745725e-06, "loss": 0.04278237000107765, "memory(GiB)": 21.32, "step": 6935, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.22531916967157198, "grad_norm": 0.4721168875694275, "learning_rate": 9.183147567071757e-06, "loss": 0.02664833888411522, "memory(GiB)": 21.32, "step": 6936, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.2253516551343274, "grad_norm": 0.5960224270820618, "learning_rate": 9.1828533071187e-06, "loss": 0.04644265025854111, "memory(GiB)": 21.32, "step": 6937, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.2253841405970828, "grad_norm": 0.6198883652687073, "learning_rate": 9.182558998889948e-06, "loss": 0.04184172302484512, "memory(GiB)": 21.32, "step": 6938, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.957177 }, { "epoch": 0.22541662605983823, "grad_norm": 0.5717049241065979, "learning_rate": 9.182264642388898e-06, "loss": 0.05879035219550133, "memory(GiB)": 21.32, "step": 6939, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.22544911152259364, "grad_norm": 0.6731811165809631, "learning_rate": 9.18197023761895e-06, "loss": 0.04509332776069641, "memory(GiB)": 21.32, "step": 6940, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.22548159698534906, "grad_norm": 0.5512905716896057, "learning_rate": 9.181675784583496e-06, "loss": 0.04434867203235626, "memory(GiB)": 21.32, "step": 6941, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.22551408244810447, "grad_norm": 0.472462922334671, "learning_rate": 9.181381283285939e-06, "loss": 0.03816154599189758, "memory(GiB)": 21.32, "step": 6942, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.2255465679108599, "grad_norm": 0.41011834144592285, "learning_rate": 9.181086733729676e-06, "loss": 0.03322605416178703, "memory(GiB)": 21.32, "step": 6943, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957289 }, { "epoch": 0.2255790533736153, "grad_norm": 0.6336339712142944, "learning_rate": 9.180792135918107e-06, "loss": 0.03632179647684097, "memory(GiB)": 21.32, "step": 6944, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.22561153883637072, "grad_norm": 0.6956018209457397, "learning_rate": 9.180497489854634e-06, "loss": 0.037752337753772736, "memory(GiB)": 21.32, "step": 6945, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.22564402429912614, "grad_norm": 0.451381117105484, "learning_rate": 9.180202795542652e-06, "loss": 0.031616538763046265, "memory(GiB)": 21.32, "step": 6946, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.22567650976188156, "grad_norm": 0.3563265800476074, "learning_rate": 9.179908052985569e-06, "loss": 0.030560176819562912, "memory(GiB)": 21.32, "step": 6947, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.22570899522463697, "grad_norm": 0.5392770171165466, "learning_rate": 9.179613262186784e-06, "loss": 0.0438074953854084, "memory(GiB)": 21.32, "step": 6948, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.2257414806873924, "grad_norm": 0.4722929298877716, "learning_rate": 9.179318423149696e-06, "loss": 0.035004641860723495, "memory(GiB)": 21.32, "step": 6949, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.2257739661501478, "grad_norm": 0.4903450310230255, "learning_rate": 9.179023535877714e-06, "loss": 0.04522532969713211, "memory(GiB)": 21.32, "step": 6950, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.22580645161290322, "grad_norm": 0.5374189019203186, "learning_rate": 9.178728600374236e-06, "loss": 0.04684049263596535, "memory(GiB)": 21.32, "step": 6951, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.22583893707565864, "grad_norm": 0.5258576273918152, "learning_rate": 9.178433616642668e-06, "loss": 0.04107808321714401, "memory(GiB)": 21.32, "step": 6952, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.22587142253841405, "grad_norm": 0.44064196944236755, "learning_rate": 9.178138584686416e-06, "loss": 0.03784569725394249, "memory(GiB)": 21.32, "step": 6953, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.957508 }, { "epoch": 0.22590390800116947, "grad_norm": 0.7008800506591797, "learning_rate": 9.177843504508883e-06, "loss": 0.03712789714336395, "memory(GiB)": 21.32, "step": 6954, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.22593639346392488, "grad_norm": 0.5464470386505127, "learning_rate": 9.177548376113474e-06, "loss": 0.04230280965566635, "memory(GiB)": 21.32, "step": 6955, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.957553 }, { "epoch": 0.2259688789266803, "grad_norm": 0.577021062374115, "learning_rate": 9.177253199503597e-06, "loss": 0.03667284920811653, "memory(GiB)": 21.32, "step": 6956, "token_acc": 0.9694656488549618, "train_speed(iter/s)": 0.957576 }, { "epoch": 0.22600136438943572, "grad_norm": 0.5270503759384155, "learning_rate": 9.17695797468266e-06, "loss": 0.041243646293878555, "memory(GiB)": 21.32, "step": 6957, "token_acc": 0.9786476868327402, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.22603384985219113, "grad_norm": 0.42139047384262085, "learning_rate": 9.176662701654066e-06, "loss": 0.03773435205221176, "memory(GiB)": 21.32, "step": 6958, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.22606633531494655, "grad_norm": 0.6406558156013489, "learning_rate": 9.176367380421227e-06, "loss": 0.03441927954554558, "memory(GiB)": 21.32, "step": 6959, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.226098820777702, "grad_norm": 0.4433267414569855, "learning_rate": 9.176072010987549e-06, "loss": 0.03518642485141754, "memory(GiB)": 21.32, "step": 6960, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.2261313062404574, "grad_norm": 0.9855953454971313, "learning_rate": 9.175776593356443e-06, "loss": 0.04540719464421272, "memory(GiB)": 21.32, "step": 6961, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.22616379170321282, "grad_norm": 0.5424494743347168, "learning_rate": 9.175481127531315e-06, "loss": 0.038388557732105255, "memory(GiB)": 21.32, "step": 6962, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.22619627716596824, "grad_norm": 0.4483894407749176, "learning_rate": 9.175185613515578e-06, "loss": 0.03560171276330948, "memory(GiB)": 21.32, "step": 6963, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957761 }, { "epoch": 0.22622876262872366, "grad_norm": 0.4918900430202484, "learning_rate": 9.174890051312643e-06, "loss": 0.03713798522949219, "memory(GiB)": 21.32, "step": 6964, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.957785 }, { "epoch": 0.22626124809147907, "grad_norm": 0.41336217522621155, "learning_rate": 9.174594440925918e-06, "loss": 0.03649488463997841, "memory(GiB)": 21.32, "step": 6965, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.2262937335542345, "grad_norm": 0.41801658272743225, "learning_rate": 9.17429878235882e-06, "loss": 0.03866163641214371, "memory(GiB)": 21.32, "step": 6966, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957831 }, { "epoch": 0.2263262190169899, "grad_norm": 0.4608531594276428, "learning_rate": 9.174003075614755e-06, "loss": 0.03349129855632782, "memory(GiB)": 21.32, "step": 6967, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.22635870447974532, "grad_norm": 0.6875308156013489, "learning_rate": 9.173707320697139e-06, "loss": 0.03788329288363457, "memory(GiB)": 21.32, "step": 6968, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957878 }, { "epoch": 0.22639118994250074, "grad_norm": 0.6873708963394165, "learning_rate": 9.173411517609387e-06, "loss": 0.04063349962234497, "memory(GiB)": 21.32, "step": 6969, "token_acc": 0.9690265486725663, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.22642367540525615, "grad_norm": 0.5633804798126221, "learning_rate": 9.17311566635491e-06, "loss": 0.04060422256588936, "memory(GiB)": 21.32, "step": 6970, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.95792 }, { "epoch": 0.22645616086801157, "grad_norm": 0.7307780385017395, "learning_rate": 9.172819766937124e-06, "loss": 0.039370350539684296, "memory(GiB)": 21.32, "step": 6971, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.95794 }, { "epoch": 0.22648864633076699, "grad_norm": 1.6831344366073608, "learning_rate": 9.172523819359444e-06, "loss": 0.04248280078172684, "memory(GiB)": 21.32, "step": 6972, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.2265211317935224, "grad_norm": 0.6130633354187012, "learning_rate": 9.172227823625286e-06, "loss": 0.035346999764442444, "memory(GiB)": 21.32, "step": 6973, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.22655361725627782, "grad_norm": 0.4999081790447235, "learning_rate": 9.171931779738066e-06, "loss": 0.03742162138223648, "memory(GiB)": 21.32, "step": 6974, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.958005 }, { "epoch": 0.22658610271903323, "grad_norm": 0.34919124841690063, "learning_rate": 9.1716356877012e-06, "loss": 0.02539275959134102, "memory(GiB)": 21.32, "step": 6975, "token_acc": 1.0, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.22661858818178865, "grad_norm": 0.6435897946357727, "learning_rate": 9.171339547518106e-06, "loss": 0.03389505296945572, "memory(GiB)": 21.32, "step": 6976, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.22665107364454407, "grad_norm": 0.7658809423446655, "learning_rate": 9.1710433591922e-06, "loss": 0.039120931178331375, "memory(GiB)": 21.32, "step": 6977, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.22668355910729948, "grad_norm": 0.5066235065460205, "learning_rate": 9.170747122726904e-06, "loss": 0.03432661294937134, "memory(GiB)": 21.32, "step": 6978, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.958095 }, { "epoch": 0.2267160445700549, "grad_norm": 0.5897167921066284, "learning_rate": 9.170450838125633e-06, "loss": 0.03664588928222656, "memory(GiB)": 21.32, "step": 6979, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958118 }, { "epoch": 0.2267485300328103, "grad_norm": 0.5356449484825134, "learning_rate": 9.17015450539181e-06, "loss": 0.03531093895435333, "memory(GiB)": 21.32, "step": 6980, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.958139 }, { "epoch": 0.22678101549556573, "grad_norm": 0.6874107718467712, "learning_rate": 9.169858124528853e-06, "loss": 0.03264841437339783, "memory(GiB)": 21.32, "step": 6981, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.95816 }, { "epoch": 0.22681350095832115, "grad_norm": 0.6848145127296448, "learning_rate": 9.169561695540184e-06, "loss": 0.04417666047811508, "memory(GiB)": 21.32, "step": 6982, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.22684598642107656, "grad_norm": 0.4513273239135742, "learning_rate": 9.169265218429222e-06, "loss": 0.03162394464015961, "memory(GiB)": 21.32, "step": 6983, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.22687847188383198, "grad_norm": 0.520544707775116, "learning_rate": 9.168968693199391e-06, "loss": 0.03404892235994339, "memory(GiB)": 21.32, "step": 6984, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.2269109573465874, "grad_norm": 0.4965329170227051, "learning_rate": 9.168672119854111e-06, "loss": 0.036073602735996246, "memory(GiB)": 21.32, "step": 6985, "token_acc": 1.0, "train_speed(iter/s)": 0.958272 }, { "epoch": 0.2269434428093428, "grad_norm": 0.4940902292728424, "learning_rate": 9.168375498396809e-06, "loss": 0.03887259215116501, "memory(GiB)": 21.32, "step": 6986, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.9583 }, { "epoch": 0.22697592827209823, "grad_norm": 0.7072502374649048, "learning_rate": 9.168078828830905e-06, "loss": 0.046762023121118546, "memory(GiB)": 21.32, "step": 6987, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958325 }, { "epoch": 0.22700841373485364, "grad_norm": 0.5160490870475769, "learning_rate": 9.167782111159821e-06, "loss": 0.03588218614459038, "memory(GiB)": 21.32, "step": 6988, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.958353 }, { "epoch": 0.22704089919760906, "grad_norm": 0.8580368161201477, "learning_rate": 9.167485345386987e-06, "loss": 0.043255165219306946, "memory(GiB)": 21.32, "step": 6989, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958383 }, { "epoch": 0.22707338466036447, "grad_norm": 0.4964706599712372, "learning_rate": 9.167188531515822e-06, "loss": 0.03975875675678253, "memory(GiB)": 21.32, "step": 6990, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.958411 }, { "epoch": 0.2271058701231199, "grad_norm": 0.45453503727912903, "learning_rate": 9.166891669549757e-06, "loss": 0.030255382880568504, "memory(GiB)": 21.32, "step": 6991, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.958441 }, { "epoch": 0.22713835558587533, "grad_norm": 0.46711695194244385, "learning_rate": 9.166594759492217e-06, "loss": 0.03452892228960991, "memory(GiB)": 21.32, "step": 6992, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.22717084104863075, "grad_norm": 0.43647661805152893, "learning_rate": 9.166297801346627e-06, "loss": 0.03219118341803551, "memory(GiB)": 21.32, "step": 6993, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.958501 }, { "epoch": 0.22720332651138617, "grad_norm": 2.2785327434539795, "learning_rate": 9.166000795116414e-06, "loss": 0.045700911432504654, "memory(GiB)": 21.32, "step": 6994, "token_acc": 0.979933110367893, "train_speed(iter/s)": 0.958529 }, { "epoch": 0.22723581197414158, "grad_norm": 0.5772088170051575, "learning_rate": 9.165703740805008e-06, "loss": 0.04468526691198349, "memory(GiB)": 21.32, "step": 6995, "token_acc": 0.9808429118773946, "train_speed(iter/s)": 0.958557 }, { "epoch": 0.227268297436897, "grad_norm": 0.6042788028717041, "learning_rate": 9.165406638415834e-06, "loss": 0.04251500219106674, "memory(GiB)": 21.32, "step": 6996, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.95858 }, { "epoch": 0.22730078289965241, "grad_norm": 0.5999333262443542, "learning_rate": 9.165109487952326e-06, "loss": 0.04012277349829674, "memory(GiB)": 21.32, "step": 6997, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.958604 }, { "epoch": 0.22733326836240783, "grad_norm": 0.46547994017601013, "learning_rate": 9.16481228941791e-06, "loss": 0.03682614117860794, "memory(GiB)": 21.32, "step": 6998, "token_acc": 0.9900332225913622, "train_speed(iter/s)": 0.958628 }, { "epoch": 0.22736575382516325, "grad_norm": 0.42212456464767456, "learning_rate": 9.164515042816017e-06, "loss": 0.040024712681770325, "memory(GiB)": 21.32, "step": 6999, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95865 }, { "epoch": 0.22739823928791866, "grad_norm": 0.4894828796386719, "learning_rate": 9.164217748150076e-06, "loss": 0.04034148156642914, "memory(GiB)": 21.32, "step": 7000, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958673 }, { "epoch": 0.22739823928791866, "eval_loss": 0.03961784765124321, "eval_runtime": 80.0911, "eval_samples_per_second": 124.234, "eval_steps_per_second": 3.883, "eval_token_acc": 0.9847112749413524, "step": 7000 }, { "epoch": 0.22743072475067408, "grad_norm": 0.5163687467575073, "learning_rate": 9.163920405423521e-06, "loss": 0.045183487236499786, "memory(GiB)": 21.32, "step": 7001, "token_acc": 0.9846421287487852, "train_speed(iter/s)": 0.946948 }, { "epoch": 0.2274632102134295, "grad_norm": 0.9056230783462524, "learning_rate": 9.163623014639781e-06, "loss": 0.03315548226237297, "memory(GiB)": 21.32, "step": 7002, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.94697 }, { "epoch": 0.2274956956761849, "grad_norm": 0.5530545115470886, "learning_rate": 9.163325575802292e-06, "loss": 0.047434888780117035, "memory(GiB)": 21.32, "step": 7003, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.946991 }, { "epoch": 0.22752818113894033, "grad_norm": 0.4847688376903534, "learning_rate": 9.163028088914483e-06, "loss": 0.03641394525766373, "memory(GiB)": 21.32, "step": 7004, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.947011 }, { "epoch": 0.22756066660169574, "grad_norm": 0.4011990427970886, "learning_rate": 9.16273055397979e-06, "loss": 0.028038207441568375, "memory(GiB)": 21.32, "step": 7005, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.947031 }, { "epoch": 0.22759315206445116, "grad_norm": 0.5166465044021606, "learning_rate": 9.162432971001645e-06, "loss": 0.043424706906080246, "memory(GiB)": 21.32, "step": 7006, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.947051 }, { "epoch": 0.22762563752720658, "grad_norm": 0.6089708805084229, "learning_rate": 9.162135339983483e-06, "loss": 0.03276839107275009, "memory(GiB)": 21.32, "step": 7007, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.947071 }, { "epoch": 0.227658122989962, "grad_norm": 0.6249209046363831, "learning_rate": 9.161837660928739e-06, "loss": 0.03848246484994888, "memory(GiB)": 21.32, "step": 7008, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.947093 }, { "epoch": 0.2276906084527174, "grad_norm": 1.3996295928955078, "learning_rate": 9.16153993384085e-06, "loss": 0.033058010041713715, "memory(GiB)": 21.32, "step": 7009, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.947114 }, { "epoch": 0.22772309391547282, "grad_norm": 0.5767090916633606, "learning_rate": 9.16124215872325e-06, "loss": 0.03845963627099991, "memory(GiB)": 21.32, "step": 7010, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.947138 }, { "epoch": 0.22775557937822824, "grad_norm": 0.4254307448863983, "learning_rate": 9.160944335579378e-06, "loss": 0.029919829219579697, "memory(GiB)": 21.32, "step": 7011, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.947163 }, { "epoch": 0.22778806484098366, "grad_norm": 0.6835799217224121, "learning_rate": 9.16064646441267e-06, "loss": 0.04252162203192711, "memory(GiB)": 21.32, "step": 7012, "token_acc": 0.99, "train_speed(iter/s)": 0.947189 }, { "epoch": 0.22782055030373907, "grad_norm": 0.5027192234992981, "learning_rate": 9.160348545226564e-06, "loss": 0.036049582064151764, "memory(GiB)": 21.32, "step": 7013, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.947213 }, { "epoch": 0.2278530357664945, "grad_norm": 0.5116775631904602, "learning_rate": 9.160050578024498e-06, "loss": 0.04072508215904236, "memory(GiB)": 21.32, "step": 7014, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.947239 }, { "epoch": 0.2278855212292499, "grad_norm": 0.5390783548355103, "learning_rate": 9.159752562809912e-06, "loss": 0.037907782942056656, "memory(GiB)": 21.32, "step": 7015, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.947263 }, { "epoch": 0.22791800669200532, "grad_norm": 0.4092303216457367, "learning_rate": 9.159454499586245e-06, "loss": 0.031200449913740158, "memory(GiB)": 21.32, "step": 7016, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.947288 }, { "epoch": 0.22795049215476074, "grad_norm": 0.3873143494129181, "learning_rate": 9.159156388356936e-06, "loss": 0.03357153385877609, "memory(GiB)": 21.32, "step": 7017, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.947312 }, { "epoch": 0.22798297761751615, "grad_norm": 0.46391329169273376, "learning_rate": 9.158858229125424e-06, "loss": 0.0412706695497036, "memory(GiB)": 21.32, "step": 7018, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.947337 }, { "epoch": 0.22801546308027157, "grad_norm": 0.38915637135505676, "learning_rate": 9.158560021895156e-06, "loss": 0.02784241922199726, "memory(GiB)": 21.32, "step": 7019, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.947361 }, { "epoch": 0.22804794854302698, "grad_norm": 0.6199886798858643, "learning_rate": 9.15826176666957e-06, "loss": 0.03970126807689667, "memory(GiB)": 21.32, "step": 7020, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.947387 }, { "epoch": 0.2280804340057824, "grad_norm": 0.48279869556427, "learning_rate": 9.157963463452108e-06, "loss": 0.04068891704082489, "memory(GiB)": 21.32, "step": 7021, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.947411 }, { "epoch": 0.22811291946853782, "grad_norm": 0.5417327880859375, "learning_rate": 9.157665112246214e-06, "loss": 0.03424205631017685, "memory(GiB)": 21.32, "step": 7022, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.947437 }, { "epoch": 0.22814540493129323, "grad_norm": 0.4971976578235626, "learning_rate": 9.15736671305533e-06, "loss": 0.033781424164772034, "memory(GiB)": 21.32, "step": 7023, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.947462 }, { "epoch": 0.22817789039404868, "grad_norm": 1.2552363872528076, "learning_rate": 9.157068265882902e-06, "loss": 0.04697592556476593, "memory(GiB)": 21.32, "step": 7024, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.947485 }, { "epoch": 0.2282103758568041, "grad_norm": 0.6260533928871155, "learning_rate": 9.15676977073237e-06, "loss": 0.04642282426357269, "memory(GiB)": 21.32, "step": 7025, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.947504 }, { "epoch": 0.2282428613195595, "grad_norm": 0.5403842926025391, "learning_rate": 9.156471227607186e-06, "loss": 0.03578932583332062, "memory(GiB)": 21.32, "step": 7026, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.947529 }, { "epoch": 0.22827534678231493, "grad_norm": 0.4634336531162262, "learning_rate": 9.15617263651079e-06, "loss": 0.03477047011256218, "memory(GiB)": 21.32, "step": 7027, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.947555 }, { "epoch": 0.22830783224507034, "grad_norm": 0.484976589679718, "learning_rate": 9.15587399744663e-06, "loss": 0.048651162534952164, "memory(GiB)": 21.32, "step": 7028, "token_acc": 0.9831649831649831, "train_speed(iter/s)": 0.94758 }, { "epoch": 0.22834031770782576, "grad_norm": 0.47614121437072754, "learning_rate": 9.155575310418152e-06, "loss": 0.03646627068519592, "memory(GiB)": 21.32, "step": 7029, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.947577 }, { "epoch": 0.22837280317058117, "grad_norm": 0.9100672602653503, "learning_rate": 9.155276575428805e-06, "loss": 0.03376457095146179, "memory(GiB)": 21.32, "step": 7030, "token_acc": 0.984, "train_speed(iter/s)": 0.947601 }, { "epoch": 0.2284052886333366, "grad_norm": 1.0167670249938965, "learning_rate": 9.154977792482035e-06, "loss": 0.04876694828271866, "memory(GiB)": 21.32, "step": 7031, "token_acc": 0.9516129032258065, "train_speed(iter/s)": 0.947631 }, { "epoch": 0.228437774096092, "grad_norm": 0.9825931787490845, "learning_rate": 9.15467896158129e-06, "loss": 0.047135550528764725, "memory(GiB)": 21.32, "step": 7032, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.947662 }, { "epoch": 0.22847025955884742, "grad_norm": 1.1899996995925903, "learning_rate": 9.154380082730022e-06, "loss": 0.04088267683982849, "memory(GiB)": 21.32, "step": 7033, "token_acc": 0.9748201438848921, "train_speed(iter/s)": 0.947692 }, { "epoch": 0.22850274502160284, "grad_norm": 0.5409799814224243, "learning_rate": 9.154081155931678e-06, "loss": 0.03444964811205864, "memory(GiB)": 21.32, "step": 7034, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.947722 }, { "epoch": 0.22853523048435825, "grad_norm": 0.43257734179496765, "learning_rate": 9.153782181189708e-06, "loss": 0.034495510160923004, "memory(GiB)": 21.32, "step": 7035, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.947752 }, { "epoch": 0.22856771594711367, "grad_norm": 0.7156378626823425, "learning_rate": 9.153483158507562e-06, "loss": 0.0304725244641304, "memory(GiB)": 21.32, "step": 7036, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.947784 }, { "epoch": 0.22860020140986909, "grad_norm": 0.5200844407081604, "learning_rate": 9.153184087888693e-06, "loss": 0.03401419520378113, "memory(GiB)": 21.32, "step": 7037, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.947814 }, { "epoch": 0.2286326868726245, "grad_norm": 1.2201076745986938, "learning_rate": 9.152884969336553e-06, "loss": 0.0444224514067173, "memory(GiB)": 21.32, "step": 7038, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.947842 }, { "epoch": 0.22866517233537992, "grad_norm": 0.5551149249076843, "learning_rate": 9.15258580285459e-06, "loss": 0.041663069278001785, "memory(GiB)": 21.32, "step": 7039, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.947871 }, { "epoch": 0.22869765779813533, "grad_norm": 0.5728740096092224, "learning_rate": 9.152286588446262e-06, "loss": 0.03592895716428757, "memory(GiB)": 21.32, "step": 7040, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.947902 }, { "epoch": 0.22873014326089075, "grad_norm": 0.6140359044075012, "learning_rate": 9.15198732611502e-06, "loss": 0.03187697380781174, "memory(GiB)": 21.32, "step": 7041, "token_acc": 0.9941176470588236, "train_speed(iter/s)": 0.947931 }, { "epoch": 0.22876262872364617, "grad_norm": 0.5656295418739319, "learning_rate": 9.151688015864317e-06, "loss": 0.03807736933231354, "memory(GiB)": 21.32, "step": 7042, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.947961 }, { "epoch": 0.22879511418640158, "grad_norm": 0.5546374917030334, "learning_rate": 9.151388657697608e-06, "loss": 0.045222602784633636, "memory(GiB)": 21.32, "step": 7043, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.947991 }, { "epoch": 0.228827599649157, "grad_norm": 0.5279116630554199, "learning_rate": 9.15108925161835e-06, "loss": 0.03734905272722244, "memory(GiB)": 21.32, "step": 7044, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.94802 }, { "epoch": 0.22886008511191241, "grad_norm": 0.6734402775764465, "learning_rate": 9.150789797629997e-06, "loss": 0.0452573336660862, "memory(GiB)": 21.32, "step": 7045, "token_acc": 0.98828125, "train_speed(iter/s)": 0.948049 }, { "epoch": 0.22889257057466783, "grad_norm": 0.5702298283576965, "learning_rate": 9.150490295736003e-06, "loss": 0.04029650241136551, "memory(GiB)": 21.32, "step": 7046, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.948078 }, { "epoch": 0.22892505603742325, "grad_norm": 0.6026278734207153, "learning_rate": 9.15019074593983e-06, "loss": 0.03642527014017105, "memory(GiB)": 21.32, "step": 7047, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.948107 }, { "epoch": 0.22895754150017866, "grad_norm": 0.4595799744129181, "learning_rate": 9.149891148244929e-06, "loss": 0.0366918221116066, "memory(GiB)": 21.32, "step": 7048, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.948129 }, { "epoch": 0.22899002696293408, "grad_norm": 0.3858742117881775, "learning_rate": 9.14959150265476e-06, "loss": 0.03451262786984444, "memory(GiB)": 21.32, "step": 7049, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.948153 }, { "epoch": 0.2290225124256895, "grad_norm": 0.4844995141029358, "learning_rate": 9.149291809172785e-06, "loss": 0.03840378299355507, "memory(GiB)": 21.32, "step": 7050, "token_acc": 0.975, "train_speed(iter/s)": 0.948177 }, { "epoch": 0.2290549978884449, "grad_norm": 0.46244585514068604, "learning_rate": 9.148992067802458e-06, "loss": 0.03680945932865143, "memory(GiB)": 21.32, "step": 7051, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.948201 }, { "epoch": 0.22908748335120033, "grad_norm": 0.5283300280570984, "learning_rate": 9.14869227854724e-06, "loss": 0.041708823293447495, "memory(GiB)": 21.32, "step": 7052, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.948224 }, { "epoch": 0.22911996881395574, "grad_norm": 0.38437458872795105, "learning_rate": 9.148392441410592e-06, "loss": 0.0308489128947258, "memory(GiB)": 21.32, "step": 7053, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.948245 }, { "epoch": 0.22915245427671116, "grad_norm": 0.5215349197387695, "learning_rate": 9.148092556395973e-06, "loss": 0.04873678833246231, "memory(GiB)": 21.32, "step": 7054, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.948268 }, { "epoch": 0.22918493973946658, "grad_norm": 0.44767749309539795, "learning_rate": 9.147792623506847e-06, "loss": 0.0382612869143486, "memory(GiB)": 21.32, "step": 7055, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.948289 }, { "epoch": 0.22921742520222202, "grad_norm": 0.49119848012924194, "learning_rate": 9.14749264274667e-06, "loss": 0.03577465936541557, "memory(GiB)": 21.32, "step": 7056, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.94831 }, { "epoch": 0.22924991066497744, "grad_norm": 3.177542209625244, "learning_rate": 9.14719261411891e-06, "loss": 0.04178278148174286, "memory(GiB)": 21.32, "step": 7057, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.948331 }, { "epoch": 0.22928239612773285, "grad_norm": 0.43501412868499756, "learning_rate": 9.146892537627026e-06, "loss": 0.03336001932621002, "memory(GiB)": 21.32, "step": 7058, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.948316 }, { "epoch": 0.22931488159048827, "grad_norm": 0.7988733649253845, "learning_rate": 9.146592413274485e-06, "loss": 0.044803962111473083, "memory(GiB)": 21.32, "step": 7059, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.948338 }, { "epoch": 0.22934736705324368, "grad_norm": 0.8140734434127808, "learning_rate": 9.146292241064746e-06, "loss": 0.043711405247449875, "memory(GiB)": 21.32, "step": 7060, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.94836 }, { "epoch": 0.2293798525159991, "grad_norm": 0.623444676399231, "learning_rate": 9.145992021001276e-06, "loss": 0.04131854325532913, "memory(GiB)": 21.32, "step": 7061, "token_acc": 0.9635627530364372, "train_speed(iter/s)": 0.948383 }, { "epoch": 0.22941233797875452, "grad_norm": 0.39057615399360657, "learning_rate": 9.14569175308754e-06, "loss": 0.04049261286854744, "memory(GiB)": 21.32, "step": 7062, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.948404 }, { "epoch": 0.22944482344150993, "grad_norm": 0.678617000579834, "learning_rate": 9.145391437327005e-06, "loss": 0.04961913079023361, "memory(GiB)": 21.32, "step": 7063, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.948428 }, { "epoch": 0.22947730890426535, "grad_norm": 0.5890525579452515, "learning_rate": 9.145091073723134e-06, "loss": 0.04480438679456711, "memory(GiB)": 21.32, "step": 7064, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.94845 }, { "epoch": 0.22950979436702076, "grad_norm": 0.5809884667396545, "learning_rate": 9.144790662279393e-06, "loss": 0.04214543476700783, "memory(GiB)": 21.32, "step": 7065, "token_acc": 0.9517684887459807, "train_speed(iter/s)": 0.94847 }, { "epoch": 0.22954227982977618, "grad_norm": 0.4654248058795929, "learning_rate": 9.144490202999255e-06, "loss": 0.039124295115470886, "memory(GiB)": 21.32, "step": 7066, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.948491 }, { "epoch": 0.2295747652925316, "grad_norm": 0.6827663779258728, "learning_rate": 9.14418969588618e-06, "loss": 0.04677964746952057, "memory(GiB)": 21.32, "step": 7067, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.948516 }, { "epoch": 0.229607250755287, "grad_norm": 0.4694039523601532, "learning_rate": 9.143889140943642e-06, "loss": 0.04020967334508896, "memory(GiB)": 21.32, "step": 7068, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.948544 }, { "epoch": 0.22963973621804243, "grad_norm": 0.45899707078933716, "learning_rate": 9.143588538175108e-06, "loss": 0.03683793172240257, "memory(GiB)": 21.32, "step": 7069, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.948569 }, { "epoch": 0.22967222168079784, "grad_norm": 2.896718740463257, "learning_rate": 9.143287887584045e-06, "loss": 0.03984803706407547, "memory(GiB)": 21.32, "step": 7070, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.948592 }, { "epoch": 0.22970470714355326, "grad_norm": 0.4479086101055145, "learning_rate": 9.142987189173928e-06, "loss": 0.035926688462495804, "memory(GiB)": 21.32, "step": 7071, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.948615 }, { "epoch": 0.22973719260630868, "grad_norm": 0.5009449124336243, "learning_rate": 9.142686442948222e-06, "loss": 0.04162946715950966, "memory(GiB)": 21.32, "step": 7072, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.948638 }, { "epoch": 0.2297696780690641, "grad_norm": 0.6593407988548279, "learning_rate": 9.142385648910402e-06, "loss": 0.04464191943407059, "memory(GiB)": 21.32, "step": 7073, "token_acc": 0.972, "train_speed(iter/s)": 0.948661 }, { "epoch": 0.2298021635318195, "grad_norm": 1.2290773391723633, "learning_rate": 9.142084807063937e-06, "loss": 0.046320974826812744, "memory(GiB)": 21.32, "step": 7074, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.948683 }, { "epoch": 0.22983464899457492, "grad_norm": 0.383524090051651, "learning_rate": 9.141783917412301e-06, "loss": 0.026881087571382523, "memory(GiB)": 21.32, "step": 7075, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.948705 }, { "epoch": 0.22986713445733034, "grad_norm": 0.39205047488212585, "learning_rate": 9.141482979958967e-06, "loss": 0.03722062706947327, "memory(GiB)": 21.32, "step": 7076, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.948729 }, { "epoch": 0.22989961992008576, "grad_norm": 0.46367132663726807, "learning_rate": 9.141181994707407e-06, "loss": 0.04744809865951538, "memory(GiB)": 21.32, "step": 7077, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.94875 }, { "epoch": 0.22993210538284117, "grad_norm": 0.5598581433296204, "learning_rate": 9.140880961661094e-06, "loss": 0.03972676768898964, "memory(GiB)": 21.32, "step": 7078, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.948771 }, { "epoch": 0.2299645908455966, "grad_norm": 0.3888275921344757, "learning_rate": 9.140579880823503e-06, "loss": 0.03354397788643837, "memory(GiB)": 21.32, "step": 7079, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.948791 }, { "epoch": 0.229997076308352, "grad_norm": 0.9087249040603638, "learning_rate": 9.140278752198108e-06, "loss": 0.04517305642366409, "memory(GiB)": 21.32, "step": 7080, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.948813 }, { "epoch": 0.23002956177110742, "grad_norm": 0.5143201947212219, "learning_rate": 9.139977575788387e-06, "loss": 0.03284387290477753, "memory(GiB)": 21.32, "step": 7081, "token_acc": 0.9921875, "train_speed(iter/s)": 0.948838 }, { "epoch": 0.23006204723386284, "grad_norm": 0.3035351037979126, "learning_rate": 9.139676351597814e-06, "loss": 0.027662359178066254, "memory(GiB)": 21.32, "step": 7082, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.94886 }, { "epoch": 0.23009453269661825, "grad_norm": 0.7241073846817017, "learning_rate": 9.139375079629866e-06, "loss": 0.042373549193143845, "memory(GiB)": 21.32, "step": 7083, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.948886 }, { "epoch": 0.23012701815937367, "grad_norm": 0.4325505793094635, "learning_rate": 9.139073759888022e-06, "loss": 0.03587011247873306, "memory(GiB)": 21.32, "step": 7084, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.948908 }, { "epoch": 0.23015950362212909, "grad_norm": 0.836174488067627, "learning_rate": 9.138772392375756e-06, "loss": 0.05490492284297943, "memory(GiB)": 21.32, "step": 7085, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.948932 }, { "epoch": 0.2301919890848845, "grad_norm": 0.48315736651420593, "learning_rate": 9.138470977096547e-06, "loss": 0.031787194311618805, "memory(GiB)": 21.32, "step": 7086, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.948957 }, { "epoch": 0.23022447454763992, "grad_norm": 0.4327971935272217, "learning_rate": 9.138169514053876e-06, "loss": 0.03622069209814072, "memory(GiB)": 21.32, "step": 7087, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.948977 }, { "epoch": 0.23025696001039536, "grad_norm": 0.5884600877761841, "learning_rate": 9.137868003251221e-06, "loss": 0.03311580419540405, "memory(GiB)": 21.32, "step": 7088, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.948999 }, { "epoch": 0.23028944547315078, "grad_norm": 0.57098388671875, "learning_rate": 9.137566444692061e-06, "loss": 0.04547576606273651, "memory(GiB)": 21.32, "step": 7089, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.949021 }, { "epoch": 0.2303219309359062, "grad_norm": 0.6454753875732422, "learning_rate": 9.137264838379877e-06, "loss": 0.03260529041290283, "memory(GiB)": 21.32, "step": 7090, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.949042 }, { "epoch": 0.2303544163986616, "grad_norm": 0.5315489768981934, "learning_rate": 9.13696318431815e-06, "loss": 0.038463130593299866, "memory(GiB)": 21.32, "step": 7091, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.949067 }, { "epoch": 0.23038690186141703, "grad_norm": 0.5846010446548462, "learning_rate": 9.136661482510364e-06, "loss": 0.03443744406104088, "memory(GiB)": 21.32, "step": 7092, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.949091 }, { "epoch": 0.23041938732417244, "grad_norm": 0.6013936400413513, "learning_rate": 9.136359732959995e-06, "loss": 0.04100131243467331, "memory(GiB)": 21.32, "step": 7093, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.949118 }, { "epoch": 0.23045187278692786, "grad_norm": 0.48163777589797974, "learning_rate": 9.13605793567053e-06, "loss": 0.02678028866648674, "memory(GiB)": 21.32, "step": 7094, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.949145 }, { "epoch": 0.23048435824968327, "grad_norm": 0.8312177062034607, "learning_rate": 9.135756090645454e-06, "loss": 0.04484787583351135, "memory(GiB)": 21.32, "step": 7095, "token_acc": 0.9940119760479041, "train_speed(iter/s)": 0.949174 }, { "epoch": 0.2305168437124387, "grad_norm": 0.5791243314743042, "learning_rate": 9.135454197888245e-06, "loss": 0.037812478840351105, "memory(GiB)": 21.32, "step": 7096, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.949203 }, { "epoch": 0.2305493291751941, "grad_norm": 0.542739987373352, "learning_rate": 9.135152257402392e-06, "loss": 0.03233031928539276, "memory(GiB)": 21.32, "step": 7097, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.949232 }, { "epoch": 0.23058181463794952, "grad_norm": 0.5805573463439941, "learning_rate": 9.134850269191378e-06, "loss": 0.036393433809280396, "memory(GiB)": 21.32, "step": 7098, "token_acc": 0.9681978798586572, "train_speed(iter/s)": 0.949262 }, { "epoch": 0.23061430010070494, "grad_norm": 0.5469306707382202, "learning_rate": 9.134548233258686e-06, "loss": 0.032633621245622635, "memory(GiB)": 21.32, "step": 7099, "token_acc": 0.995, "train_speed(iter/s)": 0.949292 }, { "epoch": 0.23064678556346035, "grad_norm": 0.5728910565376282, "learning_rate": 9.134246149607806e-06, "loss": 0.040523119270801544, "memory(GiB)": 21.32, "step": 7100, "token_acc": 1.0, "train_speed(iter/s)": 0.94932 }, { "epoch": 0.23067927102621577, "grad_norm": 0.5354111194610596, "learning_rate": 9.133944018242224e-06, "loss": 0.03367943316698074, "memory(GiB)": 21.32, "step": 7101, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.949349 }, { "epoch": 0.2307117564889712, "grad_norm": 0.49171555042266846, "learning_rate": 9.133641839165426e-06, "loss": 0.04536845162510872, "memory(GiB)": 21.32, "step": 7102, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.949377 }, { "epoch": 0.2307442419517266, "grad_norm": 0.7344944477081299, "learning_rate": 9.133339612380897e-06, "loss": 0.03616436943411827, "memory(GiB)": 21.32, "step": 7103, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.949407 }, { "epoch": 0.23077672741448202, "grad_norm": 0.5295646786689758, "learning_rate": 9.133037337892129e-06, "loss": 0.03693992272019386, "memory(GiB)": 21.32, "step": 7104, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.949435 }, { "epoch": 0.23080921287723744, "grad_norm": 1.0758864879608154, "learning_rate": 9.132735015702609e-06, "loss": 0.03697582334280014, "memory(GiB)": 21.32, "step": 7105, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.949466 }, { "epoch": 0.23084169833999285, "grad_norm": 0.461326539516449, "learning_rate": 9.132432645815825e-06, "loss": 0.03518225997686386, "memory(GiB)": 21.32, "step": 7106, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.949493 }, { "epoch": 0.23087418380274827, "grad_norm": 0.4983028471469879, "learning_rate": 9.13213022823527e-06, "loss": 0.04103229567408562, "memory(GiB)": 21.32, "step": 7107, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.949513 }, { "epoch": 0.23090666926550368, "grad_norm": 0.39707091450691223, "learning_rate": 9.131827762964431e-06, "loss": 0.02739466167986393, "memory(GiB)": 21.32, "step": 7108, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.949536 }, { "epoch": 0.2309391547282591, "grad_norm": 0.5498733520507812, "learning_rate": 9.1315252500068e-06, "loss": 0.03449085354804993, "memory(GiB)": 21.32, "step": 7109, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.949558 }, { "epoch": 0.23097164019101452, "grad_norm": 0.4341870844364166, "learning_rate": 9.131222689365869e-06, "loss": 0.0382668636739254, "memory(GiB)": 21.32, "step": 7110, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.949581 }, { "epoch": 0.23100412565376993, "grad_norm": 0.5509061217308044, "learning_rate": 9.13092008104513e-06, "loss": 0.03839326649904251, "memory(GiB)": 21.32, "step": 7111, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.949602 }, { "epoch": 0.23103661111652535, "grad_norm": 0.5970234274864197, "learning_rate": 9.130617425048074e-06, "loss": 0.041051045060157776, "memory(GiB)": 21.32, "step": 7112, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.949626 }, { "epoch": 0.23106909657928076, "grad_norm": 0.37757542729377747, "learning_rate": 9.130314721378194e-06, "loss": 0.030387382954359055, "memory(GiB)": 21.32, "step": 7113, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.949646 }, { "epoch": 0.23110158204203618, "grad_norm": 0.7040896415710449, "learning_rate": 9.130011970038988e-06, "loss": 0.04455794021487236, "memory(GiB)": 21.32, "step": 7114, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.949668 }, { "epoch": 0.2311340675047916, "grad_norm": 0.7484796047210693, "learning_rate": 9.129709171033944e-06, "loss": 0.04354612156748772, "memory(GiB)": 21.32, "step": 7115, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.949689 }, { "epoch": 0.231166552967547, "grad_norm": 0.46721720695495605, "learning_rate": 9.12940632436656e-06, "loss": 0.03326908126473427, "memory(GiB)": 21.32, "step": 7116, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.949712 }, { "epoch": 0.23119903843030243, "grad_norm": 0.4786185622215271, "learning_rate": 9.129103430040332e-06, "loss": 0.04142250120639801, "memory(GiB)": 21.32, "step": 7117, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.949734 }, { "epoch": 0.23123152389305784, "grad_norm": 0.5094469785690308, "learning_rate": 9.128800488058753e-06, "loss": 0.02701202966272831, "memory(GiB)": 21.32, "step": 7118, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.949754 }, { "epoch": 0.23126400935581326, "grad_norm": 0.6480616927146912, "learning_rate": 9.128497498425322e-06, "loss": 0.03633841872215271, "memory(GiB)": 21.32, "step": 7119, "token_acc": 0.988, "train_speed(iter/s)": 0.949776 }, { "epoch": 0.2312964948185687, "grad_norm": 0.6806803941726685, "learning_rate": 9.128194461143536e-06, "loss": 0.03533143922686577, "memory(GiB)": 21.32, "step": 7120, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.9498 }, { "epoch": 0.23132898028132412, "grad_norm": 0.3838440775871277, "learning_rate": 9.12789137621689e-06, "loss": 0.03507266938686371, "memory(GiB)": 21.32, "step": 7121, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.949823 }, { "epoch": 0.23136146574407954, "grad_norm": 0.4910142719745636, "learning_rate": 9.127588243648883e-06, "loss": 0.03447438031435013, "memory(GiB)": 21.32, "step": 7122, "token_acc": 0.9707602339181286, "train_speed(iter/s)": 0.949847 }, { "epoch": 0.23139395120683495, "grad_norm": 0.7196939587593079, "learning_rate": 9.127285063443014e-06, "loss": 0.04181459918618202, "memory(GiB)": 21.32, "step": 7123, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.949868 }, { "epoch": 0.23142643666959037, "grad_norm": 0.8604758977890015, "learning_rate": 9.126981835602782e-06, "loss": 0.042878977954387665, "memory(GiB)": 21.32, "step": 7124, "token_acc": 0.98046875, "train_speed(iter/s)": 0.94989 }, { "epoch": 0.23145892213234578, "grad_norm": 0.5941364765167236, "learning_rate": 9.126678560131687e-06, "loss": 0.04012257605791092, "memory(GiB)": 21.32, "step": 7125, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.949913 }, { "epoch": 0.2314914075951012, "grad_norm": 0.5499411821365356, "learning_rate": 9.126375237033229e-06, "loss": 0.030709898099303246, "memory(GiB)": 21.32, "step": 7126, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.949938 }, { "epoch": 0.23152389305785662, "grad_norm": 0.5644434094429016, "learning_rate": 9.126071866310908e-06, "loss": 0.0336451530456543, "memory(GiB)": 21.32, "step": 7127, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.949967 }, { "epoch": 0.23155637852061203, "grad_norm": 0.8341624736785889, "learning_rate": 9.125768447968227e-06, "loss": 0.03713611513376236, "memory(GiB)": 21.32, "step": 7128, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.949997 }, { "epoch": 0.23158886398336745, "grad_norm": 0.6347327828407288, "learning_rate": 9.125464982008685e-06, "loss": 0.04128853231668472, "memory(GiB)": 21.32, "step": 7129, "token_acc": 0.9875, "train_speed(iter/s)": 0.950025 }, { "epoch": 0.23162134944612287, "grad_norm": 0.5988008379936218, "learning_rate": 9.125161468435786e-06, "loss": 0.03274869918823242, "memory(GiB)": 21.32, "step": 7130, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.950055 }, { "epoch": 0.23165383490887828, "grad_norm": 0.5315479040145874, "learning_rate": 9.124857907253035e-06, "loss": 0.04336002469062805, "memory(GiB)": 21.32, "step": 7131, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.950084 }, { "epoch": 0.2316863203716337, "grad_norm": 0.5208790302276611, "learning_rate": 9.124554298463933e-06, "loss": 0.03433370217680931, "memory(GiB)": 21.32, "step": 7132, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.950106 }, { "epoch": 0.2317188058343891, "grad_norm": 0.6360105276107788, "learning_rate": 9.124250642071984e-06, "loss": 0.040622204542160034, "memory(GiB)": 21.32, "step": 7133, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.95013 }, { "epoch": 0.23175129129714453, "grad_norm": 0.484977662563324, "learning_rate": 9.123946938080692e-06, "loss": 0.03620384633541107, "memory(GiB)": 21.32, "step": 7134, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950152 }, { "epoch": 0.23178377675989995, "grad_norm": 0.6440344452857971, "learning_rate": 9.123643186493566e-06, "loss": 0.04909069091081619, "memory(GiB)": 21.32, "step": 7135, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.950173 }, { "epoch": 0.23181626222265536, "grad_norm": 0.8982989192008972, "learning_rate": 9.123339387314107e-06, "loss": 0.04590729624032974, "memory(GiB)": 21.32, "step": 7136, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.950194 }, { "epoch": 0.23184874768541078, "grad_norm": 0.4650346040725708, "learning_rate": 9.123035540545825e-06, "loss": 0.03708384931087494, "memory(GiB)": 21.32, "step": 7137, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.950216 }, { "epoch": 0.2318812331481662, "grad_norm": 0.4917827546596527, "learning_rate": 9.122731646192225e-06, "loss": 0.042041219770908356, "memory(GiB)": 21.32, "step": 7138, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.950238 }, { "epoch": 0.2319137186109216, "grad_norm": 0.757453203201294, "learning_rate": 9.122427704256813e-06, "loss": 0.04231410473585129, "memory(GiB)": 21.32, "step": 7139, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.95026 }, { "epoch": 0.23194620407367703, "grad_norm": 1.133581280708313, "learning_rate": 9.1221237147431e-06, "loss": 0.051142510026693344, "memory(GiB)": 21.32, "step": 7140, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.950281 }, { "epoch": 0.23197868953643244, "grad_norm": 0.4462287425994873, "learning_rate": 9.121819677654591e-06, "loss": 0.04422340914607048, "memory(GiB)": 21.32, "step": 7141, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.950303 }, { "epoch": 0.23201117499918786, "grad_norm": 0.397773802280426, "learning_rate": 9.121515592994798e-06, "loss": 0.03397867828607559, "memory(GiB)": 21.32, "step": 7142, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.950322 }, { "epoch": 0.23204366046194327, "grad_norm": 0.45336654782295227, "learning_rate": 9.12121146076723e-06, "loss": 0.04119991138577461, "memory(GiB)": 21.32, "step": 7143, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.950343 }, { "epoch": 0.2320761459246987, "grad_norm": 0.5157869458198547, "learning_rate": 9.120907280975394e-06, "loss": 0.037671901285648346, "memory(GiB)": 21.32, "step": 7144, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.950363 }, { "epoch": 0.2321086313874541, "grad_norm": 0.644956648349762, "learning_rate": 9.120603053622805e-06, "loss": 0.030096733942627907, "memory(GiB)": 21.32, "step": 7145, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.950385 }, { "epoch": 0.23214111685020952, "grad_norm": 0.4185122847557068, "learning_rate": 9.120298778712972e-06, "loss": 0.037489019334316254, "memory(GiB)": 21.32, "step": 7146, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.950407 }, { "epoch": 0.23217360231296494, "grad_norm": 0.34696224331855774, "learning_rate": 9.119994456249406e-06, "loss": 0.028965456411242485, "memory(GiB)": 21.32, "step": 7147, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.950427 }, { "epoch": 0.23220608777572035, "grad_norm": 0.38206997513771057, "learning_rate": 9.119690086235622e-06, "loss": 0.028536852449178696, "memory(GiB)": 21.32, "step": 7148, "token_acc": 1.0, "train_speed(iter/s)": 0.95045 }, { "epoch": 0.23223857323847577, "grad_norm": 0.47453755140304565, "learning_rate": 9.119385668675129e-06, "loss": 0.029253972694277763, "memory(GiB)": 21.32, "step": 7149, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.950473 }, { "epoch": 0.2322710587012312, "grad_norm": 0.469509094953537, "learning_rate": 9.119081203571445e-06, "loss": 0.03864697366952896, "memory(GiB)": 21.32, "step": 7150, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.950495 }, { "epoch": 0.2323035441639866, "grad_norm": 0.5179435014724731, "learning_rate": 9.11877669092808e-06, "loss": 0.03811177611351013, "memory(GiB)": 21.32, "step": 7151, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.950515 }, { "epoch": 0.23233602962674205, "grad_norm": 0.7455605268478394, "learning_rate": 9.118472130748549e-06, "loss": 0.04871811717748642, "memory(GiB)": 21.32, "step": 7152, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.950536 }, { "epoch": 0.23236851508949746, "grad_norm": 0.524642825126648, "learning_rate": 9.118167523036368e-06, "loss": 0.03521953895688057, "memory(GiB)": 21.32, "step": 7153, "token_acc": 1.0, "train_speed(iter/s)": 0.950558 }, { "epoch": 0.23240100055225288, "grad_norm": 0.5291127562522888, "learning_rate": 9.117862867795054e-06, "loss": 0.04266189783811569, "memory(GiB)": 21.32, "step": 7154, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.950579 }, { "epoch": 0.2324334860150083, "grad_norm": 0.48765069246292114, "learning_rate": 9.117558165028123e-06, "loss": 0.03948844224214554, "memory(GiB)": 21.32, "step": 7155, "token_acc": 0.9660377358490566, "train_speed(iter/s)": 0.950602 }, { "epoch": 0.2324659714777637, "grad_norm": 0.3941475450992584, "learning_rate": 9.117253414739086e-06, "loss": 0.03210015594959259, "memory(GiB)": 21.32, "step": 7156, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.950629 }, { "epoch": 0.23249845694051913, "grad_norm": 0.7775498628616333, "learning_rate": 9.116948616931469e-06, "loss": 0.04810217767953873, "memory(GiB)": 21.32, "step": 7157, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.950657 }, { "epoch": 0.23253094240327454, "grad_norm": 0.4730016589164734, "learning_rate": 9.116643771608783e-06, "loss": 0.041113514453172684, "memory(GiB)": 21.32, "step": 7158, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.950684 }, { "epoch": 0.23256342786602996, "grad_norm": 0.4927939772605896, "learning_rate": 9.116338878774548e-06, "loss": 0.034480199217796326, "memory(GiB)": 21.32, "step": 7159, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.95071 }, { "epoch": 0.23259591332878538, "grad_norm": 0.3218753933906555, "learning_rate": 9.116033938432286e-06, "loss": 0.024665486067533493, "memory(GiB)": 21.32, "step": 7160, "token_acc": 0.9757785467128027, "train_speed(iter/s)": 0.950737 }, { "epoch": 0.2326283987915408, "grad_norm": 0.5337817072868347, "learning_rate": 9.11572895058551e-06, "loss": 0.031825482845306396, "memory(GiB)": 21.32, "step": 7161, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.950765 }, { "epoch": 0.2326608842542962, "grad_norm": 0.6462386846542358, "learning_rate": 9.115423915237747e-06, "loss": 0.046294569969177246, "memory(GiB)": 21.32, "step": 7162, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.950793 }, { "epoch": 0.23269336971705162, "grad_norm": 0.5487269163131714, "learning_rate": 9.115118832392514e-06, "loss": 0.030404355376958847, "memory(GiB)": 21.32, "step": 7163, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.95082 }, { "epoch": 0.23272585517980704, "grad_norm": 0.4815357029438019, "learning_rate": 9.114813702053331e-06, "loss": 0.037582360208034515, "memory(GiB)": 21.32, "step": 7164, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.950847 }, { "epoch": 0.23275834064256246, "grad_norm": 0.6645282506942749, "learning_rate": 9.114508524223721e-06, "loss": 0.037770628929138184, "memory(GiB)": 21.32, "step": 7165, "token_acc": 1.0, "train_speed(iter/s)": 0.950873 }, { "epoch": 0.23279082610531787, "grad_norm": 1.1882493495941162, "learning_rate": 9.114203298907209e-06, "loss": 0.04301106184720993, "memory(GiB)": 21.32, "step": 7166, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.950897 }, { "epoch": 0.2328233115680733, "grad_norm": 0.4714065194129944, "learning_rate": 9.113898026107313e-06, "loss": 0.028158122673630714, "memory(GiB)": 21.32, "step": 7167, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95092 }, { "epoch": 0.2328557970308287, "grad_norm": 0.4477180540561676, "learning_rate": 9.11359270582756e-06, "loss": 0.031888529658317566, "memory(GiB)": 21.32, "step": 7168, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.950943 }, { "epoch": 0.23288828249358412, "grad_norm": 0.5898262858390808, "learning_rate": 9.113287338071468e-06, "loss": 0.0445876382291317, "memory(GiB)": 21.32, "step": 7169, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.950964 }, { "epoch": 0.23292076795633954, "grad_norm": 0.5444899201393127, "learning_rate": 9.11298192284257e-06, "loss": 0.038936011493206024, "memory(GiB)": 21.32, "step": 7170, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.950987 }, { "epoch": 0.23295325341909495, "grad_norm": 0.6827804446220398, "learning_rate": 9.112676460144383e-06, "loss": 0.05502273514866829, "memory(GiB)": 21.32, "step": 7171, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.951008 }, { "epoch": 0.23298573888185037, "grad_norm": 0.4455261826515198, "learning_rate": 9.112370949980436e-06, "loss": 0.03907207399606705, "memory(GiB)": 21.32, "step": 7172, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.95103 }, { "epoch": 0.23301822434460578, "grad_norm": 0.660855770111084, "learning_rate": 9.112065392354256e-06, "loss": 0.03753085434436798, "memory(GiB)": 21.32, "step": 7173, "token_acc": 1.0, "train_speed(iter/s)": 0.951052 }, { "epoch": 0.2330507098073612, "grad_norm": 0.5760266184806824, "learning_rate": 9.111759787269368e-06, "loss": 0.044622667133808136, "memory(GiB)": 21.32, "step": 7174, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.951072 }, { "epoch": 0.23308319527011662, "grad_norm": 0.5129549503326416, "learning_rate": 9.111454134729298e-06, "loss": 0.04365687072277069, "memory(GiB)": 21.32, "step": 7175, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.951091 }, { "epoch": 0.23311568073287203, "grad_norm": 0.7174344062805176, "learning_rate": 9.111148434737576e-06, "loss": 0.04279577359557152, "memory(GiB)": 21.32, "step": 7176, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.95111 }, { "epoch": 0.23314816619562745, "grad_norm": 0.4572655260562897, "learning_rate": 9.11084268729773e-06, "loss": 0.03434842824935913, "memory(GiB)": 21.32, "step": 7177, "token_acc": 1.0, "train_speed(iter/s)": 0.95113 }, { "epoch": 0.23318065165838286, "grad_norm": 0.4938437342643738, "learning_rate": 9.110536892413287e-06, "loss": 0.05075133591890335, "memory(GiB)": 21.32, "step": 7178, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.951152 }, { "epoch": 0.23321313712113828, "grad_norm": 0.47749030590057373, "learning_rate": 9.110231050087777e-06, "loss": 0.030942125245928764, "memory(GiB)": 21.32, "step": 7179, "token_acc": 0.9940119760479041, "train_speed(iter/s)": 0.95117 }, { "epoch": 0.2332456225838937, "grad_norm": 0.8515362739562988, "learning_rate": 9.10992516032473e-06, "loss": 0.04048867151141167, "memory(GiB)": 21.32, "step": 7180, "token_acc": 0.9875, "train_speed(iter/s)": 0.951192 }, { "epoch": 0.2332781080466491, "grad_norm": 0.5180659890174866, "learning_rate": 9.109619223127677e-06, "loss": 0.049595389515161514, "memory(GiB)": 21.32, "step": 7181, "token_acc": 0.9734848484848485, "train_speed(iter/s)": 0.951214 }, { "epoch": 0.23331059350940453, "grad_norm": 0.734250545501709, "learning_rate": 9.109313238500148e-06, "loss": 0.03236981853842735, "memory(GiB)": 21.32, "step": 7182, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.951234 }, { "epoch": 0.23334307897215995, "grad_norm": 0.5745910406112671, "learning_rate": 9.109007206445673e-06, "loss": 0.04191562533378601, "memory(GiB)": 21.32, "step": 7183, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.951257 }, { "epoch": 0.2333755644349154, "grad_norm": 0.6346140503883362, "learning_rate": 9.108701126967787e-06, "loss": 0.033467311412096024, "memory(GiB)": 21.32, "step": 7184, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.951278 }, { "epoch": 0.2334080498976708, "grad_norm": 0.8426249027252197, "learning_rate": 9.108395000070023e-06, "loss": 0.04789052903652191, "memory(GiB)": 21.32, "step": 7185, "token_acc": 0.99, "train_speed(iter/s)": 0.951304 }, { "epoch": 0.23344053536042622, "grad_norm": 0.47530001401901245, "learning_rate": 9.10808882575591e-06, "loss": 0.03721282631158829, "memory(GiB)": 21.32, "step": 7186, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951332 }, { "epoch": 0.23347302082318164, "grad_norm": 0.49885955452919006, "learning_rate": 9.107782604028985e-06, "loss": 0.03329065069556236, "memory(GiB)": 21.32, "step": 7187, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.951359 }, { "epoch": 0.23350550628593705, "grad_norm": 0.4559330642223358, "learning_rate": 9.107476334892782e-06, "loss": 0.033370018005371094, "memory(GiB)": 21.32, "step": 7188, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.951386 }, { "epoch": 0.23353799174869247, "grad_norm": 0.4291388690471649, "learning_rate": 9.107170018350834e-06, "loss": 0.03834948688745499, "memory(GiB)": 21.32, "step": 7189, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.951416 }, { "epoch": 0.23357047721144789, "grad_norm": 0.5721938610076904, "learning_rate": 9.106863654406679e-06, "loss": 0.038821764290332794, "memory(GiB)": 21.32, "step": 7190, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.951445 }, { "epoch": 0.2336029626742033, "grad_norm": 0.580089271068573, "learning_rate": 9.106557243063849e-06, "loss": 0.05632661283016205, "memory(GiB)": 21.32, "step": 7191, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.951471 }, { "epoch": 0.23363544813695872, "grad_norm": 0.6249419450759888, "learning_rate": 9.106250784325883e-06, "loss": 0.039937637746334076, "memory(GiB)": 21.32, "step": 7192, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.9515 }, { "epoch": 0.23366793359971413, "grad_norm": 0.53696209192276, "learning_rate": 9.105944278196319e-06, "loss": 0.04127590358257294, "memory(GiB)": 21.32, "step": 7193, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.951528 }, { "epoch": 0.23370041906246955, "grad_norm": 1.4604703187942505, "learning_rate": 9.105637724678692e-06, "loss": 0.04192505031824112, "memory(GiB)": 21.32, "step": 7194, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.951551 }, { "epoch": 0.23373290452522497, "grad_norm": 0.512512743473053, "learning_rate": 9.105331123776542e-06, "loss": 0.0354803204536438, "memory(GiB)": 21.32, "step": 7195, "token_acc": 0.988, "train_speed(iter/s)": 0.951573 }, { "epoch": 0.23376538998798038, "grad_norm": 0.908085286617279, "learning_rate": 9.105024475493404e-06, "loss": 0.044089242815971375, "memory(GiB)": 21.32, "step": 7196, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.951594 }, { "epoch": 0.2337978754507358, "grad_norm": 0.5683832764625549, "learning_rate": 9.104717779832824e-06, "loss": 0.025497552007436752, "memory(GiB)": 21.32, "step": 7197, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.951615 }, { "epoch": 0.23383036091349121, "grad_norm": 4.7831010818481445, "learning_rate": 9.104411036798332e-06, "loss": 0.04110482707619667, "memory(GiB)": 21.32, "step": 7198, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.951637 }, { "epoch": 0.23386284637624663, "grad_norm": 0.4611546993255615, "learning_rate": 9.104104246393479e-06, "loss": 0.03225898742675781, "memory(GiB)": 21.32, "step": 7199, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.951659 }, { "epoch": 0.23389533183900205, "grad_norm": 0.37431538105010986, "learning_rate": 9.103797408621797e-06, "loss": 0.029212864115834236, "memory(GiB)": 21.32, "step": 7200, "token_acc": 1.0, "train_speed(iter/s)": 0.951681 }, { "epoch": 0.23392781730175746, "grad_norm": 0.49410271644592285, "learning_rate": 9.103490523486834e-06, "loss": 0.03957463800907135, "memory(GiB)": 21.32, "step": 7201, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.951704 }, { "epoch": 0.23396030276451288, "grad_norm": 0.9546404480934143, "learning_rate": 9.103183590992126e-06, "loss": 0.0531223863363266, "memory(GiB)": 21.32, "step": 7202, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.951726 }, { "epoch": 0.2339927882272683, "grad_norm": 0.4537399411201477, "learning_rate": 9.102876611141219e-06, "loss": 0.03868904709815979, "memory(GiB)": 21.32, "step": 7203, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.951748 }, { "epoch": 0.2340252736900237, "grad_norm": 0.6078075170516968, "learning_rate": 9.102569583937653e-06, "loss": 0.042402103543281555, "memory(GiB)": 21.32, "step": 7204, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.951768 }, { "epoch": 0.23405775915277913, "grad_norm": 1.4762511253356934, "learning_rate": 9.102262509384976e-06, "loss": 0.036656737327575684, "memory(GiB)": 21.32, "step": 7205, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.95179 }, { "epoch": 0.23409024461553454, "grad_norm": 1.1796437501907349, "learning_rate": 9.101955387486728e-06, "loss": 0.058706752955913544, "memory(GiB)": 21.32, "step": 7206, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.951811 }, { "epoch": 0.23412273007828996, "grad_norm": 0.3777524530887604, "learning_rate": 9.101648218246456e-06, "loss": 0.03323006629943848, "memory(GiB)": 21.32, "step": 7207, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.95183 }, { "epoch": 0.23415521554104538, "grad_norm": 0.3423188626766205, "learning_rate": 9.101341001667705e-06, "loss": 0.03472687304019928, "memory(GiB)": 21.32, "step": 7208, "token_acc": 0.9867109634551495, "train_speed(iter/s)": 0.951852 }, { "epoch": 0.2341877010038008, "grad_norm": 0.3931427597999573, "learning_rate": 9.10103373775402e-06, "loss": 0.03893280029296875, "memory(GiB)": 21.32, "step": 7209, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.951875 }, { "epoch": 0.2342201864665562, "grad_norm": 0.32291722297668457, "learning_rate": 9.100726426508945e-06, "loss": 0.02685103937983513, "memory(GiB)": 21.32, "step": 7210, "token_acc": 0.984375, "train_speed(iter/s)": 0.951897 }, { "epoch": 0.23425267192931162, "grad_norm": 0.5199882388114929, "learning_rate": 9.10041906793603e-06, "loss": 0.041517093777656555, "memory(GiB)": 21.32, "step": 7211, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.951917 }, { "epoch": 0.23428515739206704, "grad_norm": 0.3613044321537018, "learning_rate": 9.100111662038822e-06, "loss": 0.0363335907459259, "memory(GiB)": 21.32, "step": 7212, "token_acc": 0.984, "train_speed(iter/s)": 0.951937 }, { "epoch": 0.23431764285482246, "grad_norm": 0.41023629903793335, "learning_rate": 9.099804208820867e-06, "loss": 0.03399953618645668, "memory(GiB)": 21.32, "step": 7213, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951956 }, { "epoch": 0.23435012831757787, "grad_norm": 0.5570408701896667, "learning_rate": 9.099496708285715e-06, "loss": 0.042782261967659, "memory(GiB)": 21.32, "step": 7214, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.951977 }, { "epoch": 0.2343826137803333, "grad_norm": 0.3591332733631134, "learning_rate": 9.099189160436915e-06, "loss": 0.031653016805648804, "memory(GiB)": 21.32, "step": 7215, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.951998 }, { "epoch": 0.23441509924308873, "grad_norm": 0.5209624171257019, "learning_rate": 9.098881565278017e-06, "loss": 0.042184773832559586, "memory(GiB)": 21.32, "step": 7216, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.95202 }, { "epoch": 0.23444758470584415, "grad_norm": 0.37479010224342346, "learning_rate": 9.098573922812568e-06, "loss": 0.041606683284044266, "memory(GiB)": 21.32, "step": 7217, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.952043 }, { "epoch": 0.23448007016859956, "grad_norm": 0.4612186849117279, "learning_rate": 9.098266233044122e-06, "loss": 0.03466347977519035, "memory(GiB)": 21.32, "step": 7218, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95207 }, { "epoch": 0.23451255563135498, "grad_norm": 0.6713724732398987, "learning_rate": 9.097958495976229e-06, "loss": 0.03918717801570892, "memory(GiB)": 21.32, "step": 7219, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952097 }, { "epoch": 0.2345450410941104, "grad_norm": 1.0319979190826416, "learning_rate": 9.097650711612441e-06, "loss": 0.047953762114048004, "memory(GiB)": 21.32, "step": 7220, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.952124 }, { "epoch": 0.2345775265568658, "grad_norm": 0.4796595871448517, "learning_rate": 9.09734287995631e-06, "loss": 0.03160944953560829, "memory(GiB)": 21.32, "step": 7221, "token_acc": 0.975, "train_speed(iter/s)": 0.952152 }, { "epoch": 0.23461001201962123, "grad_norm": 0.5646664500236511, "learning_rate": 9.097035001011388e-06, "loss": 0.04842224717140198, "memory(GiB)": 21.32, "step": 7222, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.952177 }, { "epoch": 0.23464249748237664, "grad_norm": 0.47055742144584656, "learning_rate": 9.096727074781228e-06, "loss": 0.040490686893463135, "memory(GiB)": 21.32, "step": 7223, "token_acc": 0.98, "train_speed(iter/s)": 0.952205 }, { "epoch": 0.23467498294513206, "grad_norm": 0.8244391679763794, "learning_rate": 9.096419101269384e-06, "loss": 0.04635852947831154, "memory(GiB)": 21.32, "step": 7224, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.952232 }, { "epoch": 0.23470746840788748, "grad_norm": 0.6545242071151733, "learning_rate": 9.096111080479413e-06, "loss": 0.04446316882967949, "memory(GiB)": 21.32, "step": 7225, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.952252 }, { "epoch": 0.2347399538706429, "grad_norm": 0.4603092074394226, "learning_rate": 9.095803012414869e-06, "loss": 0.029842261224985123, "memory(GiB)": 21.32, "step": 7226, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.952274 }, { "epoch": 0.2347724393333983, "grad_norm": 0.5492895245552063, "learning_rate": 9.095494897079306e-06, "loss": 0.03572649136185646, "memory(GiB)": 21.32, "step": 7227, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.952295 }, { "epoch": 0.23480492479615372, "grad_norm": 0.7913336753845215, "learning_rate": 9.09518673447628e-06, "loss": 0.04658770188689232, "memory(GiB)": 21.32, "step": 7228, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.952315 }, { "epoch": 0.23483741025890914, "grad_norm": 0.8081439137458801, "learning_rate": 9.094878524609349e-06, "loss": 0.03312551602721214, "memory(GiB)": 21.32, "step": 7229, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.952332 }, { "epoch": 0.23486989572166456, "grad_norm": 0.2536362111568451, "learning_rate": 9.09457026748207e-06, "loss": 0.021372858434915543, "memory(GiB)": 21.32, "step": 7230, "token_acc": 1.0, "train_speed(iter/s)": 0.952355 }, { "epoch": 0.23490238118441997, "grad_norm": 0.6159199476242065, "learning_rate": 9.094261963098e-06, "loss": 0.03210341930389404, "memory(GiB)": 21.32, "step": 7231, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.952379 }, { "epoch": 0.2349348666471754, "grad_norm": 0.6019871234893799, "learning_rate": 9.093953611460697e-06, "loss": 0.04826163500547409, "memory(GiB)": 21.32, "step": 7232, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.952397 }, { "epoch": 0.2349673521099308, "grad_norm": 0.7028548717498779, "learning_rate": 9.093645212573721e-06, "loss": 0.040936291217803955, "memory(GiB)": 21.32, "step": 7233, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.952418 }, { "epoch": 0.23499983757268622, "grad_norm": 0.7344882488250732, "learning_rate": 9.093336766440629e-06, "loss": 0.04082939028739929, "memory(GiB)": 21.32, "step": 7234, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.95244 }, { "epoch": 0.23503232303544164, "grad_norm": 0.4160171151161194, "learning_rate": 9.093028273064983e-06, "loss": 0.028174132108688354, "memory(GiB)": 21.32, "step": 7235, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.952461 }, { "epoch": 0.23506480849819705, "grad_norm": 0.5275886058807373, "learning_rate": 9.092719732450344e-06, "loss": 0.042338769882917404, "memory(GiB)": 21.32, "step": 7236, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.952481 }, { "epoch": 0.23509729396095247, "grad_norm": 0.48818233609199524, "learning_rate": 9.092411144600272e-06, "loss": 0.03357742726802826, "memory(GiB)": 21.32, "step": 7237, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.952499 }, { "epoch": 0.23512977942370789, "grad_norm": 0.43121644854545593, "learning_rate": 9.092102509518329e-06, "loss": 0.03985046222805977, "memory(GiB)": 21.32, "step": 7238, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95252 }, { "epoch": 0.2351622648864633, "grad_norm": 0.4716542065143585, "learning_rate": 9.091793827208074e-06, "loss": 0.033560607582330704, "memory(GiB)": 21.32, "step": 7239, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.952541 }, { "epoch": 0.23519475034921872, "grad_norm": 0.5144716501235962, "learning_rate": 9.091485097673073e-06, "loss": 0.034730877727270126, "memory(GiB)": 21.32, "step": 7240, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.952564 }, { "epoch": 0.23522723581197413, "grad_norm": 0.5111605525016785, "learning_rate": 9.09117632091689e-06, "loss": 0.03957238048315048, "memory(GiB)": 21.32, "step": 7241, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.952586 }, { "epoch": 0.23525972127472955, "grad_norm": 0.5470714569091797, "learning_rate": 9.090867496943086e-06, "loss": 0.03276177495718002, "memory(GiB)": 21.32, "step": 7242, "token_acc": 0.988, "train_speed(iter/s)": 0.952608 }, { "epoch": 0.23529220673748497, "grad_norm": 0.5376079678535461, "learning_rate": 9.090558625755225e-06, "loss": 0.04339955747127533, "memory(GiB)": 21.32, "step": 7243, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95263 }, { "epoch": 0.23532469220024038, "grad_norm": 0.5427446365356445, "learning_rate": 9.090249707356873e-06, "loss": 0.046262092888355255, "memory(GiB)": 21.32, "step": 7244, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.952653 }, { "epoch": 0.2353571776629958, "grad_norm": 0.5131019353866577, "learning_rate": 9.089940741751595e-06, "loss": 0.032694123685359955, "memory(GiB)": 21.32, "step": 7245, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.95268 }, { "epoch": 0.23538966312575121, "grad_norm": 0.3352734446525574, "learning_rate": 9.08963172894296e-06, "loss": 0.03453615680336952, "memory(GiB)": 21.32, "step": 7246, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.952708 }, { "epoch": 0.23542214858850663, "grad_norm": 0.9609541296958923, "learning_rate": 9.089322668934529e-06, "loss": 0.03948146849870682, "memory(GiB)": 21.32, "step": 7247, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.952735 }, { "epoch": 0.23545463405126207, "grad_norm": 0.6596882939338684, "learning_rate": 9.089013561729871e-06, "loss": 0.03487232327461243, "memory(GiB)": 21.32, "step": 7248, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.952764 }, { "epoch": 0.2354871195140175, "grad_norm": 0.5393613576889038, "learning_rate": 9.088704407332556e-06, "loss": 0.03993745148181915, "memory(GiB)": 21.32, "step": 7249, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.952793 }, { "epoch": 0.2355196049767729, "grad_norm": 0.5511243343353271, "learning_rate": 9.08839520574615e-06, "loss": 0.04142669588327408, "memory(GiB)": 21.32, "step": 7250, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.952821 }, { "epoch": 0.23555209043952832, "grad_norm": 0.4039881229400635, "learning_rate": 9.088085956974221e-06, "loss": 0.03423980623483658, "memory(GiB)": 21.32, "step": 7251, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.952849 }, { "epoch": 0.23558457590228374, "grad_norm": 0.8933954238891602, "learning_rate": 9.087776661020339e-06, "loss": 0.0292104110121727, "memory(GiB)": 21.32, "step": 7252, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.952878 }, { "epoch": 0.23561706136503915, "grad_norm": 0.42010653018951416, "learning_rate": 9.087467317888072e-06, "loss": 0.03598037362098694, "memory(GiB)": 21.32, "step": 7253, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.952907 }, { "epoch": 0.23564954682779457, "grad_norm": 0.4826960861682892, "learning_rate": 9.087157927580994e-06, "loss": 0.033366963267326355, "memory(GiB)": 21.32, "step": 7254, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.952933 }, { "epoch": 0.23568203229055, "grad_norm": 0.4125293493270874, "learning_rate": 9.086848490102673e-06, "loss": 0.02993885800242424, "memory(GiB)": 21.32, "step": 7255, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95296 }, { "epoch": 0.2357145177533054, "grad_norm": 0.4424593448638916, "learning_rate": 9.08653900545668e-06, "loss": 0.03623676672577858, "memory(GiB)": 21.32, "step": 7256, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.952982 }, { "epoch": 0.23574700321606082, "grad_norm": 0.5077106952667236, "learning_rate": 9.086229473646588e-06, "loss": 0.038359686732292175, "memory(GiB)": 21.32, "step": 7257, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953005 }, { "epoch": 0.23577948867881623, "grad_norm": 0.6124493479728699, "learning_rate": 9.085919894675968e-06, "loss": 0.03928514942526817, "memory(GiB)": 21.32, "step": 7258, "token_acc": 0.9864406779661017, "train_speed(iter/s)": 0.953028 }, { "epoch": 0.23581197414157165, "grad_norm": 0.4868687391281128, "learning_rate": 9.085610268548397e-06, "loss": 0.03476835787296295, "memory(GiB)": 21.32, "step": 7259, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.95305 }, { "epoch": 0.23584445960432707, "grad_norm": 0.5019357800483704, "learning_rate": 9.085300595267443e-06, "loss": 0.03214512765407562, "memory(GiB)": 21.32, "step": 7260, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953073 }, { "epoch": 0.23587694506708248, "grad_norm": 0.3624890148639679, "learning_rate": 9.084990874836683e-06, "loss": 0.02877967245876789, "memory(GiB)": 21.32, "step": 7261, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.953096 }, { "epoch": 0.2359094305298379, "grad_norm": 0.46025779843330383, "learning_rate": 9.084681107259692e-06, "loss": 0.03264979273080826, "memory(GiB)": 21.32, "step": 7262, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.95312 }, { "epoch": 0.23594191599259332, "grad_norm": 0.3900289237499237, "learning_rate": 9.084371292540045e-06, "loss": 0.0359765999019146, "memory(GiB)": 21.32, "step": 7263, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.953142 }, { "epoch": 0.23597440145534873, "grad_norm": 0.9271882176399231, "learning_rate": 9.084061430681315e-06, "loss": 0.04410363361239433, "memory(GiB)": 21.32, "step": 7264, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.953163 }, { "epoch": 0.23600688691810415, "grad_norm": 0.5931317210197449, "learning_rate": 9.083751521687081e-06, "loss": 0.0445466972887516, "memory(GiB)": 21.32, "step": 7265, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.953183 }, { "epoch": 0.23603937238085956, "grad_norm": 0.4646367132663727, "learning_rate": 9.08344156556092e-06, "loss": 0.03894588351249695, "memory(GiB)": 21.32, "step": 7266, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.953202 }, { "epoch": 0.23607185784361498, "grad_norm": 0.44737470149993896, "learning_rate": 9.083131562306406e-06, "loss": 0.03370142728090286, "memory(GiB)": 21.32, "step": 7267, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.953222 }, { "epoch": 0.2361043433063704, "grad_norm": 0.4826795160770416, "learning_rate": 9.082821511927121e-06, "loss": 0.03496190160512924, "memory(GiB)": 21.32, "step": 7268, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953246 }, { "epoch": 0.2361368287691258, "grad_norm": 0.3616524636745453, "learning_rate": 9.082511414426641e-06, "loss": 0.031795524060726166, "memory(GiB)": 21.32, "step": 7269, "token_acc": 1.0, "train_speed(iter/s)": 0.953267 }, { "epoch": 0.23616931423188123, "grad_norm": 0.476382851600647, "learning_rate": 9.082201269808546e-06, "loss": 0.0347730815410614, "memory(GiB)": 21.32, "step": 7270, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.953291 }, { "epoch": 0.23620179969463664, "grad_norm": 0.5228596329689026, "learning_rate": 9.081891078076415e-06, "loss": 0.04265813156962395, "memory(GiB)": 21.32, "step": 7271, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953314 }, { "epoch": 0.23623428515739206, "grad_norm": 0.8999029994010925, "learning_rate": 9.081580839233827e-06, "loss": 0.04812239482998848, "memory(GiB)": 21.32, "step": 7272, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953338 }, { "epoch": 0.23626677062014748, "grad_norm": 0.5162258148193359, "learning_rate": 9.081270553284364e-06, "loss": 0.03209352865815163, "memory(GiB)": 21.32, "step": 7273, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.95336 }, { "epoch": 0.2362992560829029, "grad_norm": 0.48049449920654297, "learning_rate": 9.080960220231608e-06, "loss": 0.0306860264390707, "memory(GiB)": 21.32, "step": 7274, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.953378 }, { "epoch": 0.2363317415456583, "grad_norm": 0.45640838146209717, "learning_rate": 9.080649840079137e-06, "loss": 0.03275898098945618, "memory(GiB)": 21.32, "step": 7275, "token_acc": 0.9665071770334929, "train_speed(iter/s)": 0.953398 }, { "epoch": 0.23636422700841372, "grad_norm": 0.6162883639335632, "learning_rate": 9.080339412830536e-06, "loss": 0.04357244446873665, "memory(GiB)": 21.32, "step": 7276, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95342 }, { "epoch": 0.23639671247116914, "grad_norm": 0.443602055311203, "learning_rate": 9.080028938489388e-06, "loss": 0.03972318023443222, "memory(GiB)": 21.32, "step": 7277, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.953441 }, { "epoch": 0.23642919793392456, "grad_norm": 0.49721458554267883, "learning_rate": 9.079718417059275e-06, "loss": 0.03650146350264549, "memory(GiB)": 21.32, "step": 7278, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.953463 }, { "epoch": 0.23646168339667997, "grad_norm": 0.39022448658943176, "learning_rate": 9.079407848543782e-06, "loss": 0.029911870136857033, "memory(GiB)": 21.32, "step": 7279, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953489 }, { "epoch": 0.23649416885943542, "grad_norm": 0.7482038140296936, "learning_rate": 9.079097232946493e-06, "loss": 0.03385176509618759, "memory(GiB)": 21.32, "step": 7280, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953514 }, { "epoch": 0.23652665432219083, "grad_norm": 0.662733793258667, "learning_rate": 9.078786570270992e-06, "loss": 0.04379325360059738, "memory(GiB)": 21.32, "step": 7281, "token_acc": 0.99, "train_speed(iter/s)": 0.95354 }, { "epoch": 0.23655913978494625, "grad_norm": 0.4975734353065491, "learning_rate": 9.078475860520866e-06, "loss": 0.03586965799331665, "memory(GiB)": 21.32, "step": 7282, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.953566 }, { "epoch": 0.23659162524770166, "grad_norm": 0.6336108446121216, "learning_rate": 9.078165103699697e-06, "loss": 0.04392929747700691, "memory(GiB)": 21.32, "step": 7283, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953589 }, { "epoch": 0.23662411071045708, "grad_norm": 0.6844936609268188, "learning_rate": 9.077854299811077e-06, "loss": 0.04573193937540054, "memory(GiB)": 21.32, "step": 7284, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953609 }, { "epoch": 0.2366565961732125, "grad_norm": 0.5459175109863281, "learning_rate": 9.077543448858592e-06, "loss": 0.036936718970537186, "memory(GiB)": 21.32, "step": 7285, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.953627 }, { "epoch": 0.2366890816359679, "grad_norm": 0.40649065375328064, "learning_rate": 9.077232550845827e-06, "loss": 0.029824554920196533, "memory(GiB)": 21.32, "step": 7286, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953647 }, { "epoch": 0.23672156709872333, "grad_norm": 0.47123250365257263, "learning_rate": 9.076921605776372e-06, "loss": 0.03588281571865082, "memory(GiB)": 21.32, "step": 7287, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953669 }, { "epoch": 0.23675405256147874, "grad_norm": 0.3808956742286682, "learning_rate": 9.076610613653816e-06, "loss": 0.030976947396993637, "memory(GiB)": 21.32, "step": 7288, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.953688 }, { "epoch": 0.23678653802423416, "grad_norm": 0.6928884387016296, "learning_rate": 9.076299574481747e-06, "loss": 0.03955230861902237, "memory(GiB)": 21.32, "step": 7289, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95371 }, { "epoch": 0.23681902348698958, "grad_norm": 0.5671364068984985, "learning_rate": 9.075988488263753e-06, "loss": 0.037282463163137436, "memory(GiB)": 21.32, "step": 7290, "token_acc": 0.9707602339181286, "train_speed(iter/s)": 0.953732 }, { "epoch": 0.236851508949745, "grad_norm": 0.5002052783966064, "learning_rate": 9.07567735500343e-06, "loss": 0.03627898544073105, "memory(GiB)": 21.32, "step": 7291, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.953752 }, { "epoch": 0.2368839944125004, "grad_norm": 0.41213324666023254, "learning_rate": 9.075366174704364e-06, "loss": 0.03323821723461151, "memory(GiB)": 21.32, "step": 7292, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.953771 }, { "epoch": 0.23691647987525583, "grad_norm": 0.6634558439254761, "learning_rate": 9.075054947370147e-06, "loss": 0.03849811106920242, "memory(GiB)": 21.32, "step": 7293, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953791 }, { "epoch": 0.23694896533801124, "grad_norm": 0.5414355397224426, "learning_rate": 9.074743673004374e-06, "loss": 0.03387279808521271, "memory(GiB)": 21.32, "step": 7294, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.95381 }, { "epoch": 0.23698145080076666, "grad_norm": 0.5055177211761475, "learning_rate": 9.074432351610635e-06, "loss": 0.041498325765132904, "memory(GiB)": 21.32, "step": 7295, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953831 }, { "epoch": 0.23701393626352207, "grad_norm": 0.42272040247917175, "learning_rate": 9.074120983192524e-06, "loss": 0.03286556899547577, "memory(GiB)": 21.32, "step": 7296, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.953851 }, { "epoch": 0.2370464217262775, "grad_norm": 0.5273209810256958, "learning_rate": 9.073809567753634e-06, "loss": 0.041954588145017624, "memory(GiB)": 21.32, "step": 7297, "token_acc": 0.9701492537313433, "train_speed(iter/s)": 0.953872 }, { "epoch": 0.2370789071890329, "grad_norm": 0.6545173525810242, "learning_rate": 9.073498105297559e-06, "loss": 0.03462757170200348, "memory(GiB)": 21.32, "step": 7298, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.95389 }, { "epoch": 0.23711139265178832, "grad_norm": 0.41687577962875366, "learning_rate": 9.073186595827895e-06, "loss": 0.03820182383060455, "memory(GiB)": 21.32, "step": 7299, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95391 }, { "epoch": 0.23714387811454374, "grad_norm": 0.4765644371509552, "learning_rate": 9.072875039348235e-06, "loss": 0.03735630214214325, "memory(GiB)": 21.32, "step": 7300, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.953933 }, { "epoch": 0.23717636357729915, "grad_norm": 0.9329784512519836, "learning_rate": 9.072563435862176e-06, "loss": 0.05067409574985504, "memory(GiB)": 21.32, "step": 7301, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953952 }, { "epoch": 0.23720884904005457, "grad_norm": 0.5207351446151733, "learning_rate": 9.072251785373315e-06, "loss": 0.034339696168899536, "memory(GiB)": 21.32, "step": 7302, "token_acc": 0.99, "train_speed(iter/s)": 0.953974 }, { "epoch": 0.23724133450281, "grad_norm": 0.5469412207603455, "learning_rate": 9.071940087885248e-06, "loss": 0.0366208553314209, "memory(GiB)": 21.32, "step": 7303, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.953997 }, { "epoch": 0.2372738199655654, "grad_norm": 0.6179095506668091, "learning_rate": 9.071628343401573e-06, "loss": 0.03654792904853821, "memory(GiB)": 21.32, "step": 7304, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954024 }, { "epoch": 0.23730630542832082, "grad_norm": 0.4745834767818451, "learning_rate": 9.071316551925889e-06, "loss": 0.026066802442073822, "memory(GiB)": 21.32, "step": 7305, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954051 }, { "epoch": 0.23733879089107623, "grad_norm": 1.596667766571045, "learning_rate": 9.07100471346179e-06, "loss": 0.03839299827814102, "memory(GiB)": 21.32, "step": 7306, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.954078 }, { "epoch": 0.23737127635383165, "grad_norm": 0.6474820375442505, "learning_rate": 9.07069282801288e-06, "loss": 0.04008614271879196, "memory(GiB)": 21.32, "step": 7307, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.954105 }, { "epoch": 0.23740376181658707, "grad_norm": 0.5411528944969177, "learning_rate": 9.070380895582756e-06, "loss": 0.036012474447488785, "memory(GiB)": 21.32, "step": 7308, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.954133 }, { "epoch": 0.23743624727934248, "grad_norm": 0.3532239496707916, "learning_rate": 9.070068916175019e-06, "loss": 0.029362931847572327, "memory(GiB)": 21.32, "step": 7309, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95416 }, { "epoch": 0.2374687327420979, "grad_norm": 0.5931797623634338, "learning_rate": 9.069756889793268e-06, "loss": 0.03193506971001625, "memory(GiB)": 21.32, "step": 7310, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954188 }, { "epoch": 0.23750121820485331, "grad_norm": 0.5437283515930176, "learning_rate": 9.06944481644111e-06, "loss": 0.036446478217840195, "memory(GiB)": 21.32, "step": 7311, "token_acc": 0.973384030418251, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.23753370366760876, "grad_norm": 0.5149916410446167, "learning_rate": 9.069132696122139e-06, "loss": 0.050335224717855453, "memory(GiB)": 21.32, "step": 7312, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.23756618913036417, "grad_norm": 0.42920219898223877, "learning_rate": 9.06882052883996e-06, "loss": 0.03658159822225571, "memory(GiB)": 21.32, "step": 7313, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.954271 }, { "epoch": 0.2375986745931196, "grad_norm": 1.2872415781021118, "learning_rate": 9.068508314598179e-06, "loss": 0.04184328392148018, "memory(GiB)": 21.32, "step": 7314, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.9543 }, { "epoch": 0.237631160055875, "grad_norm": 0.4039425849914551, "learning_rate": 9.068196053400396e-06, "loss": 0.03159230202436447, "memory(GiB)": 21.32, "step": 7315, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954327 }, { "epoch": 0.23766364551863042, "grad_norm": 0.6297569274902344, "learning_rate": 9.067883745250214e-06, "loss": 0.04406675696372986, "memory(GiB)": 21.32, "step": 7316, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.954355 }, { "epoch": 0.23769613098138584, "grad_norm": 0.7174239158630371, "learning_rate": 9.067571390151241e-06, "loss": 0.043645769357681274, "memory(GiB)": 21.32, "step": 7317, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.954378 }, { "epoch": 0.23772861644414126, "grad_norm": 0.5599774122238159, "learning_rate": 9.06725898810708e-06, "loss": 0.03543618321418762, "memory(GiB)": 21.32, "step": 7318, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.9544 }, { "epoch": 0.23776110190689667, "grad_norm": 0.6378228664398193, "learning_rate": 9.066946539121338e-06, "loss": 0.036215879023075104, "memory(GiB)": 21.32, "step": 7319, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954422 }, { "epoch": 0.2377935873696521, "grad_norm": 0.6383257508277893, "learning_rate": 9.066634043197617e-06, "loss": 0.042155731469392776, "memory(GiB)": 21.32, "step": 7320, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.954443 }, { "epoch": 0.2378260728324075, "grad_norm": 0.7958266735076904, "learning_rate": 9.066321500339528e-06, "loss": 0.0411541610956192, "memory(GiB)": 21.32, "step": 7321, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.954465 }, { "epoch": 0.23785855829516292, "grad_norm": 0.6338625550270081, "learning_rate": 9.066008910550677e-06, "loss": 0.037087976932525635, "memory(GiB)": 21.32, "step": 7322, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.954489 }, { "epoch": 0.23789104375791834, "grad_norm": 0.4526076316833496, "learning_rate": 9.06569627383467e-06, "loss": 0.032513879239559174, "memory(GiB)": 21.32, "step": 7323, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.954511 }, { "epoch": 0.23792352922067375, "grad_norm": 0.5993563532829285, "learning_rate": 9.065383590195119e-06, "loss": 0.039549924433231354, "memory(GiB)": 21.32, "step": 7324, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.954534 }, { "epoch": 0.23795601468342917, "grad_norm": 0.4388045370578766, "learning_rate": 9.065070859635627e-06, "loss": 0.03370872139930725, "memory(GiB)": 21.32, "step": 7325, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954555 }, { "epoch": 0.23798850014618458, "grad_norm": 0.5905522704124451, "learning_rate": 9.06475808215981e-06, "loss": 0.03592477738857269, "memory(GiB)": 21.32, "step": 7326, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954577 }, { "epoch": 0.23802098560894, "grad_norm": 0.4792170524597168, "learning_rate": 9.064445257771273e-06, "loss": 0.034488484263420105, "memory(GiB)": 21.32, "step": 7327, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.954598 }, { "epoch": 0.23805347107169542, "grad_norm": 0.45009979605674744, "learning_rate": 9.064132386473627e-06, "loss": 0.04444419592618942, "memory(GiB)": 21.32, "step": 7328, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.23808595653445083, "grad_norm": 0.3478117287158966, "learning_rate": 9.063819468270484e-06, "loss": 0.0319875031709671, "memory(GiB)": 21.32, "step": 7329, "token_acc": 0.984, "train_speed(iter/s)": 0.954642 }, { "epoch": 0.23811844199720625, "grad_norm": 0.7569363713264465, "learning_rate": 9.063506503165459e-06, "loss": 0.045928746461868286, "memory(GiB)": 21.32, "step": 7330, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.954665 }, { "epoch": 0.23815092745996166, "grad_norm": 0.3830643892288208, "learning_rate": 9.063193491162157e-06, "loss": 0.029587531462311745, "memory(GiB)": 21.32, "step": 7331, "token_acc": 0.984, "train_speed(iter/s)": 0.954686 }, { "epoch": 0.23818341292271708, "grad_norm": 0.4877535104751587, "learning_rate": 9.062880432264194e-06, "loss": 0.0360613688826561, "memory(GiB)": 21.32, "step": 7332, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.954708 }, { "epoch": 0.2382158983854725, "grad_norm": 2.409222364425659, "learning_rate": 9.062567326475185e-06, "loss": 0.04688207060098648, "memory(GiB)": 21.32, "step": 7333, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.954731 }, { "epoch": 0.2382483838482279, "grad_norm": 0.6662511825561523, "learning_rate": 9.06225417379874e-06, "loss": 0.04234649986028671, "memory(GiB)": 21.32, "step": 7334, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.954752 }, { "epoch": 0.23828086931098333, "grad_norm": 0.7085468173027039, "learning_rate": 9.061940974238476e-06, "loss": 0.046402011066675186, "memory(GiB)": 21.32, "step": 7335, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954773 }, { "epoch": 0.23831335477373874, "grad_norm": 0.43290403485298157, "learning_rate": 9.061627727798005e-06, "loss": 0.036172717809677124, "memory(GiB)": 21.32, "step": 7336, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.954793 }, { "epoch": 0.23834584023649416, "grad_norm": 0.6750836968421936, "learning_rate": 9.061314434480947e-06, "loss": 0.042113013565540314, "memory(GiB)": 21.32, "step": 7337, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.954815 }, { "epoch": 0.23837832569924958, "grad_norm": 0.40100544691085815, "learning_rate": 9.061001094290913e-06, "loss": 0.030288437381386757, "memory(GiB)": 21.32, "step": 7338, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.954838 }, { "epoch": 0.238410811162005, "grad_norm": 0.655098021030426, "learning_rate": 9.060687707231522e-06, "loss": 0.04239929839968681, "memory(GiB)": 21.32, "step": 7339, "token_acc": 0.9765625, "train_speed(iter/s)": 0.954862 }, { "epoch": 0.2384432966247604, "grad_norm": 0.407763808965683, "learning_rate": 9.06037427330639e-06, "loss": 0.027744103223085403, "memory(GiB)": 21.32, "step": 7340, "token_acc": 1.0, "train_speed(iter/s)": 0.954886 }, { "epoch": 0.23847578208751583, "grad_norm": 0.5102956891059875, "learning_rate": 9.060060792519134e-06, "loss": 0.03820186108350754, "memory(GiB)": 21.32, "step": 7341, "token_acc": 0.9692982456140351, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.23850826755027124, "grad_norm": 0.5126075744628906, "learning_rate": 9.059747264873372e-06, "loss": 0.03253670781850815, "memory(GiB)": 21.32, "step": 7342, "token_acc": 0.9968051118210862, "train_speed(iter/s)": 0.954943 }, { "epoch": 0.23854075301302666, "grad_norm": 0.5686457753181458, "learning_rate": 9.059433690372723e-06, "loss": 0.035511430352926254, "memory(GiB)": 21.32, "step": 7343, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954967 }, { "epoch": 0.2385732384757821, "grad_norm": 0.682141125202179, "learning_rate": 9.059120069020807e-06, "loss": 0.0515766479074955, "memory(GiB)": 21.32, "step": 7344, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.954987 }, { "epoch": 0.23860572393853752, "grad_norm": 0.5825960636138916, "learning_rate": 9.058806400821242e-06, "loss": 0.03193044662475586, "memory(GiB)": 21.32, "step": 7345, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.23863820940129293, "grad_norm": 1.5979056358337402, "learning_rate": 9.058492685777652e-06, "loss": 0.03571007773280144, "memory(GiB)": 21.32, "step": 7346, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.95503 }, { "epoch": 0.23867069486404835, "grad_norm": 0.525817334651947, "learning_rate": 9.058178923893651e-06, "loss": 0.0411296971142292, "memory(GiB)": 21.32, "step": 7347, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955051 }, { "epoch": 0.23870318032680377, "grad_norm": 0.4970410466194153, "learning_rate": 9.057865115172863e-06, "loss": 0.04417722672224045, "memory(GiB)": 21.32, "step": 7348, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.955071 }, { "epoch": 0.23873566578955918, "grad_norm": 0.5142901539802551, "learning_rate": 9.057551259618912e-06, "loss": 0.043754931539297104, "memory(GiB)": 21.32, "step": 7349, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.955093 }, { "epoch": 0.2387681512523146, "grad_norm": 0.3334901034832001, "learning_rate": 9.05723735723542e-06, "loss": 0.02663390524685383, "memory(GiB)": 21.32, "step": 7350, "token_acc": 1.0, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.23880063671507, "grad_norm": 0.3162264823913574, "learning_rate": 9.056923408026009e-06, "loss": 0.028002582490444183, "memory(GiB)": 21.32, "step": 7351, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.23883312217782543, "grad_norm": 0.6915937662124634, "learning_rate": 9.056609411994302e-06, "loss": 0.041124407202005386, "memory(GiB)": 21.32, "step": 7352, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.955151 }, { "epoch": 0.23886560764058085, "grad_norm": 0.40288811922073364, "learning_rate": 9.05629536914392e-06, "loss": 0.030573351308703423, "memory(GiB)": 21.32, "step": 7353, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.955171 }, { "epoch": 0.23889809310333626, "grad_norm": 0.6158856749534607, "learning_rate": 9.055981279478495e-06, "loss": 0.04970761388540268, "memory(GiB)": 21.32, "step": 7354, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.955189 }, { "epoch": 0.23893057856609168, "grad_norm": 0.4087369441986084, "learning_rate": 9.055667143001644e-06, "loss": 0.03345279023051262, "memory(GiB)": 21.32, "step": 7355, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955209 }, { "epoch": 0.2389630640288471, "grad_norm": 0.5172246694564819, "learning_rate": 9.055352959716996e-06, "loss": 0.03363385796546936, "memory(GiB)": 21.32, "step": 7356, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.2389955494916025, "grad_norm": 0.5600990056991577, "learning_rate": 9.05503872962818e-06, "loss": 0.0351560041308403, "memory(GiB)": 21.32, "step": 7357, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955251 }, { "epoch": 0.23902803495435793, "grad_norm": 0.7246681451797485, "learning_rate": 9.054724452738818e-06, "loss": 0.04574136808514595, "memory(GiB)": 21.32, "step": 7358, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955274 }, { "epoch": 0.23906052041711334, "grad_norm": 0.49884232878685, "learning_rate": 9.05441012905254e-06, "loss": 0.03827175498008728, "memory(GiB)": 21.32, "step": 7359, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955294 }, { "epoch": 0.23909300587986876, "grad_norm": 0.5705965757369995, "learning_rate": 9.054095758572972e-06, "loss": 0.039249107241630554, "memory(GiB)": 21.32, "step": 7360, "token_acc": 0.992, "train_speed(iter/s)": 0.955315 }, { "epoch": 0.23912549134262417, "grad_norm": 0.41049453616142273, "learning_rate": 9.053781341303743e-06, "loss": 0.032262060791254044, "memory(GiB)": 21.32, "step": 7361, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.955336 }, { "epoch": 0.2391579768053796, "grad_norm": 0.6452081203460693, "learning_rate": 9.05346687724848e-06, "loss": 0.030456071719527245, "memory(GiB)": 21.32, "step": 7362, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955358 }, { "epoch": 0.239190462268135, "grad_norm": 0.594282865524292, "learning_rate": 9.053152366410815e-06, "loss": 0.04324401542544365, "memory(GiB)": 21.32, "step": 7363, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.23922294773089042, "grad_norm": 0.536318302154541, "learning_rate": 9.052837808794379e-06, "loss": 0.04030371829867363, "memory(GiB)": 21.32, "step": 7364, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.23925543319364584, "grad_norm": 0.6570704579353333, "learning_rate": 9.052523204402799e-06, "loss": 0.045769669115543365, "memory(GiB)": 21.32, "step": 7365, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.23928791865640126, "grad_norm": 0.43359461426734924, "learning_rate": 9.052208553239708e-06, "loss": 0.03059692122042179, "memory(GiB)": 21.32, "step": 7366, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955466 }, { "epoch": 0.23932040411915667, "grad_norm": 0.4756467044353485, "learning_rate": 9.051893855308735e-06, "loss": 0.042537037283182144, "memory(GiB)": 21.32, "step": 7367, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.2393528895819121, "grad_norm": 0.49050962924957275, "learning_rate": 9.051579110613515e-06, "loss": 0.028579674661159515, "memory(GiB)": 21.32, "step": 7368, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.2393853750446675, "grad_norm": 0.3989458978176117, "learning_rate": 9.05126431915768e-06, "loss": 0.02790842391550541, "memory(GiB)": 21.32, "step": 7369, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.23941786050742292, "grad_norm": 0.4680584669113159, "learning_rate": 9.050949480944861e-06, "loss": 0.04108244925737381, "memory(GiB)": 21.32, "step": 7370, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.23945034597017834, "grad_norm": 0.41859185695648193, "learning_rate": 9.050634595978696e-06, "loss": 0.032749440521001816, "memory(GiB)": 21.32, "step": 7371, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.23948283143293375, "grad_norm": 0.35399043560028076, "learning_rate": 9.050319664262815e-06, "loss": 0.02996181510388851, "memory(GiB)": 21.32, "step": 7372, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.23951531689568917, "grad_norm": 0.5170664191246033, "learning_rate": 9.050004685800853e-06, "loss": 0.03318243473768234, "memory(GiB)": 21.32, "step": 7373, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955652 }, { "epoch": 0.23954780235844458, "grad_norm": 0.4413334131240845, "learning_rate": 9.049689660596448e-06, "loss": 0.029804691672325134, "memory(GiB)": 21.32, "step": 7374, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.95568 }, { "epoch": 0.2395802878212, "grad_norm": 0.585090696811676, "learning_rate": 9.049374588653232e-06, "loss": 0.03700333833694458, "memory(GiB)": 21.32, "step": 7375, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.23961277328395544, "grad_norm": 0.36496394872665405, "learning_rate": 9.049059469974846e-06, "loss": 0.02995225414633751, "memory(GiB)": 21.32, "step": 7376, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.23964525874671086, "grad_norm": 0.6949135661125183, "learning_rate": 9.048744304564922e-06, "loss": 0.038237299770116806, "memory(GiB)": 21.32, "step": 7377, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.23967774420946628, "grad_norm": 0.6952599287033081, "learning_rate": 9.0484290924271e-06, "loss": 0.03610137104988098, "memory(GiB)": 21.32, "step": 7378, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.2397102296722217, "grad_norm": 0.46723538637161255, "learning_rate": 9.04811383356502e-06, "loss": 0.034910544753074646, "memory(GiB)": 21.32, "step": 7379, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.2397427151349771, "grad_norm": 0.5662422180175781, "learning_rate": 9.047798527982317e-06, "loss": 0.034695375710725784, "memory(GiB)": 21.32, "step": 7380, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.23977520059773252, "grad_norm": 0.5846260786056519, "learning_rate": 9.047483175682632e-06, "loss": 0.05667440593242645, "memory(GiB)": 21.32, "step": 7381, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.95586 }, { "epoch": 0.23980768606048794, "grad_norm": 0.4696308672428131, "learning_rate": 9.047167776669604e-06, "loss": 0.02642977237701416, "memory(GiB)": 21.32, "step": 7382, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.23984017152324336, "grad_norm": 0.6297817230224609, "learning_rate": 9.046852330946872e-06, "loss": 0.042622774839401245, "memory(GiB)": 21.32, "step": 7383, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.23987265698599877, "grad_norm": 0.5294188857078552, "learning_rate": 9.046536838518078e-06, "loss": 0.04260038956999779, "memory(GiB)": 21.32, "step": 7384, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.2399051424487542, "grad_norm": 0.43273425102233887, "learning_rate": 9.046221299386864e-06, "loss": 0.03779084235429764, "memory(GiB)": 21.32, "step": 7385, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.955948 }, { "epoch": 0.2399376279115096, "grad_norm": 0.5061875581741333, "learning_rate": 9.04590571355687e-06, "loss": 0.0346539244055748, "memory(GiB)": 21.32, "step": 7386, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.95597 }, { "epoch": 0.23997011337426502, "grad_norm": 0.46228134632110596, "learning_rate": 9.04559008103174e-06, "loss": 0.03131796419620514, "memory(GiB)": 21.32, "step": 7387, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.24000259883702044, "grad_norm": 0.6495826244354248, "learning_rate": 9.045274401815114e-06, "loss": 0.04075289145112038, "memory(GiB)": 21.32, "step": 7388, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.24003508429977585, "grad_norm": 0.6314598321914673, "learning_rate": 9.044958675910639e-06, "loss": 0.05451688915491104, "memory(GiB)": 21.32, "step": 7389, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.24006756976253127, "grad_norm": 0.45071983337402344, "learning_rate": 9.044642903321956e-06, "loss": 0.05165467411279678, "memory(GiB)": 21.32, "step": 7390, "token_acc": 0.9856115107913669, "train_speed(iter/s)": 0.956055 }, { "epoch": 0.24010005522528668, "grad_norm": 0.43965062499046326, "learning_rate": 9.04432708405271e-06, "loss": 0.03066122718155384, "memory(GiB)": 21.32, "step": 7391, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.2401325406880421, "grad_norm": 0.6807864904403687, "learning_rate": 9.044011218106547e-06, "loss": 0.039621081203222275, "memory(GiB)": 21.32, "step": 7392, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.24016502615079752, "grad_norm": 0.6638148427009583, "learning_rate": 9.043695305487112e-06, "loss": 0.034650012850761414, "memory(GiB)": 21.32, "step": 7393, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.24019751161355293, "grad_norm": 0.456431120634079, "learning_rate": 9.04337934619805e-06, "loss": 0.03833719342947006, "memory(GiB)": 21.32, "step": 7394, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956147 }, { "epoch": 0.24022999707630835, "grad_norm": 0.39429569244384766, "learning_rate": 9.043063340243012e-06, "loss": 0.02483813837170601, "memory(GiB)": 21.32, "step": 7395, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.956169 }, { "epoch": 0.24026248253906377, "grad_norm": 0.5312132239341736, "learning_rate": 9.042747287625639e-06, "loss": 0.04087890684604645, "memory(GiB)": 21.32, "step": 7396, "token_acc": 0.9765625, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.24029496800181918, "grad_norm": 0.49206721782684326, "learning_rate": 9.042431188349582e-06, "loss": 0.03881854936480522, "memory(GiB)": 21.32, "step": 7397, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.2403274534645746, "grad_norm": 0.5569975972175598, "learning_rate": 9.042115042418487e-06, "loss": 0.044505663216114044, "memory(GiB)": 21.32, "step": 7398, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.24035993892733, "grad_norm": 0.4685872197151184, "learning_rate": 9.041798849836006e-06, "loss": 0.03636518120765686, "memory(GiB)": 21.32, "step": 7399, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.24039242439008543, "grad_norm": 0.6779491305351257, "learning_rate": 9.041482610605788e-06, "loss": 0.04675571992993355, "memory(GiB)": 21.32, "step": 7400, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.956274 }, { "epoch": 0.24042490985284085, "grad_norm": 0.7155365347862244, "learning_rate": 9.04116632473148e-06, "loss": 0.04050502926111221, "memory(GiB)": 21.32, "step": 7401, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.956297 }, { "epoch": 0.24045739531559626, "grad_norm": 0.5610573887825012, "learning_rate": 9.040849992216733e-06, "loss": 0.037812065333127975, "memory(GiB)": 21.32, "step": 7402, "token_acc": 0.9730639730639731, "train_speed(iter/s)": 0.956317 }, { "epoch": 0.24048988077835168, "grad_norm": 0.4091087877750397, "learning_rate": 9.0405336130652e-06, "loss": 0.03582202270627022, "memory(GiB)": 21.32, "step": 7403, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956336 }, { "epoch": 0.2405223662411071, "grad_norm": 0.5192064642906189, "learning_rate": 9.04021718728053e-06, "loss": 0.03931422159075737, "memory(GiB)": 21.32, "step": 7404, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.956357 }, { "epoch": 0.2405548517038625, "grad_norm": 0.4870133399963379, "learning_rate": 9.039900714866376e-06, "loss": 0.033753812313079834, "memory(GiB)": 21.32, "step": 7405, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.24058733716661793, "grad_norm": 1.4066787958145142, "learning_rate": 9.039584195826393e-06, "loss": 0.04322997108101845, "memory(GiB)": 21.32, "step": 7406, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.24061982262937334, "grad_norm": 0.6107823252677917, "learning_rate": 9.039267630164228e-06, "loss": 0.02952456846833229, "memory(GiB)": 21.32, "step": 7407, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.24065230809212879, "grad_norm": 0.43475341796875, "learning_rate": 9.03895101788354e-06, "loss": 0.031915996223688126, "memory(GiB)": 21.32, "step": 7408, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.2406847935548842, "grad_norm": 0.5264195203781128, "learning_rate": 9.038634358987984e-06, "loss": 0.03939218446612358, "memory(GiB)": 21.32, "step": 7409, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95646 }, { "epoch": 0.24071727901763962, "grad_norm": 0.5146368741989136, "learning_rate": 9.03831765348121e-06, "loss": 0.04635545611381531, "memory(GiB)": 21.32, "step": 7410, "token_acc": 0.9590163934426229, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.24074976448039503, "grad_norm": 0.5264988541603088, "learning_rate": 9.038000901366874e-06, "loss": 0.04316691681742668, "memory(GiB)": 21.32, "step": 7411, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.24078224994315045, "grad_norm": 0.8516761064529419, "learning_rate": 9.037684102648636e-06, "loss": 0.043173015117645264, "memory(GiB)": 21.32, "step": 7412, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.24081473540590587, "grad_norm": 0.46508312225341797, "learning_rate": 9.037367257330148e-06, "loss": 0.04190179705619812, "memory(GiB)": 21.32, "step": 7413, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.24084722086866128, "grad_norm": 0.551128625869751, "learning_rate": 9.037050365415067e-06, "loss": 0.04195249080657959, "memory(GiB)": 21.32, "step": 7414, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.2408797063314167, "grad_norm": 0.5550283193588257, "learning_rate": 9.036733426907051e-06, "loss": 0.038404446095228195, "memory(GiB)": 21.32, "step": 7415, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.24091219179417211, "grad_norm": 0.42707982659339905, "learning_rate": 9.03641644180976e-06, "loss": 0.035006266087293625, "memory(GiB)": 21.32, "step": 7416, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.24094467725692753, "grad_norm": 0.387075811624527, "learning_rate": 9.03609941012685e-06, "loss": 0.02908533811569214, "memory(GiB)": 21.32, "step": 7417, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.24097716271968295, "grad_norm": 0.5190388560295105, "learning_rate": 9.035782331861983e-06, "loss": 0.04216872155666351, "memory(GiB)": 21.32, "step": 7418, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956639 }, { "epoch": 0.24100964818243836, "grad_norm": 0.444898784160614, "learning_rate": 9.035465207018814e-06, "loss": 0.04096914082765579, "memory(GiB)": 21.32, "step": 7419, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.24104213364519378, "grad_norm": 0.5304621458053589, "learning_rate": 9.035148035601005e-06, "loss": 0.03907232731580734, "memory(GiB)": 21.32, "step": 7420, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.2410746191079492, "grad_norm": 0.46220141649246216, "learning_rate": 9.034830817612217e-06, "loss": 0.028823919594287872, "memory(GiB)": 21.32, "step": 7421, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.9567 }, { "epoch": 0.2411071045707046, "grad_norm": 0.4710399806499481, "learning_rate": 9.034513553056113e-06, "loss": 0.042284898459911346, "memory(GiB)": 21.32, "step": 7422, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.24113959003346003, "grad_norm": 0.994358479976654, "learning_rate": 9.03419624193635e-06, "loss": 0.03533890098333359, "memory(GiB)": 21.32, "step": 7423, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956754 }, { "epoch": 0.24117207549621544, "grad_norm": 0.6690086722373962, "learning_rate": 9.033878884256594e-06, "loss": 0.04209796339273453, "memory(GiB)": 21.32, "step": 7424, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956782 }, { "epoch": 0.24120456095897086, "grad_norm": 0.49555540084838867, "learning_rate": 9.033561480020508e-06, "loss": 0.03689970448613167, "memory(GiB)": 21.32, "step": 7425, "token_acc": 0.9820359281437125, "train_speed(iter/s)": 0.956807 }, { "epoch": 0.24123704642172628, "grad_norm": 0.37441080808639526, "learning_rate": 9.033244029231753e-06, "loss": 0.03012493997812271, "memory(GiB)": 21.32, "step": 7426, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.2412695318844817, "grad_norm": 0.3924883306026459, "learning_rate": 9.032926531893993e-06, "loss": 0.03664572536945343, "memory(GiB)": 21.32, "step": 7427, "token_acc": 0.9844559585492227, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.2413020173472371, "grad_norm": 0.46169543266296387, "learning_rate": 9.032608988010894e-06, "loss": 0.03613394871354103, "memory(GiB)": 21.32, "step": 7428, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.24133450280999252, "grad_norm": 0.7888153195381165, "learning_rate": 9.032291397586119e-06, "loss": 0.03259243443608284, "memory(GiB)": 21.32, "step": 7429, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.24136698827274794, "grad_norm": 0.444068968296051, "learning_rate": 9.031973760623334e-06, "loss": 0.03430795669555664, "memory(GiB)": 21.32, "step": 7430, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.24139947373550336, "grad_norm": 0.3534752428531647, "learning_rate": 9.031656077126205e-06, "loss": 0.03135683014988899, "memory(GiB)": 21.32, "step": 7431, "token_acc": 0.9681978798586572, "train_speed(iter/s)": 0.956974 }, { "epoch": 0.24143195919825877, "grad_norm": 0.4219892919063568, "learning_rate": 9.031338347098401e-06, "loss": 0.030176807194948196, "memory(GiB)": 21.32, "step": 7432, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.2414644446610142, "grad_norm": 0.6415053009986877, "learning_rate": 9.031020570543585e-06, "loss": 0.03851231560111046, "memory(GiB)": 21.32, "step": 7433, "token_acc": 1.0, "train_speed(iter/s)": 0.957029 }, { "epoch": 0.2414969301237696, "grad_norm": 0.5307110548019409, "learning_rate": 9.030702747465426e-06, "loss": 0.04527562856674194, "memory(GiB)": 21.32, "step": 7434, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.24152941558652502, "grad_norm": 0.4870178699493408, "learning_rate": 9.030384877867593e-06, "loss": 0.03893110528588295, "memory(GiB)": 21.32, "step": 7435, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.24156190104928044, "grad_norm": 0.44790077209472656, "learning_rate": 9.030066961753754e-06, "loss": 0.033138126134872437, "memory(GiB)": 21.32, "step": 7436, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.24159438651203585, "grad_norm": 0.3984631299972534, "learning_rate": 9.029748999127578e-06, "loss": 0.03863552212715149, "memory(GiB)": 21.32, "step": 7437, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.24162687197479127, "grad_norm": 0.5803480744361877, "learning_rate": 9.029430989992736e-06, "loss": 0.034536123275756836, "memory(GiB)": 21.32, "step": 7438, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.24165935743754668, "grad_norm": 0.38575467467308044, "learning_rate": 9.029112934352897e-06, "loss": 0.02841087058186531, "memory(GiB)": 21.32, "step": 7439, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957187 }, { "epoch": 0.24169184290030213, "grad_norm": 0.6048884391784668, "learning_rate": 9.028794832211732e-06, "loss": 0.041217122226953506, "memory(GiB)": 21.32, "step": 7440, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.24172432836305754, "grad_norm": 0.5186249613761902, "learning_rate": 9.028476683572911e-06, "loss": 0.0318019837141037, "memory(GiB)": 21.32, "step": 7441, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.24175681382581296, "grad_norm": 0.6092933416366577, "learning_rate": 9.028158488440109e-06, "loss": 0.04858870804309845, "memory(GiB)": 21.32, "step": 7442, "token_acc": 0.9683257918552036, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.24178929928856838, "grad_norm": 1.236497402191162, "learning_rate": 9.027840246816996e-06, "loss": 0.043547164648771286, "memory(GiB)": 21.32, "step": 7443, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.2418217847513238, "grad_norm": 0.45946329832077026, "learning_rate": 9.027521958707245e-06, "loss": 0.036204561591148376, "memory(GiB)": 21.32, "step": 7444, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.2418542702140792, "grad_norm": 0.8730579614639282, "learning_rate": 9.02720362411453e-06, "loss": 0.039398763328790665, "memory(GiB)": 21.32, "step": 7445, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.24188675567683462, "grad_norm": 0.8823497295379639, "learning_rate": 9.026885243042525e-06, "loss": 0.03775281831622124, "memory(GiB)": 21.32, "step": 7446, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.24191924113959004, "grad_norm": 0.4372352361679077, "learning_rate": 9.026566815494905e-06, "loss": 0.03638654202222824, "memory(GiB)": 21.32, "step": 7447, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.24195172660234546, "grad_norm": 0.5571604371070862, "learning_rate": 9.026248341475344e-06, "loss": 0.04021677374839783, "memory(GiB)": 21.32, "step": 7448, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957385 }, { "epoch": 0.24198421206510087, "grad_norm": 0.5951229333877563, "learning_rate": 9.025929820987518e-06, "loss": 0.03280682489275932, "memory(GiB)": 21.32, "step": 7449, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.957406 }, { "epoch": 0.2420166975278563, "grad_norm": 0.4950934946537018, "learning_rate": 9.025611254035105e-06, "loss": 0.034266017377376556, "memory(GiB)": 21.32, "step": 7450, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957427 }, { "epoch": 0.2420491829906117, "grad_norm": 0.34543848037719727, "learning_rate": 9.025292640621778e-06, "loss": 0.03184259682893753, "memory(GiB)": 21.32, "step": 7451, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.24208166845336712, "grad_norm": 0.5444480180740356, "learning_rate": 9.024973980751217e-06, "loss": 0.030969522893428802, "memory(GiB)": 21.32, "step": 7452, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.24211415391612254, "grad_norm": 1.4312833547592163, "learning_rate": 9.0246552744271e-06, "loss": 0.03580339625477791, "memory(GiB)": 21.32, "step": 7453, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.95749 }, { "epoch": 0.24214663937887795, "grad_norm": 1.7064461708068848, "learning_rate": 9.024336521653104e-06, "loss": 0.032873064279556274, "memory(GiB)": 21.32, "step": 7454, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957511 }, { "epoch": 0.24217912484163337, "grad_norm": 1.1948275566101074, "learning_rate": 9.024017722432907e-06, "loss": 0.03138231858611107, "memory(GiB)": 21.32, "step": 7455, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957532 }, { "epoch": 0.24221161030438879, "grad_norm": 0.47473087906837463, "learning_rate": 9.023698876770191e-06, "loss": 0.030463725328445435, "memory(GiB)": 21.32, "step": 7456, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957555 }, { "epoch": 0.2422440957671442, "grad_norm": 0.6505547165870667, "learning_rate": 9.023379984668634e-06, "loss": 0.035127557814121246, "memory(GiB)": 21.32, "step": 7457, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.24227658122989962, "grad_norm": 2.18666934967041, "learning_rate": 9.023061046131916e-06, "loss": 0.030230557546019554, "memory(GiB)": 21.32, "step": 7458, "token_acc": 0.9765625, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.24230906669265503, "grad_norm": 0.6866491436958313, "learning_rate": 9.02274206116372e-06, "loss": 0.041826773434877396, "memory(GiB)": 21.32, "step": 7459, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957616 }, { "epoch": 0.24234155215541045, "grad_norm": 3.1844379901885986, "learning_rate": 9.022423029767726e-06, "loss": 0.04122620075941086, "memory(GiB)": 21.32, "step": 7460, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.24237403761816587, "grad_norm": 0.6201614737510681, "learning_rate": 9.022103951947615e-06, "loss": 0.034622080624103546, "memory(GiB)": 21.32, "step": 7461, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.24240652308092128, "grad_norm": 0.453574538230896, "learning_rate": 9.021784827707072e-06, "loss": 0.02821958437561989, "memory(GiB)": 21.32, "step": 7462, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957675 }, { "epoch": 0.2424390085436767, "grad_norm": 0.5110929608345032, "learning_rate": 9.021465657049779e-06, "loss": 0.03940487653017044, "memory(GiB)": 21.32, "step": 7463, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.24247149400643211, "grad_norm": 0.7854476571083069, "learning_rate": 9.02114643997942e-06, "loss": 0.05051637813448906, "memory(GiB)": 21.32, "step": 7464, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.24250397946918753, "grad_norm": 0.7366292476654053, "learning_rate": 9.02082717649968e-06, "loss": 0.0459749698638916, "memory(GiB)": 21.32, "step": 7465, "token_acc": 0.9673469387755103, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.24253646493194295, "grad_norm": 0.6228760480880737, "learning_rate": 9.02050786661424e-06, "loss": 0.03432279825210571, "memory(GiB)": 21.32, "step": 7466, "token_acc": 1.0, "train_speed(iter/s)": 0.957755 }, { "epoch": 0.24256895039469836, "grad_norm": 0.5175514817237854, "learning_rate": 9.020188510326791e-06, "loss": 0.03128145635128021, "memory(GiB)": 21.32, "step": 7467, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.957776 }, { "epoch": 0.24260143585745378, "grad_norm": 0.615606427192688, "learning_rate": 9.019869107641013e-06, "loss": 0.038062773644924164, "memory(GiB)": 21.32, "step": 7468, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957798 }, { "epoch": 0.2426339213202092, "grad_norm": 0.4276280999183655, "learning_rate": 9.019549658560596e-06, "loss": 0.02543051913380623, "memory(GiB)": 21.32, "step": 7469, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957816 }, { "epoch": 0.2426664067829646, "grad_norm": 0.49372801184654236, "learning_rate": 9.019230163089228e-06, "loss": 0.034498363733291626, "memory(GiB)": 21.32, "step": 7470, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957835 }, { "epoch": 0.24269889224572003, "grad_norm": 0.847176194190979, "learning_rate": 9.018910621230592e-06, "loss": 0.03700526803731918, "memory(GiB)": 21.32, "step": 7471, "token_acc": 0.9875, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.24273137770847547, "grad_norm": 0.5007466077804565, "learning_rate": 9.018591032988378e-06, "loss": 0.034468356519937515, "memory(GiB)": 21.32, "step": 7472, "token_acc": 0.963265306122449, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.2427638631712309, "grad_norm": 0.5430213212966919, "learning_rate": 9.018271398366277e-06, "loss": 0.043790873140096664, "memory(GiB)": 21.32, "step": 7473, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.2427963486339863, "grad_norm": 0.4358338713645935, "learning_rate": 9.017951717367975e-06, "loss": 0.031041394919157028, "memory(GiB)": 21.32, "step": 7474, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.24282883409674172, "grad_norm": 0.5852704644203186, "learning_rate": 9.017631989997162e-06, "loss": 0.040304139256477356, "memory(GiB)": 21.32, "step": 7475, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.24286131955949714, "grad_norm": 0.5047913193702698, "learning_rate": 9.017312216257527e-06, "loss": 0.050924722105264664, "memory(GiB)": 21.32, "step": 7476, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.24289380502225255, "grad_norm": 0.600130021572113, "learning_rate": 9.016992396152763e-06, "loss": 0.03990668058395386, "memory(GiB)": 21.32, "step": 7477, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.24292629048500797, "grad_norm": 0.514119029045105, "learning_rate": 9.01667252968656e-06, "loss": 0.03530224412679672, "memory(GiB)": 21.32, "step": 7478, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.24295877594776338, "grad_norm": 0.5562882423400879, "learning_rate": 9.016352616862609e-06, "loss": 0.05169444903731346, "memory(GiB)": 21.32, "step": 7479, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.958018 }, { "epoch": 0.2429912614105188, "grad_norm": 0.526804506778717, "learning_rate": 9.016032657684606e-06, "loss": 0.04069913923740387, "memory(GiB)": 21.32, "step": 7480, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.24302374687327422, "grad_norm": 0.5376364588737488, "learning_rate": 9.015712652156238e-06, "loss": 0.03775453194975853, "memory(GiB)": 21.32, "step": 7481, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.24305623233602963, "grad_norm": 0.43585339188575745, "learning_rate": 9.015392600281203e-06, "loss": 0.03706498444080353, "memory(GiB)": 21.32, "step": 7482, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95809 }, { "epoch": 0.24308871779878505, "grad_norm": 0.4651854336261749, "learning_rate": 9.015072502063192e-06, "loss": 0.04499363526701927, "memory(GiB)": 21.32, "step": 7483, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.24312120326154046, "grad_norm": 0.5116199851036072, "learning_rate": 9.0147523575059e-06, "loss": 0.0402003675699234, "memory(GiB)": 21.32, "step": 7484, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.958142 }, { "epoch": 0.24315368872429588, "grad_norm": 0.4290888011455536, "learning_rate": 9.014432166613021e-06, "loss": 0.04024535045027733, "memory(GiB)": 21.32, "step": 7485, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.2431861741870513, "grad_norm": 0.6312867403030396, "learning_rate": 9.014111929388254e-06, "loss": 0.033230461180210114, "memory(GiB)": 21.32, "step": 7486, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.2432186596498067, "grad_norm": 0.3603205680847168, "learning_rate": 9.01379164583529e-06, "loss": 0.03374285250902176, "memory(GiB)": 21.32, "step": 7487, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958222 }, { "epoch": 0.24325114511256213, "grad_norm": 0.5997050404548645, "learning_rate": 9.013471315957831e-06, "loss": 0.030196556821465492, "memory(GiB)": 21.32, "step": 7488, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.958249 }, { "epoch": 0.24328363057531754, "grad_norm": 0.5677964687347412, "learning_rate": 9.01315093975957e-06, "loss": 0.04714231938123703, "memory(GiB)": 21.32, "step": 7489, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.24331611603807296, "grad_norm": 0.4626769721508026, "learning_rate": 9.012830517244205e-06, "loss": 0.03836214169859886, "memory(GiB)": 21.32, "step": 7490, "token_acc": 1.0, "train_speed(iter/s)": 0.9583 }, { "epoch": 0.24334860150082838, "grad_norm": 0.40941566228866577, "learning_rate": 9.012510048415435e-06, "loss": 0.025562139227986336, "memory(GiB)": 21.32, "step": 7491, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.958327 }, { "epoch": 0.2433810869635838, "grad_norm": 0.47436243295669556, "learning_rate": 9.012189533276958e-06, "loss": 0.03456319123506546, "memory(GiB)": 21.32, "step": 7492, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.958354 }, { "epoch": 0.2434135724263392, "grad_norm": 0.3302413821220398, "learning_rate": 9.011868971832474e-06, "loss": 0.0334603488445282, "memory(GiB)": 21.32, "step": 7493, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.95838 }, { "epoch": 0.24344605788909462, "grad_norm": 0.3956203758716583, "learning_rate": 9.011548364085682e-06, "loss": 0.032647505402565, "memory(GiB)": 21.32, "step": 7494, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.24347854335185004, "grad_norm": 0.39699411392211914, "learning_rate": 9.011227710040284e-06, "loss": 0.03807980567216873, "memory(GiB)": 21.32, "step": 7495, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.95843 }, { "epoch": 0.24351102881460546, "grad_norm": 0.4617096483707428, "learning_rate": 9.010907009699979e-06, "loss": 0.03907279670238495, "memory(GiB)": 21.32, "step": 7496, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.958456 }, { "epoch": 0.24354351427736087, "grad_norm": 0.367774099111557, "learning_rate": 9.010586263068465e-06, "loss": 0.03008355014026165, "memory(GiB)": 21.32, "step": 7497, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.95848 }, { "epoch": 0.2435759997401163, "grad_norm": 0.4106472134590149, "learning_rate": 9.010265470149452e-06, "loss": 0.03519850969314575, "memory(GiB)": 21.32, "step": 7498, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958506 }, { "epoch": 0.2436084852028717, "grad_norm": 0.3225133717060089, "learning_rate": 9.009944630946637e-06, "loss": 0.022514082491397858, "memory(GiB)": 21.32, "step": 7499, "token_acc": 0.98, "train_speed(iter/s)": 0.958533 }, { "epoch": 0.24364097066562712, "grad_norm": 0.596431314945221, "learning_rate": 9.009623745463721e-06, "loss": 0.03735601529479027, "memory(GiB)": 21.32, "step": 7500, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.95856 }, { "epoch": 0.24364097066562712, "eval_loss": 0.03613930195569992, "eval_runtime": 80.4799, "eval_samples_per_second": 123.633, "eval_steps_per_second": 3.864, "eval_token_acc": 0.9860824379587347, "step": 7500 }, { "epoch": 0.24367345612838254, "grad_norm": 0.7343073487281799, "learning_rate": 9.009302813704412e-06, "loss": 0.04131359979510307, "memory(GiB)": 21.32, "step": 7501, "token_acc": 0.9854268534878307, "train_speed(iter/s)": 0.947512 }, { "epoch": 0.24370594159113795, "grad_norm": 0.3660920262336731, "learning_rate": 9.008981835672412e-06, "loss": 0.027219098061323166, "memory(GiB)": 21.32, "step": 7502, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.947533 }, { "epoch": 0.24373842705389337, "grad_norm": 0.39831674098968506, "learning_rate": 9.008660811371427e-06, "loss": 0.04035096615552902, "memory(GiB)": 21.32, "step": 7503, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.947553 }, { "epoch": 0.2437709125166488, "grad_norm": 0.5054333209991455, "learning_rate": 9.00833974080516e-06, "loss": 0.0346115417778492, "memory(GiB)": 21.32, "step": 7504, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.947574 }, { "epoch": 0.24380339797940423, "grad_norm": 0.5530218482017517, "learning_rate": 9.008018623977315e-06, "loss": 0.030351007357239723, "memory(GiB)": 21.32, "step": 7505, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.947595 }, { "epoch": 0.24383588344215965, "grad_norm": 0.6288659572601318, "learning_rate": 9.007697460891603e-06, "loss": 0.04086337983608246, "memory(GiB)": 21.32, "step": 7506, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.947618 }, { "epoch": 0.24386836890491506, "grad_norm": 0.5602853894233704, "learning_rate": 9.007376251551727e-06, "loss": 0.036760181188583374, "memory(GiB)": 21.32, "step": 7507, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.947645 }, { "epoch": 0.24390085436767048, "grad_norm": 0.6418449878692627, "learning_rate": 9.007054995961395e-06, "loss": 0.03637851029634476, "memory(GiB)": 21.32, "step": 7508, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.94767 }, { "epoch": 0.2439333398304259, "grad_norm": 0.4127994477748871, "learning_rate": 9.006733694124317e-06, "loss": 0.03631532937288284, "memory(GiB)": 21.32, "step": 7509, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.947695 }, { "epoch": 0.2439658252931813, "grad_norm": 0.5100104808807373, "learning_rate": 9.006412346044198e-06, "loss": 0.03970577195286751, "memory(GiB)": 21.32, "step": 7510, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.94772 }, { "epoch": 0.24399831075593673, "grad_norm": 0.5938687920570374, "learning_rate": 9.006090951724747e-06, "loss": 0.04122306406497955, "memory(GiB)": 21.32, "step": 7511, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.947748 }, { "epoch": 0.24403079621869214, "grad_norm": 1.1505378484725952, "learning_rate": 9.005769511169675e-06, "loss": 0.05037735030055046, "memory(GiB)": 21.32, "step": 7512, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.947775 }, { "epoch": 0.24406328168144756, "grad_norm": 0.3685193359851837, "learning_rate": 9.005448024382692e-06, "loss": 0.03100254014134407, "memory(GiB)": 21.32, "step": 7513, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.947799 }, { "epoch": 0.24409576714420297, "grad_norm": 0.5360807776451111, "learning_rate": 9.005126491367506e-06, "loss": 0.03666488826274872, "memory(GiB)": 21.32, "step": 7514, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.947823 }, { "epoch": 0.2441282526069584, "grad_norm": 2.5459144115448, "learning_rate": 9.00480491212783e-06, "loss": 0.04668736457824707, "memory(GiB)": 21.32, "step": 7515, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.947846 }, { "epoch": 0.2441607380697138, "grad_norm": 0.44234055280685425, "learning_rate": 9.004483286667376e-06, "loss": 0.034136444330215454, "memory(GiB)": 21.32, "step": 7516, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.947869 }, { "epoch": 0.24419322353246922, "grad_norm": 0.5049602389335632, "learning_rate": 9.004161614989853e-06, "loss": 0.0406075119972229, "memory(GiB)": 21.32, "step": 7517, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.947891 }, { "epoch": 0.24422570899522464, "grad_norm": 0.5513599514961243, "learning_rate": 9.003839897098978e-06, "loss": 0.034995727241039276, "memory(GiB)": 21.32, "step": 7518, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.947914 }, { "epoch": 0.24425819445798005, "grad_norm": 0.5556125640869141, "learning_rate": 9.003518132998459e-06, "loss": 0.04395300894975662, "memory(GiB)": 21.32, "step": 7519, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.947935 }, { "epoch": 0.24429067992073547, "grad_norm": 0.6873027086257935, "learning_rate": 9.003196322692014e-06, "loss": 0.036062728613615036, "memory(GiB)": 21.32, "step": 7520, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.947925 }, { "epoch": 0.2443231653834909, "grad_norm": 0.5490449666976929, "learning_rate": 9.002874466183353e-06, "loss": 0.049315195530653, "memory(GiB)": 21.32, "step": 7521, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.947948 }, { "epoch": 0.2443556508462463, "grad_norm": 0.7184879183769226, "learning_rate": 9.002552563476194e-06, "loss": 0.04176446050405502, "memory(GiB)": 21.32, "step": 7522, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.947972 }, { "epoch": 0.24438813630900172, "grad_norm": 0.7282605767250061, "learning_rate": 9.002230614574252e-06, "loss": 0.05636809393763542, "memory(GiB)": 21.32, "step": 7523, "token_acc": 0.9575471698113207, "train_speed(iter/s)": 0.947992 }, { "epoch": 0.24442062177175713, "grad_norm": 0.6241411566734314, "learning_rate": 9.00190861948124e-06, "loss": 0.04257424920797348, "memory(GiB)": 21.32, "step": 7524, "token_acc": 0.985, "train_speed(iter/s)": 0.948013 }, { "epoch": 0.24445310723451255, "grad_norm": 0.4882113039493561, "learning_rate": 9.001586578200879e-06, "loss": 0.02960309386253357, "memory(GiB)": 21.32, "step": 7525, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.948034 }, { "epoch": 0.24448559269726797, "grad_norm": 0.7634349465370178, "learning_rate": 9.001264490736879e-06, "loss": 0.029927365481853485, "memory(GiB)": 21.32, "step": 7526, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.948053 }, { "epoch": 0.24451807816002338, "grad_norm": 0.584183931350708, "learning_rate": 9.000942357092963e-06, "loss": 0.03718949854373932, "memory(GiB)": 21.32, "step": 7527, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.948077 }, { "epoch": 0.2445505636227788, "grad_norm": 0.3848309814929962, "learning_rate": 9.000620177272847e-06, "loss": 0.032751306891441345, "memory(GiB)": 21.32, "step": 7528, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.9481 }, { "epoch": 0.24458304908553422, "grad_norm": 0.4272174537181854, "learning_rate": 9.000297951280251e-06, "loss": 0.030981572344899178, "memory(GiB)": 21.32, "step": 7529, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.948124 }, { "epoch": 0.24461553454828963, "grad_norm": 0.5938199758529663, "learning_rate": 8.99997567911889e-06, "loss": 0.03452552855014801, "memory(GiB)": 21.32, "step": 7530, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.948147 }, { "epoch": 0.24464802001104505, "grad_norm": 0.6510077118873596, "learning_rate": 8.999653360792486e-06, "loss": 0.03655529022216797, "memory(GiB)": 21.32, "step": 7531, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.948168 }, { "epoch": 0.24468050547380046, "grad_norm": 0.4603886902332306, "learning_rate": 8.999330996304758e-06, "loss": 0.03591587767004967, "memory(GiB)": 21.32, "step": 7532, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.948187 }, { "epoch": 0.24471299093655588, "grad_norm": 0.5488660335540771, "learning_rate": 8.999008585659429e-06, "loss": 0.03775887191295624, "memory(GiB)": 21.32, "step": 7533, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.948206 }, { "epoch": 0.2447454763993113, "grad_norm": 0.551720380783081, "learning_rate": 8.998686128860217e-06, "loss": 0.04545275866985321, "memory(GiB)": 21.32, "step": 7534, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.948227 }, { "epoch": 0.2447779618620667, "grad_norm": 0.5110777020454407, "learning_rate": 8.998363625910845e-06, "loss": 0.030361704528331757, "memory(GiB)": 21.32, "step": 7535, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.948248 }, { "epoch": 0.24481044732482216, "grad_norm": 0.6730523705482483, "learning_rate": 8.998041076815034e-06, "loss": 0.03565496951341629, "memory(GiB)": 21.32, "step": 7536, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.948269 }, { "epoch": 0.24484293278757757, "grad_norm": 0.4783165156841278, "learning_rate": 8.997718481576509e-06, "loss": 0.03440499305725098, "memory(GiB)": 21.32, "step": 7537, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.948293 }, { "epoch": 0.244875418250333, "grad_norm": 0.5620691776275635, "learning_rate": 8.997395840198992e-06, "loss": 0.03775790333747864, "memory(GiB)": 21.32, "step": 7538, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.948318 }, { "epoch": 0.2449079037130884, "grad_norm": 0.38980725407600403, "learning_rate": 8.997073152686205e-06, "loss": 0.03443695977330208, "memory(GiB)": 21.32, "step": 7539, "token_acc": 0.988, "train_speed(iter/s)": 0.948344 }, { "epoch": 0.24494038917584382, "grad_norm": 0.6890510320663452, "learning_rate": 8.996750419041875e-06, "loss": 0.03943527862429619, "memory(GiB)": 21.32, "step": 7540, "token_acc": 0.9863945578231292, "train_speed(iter/s)": 0.948369 }, { "epoch": 0.24497287463859924, "grad_norm": 0.6715010404586792, "learning_rate": 8.996427639269724e-06, "loss": 0.049269918352365494, "memory(GiB)": 21.32, "step": 7541, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.948394 }, { "epoch": 0.24500536010135465, "grad_norm": 0.4254172742366791, "learning_rate": 8.99610481337348e-06, "loss": 0.023767482489347458, "memory(GiB)": 21.32, "step": 7542, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.948422 }, { "epoch": 0.24503784556411007, "grad_norm": 0.5212367177009583, "learning_rate": 8.995781941356868e-06, "loss": 0.04025941342115402, "memory(GiB)": 21.32, "step": 7543, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.948451 }, { "epoch": 0.24507033102686548, "grad_norm": 0.6147709488868713, "learning_rate": 8.995459023223614e-06, "loss": 0.05345839262008667, "memory(GiB)": 21.32, "step": 7544, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.948479 }, { "epoch": 0.2451028164896209, "grad_norm": 0.5937173366546631, "learning_rate": 8.995136058977445e-06, "loss": 0.04849373549222946, "memory(GiB)": 21.32, "step": 7545, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.948507 }, { "epoch": 0.24513530195237632, "grad_norm": 1.1673848628997803, "learning_rate": 8.994813048622088e-06, "loss": 0.036092691123485565, "memory(GiB)": 21.32, "step": 7546, "token_acc": 0.976, "train_speed(iter/s)": 0.948529 }, { "epoch": 0.24516778741513173, "grad_norm": 0.5961156487464905, "learning_rate": 8.99448999216127e-06, "loss": 0.043079763650894165, "memory(GiB)": 21.32, "step": 7547, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.94855 }, { "epoch": 0.24520027287788715, "grad_norm": 0.6719121336936951, "learning_rate": 8.994166889598723e-06, "loss": 0.046491820365190506, "memory(GiB)": 21.32, "step": 7548, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.94857 }, { "epoch": 0.24523275834064256, "grad_norm": 0.5426157116889954, "learning_rate": 8.993843740938174e-06, "loss": 0.04103901982307434, "memory(GiB)": 21.32, "step": 7549, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.948592 }, { "epoch": 0.24526524380339798, "grad_norm": 0.4882114827632904, "learning_rate": 8.993520546183349e-06, "loss": 0.03743036091327667, "memory(GiB)": 21.32, "step": 7550, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.948613 }, { "epoch": 0.2452977292661534, "grad_norm": 0.40510544180870056, "learning_rate": 8.993197305337985e-06, "loss": 0.03191174566745758, "memory(GiB)": 21.32, "step": 7551, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.948633 }, { "epoch": 0.2453302147289088, "grad_norm": 0.6825124025344849, "learning_rate": 8.992874018405808e-06, "loss": 0.05045650899410248, "memory(GiB)": 21.32, "step": 7552, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.948655 }, { "epoch": 0.24536270019166423, "grad_norm": 0.5868504047393799, "learning_rate": 8.99255068539055e-06, "loss": 0.03844241052865982, "memory(GiB)": 21.32, "step": 7553, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.948675 }, { "epoch": 0.24539518565441965, "grad_norm": 0.4959976375102997, "learning_rate": 8.992227306295944e-06, "loss": 0.04022825509309769, "memory(GiB)": 21.32, "step": 7554, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.948694 }, { "epoch": 0.24542767111717506, "grad_norm": 0.4471580982208252, "learning_rate": 8.991903881125721e-06, "loss": 0.042559534311294556, "memory(GiB)": 21.32, "step": 7555, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.948713 }, { "epoch": 0.24546015657993048, "grad_norm": 0.4267451763153076, "learning_rate": 8.991580409883613e-06, "loss": 0.04661772400140762, "memory(GiB)": 21.32, "step": 7556, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.948735 }, { "epoch": 0.2454926420426859, "grad_norm": 0.42289796471595764, "learning_rate": 8.991256892573355e-06, "loss": 0.03143620491027832, "memory(GiB)": 21.32, "step": 7557, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.948755 }, { "epoch": 0.2455251275054413, "grad_norm": 0.6171623468399048, "learning_rate": 8.99093332919868e-06, "loss": 0.049129046499729156, "memory(GiB)": 21.32, "step": 7558, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.948776 }, { "epoch": 0.24555761296819673, "grad_norm": 1.061901569366455, "learning_rate": 8.990609719763323e-06, "loss": 0.05108477920293808, "memory(GiB)": 21.32, "step": 7559, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.948795 }, { "epoch": 0.24559009843095214, "grad_norm": 0.4105742871761322, "learning_rate": 8.990286064271018e-06, "loss": 0.03455306589603424, "memory(GiB)": 21.32, "step": 7560, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.948817 }, { "epoch": 0.24562258389370756, "grad_norm": 0.36165815591812134, "learning_rate": 8.9899623627255e-06, "loss": 0.033749762922525406, "memory(GiB)": 21.32, "step": 7561, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.948839 }, { "epoch": 0.24565506935646297, "grad_norm": 0.4474756121635437, "learning_rate": 8.989638615130507e-06, "loss": 0.03718363493680954, "memory(GiB)": 21.32, "step": 7562, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.94886 }, { "epoch": 0.2456875548192184, "grad_norm": 0.4799833595752716, "learning_rate": 8.989314821489773e-06, "loss": 0.03428000956773758, "memory(GiB)": 21.32, "step": 7563, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.948881 }, { "epoch": 0.2457200402819738, "grad_norm": 0.3447379171848297, "learning_rate": 8.988990981807037e-06, "loss": 0.036091454327106476, "memory(GiB)": 21.32, "step": 7564, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.948903 }, { "epoch": 0.24575252574472922, "grad_norm": 0.5630391240119934, "learning_rate": 8.988667096086034e-06, "loss": 0.039520472288131714, "memory(GiB)": 21.32, "step": 7565, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.948926 }, { "epoch": 0.24578501120748464, "grad_norm": 0.658766508102417, "learning_rate": 8.988343164330507e-06, "loss": 0.03781166672706604, "memory(GiB)": 21.32, "step": 7566, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.948953 }, { "epoch": 0.24581749667024005, "grad_norm": 0.585665225982666, "learning_rate": 8.988019186544188e-06, "loss": 0.03231263533234596, "memory(GiB)": 21.32, "step": 7567, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.948979 }, { "epoch": 0.2458499821329955, "grad_norm": 0.48630958795547485, "learning_rate": 8.987695162730823e-06, "loss": 0.032714344561100006, "memory(GiB)": 21.32, "step": 7568, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.949007 }, { "epoch": 0.24588246759575091, "grad_norm": 0.6180503964424133, "learning_rate": 8.987371092894146e-06, "loss": 0.0422840341925621, "memory(GiB)": 21.32, "step": 7569, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.949035 }, { "epoch": 0.24591495305850633, "grad_norm": 0.7402946949005127, "learning_rate": 8.987046977037898e-06, "loss": 0.04289330169558525, "memory(GiB)": 21.32, "step": 7570, "token_acc": 0.9727626459143969, "train_speed(iter/s)": 0.949063 }, { "epoch": 0.24594743852126175, "grad_norm": 0.5761430859565735, "learning_rate": 8.986722815165825e-06, "loss": 0.03850289434194565, "memory(GiB)": 21.32, "step": 7571, "token_acc": 1.0, "train_speed(iter/s)": 0.949089 }, { "epoch": 0.24597992398401716, "grad_norm": 0.48875948786735535, "learning_rate": 8.986398607281661e-06, "loss": 0.03274700045585632, "memory(GiB)": 21.32, "step": 7572, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.949115 }, { "epoch": 0.24601240944677258, "grad_norm": 0.7592548727989197, "learning_rate": 8.986074353389153e-06, "loss": 0.041085414588451385, "memory(GiB)": 21.32, "step": 7573, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.949142 }, { "epoch": 0.246044894909528, "grad_norm": 0.5321835875511169, "learning_rate": 8.985750053492041e-06, "loss": 0.03763286769390106, "memory(GiB)": 21.32, "step": 7574, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.94917 }, { "epoch": 0.2460773803722834, "grad_norm": 0.5105628967285156, "learning_rate": 8.985425707594069e-06, "loss": 0.03518601134419441, "memory(GiB)": 21.32, "step": 7575, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.949198 }, { "epoch": 0.24610986583503883, "grad_norm": 0.4216194152832031, "learning_rate": 8.985101315698979e-06, "loss": 0.0319080650806427, "memory(GiB)": 21.32, "step": 7576, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.949225 }, { "epoch": 0.24614235129779424, "grad_norm": 0.5276411771774292, "learning_rate": 8.984776877810515e-06, "loss": 0.03476938605308533, "memory(GiB)": 21.32, "step": 7577, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.949247 }, { "epoch": 0.24617483676054966, "grad_norm": 0.7141724824905396, "learning_rate": 8.984452393932424e-06, "loss": 0.04167152941226959, "memory(GiB)": 21.32, "step": 7578, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.94927 }, { "epoch": 0.24620732222330508, "grad_norm": 0.5999066829681396, "learning_rate": 8.984127864068447e-06, "loss": 0.03772183880209923, "memory(GiB)": 21.32, "step": 7579, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.949293 }, { "epoch": 0.2462398076860605, "grad_norm": 0.7198658585548401, "learning_rate": 8.983803288222332e-06, "loss": 0.04191812127828598, "memory(GiB)": 21.32, "step": 7580, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.949315 }, { "epoch": 0.2462722931488159, "grad_norm": 0.4443560540676117, "learning_rate": 8.983478666397828e-06, "loss": 0.03178313747048378, "memory(GiB)": 21.32, "step": 7581, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.949337 }, { "epoch": 0.24630477861157132, "grad_norm": 0.2901063561439514, "learning_rate": 8.983153998598674e-06, "loss": 0.023065846413373947, "memory(GiB)": 21.32, "step": 7582, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.949359 }, { "epoch": 0.24633726407432674, "grad_norm": 0.712874710559845, "learning_rate": 8.982829284828622e-06, "loss": 0.040616512298583984, "memory(GiB)": 21.32, "step": 7583, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.949381 }, { "epoch": 0.24636974953708216, "grad_norm": 0.5418204069137573, "learning_rate": 8.98250452509142e-06, "loss": 0.03795962780714035, "memory(GiB)": 21.32, "step": 7584, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.949404 }, { "epoch": 0.24640223499983757, "grad_norm": 0.7040104269981384, "learning_rate": 8.982179719390814e-06, "loss": 0.03841809183359146, "memory(GiB)": 21.32, "step": 7585, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.949425 }, { "epoch": 0.246434720462593, "grad_norm": 0.49601835012435913, "learning_rate": 8.981854867730555e-06, "loss": 0.046201009303331375, "memory(GiB)": 21.32, "step": 7586, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.949448 }, { "epoch": 0.2464672059253484, "grad_norm": 0.5151963233947754, "learning_rate": 8.981529970114391e-06, "loss": 0.03252646327018738, "memory(GiB)": 21.32, "step": 7587, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.949468 }, { "epoch": 0.24649969138810382, "grad_norm": 0.47981348633766174, "learning_rate": 8.98120502654607e-06, "loss": 0.038523513823747635, "memory(GiB)": 21.32, "step": 7588, "token_acc": 0.9855595667870036, "train_speed(iter/s)": 0.94949 }, { "epoch": 0.24653217685085924, "grad_norm": 0.5036730170249939, "learning_rate": 8.980880037029346e-06, "loss": 0.0423850417137146, "memory(GiB)": 21.32, "step": 7589, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.94951 }, { "epoch": 0.24656466231361465, "grad_norm": 0.3863353431224823, "learning_rate": 8.980555001567968e-06, "loss": 0.0362715944647789, "memory(GiB)": 21.32, "step": 7590, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.949532 }, { "epoch": 0.24659714777637007, "grad_norm": 0.5486142635345459, "learning_rate": 8.980229920165686e-06, "loss": 0.03994258493185043, "memory(GiB)": 21.32, "step": 7591, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.949556 }, { "epoch": 0.24662963323912548, "grad_norm": 0.4547741711139679, "learning_rate": 8.979904792826253e-06, "loss": 0.0354544073343277, "memory(GiB)": 21.32, "step": 7592, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.949578 }, { "epoch": 0.2466621187018809, "grad_norm": 0.514622688293457, "learning_rate": 8.97957961955342e-06, "loss": 0.03980576992034912, "memory(GiB)": 21.32, "step": 7593, "token_acc": 0.9609929078014184, "train_speed(iter/s)": 0.949601 }, { "epoch": 0.24669460416463632, "grad_norm": 0.5992196798324585, "learning_rate": 8.979254400350944e-06, "loss": 0.037525203078985214, "memory(GiB)": 21.32, "step": 7594, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.949624 }, { "epoch": 0.24672708962739173, "grad_norm": 0.5717490911483765, "learning_rate": 8.978929135222575e-06, "loss": 0.03497312590479851, "memory(GiB)": 21.32, "step": 7595, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.949646 }, { "epoch": 0.24675957509014715, "grad_norm": 0.48199138045310974, "learning_rate": 8.978603824172067e-06, "loss": 0.04129931703209877, "memory(GiB)": 21.32, "step": 7596, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.949664 }, { "epoch": 0.24679206055290256, "grad_norm": 0.5345404744148254, "learning_rate": 8.978278467203177e-06, "loss": 0.044366609305143356, "memory(GiB)": 21.32, "step": 7597, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.949686 }, { "epoch": 0.24682454601565798, "grad_norm": 0.4672122597694397, "learning_rate": 8.977953064319658e-06, "loss": 0.039614319801330566, "memory(GiB)": 21.32, "step": 7598, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.949709 }, { "epoch": 0.2468570314784134, "grad_norm": 0.4225817024707794, "learning_rate": 8.977627615525266e-06, "loss": 0.0359019935131073, "memory(GiB)": 21.32, "step": 7599, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.949732 }, { "epoch": 0.24688951694116884, "grad_norm": 0.7530979514122009, "learning_rate": 8.977302120823756e-06, "loss": 0.04237637296319008, "memory(GiB)": 21.32, "step": 7600, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.949754 }, { "epoch": 0.24692200240392426, "grad_norm": 0.44210124015808105, "learning_rate": 8.976976580218885e-06, "loss": 0.03589325398206711, "memory(GiB)": 21.32, "step": 7601, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.949772 }, { "epoch": 0.24695448786667967, "grad_norm": 0.6354369521141052, "learning_rate": 8.976650993714413e-06, "loss": 0.0386243499815464, "memory(GiB)": 21.32, "step": 7602, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.9498 }, { "epoch": 0.2469869733294351, "grad_norm": 0.75995934009552, "learning_rate": 8.976325361314095e-06, "loss": 0.05126060172915459, "memory(GiB)": 21.32, "step": 7603, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.949826 }, { "epoch": 0.2470194587921905, "grad_norm": 0.4819768965244293, "learning_rate": 8.97599968302169e-06, "loss": 0.03300883248448372, "memory(GiB)": 21.32, "step": 7604, "token_acc": 0.9869281045751634, "train_speed(iter/s)": 0.949851 }, { "epoch": 0.24705194425494592, "grad_norm": 0.45341506600379944, "learning_rate": 8.975673958840956e-06, "loss": 0.036499977111816406, "memory(GiB)": 21.32, "step": 7605, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.949876 }, { "epoch": 0.24708442971770134, "grad_norm": 0.365923672914505, "learning_rate": 8.975348188775654e-06, "loss": 0.03380195423960686, "memory(GiB)": 21.32, "step": 7606, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.949897 }, { "epoch": 0.24711691518045675, "grad_norm": 0.5076944231987, "learning_rate": 8.975022372829542e-06, "loss": 0.04256496578454971, "memory(GiB)": 21.32, "step": 7607, "token_acc": 0.968, "train_speed(iter/s)": 0.949919 }, { "epoch": 0.24714940064321217, "grad_norm": 0.6023056507110596, "learning_rate": 8.974696511006381e-06, "loss": 0.03590422868728638, "memory(GiB)": 21.32, "step": 7608, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.949939 }, { "epoch": 0.24718188610596759, "grad_norm": 0.5500745177268982, "learning_rate": 8.974370603309931e-06, "loss": 0.038877859711647034, "memory(GiB)": 21.32, "step": 7609, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.949961 }, { "epoch": 0.247214371568723, "grad_norm": 0.6305720806121826, "learning_rate": 8.974044649743955e-06, "loss": 0.051683180034160614, "memory(GiB)": 21.32, "step": 7610, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.949978 }, { "epoch": 0.24724685703147842, "grad_norm": 0.4935586452484131, "learning_rate": 8.973718650312216e-06, "loss": 0.036541253328323364, "memory(GiB)": 21.32, "step": 7611, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.949999 }, { "epoch": 0.24727934249423383, "grad_norm": 0.4211421608924866, "learning_rate": 8.973392605018474e-06, "loss": 0.037690628319978714, "memory(GiB)": 21.32, "step": 7612, "token_acc": 1.0, "train_speed(iter/s)": 0.95002 }, { "epoch": 0.24731182795698925, "grad_norm": 0.41173020005226135, "learning_rate": 8.973066513866492e-06, "loss": 0.03819335624575615, "memory(GiB)": 21.32, "step": 7613, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.950038 }, { "epoch": 0.24734431341974467, "grad_norm": 0.5234086513519287, "learning_rate": 8.972740376860034e-06, "loss": 0.04440288990736008, "memory(GiB)": 21.32, "step": 7614, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.950058 }, { "epoch": 0.24737679888250008, "grad_norm": 0.3083139657974243, "learning_rate": 8.972414194002865e-06, "loss": 0.03143078461289406, "memory(GiB)": 21.32, "step": 7615, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.95008 }, { "epoch": 0.2474092843452555, "grad_norm": 0.5392047166824341, "learning_rate": 8.97208796529875e-06, "loss": 0.03758327662944794, "memory(GiB)": 21.32, "step": 7616, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950101 }, { "epoch": 0.24744176980801091, "grad_norm": 0.5447059869766235, "learning_rate": 8.971761690751451e-06, "loss": 0.041142478585243225, "memory(GiB)": 21.32, "step": 7617, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.950121 }, { "epoch": 0.24747425527076633, "grad_norm": 0.2904743254184723, "learning_rate": 8.971435370364735e-06, "loss": 0.023880984634160995, "memory(GiB)": 21.32, "step": 7618, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.950142 }, { "epoch": 0.24750674073352175, "grad_norm": 0.43288975954055786, "learning_rate": 8.971109004142371e-06, "loss": 0.03556782007217407, "memory(GiB)": 21.32, "step": 7619, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.950162 }, { "epoch": 0.24753922619627716, "grad_norm": 0.8965774178504944, "learning_rate": 8.970782592088124e-06, "loss": 0.03369036316871643, "memory(GiB)": 21.32, "step": 7620, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.950182 }, { "epoch": 0.24757171165903258, "grad_norm": 0.5012972354888916, "learning_rate": 8.97045613420576e-06, "loss": 0.04663274437189102, "memory(GiB)": 21.32, "step": 7621, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.950204 }, { "epoch": 0.247604197121788, "grad_norm": 0.355179101228714, "learning_rate": 8.970129630499047e-06, "loss": 0.028809912502765656, "memory(GiB)": 21.32, "step": 7622, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.950225 }, { "epoch": 0.2476366825845434, "grad_norm": 0.4116269648075104, "learning_rate": 8.969803080971754e-06, "loss": 0.036164477467536926, "memory(GiB)": 21.32, "step": 7623, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.950245 }, { "epoch": 0.24766916804729883, "grad_norm": 0.3749864399433136, "learning_rate": 8.96947648562765e-06, "loss": 0.029364027082920074, "memory(GiB)": 21.32, "step": 7624, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.950266 }, { "epoch": 0.24770165351005424, "grad_norm": 0.5017833113670349, "learning_rate": 8.969149844470504e-06, "loss": 0.03237725794315338, "memory(GiB)": 21.32, "step": 7625, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.950294 }, { "epoch": 0.24773413897280966, "grad_norm": 0.5411396622657776, "learning_rate": 8.968823157504085e-06, "loss": 0.032857805490493774, "memory(GiB)": 21.32, "step": 7626, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.950321 }, { "epoch": 0.24776662443556507, "grad_norm": 0.4342491030693054, "learning_rate": 8.968496424732165e-06, "loss": 0.034836798906326294, "memory(GiB)": 21.32, "step": 7627, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.950348 }, { "epoch": 0.2477991098983205, "grad_norm": 0.4615084230899811, "learning_rate": 8.968169646158515e-06, "loss": 0.02936842292547226, "memory(GiB)": 21.32, "step": 7628, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.950375 }, { "epoch": 0.2478315953610759, "grad_norm": 0.5086002349853516, "learning_rate": 8.967842821786906e-06, "loss": 0.04461515694856644, "memory(GiB)": 21.32, "step": 7629, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.950401 }, { "epoch": 0.24786408082383132, "grad_norm": 0.4617091417312622, "learning_rate": 8.967515951621106e-06, "loss": 0.026228390634059906, "memory(GiB)": 21.32, "step": 7630, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95043 }, { "epoch": 0.24789656628658674, "grad_norm": 0.41620248556137085, "learning_rate": 8.967189035664894e-06, "loss": 0.0315575897693634, "memory(GiB)": 21.32, "step": 7631, "token_acc": 0.98, "train_speed(iter/s)": 0.950457 }, { "epoch": 0.24792905174934218, "grad_norm": 0.6246848702430725, "learning_rate": 8.96686207392204e-06, "loss": 0.054737936705350876, "memory(GiB)": 21.32, "step": 7632, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950486 }, { "epoch": 0.2479615372120976, "grad_norm": 0.637798011302948, "learning_rate": 8.966535066396318e-06, "loss": 0.042923279106616974, "memory(GiB)": 21.32, "step": 7633, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.950511 }, { "epoch": 0.24799402267485302, "grad_norm": 0.7060808539390564, "learning_rate": 8.966208013091502e-06, "loss": 0.04339785873889923, "memory(GiB)": 21.32, "step": 7634, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.950538 }, { "epoch": 0.24802650813760843, "grad_norm": 0.5301503539085388, "learning_rate": 8.965880914011365e-06, "loss": 0.032091349363327026, "memory(GiB)": 21.32, "step": 7635, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.950566 }, { "epoch": 0.24805899360036385, "grad_norm": 0.5180854797363281, "learning_rate": 8.965553769159685e-06, "loss": 0.036958497017621994, "memory(GiB)": 21.32, "step": 7636, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.950592 }, { "epoch": 0.24809147906311926, "grad_norm": 0.5330132842063904, "learning_rate": 8.965226578540237e-06, "loss": 0.03855667635798454, "memory(GiB)": 21.32, "step": 7637, "token_acc": 0.98828125, "train_speed(iter/s)": 0.950619 }, { "epoch": 0.24812396452587468, "grad_norm": 0.5580753684043884, "learning_rate": 8.964899342156794e-06, "loss": 0.03545572608709335, "memory(GiB)": 21.32, "step": 7638, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.950646 }, { "epoch": 0.2481564499886301, "grad_norm": 0.35992318391799927, "learning_rate": 8.964572060013137e-06, "loss": 0.028034338727593422, "memory(GiB)": 21.32, "step": 7639, "token_acc": 1.0, "train_speed(iter/s)": 0.950674 }, { "epoch": 0.2481889354513855, "grad_norm": 0.4355109930038452, "learning_rate": 8.964244732113041e-06, "loss": 0.04572362080216408, "memory(GiB)": 21.32, "step": 7640, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.950702 }, { "epoch": 0.24822142091414093, "grad_norm": 0.5544381141662598, "learning_rate": 8.963917358460285e-06, "loss": 0.03591307997703552, "memory(GiB)": 21.32, "step": 7641, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.950727 }, { "epoch": 0.24825390637689634, "grad_norm": 0.4408154785633087, "learning_rate": 8.963589939058649e-06, "loss": 0.04144895821809769, "memory(GiB)": 21.32, "step": 7642, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.950749 }, { "epoch": 0.24828639183965176, "grad_norm": 0.48461851477622986, "learning_rate": 8.963262473911905e-06, "loss": 0.036408670246601105, "memory(GiB)": 21.32, "step": 7643, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.950772 }, { "epoch": 0.24831887730240718, "grad_norm": 0.49250757694244385, "learning_rate": 8.96293496302384e-06, "loss": 0.03815970942378044, "memory(GiB)": 21.32, "step": 7644, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.950794 }, { "epoch": 0.2483513627651626, "grad_norm": 1.4735313653945923, "learning_rate": 8.96260740639823e-06, "loss": 0.03806355595588684, "memory(GiB)": 21.32, "step": 7645, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.950815 }, { "epoch": 0.248383848227918, "grad_norm": 0.5547328591346741, "learning_rate": 8.962279804038856e-06, "loss": 0.033028535544872284, "memory(GiB)": 21.32, "step": 7646, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.950838 }, { "epoch": 0.24841633369067342, "grad_norm": 0.4340195953845978, "learning_rate": 8.961952155949501e-06, "loss": 0.03236190602183342, "memory(GiB)": 21.32, "step": 7647, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.950859 }, { "epoch": 0.24844881915342884, "grad_norm": 0.46092602610588074, "learning_rate": 8.961624462133942e-06, "loss": 0.039884380996227264, "memory(GiB)": 21.32, "step": 7648, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.950881 }, { "epoch": 0.24848130461618426, "grad_norm": 0.41537028551101685, "learning_rate": 8.961296722595966e-06, "loss": 0.031177416443824768, "memory(GiB)": 21.32, "step": 7649, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.950904 }, { "epoch": 0.24851379007893967, "grad_norm": 0.47871431708335876, "learning_rate": 8.960968937339352e-06, "loss": 0.032553285360336304, "memory(GiB)": 21.32, "step": 7650, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.950926 }, { "epoch": 0.2485462755416951, "grad_norm": 0.45106732845306396, "learning_rate": 8.960641106367886e-06, "loss": 0.03323611989617348, "memory(GiB)": 21.32, "step": 7651, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.950948 }, { "epoch": 0.2485787610044505, "grad_norm": 0.5625671744346619, "learning_rate": 8.960313229685349e-06, "loss": 0.04354231059551239, "memory(GiB)": 21.32, "step": 7652, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.950968 }, { "epoch": 0.24861124646720592, "grad_norm": 0.3967166543006897, "learning_rate": 8.959985307295526e-06, "loss": 0.02997475489974022, "memory(GiB)": 21.32, "step": 7653, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.950988 }, { "epoch": 0.24864373192996134, "grad_norm": 0.3504618704319, "learning_rate": 8.959657339202201e-06, "loss": 0.029794882982969284, "memory(GiB)": 21.32, "step": 7654, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95101 }, { "epoch": 0.24867621739271675, "grad_norm": 0.4149797260761261, "learning_rate": 8.959329325409162e-06, "loss": 0.027960117906332016, "memory(GiB)": 21.32, "step": 7655, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.951031 }, { "epoch": 0.24870870285547217, "grad_norm": 0.540200412273407, "learning_rate": 8.959001265920191e-06, "loss": 0.037484005093574524, "memory(GiB)": 21.32, "step": 7656, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.951053 }, { "epoch": 0.24874118831822759, "grad_norm": 0.6067160964012146, "learning_rate": 8.958673160739078e-06, "loss": 0.03872101381421089, "memory(GiB)": 21.32, "step": 7657, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.951075 }, { "epoch": 0.248773673780983, "grad_norm": 0.44361573457717896, "learning_rate": 8.958345009869606e-06, "loss": 0.03328673169016838, "memory(GiB)": 21.32, "step": 7658, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.951098 }, { "epoch": 0.24880615924373842, "grad_norm": 0.6345771551132202, "learning_rate": 8.958016813315564e-06, "loss": 0.04215700179338455, "memory(GiB)": 21.32, "step": 7659, "token_acc": 0.986159169550173, "train_speed(iter/s)": 0.95112 }, { "epoch": 0.24883864470649383, "grad_norm": 0.5018633604049683, "learning_rate": 8.95768857108074e-06, "loss": 0.031095709651708603, "memory(GiB)": 21.32, "step": 7660, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.951137 }, { "epoch": 0.24887113016924925, "grad_norm": 0.4313238263130188, "learning_rate": 8.957360283168924e-06, "loss": 0.03574816882610321, "memory(GiB)": 21.32, "step": 7661, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.951158 }, { "epoch": 0.24890361563200467, "grad_norm": 0.639018177986145, "learning_rate": 8.957031949583901e-06, "loss": 0.0359615758061409, "memory(GiB)": 21.32, "step": 7662, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.951178 }, { "epoch": 0.24893610109476008, "grad_norm": 0.6171826124191284, "learning_rate": 8.956703570329464e-06, "loss": 0.039901189506053925, "memory(GiB)": 21.32, "step": 7663, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.951199 }, { "epoch": 0.24896858655751553, "grad_norm": 0.49679678678512573, "learning_rate": 8.9563751454094e-06, "loss": 0.03427698090672493, "memory(GiB)": 21.32, "step": 7664, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.951222 }, { "epoch": 0.24900107202027094, "grad_norm": 0.5534296631813049, "learning_rate": 8.956046674827501e-06, "loss": 0.03476768359541893, "memory(GiB)": 21.32, "step": 7665, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.951242 }, { "epoch": 0.24903355748302636, "grad_norm": 0.5013315677642822, "learning_rate": 8.955718158587558e-06, "loss": 0.04182682931423187, "memory(GiB)": 21.32, "step": 7666, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.951261 }, { "epoch": 0.24906604294578177, "grad_norm": 0.4040589928627014, "learning_rate": 8.955389596693363e-06, "loss": 0.03004811890423298, "memory(GiB)": 21.32, "step": 7667, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.951282 }, { "epoch": 0.2490985284085372, "grad_norm": 0.5081174373626709, "learning_rate": 8.955060989148707e-06, "loss": 0.03290890157222748, "memory(GiB)": 21.32, "step": 7668, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.951304 }, { "epoch": 0.2491310138712926, "grad_norm": 0.8465278744697571, "learning_rate": 8.954732335957384e-06, "loss": 0.0381128303706646, "memory(GiB)": 21.32, "step": 7669, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.951324 }, { "epoch": 0.24916349933404802, "grad_norm": 0.5015246272087097, "learning_rate": 8.954403637123185e-06, "loss": 0.03268483281135559, "memory(GiB)": 21.32, "step": 7670, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.951342 }, { "epoch": 0.24919598479680344, "grad_norm": 0.508671760559082, "learning_rate": 8.954074892649905e-06, "loss": 0.03138347342610359, "memory(GiB)": 21.32, "step": 7671, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.951363 }, { "epoch": 0.24922847025955885, "grad_norm": 0.5257691144943237, "learning_rate": 8.953746102541339e-06, "loss": 0.03711891546845436, "memory(GiB)": 21.32, "step": 7672, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.951383 }, { "epoch": 0.24926095572231427, "grad_norm": 0.4474141001701355, "learning_rate": 8.953417266801278e-06, "loss": 0.03110678866505623, "memory(GiB)": 21.32, "step": 7673, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.951402 }, { "epoch": 0.2492934411850697, "grad_norm": 0.4465576708316803, "learning_rate": 8.953088385433521e-06, "loss": 0.03810478746891022, "memory(GiB)": 21.32, "step": 7674, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.951421 }, { "epoch": 0.2493259266478251, "grad_norm": 0.5008369088172913, "learning_rate": 8.952759458441864e-06, "loss": 0.03812457621097565, "memory(GiB)": 21.32, "step": 7675, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.95144 }, { "epoch": 0.24935841211058052, "grad_norm": 1.0756334066390991, "learning_rate": 8.952430485830102e-06, "loss": 0.03265271335840225, "memory(GiB)": 21.32, "step": 7676, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951456 }, { "epoch": 0.24939089757333593, "grad_norm": 0.6004572510719299, "learning_rate": 8.952101467602028e-06, "loss": 0.03847698122262955, "memory(GiB)": 21.32, "step": 7677, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.951475 }, { "epoch": 0.24942338303609135, "grad_norm": 0.5060548186302185, "learning_rate": 8.951772403761446e-06, "loss": 0.03857070952653885, "memory(GiB)": 21.32, "step": 7678, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.951495 }, { "epoch": 0.24945586849884677, "grad_norm": 0.2923884987831116, "learning_rate": 8.95144329431215e-06, "loss": 0.02359314262866974, "memory(GiB)": 21.32, "step": 7679, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.951512 }, { "epoch": 0.24948835396160218, "grad_norm": 0.43313801288604736, "learning_rate": 8.951114139257937e-06, "loss": 0.030809437856078148, "memory(GiB)": 21.32, "step": 7680, "token_acc": 0.992, "train_speed(iter/s)": 0.951529 }, { "epoch": 0.2495208394243576, "grad_norm": 0.41469186544418335, "learning_rate": 8.95078493860261e-06, "loss": 0.031318873167037964, "memory(GiB)": 21.32, "step": 7681, "token_acc": 1.0, "train_speed(iter/s)": 0.951548 }, { "epoch": 0.24955332488711301, "grad_norm": 0.6054946184158325, "learning_rate": 8.950455692349966e-06, "loss": 0.041161008179187775, "memory(GiB)": 21.32, "step": 7682, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.951569 }, { "epoch": 0.24958581034986843, "grad_norm": 0.5275658369064331, "learning_rate": 8.950126400503805e-06, "loss": 0.03463192284107208, "memory(GiB)": 21.32, "step": 7683, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.951589 }, { "epoch": 0.24961829581262385, "grad_norm": 0.5025089383125305, "learning_rate": 8.949797063067927e-06, "loss": 0.04793759062886238, "memory(GiB)": 21.32, "step": 7684, "token_acc": 0.98046875, "train_speed(iter/s)": 0.951611 }, { "epoch": 0.24965078127537926, "grad_norm": 0.48605310916900635, "learning_rate": 8.949467680046135e-06, "loss": 0.03646315634250641, "memory(GiB)": 21.32, "step": 7685, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951636 }, { "epoch": 0.24968326673813468, "grad_norm": 0.597394585609436, "learning_rate": 8.94913825144223e-06, "loss": 0.038191087543964386, "memory(GiB)": 21.32, "step": 7686, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.951664 }, { "epoch": 0.2497157522008901, "grad_norm": 0.5279771089553833, "learning_rate": 8.948808777260011e-06, "loss": 0.03334033489227295, "memory(GiB)": 21.32, "step": 7687, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95169 }, { "epoch": 0.2497482376636455, "grad_norm": 0.4077644646167755, "learning_rate": 8.948479257503285e-06, "loss": 0.03487452119588852, "memory(GiB)": 21.32, "step": 7688, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.951717 }, { "epoch": 0.24978072312640093, "grad_norm": 0.397706001996994, "learning_rate": 8.94814969217585e-06, "loss": 0.04033844545483589, "memory(GiB)": 21.32, "step": 7689, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.951744 }, { "epoch": 0.24981320858915634, "grad_norm": 0.6262274980545044, "learning_rate": 8.947820081281516e-06, "loss": 0.03823370486497879, "memory(GiB)": 21.32, "step": 7690, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.951772 }, { "epoch": 0.24984569405191176, "grad_norm": 0.41931378841400146, "learning_rate": 8.947490424824082e-06, "loss": 0.03360296040773392, "memory(GiB)": 21.32, "step": 7691, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.951799 }, { "epoch": 0.24987817951466718, "grad_norm": 0.6817886233329773, "learning_rate": 8.947160722807355e-06, "loss": 0.032523684203624725, "memory(GiB)": 21.32, "step": 7692, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.951826 }, { "epoch": 0.2499106649774226, "grad_norm": 0.6741359233856201, "learning_rate": 8.946830975235137e-06, "loss": 0.03962977975606918, "memory(GiB)": 21.32, "step": 7693, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.951853 }, { "epoch": 0.249943150440178, "grad_norm": 0.4491533935070038, "learning_rate": 8.94650118211124e-06, "loss": 0.03194594755768776, "memory(GiB)": 21.32, "step": 7694, "token_acc": 1.0, "train_speed(iter/s)": 0.95188 }, { "epoch": 0.24997563590293342, "grad_norm": 0.3817332684993744, "learning_rate": 8.946171343439464e-06, "loss": 0.033553242683410645, "memory(GiB)": 21.32, "step": 7695, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.951906 }, { "epoch": 0.25000812136568884, "grad_norm": 0.5932241678237915, "learning_rate": 8.94584145922362e-06, "loss": 0.042554400861263275, "memory(GiB)": 21.32, "step": 7696, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.951934 }, { "epoch": 0.25004060682844426, "grad_norm": 0.4895479679107666, "learning_rate": 8.945511529467513e-06, "loss": 0.028631681576371193, "memory(GiB)": 21.32, "step": 7697, "token_acc": 0.98828125, "train_speed(iter/s)": 0.951958 }, { "epoch": 0.2500730922911997, "grad_norm": 0.36683911085128784, "learning_rate": 8.945181554174951e-06, "loss": 0.032991573214530945, "memory(GiB)": 21.32, "step": 7698, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.951985 }, { "epoch": 0.2501055777539551, "grad_norm": 0.4022922217845917, "learning_rate": 8.944851533349744e-06, "loss": 0.025473708286881447, "memory(GiB)": 21.32, "step": 7699, "token_acc": 1.0, "train_speed(iter/s)": 0.952011 }, { "epoch": 0.2501380632167105, "grad_norm": 0.3694405257701874, "learning_rate": 8.9445214669957e-06, "loss": 0.03550654649734497, "memory(GiB)": 21.32, "step": 7700, "token_acc": 0.9826388888888888, "train_speed(iter/s)": 0.952039 }, { "epoch": 0.2501705486794659, "grad_norm": 0.48387816548347473, "learning_rate": 8.944191355116625e-06, "loss": 0.047727830708026886, "memory(GiB)": 21.32, "step": 7701, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.952065 }, { "epoch": 0.25020303414222134, "grad_norm": 0.37082409858703613, "learning_rate": 8.943861197716336e-06, "loss": 0.03208980709314346, "memory(GiB)": 21.32, "step": 7702, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.952092 }, { "epoch": 0.25023551960497675, "grad_norm": 0.37250009179115295, "learning_rate": 8.943530994798636e-06, "loss": 0.03389012813568115, "memory(GiB)": 21.32, "step": 7703, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.952117 }, { "epoch": 0.25026800506773217, "grad_norm": 0.6668117642402649, "learning_rate": 8.943200746367343e-06, "loss": 0.03792092204093933, "memory(GiB)": 21.32, "step": 7704, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.952138 }, { "epoch": 0.2503004905304876, "grad_norm": 0.7354569435119629, "learning_rate": 8.942870452426264e-06, "loss": 0.04203135892748833, "memory(GiB)": 21.32, "step": 7705, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.95216 }, { "epoch": 0.250332975993243, "grad_norm": 0.40111762285232544, "learning_rate": 8.94254011297921e-06, "loss": 0.02611645497381687, "memory(GiB)": 21.32, "step": 7706, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952183 }, { "epoch": 0.2503654614559984, "grad_norm": 0.742875337600708, "learning_rate": 8.942209728029999e-06, "loss": 0.04110298678278923, "memory(GiB)": 21.32, "step": 7707, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.952205 }, { "epoch": 0.25039794691875383, "grad_norm": 0.6887815594673157, "learning_rate": 8.941879297582436e-06, "loss": 0.03863590955734253, "memory(GiB)": 21.32, "step": 7708, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.952228 }, { "epoch": 0.25043043238150925, "grad_norm": 0.7411791086196899, "learning_rate": 8.941548821640341e-06, "loss": 0.04228191450238228, "memory(GiB)": 21.32, "step": 7709, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95225 }, { "epoch": 0.25046291784426467, "grad_norm": 0.6842263340950012, "learning_rate": 8.941218300207529e-06, "loss": 0.04686378687620163, "memory(GiB)": 21.32, "step": 7710, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952273 }, { "epoch": 0.2504954033070201, "grad_norm": 0.7544156908988953, "learning_rate": 8.94088773328781e-06, "loss": 0.03850708156824112, "memory(GiB)": 21.32, "step": 7711, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.952294 }, { "epoch": 0.25052788876977555, "grad_norm": 0.3729122281074524, "learning_rate": 8.940557120885003e-06, "loss": 0.028288936242461205, "memory(GiB)": 21.32, "step": 7712, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.952316 }, { "epoch": 0.25056037423253097, "grad_norm": 0.48760128021240234, "learning_rate": 8.94022646300292e-06, "loss": 0.03529641777276993, "memory(GiB)": 21.32, "step": 7713, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.952338 }, { "epoch": 0.2505928596952864, "grad_norm": 0.4149368107318878, "learning_rate": 8.93989575964538e-06, "loss": 0.034822992980480194, "memory(GiB)": 21.32, "step": 7714, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95236 }, { "epoch": 0.2506253451580418, "grad_norm": 0.5676212906837463, "learning_rate": 8.939565010816198e-06, "loss": 0.029956113547086716, "memory(GiB)": 21.32, "step": 7715, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.952381 }, { "epoch": 0.2506578306207972, "grad_norm": 0.3890349566936493, "learning_rate": 8.939234216519193e-06, "loss": 0.040572866797447205, "memory(GiB)": 21.32, "step": 7716, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.952403 }, { "epoch": 0.25069031608355263, "grad_norm": 0.43259555101394653, "learning_rate": 8.938903376758181e-06, "loss": 0.03796762600541115, "memory(GiB)": 21.32, "step": 7717, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.952423 }, { "epoch": 0.25072280154630805, "grad_norm": 0.3743220269680023, "learning_rate": 8.938572491536983e-06, "loss": 0.028751740232110023, "memory(GiB)": 21.32, "step": 7718, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.952444 }, { "epoch": 0.25075528700906347, "grad_norm": 0.46578970551490784, "learning_rate": 8.938241560859416e-06, "loss": 0.03829522803425789, "memory(GiB)": 21.32, "step": 7719, "token_acc": 0.984, "train_speed(iter/s)": 0.952467 }, { "epoch": 0.2507877724718189, "grad_norm": 0.4656694531440735, "learning_rate": 8.937910584729299e-06, "loss": 0.035125624388456345, "memory(GiB)": 21.32, "step": 7720, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.952489 }, { "epoch": 0.2508202579345743, "grad_norm": 0.47636985778808594, "learning_rate": 8.937579563150453e-06, "loss": 0.04175948351621628, "memory(GiB)": 21.32, "step": 7721, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.952512 }, { "epoch": 0.2508527433973297, "grad_norm": 0.5087583661079407, "learning_rate": 8.937248496126697e-06, "loss": 0.032034602016210556, "memory(GiB)": 21.32, "step": 7722, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952531 }, { "epoch": 0.25088522886008513, "grad_norm": 0.6061556935310364, "learning_rate": 8.936917383661855e-06, "loss": 0.040760211646556854, "memory(GiB)": 21.32, "step": 7723, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.952552 }, { "epoch": 0.25091771432284055, "grad_norm": 0.4014201760292053, "learning_rate": 8.936586225759742e-06, "loss": 0.036207061260938644, "memory(GiB)": 21.32, "step": 7724, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.952572 }, { "epoch": 0.25095019978559596, "grad_norm": 0.4168539345264435, "learning_rate": 8.936255022424189e-06, "loss": 0.03298970311880112, "memory(GiB)": 21.32, "step": 7725, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.95259 }, { "epoch": 0.2509826852483514, "grad_norm": 0.4281042814254761, "learning_rate": 8.935923773659013e-06, "loss": 0.036556512117385864, "memory(GiB)": 21.32, "step": 7726, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.952608 }, { "epoch": 0.2510151707111068, "grad_norm": 0.7134367823600769, "learning_rate": 8.935592479468036e-06, "loss": 0.0346948616206646, "memory(GiB)": 21.32, "step": 7727, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.952626 }, { "epoch": 0.2510476561738622, "grad_norm": 0.47582927346229553, "learning_rate": 8.935261139855084e-06, "loss": 0.04454002529382706, "memory(GiB)": 21.32, "step": 7728, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.952643 }, { "epoch": 0.2510801416366176, "grad_norm": 0.47408169507980347, "learning_rate": 8.934929754823981e-06, "loss": 0.03473515063524246, "memory(GiB)": 21.32, "step": 7729, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952663 }, { "epoch": 0.25111262709937304, "grad_norm": 0.3890876770019531, "learning_rate": 8.934598324378552e-06, "loss": 0.028025105595588684, "memory(GiB)": 21.32, "step": 7730, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.952684 }, { "epoch": 0.25114511256212846, "grad_norm": 0.43131422996520996, "learning_rate": 8.93426684852262e-06, "loss": 0.03450564295053482, "memory(GiB)": 21.32, "step": 7731, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.952705 }, { "epoch": 0.2511775980248839, "grad_norm": 0.3794797360897064, "learning_rate": 8.933935327260012e-06, "loss": 0.02808637171983719, "memory(GiB)": 21.32, "step": 7732, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.952723 }, { "epoch": 0.2512100834876393, "grad_norm": 0.44791993498802185, "learning_rate": 8.933603760594557e-06, "loss": 0.031185567378997803, "memory(GiB)": 21.32, "step": 7733, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.952742 }, { "epoch": 0.2512425689503947, "grad_norm": 0.40183112025260925, "learning_rate": 8.933272148530076e-06, "loss": 0.022691089659929276, "memory(GiB)": 21.32, "step": 7734, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.952762 }, { "epoch": 0.2512750544131501, "grad_norm": 0.5248934626579285, "learning_rate": 8.9329404910704e-06, "loss": 0.03472594916820526, "memory(GiB)": 21.32, "step": 7735, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.952779 }, { "epoch": 0.25130753987590554, "grad_norm": 0.4173409938812256, "learning_rate": 8.932608788219357e-06, "loss": 0.029935715720057487, "memory(GiB)": 21.32, "step": 7736, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.952799 }, { "epoch": 0.25134002533866096, "grad_norm": 0.41676104068756104, "learning_rate": 8.932277039980775e-06, "loss": 0.02934795431792736, "memory(GiB)": 21.32, "step": 7737, "token_acc": 1.0, "train_speed(iter/s)": 0.952819 }, { "epoch": 0.25137251080141637, "grad_norm": 0.42427799105644226, "learning_rate": 8.93194524635848e-06, "loss": 0.039550308138132095, "memory(GiB)": 21.32, "step": 7738, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.95284 }, { "epoch": 0.2514049962641718, "grad_norm": 0.4919145405292511, "learning_rate": 8.931613407356305e-06, "loss": 0.027439169585704803, "memory(GiB)": 21.32, "step": 7739, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.952859 }, { "epoch": 0.2514374817269272, "grad_norm": 0.4389802813529968, "learning_rate": 8.931281522978077e-06, "loss": 0.02916128560900688, "memory(GiB)": 21.32, "step": 7740, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.952879 }, { "epoch": 0.2514699671896826, "grad_norm": 0.46416640281677246, "learning_rate": 8.930949593227628e-06, "loss": 0.029702119529247284, "memory(GiB)": 21.32, "step": 7741, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.952895 }, { "epoch": 0.25150245265243804, "grad_norm": 0.4976109564304352, "learning_rate": 8.930617618108789e-06, "loss": 0.02981829084455967, "memory(GiB)": 21.32, "step": 7742, "token_acc": 1.0, "train_speed(iter/s)": 0.952915 }, { "epoch": 0.25153493811519345, "grad_norm": 0.4975937604904175, "learning_rate": 8.93028559762539e-06, "loss": 0.030942946672439575, "memory(GiB)": 21.32, "step": 7743, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.952935 }, { "epoch": 0.25156742357794887, "grad_norm": 0.4506756365299225, "learning_rate": 8.929953531781266e-06, "loss": 0.026364102959632874, "memory(GiB)": 21.32, "step": 7744, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.952959 }, { "epoch": 0.2515999090407043, "grad_norm": 0.5805103182792664, "learning_rate": 8.929621420580245e-06, "loss": 0.030794676393270493, "memory(GiB)": 21.32, "step": 7745, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.952986 }, { "epoch": 0.2516323945034597, "grad_norm": 0.36337101459503174, "learning_rate": 8.929289264026164e-06, "loss": 0.02046891674399376, "memory(GiB)": 21.32, "step": 7746, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953011 }, { "epoch": 0.2516648799662151, "grad_norm": 0.6762611865997314, "learning_rate": 8.928957062122856e-06, "loss": 0.04368166625499725, "memory(GiB)": 21.32, "step": 7747, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.953038 }, { "epoch": 0.25169736542897053, "grad_norm": 0.4772750437259674, "learning_rate": 8.928624814874152e-06, "loss": 0.02612198516726494, "memory(GiB)": 21.32, "step": 7748, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.953065 }, { "epoch": 0.25172985089172595, "grad_norm": 0.5735235810279846, "learning_rate": 8.92829252228389e-06, "loss": 0.0404474176466465, "memory(GiB)": 21.32, "step": 7749, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.953089 }, { "epoch": 0.25176233635448136, "grad_norm": 0.7145252227783203, "learning_rate": 8.927960184355903e-06, "loss": 0.03828292340040207, "memory(GiB)": 21.32, "step": 7750, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953114 }, { "epoch": 0.2517948218172368, "grad_norm": 0.49242985248565674, "learning_rate": 8.927627801094029e-06, "loss": 0.05501578003168106, "memory(GiB)": 21.32, "step": 7751, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.953139 }, { "epoch": 0.2518273072799922, "grad_norm": 0.47977688908576965, "learning_rate": 8.9272953725021e-06, "loss": 0.04413200914859772, "memory(GiB)": 21.32, "step": 7752, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953166 }, { "epoch": 0.2518597927427476, "grad_norm": 0.44164207577705383, "learning_rate": 8.926962898583957e-06, "loss": 0.02703842893242836, "memory(GiB)": 21.32, "step": 7753, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953192 }, { "epoch": 0.25189227820550303, "grad_norm": 0.5072125196456909, "learning_rate": 8.926630379343435e-06, "loss": 0.031331054866313934, "memory(GiB)": 21.32, "step": 7754, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.953219 }, { "epoch": 0.25192476366825844, "grad_norm": 0.521198034286499, "learning_rate": 8.926297814784373e-06, "loss": 0.031665485352277756, "memory(GiB)": 21.32, "step": 7755, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.953246 }, { "epoch": 0.25195724913101386, "grad_norm": 0.36667683720588684, "learning_rate": 8.925965204910605e-06, "loss": 0.03206854313611984, "memory(GiB)": 21.32, "step": 7756, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953271 }, { "epoch": 0.2519897345937693, "grad_norm": 0.4222549796104431, "learning_rate": 8.925632549725976e-06, "loss": 0.04079336300492287, "memory(GiB)": 21.32, "step": 7757, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.953298 }, { "epoch": 0.2520222200565247, "grad_norm": 0.5312726497650146, "learning_rate": 8.925299849234323e-06, "loss": 0.03618647903203964, "memory(GiB)": 21.32, "step": 7758, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.953325 }, { "epoch": 0.2520547055192801, "grad_norm": 0.46833646297454834, "learning_rate": 8.924967103439484e-06, "loss": 0.030527735128998756, "memory(GiB)": 21.32, "step": 7759, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953352 }, { "epoch": 0.2520871909820355, "grad_norm": 0.4256156086921692, "learning_rate": 8.924634312345299e-06, "loss": 0.029225725680589676, "memory(GiB)": 21.32, "step": 7760, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.953375 }, { "epoch": 0.25211967644479094, "grad_norm": 0.8253567814826965, "learning_rate": 8.924301475955612e-06, "loss": 0.04253292828798294, "memory(GiB)": 21.32, "step": 7761, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953403 }, { "epoch": 0.25215216190754636, "grad_norm": 0.43437719345092773, "learning_rate": 8.923968594274262e-06, "loss": 0.03242921456694603, "memory(GiB)": 21.32, "step": 7762, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.953429 }, { "epoch": 0.2521846473703018, "grad_norm": 0.503556489944458, "learning_rate": 8.923635667305093e-06, "loss": 0.04521194100379944, "memory(GiB)": 21.32, "step": 7763, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.953455 }, { "epoch": 0.2522171328330572, "grad_norm": 0.4940754175186157, "learning_rate": 8.923302695051945e-06, "loss": 0.03437023609876633, "memory(GiB)": 21.32, "step": 7764, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.95348 }, { "epoch": 0.2522496182958126, "grad_norm": 0.33062267303466797, "learning_rate": 8.922969677518662e-06, "loss": 0.02826143056154251, "memory(GiB)": 21.32, "step": 7765, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.953503 }, { "epoch": 0.252282103758568, "grad_norm": 0.4242318570613861, "learning_rate": 8.922636614709088e-06, "loss": 0.034729067236185074, "memory(GiB)": 21.32, "step": 7766, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.953526 }, { "epoch": 0.25231458922132344, "grad_norm": 0.42034974694252014, "learning_rate": 8.922303506627066e-06, "loss": 0.02951500192284584, "memory(GiB)": 21.32, "step": 7767, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.953546 }, { "epoch": 0.25234707468407885, "grad_norm": 0.37486526370048523, "learning_rate": 8.921970353276441e-06, "loss": 0.03221713379025459, "memory(GiB)": 21.32, "step": 7768, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.953568 }, { "epoch": 0.25237956014683427, "grad_norm": 0.6445857286453247, "learning_rate": 8.921637154661058e-06, "loss": 0.03964269533753395, "memory(GiB)": 21.32, "step": 7769, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953587 }, { "epoch": 0.2524120456095897, "grad_norm": 0.5057337880134583, "learning_rate": 8.921303910784762e-06, "loss": 0.028130868449807167, "memory(GiB)": 21.32, "step": 7770, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.953609 }, { "epoch": 0.2524445310723451, "grad_norm": 0.594448447227478, "learning_rate": 8.920970621651401e-06, "loss": 0.03660894185304642, "memory(GiB)": 21.32, "step": 7771, "token_acc": 0.9794871794871794, "train_speed(iter/s)": 0.953631 }, { "epoch": 0.2524770165351005, "grad_norm": 0.6380192637443542, "learning_rate": 8.920637287264819e-06, "loss": 0.046555183827877045, "memory(GiB)": 21.32, "step": 7772, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.953653 }, { "epoch": 0.25250950199785593, "grad_norm": 0.4091029167175293, "learning_rate": 8.920303907628864e-06, "loss": 0.031138423830270767, "memory(GiB)": 21.32, "step": 7773, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.953674 }, { "epoch": 0.25254198746061135, "grad_norm": 0.5289194583892822, "learning_rate": 8.919970482747386e-06, "loss": 0.033994294703006744, "memory(GiB)": 21.32, "step": 7774, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953696 }, { "epoch": 0.25257447292336677, "grad_norm": 0.4812152087688446, "learning_rate": 8.91963701262423e-06, "loss": 0.03888123854994774, "memory(GiB)": 21.32, "step": 7775, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.953715 }, { "epoch": 0.25260695838612224, "grad_norm": 0.5648111701011658, "learning_rate": 8.919303497263246e-06, "loss": 0.034991778433322906, "memory(GiB)": 21.32, "step": 7776, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.953734 }, { "epoch": 0.25263944384887765, "grad_norm": 0.49785590171813965, "learning_rate": 8.918969936668282e-06, "loss": 0.032858628779649734, "memory(GiB)": 21.32, "step": 7777, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.953754 }, { "epoch": 0.25267192931163307, "grad_norm": 0.5464732646942139, "learning_rate": 8.918636330843188e-06, "loss": 0.04468496888875961, "memory(GiB)": 21.32, "step": 7778, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953776 }, { "epoch": 0.2527044147743885, "grad_norm": 0.5125837922096252, "learning_rate": 8.918302679791819e-06, "loss": 0.0366508886218071, "memory(GiB)": 21.32, "step": 7779, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953794 }, { "epoch": 0.2527369002371439, "grad_norm": 0.35238775610923767, "learning_rate": 8.91796898351802e-06, "loss": 0.038021933287382126, "memory(GiB)": 21.32, "step": 7780, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953817 }, { "epoch": 0.2527693856998993, "grad_norm": 0.3745606243610382, "learning_rate": 8.917635242025641e-06, "loss": 0.03270764648914337, "memory(GiB)": 21.32, "step": 7781, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.95384 }, { "epoch": 0.25280187116265473, "grad_norm": 0.4333360493183136, "learning_rate": 8.91730145531854e-06, "loss": 0.030128153041005135, "memory(GiB)": 21.32, "step": 7782, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.953859 }, { "epoch": 0.25283435662541015, "grad_norm": 0.3941552937030792, "learning_rate": 8.916967623400568e-06, "loss": 0.031065091490745544, "memory(GiB)": 21.32, "step": 7783, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.953881 }, { "epoch": 0.25286684208816557, "grad_norm": 0.3603547513484955, "learning_rate": 8.916633746275573e-06, "loss": 0.03278360515832901, "memory(GiB)": 21.32, "step": 7784, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953897 }, { "epoch": 0.252899327550921, "grad_norm": 0.4880918860435486, "learning_rate": 8.916299823947412e-06, "loss": 0.03318121284246445, "memory(GiB)": 21.32, "step": 7785, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.953914 }, { "epoch": 0.2529318130136764, "grad_norm": 0.4324438273906708, "learning_rate": 8.915965856419939e-06, "loss": 0.032388146966695786, "memory(GiB)": 21.32, "step": 7786, "token_acc": 0.9750778816199377, "train_speed(iter/s)": 0.953933 }, { "epoch": 0.2529642984764318, "grad_norm": 0.799590528011322, "learning_rate": 8.915631843697008e-06, "loss": 0.03201114386320114, "memory(GiB)": 21.32, "step": 7787, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.95395 }, { "epoch": 0.25299678393918723, "grad_norm": 0.43034353852272034, "learning_rate": 8.915297785782475e-06, "loss": 0.027847103774547577, "memory(GiB)": 21.32, "step": 7788, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953967 }, { "epoch": 0.25302926940194265, "grad_norm": 0.2990387976169586, "learning_rate": 8.914963682680192e-06, "loss": 0.02576351910829544, "memory(GiB)": 21.32, "step": 7789, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.953985 }, { "epoch": 0.25306175486469806, "grad_norm": 0.4207499921321869, "learning_rate": 8.914629534394018e-06, "loss": 0.036087363958358765, "memory(GiB)": 21.32, "step": 7790, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.954007 }, { "epoch": 0.2530942403274535, "grad_norm": 0.7342396974563599, "learning_rate": 8.91429534092781e-06, "loss": 0.04719877988100052, "memory(GiB)": 21.32, "step": 7791, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.2531267257902089, "grad_norm": 0.6756883859634399, "learning_rate": 8.913961102285422e-06, "loss": 0.035754941403865814, "memory(GiB)": 21.32, "step": 7792, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.954044 }, { "epoch": 0.2531592112529643, "grad_norm": 0.594050407409668, "learning_rate": 8.913626818470715e-06, "loss": 0.03323087841272354, "memory(GiB)": 21.32, "step": 7793, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954063 }, { "epoch": 0.2531916967157197, "grad_norm": 0.5858427286148071, "learning_rate": 8.913292489487545e-06, "loss": 0.03590802475810051, "memory(GiB)": 21.32, "step": 7794, "token_acc": 1.0, "train_speed(iter/s)": 0.954084 }, { "epoch": 0.25322418217847514, "grad_norm": 0.6564564108848572, "learning_rate": 8.912958115339771e-06, "loss": 0.0389363095164299, "memory(GiB)": 21.32, "step": 7795, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.954102 }, { "epoch": 0.25325666764123056, "grad_norm": 0.6919028162956238, "learning_rate": 8.912623696031252e-06, "loss": 0.035038962960243225, "memory(GiB)": 21.32, "step": 7796, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954121 }, { "epoch": 0.253289153103986, "grad_norm": 0.5481098294258118, "learning_rate": 8.912289231565847e-06, "loss": 0.03791451454162598, "memory(GiB)": 21.32, "step": 7797, "token_acc": 1.0, "train_speed(iter/s)": 0.954138 }, { "epoch": 0.2533216385667414, "grad_norm": 0.5993009805679321, "learning_rate": 8.911954721947419e-06, "loss": 0.03619677573442459, "memory(GiB)": 21.32, "step": 7798, "token_acc": 0.9943502824858758, "train_speed(iter/s)": 0.954156 }, { "epoch": 0.2533541240294968, "grad_norm": 0.5582984089851379, "learning_rate": 8.911620167179827e-06, "loss": 0.03803876042366028, "memory(GiB)": 21.32, "step": 7799, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.954176 }, { "epoch": 0.2533866094922522, "grad_norm": 0.46721765398979187, "learning_rate": 8.91128556726693e-06, "loss": 0.027343738824129105, "memory(GiB)": 21.32, "step": 7800, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954194 }, { "epoch": 0.25341909495500764, "grad_norm": 0.4235498905181885, "learning_rate": 8.910950922212591e-06, "loss": 0.028439711779356003, "memory(GiB)": 21.32, "step": 7801, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954212 }, { "epoch": 0.25345158041776306, "grad_norm": 0.4753955006599426, "learning_rate": 8.910616232020674e-06, "loss": 0.03272828459739685, "memory(GiB)": 21.32, "step": 7802, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.954232 }, { "epoch": 0.25348406588051847, "grad_norm": 0.45974001288414, "learning_rate": 8.910281496695041e-06, "loss": 0.037434302270412445, "memory(GiB)": 21.32, "step": 7803, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.2535165513432739, "grad_norm": 0.8701241612434387, "learning_rate": 8.909946716239554e-06, "loss": 0.04665795713663101, "memory(GiB)": 21.32, "step": 7804, "token_acc": 1.0, "train_speed(iter/s)": 0.954277 }, { "epoch": 0.2535490368060293, "grad_norm": 0.37506863474845886, "learning_rate": 8.90961189065808e-06, "loss": 0.03237117826938629, "memory(GiB)": 21.32, "step": 7805, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.954302 }, { "epoch": 0.2535815222687847, "grad_norm": 0.4033544361591339, "learning_rate": 8.90927701995448e-06, "loss": 0.03222689405083656, "memory(GiB)": 21.32, "step": 7806, "token_acc": 0.9875, "train_speed(iter/s)": 0.954329 }, { "epoch": 0.25361400773154014, "grad_norm": 0.4343715310096741, "learning_rate": 8.90894210413262e-06, "loss": 0.03860132023692131, "memory(GiB)": 21.32, "step": 7807, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.954354 }, { "epoch": 0.25364649319429555, "grad_norm": 0.5361011028289795, "learning_rate": 8.908607143196365e-06, "loss": 0.03453810513019562, "memory(GiB)": 21.32, "step": 7808, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95438 }, { "epoch": 0.25367897865705097, "grad_norm": 0.7730790972709656, "learning_rate": 8.90827213714958e-06, "loss": 0.03613300621509552, "memory(GiB)": 21.32, "step": 7809, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954403 }, { "epoch": 0.2537114641198064, "grad_norm": 1.1953290700912476, "learning_rate": 8.907937085996134e-06, "loss": 0.04309522360563278, "memory(GiB)": 21.32, "step": 7810, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.95443 }, { "epoch": 0.2537439495825618, "grad_norm": 0.5340145230293274, "learning_rate": 8.907601989739893e-06, "loss": 0.03419007360935211, "memory(GiB)": 21.32, "step": 7811, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.954455 }, { "epoch": 0.2537764350453172, "grad_norm": 0.5010549426078796, "learning_rate": 8.907266848384724e-06, "loss": 0.04662206023931503, "memory(GiB)": 21.32, "step": 7812, "token_acc": 0.9627906976744186, "train_speed(iter/s)": 0.95448 }, { "epoch": 0.25380892050807263, "grad_norm": 0.42632368206977844, "learning_rate": 8.906931661934494e-06, "loss": 0.029746979475021362, "memory(GiB)": 21.32, "step": 7813, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.954507 }, { "epoch": 0.25384140597082805, "grad_norm": 0.5019921660423279, "learning_rate": 8.906596430393073e-06, "loss": 0.029837770387530327, "memory(GiB)": 21.32, "step": 7814, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954532 }, { "epoch": 0.25387389143358347, "grad_norm": 0.5401306748390198, "learning_rate": 8.90626115376433e-06, "loss": 0.04224593937397003, "memory(GiB)": 21.32, "step": 7815, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.2539063768963389, "grad_norm": 0.6958221793174744, "learning_rate": 8.905925832052133e-06, "loss": 0.038554757833480835, "memory(GiB)": 21.32, "step": 7816, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.954582 }, { "epoch": 0.2539388623590943, "grad_norm": 0.8815537095069885, "learning_rate": 8.905590465260354e-06, "loss": 0.05082816258072853, "memory(GiB)": 21.32, "step": 7817, "token_acc": 0.9755244755244755, "train_speed(iter/s)": 0.954603 }, { "epoch": 0.2539713478218497, "grad_norm": 0.45289114117622375, "learning_rate": 8.905255053392864e-06, "loss": 0.034193217754364014, "memory(GiB)": 21.32, "step": 7818, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954626 }, { "epoch": 0.25400383328460513, "grad_norm": 0.47963768243789673, "learning_rate": 8.904919596453529e-06, "loss": 0.03281993791460991, "memory(GiB)": 21.32, "step": 7819, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95465 }, { "epoch": 0.25403631874736055, "grad_norm": 0.45552459359169006, "learning_rate": 8.904584094446227e-06, "loss": 0.03904828429222107, "memory(GiB)": 21.32, "step": 7820, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.954675 }, { "epoch": 0.25406880421011596, "grad_norm": 0.4408488869667053, "learning_rate": 8.904248547374827e-06, "loss": 0.027593472972512245, "memory(GiB)": 21.32, "step": 7821, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954699 }, { "epoch": 0.2541012896728714, "grad_norm": 0.6788385510444641, "learning_rate": 8.903912955243203e-06, "loss": 0.05681698024272919, "memory(GiB)": 21.32, "step": 7822, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.954723 }, { "epoch": 0.2541337751356268, "grad_norm": 0.3867433965206146, "learning_rate": 8.903577318055226e-06, "loss": 0.037451229989528656, "memory(GiB)": 21.32, "step": 7823, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.2541662605983822, "grad_norm": 0.34654513001441956, "learning_rate": 8.903241635814772e-06, "loss": 0.02545761503279209, "memory(GiB)": 21.32, "step": 7824, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.954774 }, { "epoch": 0.2541987460611376, "grad_norm": 0.48598000407218933, "learning_rate": 8.902905908525714e-06, "loss": 0.038445089012384415, "memory(GiB)": 21.32, "step": 7825, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.25423123152389304, "grad_norm": 1.0037189722061157, "learning_rate": 8.902570136191929e-06, "loss": 0.03533565625548363, "memory(GiB)": 21.32, "step": 7826, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.954822 }, { "epoch": 0.25426371698664846, "grad_norm": 0.5354477763175964, "learning_rate": 8.902234318817286e-06, "loss": 0.03770476207137108, "memory(GiB)": 21.32, "step": 7827, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.954843 }, { "epoch": 0.2542962024494039, "grad_norm": 0.4388134479522705, "learning_rate": 8.901898456405668e-06, "loss": 0.03670373558998108, "memory(GiB)": 21.32, "step": 7828, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.2543286879121593, "grad_norm": 0.4491097331047058, "learning_rate": 8.90156254896095e-06, "loss": 0.03318319469690323, "memory(GiB)": 21.32, "step": 7829, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.2543611733749147, "grad_norm": 0.332960844039917, "learning_rate": 8.901226596487005e-06, "loss": 0.02856871858239174, "memory(GiB)": 21.32, "step": 7830, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.2543936588376701, "grad_norm": 0.43551042675971985, "learning_rate": 8.900890598987713e-06, "loss": 0.032842427492141724, "memory(GiB)": 21.32, "step": 7831, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954927 }, { "epoch": 0.25442614430042554, "grad_norm": 0.4562990367412567, "learning_rate": 8.90055455646695e-06, "loss": 0.03093334287405014, "memory(GiB)": 21.32, "step": 7832, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954948 }, { "epoch": 0.25445862976318095, "grad_norm": 1.8952330350875854, "learning_rate": 8.900218468928598e-06, "loss": 0.03283688798546791, "memory(GiB)": 21.32, "step": 7833, "token_acc": 0.984375, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.25449111522593637, "grad_norm": 0.5099518895149231, "learning_rate": 8.899882336376533e-06, "loss": 0.026718761771917343, "memory(GiB)": 21.32, "step": 7834, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954989 }, { "epoch": 0.2545236006886918, "grad_norm": 0.4509809613227844, "learning_rate": 8.899546158814633e-06, "loss": 0.027334149926900864, "memory(GiB)": 21.32, "step": 7835, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.955008 }, { "epoch": 0.2545560861514472, "grad_norm": 0.5670776963233948, "learning_rate": 8.899209936246783e-06, "loss": 0.03217416629195213, "memory(GiB)": 21.32, "step": 7836, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.95503 }, { "epoch": 0.2545885716142026, "grad_norm": 0.49634528160095215, "learning_rate": 8.89887366867686e-06, "loss": 0.03979094326496124, "memory(GiB)": 21.32, "step": 7837, "token_acc": 0.9616724738675958, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.25462105707695804, "grad_norm": 0.5206161141395569, "learning_rate": 8.898537356108745e-06, "loss": 0.031028028577566147, "memory(GiB)": 21.32, "step": 7838, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.955067 }, { "epoch": 0.25465354253971345, "grad_norm": 0.5623011589050293, "learning_rate": 8.898200998546319e-06, "loss": 0.03877505660057068, "memory(GiB)": 21.32, "step": 7839, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.955088 }, { "epoch": 0.2546860280024689, "grad_norm": 0.5849049687385559, "learning_rate": 8.897864595993466e-06, "loss": 0.03829769045114517, "memory(GiB)": 21.32, "step": 7840, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955107 }, { "epoch": 0.25471851346522434, "grad_norm": 0.57790207862854, "learning_rate": 8.897528148454066e-06, "loss": 0.036950141191482544, "memory(GiB)": 21.32, "step": 7841, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955129 }, { "epoch": 0.25475099892797975, "grad_norm": 0.44382625818252563, "learning_rate": 8.897191655932006e-06, "loss": 0.030742239207029343, "memory(GiB)": 21.32, "step": 7842, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.25478348439073517, "grad_norm": 0.7556994557380676, "learning_rate": 8.896855118431167e-06, "loss": 0.036479800939559937, "memory(GiB)": 21.32, "step": 7843, "token_acc": 1.0, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.2548159698534906, "grad_norm": 0.46641406416893005, "learning_rate": 8.896518535955431e-06, "loss": 0.030647888779640198, "memory(GiB)": 21.32, "step": 7844, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955185 }, { "epoch": 0.254848455316246, "grad_norm": 0.3153276741504669, "learning_rate": 8.896181908508687e-06, "loss": 0.02557867206633091, "memory(GiB)": 21.32, "step": 7845, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955202 }, { "epoch": 0.2548809407790014, "grad_norm": 0.5278126001358032, "learning_rate": 8.895845236094817e-06, "loss": 0.03558254987001419, "memory(GiB)": 21.32, "step": 7846, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955217 }, { "epoch": 0.25491342624175684, "grad_norm": 0.3886970579624176, "learning_rate": 8.895508518717708e-06, "loss": 0.03015841916203499, "memory(GiB)": 21.32, "step": 7847, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.25494591170451225, "grad_norm": 0.5314563512802124, "learning_rate": 8.895171756381245e-06, "loss": 0.04364366829395294, "memory(GiB)": 21.32, "step": 7848, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.955252 }, { "epoch": 0.25497839716726767, "grad_norm": 0.45898208022117615, "learning_rate": 8.894834949089316e-06, "loss": 0.03330019861459732, "memory(GiB)": 21.32, "step": 7849, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.2550108826300231, "grad_norm": 0.6901088356971741, "learning_rate": 8.894498096845807e-06, "loss": 0.05005107447504997, "memory(GiB)": 21.32, "step": 7850, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955288 }, { "epoch": 0.2550433680927785, "grad_norm": 0.5338559150695801, "learning_rate": 8.894161199654607e-06, "loss": 0.03816782683134079, "memory(GiB)": 21.32, "step": 7851, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.955306 }, { "epoch": 0.2550758535555339, "grad_norm": 0.47904089093208313, "learning_rate": 8.893824257519604e-06, "loss": 0.034463971853256226, "memory(GiB)": 21.32, "step": 7852, "token_acc": 1.0, "train_speed(iter/s)": 0.955328 }, { "epoch": 0.25510833901828933, "grad_norm": 0.33634689450263977, "learning_rate": 8.893487270444686e-06, "loss": 0.026740461587905884, "memory(GiB)": 21.32, "step": 7853, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955345 }, { "epoch": 0.25514082448104475, "grad_norm": 0.5635859370231628, "learning_rate": 8.893150238433743e-06, "loss": 0.033014535903930664, "memory(GiB)": 21.32, "step": 7854, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.25517330994380016, "grad_norm": 0.5947706699371338, "learning_rate": 8.892813161490665e-06, "loss": 0.038483329117298126, "memory(GiB)": 21.32, "step": 7855, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.2552057954065556, "grad_norm": 0.6591711640357971, "learning_rate": 8.89247603961934e-06, "loss": 0.05066109448671341, "memory(GiB)": 21.32, "step": 7856, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.255238280869311, "grad_norm": 0.4312017858028412, "learning_rate": 8.892138872823662e-06, "loss": 0.04131074994802475, "memory(GiB)": 21.32, "step": 7857, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.2552707663320664, "grad_norm": 0.556628406047821, "learning_rate": 8.89180166110752e-06, "loss": 0.03511514514684677, "memory(GiB)": 21.32, "step": 7858, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.25530325179482183, "grad_norm": 0.7978305816650391, "learning_rate": 8.891464404474808e-06, "loss": 0.04321916401386261, "memory(GiB)": 21.32, "step": 7859, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.25533573725757724, "grad_norm": 0.6474682092666626, "learning_rate": 8.891127102929417e-06, "loss": 0.03394486382603645, "memory(GiB)": 21.32, "step": 7860, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.25536822272033266, "grad_norm": 0.4522494673728943, "learning_rate": 8.89078975647524e-06, "loss": 0.027445916086435318, "memory(GiB)": 21.32, "step": 7861, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.2554007081830881, "grad_norm": 0.584287703037262, "learning_rate": 8.89045236511617e-06, "loss": 0.03907288238406181, "memory(GiB)": 21.32, "step": 7862, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.2554331936458435, "grad_norm": 0.5221661329269409, "learning_rate": 8.890114928856101e-06, "loss": 0.03490596264600754, "memory(GiB)": 21.32, "step": 7863, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.2554656791085989, "grad_norm": 0.41892433166503906, "learning_rate": 8.88977744769893e-06, "loss": 0.03169817104935646, "memory(GiB)": 21.32, "step": 7864, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.2554981645713543, "grad_norm": 0.5456230640411377, "learning_rate": 8.889439921648549e-06, "loss": 0.032956723123788834, "memory(GiB)": 21.32, "step": 7865, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.25553065003410974, "grad_norm": 0.46243613958358765, "learning_rate": 8.889102350708855e-06, "loss": 0.03727472946047783, "memory(GiB)": 21.32, "step": 7866, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.25556313549686516, "grad_norm": 0.4009411036968231, "learning_rate": 8.888764734883742e-06, "loss": 0.03630318492650986, "memory(GiB)": 21.32, "step": 7867, "token_acc": 1.0, "train_speed(iter/s)": 0.955652 }, { "epoch": 0.2555956209596206, "grad_norm": 0.4351324737071991, "learning_rate": 8.88842707417711e-06, "loss": 0.03823525086045265, "memory(GiB)": 21.32, "step": 7868, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.255628106422376, "grad_norm": 0.4016934037208557, "learning_rate": 8.888089368592852e-06, "loss": 0.033102262765169144, "memory(GiB)": 21.32, "step": 7869, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955701 }, { "epoch": 0.2556605918851314, "grad_norm": 0.6217389106750488, "learning_rate": 8.887751618134869e-06, "loss": 0.044503748416900635, "memory(GiB)": 21.32, "step": 7870, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.955727 }, { "epoch": 0.2556930773478868, "grad_norm": 0.9035936594009399, "learning_rate": 8.887413822807057e-06, "loss": 0.03936700522899628, "memory(GiB)": 21.32, "step": 7871, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.25572556281064224, "grad_norm": 0.626866340637207, "learning_rate": 8.887075982613317e-06, "loss": 0.0418299101293087, "memory(GiB)": 21.32, "step": 7872, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955773 }, { "epoch": 0.25575804827339765, "grad_norm": 0.43469327688217163, "learning_rate": 8.886738097557545e-06, "loss": 0.033080752938985825, "memory(GiB)": 21.32, "step": 7873, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.25579053373615307, "grad_norm": 1.6819026470184326, "learning_rate": 8.886400167643641e-06, "loss": 0.038729891180992126, "memory(GiB)": 21.32, "step": 7874, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.955825 }, { "epoch": 0.2558230191989085, "grad_norm": 0.45980119705200195, "learning_rate": 8.886062192875508e-06, "loss": 0.04481740668416023, "memory(GiB)": 21.32, "step": 7875, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95585 }, { "epoch": 0.2558555046616639, "grad_norm": 0.4813483953475952, "learning_rate": 8.885724173257045e-06, "loss": 0.03360671550035477, "memory(GiB)": 21.32, "step": 7876, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.2558879901244193, "grad_norm": 0.5130493640899658, "learning_rate": 8.885386108792154e-06, "loss": 0.03855687752366066, "memory(GiB)": 21.32, "step": 7877, "token_acc": 0.9675675675675676, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.25592047558717473, "grad_norm": 0.2727912962436676, "learning_rate": 8.885047999484733e-06, "loss": 0.028322787955403328, "memory(GiB)": 21.32, "step": 7878, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.25595296104993015, "grad_norm": 0.40582531690597534, "learning_rate": 8.88470984533869e-06, "loss": 0.03395050764083862, "memory(GiB)": 21.32, "step": 7879, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.25598544651268557, "grad_norm": 0.39654266834259033, "learning_rate": 8.884371646357923e-06, "loss": 0.0381239578127861, "memory(GiB)": 21.32, "step": 7880, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.256017931975441, "grad_norm": 0.3940967321395874, "learning_rate": 8.884033402546337e-06, "loss": 0.03403247147798538, "memory(GiB)": 21.32, "step": 7881, "token_acc": 1.0, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.2560504174381964, "grad_norm": 0.3843098282814026, "learning_rate": 8.883695113907838e-06, "loss": 0.03767649084329605, "memory(GiB)": 21.32, "step": 7882, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.2560829029009518, "grad_norm": 0.5246800780296326, "learning_rate": 8.883356780446327e-06, "loss": 0.037202138453722, "memory(GiB)": 21.32, "step": 7883, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.25611538836370723, "grad_norm": 0.4895402789115906, "learning_rate": 8.88301840216571e-06, "loss": 0.03505975753068924, "memory(GiB)": 21.32, "step": 7884, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.25614787382646265, "grad_norm": 0.4481041133403778, "learning_rate": 8.88267997906989e-06, "loss": 0.03654548525810242, "memory(GiB)": 21.32, "step": 7885, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.25618035928921806, "grad_norm": 0.37911930680274963, "learning_rate": 8.882341511162778e-06, "loss": 0.029995590448379517, "memory(GiB)": 21.32, "step": 7886, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956124 }, { "epoch": 0.2562128447519735, "grad_norm": 0.4139878451824188, "learning_rate": 8.882002998448277e-06, "loss": 0.02614157274365425, "memory(GiB)": 21.32, "step": 7887, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.2562453302147289, "grad_norm": 0.4860132336616516, "learning_rate": 8.881664440930294e-06, "loss": 0.03482910245656967, "memory(GiB)": 21.32, "step": 7888, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.956174 }, { "epoch": 0.2562778156774843, "grad_norm": 0.6135109066963196, "learning_rate": 8.881325838612738e-06, "loss": 0.04484911635518074, "memory(GiB)": 21.32, "step": 7889, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956127 }, { "epoch": 0.2563103011402397, "grad_norm": 0.533132016658783, "learning_rate": 8.880987191499516e-06, "loss": 0.03374045714735985, "memory(GiB)": 21.32, "step": 7890, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.956147 }, { "epoch": 0.25634278660299514, "grad_norm": 0.396632581949234, "learning_rate": 8.880648499594532e-06, "loss": 0.034392908215522766, "memory(GiB)": 21.32, "step": 7891, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956093 }, { "epoch": 0.25637527206575056, "grad_norm": 0.3468073010444641, "learning_rate": 8.880309762901705e-06, "loss": 0.02251492068171501, "memory(GiB)": 21.32, "step": 7892, "token_acc": 1.0, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.256407757528506, "grad_norm": 0.47888079285621643, "learning_rate": 8.879970981424935e-06, "loss": 0.03163541108369827, "memory(GiB)": 21.32, "step": 7893, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956136 }, { "epoch": 0.2564402429912614, "grad_norm": 2.2654900550842285, "learning_rate": 8.879632155168136e-06, "loss": 0.03484601527452469, "memory(GiB)": 21.32, "step": 7894, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.2564727284540168, "grad_norm": 0.5600394606590271, "learning_rate": 8.879293284135219e-06, "loss": 0.02651926688849926, "memory(GiB)": 21.32, "step": 7895, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.2565052139167722, "grad_norm": 0.5884109139442444, "learning_rate": 8.878954368330095e-06, "loss": 0.0413493849337101, "memory(GiB)": 21.32, "step": 7896, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.25653769937952764, "grad_norm": 0.45549431443214417, "learning_rate": 8.878615407756675e-06, "loss": 0.03747151046991348, "memory(GiB)": 21.32, "step": 7897, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956218 }, { "epoch": 0.25657018484228306, "grad_norm": 1.0031183958053589, "learning_rate": 8.878276402418869e-06, "loss": 0.034471698105335236, "memory(GiB)": 21.32, "step": 7898, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956239 }, { "epoch": 0.25660267030503847, "grad_norm": 0.594810962677002, "learning_rate": 8.877937352320591e-06, "loss": 0.03680543601512909, "memory(GiB)": 21.32, "step": 7899, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.2566351557677939, "grad_norm": 0.49653834104537964, "learning_rate": 8.877598257465757e-06, "loss": 0.04535740613937378, "memory(GiB)": 21.32, "step": 7900, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.2566676412305493, "grad_norm": 0.5771225094795227, "learning_rate": 8.877259117858275e-06, "loss": 0.04111715406179428, "memory(GiB)": 21.32, "step": 7901, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.2567001266933047, "grad_norm": 1.1997212171554565, "learning_rate": 8.876919933502063e-06, "loss": 0.06309866160154343, "memory(GiB)": 21.32, "step": 7902, "token_acc": 0.959349593495935, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.25673261215606014, "grad_norm": 0.4221770763397217, "learning_rate": 8.876580704401036e-06, "loss": 0.032688867300748825, "memory(GiB)": 21.32, "step": 7903, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.2567650976188156, "grad_norm": 0.5587925314903259, "learning_rate": 8.876241430559107e-06, "loss": 0.03843153268098831, "memory(GiB)": 21.32, "step": 7904, "token_acc": 0.9692982456140351, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.256797583081571, "grad_norm": 0.4758478105068207, "learning_rate": 8.875902111980193e-06, "loss": 0.03424309194087982, "memory(GiB)": 21.32, "step": 7905, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.25683006854432644, "grad_norm": 0.44998517632484436, "learning_rate": 8.87556274866821e-06, "loss": 0.038099274039268494, "memory(GiB)": 21.32, "step": 7906, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.25686255400708186, "grad_norm": 0.3696623742580414, "learning_rate": 8.875223340627075e-06, "loss": 0.03518013656139374, "memory(GiB)": 21.32, "step": 7907, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956398 }, { "epoch": 0.25689503946983727, "grad_norm": 0.46370959281921387, "learning_rate": 8.874883887860703e-06, "loss": 0.0282633937895298, "memory(GiB)": 21.32, "step": 7908, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.2569275249325927, "grad_norm": 0.42562952637672424, "learning_rate": 8.874544390373014e-06, "loss": 0.040455520153045654, "memory(GiB)": 21.32, "step": 7909, "token_acc": 0.9736842105263158, "train_speed(iter/s)": 0.956434 }, { "epoch": 0.2569600103953481, "grad_norm": 0.4831242263317108, "learning_rate": 8.874204848167927e-06, "loss": 0.036389388144016266, "memory(GiB)": 21.32, "step": 7910, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956451 }, { "epoch": 0.2569924958581035, "grad_norm": 0.3827475607395172, "learning_rate": 8.873865261249358e-06, "loss": 0.036041952669620514, "memory(GiB)": 21.32, "step": 7911, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956469 }, { "epoch": 0.25702498132085894, "grad_norm": 0.39865732192993164, "learning_rate": 8.873525629621229e-06, "loss": 0.03182944282889366, "memory(GiB)": 21.32, "step": 7912, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.25705746678361435, "grad_norm": 0.4871503412723541, "learning_rate": 8.873185953287458e-06, "loss": 0.03967004269361496, "memory(GiB)": 21.32, "step": 7913, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.25708995224636977, "grad_norm": 0.4632835388183594, "learning_rate": 8.872846232251965e-06, "loss": 0.031029576435685158, "memory(GiB)": 21.32, "step": 7914, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956527 }, { "epoch": 0.2571224377091252, "grad_norm": 1.2322791814804077, "learning_rate": 8.872506466518673e-06, "loss": 0.04229606315493584, "memory(GiB)": 21.32, "step": 7915, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956548 }, { "epoch": 0.2571549231718806, "grad_norm": 0.5235912203788757, "learning_rate": 8.872166656091504e-06, "loss": 0.03371120244264603, "memory(GiB)": 21.32, "step": 7916, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.257187408634636, "grad_norm": 0.49036335945129395, "learning_rate": 8.871826800974373e-06, "loss": 0.03773728013038635, "memory(GiB)": 21.32, "step": 7917, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.25721989409739143, "grad_norm": 0.5542126297950745, "learning_rate": 8.871486901171212e-06, "loss": 0.03785233199596405, "memory(GiB)": 21.32, "step": 7918, "token_acc": 0.9710982658959537, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.25725237956014685, "grad_norm": 0.46454542875289917, "learning_rate": 8.871146956685936e-06, "loss": 0.03666962310671806, "memory(GiB)": 21.32, "step": 7919, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.956624 }, { "epoch": 0.25728486502290226, "grad_norm": 0.5193501710891724, "learning_rate": 8.870806967522474e-06, "loss": 0.03751831501722336, "memory(GiB)": 21.32, "step": 7920, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956644 }, { "epoch": 0.2573173504856577, "grad_norm": 0.5843715071678162, "learning_rate": 8.870466933684744e-06, "loss": 0.03097347915172577, "memory(GiB)": 21.32, "step": 7921, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.2573498359484131, "grad_norm": 0.5224182605743408, "learning_rate": 8.870126855176679e-06, "loss": 0.02717563323676586, "memory(GiB)": 21.32, "step": 7922, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95669 }, { "epoch": 0.2573823214111685, "grad_norm": 0.4466891288757324, "learning_rate": 8.869786732002196e-06, "loss": 0.028552472591400146, "memory(GiB)": 21.32, "step": 7923, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.956715 }, { "epoch": 0.25741480687392393, "grad_norm": 0.4189687967300415, "learning_rate": 8.869446564165223e-06, "loss": 0.03326672315597534, "memory(GiB)": 21.32, "step": 7924, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.25744729233667935, "grad_norm": 0.4862983524799347, "learning_rate": 8.869106351669688e-06, "loss": 0.03846585750579834, "memory(GiB)": 21.32, "step": 7925, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956764 }, { "epoch": 0.25747977779943476, "grad_norm": 0.6900476813316345, "learning_rate": 8.868766094519515e-06, "loss": 0.032050229609012604, "memory(GiB)": 21.32, "step": 7926, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.956789 }, { "epoch": 0.2575122632621902, "grad_norm": 0.8700563311576843, "learning_rate": 8.868425792718633e-06, "loss": 0.039351578801870346, "memory(GiB)": 21.32, "step": 7927, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.2575447487249456, "grad_norm": 0.5108405351638794, "learning_rate": 8.868085446270966e-06, "loss": 0.036766018718481064, "memory(GiB)": 21.32, "step": 7928, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.257577234187701, "grad_norm": 0.5349226593971252, "learning_rate": 8.867745055180447e-06, "loss": 0.02948826551437378, "memory(GiB)": 21.32, "step": 7929, "token_acc": 1.0, "train_speed(iter/s)": 0.956864 }, { "epoch": 0.2576097196504564, "grad_norm": 0.431758850812912, "learning_rate": 8.867404619451002e-06, "loss": 0.03763563930988312, "memory(GiB)": 21.32, "step": 7930, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.25764220511321184, "grad_norm": 0.3902305066585541, "learning_rate": 8.867064139086559e-06, "loss": 0.03202704340219498, "memory(GiB)": 21.32, "step": 7931, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956915 }, { "epoch": 0.25767469057596726, "grad_norm": 0.3779405951499939, "learning_rate": 8.866723614091049e-06, "loss": 0.0305959302932024, "memory(GiB)": 21.32, "step": 7932, "token_acc": 1.0, "train_speed(iter/s)": 0.956941 }, { "epoch": 0.2577071760387227, "grad_norm": 0.5335490703582764, "learning_rate": 8.866383044468402e-06, "loss": 0.038522034883499146, "memory(GiB)": 21.32, "step": 7933, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.2577396615014781, "grad_norm": 0.6566635370254517, "learning_rate": 8.86604243022255e-06, "loss": 0.04316132888197899, "memory(GiB)": 21.32, "step": 7934, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.2577721469642335, "grad_norm": 0.5558226704597473, "learning_rate": 8.86570177135742e-06, "loss": 0.041210487484931946, "memory(GiB)": 21.32, "step": 7935, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957015 }, { "epoch": 0.2578046324269889, "grad_norm": 0.484277606010437, "learning_rate": 8.865361067876948e-06, "loss": 0.028208617120981216, "memory(GiB)": 21.32, "step": 7936, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95704 }, { "epoch": 0.25783711788974434, "grad_norm": 0.7659075260162354, "learning_rate": 8.865020319785065e-06, "loss": 0.029156342148780823, "memory(GiB)": 21.32, "step": 7937, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.25786960335249975, "grad_norm": 0.504332959651947, "learning_rate": 8.864679527085702e-06, "loss": 0.03670337051153183, "memory(GiB)": 21.32, "step": 7938, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.25790208881525517, "grad_norm": 0.4275790750980377, "learning_rate": 8.864338689782793e-06, "loss": 0.031303077936172485, "memory(GiB)": 21.32, "step": 7939, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.2579345742780106, "grad_norm": 0.6505334973335266, "learning_rate": 8.863997807880273e-06, "loss": 0.02126496657729149, "memory(GiB)": 21.32, "step": 7940, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.257967059740766, "grad_norm": 0.40262776613235474, "learning_rate": 8.863656881382074e-06, "loss": 0.029136253520846367, "memory(GiB)": 21.32, "step": 7941, "token_acc": 1.0, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.2579995452035214, "grad_norm": 0.6531875729560852, "learning_rate": 8.863315910292134e-06, "loss": 0.03751140087842941, "memory(GiB)": 21.32, "step": 7942, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957184 }, { "epoch": 0.25803203066627683, "grad_norm": 0.42054781317710876, "learning_rate": 8.862974894614386e-06, "loss": 0.028650756925344467, "memory(GiB)": 21.32, "step": 7943, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.957208 }, { "epoch": 0.25806451612903225, "grad_norm": 0.7815672755241394, "learning_rate": 8.862633834352765e-06, "loss": 0.03609287738800049, "memory(GiB)": 21.32, "step": 7944, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.25809700159178767, "grad_norm": 0.6374859809875488, "learning_rate": 8.86229272951121e-06, "loss": 0.03841612488031387, "memory(GiB)": 21.32, "step": 7945, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.2581294870545431, "grad_norm": 0.4623141586780548, "learning_rate": 8.861951580093655e-06, "loss": 0.03165346011519432, "memory(GiB)": 21.32, "step": 7946, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.2581619725172985, "grad_norm": 0.5626937747001648, "learning_rate": 8.861610386104039e-06, "loss": 0.04458902031183243, "memory(GiB)": 21.32, "step": 7947, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.2581944579800539, "grad_norm": 0.5411679148674011, "learning_rate": 8.861269147546298e-06, "loss": 0.038538143038749695, "memory(GiB)": 21.32, "step": 7948, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.25822694344280933, "grad_norm": 0.40592285990715027, "learning_rate": 8.860927864424374e-06, "loss": 0.03469120338559151, "memory(GiB)": 21.32, "step": 7949, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.25825942890556475, "grad_norm": 0.4372061789035797, "learning_rate": 8.860586536742201e-06, "loss": 0.034075018018484116, "memory(GiB)": 21.32, "step": 7950, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957376 }, { "epoch": 0.25829191436832016, "grad_norm": 0.4747248888015747, "learning_rate": 8.860245164503724e-06, "loss": 0.02660471200942993, "memory(GiB)": 21.32, "step": 7951, "token_acc": 1.0, "train_speed(iter/s)": 0.957397 }, { "epoch": 0.2583243998310756, "grad_norm": 0.4867340624332428, "learning_rate": 8.859903747712877e-06, "loss": 0.03302483260631561, "memory(GiB)": 21.32, "step": 7952, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.258356885293831, "grad_norm": 0.6807785034179688, "learning_rate": 8.859562286373605e-06, "loss": 0.034244880080223083, "memory(GiB)": 21.32, "step": 7953, "token_acc": 1.0, "train_speed(iter/s)": 0.957438 }, { "epoch": 0.2583893707565864, "grad_norm": 0.7622984647750854, "learning_rate": 8.859220780489847e-06, "loss": 0.035457268357276917, "memory(GiB)": 21.32, "step": 7954, "token_acc": 1.0, "train_speed(iter/s)": 0.957459 }, { "epoch": 0.25842185621934183, "grad_norm": 1.737686276435852, "learning_rate": 8.858879230065543e-06, "loss": 0.038022398948669434, "memory(GiB)": 21.32, "step": 7955, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.25845434168209724, "grad_norm": 0.38271740078926086, "learning_rate": 8.858537635104638e-06, "loss": 0.034314125776290894, "memory(GiB)": 21.32, "step": 7956, "token_acc": 0.9669421487603306, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.25848682714485266, "grad_norm": 0.3939228057861328, "learning_rate": 8.858195995611073e-06, "loss": 0.030782945454120636, "memory(GiB)": 21.32, "step": 7957, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.2585193126076081, "grad_norm": 0.5001353025436401, "learning_rate": 8.85785431158879e-06, "loss": 0.031764496117830276, "memory(GiB)": 21.32, "step": 7958, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.2585517980703635, "grad_norm": 0.6229397058486938, "learning_rate": 8.857512583041735e-06, "loss": 0.045283570885658264, "memory(GiB)": 21.32, "step": 7959, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957551 }, { "epoch": 0.2585842835331189, "grad_norm": 0.48989608883857727, "learning_rate": 8.85717080997385e-06, "loss": 0.034449540078639984, "memory(GiB)": 21.32, "step": 7960, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95757 }, { "epoch": 0.2586167689958743, "grad_norm": 0.42731428146362305, "learning_rate": 8.85682899238908e-06, "loss": 0.04088636115193367, "memory(GiB)": 21.32, "step": 7961, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.25864925445862974, "grad_norm": 0.4358087182044983, "learning_rate": 8.856487130291369e-06, "loss": 0.03319761902093887, "memory(GiB)": 21.32, "step": 7962, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.25868173992138516, "grad_norm": 0.3063920736312866, "learning_rate": 8.856145223684664e-06, "loss": 0.027430742979049683, "memory(GiB)": 21.32, "step": 7963, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.2587142253841406, "grad_norm": 0.4786717891693115, "learning_rate": 8.85580327257291e-06, "loss": 0.03183668106794357, "memory(GiB)": 21.32, "step": 7964, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.258746710846896, "grad_norm": 0.3428668975830078, "learning_rate": 8.855461276960055e-06, "loss": 0.031082073226571083, "memory(GiB)": 21.32, "step": 7965, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.2587791963096514, "grad_norm": 0.5015180110931396, "learning_rate": 8.855119236850044e-06, "loss": 0.037843771278858185, "memory(GiB)": 21.32, "step": 7966, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.2588116817724068, "grad_norm": 0.5108981132507324, "learning_rate": 8.854777152246828e-06, "loss": 0.028992652893066406, "memory(GiB)": 21.32, "step": 7967, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.2588441672351623, "grad_norm": 0.45849350094795227, "learning_rate": 8.85443502315435e-06, "loss": 0.03147996962070465, "memory(GiB)": 21.32, "step": 7968, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.957701 }, { "epoch": 0.2588766526979177, "grad_norm": 0.5785079598426819, "learning_rate": 8.854092849576565e-06, "loss": 0.03526955097913742, "memory(GiB)": 21.32, "step": 7969, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.2589091381606731, "grad_norm": 0.6356362104415894, "learning_rate": 8.853750631517418e-06, "loss": 0.040664732456207275, "memory(GiB)": 21.32, "step": 7970, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.25894162362342854, "grad_norm": 0.5389688014984131, "learning_rate": 8.853408368980858e-06, "loss": 0.04135558009147644, "memory(GiB)": 21.32, "step": 7971, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957751 }, { "epoch": 0.25897410908618396, "grad_norm": 0.5365985035896301, "learning_rate": 8.853066061970837e-06, "loss": 0.02644127979874611, "memory(GiB)": 21.32, "step": 7972, "token_acc": 1.0, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.2590065945489394, "grad_norm": 0.4096220135688782, "learning_rate": 8.852723710491306e-06, "loss": 0.02985130064189434, "memory(GiB)": 21.32, "step": 7973, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.2590390800116948, "grad_norm": 0.6253281235694885, "learning_rate": 8.852381314546215e-06, "loss": 0.040329545736312866, "memory(GiB)": 21.32, "step": 7974, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.2590715654744502, "grad_norm": 0.6785786747932434, "learning_rate": 8.852038874139514e-06, "loss": 0.03320813924074173, "memory(GiB)": 21.32, "step": 7975, "token_acc": 0.996, "train_speed(iter/s)": 0.957817 }, { "epoch": 0.2591040509372056, "grad_norm": 0.3992927074432373, "learning_rate": 8.85169638927516e-06, "loss": 0.02749333158135414, "memory(GiB)": 21.32, "step": 7976, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.25913653639996104, "grad_norm": 0.3786216080188751, "learning_rate": 8.851353859957102e-06, "loss": 0.02952348068356514, "memory(GiB)": 21.32, "step": 7977, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.25916902186271645, "grad_norm": 0.3536083996295929, "learning_rate": 8.851011286189294e-06, "loss": 0.031670838594436646, "memory(GiB)": 21.32, "step": 7978, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957871 }, { "epoch": 0.25920150732547187, "grad_norm": 0.43526723980903625, "learning_rate": 8.850668667975692e-06, "loss": 0.032254040241241455, "memory(GiB)": 21.32, "step": 7979, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.95789 }, { "epoch": 0.2592339927882273, "grad_norm": 0.43284744024276733, "learning_rate": 8.850326005320246e-06, "loss": 0.02964458242058754, "memory(GiB)": 21.32, "step": 7980, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.2592664782509827, "grad_norm": 0.5709553360939026, "learning_rate": 8.849983298226914e-06, "loss": 0.03444461524486542, "memory(GiB)": 21.32, "step": 7981, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957933 }, { "epoch": 0.2592989637137381, "grad_norm": 0.4881803095340729, "learning_rate": 8.849640546699652e-06, "loss": 0.03521104156970978, "memory(GiB)": 21.32, "step": 7982, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.25933144917649353, "grad_norm": 0.4576930105686188, "learning_rate": 8.849297750742411e-06, "loss": 0.033386290073394775, "memory(GiB)": 21.32, "step": 7983, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.25936393463924895, "grad_norm": 0.3174864649772644, "learning_rate": 8.848954910359153e-06, "loss": 0.03439965844154358, "memory(GiB)": 21.32, "step": 7984, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.25939642010200437, "grad_norm": 0.9756715893745422, "learning_rate": 8.848612025553834e-06, "loss": 0.03444856032729149, "memory(GiB)": 21.32, "step": 7985, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.2594289055647598, "grad_norm": 0.5937459468841553, "learning_rate": 8.848269096330406e-06, "loss": 0.033972419798374176, "memory(GiB)": 21.32, "step": 7986, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.958054 }, { "epoch": 0.2594613910275152, "grad_norm": 0.3836318850517273, "learning_rate": 8.84792612269283e-06, "loss": 0.028343064710497856, "memory(GiB)": 21.32, "step": 7987, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.2594938764902706, "grad_norm": 0.5457004904747009, "learning_rate": 8.84758310464507e-06, "loss": 0.03853504732251167, "memory(GiB)": 21.32, "step": 7988, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.25952636195302603, "grad_norm": 0.43579378724098206, "learning_rate": 8.847240042191077e-06, "loss": 0.03093915432691574, "memory(GiB)": 21.32, "step": 7989, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958124 }, { "epoch": 0.25955884741578145, "grad_norm": 0.577870786190033, "learning_rate": 8.846896935334813e-06, "loss": 0.03941888362169266, "memory(GiB)": 21.32, "step": 7990, "token_acc": 0.992, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.25959133287853686, "grad_norm": 0.7920264005661011, "learning_rate": 8.846553784080238e-06, "loss": 0.029914729297161102, "memory(GiB)": 21.32, "step": 7991, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958175 }, { "epoch": 0.2596238183412923, "grad_norm": 1.1011440753936768, "learning_rate": 8.846210588431313e-06, "loss": 0.04533146321773529, "memory(GiB)": 21.32, "step": 7992, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.2596563038040477, "grad_norm": 0.48966601490974426, "learning_rate": 8.845867348392e-06, "loss": 0.03488130122423172, "memory(GiB)": 21.32, "step": 7993, "token_acc": 0.9706959706959707, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.2596887892668031, "grad_norm": 0.44697874784469604, "learning_rate": 8.845524063966255e-06, "loss": 0.035046856850385666, "memory(GiB)": 21.32, "step": 7994, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958248 }, { "epoch": 0.2597212747295585, "grad_norm": 0.42379269003868103, "learning_rate": 8.845180735158047e-06, "loss": 0.038887813687324524, "memory(GiB)": 21.32, "step": 7995, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.25975376019231394, "grad_norm": 0.6693635582923889, "learning_rate": 8.844837361971334e-06, "loss": 0.0364503413438797, "memory(GiB)": 21.32, "step": 7996, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958296 }, { "epoch": 0.25978624565506936, "grad_norm": 0.5468130111694336, "learning_rate": 8.844493944410082e-06, "loss": 0.034644365310668945, "memory(GiB)": 21.32, "step": 7997, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95832 }, { "epoch": 0.2598187311178248, "grad_norm": 0.6206537485122681, "learning_rate": 8.844150482478251e-06, "loss": 0.035669632256031036, "memory(GiB)": 21.32, "step": 7998, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.2598512165805802, "grad_norm": 0.5159467458724976, "learning_rate": 8.843806976179808e-06, "loss": 0.02782025933265686, "memory(GiB)": 21.32, "step": 7999, "token_acc": 0.988, "train_speed(iter/s)": 0.95837 }, { "epoch": 0.2598837020433356, "grad_norm": 0.3974649906158447, "learning_rate": 8.843463425518717e-06, "loss": 0.035471513867378235, "memory(GiB)": 21.32, "step": 8000, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.2598837020433356, "eval_loss": 0.03521514683961868, "eval_runtime": 79.3803, "eval_samples_per_second": 125.346, "eval_steps_per_second": 3.918, "eval_token_acc": 0.9863879035621065, "step": 8000 }, { "epoch": 0.259916187506091, "grad_norm": 0.4403257668018341, "learning_rate": 8.84311983049894e-06, "loss": 0.03183763101696968, "memory(GiB)": 21.48, "step": 8001, "token_acc": 0.9859509790497056, "train_speed(iter/s)": 0.948193 }, { "epoch": 0.25994867296884644, "grad_norm": 0.32661861181259155, "learning_rate": 8.842776191124447e-06, "loss": 0.027863269671797752, "memory(GiB)": 21.48, "step": 8002, "token_acc": 0.99609375, "train_speed(iter/s)": 0.948207 }, { "epoch": 0.25998115843160186, "grad_norm": 0.5629222989082336, "learning_rate": 8.842432507399201e-06, "loss": 0.03406810760498047, "memory(GiB)": 21.48, "step": 8003, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.948224 }, { "epoch": 0.26001364389435727, "grad_norm": 0.502496063709259, "learning_rate": 8.842088779327171e-06, "loss": 0.034648068249225616, "memory(GiB)": 21.48, "step": 8004, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.94824 }, { "epoch": 0.2600461293571127, "grad_norm": 0.49032121896743774, "learning_rate": 8.841745006912323e-06, "loss": 0.03783139958977699, "memory(GiB)": 21.48, "step": 8005, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.948257 }, { "epoch": 0.2600786148198681, "grad_norm": 0.4694438874721527, "learning_rate": 8.841401190158623e-06, "loss": 0.033238254487514496, "memory(GiB)": 21.48, "step": 8006, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.948275 }, { "epoch": 0.2601111002826235, "grad_norm": 0.6942703127861023, "learning_rate": 8.84105732907004e-06, "loss": 0.04359830915927887, "memory(GiB)": 21.48, "step": 8007, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.948292 }, { "epoch": 0.26014358574537894, "grad_norm": 0.5596514344215393, "learning_rate": 8.840713423650544e-06, "loss": 0.03820443153381348, "memory(GiB)": 21.48, "step": 8008, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.948312 }, { "epoch": 0.26017607120813435, "grad_norm": 0.46695592999458313, "learning_rate": 8.840369473904102e-06, "loss": 0.04167287051677704, "memory(GiB)": 21.48, "step": 8009, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.948328 }, { "epoch": 0.26020855667088977, "grad_norm": 0.36492079496383667, "learning_rate": 8.840025479834686e-06, "loss": 0.032278500497341156, "memory(GiB)": 21.48, "step": 8010, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.948347 }, { "epoch": 0.2602410421336452, "grad_norm": 0.5079807639122009, "learning_rate": 8.839681441446265e-06, "loss": 0.03689985349774361, "memory(GiB)": 21.48, "step": 8011, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.948368 }, { "epoch": 0.2602735275964006, "grad_norm": 0.7003970742225647, "learning_rate": 8.839337358742809e-06, "loss": 0.03324631601572037, "memory(GiB)": 21.48, "step": 8012, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.948388 }, { "epoch": 0.260306013059156, "grad_norm": 0.379520058631897, "learning_rate": 8.838993231728288e-06, "loss": 0.032303910702466965, "memory(GiB)": 21.48, "step": 8013, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.948409 }, { "epoch": 0.26033849852191143, "grad_norm": 0.3841017484664917, "learning_rate": 8.838649060406678e-06, "loss": 0.03773690015077591, "memory(GiB)": 21.48, "step": 8014, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.948431 }, { "epoch": 0.26037098398466685, "grad_norm": 0.4278326630592346, "learning_rate": 8.838304844781948e-06, "loss": 0.03341358155012131, "memory(GiB)": 21.48, "step": 8015, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.948456 }, { "epoch": 0.26040346944742226, "grad_norm": 0.44465216994285583, "learning_rate": 8.837960584858069e-06, "loss": 0.03639375790953636, "memory(GiB)": 21.48, "step": 8016, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.948483 }, { "epoch": 0.2604359549101777, "grad_norm": 0.6516062617301941, "learning_rate": 8.837616280639019e-06, "loss": 0.03450276702642441, "memory(GiB)": 21.48, "step": 8017, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.948509 }, { "epoch": 0.2604684403729331, "grad_norm": 0.5220569968223572, "learning_rate": 8.837271932128769e-06, "loss": 0.03448767215013504, "memory(GiB)": 21.48, "step": 8018, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.948534 }, { "epoch": 0.2605009258356885, "grad_norm": 0.3487984240055084, "learning_rate": 8.836927539331294e-06, "loss": 0.028798287734389305, "memory(GiB)": 21.48, "step": 8019, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.948559 }, { "epoch": 0.26053341129844393, "grad_norm": 0.496913343667984, "learning_rate": 8.836583102250566e-06, "loss": 0.032995980232954025, "memory(GiB)": 21.48, "step": 8020, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.948582 }, { "epoch": 0.26056589676119934, "grad_norm": 0.6860108971595764, "learning_rate": 8.836238620890565e-06, "loss": 0.027437277138233185, "memory(GiB)": 21.48, "step": 8021, "token_acc": 0.99609375, "train_speed(iter/s)": 0.948609 }, { "epoch": 0.26059838222395476, "grad_norm": 0.5101864337921143, "learning_rate": 8.835894095255263e-06, "loss": 0.037044957280159, "memory(GiB)": 21.48, "step": 8022, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.948635 }, { "epoch": 0.2606308676867102, "grad_norm": 0.4299708306789398, "learning_rate": 8.835549525348639e-06, "loss": 0.03605974093079567, "memory(GiB)": 21.48, "step": 8023, "token_acc": 1.0, "train_speed(iter/s)": 0.948661 }, { "epoch": 0.2606633531494656, "grad_norm": 0.47861990332603455, "learning_rate": 8.835204911174667e-06, "loss": 0.032146696001291275, "memory(GiB)": 21.48, "step": 8024, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.948687 }, { "epoch": 0.260695838612221, "grad_norm": 0.39399176836013794, "learning_rate": 8.834860252737325e-06, "loss": 0.03079240396618843, "memory(GiB)": 21.48, "step": 8025, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.948712 }, { "epoch": 0.2607283240749764, "grad_norm": 0.4686785042285919, "learning_rate": 8.834515550040593e-06, "loss": 0.033460021018981934, "memory(GiB)": 21.48, "step": 8026, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.948738 }, { "epoch": 0.26076080953773184, "grad_norm": 0.536913275718689, "learning_rate": 8.834170803088446e-06, "loss": 0.028751902282238007, "memory(GiB)": 21.48, "step": 8027, "token_acc": 1.0, "train_speed(iter/s)": 0.948764 }, { "epoch": 0.26079329500048726, "grad_norm": 0.542641282081604, "learning_rate": 8.833826011884868e-06, "loss": 0.039278216660022736, "memory(GiB)": 21.48, "step": 8028, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.948791 }, { "epoch": 0.2608257804632427, "grad_norm": 0.691914975643158, "learning_rate": 8.833481176433833e-06, "loss": 0.03481326624751091, "memory(GiB)": 21.48, "step": 8029, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.948817 }, { "epoch": 0.2608582659259981, "grad_norm": 0.6112751364707947, "learning_rate": 8.833136296739321e-06, "loss": 0.03519468754529953, "memory(GiB)": 21.48, "step": 8030, "token_acc": 0.996, "train_speed(iter/s)": 0.948843 }, { "epoch": 0.2608907513887535, "grad_norm": 0.5199449062347412, "learning_rate": 8.832791372805317e-06, "loss": 0.03465732932090759, "memory(GiB)": 21.48, "step": 8031, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.948869 }, { "epoch": 0.260923236851509, "grad_norm": 1.12216317653656, "learning_rate": 8.832446404635799e-06, "loss": 0.02743689902126789, "memory(GiB)": 21.48, "step": 8032, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.948895 }, { "epoch": 0.2609557223142644, "grad_norm": 0.5334284901618958, "learning_rate": 8.832101392234746e-06, "loss": 0.039157185703516006, "memory(GiB)": 21.48, "step": 8033, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.94892 }, { "epoch": 0.2609882077770198, "grad_norm": 0.6014745831489563, "learning_rate": 8.831756335606145e-06, "loss": 0.038796938955783844, "memory(GiB)": 21.48, "step": 8034, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.948946 }, { "epoch": 0.2610206932397752, "grad_norm": 0.4467153549194336, "learning_rate": 8.831411234753975e-06, "loss": 0.02577715739607811, "memory(GiB)": 21.48, "step": 8035, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.948972 }, { "epoch": 0.26105317870253064, "grad_norm": 0.5090304017066956, "learning_rate": 8.831066089682222e-06, "loss": 0.046837370842695236, "memory(GiB)": 21.48, "step": 8036, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.948998 }, { "epoch": 0.26108566416528606, "grad_norm": 0.43455877900123596, "learning_rate": 8.830720900394862e-06, "loss": 0.032229699194431305, "memory(GiB)": 21.48, "step": 8037, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.949024 }, { "epoch": 0.2611181496280415, "grad_norm": 0.6110352277755737, "learning_rate": 8.830375666895888e-06, "loss": 0.04301264509558678, "memory(GiB)": 21.48, "step": 8038, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.94905 }, { "epoch": 0.2611506350907969, "grad_norm": 0.7229561805725098, "learning_rate": 8.83003038918928e-06, "loss": 0.046882327646017075, "memory(GiB)": 21.48, "step": 8039, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.949075 }, { "epoch": 0.2611831205535523, "grad_norm": 0.5351254940032959, "learning_rate": 8.829685067279022e-06, "loss": 0.04037877917289734, "memory(GiB)": 21.48, "step": 8040, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.949102 }, { "epoch": 0.2612156060163077, "grad_norm": 0.5399956703186035, "learning_rate": 8.829339701169104e-06, "loss": 0.037691205739974976, "memory(GiB)": 21.48, "step": 8041, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.949127 }, { "epoch": 0.26124809147906314, "grad_norm": 0.4785962998867035, "learning_rate": 8.828994290863506e-06, "loss": 0.03817087784409523, "memory(GiB)": 21.48, "step": 8042, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.949153 }, { "epoch": 0.26128057694181855, "grad_norm": 0.6566441655158997, "learning_rate": 8.828648836366217e-06, "loss": 0.0338188111782074, "memory(GiB)": 21.48, "step": 8043, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.949179 }, { "epoch": 0.26131306240457397, "grad_norm": 0.3534291386604309, "learning_rate": 8.828303337681227e-06, "loss": 0.03078576736152172, "memory(GiB)": 21.48, "step": 8044, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.949204 }, { "epoch": 0.2613455478673294, "grad_norm": 0.515792727470398, "learning_rate": 8.82795779481252e-06, "loss": 0.03260939195752144, "memory(GiB)": 21.48, "step": 8045, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.949227 }, { "epoch": 0.2613780333300848, "grad_norm": 0.43277111649513245, "learning_rate": 8.827612207764086e-06, "loss": 0.03101933002471924, "memory(GiB)": 21.48, "step": 8046, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.949253 }, { "epoch": 0.2614105187928402, "grad_norm": 0.4235237240791321, "learning_rate": 8.827266576539912e-06, "loss": 0.03054249845445156, "memory(GiB)": 21.48, "step": 8047, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.949277 }, { "epoch": 0.26144300425559563, "grad_norm": 0.28667378425598145, "learning_rate": 8.826920901143987e-06, "loss": 0.024779265746474266, "memory(GiB)": 21.48, "step": 8048, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.949298 }, { "epoch": 0.26147548971835105, "grad_norm": 0.4764167070388794, "learning_rate": 8.826575181580302e-06, "loss": 0.027835553511977196, "memory(GiB)": 21.48, "step": 8049, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.949316 }, { "epoch": 0.26150797518110647, "grad_norm": 0.5493786334991455, "learning_rate": 8.826229417852847e-06, "loss": 0.037620678544044495, "memory(GiB)": 21.48, "step": 8050, "token_acc": 1.0, "train_speed(iter/s)": 0.949337 }, { "epoch": 0.2615404606438619, "grad_norm": 0.511838972568512, "learning_rate": 8.825883609965611e-06, "loss": 0.04135856777429581, "memory(GiB)": 21.48, "step": 8051, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.949358 }, { "epoch": 0.2615729461066173, "grad_norm": 0.90069180727005, "learning_rate": 8.825537757922584e-06, "loss": 0.04775720089673996, "memory(GiB)": 21.48, "step": 8052, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.949378 }, { "epoch": 0.2616054315693727, "grad_norm": 0.36036065220832825, "learning_rate": 8.825191861727763e-06, "loss": 0.03743494302034378, "memory(GiB)": 21.48, "step": 8053, "token_acc": 0.984, "train_speed(iter/s)": 0.949396 }, { "epoch": 0.26163791703212813, "grad_norm": 0.43814077973365784, "learning_rate": 8.824845921385134e-06, "loss": 0.038160745054483414, "memory(GiB)": 21.48, "step": 8054, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.949417 }, { "epoch": 0.26167040249488355, "grad_norm": 0.5192458033561707, "learning_rate": 8.824499936898693e-06, "loss": 0.04671010375022888, "memory(GiB)": 21.48, "step": 8055, "token_acc": 0.9765886287625418, "train_speed(iter/s)": 0.949434 }, { "epoch": 0.26170288795763896, "grad_norm": 0.6116575598716736, "learning_rate": 8.824153908272432e-06, "loss": 0.03791888803243637, "memory(GiB)": 21.48, "step": 8056, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.94945 }, { "epoch": 0.2617353734203944, "grad_norm": 0.43680500984191895, "learning_rate": 8.823807835510346e-06, "loss": 0.04082246124744415, "memory(GiB)": 21.48, "step": 8057, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.949469 }, { "epoch": 0.2617678588831498, "grad_norm": 0.478768527507782, "learning_rate": 8.823461718616426e-06, "loss": 0.036946527659893036, "memory(GiB)": 21.48, "step": 8058, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.949486 }, { "epoch": 0.2618003443459052, "grad_norm": 0.9479082822799683, "learning_rate": 8.82311555759467e-06, "loss": 0.03496623784303665, "memory(GiB)": 21.48, "step": 8059, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.949504 }, { "epoch": 0.2618328298086606, "grad_norm": 0.5623651742935181, "learning_rate": 8.822769352449073e-06, "loss": 0.02819807082414627, "memory(GiB)": 21.48, "step": 8060, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.94952 }, { "epoch": 0.26186531527141604, "grad_norm": 0.3585762083530426, "learning_rate": 8.822423103183627e-06, "loss": 0.025916285812854767, "memory(GiB)": 21.48, "step": 8061, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.949537 }, { "epoch": 0.26189780073417146, "grad_norm": 0.3732359707355499, "learning_rate": 8.822076809802333e-06, "loss": 0.030540013685822487, "memory(GiB)": 21.48, "step": 8062, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.949555 }, { "epoch": 0.2619302861969269, "grad_norm": 0.4954870045185089, "learning_rate": 8.821730472309184e-06, "loss": 0.036175426095724106, "memory(GiB)": 21.48, "step": 8063, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.949573 }, { "epoch": 0.2619627716596823, "grad_norm": 0.3936596214771271, "learning_rate": 8.82138409070818e-06, "loss": 0.029889915138483047, "memory(GiB)": 21.48, "step": 8064, "token_acc": 1.0, "train_speed(iter/s)": 0.949591 }, { "epoch": 0.2619952571224377, "grad_norm": 1.050889253616333, "learning_rate": 8.821037665003317e-06, "loss": 0.038687873631715775, "memory(GiB)": 21.48, "step": 8065, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.949608 }, { "epoch": 0.2620277425851931, "grad_norm": 0.4222487211227417, "learning_rate": 8.820691195198594e-06, "loss": 0.034229815006256104, "memory(GiB)": 21.48, "step": 8066, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.949595 }, { "epoch": 0.26206022804794854, "grad_norm": 0.7129313349723816, "learning_rate": 8.820344681298008e-06, "loss": 0.031053822487592697, "memory(GiB)": 21.48, "step": 8067, "token_acc": 0.9825783972125436, "train_speed(iter/s)": 0.949611 }, { "epoch": 0.26209271351070396, "grad_norm": 0.5351874232292175, "learning_rate": 8.819998123305559e-06, "loss": 0.038738176226615906, "memory(GiB)": 21.48, "step": 8068, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.949628 }, { "epoch": 0.2621251989734594, "grad_norm": 0.4355650544166565, "learning_rate": 8.81965152122525e-06, "loss": 0.0410085991024971, "memory(GiB)": 21.48, "step": 8069, "token_acc": 0.984, "train_speed(iter/s)": 0.949647 }, { "epoch": 0.2621576844362148, "grad_norm": 0.6066824197769165, "learning_rate": 8.819304875061076e-06, "loss": 0.03184884041547775, "memory(GiB)": 21.48, "step": 8070, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.949665 }, { "epoch": 0.2621901698989702, "grad_norm": 0.5047786831855774, "learning_rate": 8.818958184817041e-06, "loss": 0.03693278878927231, "memory(GiB)": 21.48, "step": 8071, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.949683 }, { "epoch": 0.2622226553617256, "grad_norm": 0.37309402227401733, "learning_rate": 8.818611450497147e-06, "loss": 0.03121039643883705, "memory(GiB)": 21.48, "step": 8072, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.9497 }, { "epoch": 0.26225514082448104, "grad_norm": 0.41960710287094116, "learning_rate": 8.818264672105392e-06, "loss": 0.03741316869854927, "memory(GiB)": 21.48, "step": 8073, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.949717 }, { "epoch": 0.26228762628723645, "grad_norm": 1.3064273595809937, "learning_rate": 8.817917849645782e-06, "loss": 0.028122585266828537, "memory(GiB)": 21.48, "step": 8074, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.949734 }, { "epoch": 0.26232011174999187, "grad_norm": 0.4063274562358856, "learning_rate": 8.817570983122317e-06, "loss": 0.03321348875761032, "memory(GiB)": 21.48, "step": 8075, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.949752 }, { "epoch": 0.2623525972127473, "grad_norm": 0.49505114555358887, "learning_rate": 8.817224072539004e-06, "loss": 0.038164637982845306, "memory(GiB)": 21.48, "step": 8076, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.94977 }, { "epoch": 0.2623850826755027, "grad_norm": 0.8467214107513428, "learning_rate": 8.816877117899843e-06, "loss": 0.035254884511232376, "memory(GiB)": 21.48, "step": 8077, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.949787 }, { "epoch": 0.2624175681382581, "grad_norm": 0.8881524801254272, "learning_rate": 8.81653011920884e-06, "loss": 0.03192557394504547, "memory(GiB)": 21.48, "step": 8078, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.949804 }, { "epoch": 0.26245005360101353, "grad_norm": 0.39385986328125, "learning_rate": 8.816183076469998e-06, "loss": 0.029133368283510208, "memory(GiB)": 21.48, "step": 8079, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.949824 }, { "epoch": 0.26248253906376895, "grad_norm": 0.4171226918697357, "learning_rate": 8.815835989687326e-06, "loss": 0.03962136059999466, "memory(GiB)": 21.48, "step": 8080, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.949845 }, { "epoch": 0.26251502452652437, "grad_norm": 1.0852158069610596, "learning_rate": 8.815488858864826e-06, "loss": 0.055560722947120667, "memory(GiB)": 21.48, "step": 8081, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.949865 }, { "epoch": 0.2625475099892798, "grad_norm": 0.3775441646575928, "learning_rate": 8.81514168400651e-06, "loss": 0.02720733731985092, "memory(GiB)": 21.48, "step": 8082, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.949886 }, { "epoch": 0.2625799954520352, "grad_norm": 0.5547414422035217, "learning_rate": 8.814794465116377e-06, "loss": 0.03550013527274132, "memory(GiB)": 21.48, "step": 8083, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.949907 }, { "epoch": 0.2626124809147906, "grad_norm": 0.4555668830871582, "learning_rate": 8.81444720219844e-06, "loss": 0.03135760501027107, "memory(GiB)": 21.48, "step": 8084, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.949927 }, { "epoch": 0.26264496637754603, "grad_norm": 0.7090399265289307, "learning_rate": 8.814099895256705e-06, "loss": 0.03607052564620972, "memory(GiB)": 21.48, "step": 8085, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.949948 }, { "epoch": 0.26267745184030145, "grad_norm": 0.4771921932697296, "learning_rate": 8.813752544295182e-06, "loss": 0.03455290198326111, "memory(GiB)": 21.48, "step": 8086, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949967 }, { "epoch": 0.26270993730305686, "grad_norm": 0.5791275501251221, "learning_rate": 8.813405149317876e-06, "loss": 0.038623057305812836, "memory(GiB)": 21.48, "step": 8087, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.949987 }, { "epoch": 0.2627424227658123, "grad_norm": 0.45149925351142883, "learning_rate": 8.8130577103288e-06, "loss": 0.04072710871696472, "memory(GiB)": 21.48, "step": 8088, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.95001 }, { "epoch": 0.2627749082285677, "grad_norm": 0.5460553169250488, "learning_rate": 8.812710227331964e-06, "loss": 0.03527925908565521, "memory(GiB)": 21.48, "step": 8089, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.950036 }, { "epoch": 0.2628073936913231, "grad_norm": 0.5290070176124573, "learning_rate": 8.812362700331376e-06, "loss": 0.0313621461391449, "memory(GiB)": 21.48, "step": 8090, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.950062 }, { "epoch": 0.2628398791540785, "grad_norm": 0.470454603433609, "learning_rate": 8.812015129331049e-06, "loss": 0.037843525409698486, "memory(GiB)": 21.48, "step": 8091, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.950088 }, { "epoch": 0.26287236461683394, "grad_norm": 0.36665216088294983, "learning_rate": 8.811667514334994e-06, "loss": 0.034277528524398804, "memory(GiB)": 21.48, "step": 8092, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.950114 }, { "epoch": 0.26290485007958936, "grad_norm": 0.42401984333992004, "learning_rate": 8.811319855347221e-06, "loss": 0.03202388435602188, "memory(GiB)": 21.48, "step": 8093, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.95014 }, { "epoch": 0.2629373355423448, "grad_norm": 0.3587397336959839, "learning_rate": 8.810972152371745e-06, "loss": 0.03393419086933136, "memory(GiB)": 21.48, "step": 8094, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.950165 }, { "epoch": 0.2629698210051002, "grad_norm": 0.478841096162796, "learning_rate": 8.810624405412578e-06, "loss": 0.029573187232017517, "memory(GiB)": 21.48, "step": 8095, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.950191 }, { "epoch": 0.26300230646785566, "grad_norm": 0.5093172788619995, "learning_rate": 8.810276614473733e-06, "loss": 0.033310666680336, "memory(GiB)": 21.48, "step": 8096, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.950216 }, { "epoch": 0.2630347919306111, "grad_norm": 0.5495963096618652, "learning_rate": 8.809928779559224e-06, "loss": 0.03832346573472023, "memory(GiB)": 21.48, "step": 8097, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95024 }, { "epoch": 0.2630672773933665, "grad_norm": 0.6420137286186218, "learning_rate": 8.809580900673067e-06, "loss": 0.03457754850387573, "memory(GiB)": 21.48, "step": 8098, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.950266 }, { "epoch": 0.2630997628561219, "grad_norm": 0.553866446018219, "learning_rate": 8.809232977819276e-06, "loss": 0.0362437441945076, "memory(GiB)": 21.48, "step": 8099, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95029 }, { "epoch": 0.2631322483188773, "grad_norm": 0.47919631004333496, "learning_rate": 8.808885011001864e-06, "loss": 0.027071483433246613, "memory(GiB)": 21.48, "step": 8100, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.950317 }, { "epoch": 0.26316473378163274, "grad_norm": 0.4252931773662567, "learning_rate": 8.808537000224853e-06, "loss": 0.02319580502808094, "memory(GiB)": 21.48, "step": 8101, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950342 }, { "epoch": 0.26319721924438816, "grad_norm": 0.4657873809337616, "learning_rate": 8.808188945492254e-06, "loss": 0.03473764657974243, "memory(GiB)": 21.48, "step": 8102, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.950367 }, { "epoch": 0.2632297047071436, "grad_norm": 0.6019977927207947, "learning_rate": 8.807840846808084e-06, "loss": 0.03126274794340134, "memory(GiB)": 21.48, "step": 8103, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.950394 }, { "epoch": 0.263262190169899, "grad_norm": 0.6877776980400085, "learning_rate": 8.807492704176366e-06, "loss": 0.04190199077129364, "memory(GiB)": 21.48, "step": 8104, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.950419 }, { "epoch": 0.2632946756326544, "grad_norm": 0.35465094447135925, "learning_rate": 8.807144517601112e-06, "loss": 0.027102572843432426, "memory(GiB)": 21.48, "step": 8105, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950446 }, { "epoch": 0.2633271610954098, "grad_norm": 1.037955403327942, "learning_rate": 8.806796287086343e-06, "loss": 0.04399736225605011, "memory(GiB)": 21.48, "step": 8106, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.950472 }, { "epoch": 0.26335964655816524, "grad_norm": 0.3820883631706238, "learning_rate": 8.80644801263608e-06, "loss": 0.03850703686475754, "memory(GiB)": 21.48, "step": 8107, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.950497 }, { "epoch": 0.26339213202092066, "grad_norm": 0.6012949347496033, "learning_rate": 8.806099694254339e-06, "loss": 0.033739957958459854, "memory(GiB)": 21.48, "step": 8108, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.950522 }, { "epoch": 0.26342461748367607, "grad_norm": 0.42284366488456726, "learning_rate": 8.805751331945142e-06, "loss": 0.028653720393776894, "memory(GiB)": 21.48, "step": 8109, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950548 }, { "epoch": 0.2634571029464315, "grad_norm": 0.6039713025093079, "learning_rate": 8.805402925712507e-06, "loss": 0.03635630011558533, "memory(GiB)": 21.48, "step": 8110, "token_acc": 0.9728682170542635, "train_speed(iter/s)": 0.950574 }, { "epoch": 0.2634895884091869, "grad_norm": 1.6944668292999268, "learning_rate": 8.805054475560459e-06, "loss": 0.03726818785071373, "memory(GiB)": 21.48, "step": 8111, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.950601 }, { "epoch": 0.2635220738719423, "grad_norm": 0.5399768948554993, "learning_rate": 8.80470598149302e-06, "loss": 0.03888736292719841, "memory(GiB)": 21.48, "step": 8112, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.950627 }, { "epoch": 0.26355455933469774, "grad_norm": 0.5149738788604736, "learning_rate": 8.804357443514207e-06, "loss": 0.03656592220067978, "memory(GiB)": 21.48, "step": 8113, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.950651 }, { "epoch": 0.26358704479745315, "grad_norm": 0.9454123377799988, "learning_rate": 8.804008861628046e-06, "loss": 0.028376072645187378, "memory(GiB)": 21.48, "step": 8114, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.950673 }, { "epoch": 0.26361953026020857, "grad_norm": 0.6124163269996643, "learning_rate": 8.803660235838561e-06, "loss": 0.03774293139576912, "memory(GiB)": 21.48, "step": 8115, "token_acc": 0.975, "train_speed(iter/s)": 0.950698 }, { "epoch": 0.263652015722964, "grad_norm": 0.6488702297210693, "learning_rate": 8.803311566149774e-06, "loss": 0.04003900662064552, "memory(GiB)": 21.48, "step": 8116, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.950726 }, { "epoch": 0.2636845011857194, "grad_norm": 0.5527524352073669, "learning_rate": 8.802962852565708e-06, "loss": 0.03265538811683655, "memory(GiB)": 21.48, "step": 8117, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.950753 }, { "epoch": 0.2637169866484748, "grad_norm": 0.6186864376068115, "learning_rate": 8.80261409509039e-06, "loss": 0.03507312387228012, "memory(GiB)": 21.48, "step": 8118, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.950778 }, { "epoch": 0.26374947211123023, "grad_norm": 0.412131667137146, "learning_rate": 8.802265293727842e-06, "loss": 0.025054894387722015, "memory(GiB)": 21.48, "step": 8119, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950798 }, { "epoch": 0.26378195757398565, "grad_norm": 0.4754185676574707, "learning_rate": 8.801916448482095e-06, "loss": 0.03623843193054199, "memory(GiB)": 21.48, "step": 8120, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.950819 }, { "epoch": 0.26381444303674106, "grad_norm": 2.1173582077026367, "learning_rate": 8.80156755935717e-06, "loss": 0.03227470442652702, "memory(GiB)": 21.48, "step": 8121, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.95084 }, { "epoch": 0.2638469284994965, "grad_norm": 0.8015904426574707, "learning_rate": 8.801218626357096e-06, "loss": 0.04551707208156586, "memory(GiB)": 21.48, "step": 8122, "token_acc": 0.9756944444444444, "train_speed(iter/s)": 0.950862 }, { "epoch": 0.2638794139622519, "grad_norm": 0.3679341673851013, "learning_rate": 8.8008696494859e-06, "loss": 0.026797547936439514, "memory(GiB)": 21.48, "step": 8123, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.950882 }, { "epoch": 0.2639118994250073, "grad_norm": 0.4495088756084442, "learning_rate": 8.800520628747608e-06, "loss": 0.02918470837175846, "memory(GiB)": 21.48, "step": 8124, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.950904 }, { "epoch": 0.26394438488776273, "grad_norm": 0.8869497776031494, "learning_rate": 8.80017156414625e-06, "loss": 0.045096397399902344, "memory(GiB)": 21.48, "step": 8125, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.950923 }, { "epoch": 0.26397687035051814, "grad_norm": 0.560481607913971, "learning_rate": 8.799822455685855e-06, "loss": 0.04254569485783577, "memory(GiB)": 21.48, "step": 8126, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.950943 }, { "epoch": 0.26400935581327356, "grad_norm": 0.4403550326824188, "learning_rate": 8.79947330337045e-06, "loss": 0.03584829717874527, "memory(GiB)": 21.48, "step": 8127, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.950962 }, { "epoch": 0.264041841276029, "grad_norm": 0.5738806128501892, "learning_rate": 8.799124107204067e-06, "loss": 0.03052489086985588, "memory(GiB)": 21.48, "step": 8128, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.950982 }, { "epoch": 0.2640743267387844, "grad_norm": 0.5530945658683777, "learning_rate": 8.798774867190734e-06, "loss": 0.030483214184641838, "memory(GiB)": 21.48, "step": 8129, "token_acc": 0.984, "train_speed(iter/s)": 0.950999 }, { "epoch": 0.2641068122015398, "grad_norm": 0.5854731798171997, "learning_rate": 8.798425583334484e-06, "loss": 0.03308561071753502, "memory(GiB)": 21.48, "step": 8130, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.951017 }, { "epoch": 0.2641392976642952, "grad_norm": 0.4302821457386017, "learning_rate": 8.798076255639347e-06, "loss": 0.03907036781311035, "memory(GiB)": 21.48, "step": 8131, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.951035 }, { "epoch": 0.26417178312705064, "grad_norm": 0.5046969056129456, "learning_rate": 8.797726884109355e-06, "loss": 0.031971439719200134, "memory(GiB)": 21.48, "step": 8132, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.951051 }, { "epoch": 0.26420426858980606, "grad_norm": 0.5293745994567871, "learning_rate": 8.797377468748538e-06, "loss": 0.03468594700098038, "memory(GiB)": 21.48, "step": 8133, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.951067 }, { "epoch": 0.2642367540525615, "grad_norm": 0.5905001163482666, "learning_rate": 8.797028009560932e-06, "loss": 0.04672890901565552, "memory(GiB)": 21.48, "step": 8134, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.951084 }, { "epoch": 0.2642692395153169, "grad_norm": 0.4910118877887726, "learning_rate": 8.79667850655057e-06, "loss": 0.038377076387405396, "memory(GiB)": 21.48, "step": 8135, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.951103 }, { "epoch": 0.2643017249780723, "grad_norm": 0.5488280057907104, "learning_rate": 8.796328959721485e-06, "loss": 0.032120559364557266, "memory(GiB)": 21.48, "step": 8136, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.951121 }, { "epoch": 0.2643342104408277, "grad_norm": 0.5153383016586304, "learning_rate": 8.79597936907771e-06, "loss": 0.03647406026721001, "memory(GiB)": 21.48, "step": 8137, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951137 }, { "epoch": 0.26436669590358314, "grad_norm": 0.41375139355659485, "learning_rate": 8.79562973462328e-06, "loss": 0.032801881432533264, "memory(GiB)": 21.48, "step": 8138, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.951152 }, { "epoch": 0.26439918136633855, "grad_norm": 0.458268404006958, "learning_rate": 8.79528005636223e-06, "loss": 0.036229658871889114, "memory(GiB)": 21.48, "step": 8139, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.951169 }, { "epoch": 0.26443166682909397, "grad_norm": 0.45599836111068726, "learning_rate": 8.7949303342986e-06, "loss": 0.03576155751943588, "memory(GiB)": 21.48, "step": 8140, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.951187 }, { "epoch": 0.2644641522918494, "grad_norm": 0.37844744324684143, "learning_rate": 8.794580568436419e-06, "loss": 0.03263440355658531, "memory(GiB)": 21.48, "step": 8141, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.951205 }, { "epoch": 0.2644966377546048, "grad_norm": 0.560199499130249, "learning_rate": 8.794230758779729e-06, "loss": 0.03580779954791069, "memory(GiB)": 21.48, "step": 8142, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.95122 }, { "epoch": 0.2645291232173602, "grad_norm": 0.37212109565734863, "learning_rate": 8.793880905332567e-06, "loss": 0.04203891009092331, "memory(GiB)": 21.48, "step": 8143, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.951238 }, { "epoch": 0.26456160868011563, "grad_norm": 0.44887590408325195, "learning_rate": 8.79353100809897e-06, "loss": 0.034291092306375504, "memory(GiB)": 21.48, "step": 8144, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.951259 }, { "epoch": 0.26459409414287105, "grad_norm": 0.6632139682769775, "learning_rate": 8.793181067082974e-06, "loss": 0.03929674252867699, "memory(GiB)": 21.48, "step": 8145, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.951279 }, { "epoch": 0.26462657960562647, "grad_norm": 0.47091367840766907, "learning_rate": 8.792831082288622e-06, "loss": 0.044474080204963684, "memory(GiB)": 21.48, "step": 8146, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.9513 }, { "epoch": 0.2646590650683819, "grad_norm": 0.440033495426178, "learning_rate": 8.79248105371995e-06, "loss": 0.03901256248354912, "memory(GiB)": 21.48, "step": 8147, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.951318 }, { "epoch": 0.2646915505311373, "grad_norm": 0.4358580708503723, "learning_rate": 8.792130981380998e-06, "loss": 0.03875676542520523, "memory(GiB)": 21.48, "step": 8148, "token_acc": 0.9719626168224299, "train_speed(iter/s)": 0.95134 }, { "epoch": 0.2647240359938927, "grad_norm": 0.8449848890304565, "learning_rate": 8.791780865275807e-06, "loss": 0.042098622769117355, "memory(GiB)": 21.48, "step": 8149, "token_acc": 0.9771241830065359, "train_speed(iter/s)": 0.951364 }, { "epoch": 0.26475652145664813, "grad_norm": 0.36317354440689087, "learning_rate": 8.79143070540842e-06, "loss": 0.024864237755537033, "memory(GiB)": 21.48, "step": 8150, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.951388 }, { "epoch": 0.26478900691940355, "grad_norm": 0.3944564461708069, "learning_rate": 8.791080501782875e-06, "loss": 0.02707909420132637, "memory(GiB)": 21.48, "step": 8151, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.951413 }, { "epoch": 0.26482149238215896, "grad_norm": 0.49059364199638367, "learning_rate": 8.790730254403214e-06, "loss": 0.040007684379816055, "memory(GiB)": 21.48, "step": 8152, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.951438 }, { "epoch": 0.2648539778449144, "grad_norm": 0.49947649240493774, "learning_rate": 8.790379963273483e-06, "loss": 0.02857869863510132, "memory(GiB)": 21.48, "step": 8153, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.951462 }, { "epoch": 0.2648864633076698, "grad_norm": 0.4157579243183136, "learning_rate": 8.790029628397721e-06, "loss": 0.030232973396778107, "memory(GiB)": 21.48, "step": 8154, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.951488 }, { "epoch": 0.2649189487704252, "grad_norm": 0.5617002248764038, "learning_rate": 8.789679249779973e-06, "loss": 0.03745736926794052, "memory(GiB)": 21.48, "step": 8155, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.951513 }, { "epoch": 0.2649514342331806, "grad_norm": 0.43770211935043335, "learning_rate": 8.78932882742428e-06, "loss": 0.033773019909858704, "memory(GiB)": 21.48, "step": 8156, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.951538 }, { "epoch": 0.26498391969593604, "grad_norm": 0.39184555411338806, "learning_rate": 8.788978361334692e-06, "loss": 0.030575532466173172, "memory(GiB)": 21.48, "step": 8157, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.951564 }, { "epoch": 0.26501640515869146, "grad_norm": 0.8054717779159546, "learning_rate": 8.78862785151525e-06, "loss": 0.04262038692831993, "memory(GiB)": 21.48, "step": 8158, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.951589 }, { "epoch": 0.2650488906214469, "grad_norm": 0.4096052944660187, "learning_rate": 8.788277297969998e-06, "loss": 0.03041941672563553, "memory(GiB)": 21.48, "step": 8159, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.951614 }, { "epoch": 0.26508137608420235, "grad_norm": 1.139099359512329, "learning_rate": 8.787926700702985e-06, "loss": 0.038556672632694244, "memory(GiB)": 21.48, "step": 8160, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.951639 }, { "epoch": 0.26511386154695776, "grad_norm": 0.44270437955856323, "learning_rate": 8.787576059718257e-06, "loss": 0.030502863228321075, "memory(GiB)": 21.48, "step": 8161, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.951664 }, { "epoch": 0.2651463470097132, "grad_norm": 0.5318164229393005, "learning_rate": 8.78722537501986e-06, "loss": 0.033543847501277924, "memory(GiB)": 21.48, "step": 8162, "token_acc": 0.9735849056603774, "train_speed(iter/s)": 0.951689 }, { "epoch": 0.2651788324724686, "grad_norm": 0.5770193338394165, "learning_rate": 8.78687464661184e-06, "loss": 0.03730151057243347, "memory(GiB)": 21.48, "step": 8163, "token_acc": 1.0, "train_speed(iter/s)": 0.951714 }, { "epoch": 0.265211317935224, "grad_norm": 0.4551112949848175, "learning_rate": 8.786523874498246e-06, "loss": 0.0379624143242836, "memory(GiB)": 21.48, "step": 8164, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.951739 }, { "epoch": 0.2652438033979794, "grad_norm": 0.42606717348098755, "learning_rate": 8.786173058683127e-06, "loss": 0.034108396619558334, "memory(GiB)": 21.48, "step": 8165, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951764 }, { "epoch": 0.26527628886073484, "grad_norm": 0.3892662525177002, "learning_rate": 8.785822199170533e-06, "loss": 0.0307382270693779, "memory(GiB)": 21.48, "step": 8166, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.951789 }, { "epoch": 0.26530877432349026, "grad_norm": 0.5015144348144531, "learning_rate": 8.785471295964511e-06, "loss": 0.03854517638683319, "memory(GiB)": 21.48, "step": 8167, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.951815 }, { "epoch": 0.2653412597862457, "grad_norm": 0.4542641341686249, "learning_rate": 8.785120349069111e-06, "loss": 0.03402309864759445, "memory(GiB)": 21.48, "step": 8168, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.95184 }, { "epoch": 0.2653737452490011, "grad_norm": 0.3931855857372284, "learning_rate": 8.784769358488385e-06, "loss": 0.03215507045388222, "memory(GiB)": 21.48, "step": 8169, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.951865 }, { "epoch": 0.2654062307117565, "grad_norm": 0.8893682360649109, "learning_rate": 8.784418324226383e-06, "loss": 0.03666159510612488, "memory(GiB)": 21.48, "step": 8170, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.951889 }, { "epoch": 0.2654387161745119, "grad_norm": 0.5867589712142944, "learning_rate": 8.784067246287156e-06, "loss": 0.03816533088684082, "memory(GiB)": 21.48, "step": 8171, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.951913 }, { "epoch": 0.26547120163726734, "grad_norm": 0.527197539806366, "learning_rate": 8.783716124674757e-06, "loss": 0.03212089091539383, "memory(GiB)": 21.48, "step": 8172, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.951936 }, { "epoch": 0.26550368710002276, "grad_norm": 0.9629083871841431, "learning_rate": 8.783364959393238e-06, "loss": 0.025958269834518433, "memory(GiB)": 21.48, "step": 8173, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.951962 }, { "epoch": 0.26553617256277817, "grad_norm": 0.49251654744148254, "learning_rate": 8.783013750446652e-06, "loss": 0.03468279540538788, "memory(GiB)": 21.48, "step": 8174, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.951987 }, { "epoch": 0.2655686580255336, "grad_norm": 0.4107365608215332, "learning_rate": 8.78266249783905e-06, "loss": 0.03068963997066021, "memory(GiB)": 21.48, "step": 8175, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.952012 }, { "epoch": 0.265601143488289, "grad_norm": 0.7025783061981201, "learning_rate": 8.78231120157449e-06, "loss": 0.03734029084444046, "memory(GiB)": 21.48, "step": 8176, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.952037 }, { "epoch": 0.2656336289510444, "grad_norm": 0.4109097719192505, "learning_rate": 8.781959861657025e-06, "loss": 0.037500105798244476, "memory(GiB)": 21.48, "step": 8177, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.952061 }, { "epoch": 0.26566611441379984, "grad_norm": 0.4568881690502167, "learning_rate": 8.781608478090709e-06, "loss": 0.038251131772994995, "memory(GiB)": 21.48, "step": 8178, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.952086 }, { "epoch": 0.26569859987655525, "grad_norm": 0.39532938599586487, "learning_rate": 8.781257050879596e-06, "loss": 0.02768786996603012, "memory(GiB)": 21.48, "step": 8179, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.952109 }, { "epoch": 0.26573108533931067, "grad_norm": 0.4185837507247925, "learning_rate": 8.780905580027745e-06, "loss": 0.04029540717601776, "memory(GiB)": 21.48, "step": 8180, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.952135 }, { "epoch": 0.2657635708020661, "grad_norm": 0.38377368450164795, "learning_rate": 8.780554065539213e-06, "loss": 0.02724378928542137, "memory(GiB)": 21.48, "step": 8181, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.95216 }, { "epoch": 0.2657960562648215, "grad_norm": 0.5757765173912048, "learning_rate": 8.780202507418052e-06, "loss": 0.029426822438836098, "memory(GiB)": 21.48, "step": 8182, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952185 }, { "epoch": 0.2658285417275769, "grad_norm": 0.4598727226257324, "learning_rate": 8.779850905668324e-06, "loss": 0.03681630641222, "memory(GiB)": 21.48, "step": 8183, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.952209 }, { "epoch": 0.26586102719033233, "grad_norm": 0.45518675446510315, "learning_rate": 8.779499260294087e-06, "loss": 0.03163345530629158, "memory(GiB)": 21.48, "step": 8184, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.952229 }, { "epoch": 0.26589351265308775, "grad_norm": 0.49068722128868103, "learning_rate": 8.779147571299395e-06, "loss": 0.03584934026002884, "memory(GiB)": 21.48, "step": 8185, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.952248 }, { "epoch": 0.26592599811584317, "grad_norm": 0.4258521795272827, "learning_rate": 8.778795838688312e-06, "loss": 0.03723779320716858, "memory(GiB)": 21.48, "step": 8186, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.952267 }, { "epoch": 0.2659584835785986, "grad_norm": 0.5213797092437744, "learning_rate": 8.778444062464896e-06, "loss": 0.03945910930633545, "memory(GiB)": 21.48, "step": 8187, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.952286 }, { "epoch": 0.265990969041354, "grad_norm": 0.7812170386314392, "learning_rate": 8.778092242633206e-06, "loss": 0.02932729199528694, "memory(GiB)": 21.48, "step": 8188, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.952305 }, { "epoch": 0.2660234545041094, "grad_norm": 0.40206199884414673, "learning_rate": 8.777740379197301e-06, "loss": 0.025428134948015213, "memory(GiB)": 21.48, "step": 8189, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.952321 }, { "epoch": 0.26605593996686483, "grad_norm": 0.4525007903575897, "learning_rate": 8.777388472161246e-06, "loss": 0.031527772545814514, "memory(GiB)": 21.48, "step": 8190, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952338 }, { "epoch": 0.26608842542962025, "grad_norm": 0.4318752884864807, "learning_rate": 8.7770365215291e-06, "loss": 0.03133264556527138, "memory(GiB)": 21.48, "step": 8191, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.952355 }, { "epoch": 0.26612091089237566, "grad_norm": 0.5240717530250549, "learning_rate": 8.776684527304925e-06, "loss": 0.03050142154097557, "memory(GiB)": 21.48, "step": 8192, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.952372 }, { "epoch": 0.2661533963551311, "grad_norm": 0.5275457501411438, "learning_rate": 8.776332489492783e-06, "loss": 0.035960353910923004, "memory(GiB)": 21.48, "step": 8193, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.95239 }, { "epoch": 0.2661858818178865, "grad_norm": 0.5285593867301941, "learning_rate": 8.77598040809674e-06, "loss": 0.028459854423999786, "memory(GiB)": 21.48, "step": 8194, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.952406 }, { "epoch": 0.2662183672806419, "grad_norm": 0.45446860790252686, "learning_rate": 8.775628283120857e-06, "loss": 0.02968025580048561, "memory(GiB)": 21.48, "step": 8195, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.952423 }, { "epoch": 0.2662508527433973, "grad_norm": 0.6593459248542786, "learning_rate": 8.775276114569195e-06, "loss": 0.04626400023698807, "memory(GiB)": 21.48, "step": 8196, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95244 }, { "epoch": 0.26628333820615274, "grad_norm": 0.7727022171020508, "learning_rate": 8.774923902445825e-06, "loss": 0.05138808488845825, "memory(GiB)": 21.48, "step": 8197, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.952454 }, { "epoch": 0.26631582366890816, "grad_norm": 0.4010877311229706, "learning_rate": 8.774571646754807e-06, "loss": 0.03231005370616913, "memory(GiB)": 21.48, "step": 8198, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95247 }, { "epoch": 0.2663483091316636, "grad_norm": 0.38111400604248047, "learning_rate": 8.774219347500208e-06, "loss": 0.03228217735886574, "memory(GiB)": 21.48, "step": 8199, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952486 }, { "epoch": 0.266380794594419, "grad_norm": 0.4052867889404297, "learning_rate": 8.773867004686097e-06, "loss": 0.03850487247109413, "memory(GiB)": 21.48, "step": 8200, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.952503 }, { "epoch": 0.2664132800571744, "grad_norm": 0.486127108335495, "learning_rate": 8.773514618316535e-06, "loss": 0.031748875975608826, "memory(GiB)": 21.48, "step": 8201, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.95252 }, { "epoch": 0.2664457655199298, "grad_norm": 1.2999720573425293, "learning_rate": 8.773162188395591e-06, "loss": 0.052585020661354065, "memory(GiB)": 21.48, "step": 8202, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.952535 }, { "epoch": 0.26647825098268524, "grad_norm": 0.3458460569381714, "learning_rate": 8.772809714927337e-06, "loss": 0.026761911809444427, "memory(GiB)": 21.48, "step": 8203, "token_acc": 0.9875, "train_speed(iter/s)": 0.952551 }, { "epoch": 0.26651073644544065, "grad_norm": 0.40546685457229614, "learning_rate": 8.772457197915837e-06, "loss": 0.036085013300180435, "memory(GiB)": 21.48, "step": 8204, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.952568 }, { "epoch": 0.26654322190819607, "grad_norm": 0.5400688052177429, "learning_rate": 8.772104637365155e-06, "loss": 0.036417532712221146, "memory(GiB)": 21.48, "step": 8205, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.952585 }, { "epoch": 0.2665757073709515, "grad_norm": 0.5850088000297546, "learning_rate": 8.771752033279369e-06, "loss": 0.03324766457080841, "memory(GiB)": 21.48, "step": 8206, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.952602 }, { "epoch": 0.2666081928337069, "grad_norm": 0.4175582826137543, "learning_rate": 8.771399385662543e-06, "loss": 0.03186527639627457, "memory(GiB)": 21.48, "step": 8207, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.952617 }, { "epoch": 0.2666406782964623, "grad_norm": 0.803360641002655, "learning_rate": 8.771046694518748e-06, "loss": 0.03277041018009186, "memory(GiB)": 21.48, "step": 8208, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95264 }, { "epoch": 0.26667316375921774, "grad_norm": 0.5296096801757812, "learning_rate": 8.770693959852057e-06, "loss": 0.02792544476687908, "memory(GiB)": 21.48, "step": 8209, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.952664 }, { "epoch": 0.26670564922197315, "grad_norm": 0.6626173853874207, "learning_rate": 8.770341181666538e-06, "loss": 0.047996990382671356, "memory(GiB)": 21.48, "step": 8210, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.952689 }, { "epoch": 0.26673813468472857, "grad_norm": 0.5580883622169495, "learning_rate": 8.769988359966261e-06, "loss": 0.04312177002429962, "memory(GiB)": 21.48, "step": 8211, "token_acc": 1.0, "train_speed(iter/s)": 0.952706 }, { "epoch": 0.266770620147484, "grad_norm": 0.4253101944923401, "learning_rate": 8.769635494755303e-06, "loss": 0.0328814797103405, "memory(GiB)": 21.48, "step": 8212, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.952731 }, { "epoch": 0.2668031056102394, "grad_norm": 0.49952226877212524, "learning_rate": 8.769282586037732e-06, "loss": 0.03665095567703247, "memory(GiB)": 21.48, "step": 8213, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.952755 }, { "epoch": 0.2668355910729948, "grad_norm": 0.33654502034187317, "learning_rate": 8.768929633817627e-06, "loss": 0.029501624405384064, "memory(GiB)": 21.48, "step": 8214, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95278 }, { "epoch": 0.26686807653575023, "grad_norm": 0.4275246858596802, "learning_rate": 8.768576638099054e-06, "loss": 0.03249366581439972, "memory(GiB)": 21.48, "step": 8215, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.952804 }, { "epoch": 0.26690056199850565, "grad_norm": 0.4650041162967682, "learning_rate": 8.768223598886092e-06, "loss": 0.038519348949193954, "memory(GiB)": 21.48, "step": 8216, "token_acc": 0.9713114754098361, "train_speed(iter/s)": 0.952829 }, { "epoch": 0.26693304746126106, "grad_norm": 0.47548389434814453, "learning_rate": 8.767870516182814e-06, "loss": 0.0357411652803421, "memory(GiB)": 21.48, "step": 8217, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.952851 }, { "epoch": 0.2669655329240165, "grad_norm": 0.33153632283210754, "learning_rate": 8.767517389993295e-06, "loss": 0.03101803921163082, "memory(GiB)": 21.48, "step": 8218, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.2669980183867719, "grad_norm": 0.45510295033454895, "learning_rate": 8.76716422032161e-06, "loss": 0.039852168411016464, "memory(GiB)": 21.48, "step": 8219, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.952896 }, { "epoch": 0.2670305038495273, "grad_norm": 0.5767762660980225, "learning_rate": 8.766811007171837e-06, "loss": 0.04237152636051178, "memory(GiB)": 21.48, "step": 8220, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95292 }, { "epoch": 0.26706298931228273, "grad_norm": 0.5479958057403564, "learning_rate": 8.766457750548052e-06, "loss": 0.04412832111120224, "memory(GiB)": 21.48, "step": 8221, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.952943 }, { "epoch": 0.26709547477503814, "grad_norm": 0.40411025285720825, "learning_rate": 8.766104450454329e-06, "loss": 0.030332110822200775, "memory(GiB)": 21.48, "step": 8222, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.952969 }, { "epoch": 0.26712796023779356, "grad_norm": 0.459762305021286, "learning_rate": 8.76575110689475e-06, "loss": 0.033464785665273666, "memory(GiB)": 21.48, "step": 8223, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95299 }, { "epoch": 0.26716044570054903, "grad_norm": 0.4245108664035797, "learning_rate": 8.765397719873389e-06, "loss": 0.0295290257781744, "memory(GiB)": 21.48, "step": 8224, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.953015 }, { "epoch": 0.26719293116330445, "grad_norm": 0.3458883464336395, "learning_rate": 8.76504428939433e-06, "loss": 0.03576069325208664, "memory(GiB)": 21.48, "step": 8225, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.953039 }, { "epoch": 0.26722541662605986, "grad_norm": 0.3521055281162262, "learning_rate": 8.764690815461644e-06, "loss": 0.02792995423078537, "memory(GiB)": 21.48, "step": 8226, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953063 }, { "epoch": 0.2672579020888153, "grad_norm": 0.7681235074996948, "learning_rate": 8.76433729807942e-06, "loss": 0.05001988261938095, "memory(GiB)": 21.48, "step": 8227, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.953088 }, { "epoch": 0.2672903875515707, "grad_norm": 0.731968343257904, "learning_rate": 8.76398373725173e-06, "loss": 0.036949384957551956, "memory(GiB)": 21.48, "step": 8228, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953109 }, { "epoch": 0.2673228730143261, "grad_norm": 0.39487892389297485, "learning_rate": 8.763630132982659e-06, "loss": 0.0266755111515522, "memory(GiB)": 21.48, "step": 8229, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.953132 }, { "epoch": 0.26735535847708153, "grad_norm": 0.6414765119552612, "learning_rate": 8.763276485276287e-06, "loss": 0.03649959713220596, "memory(GiB)": 21.48, "step": 8230, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953156 }, { "epoch": 0.26738784393983694, "grad_norm": 0.49462658166885376, "learning_rate": 8.762922794136696e-06, "loss": 0.03547469526529312, "memory(GiB)": 21.48, "step": 8231, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.953181 }, { "epoch": 0.26742032940259236, "grad_norm": 0.3360728323459625, "learning_rate": 8.762569059567966e-06, "loss": 0.029367836192250252, "memory(GiB)": 21.48, "step": 8232, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953206 }, { "epoch": 0.2674528148653478, "grad_norm": 0.40606579184532166, "learning_rate": 8.762215281574182e-06, "loss": 0.035182978957891464, "memory(GiB)": 21.48, "step": 8233, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953231 }, { "epoch": 0.2674853003281032, "grad_norm": 0.7851589918136597, "learning_rate": 8.761861460159427e-06, "loss": 0.029225915670394897, "memory(GiB)": 21.48, "step": 8234, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.953254 }, { "epoch": 0.2675177857908586, "grad_norm": 0.6733700633049011, "learning_rate": 8.761507595327781e-06, "loss": 0.03580612689256668, "memory(GiB)": 21.48, "step": 8235, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.953277 }, { "epoch": 0.267550271253614, "grad_norm": 0.5508164763450623, "learning_rate": 8.761153687083333e-06, "loss": 0.04293288663029671, "memory(GiB)": 21.48, "step": 8236, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953303 }, { "epoch": 0.26758275671636944, "grad_norm": 0.2786422669887543, "learning_rate": 8.760799735430166e-06, "loss": 0.017845826223492622, "memory(GiB)": 21.48, "step": 8237, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953326 }, { "epoch": 0.26761524217912486, "grad_norm": 0.3704717457294464, "learning_rate": 8.760445740372363e-06, "loss": 0.030124031007289886, "memory(GiB)": 21.48, "step": 8238, "token_acc": 0.99, "train_speed(iter/s)": 0.953351 }, { "epoch": 0.2676477276418803, "grad_norm": 0.5195667743682861, "learning_rate": 8.760091701914011e-06, "loss": 0.04051894694566727, "memory(GiB)": 21.48, "step": 8239, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.953374 }, { "epoch": 0.2676802131046357, "grad_norm": 0.741119921207428, "learning_rate": 8.759737620059196e-06, "loss": 0.03469637781381607, "memory(GiB)": 21.48, "step": 8240, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.953396 }, { "epoch": 0.2677126985673911, "grad_norm": 0.5796152353286743, "learning_rate": 8.759383494812006e-06, "loss": 0.03632873296737671, "memory(GiB)": 21.48, "step": 8241, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.953419 }, { "epoch": 0.2677451840301465, "grad_norm": 0.5184596180915833, "learning_rate": 8.759029326176524e-06, "loss": 0.02601800113916397, "memory(GiB)": 21.48, "step": 8242, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.953441 }, { "epoch": 0.26777766949290194, "grad_norm": 0.3849088251590729, "learning_rate": 8.758675114156842e-06, "loss": 0.03621775656938553, "memory(GiB)": 21.48, "step": 8243, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.953465 }, { "epoch": 0.26781015495565735, "grad_norm": 0.48947349190711975, "learning_rate": 8.758320858757047e-06, "loss": 0.03468204289674759, "memory(GiB)": 21.48, "step": 8244, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.95349 }, { "epoch": 0.26784264041841277, "grad_norm": 0.5181014537811279, "learning_rate": 8.757966559981229e-06, "loss": 0.0420672781765461, "memory(GiB)": 21.48, "step": 8245, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.953513 }, { "epoch": 0.2678751258811682, "grad_norm": 0.570367157459259, "learning_rate": 8.75761221783347e-06, "loss": 0.034029580652713776, "memory(GiB)": 21.48, "step": 8246, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.953533 }, { "epoch": 0.2679076113439236, "grad_norm": 0.49375370144844055, "learning_rate": 8.757257832317868e-06, "loss": 0.035273946821689606, "memory(GiB)": 21.48, "step": 8247, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.953551 }, { "epoch": 0.267940096806679, "grad_norm": 0.28933361172676086, "learning_rate": 8.756903403438509e-06, "loss": 0.02239985391497612, "memory(GiB)": 21.48, "step": 8248, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.953569 }, { "epoch": 0.26797258226943443, "grad_norm": 0.788102388381958, "learning_rate": 8.756548931199485e-06, "loss": 0.04307762533426285, "memory(GiB)": 21.48, "step": 8249, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.26800506773218985, "grad_norm": 0.4560582637786865, "learning_rate": 8.756194415604884e-06, "loss": 0.03020494617521763, "memory(GiB)": 21.48, "step": 8250, "token_acc": 0.9967213114754099, "train_speed(iter/s)": 0.953601 }, { "epoch": 0.26803755319494527, "grad_norm": 0.45987629890441895, "learning_rate": 8.755839856658803e-06, "loss": 0.032059937715530396, "memory(GiB)": 21.48, "step": 8251, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.953612 }, { "epoch": 0.2680700386577007, "grad_norm": 0.46932071447372437, "learning_rate": 8.755485254365329e-06, "loss": 0.03419514000415802, "memory(GiB)": 21.48, "step": 8252, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.953627 }, { "epoch": 0.2681025241204561, "grad_norm": 0.5503795146942139, "learning_rate": 8.755130608728556e-06, "loss": 0.024835310876369476, "memory(GiB)": 21.48, "step": 8253, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.953643 }, { "epoch": 0.2681350095832115, "grad_norm": 0.4814335107803345, "learning_rate": 8.75477591975258e-06, "loss": 0.039326563477516174, "memory(GiB)": 21.48, "step": 8254, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953659 }, { "epoch": 0.26816749504596693, "grad_norm": 0.595320463180542, "learning_rate": 8.75442118744149e-06, "loss": 0.03082284703850746, "memory(GiB)": 21.48, "step": 8255, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953675 }, { "epoch": 0.26819998050872235, "grad_norm": 0.36453190445899963, "learning_rate": 8.754066411799384e-06, "loss": 0.029200345277786255, "memory(GiB)": 21.48, "step": 8256, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.953692 }, { "epoch": 0.26823246597147776, "grad_norm": 1.1274889707565308, "learning_rate": 8.753711592830354e-06, "loss": 0.032330818474292755, "memory(GiB)": 21.48, "step": 8257, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953708 }, { "epoch": 0.2682649514342332, "grad_norm": 0.33286744356155396, "learning_rate": 8.753356730538498e-06, "loss": 0.02939055860042572, "memory(GiB)": 21.48, "step": 8258, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953725 }, { "epoch": 0.2682974368969886, "grad_norm": 0.35884231328964233, "learning_rate": 8.75300182492791e-06, "loss": 0.02885441668331623, "memory(GiB)": 21.48, "step": 8259, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95374 }, { "epoch": 0.268329922359744, "grad_norm": 0.4965309202671051, "learning_rate": 8.752646876002681e-06, "loss": 0.04061245918273926, "memory(GiB)": 21.48, "step": 8260, "token_acc": 0.9856115107913669, "train_speed(iter/s)": 0.953755 }, { "epoch": 0.2683624078224994, "grad_norm": 0.5310170650482178, "learning_rate": 8.752291883766918e-06, "loss": 0.0494140163064003, "memory(GiB)": 21.48, "step": 8261, "token_acc": 0.99, "train_speed(iter/s)": 0.953771 }, { "epoch": 0.26839489328525484, "grad_norm": 0.38245296478271484, "learning_rate": 8.751936848224708e-06, "loss": 0.026211529970169067, "memory(GiB)": 21.48, "step": 8262, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.953787 }, { "epoch": 0.26842737874801026, "grad_norm": 0.36054110527038574, "learning_rate": 8.751581769380156e-06, "loss": 0.03836784511804581, "memory(GiB)": 21.48, "step": 8263, "token_acc": 0.9897435897435898, "train_speed(iter/s)": 0.953801 }, { "epoch": 0.2684598642107657, "grad_norm": 0.43850982189178467, "learning_rate": 8.751226647237355e-06, "loss": 0.033034756779670715, "memory(GiB)": 21.48, "step": 8264, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.953815 }, { "epoch": 0.2684923496735211, "grad_norm": 0.5154341459274292, "learning_rate": 8.750871481800404e-06, "loss": 0.04364943504333496, "memory(GiB)": 21.48, "step": 8265, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.953831 }, { "epoch": 0.2685248351362765, "grad_norm": 0.6569172739982605, "learning_rate": 8.750516273073408e-06, "loss": 0.03092459961771965, "memory(GiB)": 21.48, "step": 8266, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953848 }, { "epoch": 0.2685573205990319, "grad_norm": 0.5438829064369202, "learning_rate": 8.750161021060459e-06, "loss": 0.03970291465520859, "memory(GiB)": 21.48, "step": 8267, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.953864 }, { "epoch": 0.26858980606178734, "grad_norm": 0.3622457981109619, "learning_rate": 8.749805725765662e-06, "loss": 0.024243401363492012, "memory(GiB)": 21.48, "step": 8268, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.953884 }, { "epoch": 0.26862229152454276, "grad_norm": 0.44917944073677063, "learning_rate": 8.749450387193115e-06, "loss": 0.029377460479736328, "memory(GiB)": 21.48, "step": 8269, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953907 }, { "epoch": 0.26865477698729817, "grad_norm": 0.6846837997436523, "learning_rate": 8.749095005346921e-06, "loss": 0.048209574073553085, "memory(GiB)": 21.48, "step": 8270, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953931 }, { "epoch": 0.2686872624500536, "grad_norm": 0.5483188629150391, "learning_rate": 8.74873958023118e-06, "loss": 0.03578164428472519, "memory(GiB)": 21.48, "step": 8271, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953952 }, { "epoch": 0.268719747912809, "grad_norm": 0.3871236741542816, "learning_rate": 8.748384111849995e-06, "loss": 0.03294385224580765, "memory(GiB)": 21.48, "step": 8272, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.953974 }, { "epoch": 0.2687522333755644, "grad_norm": 0.5143577456474304, "learning_rate": 8.748028600207468e-06, "loss": 0.038295380771160126, "memory(GiB)": 21.48, "step": 8273, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.954 }, { "epoch": 0.26878471883831984, "grad_norm": 0.32397904992103577, "learning_rate": 8.747673045307703e-06, "loss": 0.026373405009508133, "memory(GiB)": 21.48, "step": 8274, "token_acc": 0.9875, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.26881720430107525, "grad_norm": 0.3802140951156616, "learning_rate": 8.747317447154802e-06, "loss": 0.02184903994202614, "memory(GiB)": 21.48, "step": 8275, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.954048 }, { "epoch": 0.26884968976383067, "grad_norm": 0.5245948433876038, "learning_rate": 8.746961805752869e-06, "loss": 0.040238041430711746, "memory(GiB)": 21.48, "step": 8276, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.954072 }, { "epoch": 0.2688821752265861, "grad_norm": 0.5186355113983154, "learning_rate": 8.746606121106011e-06, "loss": 0.019865166395902634, "memory(GiB)": 21.48, "step": 8277, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954095 }, { "epoch": 0.2689146606893415, "grad_norm": 0.5696994066238403, "learning_rate": 8.746250393218332e-06, "loss": 0.04583080857992172, "memory(GiB)": 21.48, "step": 8278, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95412 }, { "epoch": 0.2689471461520969, "grad_norm": 0.4481654465198517, "learning_rate": 8.745894622093938e-06, "loss": 0.03610093146562576, "memory(GiB)": 21.48, "step": 8279, "token_acc": 1.0, "train_speed(iter/s)": 0.954144 }, { "epoch": 0.26897963161485233, "grad_norm": 0.5229933261871338, "learning_rate": 8.745538807736932e-06, "loss": 0.03307783976197243, "memory(GiB)": 21.48, "step": 8280, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.954167 }, { "epoch": 0.26901211707760775, "grad_norm": 0.3878759443759918, "learning_rate": 8.745182950151424e-06, "loss": 0.027120135724544525, "memory(GiB)": 21.48, "step": 8281, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954192 }, { "epoch": 0.26904460254036316, "grad_norm": 1.0346059799194336, "learning_rate": 8.74482704934152e-06, "loss": 0.03758960962295532, "memory(GiB)": 21.48, "step": 8282, "token_acc": 0.9800664451827242, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.2690770880031186, "grad_norm": 0.4430210590362549, "learning_rate": 8.744471105311327e-06, "loss": 0.0312887541949749, "memory(GiB)": 21.48, "step": 8283, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954236 }, { "epoch": 0.269109573465874, "grad_norm": 0.432004451751709, "learning_rate": 8.744115118064954e-06, "loss": 0.03654935956001282, "memory(GiB)": 21.48, "step": 8284, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.954258 }, { "epoch": 0.2691420589286294, "grad_norm": 0.40468713641166687, "learning_rate": 8.743759087606509e-06, "loss": 0.03628230839967728, "memory(GiB)": 21.48, "step": 8285, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954282 }, { "epoch": 0.26917454439138483, "grad_norm": 0.429677814245224, "learning_rate": 8.743403013940102e-06, "loss": 0.029244571924209595, "memory(GiB)": 21.48, "step": 8286, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.954305 }, { "epoch": 0.26920702985414025, "grad_norm": 0.44980135560035706, "learning_rate": 8.74304689706984e-06, "loss": 0.030601024627685547, "memory(GiB)": 21.48, "step": 8287, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954329 }, { "epoch": 0.2692395153168957, "grad_norm": 0.516525149345398, "learning_rate": 8.742690736999836e-06, "loss": 0.03526098653674126, "memory(GiB)": 21.48, "step": 8288, "token_acc": 0.975, "train_speed(iter/s)": 0.954352 }, { "epoch": 0.26927200077965113, "grad_norm": 0.5733935832977295, "learning_rate": 8.742334533734199e-06, "loss": 0.034363068640232086, "memory(GiB)": 21.48, "step": 8289, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954373 }, { "epoch": 0.26930448624240655, "grad_norm": 0.9269742369651794, "learning_rate": 8.74197828727704e-06, "loss": 0.040547363460063934, "memory(GiB)": 21.48, "step": 8290, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954397 }, { "epoch": 0.26933697170516196, "grad_norm": 0.5831129550933838, "learning_rate": 8.741621997632473e-06, "loss": 0.03787195309996605, "memory(GiB)": 21.48, "step": 8291, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954422 }, { "epoch": 0.2693694571679174, "grad_norm": 0.7428010702133179, "learning_rate": 8.741265664804606e-06, "loss": 0.05150957033038139, "memory(GiB)": 21.48, "step": 8292, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.2694019426306728, "grad_norm": 0.3857182264328003, "learning_rate": 8.740909288797555e-06, "loss": 0.030604083091020584, "memory(GiB)": 21.48, "step": 8293, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.954469 }, { "epoch": 0.2694344280934282, "grad_norm": 0.3690304458141327, "learning_rate": 8.740552869615431e-06, "loss": 0.0295909084379673, "memory(GiB)": 21.48, "step": 8294, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.954493 }, { "epoch": 0.26946691355618363, "grad_norm": 0.41216787695884705, "learning_rate": 8.740196407262347e-06, "loss": 0.027466388419270515, "memory(GiB)": 21.48, "step": 8295, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954515 }, { "epoch": 0.26949939901893905, "grad_norm": 0.5365158915519714, "learning_rate": 8.73983990174242e-06, "loss": 0.033405907452106476, "memory(GiB)": 21.48, "step": 8296, "token_acc": 0.985, "train_speed(iter/s)": 0.95454 }, { "epoch": 0.26953188448169446, "grad_norm": 0.3594723343849182, "learning_rate": 8.739483353059762e-06, "loss": 0.030551254749298096, "memory(GiB)": 21.48, "step": 8297, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954564 }, { "epoch": 0.2695643699444499, "grad_norm": 0.6276281476020813, "learning_rate": 8.739126761218488e-06, "loss": 0.03544119745492935, "memory(GiB)": 21.48, "step": 8298, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954587 }, { "epoch": 0.2695968554072053, "grad_norm": 0.3765312433242798, "learning_rate": 8.738770126222717e-06, "loss": 0.03675515949726105, "memory(GiB)": 21.48, "step": 8299, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95461 }, { "epoch": 0.2696293408699607, "grad_norm": 0.4402606189250946, "learning_rate": 8.73841344807656e-06, "loss": 0.033231496810913086, "memory(GiB)": 21.48, "step": 8300, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954634 }, { "epoch": 0.2696618263327161, "grad_norm": 0.6127375364303589, "learning_rate": 8.738056726784138e-06, "loss": 0.04506535828113556, "memory(GiB)": 21.48, "step": 8301, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.954659 }, { "epoch": 0.26969431179547154, "grad_norm": 0.46844667196273804, "learning_rate": 8.737699962349564e-06, "loss": 0.03501204401254654, "memory(GiB)": 21.48, "step": 8302, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.954684 }, { "epoch": 0.26972679725822696, "grad_norm": 0.4412584602832794, "learning_rate": 8.73734315477696e-06, "loss": 0.03330201655626297, "memory(GiB)": 21.48, "step": 8303, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.954707 }, { "epoch": 0.2697592827209824, "grad_norm": 0.6123254895210266, "learning_rate": 8.73698630407044e-06, "loss": 0.029822923243045807, "memory(GiB)": 21.48, "step": 8304, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.2697917681837378, "grad_norm": 0.5240501165390015, "learning_rate": 8.736629410234123e-06, "loss": 0.032756589353084564, "memory(GiB)": 21.48, "step": 8305, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.954755 }, { "epoch": 0.2698242536464932, "grad_norm": 0.4643714129924774, "learning_rate": 8.736272473272132e-06, "loss": 0.03510244935750961, "memory(GiB)": 21.48, "step": 8306, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954779 }, { "epoch": 0.2698567391092486, "grad_norm": 0.6199677586555481, "learning_rate": 8.73591549318858e-06, "loss": 0.04469556361436844, "memory(GiB)": 21.48, "step": 8307, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.954802 }, { "epoch": 0.26988922457200404, "grad_norm": 0.692352294921875, "learning_rate": 8.735558469987595e-06, "loss": 0.03814277797937393, "memory(GiB)": 21.48, "step": 8308, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954817 }, { "epoch": 0.26992171003475945, "grad_norm": 0.3769018054008484, "learning_rate": 8.73520140367329e-06, "loss": 0.03321440517902374, "memory(GiB)": 21.48, "step": 8309, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.954832 }, { "epoch": 0.26995419549751487, "grad_norm": 0.3768179416656494, "learning_rate": 8.734844294249791e-06, "loss": 0.03127705678343773, "memory(GiB)": 21.48, "step": 8310, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.954849 }, { "epoch": 0.2699866809602703, "grad_norm": 0.3988740146160126, "learning_rate": 8.734487141721218e-06, "loss": 0.03914787992835045, "memory(GiB)": 21.48, "step": 8311, "token_acc": 0.971830985915493, "train_speed(iter/s)": 0.954864 }, { "epoch": 0.2700191664230257, "grad_norm": 0.3463554382324219, "learning_rate": 8.734129946091691e-06, "loss": 0.02749127894639969, "memory(GiB)": 21.48, "step": 8312, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.954881 }, { "epoch": 0.2700516518857811, "grad_norm": 0.36318111419677734, "learning_rate": 8.733772707365335e-06, "loss": 0.021710172295570374, "memory(GiB)": 21.48, "step": 8313, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.27008413734853653, "grad_norm": 0.6092336177825928, "learning_rate": 8.733415425546272e-06, "loss": 0.03970498964190483, "memory(GiB)": 21.48, "step": 8314, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.27011662281129195, "grad_norm": 0.41653525829315186, "learning_rate": 8.733058100638627e-06, "loss": 0.03137780725955963, "memory(GiB)": 21.48, "step": 8315, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954932 }, { "epoch": 0.27014910827404737, "grad_norm": 0.47887280583381653, "learning_rate": 8.732700732646522e-06, "loss": 0.03363010659813881, "memory(GiB)": 21.48, "step": 8316, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.2701815937368028, "grad_norm": 0.3346295654773712, "learning_rate": 8.732343321574084e-06, "loss": 0.03696388006210327, "memory(GiB)": 21.48, "step": 8317, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.954961 }, { "epoch": 0.2702140791995582, "grad_norm": 0.5566579699516296, "learning_rate": 8.731985867425434e-06, "loss": 0.03873005509376526, "memory(GiB)": 21.48, "step": 8318, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954976 }, { "epoch": 0.2702465646623136, "grad_norm": 0.4358087182044983, "learning_rate": 8.7316283702047e-06, "loss": 0.033646296709775925, "memory(GiB)": 21.48, "step": 8319, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.27027905012506903, "grad_norm": 0.4447898268699646, "learning_rate": 8.731270829916009e-06, "loss": 0.04042480140924454, "memory(GiB)": 21.48, "step": 8320, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955006 }, { "epoch": 0.27031153558782445, "grad_norm": 0.4337300658226013, "learning_rate": 8.730913246563487e-06, "loss": 0.03269314393401146, "memory(GiB)": 21.48, "step": 8321, "token_acc": 0.99, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.27034402105057986, "grad_norm": 1.8240975141525269, "learning_rate": 8.730555620151258e-06, "loss": 0.03516501188278198, "memory(GiB)": 21.48, "step": 8322, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.955032 }, { "epoch": 0.2703765065133353, "grad_norm": 0.36917099356651306, "learning_rate": 8.730197950683453e-06, "loss": 0.02926839515566826, "memory(GiB)": 21.48, "step": 8323, "token_acc": 1.0, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.2704089919760907, "grad_norm": 0.4642236530780792, "learning_rate": 8.7298402381642e-06, "loss": 0.03612954542040825, "memory(GiB)": 21.48, "step": 8324, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.95506 }, { "epoch": 0.2704414774388461, "grad_norm": 0.42699894309043884, "learning_rate": 8.729482482597624e-06, "loss": 0.02959330379962921, "memory(GiB)": 21.48, "step": 8325, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955077 }, { "epoch": 0.27047396290160153, "grad_norm": 0.5336451530456543, "learning_rate": 8.729124683987858e-06, "loss": 0.030643701553344727, "memory(GiB)": 21.48, "step": 8326, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.95509 }, { "epoch": 0.27050644836435694, "grad_norm": 0.29813092947006226, "learning_rate": 8.728766842339027e-06, "loss": 0.025874875485897064, "memory(GiB)": 21.48, "step": 8327, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.27053893382711236, "grad_norm": 0.3798421621322632, "learning_rate": 8.728408957655265e-06, "loss": 0.02807580679655075, "memory(GiB)": 21.48, "step": 8328, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.2705714192898678, "grad_norm": 0.46621808409690857, "learning_rate": 8.728051029940703e-06, "loss": 0.03844841569662094, "memory(GiB)": 21.48, "step": 8329, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.955147 }, { "epoch": 0.2706039047526232, "grad_norm": 0.35821300745010376, "learning_rate": 8.727693059199468e-06, "loss": 0.026598244905471802, "memory(GiB)": 21.48, "step": 8330, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.2706363902153786, "grad_norm": 0.3799375295639038, "learning_rate": 8.727335045435694e-06, "loss": 0.028952671214938164, "memory(GiB)": 21.48, "step": 8331, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955192 }, { "epoch": 0.270668875678134, "grad_norm": 0.8469533324241638, "learning_rate": 8.726976988653514e-06, "loss": 0.03688206151127815, "memory(GiB)": 21.48, "step": 8332, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.955215 }, { "epoch": 0.27070136114088944, "grad_norm": 0.37337741255760193, "learning_rate": 8.726618888857057e-06, "loss": 0.026734594255685806, "memory(GiB)": 21.48, "step": 8333, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95524 }, { "epoch": 0.27073384660364486, "grad_norm": 0.4778984487056732, "learning_rate": 8.726260746050459e-06, "loss": 0.03627363219857216, "memory(GiB)": 21.48, "step": 8334, "token_acc": 0.996, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.2707663320664003, "grad_norm": 0.5402439832687378, "learning_rate": 8.725902560237852e-06, "loss": 0.033452846109867096, "memory(GiB)": 21.48, "step": 8335, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.955283 }, { "epoch": 0.2707988175291557, "grad_norm": 0.47420087456703186, "learning_rate": 8.72554433142337e-06, "loss": 0.04061319679021835, "memory(GiB)": 21.48, "step": 8336, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955307 }, { "epoch": 0.2708313029919111, "grad_norm": 0.4440356194972992, "learning_rate": 8.725186059611149e-06, "loss": 0.03015545755624771, "memory(GiB)": 21.48, "step": 8337, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955331 }, { "epoch": 0.2708637884546665, "grad_norm": 0.4154447615146637, "learning_rate": 8.72482774480532e-06, "loss": 0.031225187703967094, "memory(GiB)": 21.48, "step": 8338, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.27089627391742194, "grad_norm": 0.39105433225631714, "learning_rate": 8.724469387010024e-06, "loss": 0.03663882240653038, "memory(GiB)": 21.48, "step": 8339, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.27092875938017735, "grad_norm": 0.5286718010902405, "learning_rate": 8.724110986229392e-06, "loss": 0.04533710330724716, "memory(GiB)": 21.48, "step": 8340, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.27096124484293277, "grad_norm": 0.6991996765136719, "learning_rate": 8.723752542467563e-06, "loss": 0.03280532732605934, "memory(GiB)": 21.48, "step": 8341, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.2709937303056882, "grad_norm": 0.4661482572555542, "learning_rate": 8.723394055728673e-06, "loss": 0.028722401708364487, "memory(GiB)": 21.48, "step": 8342, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955449 }, { "epoch": 0.2710262157684436, "grad_norm": 0.46116575598716736, "learning_rate": 8.72303552601686e-06, "loss": 0.03131935000419617, "memory(GiB)": 21.48, "step": 8343, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.955475 }, { "epoch": 0.271058701231199, "grad_norm": 0.6358493566513062, "learning_rate": 8.722676953336265e-06, "loss": 0.032262131571769714, "memory(GiB)": 21.48, "step": 8344, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.27109118669395443, "grad_norm": 0.39031273126602173, "learning_rate": 8.72231833769102e-06, "loss": 0.02784193679690361, "memory(GiB)": 21.48, "step": 8345, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.27112367215670985, "grad_norm": 0.3296769857406616, "learning_rate": 8.721959679085265e-06, "loss": 0.03302263468503952, "memory(GiB)": 21.48, "step": 8346, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955543 }, { "epoch": 0.27115615761946527, "grad_norm": 0.33279529213905334, "learning_rate": 8.721600977523145e-06, "loss": 0.02588600292801857, "memory(GiB)": 21.48, "step": 8347, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.2711886430822207, "grad_norm": 0.4960402250289917, "learning_rate": 8.721242233008795e-06, "loss": 0.03235402703285217, "memory(GiB)": 21.48, "step": 8348, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.2712211285449761, "grad_norm": 0.7146081924438477, "learning_rate": 8.720883445546356e-06, "loss": 0.03115636296570301, "memory(GiB)": 21.48, "step": 8349, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.2712536140077315, "grad_norm": 0.37013623118400574, "learning_rate": 8.72052461513997e-06, "loss": 0.025413459166884422, "memory(GiB)": 21.48, "step": 8350, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.27128609947048693, "grad_norm": 0.519982099533081, "learning_rate": 8.720165741793778e-06, "loss": 0.037201493978500366, "memory(GiB)": 21.48, "step": 8351, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.2713185849332424, "grad_norm": 0.3588981628417969, "learning_rate": 8.719806825511921e-06, "loss": 0.026874708011746407, "memory(GiB)": 21.48, "step": 8352, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.2713510703959978, "grad_norm": 0.48659154772758484, "learning_rate": 8.719447866298542e-06, "loss": 0.03245110064744949, "memory(GiB)": 21.48, "step": 8353, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.27138355585875323, "grad_norm": 0.4290705919265747, "learning_rate": 8.719088864157786e-06, "loss": 0.03210368752479553, "memory(GiB)": 21.48, "step": 8354, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.27141604132150865, "grad_norm": 0.6633796095848083, "learning_rate": 8.718729819093794e-06, "loss": 0.03613224998116493, "memory(GiB)": 21.48, "step": 8355, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.27144852678426407, "grad_norm": 0.3353614807128906, "learning_rate": 8.71837073111071e-06, "loss": 0.028558336198329926, "memory(GiB)": 21.48, "step": 8356, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.2714810122470195, "grad_norm": 0.5556187629699707, "learning_rate": 8.718011600212676e-06, "loss": 0.030009252950549126, "memory(GiB)": 21.48, "step": 8357, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.2715134977097749, "grad_norm": 0.5579927563667297, "learning_rate": 8.717652426403843e-06, "loss": 0.035378605127334595, "memory(GiB)": 21.48, "step": 8358, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.2715459831725303, "grad_norm": 9.27906322479248, "learning_rate": 8.717293209688352e-06, "loss": 0.030789852142333984, "memory(GiB)": 21.48, "step": 8359, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.27157846863528573, "grad_norm": 0.48791050910949707, "learning_rate": 8.71693395007035e-06, "loss": 0.028605084866285324, "memory(GiB)": 21.48, "step": 8360, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.27161095409804115, "grad_norm": 0.4087759256362915, "learning_rate": 8.716574647553982e-06, "loss": 0.03682519868016243, "memory(GiB)": 21.48, "step": 8361, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955885 }, { "epoch": 0.27164343956079656, "grad_norm": 0.576726496219635, "learning_rate": 8.716215302143395e-06, "loss": 0.029642000794410706, "memory(GiB)": 21.48, "step": 8362, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.271675925023552, "grad_norm": 0.6113746166229248, "learning_rate": 8.71585591384274e-06, "loss": 0.027496512979269028, "memory(GiB)": 21.48, "step": 8363, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.955931 }, { "epoch": 0.2717084104863074, "grad_norm": 0.3515855073928833, "learning_rate": 8.715496482656159e-06, "loss": 0.0274425707757473, "memory(GiB)": 21.48, "step": 8364, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955954 }, { "epoch": 0.2717408959490628, "grad_norm": 0.35963472723960876, "learning_rate": 8.715137008587805e-06, "loss": 0.02739863656461239, "memory(GiB)": 21.48, "step": 8365, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955978 }, { "epoch": 0.2717733814118182, "grad_norm": 0.47927114367485046, "learning_rate": 8.714777491641825e-06, "loss": 0.040286555886268616, "memory(GiB)": 21.48, "step": 8366, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.27180586687457364, "grad_norm": 0.47317901253700256, "learning_rate": 8.714417931822367e-06, "loss": 0.03693453222513199, "memory(GiB)": 21.48, "step": 8367, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.27183835233732906, "grad_norm": 0.9016436338424683, "learning_rate": 8.714058329133583e-06, "loss": 0.033044565469026566, "memory(GiB)": 21.48, "step": 8368, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956042 }, { "epoch": 0.2718708378000845, "grad_norm": 0.49931180477142334, "learning_rate": 8.713698683579623e-06, "loss": 0.032728150486946106, "memory(GiB)": 21.48, "step": 8369, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956062 }, { "epoch": 0.2719033232628399, "grad_norm": 0.463755339384079, "learning_rate": 8.713338995164635e-06, "loss": 0.03344608098268509, "memory(GiB)": 21.48, "step": 8370, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.956079 }, { "epoch": 0.2719358087255953, "grad_norm": 0.4296908676624298, "learning_rate": 8.712979263892776e-06, "loss": 0.03290310874581337, "memory(GiB)": 21.48, "step": 8371, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.956096 }, { "epoch": 0.2719682941883507, "grad_norm": 0.33219683170318604, "learning_rate": 8.712619489768191e-06, "loss": 0.027151234447956085, "memory(GiB)": 21.48, "step": 8372, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.27200077965110614, "grad_norm": 4.50819730758667, "learning_rate": 8.712259672795039e-06, "loss": 0.03448449820280075, "memory(GiB)": 21.48, "step": 8373, "token_acc": 0.970954356846473, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.27203326511386156, "grad_norm": 0.41503891348838806, "learning_rate": 8.711899812977467e-06, "loss": 0.04024891555309296, "memory(GiB)": 21.48, "step": 8374, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956145 }, { "epoch": 0.27206575057661697, "grad_norm": 0.31778767704963684, "learning_rate": 8.71153991031963e-06, "loss": 0.030259544029831886, "memory(GiB)": 21.48, "step": 8375, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956161 }, { "epoch": 0.2720982360393724, "grad_norm": 0.3881739675998688, "learning_rate": 8.711179964825684e-06, "loss": 0.034856148064136505, "memory(GiB)": 21.48, "step": 8376, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956177 }, { "epoch": 0.2721307215021278, "grad_norm": 0.45617929100990295, "learning_rate": 8.710819976499782e-06, "loss": 0.027647223323583603, "memory(GiB)": 21.48, "step": 8377, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.2721632069648832, "grad_norm": 0.4462055265903473, "learning_rate": 8.710459945346078e-06, "loss": 0.04046742990612984, "memory(GiB)": 21.48, "step": 8378, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.27219569242763864, "grad_norm": 0.5700607299804688, "learning_rate": 8.710099871368726e-06, "loss": 0.03640800341963768, "memory(GiB)": 21.48, "step": 8379, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.27222817789039405, "grad_norm": 0.9573990106582642, "learning_rate": 8.709739754571887e-06, "loss": 0.03742942214012146, "memory(GiB)": 21.48, "step": 8380, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.27226066335314947, "grad_norm": 0.49217692017555237, "learning_rate": 8.70937959495971e-06, "loss": 0.03015998937189579, "memory(GiB)": 21.48, "step": 8381, "token_acc": 1.0, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.2722931488159049, "grad_norm": 0.5782371759414673, "learning_rate": 8.709019392536358e-06, "loss": 0.03226958215236664, "memory(GiB)": 21.48, "step": 8382, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.2723256342786603, "grad_norm": 0.5510357022285461, "learning_rate": 8.708659147305982e-06, "loss": 0.04194165766239166, "memory(GiB)": 21.48, "step": 8383, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.2723581197414157, "grad_norm": 0.7692569494247437, "learning_rate": 8.708298859272745e-06, "loss": 0.040477998554706573, "memory(GiB)": 21.48, "step": 8384, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.27239060520417113, "grad_norm": 0.4483305811882019, "learning_rate": 8.707938528440805e-06, "loss": 0.03926439210772514, "memory(GiB)": 21.48, "step": 8385, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956322 }, { "epoch": 0.27242309066692655, "grad_norm": 0.5103245973587036, "learning_rate": 8.707578154814318e-06, "loss": 0.03303582966327667, "memory(GiB)": 21.48, "step": 8386, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.27245557612968196, "grad_norm": 0.398645281791687, "learning_rate": 8.707217738397445e-06, "loss": 0.02748636156320572, "memory(GiB)": 21.48, "step": 8387, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.956356 }, { "epoch": 0.2724880615924374, "grad_norm": 0.5250520706176758, "learning_rate": 8.706857279194343e-06, "loss": 0.03185313567519188, "memory(GiB)": 21.48, "step": 8388, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.2725205470551928, "grad_norm": 0.4616779088973999, "learning_rate": 8.706496777209175e-06, "loss": 0.037085987627506256, "memory(GiB)": 21.48, "step": 8389, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.956391 }, { "epoch": 0.2725530325179482, "grad_norm": 0.4512932598590851, "learning_rate": 8.706136232446101e-06, "loss": 0.026728956028819084, "memory(GiB)": 21.48, "step": 8390, "token_acc": 1.0, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.27258551798070363, "grad_norm": 0.6621303558349609, "learning_rate": 8.705775644909282e-06, "loss": 0.03274626284837723, "memory(GiB)": 21.48, "step": 8391, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956428 }, { "epoch": 0.27261800344345904, "grad_norm": 0.850857138633728, "learning_rate": 8.705415014602881e-06, "loss": 0.04167615622282028, "memory(GiB)": 21.48, "step": 8392, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956447 }, { "epoch": 0.27265048890621446, "grad_norm": 0.5657123923301697, "learning_rate": 8.705054341531057e-06, "loss": 0.03625166416168213, "memory(GiB)": 21.48, "step": 8393, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956469 }, { "epoch": 0.2726829743689699, "grad_norm": 0.5072833299636841, "learning_rate": 8.704693625697975e-06, "loss": 0.03626600652933121, "memory(GiB)": 21.48, "step": 8394, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.2727154598317253, "grad_norm": 0.5082969069480896, "learning_rate": 8.704332867107799e-06, "loss": 0.037293121218681335, "memory(GiB)": 21.48, "step": 8395, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.2727479452944807, "grad_norm": 0.5981229543685913, "learning_rate": 8.70397206576469e-06, "loss": 0.030938662588596344, "memory(GiB)": 21.48, "step": 8396, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.2727804307572361, "grad_norm": 0.4610128104686737, "learning_rate": 8.703611221672813e-06, "loss": 0.037894975394010544, "memory(GiB)": 21.48, "step": 8397, "token_acc": 1.0, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.27281291621999154, "grad_norm": 0.42907679080963135, "learning_rate": 8.703250334836334e-06, "loss": 0.027826067060232162, "memory(GiB)": 21.48, "step": 8398, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.27284540168274696, "grad_norm": 0.482679545879364, "learning_rate": 8.702889405259416e-06, "loss": 0.02606447972357273, "memory(GiB)": 21.48, "step": 8399, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.2728778871455024, "grad_norm": 0.45019927620887756, "learning_rate": 8.702528432946227e-06, "loss": 0.04012631997466087, "memory(GiB)": 21.48, "step": 8400, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.2729103726082578, "grad_norm": 0.5438134074211121, "learning_rate": 8.70216741790093e-06, "loss": 0.03086547739803791, "memory(GiB)": 21.48, "step": 8401, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.956661 }, { "epoch": 0.2729428580710132, "grad_norm": 0.5608199238777161, "learning_rate": 8.701806360127696e-06, "loss": 0.0327787771821022, "memory(GiB)": 21.48, "step": 8402, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.2729753435337686, "grad_norm": 0.8432398438453674, "learning_rate": 8.701445259630688e-06, "loss": 0.038687869906425476, "memory(GiB)": 21.48, "step": 8403, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.27300782899652404, "grad_norm": 0.5202376842498779, "learning_rate": 8.701084116414075e-06, "loss": 0.03698441758751869, "memory(GiB)": 21.48, "step": 8404, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.27304031445927945, "grad_norm": 0.7090391516685486, "learning_rate": 8.700722930482025e-06, "loss": 0.048666857182979584, "memory(GiB)": 21.48, "step": 8405, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.27307279992203487, "grad_norm": 0.6258558630943298, "learning_rate": 8.700361701838707e-06, "loss": 0.03252524137496948, "memory(GiB)": 21.48, "step": 8406, "token_acc": 0.99, "train_speed(iter/s)": 0.956778 }, { "epoch": 0.2731052853847903, "grad_norm": 1.277953028678894, "learning_rate": 8.70000043048829e-06, "loss": 0.04582330584526062, "memory(GiB)": 21.48, "step": 8407, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.2731377708475457, "grad_norm": 0.7232173681259155, "learning_rate": 8.699639116434943e-06, "loss": 0.04613470658659935, "memory(GiB)": 21.48, "step": 8408, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.2731702563103011, "grad_norm": 0.6424701809883118, "learning_rate": 8.699277759682837e-06, "loss": 0.04137898236513138, "memory(GiB)": 21.48, "step": 8409, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956849 }, { "epoch": 0.27320274177305653, "grad_norm": 0.8649361729621887, "learning_rate": 8.69891636023614e-06, "loss": 0.044217295944690704, "memory(GiB)": 21.48, "step": 8410, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.27323522723581195, "grad_norm": 0.5343859195709229, "learning_rate": 8.698554918099028e-06, "loss": 0.04889852926135063, "memory(GiB)": 21.48, "step": 8411, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.27326771269856737, "grad_norm": 0.529781699180603, "learning_rate": 8.698193433275667e-06, "loss": 0.027142951264977455, "memory(GiB)": 21.48, "step": 8412, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.2733001981613228, "grad_norm": 0.4851885437965393, "learning_rate": 8.697831905770233e-06, "loss": 0.040656678378582, "memory(GiB)": 21.48, "step": 8413, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.2733326836240782, "grad_norm": 0.4722588062286377, "learning_rate": 8.697470335586897e-06, "loss": 0.04087933152914047, "memory(GiB)": 21.48, "step": 8414, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.2733651690868336, "grad_norm": 0.31130462884902954, "learning_rate": 8.697108722729833e-06, "loss": 0.031487636268138885, "memory(GiB)": 21.48, "step": 8415, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.2733976545495891, "grad_norm": 0.7952394485473633, "learning_rate": 8.69674706720321e-06, "loss": 0.031006526201963425, "memory(GiB)": 21.48, "step": 8416, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957011 }, { "epoch": 0.2734301400123445, "grad_norm": 0.383068323135376, "learning_rate": 8.696385369011209e-06, "loss": 0.03192569315433502, "memory(GiB)": 21.48, "step": 8417, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.2734626254750999, "grad_norm": 0.5343812108039856, "learning_rate": 8.696023628158e-06, "loss": 0.03222261741757393, "memory(GiB)": 21.48, "step": 8418, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.27349511093785533, "grad_norm": 0.6329163312911987, "learning_rate": 8.695661844647757e-06, "loss": 0.05472278594970703, "memory(GiB)": 21.48, "step": 8419, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.957084 }, { "epoch": 0.27352759640061075, "grad_norm": 0.4796507656574249, "learning_rate": 8.695300018484657e-06, "loss": 0.03857757896184921, "memory(GiB)": 21.48, "step": 8420, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.957106 }, { "epoch": 0.27356008186336617, "grad_norm": 0.32449036836624146, "learning_rate": 8.694938149672881e-06, "loss": 0.02418738603591919, "memory(GiB)": 21.48, "step": 8421, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.2735925673261216, "grad_norm": 0.45758771896362305, "learning_rate": 8.694576238216596e-06, "loss": 0.03621196746826172, "memory(GiB)": 21.48, "step": 8422, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957155 }, { "epoch": 0.273625052788877, "grad_norm": 0.6833622455596924, "learning_rate": 8.694214284119986e-06, "loss": 0.039939600974321365, "memory(GiB)": 21.48, "step": 8423, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.2736575382516324, "grad_norm": 0.8099510669708252, "learning_rate": 8.693852287387225e-06, "loss": 0.040068767964839935, "memory(GiB)": 21.48, "step": 8424, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.27369002371438783, "grad_norm": 0.6194279789924622, "learning_rate": 8.693490248022496e-06, "loss": 0.03808976337313652, "memory(GiB)": 21.48, "step": 8425, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.27372250917714325, "grad_norm": 0.4992307722568512, "learning_rate": 8.693128166029969e-06, "loss": 0.0342877060174942, "memory(GiB)": 21.48, "step": 8426, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.27375499463989866, "grad_norm": 0.8247988224029541, "learning_rate": 8.692766041413829e-06, "loss": 0.03243048116564751, "memory(GiB)": 21.48, "step": 8427, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.2737874801026541, "grad_norm": 0.6318982243537903, "learning_rate": 8.692403874178252e-06, "loss": 0.04672171175479889, "memory(GiB)": 21.48, "step": 8428, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.957279 }, { "epoch": 0.2738199655654095, "grad_norm": 0.3665138781070709, "learning_rate": 8.692041664327422e-06, "loss": 0.031278595328330994, "memory(GiB)": 21.48, "step": 8429, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957295 }, { "epoch": 0.2738524510281649, "grad_norm": 0.40118664503097534, "learning_rate": 8.691679411865516e-06, "loss": 0.03757324069738388, "memory(GiB)": 21.48, "step": 8430, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.2738849364909203, "grad_norm": 0.3643995225429535, "learning_rate": 8.691317116796718e-06, "loss": 0.03093118965625763, "memory(GiB)": 21.48, "step": 8431, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957331 }, { "epoch": 0.27391742195367574, "grad_norm": 0.3599463105201721, "learning_rate": 8.690954779125206e-06, "loss": 0.03489833325147629, "memory(GiB)": 21.48, "step": 8432, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.957347 }, { "epoch": 0.27394990741643116, "grad_norm": 0.48524752259254456, "learning_rate": 8.690592398855163e-06, "loss": 0.03629118576645851, "memory(GiB)": 21.48, "step": 8433, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957364 }, { "epoch": 0.2739823928791866, "grad_norm": 0.42373892664909363, "learning_rate": 8.690229975990772e-06, "loss": 0.028855249285697937, "memory(GiB)": 21.48, "step": 8434, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.274014878341942, "grad_norm": 0.9109572172164917, "learning_rate": 8.689867510536215e-06, "loss": 0.0387677326798439, "memory(GiB)": 21.48, "step": 8435, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.2740473638046974, "grad_norm": 0.32759740948677063, "learning_rate": 8.689505002495675e-06, "loss": 0.027185825631022453, "memory(GiB)": 21.48, "step": 8436, "token_acc": 0.988, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.2740798492674528, "grad_norm": 0.33704280853271484, "learning_rate": 8.689142451873337e-06, "loss": 0.025648176670074463, "memory(GiB)": 21.48, "step": 8437, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957427 }, { "epoch": 0.27411233473020824, "grad_norm": 0.4207848906517029, "learning_rate": 8.688779858673385e-06, "loss": 0.036387957632541656, "memory(GiB)": 21.48, "step": 8438, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.27414482019296366, "grad_norm": 0.4844226837158203, "learning_rate": 8.688417222900004e-06, "loss": 0.032607194036245346, "memory(GiB)": 21.48, "step": 8439, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.2741773056557191, "grad_norm": 0.4151513874530792, "learning_rate": 8.688054544557378e-06, "loss": 0.03380496799945831, "memory(GiB)": 21.48, "step": 8440, "token_acc": 0.9678571428571429, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.2742097911184745, "grad_norm": 0.5550477504730225, "learning_rate": 8.687691823649695e-06, "loss": 0.04370858520269394, "memory(GiB)": 21.48, "step": 8441, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.2742422765812299, "grad_norm": 0.571628987789154, "learning_rate": 8.68732906018114e-06, "loss": 0.03983709588646889, "memory(GiB)": 21.48, "step": 8442, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.2742747620439853, "grad_norm": 0.33502018451690674, "learning_rate": 8.6869662541559e-06, "loss": 0.034705981612205505, "memory(GiB)": 21.48, "step": 8443, "token_acc": 0.9808429118773946, "train_speed(iter/s)": 0.957519 }, { "epoch": 0.27430724750674074, "grad_norm": 0.35975682735443115, "learning_rate": 8.68660340557816e-06, "loss": 0.03397277742624283, "memory(GiB)": 21.48, "step": 8444, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.27433973296949615, "grad_norm": 1.439946174621582, "learning_rate": 8.68624051445211e-06, "loss": 0.031126199290156364, "memory(GiB)": 21.48, "step": 8445, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957546 }, { "epoch": 0.27437221843225157, "grad_norm": 0.4426560699939728, "learning_rate": 8.685877580781942e-06, "loss": 0.0337769016623497, "memory(GiB)": 21.48, "step": 8446, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957562 }, { "epoch": 0.274404703895007, "grad_norm": 0.3458237648010254, "learning_rate": 8.685514604571838e-06, "loss": 0.025658775120973587, "memory(GiB)": 21.48, "step": 8447, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.2744371893577624, "grad_norm": 0.4547163248062134, "learning_rate": 8.685151585825991e-06, "loss": 0.042234912514686584, "memory(GiB)": 21.48, "step": 8448, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957599 }, { "epoch": 0.2744696748205178, "grad_norm": 0.2762458324432373, "learning_rate": 8.684788524548589e-06, "loss": 0.019787266850471497, "memory(GiB)": 21.48, "step": 8449, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.27450216028327323, "grad_norm": 0.5031529664993286, "learning_rate": 8.68442542074382e-06, "loss": 0.03771317005157471, "memory(GiB)": 21.48, "step": 8450, "token_acc": 0.9625, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.27453464574602865, "grad_norm": 0.32323402166366577, "learning_rate": 8.684062274415883e-06, "loss": 0.024213798344135284, "memory(GiB)": 21.48, "step": 8451, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.27456713120878407, "grad_norm": 0.3391924798488617, "learning_rate": 8.683699085568963e-06, "loss": 0.0238140057772398, "memory(GiB)": 21.48, "step": 8452, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.2745996166715395, "grad_norm": 0.6695007085800171, "learning_rate": 8.68333585420725e-06, "loss": 0.050466138869524, "memory(GiB)": 21.48, "step": 8453, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.2746321021342949, "grad_norm": 0.4410835802555084, "learning_rate": 8.68297258033494e-06, "loss": 0.03045887127518654, "memory(GiB)": 21.48, "step": 8454, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.2746645875970503, "grad_norm": 1.2738935947418213, "learning_rate": 8.682609263956224e-06, "loss": 0.03637552633881569, "memory(GiB)": 21.48, "step": 8455, "token_acc": 0.9875, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.27469707305980573, "grad_norm": 0.7348783612251282, "learning_rate": 8.682245905075297e-06, "loss": 0.03873530775308609, "memory(GiB)": 21.48, "step": 8456, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957754 }, { "epoch": 0.27472955852256115, "grad_norm": 0.4784683585166931, "learning_rate": 8.681882503696348e-06, "loss": 0.04121263325214386, "memory(GiB)": 21.48, "step": 8457, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.27476204398531656, "grad_norm": 0.4853108823299408, "learning_rate": 8.681519059823577e-06, "loss": 0.030486250296235085, "memory(GiB)": 21.48, "step": 8458, "token_acc": 0.9875, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.274794529448072, "grad_norm": 0.48838016390800476, "learning_rate": 8.681155573461174e-06, "loss": 0.033133093267679214, "memory(GiB)": 21.48, "step": 8459, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.2748270149108274, "grad_norm": 0.585012674331665, "learning_rate": 8.68079204461334e-06, "loss": 0.040411047637462616, "memory(GiB)": 21.48, "step": 8460, "token_acc": 0.995, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.2748595003735828, "grad_norm": 0.4218336343765259, "learning_rate": 8.680428473284264e-06, "loss": 0.03359653055667877, "memory(GiB)": 21.48, "step": 8461, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.2748919858363382, "grad_norm": 0.5803262591362, "learning_rate": 8.680064859478143e-06, "loss": 0.039351873099803925, "memory(GiB)": 21.48, "step": 8462, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.957895 }, { "epoch": 0.27492447129909364, "grad_norm": 0.9320659637451172, "learning_rate": 8.679701203199178e-06, "loss": 0.03153816610574722, "memory(GiB)": 21.48, "step": 8463, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.27495695676184906, "grad_norm": 0.4595158100128174, "learning_rate": 8.679337504451564e-06, "loss": 0.03825744241476059, "memory(GiB)": 21.48, "step": 8464, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.2749894422246045, "grad_norm": 0.44236257672309875, "learning_rate": 8.678973763239495e-06, "loss": 0.030782634392380714, "memory(GiB)": 21.48, "step": 8465, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.2750219276873599, "grad_norm": 0.3571862578392029, "learning_rate": 8.678609979567174e-06, "loss": 0.03459009900689125, "memory(GiB)": 21.48, "step": 8466, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.2750544131501153, "grad_norm": 0.534584105014801, "learning_rate": 8.678246153438799e-06, "loss": 0.04229750111699104, "memory(GiB)": 21.48, "step": 8467, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.2750868986128707, "grad_norm": 0.38025757670402527, "learning_rate": 8.677882284858565e-06, "loss": 0.02972877398133278, "memory(GiB)": 21.48, "step": 8468, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.27511938407562614, "grad_norm": 1.5292023420333862, "learning_rate": 8.677518373830675e-06, "loss": 0.04814697057008743, "memory(GiB)": 21.48, "step": 8469, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.27515186953838155, "grad_norm": 0.41549450159072876, "learning_rate": 8.677154420359329e-06, "loss": 0.029405340552330017, "memory(GiB)": 21.48, "step": 8470, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.27518435500113697, "grad_norm": 0.5417357683181763, "learning_rate": 8.676790424448726e-06, "loss": 0.03556803613901138, "memory(GiB)": 21.48, "step": 8471, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.2752168404638924, "grad_norm": 0.4833086133003235, "learning_rate": 8.676426386103067e-06, "loss": 0.028339311480522156, "memory(GiB)": 21.48, "step": 8472, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.2752493259266478, "grad_norm": 0.35333168506622314, "learning_rate": 8.676062305326556e-06, "loss": 0.02632918953895569, "memory(GiB)": 21.48, "step": 8473, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958149 }, { "epoch": 0.2752818113894032, "grad_norm": 4.750300407409668, "learning_rate": 8.675698182123392e-06, "loss": 0.02512645721435547, "memory(GiB)": 21.48, "step": 8474, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.27531429685215864, "grad_norm": 0.40532466769218445, "learning_rate": 8.675334016497779e-06, "loss": 0.03140542656183243, "memory(GiB)": 21.48, "step": 8475, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.27534678231491405, "grad_norm": 0.481293261051178, "learning_rate": 8.67496980845392e-06, "loss": 0.04042865335941315, "memory(GiB)": 21.48, "step": 8476, "token_acc": 0.9719298245614035, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.27537926777766947, "grad_norm": 0.4894282817840576, "learning_rate": 8.674605557996017e-06, "loss": 0.034734297543764114, "memory(GiB)": 21.48, "step": 8477, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.2754117532404249, "grad_norm": 0.3791973888874054, "learning_rate": 8.674241265128275e-06, "loss": 0.030343804508447647, "memory(GiB)": 21.48, "step": 8478, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958267 }, { "epoch": 0.2754442387031803, "grad_norm": 0.4731314778327942, "learning_rate": 8.673876929854898e-06, "loss": 0.03412407636642456, "memory(GiB)": 21.48, "step": 8479, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.95829 }, { "epoch": 0.27547672416593577, "grad_norm": 0.6305601000785828, "learning_rate": 8.673512552180093e-06, "loss": 0.038006510585546494, "memory(GiB)": 21.48, "step": 8480, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.2755092096286912, "grad_norm": 0.5195584297180176, "learning_rate": 8.673148132108062e-06, "loss": 0.03520198166370392, "memory(GiB)": 21.48, "step": 8481, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.2755416950914466, "grad_norm": 0.48377326130867004, "learning_rate": 8.672783669643014e-06, "loss": 0.034922678023576736, "memory(GiB)": 21.48, "step": 8482, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.275574180554202, "grad_norm": 0.6250095367431641, "learning_rate": 8.672419164789154e-06, "loss": 0.040224283933639526, "memory(GiB)": 21.48, "step": 8483, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.27560666601695744, "grad_norm": 0.6325885057449341, "learning_rate": 8.672054617550689e-06, "loss": 0.04566110670566559, "memory(GiB)": 21.48, "step": 8484, "token_acc": 0.9826989619377162, "train_speed(iter/s)": 0.958405 }, { "epoch": 0.27563915147971285, "grad_norm": 0.5264137983322144, "learning_rate": 8.671690027931824e-06, "loss": 0.031102091073989868, "memory(GiB)": 21.48, "step": 8485, "token_acc": 1.0, "train_speed(iter/s)": 0.958429 }, { "epoch": 0.27567163694246827, "grad_norm": 0.40998759865760803, "learning_rate": 8.671325395936772e-06, "loss": 0.02804369106888771, "memory(GiB)": 21.48, "step": 8486, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.2757041224052237, "grad_norm": 0.45024052262306213, "learning_rate": 8.670960721569737e-06, "loss": 0.02803245186805725, "memory(GiB)": 21.48, "step": 8487, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.2757366078679791, "grad_norm": 0.3321753740310669, "learning_rate": 8.670596004834929e-06, "loss": 0.02779718115925789, "memory(GiB)": 21.48, "step": 8488, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958489 }, { "epoch": 0.2757690933307345, "grad_norm": 0.4228346645832062, "learning_rate": 8.670231245736557e-06, "loss": 0.03226758539676666, "memory(GiB)": 21.48, "step": 8489, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.27580157879348993, "grad_norm": 0.587091326713562, "learning_rate": 8.669866444278832e-06, "loss": 0.03664275258779526, "memory(GiB)": 21.48, "step": 8490, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958529 }, { "epoch": 0.27583406425624535, "grad_norm": 0.5018993020057678, "learning_rate": 8.669501600465964e-06, "loss": 0.02661984972655773, "memory(GiB)": 21.48, "step": 8491, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958544 }, { "epoch": 0.27586654971900076, "grad_norm": 0.6736940145492554, "learning_rate": 8.669136714302163e-06, "loss": 0.033289823681116104, "memory(GiB)": 21.48, "step": 8492, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958562 }, { "epoch": 0.2758990351817562, "grad_norm": 0.37480857968330383, "learning_rate": 8.668771785791642e-06, "loss": 0.03141680359840393, "memory(GiB)": 21.48, "step": 8493, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.958578 }, { "epoch": 0.2759315206445116, "grad_norm": 0.4187876284122467, "learning_rate": 8.668406814938611e-06, "loss": 0.030601365491747856, "memory(GiB)": 21.48, "step": 8494, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.958593 }, { "epoch": 0.275964006107267, "grad_norm": 0.508804976940155, "learning_rate": 8.66804180174728e-06, "loss": 0.032204799354076385, "memory(GiB)": 21.48, "step": 8495, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.27599649157002243, "grad_norm": 0.5484666228294373, "learning_rate": 8.667676746221867e-06, "loss": 0.02898591384291649, "memory(GiB)": 21.48, "step": 8496, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.958625 }, { "epoch": 0.27602897703277784, "grad_norm": 0.5003629326820374, "learning_rate": 8.667311648366583e-06, "loss": 0.027378253638744354, "memory(GiB)": 21.48, "step": 8497, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958641 }, { "epoch": 0.27606146249553326, "grad_norm": 0.3871864080429077, "learning_rate": 8.66694650818564e-06, "loss": 0.03124573454260826, "memory(GiB)": 21.48, "step": 8498, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.958655 }, { "epoch": 0.2760939479582887, "grad_norm": 0.4100535809993744, "learning_rate": 8.666581325683254e-06, "loss": 0.02703130804002285, "memory(GiB)": 21.48, "step": 8499, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95867 }, { "epoch": 0.2761264334210441, "grad_norm": 0.3901115953922272, "learning_rate": 8.66621610086364e-06, "loss": 0.023558247834444046, "memory(GiB)": 21.48, "step": 8500, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958686 }, { "epoch": 0.2761264334210441, "eval_loss": 0.03576480969786644, "eval_runtime": 81.2388, "eval_samples_per_second": 122.478, "eval_steps_per_second": 3.828, "eval_token_acc": 0.9859829042228045, "step": 8500 }, { "epoch": 0.2761589188837995, "grad_norm": 0.6050278544425964, "learning_rate": 8.665850833731012e-06, "loss": 0.045709021389484406, "memory(GiB)": 21.48, "step": 8501, "token_acc": 0.9862201219094583, "train_speed(iter/s)": 0.948918 }, { "epoch": 0.2761914043465549, "grad_norm": 0.5335977673530579, "learning_rate": 8.665485524289587e-06, "loss": 0.03334537893533707, "memory(GiB)": 21.48, "step": 8502, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.948937 }, { "epoch": 0.27622388980931034, "grad_norm": 0.7654301524162292, "learning_rate": 8.665120172543578e-06, "loss": 0.032795146107673645, "memory(GiB)": 21.48, "step": 8503, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.948956 }, { "epoch": 0.27625637527206576, "grad_norm": 0.418923020362854, "learning_rate": 8.664754778497207e-06, "loss": 0.034637875854969025, "memory(GiB)": 21.48, "step": 8504, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.948975 }, { "epoch": 0.2762888607348212, "grad_norm": 0.4958287477493286, "learning_rate": 8.664389342154686e-06, "loss": 0.037508945912122726, "memory(GiB)": 21.48, "step": 8505, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.948993 }, { "epoch": 0.2763213461975766, "grad_norm": 0.8309184908866882, "learning_rate": 8.664023863520235e-06, "loss": 0.03488478064537048, "memory(GiB)": 21.48, "step": 8506, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.949014 }, { "epoch": 0.276353831660332, "grad_norm": 2.1876230239868164, "learning_rate": 8.663658342598071e-06, "loss": 0.02725396305322647, "memory(GiB)": 21.48, "step": 8507, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.949034 }, { "epoch": 0.2763863171230874, "grad_norm": 0.44042205810546875, "learning_rate": 8.663292779392417e-06, "loss": 0.027753407135605812, "memory(GiB)": 21.48, "step": 8508, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.949053 }, { "epoch": 0.27641880258584284, "grad_norm": 2.021094799041748, "learning_rate": 8.662927173907486e-06, "loss": 0.03075970895588398, "memory(GiB)": 21.48, "step": 8509, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.949073 }, { "epoch": 0.27645128804859825, "grad_norm": 0.5449264049530029, "learning_rate": 8.662561526147498e-06, "loss": 0.03364694491028786, "memory(GiB)": 21.48, "step": 8510, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.949093 }, { "epoch": 0.27648377351135367, "grad_norm": 0.42445799708366394, "learning_rate": 8.662195836116677e-06, "loss": 0.033803604543209076, "memory(GiB)": 21.48, "step": 8511, "token_acc": 1.0, "train_speed(iter/s)": 0.949112 }, { "epoch": 0.2765162589741091, "grad_norm": 0.4166962504386902, "learning_rate": 8.661830103819242e-06, "loss": 0.022508468478918076, "memory(GiB)": 21.48, "step": 8512, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.949133 }, { "epoch": 0.2765487444368645, "grad_norm": 0.5470890998840332, "learning_rate": 8.661464329259413e-06, "loss": 0.042504217475652695, "memory(GiB)": 21.48, "step": 8513, "token_acc": 0.98828125, "train_speed(iter/s)": 0.94915 }, { "epoch": 0.2765812298996199, "grad_norm": 1.244696855545044, "learning_rate": 8.661098512441413e-06, "loss": 0.041008464992046356, "memory(GiB)": 21.48, "step": 8514, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.949169 }, { "epoch": 0.27661371536237533, "grad_norm": 1.4010473489761353, "learning_rate": 8.660732653369463e-06, "loss": 0.02698422595858574, "memory(GiB)": 21.48, "step": 8515, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.949186 }, { "epoch": 0.27664620082513075, "grad_norm": 0.6265254020690918, "learning_rate": 8.660366752047787e-06, "loss": 0.03653493896126747, "memory(GiB)": 21.48, "step": 8516, "token_acc": 0.98828125, "train_speed(iter/s)": 0.949205 }, { "epoch": 0.27667868628788617, "grad_norm": 0.46690380573272705, "learning_rate": 8.660000808480604e-06, "loss": 0.04243723303079605, "memory(GiB)": 21.48, "step": 8517, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.949224 }, { "epoch": 0.2767111717506416, "grad_norm": 0.4972822368144989, "learning_rate": 8.659634822672143e-06, "loss": 0.03584393858909607, "memory(GiB)": 21.48, "step": 8518, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.949244 }, { "epoch": 0.276743657213397, "grad_norm": 1.6096194982528687, "learning_rate": 8.659268794626623e-06, "loss": 0.046181049197912216, "memory(GiB)": 21.48, "step": 8519, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.949263 }, { "epoch": 0.2767761426761524, "grad_norm": 0.45814815163612366, "learning_rate": 8.658902724348272e-06, "loss": 0.03216899186372757, "memory(GiB)": 21.48, "step": 8520, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.949282 }, { "epoch": 0.27680862813890783, "grad_norm": 0.40247786045074463, "learning_rate": 8.658536611841312e-06, "loss": 0.03395562991499901, "memory(GiB)": 21.48, "step": 8521, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.9493 }, { "epoch": 0.27684111360166325, "grad_norm": 0.5050835609436035, "learning_rate": 8.65817045710997e-06, "loss": 0.044564612209796906, "memory(GiB)": 21.48, "step": 8522, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.949314 }, { "epoch": 0.27687359906441866, "grad_norm": 0.496151864528656, "learning_rate": 8.657804260158475e-06, "loss": 0.03306710347533226, "memory(GiB)": 21.48, "step": 8523, "token_acc": 0.9760765550239234, "train_speed(iter/s)": 0.949332 }, { "epoch": 0.2769060845271741, "grad_norm": 0.34973254799842834, "learning_rate": 8.657438020991049e-06, "loss": 0.03393925726413727, "memory(GiB)": 21.48, "step": 8524, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.94935 }, { "epoch": 0.2769385699899295, "grad_norm": 0.39230862259864807, "learning_rate": 8.657071739611918e-06, "loss": 0.030547330155968666, "memory(GiB)": 21.48, "step": 8525, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.94937 }, { "epoch": 0.2769710554526849, "grad_norm": 0.4875562787055969, "learning_rate": 8.656705416025314e-06, "loss": 0.03216845542192459, "memory(GiB)": 21.48, "step": 8526, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.949391 }, { "epoch": 0.2770035409154403, "grad_norm": 0.41200119256973267, "learning_rate": 8.656339050235461e-06, "loss": 0.03055868297815323, "memory(GiB)": 21.48, "step": 8527, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.94941 }, { "epoch": 0.27703602637819574, "grad_norm": 0.45196059346199036, "learning_rate": 8.65597264224659e-06, "loss": 0.03185839205980301, "memory(GiB)": 21.48, "step": 8528, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.949431 }, { "epoch": 0.27706851184095116, "grad_norm": 0.4095093905925751, "learning_rate": 8.655606192062927e-06, "loss": 0.02388041839003563, "memory(GiB)": 21.48, "step": 8529, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.949423 }, { "epoch": 0.2771009973037066, "grad_norm": 0.45172053575515747, "learning_rate": 8.655239699688705e-06, "loss": 0.034722574055194855, "memory(GiB)": 21.48, "step": 8530, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.949447 }, { "epoch": 0.277133482766462, "grad_norm": 1.251772403717041, "learning_rate": 8.65487316512815e-06, "loss": 0.039920173585414886, "memory(GiB)": 21.48, "step": 8531, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.949472 }, { "epoch": 0.2771659682292174, "grad_norm": 0.5267596244812012, "learning_rate": 8.654506588385495e-06, "loss": 0.048733677715063095, "memory(GiB)": 21.48, "step": 8532, "token_acc": 0.9651741293532339, "train_speed(iter/s)": 0.949489 }, { "epoch": 0.2771984536919728, "grad_norm": 0.4264078140258789, "learning_rate": 8.654139969464969e-06, "loss": 0.04001152142882347, "memory(GiB)": 21.48, "step": 8533, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.949507 }, { "epoch": 0.27723093915472824, "grad_norm": 0.4461211860179901, "learning_rate": 8.653773308370806e-06, "loss": 0.03226714953780174, "memory(GiB)": 21.48, "step": 8534, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.949524 }, { "epoch": 0.27726342461748366, "grad_norm": 0.41645801067352295, "learning_rate": 8.653406605107233e-06, "loss": 0.031569916754961014, "memory(GiB)": 21.48, "step": 8535, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.949544 }, { "epoch": 0.27729591008023907, "grad_norm": 0.4937187433242798, "learning_rate": 8.653039859678487e-06, "loss": 0.03761156275868416, "memory(GiB)": 21.48, "step": 8536, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.949562 }, { "epoch": 0.2773283955429945, "grad_norm": 0.532342255115509, "learning_rate": 8.652673072088798e-06, "loss": 0.043380267918109894, "memory(GiB)": 21.48, "step": 8537, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.94958 }, { "epoch": 0.2773608810057499, "grad_norm": 0.4023451805114746, "learning_rate": 8.652306242342401e-06, "loss": 0.027814825996756554, "memory(GiB)": 21.48, "step": 8538, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.949601 }, { "epoch": 0.2773933664685053, "grad_norm": 0.48895692825317383, "learning_rate": 8.651939370443528e-06, "loss": 0.03353605419397354, "memory(GiB)": 21.48, "step": 8539, "token_acc": 0.9792387543252595, "train_speed(iter/s)": 0.94962 }, { "epoch": 0.27742585193126074, "grad_norm": 0.36590540409088135, "learning_rate": 8.651572456396416e-06, "loss": 0.0292351096868515, "memory(GiB)": 21.48, "step": 8540, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.949636 }, { "epoch": 0.27745833739401615, "grad_norm": 0.38853928446769714, "learning_rate": 8.651205500205295e-06, "loss": 0.032000720500946045, "memory(GiB)": 21.48, "step": 8541, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.949654 }, { "epoch": 0.27749082285677157, "grad_norm": 0.5116375684738159, "learning_rate": 8.650838501874403e-06, "loss": 0.033991847187280655, "memory(GiB)": 21.48, "step": 8542, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.949674 }, { "epoch": 0.277523308319527, "grad_norm": 0.3790701925754547, "learning_rate": 8.650471461407976e-06, "loss": 0.03543585538864136, "memory(GiB)": 21.48, "step": 8543, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.949692 }, { "epoch": 0.27755579378228246, "grad_norm": 0.6347402930259705, "learning_rate": 8.650104378810249e-06, "loss": 0.04075338691473007, "memory(GiB)": 21.48, "step": 8544, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.949712 }, { "epoch": 0.27758827924503787, "grad_norm": 0.4742181599140167, "learning_rate": 8.64973725408546e-06, "loss": 0.036466263234615326, "memory(GiB)": 21.48, "step": 8545, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.949731 }, { "epoch": 0.2776207647077933, "grad_norm": 0.48617109656333923, "learning_rate": 8.649370087237844e-06, "loss": 0.02709241211414337, "memory(GiB)": 21.48, "step": 8546, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.94975 }, { "epoch": 0.2776532501705487, "grad_norm": 0.4590671956539154, "learning_rate": 8.649002878271641e-06, "loss": 0.03735586255788803, "memory(GiB)": 21.48, "step": 8547, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.94977 }, { "epoch": 0.2776857356333041, "grad_norm": 0.6094958782196045, "learning_rate": 8.648635627191088e-06, "loss": 0.04089890792965889, "memory(GiB)": 21.48, "step": 8548, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.949789 }, { "epoch": 0.27771822109605954, "grad_norm": 0.5191473364830017, "learning_rate": 8.648268334000425e-06, "loss": 0.03554003685712814, "memory(GiB)": 21.48, "step": 8549, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.949806 }, { "epoch": 0.27775070655881495, "grad_norm": 0.3795244097709656, "learning_rate": 8.647900998703887e-06, "loss": 0.031016945838928223, "memory(GiB)": 21.48, "step": 8550, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.949825 }, { "epoch": 0.27778319202157037, "grad_norm": 0.4557062089443207, "learning_rate": 8.647533621305718e-06, "loss": 0.027641333639621735, "memory(GiB)": 21.48, "step": 8551, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.949845 }, { "epoch": 0.2778156774843258, "grad_norm": 0.5416308045387268, "learning_rate": 8.647166201810156e-06, "loss": 0.037514496594667435, "memory(GiB)": 21.48, "step": 8552, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.949868 }, { "epoch": 0.2778481629470812, "grad_norm": 0.45576924085617065, "learning_rate": 8.64679874022144e-06, "loss": 0.033780649304389954, "memory(GiB)": 21.48, "step": 8553, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.949888 }, { "epoch": 0.2778806484098366, "grad_norm": 0.4958270192146301, "learning_rate": 8.646431236543814e-06, "loss": 0.031091082841157913, "memory(GiB)": 21.48, "step": 8554, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.949906 }, { "epoch": 0.27791313387259203, "grad_norm": 0.4542591869831085, "learning_rate": 8.646063690781519e-06, "loss": 0.03672805055975914, "memory(GiB)": 21.48, "step": 8555, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.949927 }, { "epoch": 0.27794561933534745, "grad_norm": 0.46866053342819214, "learning_rate": 8.645696102938794e-06, "loss": 0.033567897975444794, "memory(GiB)": 21.48, "step": 8556, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.949947 }, { "epoch": 0.27797810479810287, "grad_norm": 0.4704323709011078, "learning_rate": 8.645328473019884e-06, "loss": 0.038777682930231094, "memory(GiB)": 21.48, "step": 8557, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.949965 }, { "epoch": 0.2780105902608583, "grad_norm": 0.35445713996887207, "learning_rate": 8.644960801029033e-06, "loss": 0.027617909014225006, "memory(GiB)": 21.48, "step": 8558, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.949984 }, { "epoch": 0.2780430757236137, "grad_norm": 0.4422157108783722, "learning_rate": 8.644593086970481e-06, "loss": 0.028203897178173065, "memory(GiB)": 21.48, "step": 8559, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.950002 }, { "epoch": 0.2780755611863691, "grad_norm": 0.3635478615760803, "learning_rate": 8.644225330848475e-06, "loss": 0.02630573697388172, "memory(GiB)": 21.48, "step": 8560, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.95002 }, { "epoch": 0.27810804664912453, "grad_norm": 0.5885924696922302, "learning_rate": 8.643857532667258e-06, "loss": 0.03225596249103546, "memory(GiB)": 21.48, "step": 8561, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.950038 }, { "epoch": 0.27814053211187995, "grad_norm": 0.297297865152359, "learning_rate": 8.643489692431075e-06, "loss": 0.02492894046008587, "memory(GiB)": 21.48, "step": 8562, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.950057 }, { "epoch": 0.27817301757463536, "grad_norm": 0.5090445876121521, "learning_rate": 8.643121810144172e-06, "loss": 0.03516952320933342, "memory(GiB)": 21.48, "step": 8563, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.950076 }, { "epoch": 0.2782055030373908, "grad_norm": 0.6236875057220459, "learning_rate": 8.642753885810793e-06, "loss": 0.03995789587497711, "memory(GiB)": 21.48, "step": 8564, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.950096 }, { "epoch": 0.2782379885001462, "grad_norm": 0.45513492822647095, "learning_rate": 8.642385919435187e-06, "loss": 0.02726723998785019, "memory(GiB)": 21.48, "step": 8565, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.950115 }, { "epoch": 0.2782704739629016, "grad_norm": 0.487630158662796, "learning_rate": 8.642017911021598e-06, "loss": 0.030800526961684227, "memory(GiB)": 21.48, "step": 8566, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.950132 }, { "epoch": 0.278302959425657, "grad_norm": 0.517490804195404, "learning_rate": 8.641649860574278e-06, "loss": 0.03361702710390091, "memory(GiB)": 21.48, "step": 8567, "token_acc": 1.0, "train_speed(iter/s)": 0.950137 }, { "epoch": 0.27833544488841244, "grad_norm": 0.416483610868454, "learning_rate": 8.641281768097469e-06, "loss": 0.035388246178627014, "memory(GiB)": 21.48, "step": 8568, "token_acc": 0.9899665551839465, "train_speed(iter/s)": 0.950154 }, { "epoch": 0.27836793035116786, "grad_norm": 0.39933446049690247, "learning_rate": 8.640913633595423e-06, "loss": 0.03477215766906738, "memory(GiB)": 21.48, "step": 8569, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.950171 }, { "epoch": 0.2784004158139233, "grad_norm": 0.5391074419021606, "learning_rate": 8.640545457072386e-06, "loss": 0.04186960309743881, "memory(GiB)": 21.48, "step": 8570, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.950187 }, { "epoch": 0.2784329012766787, "grad_norm": 0.578141987323761, "learning_rate": 8.64017723853261e-06, "loss": 0.03625565022230148, "memory(GiB)": 21.48, "step": 8571, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.950204 }, { "epoch": 0.2784653867394341, "grad_norm": 0.5866992473602295, "learning_rate": 8.639808977980343e-06, "loss": 0.0379626527428627, "memory(GiB)": 21.48, "step": 8572, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.950221 }, { "epoch": 0.2784978722021895, "grad_norm": 2.2388951778411865, "learning_rate": 8.639440675419835e-06, "loss": 0.03954555094242096, "memory(GiB)": 21.48, "step": 8573, "token_acc": 0.9755244755244755, "train_speed(iter/s)": 0.950241 }, { "epoch": 0.27853035766494494, "grad_norm": 0.5737051963806152, "learning_rate": 8.63907233085534e-06, "loss": 0.03857787325978279, "memory(GiB)": 21.48, "step": 8574, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.95026 }, { "epoch": 0.27856284312770035, "grad_norm": 0.4572925567626953, "learning_rate": 8.638703944291104e-06, "loss": 0.03364161029458046, "memory(GiB)": 21.48, "step": 8575, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.950279 }, { "epoch": 0.27859532859045577, "grad_norm": 0.5373750329017639, "learning_rate": 8.638335515731384e-06, "loss": 0.04272498935461044, "memory(GiB)": 21.48, "step": 8576, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.950297 }, { "epoch": 0.2786278140532112, "grad_norm": 0.32606902718544006, "learning_rate": 8.637967045180428e-06, "loss": 0.02937568537890911, "memory(GiB)": 21.48, "step": 8577, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.950313 }, { "epoch": 0.2786602995159666, "grad_norm": 0.5990333557128906, "learning_rate": 8.637598532642491e-06, "loss": 0.03307613730430603, "memory(GiB)": 21.48, "step": 8578, "token_acc": 1.0, "train_speed(iter/s)": 0.95033 }, { "epoch": 0.278692784978722, "grad_norm": 0.7925398945808411, "learning_rate": 8.637229978121823e-06, "loss": 0.03005625680088997, "memory(GiB)": 21.48, "step": 8579, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.950348 }, { "epoch": 0.27872527044147744, "grad_norm": 0.4505665898323059, "learning_rate": 8.636861381622683e-06, "loss": 0.03677123412489891, "memory(GiB)": 21.48, "step": 8580, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.950366 }, { "epoch": 0.27875775590423285, "grad_norm": 0.3899003863334656, "learning_rate": 8.63649274314932e-06, "loss": 0.027497557923197746, "memory(GiB)": 21.48, "step": 8581, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.950386 }, { "epoch": 0.27879024136698827, "grad_norm": 0.4265192747116089, "learning_rate": 8.636124062705992e-06, "loss": 0.02627432346343994, "memory(GiB)": 21.48, "step": 8582, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.950405 }, { "epoch": 0.2788227268297437, "grad_norm": 0.9762408137321472, "learning_rate": 8.635755340296952e-06, "loss": 0.0390160009264946, "memory(GiB)": 21.48, "step": 8583, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.950422 }, { "epoch": 0.2788552122924991, "grad_norm": 0.5892001986503601, "learning_rate": 8.635386575926456e-06, "loss": 0.02952992171049118, "memory(GiB)": 21.48, "step": 8584, "token_acc": 0.996, "train_speed(iter/s)": 0.950441 }, { "epoch": 0.2788876977552545, "grad_norm": 0.47992798686027527, "learning_rate": 8.635017769598762e-06, "loss": 0.04171031340956688, "memory(GiB)": 21.48, "step": 8585, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.950457 }, { "epoch": 0.27892018321800993, "grad_norm": 0.4338582158088684, "learning_rate": 8.634648921318121e-06, "loss": 0.03327729180455208, "memory(GiB)": 21.48, "step": 8586, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.950475 }, { "epoch": 0.27895266868076535, "grad_norm": 0.5202420949935913, "learning_rate": 8.634280031088796e-06, "loss": 0.03441943600773811, "memory(GiB)": 21.48, "step": 8587, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950486 }, { "epoch": 0.27898515414352076, "grad_norm": 0.565146803855896, "learning_rate": 8.633911098915043e-06, "loss": 0.04222735017538071, "memory(GiB)": 21.48, "step": 8588, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.950502 }, { "epoch": 0.2790176396062762, "grad_norm": 0.5061657428741455, "learning_rate": 8.633542124801117e-06, "loss": 0.03699749335646629, "memory(GiB)": 21.48, "step": 8589, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.950522 }, { "epoch": 0.2790501250690316, "grad_norm": 0.4446676969528198, "learning_rate": 8.63317310875128e-06, "loss": 0.03082157112658024, "memory(GiB)": 21.48, "step": 8590, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950541 }, { "epoch": 0.279082610531787, "grad_norm": 0.35096195340156555, "learning_rate": 8.63280405076979e-06, "loss": 0.03417517989873886, "memory(GiB)": 21.48, "step": 8591, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.950563 }, { "epoch": 0.27911509599454243, "grad_norm": 0.45347511768341064, "learning_rate": 8.632434950860906e-06, "loss": 0.026020359247922897, "memory(GiB)": 21.48, "step": 8592, "token_acc": 0.973384030418251, "train_speed(iter/s)": 0.950588 }, { "epoch": 0.27914758145729784, "grad_norm": 0.4634868800640106, "learning_rate": 8.632065809028887e-06, "loss": 0.029446987435221672, "memory(GiB)": 21.48, "step": 8593, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95061 }, { "epoch": 0.27918006692005326, "grad_norm": 0.49335625767707825, "learning_rate": 8.631696625277995e-06, "loss": 0.03483375161886215, "memory(GiB)": 21.48, "step": 8594, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.950634 }, { "epoch": 0.2792125523828087, "grad_norm": 0.44730037450790405, "learning_rate": 8.631327399612491e-06, "loss": 0.035785943269729614, "memory(GiB)": 21.48, "step": 8595, "token_acc": 0.975177304964539, "train_speed(iter/s)": 0.950658 }, { "epoch": 0.2792450378455641, "grad_norm": 0.46930164098739624, "learning_rate": 8.630958132036634e-06, "loss": 0.03133770823478699, "memory(GiB)": 21.48, "step": 8596, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.950681 }, { "epoch": 0.2792775233083195, "grad_norm": 0.4075171947479248, "learning_rate": 8.630588822554687e-06, "loss": 0.027263682335615158, "memory(GiB)": 21.48, "step": 8597, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.950705 }, { "epoch": 0.2793100087710749, "grad_norm": 0.4648670256137848, "learning_rate": 8.630219471170913e-06, "loss": 0.03333365172147751, "memory(GiB)": 21.48, "step": 8598, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.950729 }, { "epoch": 0.27934249423383034, "grad_norm": 0.35412463545799255, "learning_rate": 8.629850077889574e-06, "loss": 0.029418453574180603, "memory(GiB)": 21.48, "step": 8599, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.950752 }, { "epoch": 0.27937497969658576, "grad_norm": 0.6858428716659546, "learning_rate": 8.629480642714936e-06, "loss": 0.04053754732012749, "memory(GiB)": 21.48, "step": 8600, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950775 }, { "epoch": 0.2794074651593412, "grad_norm": 0.5244807004928589, "learning_rate": 8.629111165651259e-06, "loss": 0.028245892375707626, "memory(GiB)": 21.48, "step": 8601, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.950798 }, { "epoch": 0.2794399506220966, "grad_norm": 0.514093816280365, "learning_rate": 8.62874164670281e-06, "loss": 0.03729429095983505, "memory(GiB)": 21.48, "step": 8602, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.950821 }, { "epoch": 0.279472436084852, "grad_norm": 0.45766302943229675, "learning_rate": 8.62837208587385e-06, "loss": 0.03775613754987717, "memory(GiB)": 21.48, "step": 8603, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.950844 }, { "epoch": 0.2795049215476074, "grad_norm": 0.5225456953048706, "learning_rate": 8.628002483168649e-06, "loss": 0.04517175257205963, "memory(GiB)": 21.48, "step": 8604, "token_acc": 0.9740740740740741, "train_speed(iter/s)": 0.950868 }, { "epoch": 0.27953740701036284, "grad_norm": 0.4679166078567505, "learning_rate": 8.62763283859147e-06, "loss": 0.030126754194498062, "memory(GiB)": 21.48, "step": 8605, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.950891 }, { "epoch": 0.27956989247311825, "grad_norm": 0.6308724880218506, "learning_rate": 8.62726315214658e-06, "loss": 0.04059293866157532, "memory(GiB)": 21.48, "step": 8606, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.950915 }, { "epoch": 0.27960237793587367, "grad_norm": 0.5818367600440979, "learning_rate": 8.626893423838246e-06, "loss": 0.034128665924072266, "memory(GiB)": 21.48, "step": 8607, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.950939 }, { "epoch": 0.27963486339862914, "grad_norm": 0.3357200622558594, "learning_rate": 8.626523653670732e-06, "loss": 0.02458556368947029, "memory(GiB)": 21.48, "step": 8608, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.950963 }, { "epoch": 0.27966734886138456, "grad_norm": 0.3921760320663452, "learning_rate": 8.62615384164831e-06, "loss": 0.025195833295583725, "memory(GiB)": 21.48, "step": 8609, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.950987 }, { "epoch": 0.27969983432414, "grad_norm": 0.5062513947486877, "learning_rate": 8.625783987775245e-06, "loss": 0.030100373551249504, "memory(GiB)": 21.48, "step": 8610, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.951011 }, { "epoch": 0.2797323197868954, "grad_norm": 0.37561357021331787, "learning_rate": 8.625414092055809e-06, "loss": 0.02730712667107582, "memory(GiB)": 21.48, "step": 8611, "token_acc": 1.0, "train_speed(iter/s)": 0.951035 }, { "epoch": 0.2797648052496508, "grad_norm": 0.3503773510456085, "learning_rate": 8.625044154494266e-06, "loss": 0.03334186226129532, "memory(GiB)": 21.48, "step": 8612, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.951056 }, { "epoch": 0.2797972907124062, "grad_norm": 0.5246814489364624, "learning_rate": 8.624674175094891e-06, "loss": 0.035321541130542755, "memory(GiB)": 21.48, "step": 8613, "token_acc": 0.9609375, "train_speed(iter/s)": 0.951076 }, { "epoch": 0.27982977617516164, "grad_norm": 0.4378589689731598, "learning_rate": 8.624304153861952e-06, "loss": 0.026389922946691513, "memory(GiB)": 21.48, "step": 8614, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.951096 }, { "epoch": 0.27986226163791705, "grad_norm": 0.40313026309013367, "learning_rate": 8.623934090799716e-06, "loss": 0.04069999232888222, "memory(GiB)": 21.48, "step": 8615, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.951114 }, { "epoch": 0.27989474710067247, "grad_norm": 0.6959226727485657, "learning_rate": 8.62356398591246e-06, "loss": 0.03477301448583603, "memory(GiB)": 21.48, "step": 8616, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.951132 }, { "epoch": 0.2799272325634279, "grad_norm": 0.30959033966064453, "learning_rate": 8.623193839204452e-06, "loss": 0.025159412994980812, "memory(GiB)": 21.48, "step": 8617, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.951151 }, { "epoch": 0.2799597180261833, "grad_norm": 0.5229185819625854, "learning_rate": 8.622823650679964e-06, "loss": 0.030352668836712837, "memory(GiB)": 21.48, "step": 8618, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.951169 }, { "epoch": 0.2799922034889387, "grad_norm": 1.9890522956848145, "learning_rate": 8.622453420343269e-06, "loss": 0.031450945883989334, "memory(GiB)": 21.48, "step": 8619, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.951189 }, { "epoch": 0.28002468895169413, "grad_norm": 0.3402487337589264, "learning_rate": 8.622083148198641e-06, "loss": 0.024986207485198975, "memory(GiB)": 21.48, "step": 8620, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.951208 }, { "epoch": 0.28005717441444955, "grad_norm": 1.2510578632354736, "learning_rate": 8.621712834250351e-06, "loss": 0.035471249371767044, "memory(GiB)": 21.48, "step": 8621, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951229 }, { "epoch": 0.28008965987720497, "grad_norm": 0.5572176575660706, "learning_rate": 8.621342478502676e-06, "loss": 0.03614775091409683, "memory(GiB)": 21.48, "step": 8622, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951247 }, { "epoch": 0.2801221453399604, "grad_norm": 0.5013747215270996, "learning_rate": 8.620972080959888e-06, "loss": 0.03197042644023895, "memory(GiB)": 21.48, "step": 8623, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951267 }, { "epoch": 0.2801546308027158, "grad_norm": 0.6844398975372314, "learning_rate": 8.620601641626263e-06, "loss": 0.028235357254743576, "memory(GiB)": 21.48, "step": 8624, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.951285 }, { "epoch": 0.2801871162654712, "grad_norm": 0.380121648311615, "learning_rate": 8.620231160506076e-06, "loss": 0.026787325739860535, "memory(GiB)": 21.48, "step": 8625, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.951304 }, { "epoch": 0.28021960172822663, "grad_norm": 0.4215930700302124, "learning_rate": 8.619860637603603e-06, "loss": 0.03444898501038551, "memory(GiB)": 21.48, "step": 8626, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.951324 }, { "epoch": 0.28025208719098205, "grad_norm": 0.36369141936302185, "learning_rate": 8.61949007292312e-06, "loss": 0.027015885338187218, "memory(GiB)": 21.48, "step": 8627, "token_acc": 0.9819494584837545, "train_speed(iter/s)": 0.951343 }, { "epoch": 0.28028457265373746, "grad_norm": 0.4000615179538727, "learning_rate": 8.619119466468903e-06, "loss": 0.029820913448929787, "memory(GiB)": 21.48, "step": 8628, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.951363 }, { "epoch": 0.2803170581164929, "grad_norm": 0.4020293354988098, "learning_rate": 8.618748818245232e-06, "loss": 0.03621508926153183, "memory(GiB)": 21.48, "step": 8629, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.951382 }, { "epoch": 0.2803495435792483, "grad_norm": 0.8799543976783752, "learning_rate": 8.618378128256382e-06, "loss": 0.03839147090911865, "memory(GiB)": 21.48, "step": 8630, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.951401 }, { "epoch": 0.2803820290420037, "grad_norm": 0.4031517803668976, "learning_rate": 8.618007396506633e-06, "loss": 0.03151412680745125, "memory(GiB)": 21.48, "step": 8631, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.951417 }, { "epoch": 0.2804145145047591, "grad_norm": 0.3368707597255707, "learning_rate": 8.617636623000263e-06, "loss": 0.03103707544505596, "memory(GiB)": 21.48, "step": 8632, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.951435 }, { "epoch": 0.28044699996751454, "grad_norm": 0.34018605947494507, "learning_rate": 8.617265807741553e-06, "loss": 0.028438501060009003, "memory(GiB)": 21.48, "step": 8633, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.951453 }, { "epoch": 0.28047948543026996, "grad_norm": 0.546777606010437, "learning_rate": 8.616894950734778e-06, "loss": 0.039005622267723083, "memory(GiB)": 21.48, "step": 8634, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.951471 }, { "epoch": 0.2805119708930254, "grad_norm": 0.42587289214134216, "learning_rate": 8.616524051984223e-06, "loss": 0.031291067600250244, "memory(GiB)": 21.48, "step": 8635, "token_acc": 0.996, "train_speed(iter/s)": 0.951489 }, { "epoch": 0.2805444563557808, "grad_norm": 0.4401664435863495, "learning_rate": 8.616153111494166e-06, "loss": 0.029970619827508926, "memory(GiB)": 21.48, "step": 8636, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.951508 }, { "epoch": 0.2805769418185362, "grad_norm": 0.4068506360054016, "learning_rate": 8.615782129268889e-06, "loss": 0.030314743518829346, "memory(GiB)": 21.48, "step": 8637, "token_acc": 0.98828125, "train_speed(iter/s)": 0.951527 }, { "epoch": 0.2806094272812916, "grad_norm": 0.6019281148910522, "learning_rate": 8.615411105312674e-06, "loss": 0.03799505531787872, "memory(GiB)": 21.48, "step": 8638, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.951545 }, { "epoch": 0.28064191274404704, "grad_norm": 0.4012906551361084, "learning_rate": 8.615040039629803e-06, "loss": 0.028478924185037613, "memory(GiB)": 21.48, "step": 8639, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.951563 }, { "epoch": 0.28067439820680246, "grad_norm": 0.32888707518577576, "learning_rate": 8.614668932224556e-06, "loss": 0.026653500273823738, "memory(GiB)": 21.48, "step": 8640, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.951581 }, { "epoch": 0.28070688366955787, "grad_norm": 0.49142035841941833, "learning_rate": 8.614297783101221e-06, "loss": 0.029031557962298393, "memory(GiB)": 21.48, "step": 8641, "token_acc": 0.9725274725274725, "train_speed(iter/s)": 0.951598 }, { "epoch": 0.2807393691323133, "grad_norm": 0.43096232414245605, "learning_rate": 8.613926592264078e-06, "loss": 0.031964123249053955, "memory(GiB)": 21.48, "step": 8642, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.951615 }, { "epoch": 0.2807718545950687, "grad_norm": 0.5924274325370789, "learning_rate": 8.613555359717414e-06, "loss": 0.02945377491414547, "memory(GiB)": 21.48, "step": 8643, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.951632 }, { "epoch": 0.2808043400578241, "grad_norm": 0.4797709584236145, "learning_rate": 8.61318408546551e-06, "loss": 0.034860700368881226, "memory(GiB)": 21.48, "step": 8644, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95165 }, { "epoch": 0.28083682552057954, "grad_norm": 0.39604178071022034, "learning_rate": 8.612812769512652e-06, "loss": 0.026002926751971245, "memory(GiB)": 21.48, "step": 8645, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95167 }, { "epoch": 0.28086931098333495, "grad_norm": 0.6600325703620911, "learning_rate": 8.612441411863126e-06, "loss": 0.04122118651866913, "memory(GiB)": 21.48, "step": 8646, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.951689 }, { "epoch": 0.28090179644609037, "grad_norm": 0.41209104657173157, "learning_rate": 8.612070012521218e-06, "loss": 0.03465118259191513, "memory(GiB)": 21.48, "step": 8647, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.951706 }, { "epoch": 0.2809342819088458, "grad_norm": 0.5737027525901794, "learning_rate": 8.611698571491216e-06, "loss": 0.03207853436470032, "memory(GiB)": 21.48, "step": 8648, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.951723 }, { "epoch": 0.2809667673716012, "grad_norm": 0.4340173602104187, "learning_rate": 8.611327088777403e-06, "loss": 0.027064403519034386, "memory(GiB)": 21.48, "step": 8649, "token_acc": 1.0, "train_speed(iter/s)": 0.95174 }, { "epoch": 0.2809992528343566, "grad_norm": 0.5860121846199036, "learning_rate": 8.61095556438407e-06, "loss": 0.028778076171875, "memory(GiB)": 21.48, "step": 8650, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.951756 }, { "epoch": 0.28103173829711203, "grad_norm": 0.6896695494651794, "learning_rate": 8.610583998315504e-06, "loss": 0.04667285084724426, "memory(GiB)": 21.48, "step": 8651, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.951772 }, { "epoch": 0.28106422375986745, "grad_norm": 0.37374815344810486, "learning_rate": 8.610212390575993e-06, "loss": 0.02538580819964409, "memory(GiB)": 21.48, "step": 8652, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.951783 }, { "epoch": 0.28109670922262286, "grad_norm": 0.48553386330604553, "learning_rate": 8.609840741169825e-06, "loss": 0.0399913527071476, "memory(GiB)": 21.48, "step": 8653, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.951802 }, { "epoch": 0.2811291946853783, "grad_norm": 0.3585471212863922, "learning_rate": 8.60946905010129e-06, "loss": 0.023741070181131363, "memory(GiB)": 21.48, "step": 8654, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.951821 }, { "epoch": 0.2811616801481337, "grad_norm": 0.42197003960609436, "learning_rate": 8.609097317374679e-06, "loss": 0.028532784432172775, "memory(GiB)": 21.48, "step": 8655, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.951844 }, { "epoch": 0.2811941656108891, "grad_norm": 0.5231341123580933, "learning_rate": 8.60872554299428e-06, "loss": 0.022641755640506744, "memory(GiB)": 21.48, "step": 8656, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.951868 }, { "epoch": 0.28122665107364453, "grad_norm": 0.8126062154769897, "learning_rate": 8.608353726964387e-06, "loss": 0.02798536792397499, "memory(GiB)": 21.48, "step": 8657, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.951891 }, { "epoch": 0.28125913653639995, "grad_norm": 0.4583136737346649, "learning_rate": 8.607981869289286e-06, "loss": 0.032847605645656586, "memory(GiB)": 21.48, "step": 8658, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.951915 }, { "epoch": 0.28129162199915536, "grad_norm": 0.5043861269950867, "learning_rate": 8.607609969973274e-06, "loss": 0.032348260283470154, "memory(GiB)": 21.48, "step": 8659, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.951938 }, { "epoch": 0.2813241074619108, "grad_norm": 0.6757453680038452, "learning_rate": 8.607238029020641e-06, "loss": 0.04127020388841629, "memory(GiB)": 21.48, "step": 8660, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951962 }, { "epoch": 0.2813565929246662, "grad_norm": 0.6516871452331543, "learning_rate": 8.60686604643568e-06, "loss": 0.03301137313246727, "memory(GiB)": 21.48, "step": 8661, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.951985 }, { "epoch": 0.2813890783874216, "grad_norm": 0.39266321063041687, "learning_rate": 8.606494022222685e-06, "loss": 0.031451817601919174, "memory(GiB)": 21.48, "step": 8662, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.952008 }, { "epoch": 0.281421563850177, "grad_norm": 0.44944995641708374, "learning_rate": 8.606121956385949e-06, "loss": 0.03052677772939205, "memory(GiB)": 21.48, "step": 8663, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.952032 }, { "epoch": 0.28145404931293244, "grad_norm": 0.6175422668457031, "learning_rate": 8.605749848929764e-06, "loss": 0.03615826740860939, "memory(GiB)": 21.48, "step": 8664, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.952056 }, { "epoch": 0.28148653477568786, "grad_norm": 0.8165323138237, "learning_rate": 8.605377699858428e-06, "loss": 0.03203049302101135, "memory(GiB)": 21.48, "step": 8665, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.952079 }, { "epoch": 0.2815190202384433, "grad_norm": 0.32024550437927246, "learning_rate": 8.605005509176233e-06, "loss": 0.02459355816245079, "memory(GiB)": 21.48, "step": 8666, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952104 }, { "epoch": 0.2815515057011987, "grad_norm": 1.384798288345337, "learning_rate": 8.604633276887477e-06, "loss": 0.047301094979047775, "memory(GiB)": 21.48, "step": 8667, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.952127 }, { "epoch": 0.2815839911639541, "grad_norm": 0.9801422953605652, "learning_rate": 8.604261002996455e-06, "loss": 0.04144085943698883, "memory(GiB)": 21.48, "step": 8668, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.952152 }, { "epoch": 0.2816164766267095, "grad_norm": 0.4965912997722626, "learning_rate": 8.603888687507464e-06, "loss": 0.02860003150999546, "memory(GiB)": 21.48, "step": 8669, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.952174 }, { "epoch": 0.28164896208946494, "grad_norm": 0.5857892632484436, "learning_rate": 8.603516330424802e-06, "loss": 0.0450117290019989, "memory(GiB)": 21.48, "step": 8670, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.952197 }, { "epoch": 0.28168144755222035, "grad_norm": 0.8250186443328857, "learning_rate": 8.603143931752764e-06, "loss": 0.034765440970659256, "memory(GiB)": 21.48, "step": 8671, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.952217 }, { "epoch": 0.2817139330149758, "grad_norm": 0.34259921312332153, "learning_rate": 8.602771491495649e-06, "loss": 0.02995583787560463, "memory(GiB)": 21.48, "step": 8672, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.952238 }, { "epoch": 0.28174641847773124, "grad_norm": 0.43485429883003235, "learning_rate": 8.602399009657758e-06, "loss": 0.029877811670303345, "memory(GiB)": 21.48, "step": 8673, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.952258 }, { "epoch": 0.28177890394048666, "grad_norm": 0.47701260447502136, "learning_rate": 8.602026486243384e-06, "loss": 0.034388087689876556, "memory(GiB)": 21.48, "step": 8674, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952278 }, { "epoch": 0.2818113894032421, "grad_norm": 0.5690454244613647, "learning_rate": 8.601653921256832e-06, "loss": 0.034492939710617065, "memory(GiB)": 21.48, "step": 8675, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.952297 }, { "epoch": 0.2818438748659975, "grad_norm": 0.397543340921402, "learning_rate": 8.601281314702401e-06, "loss": 0.04164712876081467, "memory(GiB)": 21.48, "step": 8676, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.952314 }, { "epoch": 0.2818763603287529, "grad_norm": 0.7013412714004517, "learning_rate": 8.600908666584388e-06, "loss": 0.04042947292327881, "memory(GiB)": 21.48, "step": 8677, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.952335 }, { "epoch": 0.2819088457915083, "grad_norm": 0.3778172731399536, "learning_rate": 8.600535976907099e-06, "loss": 0.027893275022506714, "memory(GiB)": 21.48, "step": 8678, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.952355 }, { "epoch": 0.28194133125426374, "grad_norm": 0.34964507818222046, "learning_rate": 8.60016324567483e-06, "loss": 0.03288738429546356, "memory(GiB)": 21.48, "step": 8679, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.952374 }, { "epoch": 0.28197381671701915, "grad_norm": 0.38189542293548584, "learning_rate": 8.599790472891885e-06, "loss": 0.03247343376278877, "memory(GiB)": 21.48, "step": 8680, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.952393 }, { "epoch": 0.28200630217977457, "grad_norm": 0.42436906695365906, "learning_rate": 8.599417658562568e-06, "loss": 0.033042922616004944, "memory(GiB)": 21.48, "step": 8681, "token_acc": 0.9842271293375394, "train_speed(iter/s)": 0.952411 }, { "epoch": 0.28203878764253, "grad_norm": 0.5194555521011353, "learning_rate": 8.599044802691181e-06, "loss": 0.031753379851579666, "memory(GiB)": 21.48, "step": 8682, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.952429 }, { "epoch": 0.2820712731052854, "grad_norm": 0.587933361530304, "learning_rate": 8.598671905282026e-06, "loss": 0.04005761817097664, "memory(GiB)": 21.48, "step": 8683, "token_acc": 0.9617486338797814, "train_speed(iter/s)": 0.952449 }, { "epoch": 0.2821037585680408, "grad_norm": 0.3738604485988617, "learning_rate": 8.598298966339406e-06, "loss": 0.032430265098810196, "memory(GiB)": 21.48, "step": 8684, "token_acc": 1.0, "train_speed(iter/s)": 0.952468 }, { "epoch": 0.28213624403079623, "grad_norm": 0.2766115963459015, "learning_rate": 8.597925985867629e-06, "loss": 0.019439982250332832, "memory(GiB)": 21.48, "step": 8685, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.952486 }, { "epoch": 0.28216872949355165, "grad_norm": 0.5515900254249573, "learning_rate": 8.597552963870996e-06, "loss": 0.04519084095954895, "memory(GiB)": 21.48, "step": 8686, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952504 }, { "epoch": 0.28220121495630707, "grad_norm": 0.4020307660102844, "learning_rate": 8.597179900353814e-06, "loss": 0.03898118436336517, "memory(GiB)": 21.48, "step": 8687, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.952524 }, { "epoch": 0.2822337004190625, "grad_norm": 0.5074746012687683, "learning_rate": 8.596806795320387e-06, "loss": 0.03840763121843338, "memory(GiB)": 21.48, "step": 8688, "token_acc": 0.98046875, "train_speed(iter/s)": 0.952543 }, { "epoch": 0.2822661858818179, "grad_norm": 0.7610753774642944, "learning_rate": 8.596433648775023e-06, "loss": 0.030059335753321648, "memory(GiB)": 21.48, "step": 8689, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.95256 }, { "epoch": 0.2822986713445733, "grad_norm": 0.317709743976593, "learning_rate": 8.59606046072203e-06, "loss": 0.030346032232046127, "memory(GiB)": 21.48, "step": 8690, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.952579 }, { "epoch": 0.28233115680732873, "grad_norm": 0.40159857273101807, "learning_rate": 8.595687231165709e-06, "loss": 0.02813138999044895, "memory(GiB)": 21.48, "step": 8691, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952599 }, { "epoch": 0.28236364227008415, "grad_norm": 0.5249897241592407, "learning_rate": 8.595313960110373e-06, "loss": 0.03930313140153885, "memory(GiB)": 21.48, "step": 8692, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.952622 }, { "epoch": 0.28239612773283956, "grad_norm": 0.4217389225959778, "learning_rate": 8.59494064756033e-06, "loss": 0.03997718542814255, "memory(GiB)": 21.48, "step": 8693, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.95264 }, { "epoch": 0.282428613195595, "grad_norm": 0.4874080419540405, "learning_rate": 8.594567293519885e-06, "loss": 0.03298993036150932, "memory(GiB)": 21.48, "step": 8694, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.952659 }, { "epoch": 0.2824610986583504, "grad_norm": 0.5739938020706177, "learning_rate": 8.59419389799335e-06, "loss": 0.03759083151817322, "memory(GiB)": 21.48, "step": 8695, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.952679 }, { "epoch": 0.2824935841211058, "grad_norm": 0.3654384911060333, "learning_rate": 8.593820460985034e-06, "loss": 0.031768567860126495, "memory(GiB)": 21.48, "step": 8696, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.952697 }, { "epoch": 0.28252606958386123, "grad_norm": 0.6081517934799194, "learning_rate": 8.593446982499246e-06, "loss": 0.040676675736904144, "memory(GiB)": 21.48, "step": 8697, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.952716 }, { "epoch": 0.28255855504661664, "grad_norm": 0.4219589829444885, "learning_rate": 8.593073462540297e-06, "loss": 0.033775005489587784, "memory(GiB)": 21.48, "step": 8698, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.952733 }, { "epoch": 0.28259104050937206, "grad_norm": 0.7327272891998291, "learning_rate": 8.592699901112498e-06, "loss": 0.048433005809783936, "memory(GiB)": 21.48, "step": 8699, "token_acc": 0.98046875, "train_speed(iter/s)": 0.952752 }, { "epoch": 0.2826235259721275, "grad_norm": 0.3332110345363617, "learning_rate": 8.59232629822016e-06, "loss": 0.02897612564265728, "memory(GiB)": 21.48, "step": 8700, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95277 }, { "epoch": 0.2826560114348829, "grad_norm": 0.40794461965560913, "learning_rate": 8.591952653867594e-06, "loss": 0.025609146803617477, "memory(GiB)": 21.48, "step": 8701, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95279 }, { "epoch": 0.2826884968976383, "grad_norm": 0.3679981529712677, "learning_rate": 8.591578968059116e-06, "loss": 0.02532709576189518, "memory(GiB)": 21.48, "step": 8702, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.952808 }, { "epoch": 0.2827209823603937, "grad_norm": 0.5736078023910522, "learning_rate": 8.591205240799035e-06, "loss": 0.029887057840824127, "memory(GiB)": 21.48, "step": 8703, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.952826 }, { "epoch": 0.28275346782314914, "grad_norm": 0.40150487422943115, "learning_rate": 8.590831472091666e-06, "loss": 0.028285743668675423, "memory(GiB)": 21.48, "step": 8704, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.952842 }, { "epoch": 0.28278595328590456, "grad_norm": 0.4406058192253113, "learning_rate": 8.590457661941321e-06, "loss": 0.028644073754549026, "memory(GiB)": 21.48, "step": 8705, "token_acc": 1.0, "train_speed(iter/s)": 0.952859 }, { "epoch": 0.28281843874866, "grad_norm": 0.6301061511039734, "learning_rate": 8.590083810352317e-06, "loss": 0.03498055785894394, "memory(GiB)": 21.48, "step": 8706, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.952877 }, { "epoch": 0.2828509242114154, "grad_norm": 0.5347561240196228, "learning_rate": 8.589709917328969e-06, "loss": 0.031849246472120285, "memory(GiB)": 21.48, "step": 8707, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.952897 }, { "epoch": 0.2828834096741708, "grad_norm": 0.35263386368751526, "learning_rate": 8.589335982875588e-06, "loss": 0.028689410537481308, "memory(GiB)": 21.48, "step": 8708, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952916 }, { "epoch": 0.2829158951369262, "grad_norm": 0.672156035900116, "learning_rate": 8.588962006996493e-06, "loss": 0.03211507201194763, "memory(GiB)": 21.48, "step": 8709, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.952933 }, { "epoch": 0.28294838059968164, "grad_norm": 1.1018463373184204, "learning_rate": 8.588587989696e-06, "loss": 0.035679593682289124, "memory(GiB)": 21.48, "step": 8710, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.952952 }, { "epoch": 0.28298086606243705, "grad_norm": 0.3911106586456299, "learning_rate": 8.588213930978424e-06, "loss": 0.025603819638490677, "memory(GiB)": 21.48, "step": 8711, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.952969 }, { "epoch": 0.28301335152519247, "grad_norm": 1.848370909690857, "learning_rate": 8.587839830848087e-06, "loss": 0.03917381912469864, "memory(GiB)": 21.48, "step": 8712, "token_acc": 1.0, "train_speed(iter/s)": 0.952986 }, { "epoch": 0.2830458369879479, "grad_norm": 0.47625070810317993, "learning_rate": 8.587465689309298e-06, "loss": 0.028950415551662445, "memory(GiB)": 21.48, "step": 8713, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953005 }, { "epoch": 0.2830783224507033, "grad_norm": 0.42537298798561096, "learning_rate": 8.587091506366382e-06, "loss": 0.02477409318089485, "memory(GiB)": 21.48, "step": 8714, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.953025 }, { "epoch": 0.2831108079134587, "grad_norm": 0.7030469179153442, "learning_rate": 8.586717282023657e-06, "loss": 0.05042961984872818, "memory(GiB)": 21.48, "step": 8715, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.953043 }, { "epoch": 0.28314329337621413, "grad_norm": 0.5028107166290283, "learning_rate": 8.58634301628544e-06, "loss": 0.03247244283556938, "memory(GiB)": 21.48, "step": 8716, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953061 }, { "epoch": 0.28317577883896955, "grad_norm": 0.5245944261550903, "learning_rate": 8.585968709156051e-06, "loss": 0.026785533875226974, "memory(GiB)": 21.48, "step": 8717, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953084 }, { "epoch": 0.28320826430172497, "grad_norm": 0.774107813835144, "learning_rate": 8.58559436063981e-06, "loss": 0.04522881656885147, "memory(GiB)": 21.48, "step": 8718, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.953109 }, { "epoch": 0.2832407497644804, "grad_norm": 0.552643358707428, "learning_rate": 8.585219970741038e-06, "loss": 0.04589511454105377, "memory(GiB)": 21.48, "step": 8719, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953132 }, { "epoch": 0.2832732352272358, "grad_norm": 0.4108392000198364, "learning_rate": 8.584845539464055e-06, "loss": 0.033316317945718765, "memory(GiB)": 21.48, "step": 8720, "token_acc": 1.0, "train_speed(iter/s)": 0.953155 }, { "epoch": 0.2833057206899912, "grad_norm": 0.4466126561164856, "learning_rate": 8.584471066813183e-06, "loss": 0.02828112430870533, "memory(GiB)": 21.48, "step": 8721, "token_acc": 1.0, "train_speed(iter/s)": 0.953179 }, { "epoch": 0.28333820615274663, "grad_norm": 0.3448942005634308, "learning_rate": 8.584096552792745e-06, "loss": 0.029279474169015884, "memory(GiB)": 21.48, "step": 8722, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.9532 }, { "epoch": 0.28337069161550205, "grad_norm": 0.3818534314632416, "learning_rate": 8.583721997407062e-06, "loss": 0.02519812434911728, "memory(GiB)": 21.48, "step": 8723, "token_acc": 1.0, "train_speed(iter/s)": 0.953222 }, { "epoch": 0.28340317707825746, "grad_norm": 0.4405020475387573, "learning_rate": 8.583347400660455e-06, "loss": 0.03170544654130936, "memory(GiB)": 21.48, "step": 8724, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953246 }, { "epoch": 0.2834356625410129, "grad_norm": 0.4875916838645935, "learning_rate": 8.582972762557253e-06, "loss": 0.04280609264969826, "memory(GiB)": 21.48, "step": 8725, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.95327 }, { "epoch": 0.2834681480037683, "grad_norm": 0.346748948097229, "learning_rate": 8.582598083101775e-06, "loss": 0.033636581152677536, "memory(GiB)": 21.48, "step": 8726, "token_acc": 0.98, "train_speed(iter/s)": 0.953294 }, { "epoch": 0.2835006334665237, "grad_norm": 0.4176509976387024, "learning_rate": 8.582223362298345e-06, "loss": 0.03214109688997269, "memory(GiB)": 21.48, "step": 8727, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953316 }, { "epoch": 0.2835331189292791, "grad_norm": 0.3557588756084442, "learning_rate": 8.581848600151293e-06, "loss": 0.026030369102954865, "memory(GiB)": 21.48, "step": 8728, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.95334 }, { "epoch": 0.28356560439203454, "grad_norm": 0.3773297667503357, "learning_rate": 8.581473796664938e-06, "loss": 0.02657097578048706, "memory(GiB)": 21.48, "step": 8729, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953364 }, { "epoch": 0.28359808985478996, "grad_norm": 0.47978144884109497, "learning_rate": 8.58109895184361e-06, "loss": 0.028628312051296234, "memory(GiB)": 21.48, "step": 8730, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.953388 }, { "epoch": 0.2836305753175454, "grad_norm": 0.6198869347572327, "learning_rate": 8.580724065691633e-06, "loss": 0.031764306128025055, "memory(GiB)": 21.48, "step": 8731, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.953408 }, { "epoch": 0.2836630607803008, "grad_norm": 0.6184191107749939, "learning_rate": 8.580349138213333e-06, "loss": 0.03620604798197746, "memory(GiB)": 21.48, "step": 8732, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953428 }, { "epoch": 0.2836955462430562, "grad_norm": 0.38714835047721863, "learning_rate": 8.57997416941304e-06, "loss": 0.03112131915986538, "memory(GiB)": 21.48, "step": 8733, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953446 }, { "epoch": 0.2837280317058116, "grad_norm": 0.5406293272972107, "learning_rate": 8.57959915929508e-06, "loss": 0.030893821269273758, "memory(GiB)": 21.48, "step": 8734, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953466 }, { "epoch": 0.28376051716856704, "grad_norm": 0.38361015915870667, "learning_rate": 8.579224107863782e-06, "loss": 0.03511872887611389, "memory(GiB)": 21.48, "step": 8735, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953483 }, { "epoch": 0.2837930026313225, "grad_norm": 0.4264237582683563, "learning_rate": 8.578849015123472e-06, "loss": 0.024685662239789963, "memory(GiB)": 21.48, "step": 8736, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.953502 }, { "epoch": 0.2838254880940779, "grad_norm": 0.4581500291824341, "learning_rate": 8.578473881078482e-06, "loss": 0.029811689630150795, "memory(GiB)": 21.48, "step": 8737, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95352 }, { "epoch": 0.28385797355683334, "grad_norm": 0.380928099155426, "learning_rate": 8.57809870573314e-06, "loss": 0.029406830668449402, "memory(GiB)": 21.48, "step": 8738, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.953539 }, { "epoch": 0.28389045901958876, "grad_norm": 0.42547354102134705, "learning_rate": 8.577723489091778e-06, "loss": 0.027152668684720993, "memory(GiB)": 21.48, "step": 8739, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.953557 }, { "epoch": 0.2839229444823442, "grad_norm": 0.3967592120170593, "learning_rate": 8.577348231158723e-06, "loss": 0.03439440578222275, "memory(GiB)": 21.48, "step": 8740, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953574 }, { "epoch": 0.2839554299450996, "grad_norm": 0.4579519033432007, "learning_rate": 8.57697293193831e-06, "loss": 0.028502807021141052, "memory(GiB)": 21.48, "step": 8741, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953594 }, { "epoch": 0.283987915407855, "grad_norm": 0.3909071981906891, "learning_rate": 8.576597591434867e-06, "loss": 0.026671264320611954, "memory(GiB)": 21.48, "step": 8742, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.953614 }, { "epoch": 0.2840204008706104, "grad_norm": 0.5781029462814331, "learning_rate": 8.576222209652728e-06, "loss": 0.029849382117390633, "memory(GiB)": 21.48, "step": 8743, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953633 }, { "epoch": 0.28405288633336584, "grad_norm": 0.5198733806610107, "learning_rate": 8.575846786596224e-06, "loss": 0.030688175931572914, "memory(GiB)": 21.48, "step": 8744, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953652 }, { "epoch": 0.28408537179612126, "grad_norm": 0.7483140230178833, "learning_rate": 8.575471322269689e-06, "loss": 0.038262106478214264, "memory(GiB)": 21.48, "step": 8745, "token_acc": 0.9875, "train_speed(iter/s)": 0.95367 }, { "epoch": 0.28411785725887667, "grad_norm": 0.4785459637641907, "learning_rate": 8.575095816677456e-06, "loss": 0.027806580066680908, "memory(GiB)": 21.48, "step": 8746, "token_acc": 1.0, "train_speed(iter/s)": 0.953687 }, { "epoch": 0.2841503427216321, "grad_norm": 0.5285828113555908, "learning_rate": 8.574720269823859e-06, "loss": 0.030209455639123917, "memory(GiB)": 21.48, "step": 8747, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953706 }, { "epoch": 0.2841828281843875, "grad_norm": 0.3830198049545288, "learning_rate": 8.574344681713231e-06, "loss": 0.03199087828397751, "memory(GiB)": 21.48, "step": 8748, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.953725 }, { "epoch": 0.2842153136471429, "grad_norm": 0.351754367351532, "learning_rate": 8.57396905234991e-06, "loss": 0.018094226717948914, "memory(GiB)": 21.48, "step": 8749, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953745 }, { "epoch": 0.28424779910989834, "grad_norm": 0.5510312914848328, "learning_rate": 8.573593381738228e-06, "loss": 0.027405409142374992, "memory(GiB)": 21.48, "step": 8750, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.953764 }, { "epoch": 0.28428028457265375, "grad_norm": 0.7844766974449158, "learning_rate": 8.573217669882523e-06, "loss": 0.04545729607343674, "memory(GiB)": 21.48, "step": 8751, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.953786 }, { "epoch": 0.28431277003540917, "grad_norm": 0.5768226385116577, "learning_rate": 8.572841916787128e-06, "loss": 0.037426017224788666, "memory(GiB)": 21.48, "step": 8752, "token_acc": 0.975177304964539, "train_speed(iter/s)": 0.95381 }, { "epoch": 0.2843452554981646, "grad_norm": 0.4998912811279297, "learning_rate": 8.572466122456383e-06, "loss": 0.03591775894165039, "memory(GiB)": 21.48, "step": 8753, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.953834 }, { "epoch": 0.28437774096092, "grad_norm": 0.45695585012435913, "learning_rate": 8.572090286894625e-06, "loss": 0.02717725932598114, "memory(GiB)": 21.48, "step": 8754, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.953858 }, { "epoch": 0.2844102264236754, "grad_norm": 0.5087605714797974, "learning_rate": 8.571714410106188e-06, "loss": 0.03679995983839035, "memory(GiB)": 21.48, "step": 8755, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.953876 }, { "epoch": 0.28444271188643083, "grad_norm": 0.41185423731803894, "learning_rate": 8.571338492095416e-06, "loss": 0.02865646220743656, "memory(GiB)": 21.48, "step": 8756, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.953895 }, { "epoch": 0.28447519734918625, "grad_norm": 0.4201764166355133, "learning_rate": 8.570962532866642e-06, "loss": 0.024277444928884506, "memory(GiB)": 21.48, "step": 8757, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.953913 }, { "epoch": 0.28450768281194166, "grad_norm": 0.4770981967449188, "learning_rate": 8.57058653242421e-06, "loss": 0.04362253099679947, "memory(GiB)": 21.48, "step": 8758, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.953931 }, { "epoch": 0.2845401682746971, "grad_norm": 0.5183200836181641, "learning_rate": 8.570210490772455e-06, "loss": 0.03236996382474899, "memory(GiB)": 21.48, "step": 8759, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953951 }, { "epoch": 0.2845726537374525, "grad_norm": 0.5446013808250427, "learning_rate": 8.569834407915718e-06, "loss": 0.04442828893661499, "memory(GiB)": 21.48, "step": 8760, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.953969 }, { "epoch": 0.2846051392002079, "grad_norm": 0.6996743679046631, "learning_rate": 8.569458283858344e-06, "loss": 0.038882799446582794, "memory(GiB)": 21.48, "step": 8761, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953988 }, { "epoch": 0.28463762466296333, "grad_norm": 0.6487863063812256, "learning_rate": 8.569082118604665e-06, "loss": 0.039741113781929016, "memory(GiB)": 21.48, "step": 8762, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954006 }, { "epoch": 0.28467011012571874, "grad_norm": 0.4262789785861969, "learning_rate": 8.568705912159032e-06, "loss": 0.031995989382267, "memory(GiB)": 21.48, "step": 8763, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.28470259558847416, "grad_norm": 0.335832417011261, "learning_rate": 8.568329664525784e-06, "loss": 0.026064898818731308, "memory(GiB)": 21.48, "step": 8764, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954043 }, { "epoch": 0.2847350810512296, "grad_norm": 0.4133675694465637, "learning_rate": 8.56795337570926e-06, "loss": 0.028621051460504532, "memory(GiB)": 21.48, "step": 8765, "token_acc": 1.0, "train_speed(iter/s)": 0.95406 }, { "epoch": 0.284767566513985, "grad_norm": 0.4177230894565582, "learning_rate": 8.567577045713804e-06, "loss": 0.030440136790275574, "memory(GiB)": 21.48, "step": 8766, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954078 }, { "epoch": 0.2848000519767404, "grad_norm": 0.3989800810813904, "learning_rate": 8.567200674543763e-06, "loss": 0.026657238602638245, "memory(GiB)": 21.48, "step": 8767, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.954098 }, { "epoch": 0.2848325374394958, "grad_norm": 0.42264434695243835, "learning_rate": 8.566824262203475e-06, "loss": 0.028335239738225937, "memory(GiB)": 21.48, "step": 8768, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954115 }, { "epoch": 0.28486502290225124, "grad_norm": 0.3576688766479492, "learning_rate": 8.566447808697292e-06, "loss": 0.02924703061580658, "memory(GiB)": 21.48, "step": 8769, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.28489750836500666, "grad_norm": 0.35586613416671753, "learning_rate": 8.566071314029552e-06, "loss": 0.038526084274053574, "memory(GiB)": 21.48, "step": 8770, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954154 }, { "epoch": 0.2849299938277621, "grad_norm": 0.5288438200950623, "learning_rate": 8.565694778204602e-06, "loss": 0.040119364857673645, "memory(GiB)": 21.48, "step": 8771, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954174 }, { "epoch": 0.2849624792905175, "grad_norm": 0.4340706467628479, "learning_rate": 8.565318201226788e-06, "loss": 0.03546376898884773, "memory(GiB)": 21.48, "step": 8772, "token_acc": 1.0, "train_speed(iter/s)": 0.954192 }, { "epoch": 0.2849949647532729, "grad_norm": 0.35861706733703613, "learning_rate": 8.564941583100458e-06, "loss": 0.03044094704091549, "memory(GiB)": 21.48, "step": 8773, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954208 }, { "epoch": 0.2850274502160283, "grad_norm": 0.6809383630752563, "learning_rate": 8.564564923829956e-06, "loss": 0.03782740235328674, "memory(GiB)": 21.48, "step": 8774, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.954225 }, { "epoch": 0.28505993567878374, "grad_norm": 0.7771202921867371, "learning_rate": 8.564188223419631e-06, "loss": 0.03664560616016388, "memory(GiB)": 21.48, "step": 8775, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954242 }, { "epoch": 0.28509242114153915, "grad_norm": 0.3845835328102112, "learning_rate": 8.563811481873831e-06, "loss": 0.026431942358613014, "memory(GiB)": 21.48, "step": 8776, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95426 }, { "epoch": 0.28512490660429457, "grad_norm": 0.29270440340042114, "learning_rate": 8.563434699196902e-06, "loss": 0.022225281223654747, "memory(GiB)": 21.48, "step": 8777, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.95428 }, { "epoch": 0.28515739206705, "grad_norm": 0.3482447564601898, "learning_rate": 8.563057875393194e-06, "loss": 0.029785946011543274, "memory(GiB)": 21.48, "step": 8778, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.954299 }, { "epoch": 0.2851898775298054, "grad_norm": 0.6180251836776733, "learning_rate": 8.562681010467055e-06, "loss": 0.03335809335112572, "memory(GiB)": 21.48, "step": 8779, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.954322 }, { "epoch": 0.2852223629925608, "grad_norm": 0.434408575296402, "learning_rate": 8.562304104422835e-06, "loss": 0.035397566854953766, "memory(GiB)": 21.48, "step": 8780, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.954346 }, { "epoch": 0.28525484845531623, "grad_norm": 0.34329673647880554, "learning_rate": 8.561927157264886e-06, "loss": 0.030657218769192696, "memory(GiB)": 21.48, "step": 8781, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.95437 }, { "epoch": 0.28528733391807165, "grad_norm": 0.30190062522888184, "learning_rate": 8.561550168997555e-06, "loss": 0.0216277614235878, "memory(GiB)": 21.48, "step": 8782, "token_acc": 0.992, "train_speed(iter/s)": 0.954393 }, { "epoch": 0.28531981938082707, "grad_norm": 0.5726388096809387, "learning_rate": 8.561173139625195e-06, "loss": 0.04113021492958069, "memory(GiB)": 21.48, "step": 8783, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.2853523048435825, "grad_norm": 0.43428927659988403, "learning_rate": 8.560796069152158e-06, "loss": 0.035845063626766205, "memory(GiB)": 21.48, "step": 8784, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95444 }, { "epoch": 0.2853847903063379, "grad_norm": 0.46612313389778137, "learning_rate": 8.560418957582795e-06, "loss": 0.041721977293491364, "memory(GiB)": 21.48, "step": 8785, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.2854172757690933, "grad_norm": 0.3748566210269928, "learning_rate": 8.560041804921456e-06, "loss": 0.024722807109355927, "memory(GiB)": 21.48, "step": 8786, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954487 }, { "epoch": 0.28544976123184873, "grad_norm": 0.549244225025177, "learning_rate": 8.559664611172498e-06, "loss": 0.04998953640460968, "memory(GiB)": 21.48, "step": 8787, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.95451 }, { "epoch": 0.28548224669460415, "grad_norm": 0.372709184885025, "learning_rate": 8.559287376340273e-06, "loss": 0.027600083500146866, "memory(GiB)": 21.48, "step": 8788, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.954534 }, { "epoch": 0.28551473215735956, "grad_norm": 0.44425299763679504, "learning_rate": 8.558910100429132e-06, "loss": 0.03215639293193817, "memory(GiB)": 21.48, "step": 8789, "token_acc": 1.0, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.285547217620115, "grad_norm": 0.5509361624717712, "learning_rate": 8.558532783443435e-06, "loss": 0.03242931887507439, "memory(GiB)": 21.48, "step": 8790, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.954581 }, { "epoch": 0.2855797030828704, "grad_norm": 0.4205198884010315, "learning_rate": 8.55815542538753e-06, "loss": 0.029624683782458305, "memory(GiB)": 21.48, "step": 8791, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.9546 }, { "epoch": 0.2856121885456258, "grad_norm": 0.4964853525161743, "learning_rate": 8.557778026265778e-06, "loss": 0.029501400887966156, "memory(GiB)": 21.48, "step": 8792, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.954618 }, { "epoch": 0.2856446740083812, "grad_norm": 0.3827464282512665, "learning_rate": 8.557400586082529e-06, "loss": 0.03211522474884987, "memory(GiB)": 21.48, "step": 8793, "token_acc": 0.984, "train_speed(iter/s)": 0.954635 }, { "epoch": 0.28567715947113664, "grad_norm": 0.33524635434150696, "learning_rate": 8.557023104842146e-06, "loss": 0.021118488162755966, "memory(GiB)": 21.48, "step": 8794, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.954655 }, { "epoch": 0.28570964493389206, "grad_norm": 0.5186344385147095, "learning_rate": 8.55664558254898e-06, "loss": 0.03583485260605812, "memory(GiB)": 21.48, "step": 8795, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.954674 }, { "epoch": 0.2857421303966475, "grad_norm": 0.42978766560554504, "learning_rate": 8.556268019207389e-06, "loss": 0.02957366779446602, "memory(GiB)": 21.48, "step": 8796, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954694 }, { "epoch": 0.2857746158594029, "grad_norm": 0.47665834426879883, "learning_rate": 8.555890414821734e-06, "loss": 0.02989223226904869, "memory(GiB)": 21.48, "step": 8797, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954711 }, { "epoch": 0.2858071013221583, "grad_norm": 1.2224098443984985, "learning_rate": 8.55551276939637e-06, "loss": 0.05238264054059982, "memory(GiB)": 21.48, "step": 8798, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.2858395867849137, "grad_norm": 0.8796420097351074, "learning_rate": 8.555135082935657e-06, "loss": 0.04058849811553955, "memory(GiB)": 21.48, "step": 8799, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95475 }, { "epoch": 0.2858720722476692, "grad_norm": 0.4134598970413208, "learning_rate": 8.554757355443951e-06, "loss": 0.02875032275915146, "memory(GiB)": 21.48, "step": 8800, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954769 }, { "epoch": 0.2859045577104246, "grad_norm": 0.5493388772010803, "learning_rate": 8.554379586925616e-06, "loss": 0.04219325631856918, "memory(GiB)": 21.48, "step": 8801, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954788 }, { "epoch": 0.28593704317318, "grad_norm": 0.4384523034095764, "learning_rate": 8.554001777385008e-06, "loss": 0.028429772704839706, "memory(GiB)": 21.48, "step": 8802, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954806 }, { "epoch": 0.28596952863593544, "grad_norm": 0.5582617521286011, "learning_rate": 8.55362392682649e-06, "loss": 0.03871038183569908, "memory(GiB)": 21.48, "step": 8803, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.954824 }, { "epoch": 0.28600201409869086, "grad_norm": 0.4501173794269562, "learning_rate": 8.553246035254422e-06, "loss": 0.028346052393317223, "memory(GiB)": 21.48, "step": 8804, "token_acc": 0.996, "train_speed(iter/s)": 0.954842 }, { "epoch": 0.2860344995614463, "grad_norm": 0.7433188557624817, "learning_rate": 8.552868102673165e-06, "loss": 0.04980405420064926, "memory(GiB)": 21.48, "step": 8805, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.954861 }, { "epoch": 0.2860669850242017, "grad_norm": 0.5523333549499512, "learning_rate": 8.552490129087082e-06, "loss": 0.036675773561000824, "memory(GiB)": 21.48, "step": 8806, "token_acc": 1.0, "train_speed(iter/s)": 0.95488 }, { "epoch": 0.2860994704869571, "grad_norm": 0.7835092544555664, "learning_rate": 8.552112114500533e-06, "loss": 0.04028492048382759, "memory(GiB)": 21.48, "step": 8807, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.2861319559497125, "grad_norm": 0.5658204555511475, "learning_rate": 8.551734058917885e-06, "loss": 0.02699100784957409, "memory(GiB)": 21.48, "step": 8808, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.28616444141246794, "grad_norm": 0.5938062071800232, "learning_rate": 8.551355962343497e-06, "loss": 0.04426327347755432, "memory(GiB)": 21.48, "step": 8809, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.28619692687522336, "grad_norm": 0.44435811042785645, "learning_rate": 8.550977824781734e-06, "loss": 0.024456052109599113, "memory(GiB)": 21.48, "step": 8810, "token_acc": 1.0, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.2862294123379788, "grad_norm": 0.34607967734336853, "learning_rate": 8.550599646236961e-06, "loss": 0.03206852078437805, "memory(GiB)": 21.48, "step": 8811, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.2862618978007342, "grad_norm": 0.4763389527797699, "learning_rate": 8.550221426713543e-06, "loss": 0.023179704323410988, "memory(GiB)": 21.48, "step": 8812, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.954999 }, { "epoch": 0.2862943832634896, "grad_norm": 0.41370245814323425, "learning_rate": 8.549843166215843e-06, "loss": 0.02857297472655773, "memory(GiB)": 21.48, "step": 8813, "token_acc": 0.9832214765100671, "train_speed(iter/s)": 0.955022 }, { "epoch": 0.286326868726245, "grad_norm": 0.8090996742248535, "learning_rate": 8.54946486474823e-06, "loss": 0.04010479897260666, "memory(GiB)": 21.48, "step": 8814, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.955046 }, { "epoch": 0.28635935418900044, "grad_norm": 0.3812980651855469, "learning_rate": 8.549086522315065e-06, "loss": 0.031242556869983673, "memory(GiB)": 21.48, "step": 8815, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.95507 }, { "epoch": 0.28639183965175585, "grad_norm": 0.6109249591827393, "learning_rate": 8.54870813892072e-06, "loss": 0.035099148750305176, "memory(GiB)": 21.48, "step": 8816, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955092 }, { "epoch": 0.28642432511451127, "grad_norm": 0.44939345121383667, "learning_rate": 8.54832971456956e-06, "loss": 0.03022361546754837, "memory(GiB)": 21.48, "step": 8817, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955109 }, { "epoch": 0.2864568105772667, "grad_norm": 0.6220571398735046, "learning_rate": 8.547951249265951e-06, "loss": 0.03219197690486908, "memory(GiB)": 21.48, "step": 8818, "token_acc": 0.9683794466403162, "train_speed(iter/s)": 0.955127 }, { "epoch": 0.2864892960400221, "grad_norm": 0.45523881912231445, "learning_rate": 8.547572743014263e-06, "loss": 0.03217114880681038, "memory(GiB)": 21.48, "step": 8819, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.955145 }, { "epoch": 0.2865217815027775, "grad_norm": 0.41681739687919617, "learning_rate": 8.547194195818863e-06, "loss": 0.027490731328725815, "memory(GiB)": 21.48, "step": 8820, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.28655426696553293, "grad_norm": 0.395004540681839, "learning_rate": 8.546815607684122e-06, "loss": 0.03211411088705063, "memory(GiB)": 21.48, "step": 8821, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955182 }, { "epoch": 0.28658675242828835, "grad_norm": 0.5251941680908203, "learning_rate": 8.546436978614406e-06, "loss": 0.03169127553701401, "memory(GiB)": 21.48, "step": 8822, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.28661923789104377, "grad_norm": 0.51014643907547, "learning_rate": 8.546058308614089e-06, "loss": 0.04489180073142052, "memory(GiB)": 21.48, "step": 8823, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955219 }, { "epoch": 0.2866517233537992, "grad_norm": 0.5422778725624084, "learning_rate": 8.545679597687538e-06, "loss": 0.0353938564658165, "memory(GiB)": 21.48, "step": 8824, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.2866842088165546, "grad_norm": 0.35299184918403625, "learning_rate": 8.545300845839126e-06, "loss": 0.025372367352247238, "memory(GiB)": 21.48, "step": 8825, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.28671669427931, "grad_norm": 4.7569451332092285, "learning_rate": 8.544922053073223e-06, "loss": 0.047228239476680756, "memory(GiB)": 21.48, "step": 8826, "token_acc": 0.9765625, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.28674917974206543, "grad_norm": 0.4377346634864807, "learning_rate": 8.544543219394202e-06, "loss": 0.02592233195900917, "memory(GiB)": 21.48, "step": 8827, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955291 }, { "epoch": 0.28678166520482085, "grad_norm": 0.5890727639198303, "learning_rate": 8.544164344806434e-06, "loss": 0.0404592826962471, "memory(GiB)": 21.48, "step": 8828, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955309 }, { "epoch": 0.28681415066757626, "grad_norm": 0.36559927463531494, "learning_rate": 8.543785429314292e-06, "loss": 0.028850454837083817, "memory(GiB)": 21.48, "step": 8829, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.955328 }, { "epoch": 0.2868466361303317, "grad_norm": 0.3064188063144684, "learning_rate": 8.543406472922149e-06, "loss": 0.026419516652822495, "memory(GiB)": 21.48, "step": 8830, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.2868791215930871, "grad_norm": 0.4060618281364441, "learning_rate": 8.54302747563438e-06, "loss": 0.02840123325586319, "memory(GiB)": 21.48, "step": 8831, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.2869116070558425, "grad_norm": 0.416633278131485, "learning_rate": 8.542648437455358e-06, "loss": 0.03425901383161545, "memory(GiB)": 21.48, "step": 8832, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.2869440925185979, "grad_norm": 0.46085986495018005, "learning_rate": 8.542269358389457e-06, "loss": 0.03174635395407677, "memory(GiB)": 21.48, "step": 8833, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.28697657798135334, "grad_norm": 0.44288599491119385, "learning_rate": 8.541890238441053e-06, "loss": 0.03704484552145004, "memory(GiB)": 21.48, "step": 8834, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955422 }, { "epoch": 0.28700906344410876, "grad_norm": 0.5010985136032104, "learning_rate": 8.541511077614521e-06, "loss": 0.028542153537273407, "memory(GiB)": 21.48, "step": 8835, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.2870415489068642, "grad_norm": 0.5201783180236816, "learning_rate": 8.54113187591424e-06, "loss": 0.02549520507454872, "memory(GiB)": 21.48, "step": 8836, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.2870740343696196, "grad_norm": 0.4268204867839813, "learning_rate": 8.540752633344582e-06, "loss": 0.03453720360994339, "memory(GiB)": 21.48, "step": 8837, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.287106519832375, "grad_norm": 0.4470100402832031, "learning_rate": 8.540373349909925e-06, "loss": 0.0375392884016037, "memory(GiB)": 21.48, "step": 8838, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.2871390052951304, "grad_norm": 0.43995827436447144, "learning_rate": 8.539994025614649e-06, "loss": 0.04237406700849533, "memory(GiB)": 21.48, "step": 8839, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.955509 }, { "epoch": 0.28717149075788584, "grad_norm": 0.42158591747283936, "learning_rate": 8.53961466046313e-06, "loss": 0.032021477818489075, "memory(GiB)": 21.48, "step": 8840, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.28720397622064125, "grad_norm": 0.5033024549484253, "learning_rate": 8.539235254459746e-06, "loss": 0.0365322083234787, "memory(GiB)": 21.48, "step": 8841, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.28723646168339667, "grad_norm": 0.4144434332847595, "learning_rate": 8.538855807608875e-06, "loss": 0.027941782027482986, "memory(GiB)": 21.48, "step": 8842, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.2872689471461521, "grad_norm": 0.4851469397544861, "learning_rate": 8.5384763199149e-06, "loss": 0.03122740611433983, "memory(GiB)": 21.48, "step": 8843, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955598 }, { "epoch": 0.2873014326089075, "grad_norm": 0.3661888837814331, "learning_rate": 8.538096791382198e-06, "loss": 0.028547225520014763, "memory(GiB)": 21.48, "step": 8844, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.2873339180716629, "grad_norm": 0.27581191062927246, "learning_rate": 8.53771722201515e-06, "loss": 0.02458846941590309, "memory(GiB)": 21.48, "step": 8845, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.955643 }, { "epoch": 0.28736640353441834, "grad_norm": 0.7167780995368958, "learning_rate": 8.537337611818135e-06, "loss": 0.03276440501213074, "memory(GiB)": 21.48, "step": 8846, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955667 }, { "epoch": 0.28739888899717375, "grad_norm": 0.46679264307022095, "learning_rate": 8.536957960795537e-06, "loss": 0.03296252340078354, "memory(GiB)": 21.48, "step": 8847, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.28743137445992917, "grad_norm": 0.46593624353408813, "learning_rate": 8.536578268951738e-06, "loss": 0.024871060624718666, "memory(GiB)": 21.48, "step": 8848, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955713 }, { "epoch": 0.2874638599226846, "grad_norm": 0.6462532877922058, "learning_rate": 8.536198536291115e-06, "loss": 0.04310643672943115, "memory(GiB)": 21.48, "step": 8849, "token_acc": 0.9800664451827242, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.28749634538544, "grad_norm": 0.38893359899520874, "learning_rate": 8.535818762818058e-06, "loss": 0.034547049552202225, "memory(GiB)": 21.48, "step": 8850, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.2875288308481954, "grad_norm": 0.786844789981842, "learning_rate": 8.535438948536944e-06, "loss": 0.03345680981874466, "memory(GiB)": 21.48, "step": 8851, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.28756131631095083, "grad_norm": 0.5383519530296326, "learning_rate": 8.535059093452161e-06, "loss": 0.024765387177467346, "memory(GiB)": 21.48, "step": 8852, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955795 }, { "epoch": 0.28759380177370625, "grad_norm": 0.4918554127216339, "learning_rate": 8.53467919756809e-06, "loss": 0.027954142540693283, "memory(GiB)": 21.48, "step": 8853, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.28762628723646166, "grad_norm": 0.46756312251091003, "learning_rate": 8.534299260889114e-06, "loss": 0.03426027297973633, "memory(GiB)": 21.48, "step": 8854, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.2876587726992171, "grad_norm": 0.9047346711158752, "learning_rate": 8.533919283419623e-06, "loss": 0.03364172205328941, "memory(GiB)": 21.48, "step": 8855, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955848 }, { "epoch": 0.2876912581619725, "grad_norm": 0.4822937548160553, "learning_rate": 8.533539265163998e-06, "loss": 0.039420709013938904, "memory(GiB)": 21.48, "step": 8856, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955866 }, { "epoch": 0.2877237436247279, "grad_norm": 0.6303361654281616, "learning_rate": 8.53315920612663e-06, "loss": 0.025924015790224075, "memory(GiB)": 21.48, "step": 8857, "token_acc": 1.0, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.28775622908748333, "grad_norm": 0.49250656366348267, "learning_rate": 8.5327791063119e-06, "loss": 0.03014436922967434, "memory(GiB)": 21.48, "step": 8858, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.28778871455023874, "grad_norm": 0.3622872829437256, "learning_rate": 8.532398965724196e-06, "loss": 0.02641959860920906, "memory(GiB)": 21.48, "step": 8859, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.28782120001299416, "grad_norm": 0.6344566941261292, "learning_rate": 8.532018784367906e-06, "loss": 0.03334471583366394, "memory(GiB)": 21.48, "step": 8860, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.2878536854757496, "grad_norm": 0.5144587159156799, "learning_rate": 8.53163856224742e-06, "loss": 0.02816830947995186, "memory(GiB)": 21.48, "step": 8861, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.287886170938505, "grad_norm": 0.39852920174598694, "learning_rate": 8.531258299367125e-06, "loss": 0.025615312159061432, "memory(GiB)": 21.48, "step": 8862, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.2879186564012604, "grad_norm": 0.3699764609336853, "learning_rate": 8.530877995731408e-06, "loss": 0.023085029795765877, "memory(GiB)": 21.48, "step": 8863, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955985 }, { "epoch": 0.2879511418640159, "grad_norm": 0.5429940819740295, "learning_rate": 8.530497651344658e-06, "loss": 0.03723127767443657, "memory(GiB)": 21.48, "step": 8864, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.956003 }, { "epoch": 0.2879836273267713, "grad_norm": 0.3895454406738281, "learning_rate": 8.530117266211267e-06, "loss": 0.02629168890416622, "memory(GiB)": 21.48, "step": 8865, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.2880161127895267, "grad_norm": 0.572404682636261, "learning_rate": 8.529736840335625e-06, "loss": 0.03558855876326561, "memory(GiB)": 21.48, "step": 8866, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.28804859825228213, "grad_norm": 0.5009708404541016, "learning_rate": 8.529356373722122e-06, "loss": 0.03233449161052704, "memory(GiB)": 21.48, "step": 8867, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.28808108371503754, "grad_norm": 0.5392035245895386, "learning_rate": 8.528975866375147e-06, "loss": 0.03060605749487877, "memory(GiB)": 21.48, "step": 8868, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.28811356917779296, "grad_norm": 0.6318281292915344, "learning_rate": 8.528595318299096e-06, "loss": 0.0476679652929306, "memory(GiB)": 21.48, "step": 8869, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.2881460546405484, "grad_norm": 0.5135272741317749, "learning_rate": 8.528214729498356e-06, "loss": 0.03464841470122337, "memory(GiB)": 21.48, "step": 8870, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.2881785401033038, "grad_norm": 0.5389730334281921, "learning_rate": 8.527834099977323e-06, "loss": 0.03968953341245651, "memory(GiB)": 21.48, "step": 8871, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.2882110255660592, "grad_norm": 0.3202574551105499, "learning_rate": 8.52745342974039e-06, "loss": 0.018989846110343933, "memory(GiB)": 21.48, "step": 8872, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.2882435110288146, "grad_norm": 0.4018053710460663, "learning_rate": 8.527072718791947e-06, "loss": 0.032272081822156906, "memory(GiB)": 21.48, "step": 8873, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.28827599649157004, "grad_norm": 0.5455691814422607, "learning_rate": 8.526691967136391e-06, "loss": 0.025771968066692352, "memory(GiB)": 21.48, "step": 8874, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.28830848195432546, "grad_norm": 0.42929860949516296, "learning_rate": 8.526311174778117e-06, "loss": 0.03681652620434761, "memory(GiB)": 21.48, "step": 8875, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.2883409674170809, "grad_norm": 0.6414080262184143, "learning_rate": 8.525930341721519e-06, "loss": 0.04279007017612457, "memory(GiB)": 21.48, "step": 8876, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.2883734528798363, "grad_norm": 0.3500100374221802, "learning_rate": 8.525549467970991e-06, "loss": 0.023820284754037857, "memory(GiB)": 21.48, "step": 8877, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.956272 }, { "epoch": 0.2884059383425917, "grad_norm": 0.3762010633945465, "learning_rate": 8.525168553530929e-06, "loss": 0.030090410262346268, "memory(GiB)": 21.48, "step": 8878, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.2884384238053471, "grad_norm": 0.49615803360939026, "learning_rate": 8.52478759840573e-06, "loss": 0.030700568109750748, "memory(GiB)": 21.48, "step": 8879, "token_acc": 1.0, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.28847090926810254, "grad_norm": 0.4290791153907776, "learning_rate": 8.52440660259979e-06, "loss": 0.031016509979963303, "memory(GiB)": 21.48, "step": 8880, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.28850339473085795, "grad_norm": 0.5522897839546204, "learning_rate": 8.524025566117508e-06, "loss": 0.039771221578121185, "memory(GiB)": 21.48, "step": 8881, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.28853588019361337, "grad_norm": 0.4909234642982483, "learning_rate": 8.52364448896328e-06, "loss": 0.033452264964580536, "memory(GiB)": 21.48, "step": 8882, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.2885683656563688, "grad_norm": 0.44750866293907166, "learning_rate": 8.523263371141507e-06, "loss": 0.0294838547706604, "memory(GiB)": 21.48, "step": 8883, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956388 }, { "epoch": 0.2886008511191242, "grad_norm": 0.4962960481643677, "learning_rate": 8.522882212656583e-06, "loss": 0.03742608800530434, "memory(GiB)": 21.48, "step": 8884, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.2886333365818796, "grad_norm": 0.5581954121589661, "learning_rate": 8.522501013512908e-06, "loss": 0.03590827435255051, "memory(GiB)": 21.48, "step": 8885, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956425 }, { "epoch": 0.28866582204463503, "grad_norm": 1.4466320276260376, "learning_rate": 8.522119773714885e-06, "loss": 0.029586266726255417, "memory(GiB)": 21.48, "step": 8886, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956443 }, { "epoch": 0.28869830750739045, "grad_norm": 0.48728254437446594, "learning_rate": 8.52173849326691e-06, "loss": 0.02105480059981346, "memory(GiB)": 21.48, "step": 8887, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.28873079297014587, "grad_norm": 0.43045371770858765, "learning_rate": 8.521357172173387e-06, "loss": 0.035245344042778015, "memory(GiB)": 21.48, "step": 8888, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.95648 }, { "epoch": 0.2887632784329013, "grad_norm": 0.3343771994113922, "learning_rate": 8.520975810438713e-06, "loss": 0.03109431453049183, "memory(GiB)": 21.48, "step": 8889, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.2887957638956567, "grad_norm": 0.3998085856437683, "learning_rate": 8.520594408067295e-06, "loss": 0.03045187145471573, "memory(GiB)": 21.48, "step": 8890, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.2888282493584121, "grad_norm": 0.3224674463272095, "learning_rate": 8.52021296506353e-06, "loss": 0.03283054381608963, "memory(GiB)": 21.48, "step": 8891, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.28886073482116753, "grad_norm": 0.4460398554801941, "learning_rate": 8.519831481431824e-06, "loss": 0.0265042707324028, "memory(GiB)": 21.48, "step": 8892, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956548 }, { "epoch": 0.28889322028392295, "grad_norm": 0.5347083210945129, "learning_rate": 8.519449957176576e-06, "loss": 0.039443619549274445, "memory(GiB)": 21.48, "step": 8893, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.28892570574667836, "grad_norm": 0.5087650418281555, "learning_rate": 8.51906839230219e-06, "loss": 0.03293321654200554, "memory(GiB)": 21.48, "step": 8894, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.956586 }, { "epoch": 0.2889581912094338, "grad_norm": 0.42244142293930054, "learning_rate": 8.518686786813072e-06, "loss": 0.027361955493688583, "memory(GiB)": 21.48, "step": 8895, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.2889906766721892, "grad_norm": 0.752027153968811, "learning_rate": 8.518305140713626e-06, "loss": 0.027702653780579567, "memory(GiB)": 21.48, "step": 8896, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.2890231621349446, "grad_norm": 0.8665294647216797, "learning_rate": 8.517923454008255e-06, "loss": 0.027669589966535568, "memory(GiB)": 21.48, "step": 8897, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.2890556475977, "grad_norm": 0.6216554641723633, "learning_rate": 8.517541726701365e-06, "loss": 0.03830842301249504, "memory(GiB)": 21.48, "step": 8898, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.28908813306045544, "grad_norm": 0.5200842022895813, "learning_rate": 8.51715995879736e-06, "loss": 0.025224026292562485, "memory(GiB)": 21.48, "step": 8899, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.28912061852321086, "grad_norm": 0.4381125569343567, "learning_rate": 8.516778150300651e-06, "loss": 0.027202963829040527, "memory(GiB)": 21.48, "step": 8900, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.956688 }, { "epoch": 0.2891531039859663, "grad_norm": 0.4423864483833313, "learning_rate": 8.516396301215639e-06, "loss": 0.0306868739426136, "memory(GiB)": 21.48, "step": 8901, "token_acc": 0.988, "train_speed(iter/s)": 0.956706 }, { "epoch": 0.2891855894487217, "grad_norm": 0.43039050698280334, "learning_rate": 8.516014411546734e-06, "loss": 0.0319344624876976, "memory(GiB)": 21.48, "step": 8902, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.956726 }, { "epoch": 0.2892180749114771, "grad_norm": 0.46526890993118286, "learning_rate": 8.515632481298343e-06, "loss": 0.025310803204774857, "memory(GiB)": 21.48, "step": 8903, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.956749 }, { "epoch": 0.2892505603742325, "grad_norm": 0.4526183009147644, "learning_rate": 8.515250510474874e-06, "loss": 0.026236949488520622, "memory(GiB)": 21.48, "step": 8904, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.28928304583698794, "grad_norm": 0.4020063877105713, "learning_rate": 8.514868499080735e-06, "loss": 0.02574886381626129, "memory(GiB)": 21.48, "step": 8905, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.28931553129974336, "grad_norm": 0.47349703311920166, "learning_rate": 8.514486447120335e-06, "loss": 0.02170124650001526, "memory(GiB)": 21.48, "step": 8906, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.28934801676249877, "grad_norm": 0.5153781771659851, "learning_rate": 8.514104354598085e-06, "loss": 0.028911683708429337, "memory(GiB)": 21.48, "step": 8907, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956843 }, { "epoch": 0.2893805022252542, "grad_norm": 0.3425545394420624, "learning_rate": 8.513722221518392e-06, "loss": 0.02299417555332184, "memory(GiB)": 21.48, "step": 8908, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956865 }, { "epoch": 0.2894129876880096, "grad_norm": 0.8754022121429443, "learning_rate": 8.513340047885669e-06, "loss": 0.03290871903300285, "memory(GiB)": 21.48, "step": 8909, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.289445473150765, "grad_norm": 0.4591183364391327, "learning_rate": 8.512957833704325e-06, "loss": 0.031071845442056656, "memory(GiB)": 21.48, "step": 8910, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.28947795861352044, "grad_norm": 0.7644783854484558, "learning_rate": 8.512575578978772e-06, "loss": 0.03320451080799103, "memory(GiB)": 21.48, "step": 8911, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.28951044407627585, "grad_norm": 0.5412054061889648, "learning_rate": 8.512193283713422e-06, "loss": 0.041282638907432556, "memory(GiB)": 21.48, "step": 8912, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.28954292953903127, "grad_norm": 0.705321729183197, "learning_rate": 8.511810947912683e-06, "loss": 0.035537127405405045, "memory(GiB)": 21.48, "step": 8913, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.2895754150017867, "grad_norm": 1.5405926704406738, "learning_rate": 8.511428571580976e-06, "loss": 0.035248130559921265, "memory(GiB)": 21.48, "step": 8914, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.2896079004645421, "grad_norm": 0.6830539107322693, "learning_rate": 8.511046154722709e-06, "loss": 0.036641091108322144, "memory(GiB)": 21.48, "step": 8915, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.2896403859272975, "grad_norm": 0.42549383640289307, "learning_rate": 8.510663697342297e-06, "loss": 0.03774750232696533, "memory(GiB)": 21.48, "step": 8916, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.28967287139005293, "grad_norm": 0.7147989869117737, "learning_rate": 8.51028119944415e-06, "loss": 0.05181494355201721, "memory(GiB)": 21.48, "step": 8917, "token_acc": 0.96, "train_speed(iter/s)": 0.957028 }, { "epoch": 0.28970535685280835, "grad_norm": 0.5789979100227356, "learning_rate": 8.509898661032687e-06, "loss": 0.0306459441781044, "memory(GiB)": 21.48, "step": 8918, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.28973784231556377, "grad_norm": 0.4510910212993622, "learning_rate": 8.509516082112322e-06, "loss": 0.04206007719039917, "memory(GiB)": 21.48, "step": 8919, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.2897703277783192, "grad_norm": 0.34798726439476013, "learning_rate": 8.509133462687471e-06, "loss": 0.0264669731259346, "memory(GiB)": 21.48, "step": 8920, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.2898028132410746, "grad_norm": 0.6310858130455017, "learning_rate": 8.508750802762549e-06, "loss": 0.027286304160952568, "memory(GiB)": 21.48, "step": 8921, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.28983529870383, "grad_norm": 0.4118984639644623, "learning_rate": 8.508368102341972e-06, "loss": 0.032480090856552124, "memory(GiB)": 21.48, "step": 8922, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.28986778416658543, "grad_norm": 0.8100460767745972, "learning_rate": 8.507985361430157e-06, "loss": 0.02652524784207344, "memory(GiB)": 21.48, "step": 8923, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.957132 }, { "epoch": 0.28990026962934085, "grad_norm": 0.42834386229515076, "learning_rate": 8.507602580031523e-06, "loss": 0.029767772182822227, "memory(GiB)": 21.48, "step": 8924, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.28993275509209626, "grad_norm": 0.7987231016159058, "learning_rate": 8.507219758150486e-06, "loss": 0.0317813865840435, "memory(GiB)": 21.48, "step": 8925, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.2899652405548517, "grad_norm": 0.3750923275947571, "learning_rate": 8.506836895791465e-06, "loss": 0.025423934683203697, "memory(GiB)": 21.48, "step": 8926, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.2899977260176071, "grad_norm": 0.493198961019516, "learning_rate": 8.50645399295888e-06, "loss": 0.0362081304192543, "memory(GiB)": 21.48, "step": 8927, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.29003021148036257, "grad_norm": 0.5420341491699219, "learning_rate": 8.506071049657146e-06, "loss": 0.04008878022432327, "memory(GiB)": 21.48, "step": 8928, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.290062696943118, "grad_norm": 0.43596047163009644, "learning_rate": 8.505688065890687e-06, "loss": 0.03069409914314747, "memory(GiB)": 21.48, "step": 8929, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.2900951824058734, "grad_norm": 0.526668131351471, "learning_rate": 8.505305041663923e-06, "loss": 0.03659749776124954, "memory(GiB)": 21.48, "step": 8930, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.2901276678686288, "grad_norm": 0.47335943579673767, "learning_rate": 8.50492197698127e-06, "loss": 0.02802046574652195, "memory(GiB)": 21.48, "step": 8931, "token_acc": 0.98046875, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.29016015333138423, "grad_norm": 0.3834867477416992, "learning_rate": 8.504538871847154e-06, "loss": 0.025809241458773613, "memory(GiB)": 21.48, "step": 8932, "token_acc": 1.0, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.29019263879413965, "grad_norm": 0.5802419185638428, "learning_rate": 8.504155726265998e-06, "loss": 0.04265566170215607, "memory(GiB)": 21.48, "step": 8933, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.29022512425689506, "grad_norm": 0.4239517152309418, "learning_rate": 8.503772540242216e-06, "loss": 0.02956358715891838, "memory(GiB)": 21.48, "step": 8934, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.2902576097196505, "grad_norm": 0.3789498209953308, "learning_rate": 8.503389313780239e-06, "loss": 0.024129118770360947, "memory(GiB)": 21.48, "step": 8935, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.957369 }, { "epoch": 0.2902900951824059, "grad_norm": 0.6747987270355225, "learning_rate": 8.503006046884483e-06, "loss": 0.0374516025185585, "memory(GiB)": 21.48, "step": 8936, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.957392 }, { "epoch": 0.2903225806451613, "grad_norm": 0.44929614663124084, "learning_rate": 8.502622739559378e-06, "loss": 0.03684322535991669, "memory(GiB)": 21.48, "step": 8937, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.957414 }, { "epoch": 0.2903550661079167, "grad_norm": 0.49594560265541077, "learning_rate": 8.502239391809343e-06, "loss": 0.035199932754039764, "memory(GiB)": 21.48, "step": 8938, "token_acc": 1.0, "train_speed(iter/s)": 0.957437 }, { "epoch": 0.29038755157067214, "grad_norm": 0.47136804461479187, "learning_rate": 8.501856003638805e-06, "loss": 0.03314351290464401, "memory(GiB)": 21.48, "step": 8939, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.29042003703342756, "grad_norm": 0.40088677406311035, "learning_rate": 8.501472575052186e-06, "loss": 0.029724247753620148, "memory(GiB)": 21.48, "step": 8940, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.957484 }, { "epoch": 0.290452522496183, "grad_norm": 0.3759693205356598, "learning_rate": 8.501089106053914e-06, "loss": 0.02313918247818947, "memory(GiB)": 21.48, "step": 8941, "token_acc": 1.0, "train_speed(iter/s)": 0.957502 }, { "epoch": 0.2904850079589384, "grad_norm": 0.6580095291137695, "learning_rate": 8.500705596648412e-06, "loss": 0.028358399868011475, "memory(GiB)": 21.48, "step": 8942, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957521 }, { "epoch": 0.2905174934216938, "grad_norm": 0.4511031210422516, "learning_rate": 8.50032204684011e-06, "loss": 0.033959757536649704, "memory(GiB)": 21.48, "step": 8943, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.957538 }, { "epoch": 0.2905499788844492, "grad_norm": 0.560093343257904, "learning_rate": 8.499938456633433e-06, "loss": 0.033406831324100494, "memory(GiB)": 21.48, "step": 8944, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957555 }, { "epoch": 0.29058246434720464, "grad_norm": 0.6230751872062683, "learning_rate": 8.499554826032808e-06, "loss": 0.04258866608142853, "memory(GiB)": 21.48, "step": 8945, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.29061494980996005, "grad_norm": 0.4262571632862091, "learning_rate": 8.49917115504266e-06, "loss": 0.029747799038887024, "memory(GiB)": 21.48, "step": 8946, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.29064743527271547, "grad_norm": 0.39317429065704346, "learning_rate": 8.498787443667421e-06, "loss": 0.03383169323205948, "memory(GiB)": 21.48, "step": 8947, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.2906799207354709, "grad_norm": 0.3827706575393677, "learning_rate": 8.498403691911519e-06, "loss": 0.023445233702659607, "memory(GiB)": 21.48, "step": 8948, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.2907124061982263, "grad_norm": 0.3001099228858948, "learning_rate": 8.49801989977938e-06, "loss": 0.024634260684251785, "memory(GiB)": 21.48, "step": 8949, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.2907448916609817, "grad_norm": 0.37411150336265564, "learning_rate": 8.497636067275436e-06, "loss": 0.03338286653161049, "memory(GiB)": 21.48, "step": 8950, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95766 }, { "epoch": 0.29077737712373714, "grad_norm": 0.31150367856025696, "learning_rate": 8.497252194404117e-06, "loss": 0.022295091301202774, "memory(GiB)": 21.48, "step": 8951, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957675 }, { "epoch": 0.29080986258649255, "grad_norm": 0.584351658821106, "learning_rate": 8.496868281169855e-06, "loss": 0.039681874215602875, "memory(GiB)": 21.48, "step": 8952, "token_acc": 0.958139534883721, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.29084234804924797, "grad_norm": 0.6262988448143005, "learning_rate": 8.496484327577075e-06, "loss": 0.03723989054560661, "memory(GiB)": 21.48, "step": 8953, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.2908748335120034, "grad_norm": 0.35130980610847473, "learning_rate": 8.496100333630215e-06, "loss": 0.01915132626891136, "memory(GiB)": 21.48, "step": 8954, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957726 }, { "epoch": 0.2909073189747588, "grad_norm": 0.4426102936267853, "learning_rate": 8.495716299333701e-06, "loss": 0.029365135356783867, "memory(GiB)": 21.48, "step": 8955, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.2909398044375142, "grad_norm": 0.37360531091690063, "learning_rate": 8.49533222469197e-06, "loss": 0.02726391702890396, "memory(GiB)": 21.48, "step": 8956, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.29097228990026963, "grad_norm": 0.6038020253181458, "learning_rate": 8.494948109709452e-06, "loss": 0.03197638317942619, "memory(GiB)": 21.48, "step": 8957, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.29100477536302505, "grad_norm": 0.41173240542411804, "learning_rate": 8.494563954390582e-06, "loss": 0.027408521622419357, "memory(GiB)": 21.48, "step": 8958, "token_acc": 1.0, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.29103726082578046, "grad_norm": 0.5273420810699463, "learning_rate": 8.494179758739793e-06, "loss": 0.029385827481746674, "memory(GiB)": 21.48, "step": 8959, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957813 }, { "epoch": 0.2910697462885359, "grad_norm": 0.6667708158493042, "learning_rate": 8.493795522761518e-06, "loss": 0.037302009761333466, "memory(GiB)": 21.48, "step": 8960, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.2911022317512913, "grad_norm": 0.3941488265991211, "learning_rate": 8.493411246460193e-06, "loss": 0.03420242294669151, "memory(GiB)": 21.48, "step": 8961, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957844 }, { "epoch": 0.2911347172140467, "grad_norm": 0.47798359394073486, "learning_rate": 8.493026929840251e-06, "loss": 0.038461461663246155, "memory(GiB)": 21.48, "step": 8962, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.29116720267680213, "grad_norm": 0.4531797468662262, "learning_rate": 8.492642572906128e-06, "loss": 0.02860569953918457, "memory(GiB)": 21.48, "step": 8963, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.29119968813955754, "grad_norm": 0.5567311644554138, "learning_rate": 8.492258175662265e-06, "loss": 0.03891265392303467, "memory(GiB)": 21.48, "step": 8964, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.29123217360231296, "grad_norm": 0.4771019220352173, "learning_rate": 8.491873738113093e-06, "loss": 0.029840203002095222, "memory(GiB)": 21.48, "step": 8965, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.957919 }, { "epoch": 0.2912646590650684, "grad_norm": 0.40702199935913086, "learning_rate": 8.491489260263048e-06, "loss": 0.026963049545884132, "memory(GiB)": 21.48, "step": 8966, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.2912971445278238, "grad_norm": 0.491737961769104, "learning_rate": 8.491104742116572e-06, "loss": 0.03468254208564758, "memory(GiB)": 21.48, "step": 8967, "token_acc": 0.984375, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.2913296299905792, "grad_norm": 0.6851837635040283, "learning_rate": 8.4907201836781e-06, "loss": 0.040890105068683624, "memory(GiB)": 21.48, "step": 8968, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.2913621154533346, "grad_norm": 0.6381861567497253, "learning_rate": 8.490335584952071e-06, "loss": 0.03499697893857956, "memory(GiB)": 21.48, "step": 8969, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958002 }, { "epoch": 0.29139460091609004, "grad_norm": 0.48885175585746765, "learning_rate": 8.489950945942924e-06, "loss": 0.02944663166999817, "memory(GiB)": 21.48, "step": 8970, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958017 }, { "epoch": 0.29142708637884546, "grad_norm": 1.167847752571106, "learning_rate": 8.489566266655099e-06, "loss": 0.033979207277297974, "memory(GiB)": 21.48, "step": 8971, "token_acc": 0.973568281938326, "train_speed(iter/s)": 0.958034 }, { "epoch": 0.2914595718416009, "grad_norm": 0.34131115674972534, "learning_rate": 8.489181547093032e-06, "loss": 0.03582468628883362, "memory(GiB)": 21.48, "step": 8972, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.2914920573043563, "grad_norm": 0.3489174544811249, "learning_rate": 8.48879678726117e-06, "loss": 0.020277490839362144, "memory(GiB)": 21.48, "step": 8973, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95807 }, { "epoch": 0.2915245427671117, "grad_norm": 0.3580782115459442, "learning_rate": 8.488411987163945e-06, "loss": 0.030206002295017242, "memory(GiB)": 21.48, "step": 8974, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958089 }, { "epoch": 0.2915570282298671, "grad_norm": 0.3402804732322693, "learning_rate": 8.488027146805805e-06, "loss": 0.03876487910747528, "memory(GiB)": 21.48, "step": 8975, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.958106 }, { "epoch": 0.29158951369262254, "grad_norm": 0.6525551676750183, "learning_rate": 8.48764226619119e-06, "loss": 0.03302288055419922, "memory(GiB)": 21.48, "step": 8976, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.29162199915537795, "grad_norm": 0.40620186924934387, "learning_rate": 8.48725734532454e-06, "loss": 0.03343517705798149, "memory(GiB)": 21.48, "step": 8977, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.29165448461813337, "grad_norm": 0.37277624011039734, "learning_rate": 8.486872384210298e-06, "loss": 0.022992949932813644, "memory(GiB)": 21.48, "step": 8978, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.2916869700808888, "grad_norm": 0.4915767312049866, "learning_rate": 8.486487382852907e-06, "loss": 0.02915114536881447, "memory(GiB)": 21.48, "step": 8979, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.2917194555436442, "grad_norm": 0.5790391564369202, "learning_rate": 8.486102341256812e-06, "loss": 0.037031158804893494, "memory(GiB)": 21.48, "step": 8980, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.2917519410063996, "grad_norm": 0.4045262634754181, "learning_rate": 8.485717259426456e-06, "loss": 0.03435378894209862, "memory(GiB)": 21.48, "step": 8981, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.29178442646915503, "grad_norm": 0.3024674952030182, "learning_rate": 8.485332137366285e-06, "loss": 0.02523801475763321, "memory(GiB)": 21.48, "step": 8982, "token_acc": 1.0, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.29181691193191045, "grad_norm": 0.42800015211105347, "learning_rate": 8.48494697508074e-06, "loss": 0.03593870997428894, "memory(GiB)": 21.48, "step": 8983, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958241 }, { "epoch": 0.29184939739466587, "grad_norm": 0.5279902219772339, "learning_rate": 8.484561772574269e-06, "loss": 0.03610554337501526, "memory(GiB)": 21.48, "step": 8984, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958256 }, { "epoch": 0.2918818828574213, "grad_norm": 0.3988119065761566, "learning_rate": 8.484176529851318e-06, "loss": 0.026767350733280182, "memory(GiB)": 21.48, "step": 8985, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.2919143683201767, "grad_norm": 0.2941867411136627, "learning_rate": 8.483791246916334e-06, "loss": 0.017812270671129227, "memory(GiB)": 21.48, "step": 8986, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.958291 }, { "epoch": 0.2919468537829321, "grad_norm": 0.5373402833938599, "learning_rate": 8.483405923773759e-06, "loss": 0.03965606540441513, "memory(GiB)": 21.48, "step": 8987, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.958306 }, { "epoch": 0.29197933924568753, "grad_norm": 0.5112504363059998, "learning_rate": 8.483020560428043e-06, "loss": 0.039855003356933594, "memory(GiB)": 21.48, "step": 8988, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958323 }, { "epoch": 0.29201182470844295, "grad_norm": 0.32732877135276794, "learning_rate": 8.482635156883636e-06, "loss": 0.021443024277687073, "memory(GiB)": 21.48, "step": 8989, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.29204431017119836, "grad_norm": 0.4231870770454407, "learning_rate": 8.482249713144982e-06, "loss": 0.02918928861618042, "memory(GiB)": 21.48, "step": 8990, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.958363 }, { "epoch": 0.2920767956339538, "grad_norm": 0.4230222702026367, "learning_rate": 8.481864229216534e-06, "loss": 0.03542645275592804, "memory(GiB)": 21.48, "step": 8991, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.958386 }, { "epoch": 0.29210928109670925, "grad_norm": 0.4979293644428253, "learning_rate": 8.481478705102738e-06, "loss": 0.03416150063276291, "memory(GiB)": 21.48, "step": 8992, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.958409 }, { "epoch": 0.29214176655946467, "grad_norm": 0.444020539522171, "learning_rate": 8.481093140808042e-06, "loss": 0.03495553508400917, "memory(GiB)": 21.48, "step": 8993, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958431 }, { "epoch": 0.2921742520222201, "grad_norm": 0.403865784406662, "learning_rate": 8.480707536336897e-06, "loss": 0.04176295921206474, "memory(GiB)": 21.48, "step": 8994, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.958454 }, { "epoch": 0.2922067374849755, "grad_norm": 0.49782589077949524, "learning_rate": 8.480321891693757e-06, "loss": 0.03648572415113449, "memory(GiB)": 21.48, "step": 8995, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.958476 }, { "epoch": 0.2922392229477309, "grad_norm": 0.5155343413352966, "learning_rate": 8.479936206883068e-06, "loss": 0.0338202565908432, "memory(GiB)": 21.48, "step": 8996, "token_acc": 0.9719298245614035, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.29227170841048633, "grad_norm": 0.34973034262657166, "learning_rate": 8.479550481909284e-06, "loss": 0.03356482833623886, "memory(GiB)": 21.48, "step": 8997, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.95852 }, { "epoch": 0.29230419387324175, "grad_norm": 0.4067360758781433, "learning_rate": 8.479164716776858e-06, "loss": 0.030244454741477966, "memory(GiB)": 21.48, "step": 8998, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.958543 }, { "epoch": 0.29233667933599716, "grad_norm": 0.5587191581726074, "learning_rate": 8.478778911490238e-06, "loss": 0.033784784376621246, "memory(GiB)": 21.48, "step": 8999, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958564 }, { "epoch": 0.2923691647987526, "grad_norm": 0.9472740888595581, "learning_rate": 8.47839306605388e-06, "loss": 0.037802115082740784, "memory(GiB)": 21.48, "step": 9000, "token_acc": 0.9530516431924883, "train_speed(iter/s)": 0.958588 }, { "epoch": 0.2923691647987526, "eval_loss": 0.03258657082915306, "eval_runtime": 80.9412, "eval_samples_per_second": 122.929, "eval_steps_per_second": 3.842, "eval_token_acc": 0.9874364399871636, "step": 9000 }, { "epoch": 0.292401650261508, "grad_norm": 0.2960616946220398, "learning_rate": 8.478007180472236e-06, "loss": 0.029702097177505493, "memory(GiB)": 21.48, "step": 9001, "token_acc": 0.9865373817056302, "train_speed(iter/s)": 0.949329 }, { "epoch": 0.2924341357242634, "grad_norm": 0.8182184100151062, "learning_rate": 8.477621254749762e-06, "loss": 0.029064573347568512, "memory(GiB)": 21.48, "step": 9002, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.949343 }, { "epoch": 0.2924666211870188, "grad_norm": 0.3648111820220947, "learning_rate": 8.477235288890907e-06, "loss": 0.03207852691411972, "memory(GiB)": 21.48, "step": 9003, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.949358 }, { "epoch": 0.29249910664977424, "grad_norm": 0.7210526466369629, "learning_rate": 8.47684928290013e-06, "loss": 0.03754904866218567, "memory(GiB)": 21.48, "step": 9004, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.949376 }, { "epoch": 0.29253159211252966, "grad_norm": 0.548159122467041, "learning_rate": 8.476463236781885e-06, "loss": 0.04197484254837036, "memory(GiB)": 21.48, "step": 9005, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.949395 }, { "epoch": 0.2925640775752851, "grad_norm": 0.5147044062614441, "learning_rate": 8.476077150540625e-06, "loss": 0.0316629596054554, "memory(GiB)": 21.48, "step": 9006, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.949407 }, { "epoch": 0.2925965630380405, "grad_norm": 0.538279116153717, "learning_rate": 8.47569102418081e-06, "loss": 0.03195151686668396, "memory(GiB)": 21.48, "step": 9007, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.949425 }, { "epoch": 0.2926290485007959, "grad_norm": 0.3574800491333008, "learning_rate": 8.475304857706895e-06, "loss": 0.03035728819668293, "memory(GiB)": 21.48, "step": 9008, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.949445 }, { "epoch": 0.2926615339635513, "grad_norm": 0.410368412733078, "learning_rate": 8.474918651123336e-06, "loss": 0.034487515687942505, "memory(GiB)": 21.48, "step": 9009, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.949462 }, { "epoch": 0.29269401942630674, "grad_norm": 1.0293501615524292, "learning_rate": 8.474532404434587e-06, "loss": 0.04058451950550079, "memory(GiB)": 21.48, "step": 9010, "token_acc": 0.988, "train_speed(iter/s)": 0.949479 }, { "epoch": 0.29272650488906216, "grad_norm": 0.5439494848251343, "learning_rate": 8.474146117645114e-06, "loss": 0.03134872764348984, "memory(GiB)": 21.48, "step": 9011, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.949497 }, { "epoch": 0.29275899035181757, "grad_norm": 0.5049421191215515, "learning_rate": 8.47375979075937e-06, "loss": 0.028686080127954483, "memory(GiB)": 21.48, "step": 9012, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.949513 }, { "epoch": 0.292791475814573, "grad_norm": 0.4025111496448517, "learning_rate": 8.473373423781812e-06, "loss": 0.03394714742898941, "memory(GiB)": 21.48, "step": 9013, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.949528 }, { "epoch": 0.2928239612773284, "grad_norm": 1.196581244468689, "learning_rate": 8.472987016716904e-06, "loss": 0.03638489171862602, "memory(GiB)": 21.48, "step": 9014, "token_acc": 0.9744525547445255, "train_speed(iter/s)": 0.949543 }, { "epoch": 0.2928564467400838, "grad_norm": 0.38141804933547974, "learning_rate": 8.472600569569103e-06, "loss": 0.026815054938197136, "memory(GiB)": 21.48, "step": 9015, "token_acc": 1.0, "train_speed(iter/s)": 0.949561 }, { "epoch": 0.29288893220283924, "grad_norm": 0.5558325052261353, "learning_rate": 8.47221408234287e-06, "loss": 0.03094106912612915, "memory(GiB)": 21.48, "step": 9016, "token_acc": 0.99, "train_speed(iter/s)": 0.949577 }, { "epoch": 0.29292141766559465, "grad_norm": 1.5975751876831055, "learning_rate": 8.471827555042662e-06, "loss": 0.04108010232448578, "memory(GiB)": 21.48, "step": 9017, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.949593 }, { "epoch": 0.29295390312835007, "grad_norm": 0.39794036746025085, "learning_rate": 8.471440987672945e-06, "loss": 0.026877902448177338, "memory(GiB)": 21.48, "step": 9018, "token_acc": 1.0, "train_speed(iter/s)": 0.94961 }, { "epoch": 0.2929863885911055, "grad_norm": 0.46815067529678345, "learning_rate": 8.471054380238179e-06, "loss": 0.03455542027950287, "memory(GiB)": 21.48, "step": 9019, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.949627 }, { "epoch": 0.2930188740538609, "grad_norm": 0.45382407307624817, "learning_rate": 8.470667732742825e-06, "loss": 0.02858179807662964, "memory(GiB)": 21.48, "step": 9020, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.949643 }, { "epoch": 0.2930513595166163, "grad_norm": 0.4536643624305725, "learning_rate": 8.470281045191346e-06, "loss": 0.03898929804563522, "memory(GiB)": 21.48, "step": 9021, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.949661 }, { "epoch": 0.29308384497937173, "grad_norm": 0.4022068977355957, "learning_rate": 8.469894317588206e-06, "loss": 0.040836818516254425, "memory(GiB)": 21.48, "step": 9022, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.949678 }, { "epoch": 0.29311633044212715, "grad_norm": 0.36023780703544617, "learning_rate": 8.469507549937866e-06, "loss": 0.03264245390892029, "memory(GiB)": 21.48, "step": 9023, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.949696 }, { "epoch": 0.29314881590488256, "grad_norm": 0.3672129511833191, "learning_rate": 8.46912074224479e-06, "loss": 0.03160824999213219, "memory(GiB)": 21.48, "step": 9024, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.949714 }, { "epoch": 0.293181301367638, "grad_norm": 0.44754648208618164, "learning_rate": 8.468733894513443e-06, "loss": 0.03387414291501045, "memory(GiB)": 21.48, "step": 9025, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.949734 }, { "epoch": 0.2932137868303934, "grad_norm": 0.29713574051856995, "learning_rate": 8.468347006748291e-06, "loss": 0.029482807964086533, "memory(GiB)": 21.48, "step": 9026, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.949755 }, { "epoch": 0.2932462722931488, "grad_norm": 0.4386747181415558, "learning_rate": 8.4679600789538e-06, "loss": 0.027598444372415543, "memory(GiB)": 21.48, "step": 9027, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.949771 }, { "epoch": 0.29327875775590423, "grad_norm": 0.3778913915157318, "learning_rate": 8.46757311113443e-06, "loss": 0.02415180206298828, "memory(GiB)": 21.48, "step": 9028, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.949789 }, { "epoch": 0.29331124321865965, "grad_norm": 0.399326890707016, "learning_rate": 8.467186103294654e-06, "loss": 0.03356661647558212, "memory(GiB)": 21.48, "step": 9029, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.949806 }, { "epoch": 0.29334372868141506, "grad_norm": 0.4171474277973175, "learning_rate": 8.466799055438934e-06, "loss": 0.03362724184989929, "memory(GiB)": 21.48, "step": 9030, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.949824 }, { "epoch": 0.2933762141441705, "grad_norm": 0.515152633190155, "learning_rate": 8.466411967571739e-06, "loss": 0.033386629074811935, "memory(GiB)": 21.48, "step": 9031, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.94984 }, { "epoch": 0.2934086996069259, "grad_norm": 0.4433392882347107, "learning_rate": 8.466024839697537e-06, "loss": 0.032582540065050125, "memory(GiB)": 21.48, "step": 9032, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.949858 }, { "epoch": 0.2934411850696813, "grad_norm": 0.6010439991950989, "learning_rate": 8.465637671820794e-06, "loss": 0.04102827608585358, "memory(GiB)": 21.48, "step": 9033, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.949873 }, { "epoch": 0.2934736705324367, "grad_norm": 0.5503987669944763, "learning_rate": 8.46525046394598e-06, "loss": 0.035352785140275955, "memory(GiB)": 21.48, "step": 9034, "token_acc": 0.9634703196347032, "train_speed(iter/s)": 0.949887 }, { "epoch": 0.29350615599519214, "grad_norm": 0.5027404427528381, "learning_rate": 8.464863216077564e-06, "loss": 0.028877967968583107, "memory(GiB)": 21.48, "step": 9035, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.949903 }, { "epoch": 0.29353864145794756, "grad_norm": 0.4706264138221741, "learning_rate": 8.464475928220013e-06, "loss": 0.034028712660074234, "memory(GiB)": 21.48, "step": 9036, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.94992 }, { "epoch": 0.293571126920703, "grad_norm": 0.3243546187877655, "learning_rate": 8.4640886003778e-06, "loss": 0.019392240792512894, "memory(GiB)": 21.48, "step": 9037, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.949936 }, { "epoch": 0.2936036123834584, "grad_norm": 1.016955852508545, "learning_rate": 8.463701232555395e-06, "loss": 0.03225439041852951, "memory(GiB)": 21.48, "step": 9038, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.949952 }, { "epoch": 0.2936360978462138, "grad_norm": 0.29370567202568054, "learning_rate": 8.463313824757266e-06, "loss": 0.02283749356865883, "memory(GiB)": 21.48, "step": 9039, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.94997 }, { "epoch": 0.2936685833089692, "grad_norm": 0.41975685954093933, "learning_rate": 8.462926376987884e-06, "loss": 0.031130872666835785, "memory(GiB)": 21.48, "step": 9040, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.949988 }, { "epoch": 0.29370106877172464, "grad_norm": 0.45191890001296997, "learning_rate": 8.462538889251725e-06, "loss": 0.04142517223954201, "memory(GiB)": 21.48, "step": 9041, "token_acc": 0.98, "train_speed(iter/s)": 0.950005 }, { "epoch": 0.29373355423448005, "grad_norm": 0.44475534558296204, "learning_rate": 8.462151361553258e-06, "loss": 0.03405798226594925, "memory(GiB)": 21.48, "step": 9042, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.950023 }, { "epoch": 0.29376603969723547, "grad_norm": 0.39342114329338074, "learning_rate": 8.461763793896957e-06, "loss": 0.027146730571985245, "memory(GiB)": 21.48, "step": 9043, "token_acc": 0.9891696750902527, "train_speed(iter/s)": 0.950038 }, { "epoch": 0.2937985251599909, "grad_norm": 0.3660842776298523, "learning_rate": 8.461376186287293e-06, "loss": 0.027977265417575836, "memory(GiB)": 21.48, "step": 9044, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.950057 }, { "epoch": 0.2938310106227463, "grad_norm": 0.39440250396728516, "learning_rate": 8.460988538728741e-06, "loss": 0.02974557690322399, "memory(GiB)": 21.48, "step": 9045, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.950073 }, { "epoch": 0.2938634960855017, "grad_norm": 0.41857999563217163, "learning_rate": 8.460600851225776e-06, "loss": 0.033282868564128876, "memory(GiB)": 21.48, "step": 9046, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.950093 }, { "epoch": 0.29389598154825713, "grad_norm": 0.38108375668525696, "learning_rate": 8.460213123782867e-06, "loss": 0.025827310979366302, "memory(GiB)": 21.48, "step": 9047, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.950112 }, { "epoch": 0.29392846701101255, "grad_norm": 0.505611002445221, "learning_rate": 8.459825356404498e-06, "loss": 0.03389113023877144, "memory(GiB)": 21.48, "step": 9048, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.950133 }, { "epoch": 0.29396095247376797, "grad_norm": 7.1901021003723145, "learning_rate": 8.459437549095137e-06, "loss": 0.04042089730501175, "memory(GiB)": 21.48, "step": 9049, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.950152 }, { "epoch": 0.2939934379365234, "grad_norm": 0.4342350959777832, "learning_rate": 8.459049701859263e-06, "loss": 0.02417878806591034, "memory(GiB)": 21.48, "step": 9050, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.950172 }, { "epoch": 0.2940259233992788, "grad_norm": 0.48181816935539246, "learning_rate": 8.45866181470135e-06, "loss": 0.026763860136270523, "memory(GiB)": 21.48, "step": 9051, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.95019 }, { "epoch": 0.2940584088620342, "grad_norm": 0.5626645684242249, "learning_rate": 8.458273887625878e-06, "loss": 0.04060616344213486, "memory(GiB)": 21.48, "step": 9052, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.950211 }, { "epoch": 0.29409089432478963, "grad_norm": 0.4159054458141327, "learning_rate": 8.457885920637322e-06, "loss": 0.04117078334093094, "memory(GiB)": 21.48, "step": 9053, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.950229 }, { "epoch": 0.29412337978754505, "grad_norm": 0.36126771569252014, "learning_rate": 8.457497913740158e-06, "loss": 0.035777200013399124, "memory(GiB)": 21.48, "step": 9054, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.950246 }, { "epoch": 0.29415586525030046, "grad_norm": 0.5523148775100708, "learning_rate": 8.457109866938869e-06, "loss": 0.038911428302526474, "memory(GiB)": 21.48, "step": 9055, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.950265 }, { "epoch": 0.29418835071305593, "grad_norm": 0.3849114775657654, "learning_rate": 8.456721780237929e-06, "loss": 0.02891075238585472, "memory(GiB)": 21.48, "step": 9056, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.950285 }, { "epoch": 0.29422083617581135, "grad_norm": 0.3515741527080536, "learning_rate": 8.45633365364182e-06, "loss": 0.027111290022730827, "memory(GiB)": 21.48, "step": 9057, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.950304 }, { "epoch": 0.29425332163856677, "grad_norm": 0.38316836953163147, "learning_rate": 8.455945487155019e-06, "loss": 0.029128432273864746, "memory(GiB)": 21.48, "step": 9058, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.950323 }, { "epoch": 0.2942858071013222, "grad_norm": 0.45127373933792114, "learning_rate": 8.455557280782008e-06, "loss": 0.032049186527729034, "memory(GiB)": 21.48, "step": 9059, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.950342 }, { "epoch": 0.2943182925640776, "grad_norm": 0.3420076072216034, "learning_rate": 8.455169034527265e-06, "loss": 0.02373121678829193, "memory(GiB)": 21.48, "step": 9060, "token_acc": 0.99609375, "train_speed(iter/s)": 0.950359 }, { "epoch": 0.294350778026833, "grad_norm": 0.3211947977542877, "learning_rate": 8.454780748395276e-06, "loss": 0.024409286677837372, "memory(GiB)": 21.48, "step": 9061, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.950378 }, { "epoch": 0.29438326348958843, "grad_norm": 0.6684739589691162, "learning_rate": 8.454392422390514e-06, "loss": 0.038895294070243835, "memory(GiB)": 21.48, "step": 9062, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.950398 }, { "epoch": 0.29441574895234385, "grad_norm": 0.4617081582546234, "learning_rate": 8.454004056517469e-06, "loss": 0.034553781151771545, "memory(GiB)": 21.48, "step": 9063, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.950418 }, { "epoch": 0.29444823441509926, "grad_norm": 1.2084333896636963, "learning_rate": 8.45361565078062e-06, "loss": 0.042767483741045, "memory(GiB)": 21.48, "step": 9064, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.950437 }, { "epoch": 0.2944807198778547, "grad_norm": 0.5931013822555542, "learning_rate": 8.45322720518445e-06, "loss": 0.03656836599111557, "memory(GiB)": 21.48, "step": 9065, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.950452 }, { "epoch": 0.2945132053406101, "grad_norm": 0.6031588912010193, "learning_rate": 8.45283871973344e-06, "loss": 0.037722986191511154, "memory(GiB)": 21.48, "step": 9066, "token_acc": 0.99609375, "train_speed(iter/s)": 0.950466 }, { "epoch": 0.2945456908033655, "grad_norm": 0.37358558177948, "learning_rate": 8.452450194432075e-06, "loss": 0.02418985590338707, "memory(GiB)": 21.48, "step": 9067, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.950485 }, { "epoch": 0.29457817626612093, "grad_norm": 0.4276244044303894, "learning_rate": 8.452061629284842e-06, "loss": 0.03093700110912323, "memory(GiB)": 21.48, "step": 9068, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.950505 }, { "epoch": 0.29461066172887634, "grad_norm": 0.6945497989654541, "learning_rate": 8.45167302429622e-06, "loss": 0.04076302424073219, "memory(GiB)": 21.48, "step": 9069, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.950525 }, { "epoch": 0.29464314719163176, "grad_norm": 0.6713895797729492, "learning_rate": 8.4512843794707e-06, "loss": 0.034609757363796234, "memory(GiB)": 21.48, "step": 9070, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.950546 }, { "epoch": 0.2946756326543872, "grad_norm": 0.5843331217765808, "learning_rate": 8.450895694812763e-06, "loss": 0.04223531484603882, "memory(GiB)": 21.48, "step": 9071, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.950567 }, { "epoch": 0.2947081181171426, "grad_norm": 0.455535352230072, "learning_rate": 8.450506970326898e-06, "loss": 0.03652837872505188, "memory(GiB)": 21.48, "step": 9072, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.95059 }, { "epoch": 0.294740603579898, "grad_norm": 0.45993342995643616, "learning_rate": 8.45011820601759e-06, "loss": 0.028781849890947342, "memory(GiB)": 21.48, "step": 9073, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950613 }, { "epoch": 0.2947730890426534, "grad_norm": 0.617199718952179, "learning_rate": 8.449729401889324e-06, "loss": 0.033504582941532135, "memory(GiB)": 21.48, "step": 9074, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.950637 }, { "epoch": 0.29480557450540884, "grad_norm": 0.46472251415252686, "learning_rate": 8.449340557946591e-06, "loss": 0.03244742006063461, "memory(GiB)": 21.48, "step": 9075, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.950627 }, { "epoch": 0.29483805996816426, "grad_norm": 0.4538460075855255, "learning_rate": 8.448951674193875e-06, "loss": 0.036895301192998886, "memory(GiB)": 21.48, "step": 9076, "token_acc": 0.9691119691119691, "train_speed(iter/s)": 0.950649 }, { "epoch": 0.2948705454309197, "grad_norm": 0.3793802261352539, "learning_rate": 8.448562750635667e-06, "loss": 0.031143974512815475, "memory(GiB)": 21.48, "step": 9077, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.95067 }, { "epoch": 0.2949030308936751, "grad_norm": 0.4255748689174652, "learning_rate": 8.448173787276454e-06, "loss": 0.02503034844994545, "memory(GiB)": 21.48, "step": 9078, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.950694 }, { "epoch": 0.2949355163564305, "grad_norm": 0.368950217962265, "learning_rate": 8.447784784120729e-06, "loss": 0.02564464509487152, "memory(GiB)": 21.48, "step": 9079, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.950718 }, { "epoch": 0.2949680018191859, "grad_norm": 0.38281023502349854, "learning_rate": 8.447395741172976e-06, "loss": 0.030846521258354187, "memory(GiB)": 21.48, "step": 9080, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950742 }, { "epoch": 0.29500048728194134, "grad_norm": 0.4403821527957916, "learning_rate": 8.447006658437689e-06, "loss": 0.03298083692789078, "memory(GiB)": 21.48, "step": 9081, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.950764 }, { "epoch": 0.29503297274469675, "grad_norm": 0.4929497539997101, "learning_rate": 8.446617535919356e-06, "loss": 0.03376210480928421, "memory(GiB)": 21.48, "step": 9082, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.950785 }, { "epoch": 0.29506545820745217, "grad_norm": 0.42154213786125183, "learning_rate": 8.44622837362247e-06, "loss": 0.030936628580093384, "memory(GiB)": 21.48, "step": 9083, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.950807 }, { "epoch": 0.2950979436702076, "grad_norm": 0.4979691505432129, "learning_rate": 8.44583917155152e-06, "loss": 0.0383528433740139, "memory(GiB)": 21.48, "step": 9084, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.950832 }, { "epoch": 0.295130429132963, "grad_norm": 0.501338541507721, "learning_rate": 8.445449929711001e-06, "loss": 0.0401730015873909, "memory(GiB)": 21.48, "step": 9085, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.950854 }, { "epoch": 0.2951629145957184, "grad_norm": 0.7057408094406128, "learning_rate": 8.445060648105405e-06, "loss": 0.04065199941396713, "memory(GiB)": 21.48, "step": 9086, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.95087 }, { "epoch": 0.29519540005847383, "grad_norm": 0.43370795249938965, "learning_rate": 8.44467132673922e-06, "loss": 0.03365705907344818, "memory(GiB)": 21.48, "step": 9087, "token_acc": 0.984, "train_speed(iter/s)": 0.950887 }, { "epoch": 0.29522788552122925, "grad_norm": 0.9477922916412354, "learning_rate": 8.444281965616948e-06, "loss": 0.02678767219185829, "memory(GiB)": 21.48, "step": 9088, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.950904 }, { "epoch": 0.29526037098398467, "grad_norm": 0.362925261259079, "learning_rate": 8.443892564743075e-06, "loss": 0.023933805525302887, "memory(GiB)": 21.48, "step": 9089, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.950919 }, { "epoch": 0.2952928564467401, "grad_norm": 0.3421025276184082, "learning_rate": 8.443503124122097e-06, "loss": 0.03199245035648346, "memory(GiB)": 21.48, "step": 9090, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.950937 }, { "epoch": 0.2953253419094955, "grad_norm": 0.6212390661239624, "learning_rate": 8.443113643758512e-06, "loss": 0.03877861425280571, "memory(GiB)": 21.48, "step": 9091, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.950955 }, { "epoch": 0.2953578273722509, "grad_norm": 0.47702518105506897, "learning_rate": 8.44272412365681e-06, "loss": 0.026248939335346222, "memory(GiB)": 21.48, "step": 9092, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.950974 }, { "epoch": 0.29539031283500633, "grad_norm": 0.4214008152484894, "learning_rate": 8.442334563821493e-06, "loss": 0.030846698209643364, "memory(GiB)": 21.48, "step": 9093, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.950991 }, { "epoch": 0.29542279829776175, "grad_norm": 0.5237084627151489, "learning_rate": 8.441944964257052e-06, "loss": 0.04070109501481056, "memory(GiB)": 21.48, "step": 9094, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.95101 }, { "epoch": 0.29545528376051716, "grad_norm": 0.5045416951179504, "learning_rate": 8.441555324967982e-06, "loss": 0.03620850294828415, "memory(GiB)": 21.48, "step": 9095, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.951026 }, { "epoch": 0.2954877692232726, "grad_norm": 0.41848641633987427, "learning_rate": 8.441165645958787e-06, "loss": 0.028595803305506706, "memory(GiB)": 21.48, "step": 9096, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.951043 }, { "epoch": 0.295520254686028, "grad_norm": 0.611138105392456, "learning_rate": 8.440775927233957e-06, "loss": 0.04023020714521408, "memory(GiB)": 21.48, "step": 9097, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.951061 }, { "epoch": 0.2955527401487834, "grad_norm": 0.7404148578643799, "learning_rate": 8.440386168797996e-06, "loss": 0.037821147590875626, "memory(GiB)": 21.48, "step": 9098, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.951078 }, { "epoch": 0.2955852256115388, "grad_norm": 0.493021696805954, "learning_rate": 8.439996370655398e-06, "loss": 0.03561435639858246, "memory(GiB)": 21.48, "step": 9099, "token_acc": 0.9711191335740073, "train_speed(iter/s)": 0.951094 }, { "epoch": 0.29561771107429424, "grad_norm": 0.5462107062339783, "learning_rate": 8.439606532810662e-06, "loss": 0.044659264385700226, "memory(GiB)": 21.48, "step": 9100, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951111 }, { "epoch": 0.29565019653704966, "grad_norm": 0.7892366647720337, "learning_rate": 8.439216655268292e-06, "loss": 0.04474293068051338, "memory(GiB)": 21.48, "step": 9101, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.951128 }, { "epoch": 0.2956826819998051, "grad_norm": 0.4471590518951416, "learning_rate": 8.43882673803278e-06, "loss": 0.03411640226840973, "memory(GiB)": 21.48, "step": 9102, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.951146 }, { "epoch": 0.2957151674625605, "grad_norm": 0.4753473401069641, "learning_rate": 8.438436781108633e-06, "loss": 0.03787166625261307, "memory(GiB)": 21.48, "step": 9103, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951161 }, { "epoch": 0.2957476529253159, "grad_norm": 0.4117435812950134, "learning_rate": 8.438046784500349e-06, "loss": 0.03232855349779129, "memory(GiB)": 21.48, "step": 9104, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95116 }, { "epoch": 0.2957801383880713, "grad_norm": 0.3237588703632355, "learning_rate": 8.437656748212429e-06, "loss": 0.02637591026723385, "memory(GiB)": 21.48, "step": 9105, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.951181 }, { "epoch": 0.29581262385082674, "grad_norm": 0.4008941352367401, "learning_rate": 8.437266672249374e-06, "loss": 0.024095088243484497, "memory(GiB)": 21.48, "step": 9106, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951202 }, { "epoch": 0.29584510931358216, "grad_norm": 0.763828456401825, "learning_rate": 8.436876556615687e-06, "loss": 0.04496271535754204, "memory(GiB)": 21.48, "step": 9107, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.951224 }, { "epoch": 0.29587759477633757, "grad_norm": 0.4451530873775482, "learning_rate": 8.436486401315868e-06, "loss": 0.030058447271585464, "memory(GiB)": 21.48, "step": 9108, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.951247 }, { "epoch": 0.295910080239093, "grad_norm": 0.39794430136680603, "learning_rate": 8.436096206354425e-06, "loss": 0.02646631933748722, "memory(GiB)": 21.48, "step": 9109, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.951268 }, { "epoch": 0.2959425657018484, "grad_norm": 0.3742446005344391, "learning_rate": 8.435705971735857e-06, "loss": 0.03195173293352127, "memory(GiB)": 21.48, "step": 9110, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.951291 }, { "epoch": 0.2959750511646038, "grad_norm": 0.40801727771759033, "learning_rate": 8.43531569746467e-06, "loss": 0.039300985634326935, "memory(GiB)": 21.48, "step": 9111, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.951309 }, { "epoch": 0.29600753662735924, "grad_norm": 0.5955540537834167, "learning_rate": 8.434925383545368e-06, "loss": 0.03698121756315231, "memory(GiB)": 21.48, "step": 9112, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.951327 }, { "epoch": 0.29604002209011465, "grad_norm": 1.911696195602417, "learning_rate": 8.434535029982452e-06, "loss": 0.03884267807006836, "memory(GiB)": 21.48, "step": 9113, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.951344 }, { "epoch": 0.29607250755287007, "grad_norm": 0.312193900346756, "learning_rate": 8.434144636780433e-06, "loss": 0.023257063701748848, "memory(GiB)": 21.48, "step": 9114, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.951362 }, { "epoch": 0.2961049930156255, "grad_norm": 0.39739277958869934, "learning_rate": 8.433754203943814e-06, "loss": 0.02680944837629795, "memory(GiB)": 21.48, "step": 9115, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95138 }, { "epoch": 0.2961374784783809, "grad_norm": 0.6002552509307861, "learning_rate": 8.4333637314771e-06, "loss": 0.03135542571544647, "memory(GiB)": 21.48, "step": 9116, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.951397 }, { "epoch": 0.2961699639411363, "grad_norm": 0.4510325789451599, "learning_rate": 8.4329732193848e-06, "loss": 0.03780128434300423, "memory(GiB)": 21.48, "step": 9117, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.951415 }, { "epoch": 0.29620244940389173, "grad_norm": 0.38552892208099365, "learning_rate": 8.432582667671419e-06, "loss": 0.027412207797169685, "memory(GiB)": 21.48, "step": 9118, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.951432 }, { "epoch": 0.29623493486664715, "grad_norm": 0.43310871720314026, "learning_rate": 8.432192076341465e-06, "loss": 0.03214365243911743, "memory(GiB)": 21.48, "step": 9119, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95145 }, { "epoch": 0.2962674203294026, "grad_norm": 0.498024046421051, "learning_rate": 8.431801445399445e-06, "loss": 0.027514047920703888, "memory(GiB)": 21.48, "step": 9120, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.951466 }, { "epoch": 0.29629990579215804, "grad_norm": 0.5912401080131531, "learning_rate": 8.431410774849871e-06, "loss": 0.037247300148010254, "memory(GiB)": 21.48, "step": 9121, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.951483 }, { "epoch": 0.29633239125491345, "grad_norm": 0.5211476683616638, "learning_rate": 8.431020064697248e-06, "loss": 0.029119841754436493, "memory(GiB)": 21.48, "step": 9122, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.9515 }, { "epoch": 0.29636487671766887, "grad_norm": 0.46024227142333984, "learning_rate": 8.430629314946089e-06, "loss": 0.027209095656871796, "memory(GiB)": 21.48, "step": 9123, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.951518 }, { "epoch": 0.2963973621804243, "grad_norm": 0.47753193974494934, "learning_rate": 8.4302385256009e-06, "loss": 0.03300967067480087, "memory(GiB)": 21.48, "step": 9124, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.951536 }, { "epoch": 0.2964298476431797, "grad_norm": 0.34134432673454285, "learning_rate": 8.429847696666192e-06, "loss": 0.034987740218639374, "memory(GiB)": 21.48, "step": 9125, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.951555 }, { "epoch": 0.2964623331059351, "grad_norm": 0.3541111648082733, "learning_rate": 8.429456828146477e-06, "loss": 0.026251547038555145, "memory(GiB)": 21.48, "step": 9126, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.951573 }, { "epoch": 0.29649481856869053, "grad_norm": 0.3311349153518677, "learning_rate": 8.429065920046267e-06, "loss": 0.021566228941082954, "memory(GiB)": 21.48, "step": 9127, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.951591 }, { "epoch": 0.29652730403144595, "grad_norm": 0.38661321997642517, "learning_rate": 8.42867497237007e-06, "loss": 0.03270823508501053, "memory(GiB)": 21.48, "step": 9128, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.951609 }, { "epoch": 0.29655978949420136, "grad_norm": 0.4075375199317932, "learning_rate": 8.428283985122401e-06, "loss": 0.030235035344958305, "memory(GiB)": 21.48, "step": 9129, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.951626 }, { "epoch": 0.2965922749569568, "grad_norm": 0.34163954854011536, "learning_rate": 8.427892958307773e-06, "loss": 0.029128391295671463, "memory(GiB)": 21.48, "step": 9130, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.951639 }, { "epoch": 0.2966247604197122, "grad_norm": 0.3851068317890167, "learning_rate": 8.427501891930696e-06, "loss": 0.021958578377962112, "memory(GiB)": 21.48, "step": 9131, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.951656 }, { "epoch": 0.2966572458824676, "grad_norm": 0.392765611410141, "learning_rate": 8.427110785995687e-06, "loss": 0.027951708063483238, "memory(GiB)": 21.48, "step": 9132, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.951673 }, { "epoch": 0.29668973134522303, "grad_norm": 0.46932777762413025, "learning_rate": 8.426719640507255e-06, "loss": 0.03174000605940819, "memory(GiB)": 21.48, "step": 9133, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95169 }, { "epoch": 0.29672221680797845, "grad_norm": 0.5518534779548645, "learning_rate": 8.426328455469922e-06, "loss": 0.03700047731399536, "memory(GiB)": 21.48, "step": 9134, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.951709 }, { "epoch": 0.29675470227073386, "grad_norm": 0.3881947696208954, "learning_rate": 8.425937230888194e-06, "loss": 0.02826714515686035, "memory(GiB)": 21.48, "step": 9135, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.951727 }, { "epoch": 0.2967871877334893, "grad_norm": 0.5911363363265991, "learning_rate": 8.425545966766591e-06, "loss": 0.04510920122265816, "memory(GiB)": 21.48, "step": 9136, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.951746 }, { "epoch": 0.2968196731962447, "grad_norm": 0.6746933460235596, "learning_rate": 8.42515466310963e-06, "loss": 0.039464958012104034, "memory(GiB)": 21.48, "step": 9137, "token_acc": 0.992, "train_speed(iter/s)": 0.951768 }, { "epoch": 0.2968521586590001, "grad_norm": 0.5708528161048889, "learning_rate": 8.424763319921825e-06, "loss": 0.033833086490631104, "memory(GiB)": 21.48, "step": 9138, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951791 }, { "epoch": 0.2968846441217555, "grad_norm": 0.3522757291793823, "learning_rate": 8.424371937207693e-06, "loss": 0.030285555869340897, "memory(GiB)": 21.48, "step": 9139, "token_acc": 1.0, "train_speed(iter/s)": 0.951813 }, { "epoch": 0.29691712958451094, "grad_norm": 0.3104064166545868, "learning_rate": 8.42398051497175e-06, "loss": 0.028399605304002762, "memory(GiB)": 21.48, "step": 9140, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.951836 }, { "epoch": 0.29694961504726636, "grad_norm": 0.624518871307373, "learning_rate": 8.423589053218514e-06, "loss": 0.04006190598011017, "memory(GiB)": 21.48, "step": 9141, "token_acc": 0.9862068965517241, "train_speed(iter/s)": 0.951858 }, { "epoch": 0.2969821005100218, "grad_norm": 0.4684261977672577, "learning_rate": 8.423197551952506e-06, "loss": 0.031962718814611435, "memory(GiB)": 21.48, "step": 9142, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95188 }, { "epoch": 0.2970145859727772, "grad_norm": 0.4168684482574463, "learning_rate": 8.42280601117824e-06, "loss": 0.03194175288081169, "memory(GiB)": 21.48, "step": 9143, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.951902 }, { "epoch": 0.2970470714355326, "grad_norm": 0.48386865854263306, "learning_rate": 8.422414430900237e-06, "loss": 0.031024489551782608, "memory(GiB)": 21.48, "step": 9144, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.951924 }, { "epoch": 0.297079556898288, "grad_norm": 0.5583562254905701, "learning_rate": 8.422022811123018e-06, "loss": 0.025459861382842064, "memory(GiB)": 21.48, "step": 9145, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.951942 }, { "epoch": 0.29711204236104344, "grad_norm": 0.5219333171844482, "learning_rate": 8.4216311518511e-06, "loss": 0.026659730821847916, "memory(GiB)": 21.48, "step": 9146, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.95196 }, { "epoch": 0.29714452782379885, "grad_norm": 0.5260218381881714, "learning_rate": 8.421239453089003e-06, "loss": 0.03191186115145683, "memory(GiB)": 21.48, "step": 9147, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.951974 }, { "epoch": 0.29717701328655427, "grad_norm": 0.403539776802063, "learning_rate": 8.420847714841251e-06, "loss": 0.027565617114305496, "memory(GiB)": 21.48, "step": 9148, "token_acc": 0.984375, "train_speed(iter/s)": 0.951993 }, { "epoch": 0.2972094987493097, "grad_norm": 1.286867380142212, "learning_rate": 8.420455937112362e-06, "loss": 0.03694351762533188, "memory(GiB)": 21.48, "step": 9149, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.952008 }, { "epoch": 0.2972419842120651, "grad_norm": 1.1665271520614624, "learning_rate": 8.420064119906859e-06, "loss": 0.04366365075111389, "memory(GiB)": 21.48, "step": 9150, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.952025 }, { "epoch": 0.2972744696748205, "grad_norm": 0.528325617313385, "learning_rate": 8.419672263229265e-06, "loss": 0.04023626446723938, "memory(GiB)": 21.48, "step": 9151, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.952041 }, { "epoch": 0.29730695513757593, "grad_norm": 0.34040582180023193, "learning_rate": 8.419280367084099e-06, "loss": 0.032488785684108734, "memory(GiB)": 21.48, "step": 9152, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.952056 }, { "epoch": 0.29733944060033135, "grad_norm": 0.4104556441307068, "learning_rate": 8.418888431475889e-06, "loss": 0.028655903413891792, "memory(GiB)": 21.48, "step": 9153, "token_acc": 1.0, "train_speed(iter/s)": 0.952073 }, { "epoch": 0.29737192606308677, "grad_norm": 0.461765319108963, "learning_rate": 8.418496456409155e-06, "loss": 0.02934155985713005, "memory(GiB)": 21.48, "step": 9154, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.952091 }, { "epoch": 0.2974044115258422, "grad_norm": 0.2685723602771759, "learning_rate": 8.418104441888421e-06, "loss": 0.025546954944729805, "memory(GiB)": 21.48, "step": 9155, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.952107 }, { "epoch": 0.2974368969885976, "grad_norm": 0.46301063895225525, "learning_rate": 8.417712387918215e-06, "loss": 0.03413468226790428, "memory(GiB)": 21.48, "step": 9156, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952124 }, { "epoch": 0.297469382451353, "grad_norm": 0.4425428807735443, "learning_rate": 8.417320294503057e-06, "loss": 0.034322574734687805, "memory(GiB)": 21.48, "step": 9157, "token_acc": 0.9792387543252595, "train_speed(iter/s)": 0.952138 }, { "epoch": 0.29750186791410843, "grad_norm": 0.3634589910507202, "learning_rate": 8.416928161647473e-06, "loss": 0.03105059266090393, "memory(GiB)": 21.48, "step": 9158, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.952154 }, { "epoch": 0.29753435337686385, "grad_norm": 0.3739815056324005, "learning_rate": 8.416535989355994e-06, "loss": 0.025872547179460526, "memory(GiB)": 21.48, "step": 9159, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.95217 }, { "epoch": 0.29756683883961926, "grad_norm": 0.41254085302352905, "learning_rate": 8.416143777633139e-06, "loss": 0.03955899924039841, "memory(GiB)": 21.48, "step": 9160, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.952188 }, { "epoch": 0.2975993243023747, "grad_norm": 0.5954930186271667, "learning_rate": 8.41575152648344e-06, "loss": 0.04354724660515785, "memory(GiB)": 21.48, "step": 9161, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.952205 }, { "epoch": 0.2976318097651301, "grad_norm": 0.5186598896980286, "learning_rate": 8.41535923591142e-06, "loss": 0.03691844269633293, "memory(GiB)": 21.48, "step": 9162, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.952221 }, { "epoch": 0.2976642952278855, "grad_norm": 1.880242943763733, "learning_rate": 8.41496690592161e-06, "loss": 0.0515238419175148, "memory(GiB)": 21.48, "step": 9163, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.95224 }, { "epoch": 0.2976967806906409, "grad_norm": 0.3481762707233429, "learning_rate": 8.414574536518536e-06, "loss": 0.026334360241889954, "memory(GiB)": 21.48, "step": 9164, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.952259 }, { "epoch": 0.29772926615339634, "grad_norm": 0.35160303115844727, "learning_rate": 8.414182127706728e-06, "loss": 0.031662747263908386, "memory(GiB)": 21.48, "step": 9165, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952281 }, { "epoch": 0.29776175161615176, "grad_norm": 0.4273079037666321, "learning_rate": 8.413789679490713e-06, "loss": 0.032097235321998596, "memory(GiB)": 21.48, "step": 9166, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.952303 }, { "epoch": 0.2977942370789072, "grad_norm": 0.3725849986076355, "learning_rate": 8.413397191875021e-06, "loss": 0.030560143291950226, "memory(GiB)": 21.48, "step": 9167, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.952324 }, { "epoch": 0.2978267225416626, "grad_norm": 0.9517709016799927, "learning_rate": 8.413004664864183e-06, "loss": 0.033252570778131485, "memory(GiB)": 21.48, "step": 9168, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.952347 }, { "epoch": 0.297859208004418, "grad_norm": 0.4183047115802765, "learning_rate": 8.41261209846273e-06, "loss": 0.029390007257461548, "memory(GiB)": 21.48, "step": 9169, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95237 }, { "epoch": 0.2978916934671734, "grad_norm": 0.34768202900886536, "learning_rate": 8.412219492675188e-06, "loss": 0.02958131581544876, "memory(GiB)": 21.48, "step": 9170, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.952391 }, { "epoch": 0.29792417892992884, "grad_norm": 0.24234601855278015, "learning_rate": 8.411826847506092e-06, "loss": 0.022094454616308212, "memory(GiB)": 21.48, "step": 9171, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952413 }, { "epoch": 0.29795666439268426, "grad_norm": 0.6742632985115051, "learning_rate": 8.411434162959974e-06, "loss": 0.028579657897353172, "memory(GiB)": 21.48, "step": 9172, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.952434 }, { "epoch": 0.2979891498554397, "grad_norm": 0.4047979712486267, "learning_rate": 8.411041439041366e-06, "loss": 0.027533821761608124, "memory(GiB)": 21.48, "step": 9173, "token_acc": 0.984, "train_speed(iter/s)": 0.952455 }, { "epoch": 0.2980216353181951, "grad_norm": 0.4804137051105499, "learning_rate": 8.4106486757548e-06, "loss": 0.03242801874876022, "memory(GiB)": 21.48, "step": 9174, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.952478 }, { "epoch": 0.2980541207809505, "grad_norm": 0.8413389325141907, "learning_rate": 8.410255873104806e-06, "loss": 0.026307087391614914, "memory(GiB)": 21.48, "step": 9175, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.952498 }, { "epoch": 0.2980866062437059, "grad_norm": 0.5959446430206299, "learning_rate": 8.409863031095922e-06, "loss": 0.0421457476913929, "memory(GiB)": 21.48, "step": 9176, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.952516 }, { "epoch": 0.29811909170646134, "grad_norm": 0.5210638642311096, "learning_rate": 8.409470149732682e-06, "loss": 0.03148917108774185, "memory(GiB)": 21.48, "step": 9177, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.952533 }, { "epoch": 0.29815157716921675, "grad_norm": 0.3711608350276947, "learning_rate": 8.409077229019616e-06, "loss": 0.0271576177328825, "memory(GiB)": 21.48, "step": 9178, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.952551 }, { "epoch": 0.29818406263197217, "grad_norm": 0.5086630582809448, "learning_rate": 8.408684268961261e-06, "loss": 0.026733485981822014, "memory(GiB)": 21.48, "step": 9179, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.952567 }, { "epoch": 0.2982165480947276, "grad_norm": 0.43620914220809937, "learning_rate": 8.408291269562153e-06, "loss": 0.028896164149045944, "memory(GiB)": 21.48, "step": 9180, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.952584 }, { "epoch": 0.298249033557483, "grad_norm": 0.5195807814598083, "learning_rate": 8.407898230826828e-06, "loss": 0.036939989775419235, "memory(GiB)": 21.48, "step": 9181, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.952602 }, { "epoch": 0.2982815190202384, "grad_norm": 0.37601780891418457, "learning_rate": 8.407505152759821e-06, "loss": 0.03291868790984154, "memory(GiB)": 21.48, "step": 9182, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95262 }, { "epoch": 0.29831400448299383, "grad_norm": 0.6875269412994385, "learning_rate": 8.407112035365668e-06, "loss": 0.028592513874173164, "memory(GiB)": 21.48, "step": 9183, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.952637 }, { "epoch": 0.2983464899457493, "grad_norm": 0.47220197319984436, "learning_rate": 8.406718878648909e-06, "loss": 0.029825452715158463, "memory(GiB)": 21.48, "step": 9184, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.952654 }, { "epoch": 0.2983789754085047, "grad_norm": 0.5156891942024231, "learning_rate": 8.406325682614079e-06, "loss": 0.025733206421136856, "memory(GiB)": 21.48, "step": 9185, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.952671 }, { "epoch": 0.29841146087126014, "grad_norm": 0.571786642074585, "learning_rate": 8.405932447265717e-06, "loss": 0.03874235600233078, "memory(GiB)": 21.48, "step": 9186, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.952687 }, { "epoch": 0.29844394633401555, "grad_norm": 0.4603491425514221, "learning_rate": 8.405539172608362e-06, "loss": 0.030180063098669052, "memory(GiB)": 21.48, "step": 9187, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.952703 }, { "epoch": 0.29847643179677097, "grad_norm": 0.5362608432769775, "learning_rate": 8.40514585864655e-06, "loss": 0.02537725307047367, "memory(GiB)": 21.48, "step": 9188, "token_acc": 1.0, "train_speed(iter/s)": 0.952721 }, { "epoch": 0.2985089172595264, "grad_norm": 0.7500089406967163, "learning_rate": 8.404752505384825e-06, "loss": 0.03565669804811478, "memory(GiB)": 21.48, "step": 9189, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.952738 }, { "epoch": 0.2985414027222818, "grad_norm": 0.6612399816513062, "learning_rate": 8.404359112827724e-06, "loss": 0.034988995641469955, "memory(GiB)": 21.48, "step": 9190, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.952756 }, { "epoch": 0.2985738881850372, "grad_norm": 0.4860261082649231, "learning_rate": 8.403965680979787e-06, "loss": 0.02897864580154419, "memory(GiB)": 21.48, "step": 9191, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.952774 }, { "epoch": 0.29860637364779263, "grad_norm": 0.4249024987220764, "learning_rate": 8.403572209845555e-06, "loss": 0.03592279553413391, "memory(GiB)": 21.48, "step": 9192, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.952792 }, { "epoch": 0.29863885911054805, "grad_norm": 0.34793996810913086, "learning_rate": 8.403178699429571e-06, "loss": 0.025331348180770874, "memory(GiB)": 21.48, "step": 9193, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.952808 }, { "epoch": 0.29867134457330347, "grad_norm": 0.6284539699554443, "learning_rate": 8.402785149736373e-06, "loss": 0.04598185792565346, "memory(GiB)": 21.48, "step": 9194, "token_acc": 0.9764309764309764, "train_speed(iter/s)": 0.952823 }, { "epoch": 0.2987038300360589, "grad_norm": 0.5863491296768188, "learning_rate": 8.402391560770507e-06, "loss": 0.028049279004335403, "memory(GiB)": 21.48, "step": 9195, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.952839 }, { "epoch": 0.2987363154988143, "grad_norm": 0.4752229154109955, "learning_rate": 8.401997932536512e-06, "loss": 0.022034376859664917, "memory(GiB)": 21.48, "step": 9196, "token_acc": 1.0, "train_speed(iter/s)": 0.952857 }, { "epoch": 0.2987688009615697, "grad_norm": 0.5754375457763672, "learning_rate": 8.401604265038935e-06, "loss": 0.028724387288093567, "memory(GiB)": 21.48, "step": 9197, "token_acc": 0.98828125, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.29880128642432513, "grad_norm": 0.4988781809806824, "learning_rate": 8.401210558282315e-06, "loss": 0.032007478177547455, "memory(GiB)": 21.48, "step": 9198, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.952892 }, { "epoch": 0.29883377188708055, "grad_norm": 0.4050787687301636, "learning_rate": 8.400816812271201e-06, "loss": 0.030442843213677406, "memory(GiB)": 21.48, "step": 9199, "token_acc": 0.985663082437276, "train_speed(iter/s)": 0.952913 }, { "epoch": 0.29886625734983596, "grad_norm": 0.5408922433853149, "learning_rate": 8.400423027010132e-06, "loss": 0.028129026293754578, "memory(GiB)": 21.48, "step": 9200, "token_acc": 1.0, "train_speed(iter/s)": 0.952936 }, { "epoch": 0.2988987428125914, "grad_norm": 0.4636714458465576, "learning_rate": 8.400029202503657e-06, "loss": 0.03441125899553299, "memory(GiB)": 21.48, "step": 9201, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.952958 }, { "epoch": 0.2989312282753468, "grad_norm": 0.3774900734424591, "learning_rate": 8.39963533875632e-06, "loss": 0.029106853529810905, "memory(GiB)": 21.48, "step": 9202, "token_acc": 0.9921875, "train_speed(iter/s)": 0.952981 }, { "epoch": 0.2989637137381022, "grad_norm": 0.38555195927619934, "learning_rate": 8.399241435772666e-06, "loss": 0.026802711188793182, "memory(GiB)": 21.48, "step": 9203, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.953003 }, { "epoch": 0.2989961992008576, "grad_norm": 0.3392806649208069, "learning_rate": 8.39884749355724e-06, "loss": 0.029464706778526306, "memory(GiB)": 21.48, "step": 9204, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953021 }, { "epoch": 0.29902868466361304, "grad_norm": 0.40461134910583496, "learning_rate": 8.398453512114591e-06, "loss": 0.028958803042769432, "memory(GiB)": 21.48, "step": 9205, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.953038 }, { "epoch": 0.29906117012636846, "grad_norm": 0.41805416345596313, "learning_rate": 8.398059491449264e-06, "loss": 0.032964929938316345, "memory(GiB)": 21.48, "step": 9206, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.953056 }, { "epoch": 0.2990936555891239, "grad_norm": 0.4717754125595093, "learning_rate": 8.39766543156581e-06, "loss": 0.03333951532840729, "memory(GiB)": 21.48, "step": 9207, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.953071 }, { "epoch": 0.2991261410518793, "grad_norm": 0.44321420788764954, "learning_rate": 8.397271332468772e-06, "loss": 0.03520085662603378, "memory(GiB)": 21.48, "step": 9208, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.953087 }, { "epoch": 0.2991586265146347, "grad_norm": 0.49847882986068726, "learning_rate": 8.396877194162702e-06, "loss": 0.0403059720993042, "memory(GiB)": 21.48, "step": 9209, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.953104 }, { "epoch": 0.2991911119773901, "grad_norm": 0.26676616072654724, "learning_rate": 8.396483016652148e-06, "loss": 0.023362094536423683, "memory(GiB)": 21.48, "step": 9210, "token_acc": 1.0, "train_speed(iter/s)": 0.953123 }, { "epoch": 0.29922359744014554, "grad_norm": 0.5146535038948059, "learning_rate": 8.396088799941659e-06, "loss": 0.03774639591574669, "memory(GiB)": 21.48, "step": 9211, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95314 }, { "epoch": 0.29925608290290096, "grad_norm": 0.41623353958129883, "learning_rate": 8.395694544035786e-06, "loss": 0.02662501484155655, "memory(GiB)": 21.48, "step": 9212, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.953157 }, { "epoch": 0.29928856836565637, "grad_norm": 0.48176079988479614, "learning_rate": 8.395300248939077e-06, "loss": 0.03188702464103699, "memory(GiB)": 21.48, "step": 9213, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953175 }, { "epoch": 0.2993210538284118, "grad_norm": 0.404985249042511, "learning_rate": 8.394905914656087e-06, "loss": 0.028634052723646164, "memory(GiB)": 21.48, "step": 9214, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.953193 }, { "epoch": 0.2993535392911672, "grad_norm": 0.29838815331459045, "learning_rate": 8.39451154119136e-06, "loss": 0.03001304343342781, "memory(GiB)": 21.48, "step": 9215, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95321 }, { "epoch": 0.2993860247539226, "grad_norm": 0.6709513664245605, "learning_rate": 8.394117128549455e-06, "loss": 0.034629203379154205, "memory(GiB)": 21.48, "step": 9216, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.953225 }, { "epoch": 0.29941851021667804, "grad_norm": 0.39020922780036926, "learning_rate": 8.393722676734919e-06, "loss": 0.03501586988568306, "memory(GiB)": 21.48, "step": 9217, "token_acc": 1.0, "train_speed(iter/s)": 0.95324 }, { "epoch": 0.29945099567943345, "grad_norm": 0.5455659627914429, "learning_rate": 8.393328185752307e-06, "loss": 0.03891846537590027, "memory(GiB)": 21.48, "step": 9218, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.953258 }, { "epoch": 0.29948348114218887, "grad_norm": 0.6615259051322937, "learning_rate": 8.39293365560617e-06, "loss": 0.03805222362279892, "memory(GiB)": 21.48, "step": 9219, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.953276 }, { "epoch": 0.2995159666049443, "grad_norm": 0.5916197896003723, "learning_rate": 8.392539086301063e-06, "loss": 0.03380968049168587, "memory(GiB)": 21.48, "step": 9220, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.953294 }, { "epoch": 0.2995484520676997, "grad_norm": 0.3609217405319214, "learning_rate": 8.392144477841542e-06, "loss": 0.03067972883582115, "memory(GiB)": 21.48, "step": 9221, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.953312 }, { "epoch": 0.2995809375304551, "grad_norm": 0.5154979825019836, "learning_rate": 8.391749830232156e-06, "loss": 0.04106006771326065, "memory(GiB)": 21.48, "step": 9222, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953327 }, { "epoch": 0.29961342299321053, "grad_norm": 0.4840318560600281, "learning_rate": 8.391355143477465e-06, "loss": 0.03069670870900154, "memory(GiB)": 21.48, "step": 9223, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.953345 }, { "epoch": 0.29964590845596595, "grad_norm": 0.5254478454589844, "learning_rate": 8.390960417582019e-06, "loss": 0.03882443159818649, "memory(GiB)": 21.48, "step": 9224, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.953366 }, { "epoch": 0.29967839391872136, "grad_norm": 0.49145010113716125, "learning_rate": 8.390565652550378e-06, "loss": 0.03855983912944794, "memory(GiB)": 21.48, "step": 9225, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.953389 }, { "epoch": 0.2997108793814768, "grad_norm": 0.2587546706199646, "learning_rate": 8.390170848387096e-06, "loss": 0.026119258254766464, "memory(GiB)": 21.48, "step": 9226, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.953412 }, { "epoch": 0.2997433648442322, "grad_norm": 0.8374677300453186, "learning_rate": 8.389776005096731e-06, "loss": 0.03491941839456558, "memory(GiB)": 21.48, "step": 9227, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.953434 }, { "epoch": 0.2997758503069876, "grad_norm": 0.41708192229270935, "learning_rate": 8.389381122683838e-06, "loss": 0.02318263240158558, "memory(GiB)": 21.48, "step": 9228, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.953455 }, { "epoch": 0.29980833576974303, "grad_norm": 0.8772980570793152, "learning_rate": 8.388986201152975e-06, "loss": 0.04349145665764809, "memory(GiB)": 21.48, "step": 9229, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953476 }, { "epoch": 0.29984082123249844, "grad_norm": 0.3040199875831604, "learning_rate": 8.388591240508702e-06, "loss": 0.021472640335559845, "memory(GiB)": 21.48, "step": 9230, "token_acc": 1.0, "train_speed(iter/s)": 0.953498 }, { "epoch": 0.29987330669525386, "grad_norm": 0.3505140244960785, "learning_rate": 8.388196240755576e-06, "loss": 0.027257874608039856, "memory(GiB)": 21.48, "step": 9231, "token_acc": 0.9966555183946488, "train_speed(iter/s)": 0.953521 }, { "epoch": 0.2999057921580093, "grad_norm": 0.46640363335609436, "learning_rate": 8.387801201898154e-06, "loss": 0.036428142338991165, "memory(GiB)": 21.48, "step": 9232, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.953543 }, { "epoch": 0.2999382776207647, "grad_norm": 0.4606074094772339, "learning_rate": 8.387406123940998e-06, "loss": 0.031911324709653854, "memory(GiB)": 21.48, "step": 9233, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953564 }, { "epoch": 0.2999707630835201, "grad_norm": 0.4292941689491272, "learning_rate": 8.387011006888666e-06, "loss": 0.03762989491224289, "memory(GiB)": 21.48, "step": 9234, "token_acc": 0.9808429118773946, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.3000032485462755, "grad_norm": 0.6178690195083618, "learning_rate": 8.386615850745721e-06, "loss": 0.04206050559878349, "memory(GiB)": 21.48, "step": 9235, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.953558 }, { "epoch": 0.30003573400903094, "grad_norm": 0.5396489500999451, "learning_rate": 8.386220655516718e-06, "loss": 0.03134866803884506, "memory(GiB)": 21.48, "step": 9236, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.953578 }, { "epoch": 0.30006821947178636, "grad_norm": 0.35462307929992676, "learning_rate": 8.385825421206225e-06, "loss": 0.032136693596839905, "memory(GiB)": 21.48, "step": 9237, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.953597 }, { "epoch": 0.3001007049345418, "grad_norm": 0.4975586533546448, "learning_rate": 8.385430147818798e-06, "loss": 0.034592531621456146, "memory(GiB)": 21.48, "step": 9238, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.953615 }, { "epoch": 0.3001331903972972, "grad_norm": 0.4307470917701721, "learning_rate": 8.385034835359004e-06, "loss": 0.02841581590473652, "memory(GiB)": 21.48, "step": 9239, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.953631 }, { "epoch": 0.3001656758600526, "grad_norm": 0.2762768268585205, "learning_rate": 8.3846394838314e-06, "loss": 0.027490269392728806, "memory(GiB)": 21.48, "step": 9240, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.953647 }, { "epoch": 0.300198161322808, "grad_norm": 0.7606258392333984, "learning_rate": 8.38424409324055e-06, "loss": 0.0397380106151104, "memory(GiB)": 21.48, "step": 9241, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953663 }, { "epoch": 0.30023064678556344, "grad_norm": 2.4775099754333496, "learning_rate": 8.38384866359102e-06, "loss": 0.045525506138801575, "memory(GiB)": 21.48, "step": 9242, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.953678 }, { "epoch": 0.30026313224831885, "grad_norm": 0.5289356708526611, "learning_rate": 8.383453194887375e-06, "loss": 0.03464249521493912, "memory(GiB)": 21.48, "step": 9243, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.953695 }, { "epoch": 0.30029561771107427, "grad_norm": 0.3696177005767822, "learning_rate": 8.383057687134176e-06, "loss": 0.0362430065870285, "memory(GiB)": 21.48, "step": 9244, "token_acc": 0.986159169550173, "train_speed(iter/s)": 0.953713 }, { "epoch": 0.3003281031738297, "grad_norm": 0.5661122798919678, "learning_rate": 8.382662140335986e-06, "loss": 0.03376961871981621, "memory(GiB)": 21.48, "step": 9245, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953729 }, { "epoch": 0.3003605886365851, "grad_norm": 0.46499717235565186, "learning_rate": 8.382266554497374e-06, "loss": 0.031415387988090515, "memory(GiB)": 21.48, "step": 9246, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.953744 }, { "epoch": 0.3003930740993405, "grad_norm": 0.3950554430484772, "learning_rate": 8.381870929622905e-06, "loss": 0.03287441283464432, "memory(GiB)": 21.48, "step": 9247, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.95376 }, { "epoch": 0.300425559562096, "grad_norm": 0.7100783586502075, "learning_rate": 8.381475265717143e-06, "loss": 0.03512493520975113, "memory(GiB)": 21.48, "step": 9248, "token_acc": 0.9763313609467456, "train_speed(iter/s)": 0.953777 }, { "epoch": 0.3004580450248514, "grad_norm": 0.7567042112350464, "learning_rate": 8.381079562784656e-06, "loss": 0.033256929367780685, "memory(GiB)": 21.48, "step": 9249, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953795 }, { "epoch": 0.3004905304876068, "grad_norm": 0.44882163405418396, "learning_rate": 8.38068382083001e-06, "loss": 0.030238818377256393, "memory(GiB)": 21.48, "step": 9250, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.953813 }, { "epoch": 0.30052301595036224, "grad_norm": 0.34657546877861023, "learning_rate": 8.380288039857773e-06, "loss": 0.0316443033516407, "memory(GiB)": 21.48, "step": 9251, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.953831 }, { "epoch": 0.30055550141311765, "grad_norm": 0.33638420701026917, "learning_rate": 8.379892219872515e-06, "loss": 0.023163005709648132, "memory(GiB)": 21.48, "step": 9252, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.953849 }, { "epoch": 0.30058798687587307, "grad_norm": 0.4146237075328827, "learning_rate": 8.379496360878801e-06, "loss": 0.034165605902671814, "memory(GiB)": 21.48, "step": 9253, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.953867 }, { "epoch": 0.3006204723386285, "grad_norm": 0.4512048065662384, "learning_rate": 8.3791004628812e-06, "loss": 0.032606448978185654, "memory(GiB)": 21.48, "step": 9254, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.3006529578013839, "grad_norm": 0.7270491123199463, "learning_rate": 8.378704525884282e-06, "loss": 0.03661080449819565, "memory(GiB)": 21.48, "step": 9255, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.953899 }, { "epoch": 0.3006854432641393, "grad_norm": 0.39603155851364136, "learning_rate": 8.378308549892618e-06, "loss": 0.0263497456908226, "memory(GiB)": 21.48, "step": 9256, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.953914 }, { "epoch": 0.30071792872689473, "grad_norm": 0.3554723262786865, "learning_rate": 8.377912534910775e-06, "loss": 0.021680159494280815, "memory(GiB)": 21.48, "step": 9257, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953932 }, { "epoch": 0.30075041418965015, "grad_norm": 0.3314180076122284, "learning_rate": 8.377516480943328e-06, "loss": 0.0292877908796072, "memory(GiB)": 21.48, "step": 9258, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.30078289965240557, "grad_norm": 0.41233858466148376, "learning_rate": 8.377120387994844e-06, "loss": 0.025103457272052765, "memory(GiB)": 21.48, "step": 9259, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953966 }, { "epoch": 0.300815385115161, "grad_norm": 0.5777756571769714, "learning_rate": 8.376724256069894e-06, "loss": 0.030871104449033737, "memory(GiB)": 21.48, "step": 9260, "token_acc": 0.9875, "train_speed(iter/s)": 0.953985 }, { "epoch": 0.3008478705779164, "grad_norm": 0.5175139904022217, "learning_rate": 8.376328085173055e-06, "loss": 0.025770287960767746, "memory(GiB)": 21.48, "step": 9261, "token_acc": 0.984, "train_speed(iter/s)": 0.954005 }, { "epoch": 0.3008803560406718, "grad_norm": 0.398362398147583, "learning_rate": 8.375931875308893e-06, "loss": 0.030893566086888313, "memory(GiB)": 21.48, "step": 9262, "token_acc": 0.9676113360323887, "train_speed(iter/s)": 0.954023 }, { "epoch": 0.30091284150342723, "grad_norm": 0.4914916157722473, "learning_rate": 8.375535626481987e-06, "loss": 0.03590754047036171, "memory(GiB)": 21.48, "step": 9263, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954038 }, { "epoch": 0.30094532696618265, "grad_norm": 0.4320192039012909, "learning_rate": 8.375139338696906e-06, "loss": 0.034555066376924515, "memory(GiB)": 21.48, "step": 9264, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954056 }, { "epoch": 0.30097781242893806, "grad_norm": 1.127276062965393, "learning_rate": 8.374743011958222e-06, "loss": 0.04583250731229782, "memory(GiB)": 21.48, "step": 9265, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.3010102978916935, "grad_norm": 0.3679385185241699, "learning_rate": 8.374346646270514e-06, "loss": 0.029920045286417007, "memory(GiB)": 21.48, "step": 9266, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.954091 }, { "epoch": 0.3010427833544489, "grad_norm": 0.6322349905967712, "learning_rate": 8.373950241638355e-06, "loss": 0.03323042765259743, "memory(GiB)": 21.48, "step": 9267, "token_acc": 0.9783393501805054, "train_speed(iter/s)": 0.954106 }, { "epoch": 0.3010752688172043, "grad_norm": 0.5369604229927063, "learning_rate": 8.373553798066319e-06, "loss": 0.036178648471832275, "memory(GiB)": 21.48, "step": 9268, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954122 }, { "epoch": 0.3011077542799597, "grad_norm": 1.4070286750793457, "learning_rate": 8.373157315558982e-06, "loss": 0.028417445719242096, "memory(GiB)": 21.48, "step": 9269, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.954139 }, { "epoch": 0.30114023974271514, "grad_norm": 0.3717634975910187, "learning_rate": 8.37276079412092e-06, "loss": 0.023102905601263046, "memory(GiB)": 21.48, "step": 9270, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954156 }, { "epoch": 0.30117272520547056, "grad_norm": 0.5900014638900757, "learning_rate": 8.372364233756709e-06, "loss": 0.04226076230406761, "memory(GiB)": 21.48, "step": 9271, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.954173 }, { "epoch": 0.301205210668226, "grad_norm": 0.32675817608833313, "learning_rate": 8.371967634470926e-06, "loss": 0.028819261118769646, "memory(GiB)": 21.48, "step": 9272, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.954188 }, { "epoch": 0.3012376961309814, "grad_norm": 0.6121599674224854, "learning_rate": 8.371570996268148e-06, "loss": 0.03070741333067417, "memory(GiB)": 21.48, "step": 9273, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954204 }, { "epoch": 0.3012701815937368, "grad_norm": 0.37618288397789, "learning_rate": 8.371174319152952e-06, "loss": 0.026119999587535858, "memory(GiB)": 21.48, "step": 9274, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.954222 }, { "epoch": 0.3013026670564922, "grad_norm": 0.5297830700874329, "learning_rate": 8.37077760312992e-06, "loss": 0.03446866571903229, "memory(GiB)": 21.48, "step": 9275, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954238 }, { "epoch": 0.30133515251924764, "grad_norm": 0.441420316696167, "learning_rate": 8.370380848203627e-06, "loss": 0.038011081516742706, "memory(GiB)": 21.48, "step": 9276, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.954255 }, { "epoch": 0.30136763798200306, "grad_norm": 0.5035869479179382, "learning_rate": 8.369984054378654e-06, "loss": 0.028840690851211548, "memory(GiB)": 21.48, "step": 9277, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.95427 }, { "epoch": 0.30140012344475847, "grad_norm": 0.6072424650192261, "learning_rate": 8.369587221659579e-06, "loss": 0.04025889188051224, "memory(GiB)": 21.48, "step": 9278, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954285 }, { "epoch": 0.3014326089075139, "grad_norm": 0.4630284309387207, "learning_rate": 8.369190350050982e-06, "loss": 0.031542789191007614, "memory(GiB)": 21.48, "step": 9279, "token_acc": 0.9731182795698925, "train_speed(iter/s)": 0.9543 }, { "epoch": 0.3014650943702693, "grad_norm": 0.4352375268936157, "learning_rate": 8.368793439557444e-06, "loss": 0.02849583886563778, "memory(GiB)": 21.48, "step": 9280, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954317 }, { "epoch": 0.3014975798330247, "grad_norm": 0.5414227843284607, "learning_rate": 8.368396490183545e-06, "loss": 0.03995175659656525, "memory(GiB)": 21.48, "step": 9281, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.954335 }, { "epoch": 0.30153006529578014, "grad_norm": 0.516693651676178, "learning_rate": 8.367999501933868e-06, "loss": 0.04084300622344017, "memory(GiB)": 21.48, "step": 9282, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954356 }, { "epoch": 0.30156255075853555, "grad_norm": 0.473906934261322, "learning_rate": 8.367602474812995e-06, "loss": 0.04195142537355423, "memory(GiB)": 21.48, "step": 9283, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.954377 }, { "epoch": 0.30159503622129097, "grad_norm": 0.5827725529670715, "learning_rate": 8.367205408825508e-06, "loss": 0.04073716700077057, "memory(GiB)": 21.48, "step": 9284, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.954399 }, { "epoch": 0.3016275216840464, "grad_norm": 0.44822385907173157, "learning_rate": 8.366808303975985e-06, "loss": 0.029738936573266983, "memory(GiB)": 21.48, "step": 9285, "token_acc": 0.9794871794871794, "train_speed(iter/s)": 0.95442 }, { "epoch": 0.3016600071468018, "grad_norm": 0.4156997799873352, "learning_rate": 8.366411160269015e-06, "loss": 0.02975991740822792, "memory(GiB)": 21.48, "step": 9286, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.954443 }, { "epoch": 0.3016924926095572, "grad_norm": 0.3522663712501526, "learning_rate": 8.36601397770918e-06, "loss": 0.022614043205976486, "memory(GiB)": 21.48, "step": 9287, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.30172497807231263, "grad_norm": 0.4156879186630249, "learning_rate": 8.365616756301062e-06, "loss": 0.027260512113571167, "memory(GiB)": 21.48, "step": 9288, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.954487 }, { "epoch": 0.30175746353506805, "grad_norm": 0.3610149025917053, "learning_rate": 8.36521949604925e-06, "loss": 0.02486371621489525, "memory(GiB)": 21.48, "step": 9289, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954509 }, { "epoch": 0.30178994899782347, "grad_norm": 0.3573266267776489, "learning_rate": 8.364822196958325e-06, "loss": 0.035226307809352875, "memory(GiB)": 21.48, "step": 9290, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.954531 }, { "epoch": 0.3018224344605789, "grad_norm": 0.7407911419868469, "learning_rate": 8.364424859032871e-06, "loss": 0.04390067979693413, "memory(GiB)": 21.48, "step": 9291, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.954553 }, { "epoch": 0.3018549199233343, "grad_norm": 0.7298873662948608, "learning_rate": 8.364027482277477e-06, "loss": 0.034144945442676544, "memory(GiB)": 21.48, "step": 9292, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954574 }, { "epoch": 0.3018874053860897, "grad_norm": 0.772881805896759, "learning_rate": 8.363630066696729e-06, "loss": 0.027593988925218582, "memory(GiB)": 21.48, "step": 9293, "token_acc": 1.0, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.30191989084884513, "grad_norm": 0.3811693787574768, "learning_rate": 8.363232612295214e-06, "loss": 0.024327708408236504, "memory(GiB)": 21.48, "step": 9294, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.954615 }, { "epoch": 0.30195237631160055, "grad_norm": 0.6423739790916443, "learning_rate": 8.362835119077516e-06, "loss": 0.035276152193546295, "memory(GiB)": 21.48, "step": 9295, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954637 }, { "epoch": 0.30198486177435596, "grad_norm": 0.5041701197624207, "learning_rate": 8.362437587048227e-06, "loss": 0.03496713936328888, "memory(GiB)": 21.48, "step": 9296, "token_acc": 1.0, "train_speed(iter/s)": 0.954658 }, { "epoch": 0.3020173472371114, "grad_norm": 0.4288730323314667, "learning_rate": 8.362040016211933e-06, "loss": 0.03589458763599396, "memory(GiB)": 21.48, "step": 9297, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.954679 }, { "epoch": 0.3020498326998668, "grad_norm": 0.43867388367652893, "learning_rate": 8.361642406573219e-06, "loss": 0.023611687123775482, "memory(GiB)": 21.48, "step": 9298, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954696 }, { "epoch": 0.3020823181626222, "grad_norm": 0.5203248858451843, "learning_rate": 8.36124475813668e-06, "loss": 0.034305840730667114, "memory(GiB)": 21.48, "step": 9299, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.954714 }, { "epoch": 0.3021148036253776, "grad_norm": 0.37717393040657043, "learning_rate": 8.360847070906902e-06, "loss": 0.030365463346242905, "memory(GiB)": 21.48, "step": 9300, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954732 }, { "epoch": 0.30214728908813304, "grad_norm": 0.9917218089103699, "learning_rate": 8.360449344888475e-06, "loss": 0.032561276108026505, "memory(GiB)": 21.48, "step": 9301, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.95475 }, { "epoch": 0.30217977455088846, "grad_norm": 0.35027098655700684, "learning_rate": 8.36005158008599e-06, "loss": 0.025754712522029877, "memory(GiB)": 21.48, "step": 9302, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.3022122600136439, "grad_norm": 0.3855791985988617, "learning_rate": 8.359653776504038e-06, "loss": 0.034354038536548615, "memory(GiB)": 21.48, "step": 9303, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.954786 }, { "epoch": 0.3022447454763993, "grad_norm": 0.41328054666519165, "learning_rate": 8.35925593414721e-06, "loss": 0.033898331224918365, "memory(GiB)": 21.48, "step": 9304, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.954804 }, { "epoch": 0.3022772309391547, "grad_norm": 0.4148056209087372, "learning_rate": 8.358858053020097e-06, "loss": 0.032857660204172134, "memory(GiB)": 21.48, "step": 9305, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.3023097164019101, "grad_norm": 0.6240099668502808, "learning_rate": 8.358460133127291e-06, "loss": 0.037244245409965515, "memory(GiB)": 21.48, "step": 9306, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.95484 }, { "epoch": 0.30234220186466554, "grad_norm": 0.2692927420139313, "learning_rate": 8.358062174473386e-06, "loss": 0.01735406555235386, "memory(GiB)": 21.48, "step": 9307, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.30237468732742095, "grad_norm": 0.3235291540622711, "learning_rate": 8.357664177062972e-06, "loss": 0.0325780026614666, "memory(GiB)": 21.48, "step": 9308, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954874 }, { "epoch": 0.30240717279017637, "grad_norm": 0.9353143572807312, "learning_rate": 8.357266140900647e-06, "loss": 0.03893228992819786, "memory(GiB)": 21.48, "step": 9309, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95489 }, { "epoch": 0.3024396582529318, "grad_norm": 0.4885941445827484, "learning_rate": 8.356868065991002e-06, "loss": 0.031828247010707855, "memory(GiB)": 21.48, "step": 9310, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954908 }, { "epoch": 0.3024721437156872, "grad_norm": 0.7033783197402954, "learning_rate": 8.35646995233863e-06, "loss": 0.03290501609444618, "memory(GiB)": 21.48, "step": 9311, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.954925 }, { "epoch": 0.3025046291784427, "grad_norm": 0.3570588231086731, "learning_rate": 8.356071799948127e-06, "loss": 0.02335885725915432, "memory(GiB)": 21.48, "step": 9312, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.954941 }, { "epoch": 0.3025371146411981, "grad_norm": 0.5486128926277161, "learning_rate": 8.35567360882409e-06, "loss": 0.03065507300198078, "memory(GiB)": 21.48, "step": 9313, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.954959 }, { "epoch": 0.3025696001039535, "grad_norm": 0.43591973185539246, "learning_rate": 8.355275378971112e-06, "loss": 0.027738366276025772, "memory(GiB)": 21.48, "step": 9314, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.3026020855667089, "grad_norm": 0.41208505630493164, "learning_rate": 8.354877110393791e-06, "loss": 0.032396819442510605, "memory(GiB)": 21.48, "step": 9315, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.30263457102946434, "grad_norm": 3.7816576957702637, "learning_rate": 8.354478803096722e-06, "loss": 0.04262249544262886, "memory(GiB)": 21.48, "step": 9316, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.30266705649221975, "grad_norm": 0.5296310186386108, "learning_rate": 8.354080457084503e-06, "loss": 0.03283252194523811, "memory(GiB)": 21.48, "step": 9317, "token_acc": 0.986159169550173, "train_speed(iter/s)": 0.955023 }, { "epoch": 0.30269954195497517, "grad_norm": 0.46512091159820557, "learning_rate": 8.353682072361733e-06, "loss": 0.03421642631292343, "memory(GiB)": 21.48, "step": 9318, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955039 }, { "epoch": 0.3027320274177306, "grad_norm": 0.37452879548072815, "learning_rate": 8.353283648933005e-06, "loss": 0.029786434024572372, "memory(GiB)": 21.48, "step": 9319, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.302764512880486, "grad_norm": 2.118663787841797, "learning_rate": 8.352885186802923e-06, "loss": 0.03307407721877098, "memory(GiB)": 21.48, "step": 9320, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955073 }, { "epoch": 0.3027969983432414, "grad_norm": 0.5253445506095886, "learning_rate": 8.352486685976082e-06, "loss": 0.028102170675992966, "memory(GiB)": 21.48, "step": 9321, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955091 }, { "epoch": 0.30282948380599684, "grad_norm": 0.4695342183113098, "learning_rate": 8.352088146457082e-06, "loss": 0.035203054547309875, "memory(GiB)": 21.48, "step": 9322, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.30286196926875225, "grad_norm": 0.3519381880760193, "learning_rate": 8.351689568250525e-06, "loss": 0.03580424189567566, "memory(GiB)": 21.48, "step": 9323, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955125 }, { "epoch": 0.30289445473150767, "grad_norm": 0.37032631039619446, "learning_rate": 8.351290951361008e-06, "loss": 0.029730848968029022, "memory(GiB)": 21.48, "step": 9324, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955142 }, { "epoch": 0.3029269401942631, "grad_norm": 0.4552975296974182, "learning_rate": 8.350892295793134e-06, "loss": 0.03776932507753372, "memory(GiB)": 21.48, "step": 9325, "token_acc": 0.98, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.3029594256570185, "grad_norm": 0.3854297697544098, "learning_rate": 8.350493601551503e-06, "loss": 0.02782832458615303, "memory(GiB)": 21.48, "step": 9326, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.3029919111197739, "grad_norm": 0.3455731272697449, "learning_rate": 8.350094868640714e-06, "loss": 0.031492479145526886, "memory(GiB)": 21.48, "step": 9327, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.30302439658252933, "grad_norm": 0.3292074203491211, "learning_rate": 8.349696097065374e-06, "loss": 0.03424179553985596, "memory(GiB)": 21.48, "step": 9328, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.955212 }, { "epoch": 0.30305688204528475, "grad_norm": 0.47947800159454346, "learning_rate": 8.34929728683008e-06, "loss": 0.028754424303770065, "memory(GiB)": 21.48, "step": 9329, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955229 }, { "epoch": 0.30308936750804016, "grad_norm": 0.4456624984741211, "learning_rate": 8.34889843793944e-06, "loss": 0.031443558633327484, "memory(GiB)": 21.48, "step": 9330, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.955244 }, { "epoch": 0.3031218529707956, "grad_norm": 0.4729234576225281, "learning_rate": 8.348499550398054e-06, "loss": 0.02534853294491768, "memory(GiB)": 21.48, "step": 9331, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.955259 }, { "epoch": 0.303154338433551, "grad_norm": 0.3597323000431061, "learning_rate": 8.348100624210524e-06, "loss": 0.03196488693356514, "memory(GiB)": 21.48, "step": 9332, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955276 }, { "epoch": 0.3031868238963064, "grad_norm": 0.34986746311187744, "learning_rate": 8.34770165938146e-06, "loss": 0.02040114998817444, "memory(GiB)": 21.48, "step": 9333, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955294 }, { "epoch": 0.30321930935906183, "grad_norm": 0.49645495414733887, "learning_rate": 8.347302655915461e-06, "loss": 0.03455037251114845, "memory(GiB)": 21.48, "step": 9334, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955309 }, { "epoch": 0.30325179482181724, "grad_norm": 0.6556152105331421, "learning_rate": 8.346903613817134e-06, "loss": 0.029290108010172844, "memory(GiB)": 21.48, "step": 9335, "token_acc": 0.9968553459119497, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.30328428028457266, "grad_norm": 0.5077918171882629, "learning_rate": 8.346504533091085e-06, "loss": 0.026361588388681412, "memory(GiB)": 21.48, "step": 9336, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955344 }, { "epoch": 0.3033167657473281, "grad_norm": 0.41225796937942505, "learning_rate": 8.34610541374192e-06, "loss": 0.016849715262651443, "memory(GiB)": 21.48, "step": 9337, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95536 }, { "epoch": 0.3033492512100835, "grad_norm": 0.400992214679718, "learning_rate": 8.345706255774245e-06, "loss": 0.026705343276262283, "memory(GiB)": 21.48, "step": 9338, "token_acc": 0.9717314487632509, "train_speed(iter/s)": 0.955377 }, { "epoch": 0.3033817366728389, "grad_norm": 0.6684608459472656, "learning_rate": 8.345307059192665e-06, "loss": 0.036381304264068604, "memory(GiB)": 21.48, "step": 9339, "token_acc": 0.9832214765100671, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.3034142221355943, "grad_norm": 0.46828728914260864, "learning_rate": 8.34490782400179e-06, "loss": 0.025649849325418472, "memory(GiB)": 21.48, "step": 9340, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955411 }, { "epoch": 0.30344670759834974, "grad_norm": 0.7161285281181335, "learning_rate": 8.344508550206227e-06, "loss": 0.03541029244661331, "memory(GiB)": 21.48, "step": 9341, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955429 }, { "epoch": 0.30347919306110516, "grad_norm": 0.42185866832733154, "learning_rate": 8.344109237810583e-06, "loss": 0.029858166351914406, "memory(GiB)": 21.48, "step": 9342, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.3035116785238606, "grad_norm": 0.5511470437049866, "learning_rate": 8.34370988681947e-06, "loss": 0.027721848338842392, "memory(GiB)": 21.48, "step": 9343, "token_acc": 1.0, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.303544163986616, "grad_norm": 0.6711864471435547, "learning_rate": 8.34331049723749e-06, "loss": 0.04214651882648468, "memory(GiB)": 21.48, "step": 9344, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.955492 }, { "epoch": 0.3035766494493714, "grad_norm": 0.43730810284614563, "learning_rate": 8.34291106906926e-06, "loss": 0.02949339896440506, "memory(GiB)": 21.48, "step": 9345, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955514 }, { "epoch": 0.3036091349121268, "grad_norm": 0.3797171115875244, "learning_rate": 8.342511602319384e-06, "loss": 0.015524188987910748, "memory(GiB)": 21.48, "step": 9346, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.30364162037488224, "grad_norm": 0.5025712251663208, "learning_rate": 8.342112096992478e-06, "loss": 0.03640202805399895, "memory(GiB)": 21.48, "step": 9347, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.30367410583763765, "grad_norm": 0.45743346214294434, "learning_rate": 8.341712553093151e-06, "loss": 0.03407517075538635, "memory(GiB)": 21.48, "step": 9348, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.30370659130039307, "grad_norm": 0.5023034811019897, "learning_rate": 8.34131297062601e-06, "loss": 0.03314400464296341, "memory(GiB)": 21.48, "step": 9349, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955595 }, { "epoch": 0.3037390767631485, "grad_norm": 0.48170506954193115, "learning_rate": 8.340913349595672e-06, "loss": 0.03755604475736618, "memory(GiB)": 21.48, "step": 9350, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.3037715622259039, "grad_norm": 0.4568176567554474, "learning_rate": 8.340513690006747e-06, "loss": 0.03217366337776184, "memory(GiB)": 21.48, "step": 9351, "token_acc": 0.966789667896679, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.3038040476886593, "grad_norm": 0.5059437155723572, "learning_rate": 8.340113991863847e-06, "loss": 0.03872091323137283, "memory(GiB)": 21.48, "step": 9352, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.95566 }, { "epoch": 0.30383653315141473, "grad_norm": 0.5101795792579651, "learning_rate": 8.339714255171588e-06, "loss": 0.036815375089645386, "memory(GiB)": 21.48, "step": 9353, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.30386901861417015, "grad_norm": 0.47196561098098755, "learning_rate": 8.339314479934579e-06, "loss": 0.03702976554632187, "memory(GiB)": 21.48, "step": 9354, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.30390150407692557, "grad_norm": 0.44994449615478516, "learning_rate": 8.338914666157438e-06, "loss": 0.039531536400318146, "memory(GiB)": 21.48, "step": 9355, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.303933989539681, "grad_norm": 0.6022024154663086, "learning_rate": 8.338514813844778e-06, "loss": 0.03106090985238552, "memory(GiB)": 21.48, "step": 9356, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.3039664750024364, "grad_norm": 0.49647271633148193, "learning_rate": 8.338114923001212e-06, "loss": 0.03710170462727547, "memory(GiB)": 21.48, "step": 9357, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.955769 }, { "epoch": 0.3039989604651918, "grad_norm": 0.6931911110877991, "learning_rate": 8.337714993631357e-06, "loss": 0.03894379734992981, "memory(GiB)": 21.48, "step": 9358, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.30403144592794723, "grad_norm": 0.36140650510787964, "learning_rate": 8.337315025739829e-06, "loss": 0.028668848797678947, "memory(GiB)": 21.48, "step": 9359, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.955814 }, { "epoch": 0.30406393139070265, "grad_norm": 0.3474135100841522, "learning_rate": 8.336915019331243e-06, "loss": 0.02659495919942856, "memory(GiB)": 21.48, "step": 9360, "token_acc": 1.0, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.30409641685345806, "grad_norm": 0.3712502717971802, "learning_rate": 8.336514974410216e-06, "loss": 0.0339527502655983, "memory(GiB)": 21.48, "step": 9361, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955848 }, { "epoch": 0.3041289023162135, "grad_norm": 0.5073471069335938, "learning_rate": 8.336114890981367e-06, "loss": 0.035091228783130646, "memory(GiB)": 21.48, "step": 9362, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955865 }, { "epoch": 0.3041613877789689, "grad_norm": 0.6560226678848267, "learning_rate": 8.33571476904931e-06, "loss": 0.03736206144094467, "memory(GiB)": 21.48, "step": 9363, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.3041938732417243, "grad_norm": 0.43618300557136536, "learning_rate": 8.335314608618666e-06, "loss": 0.03762328252196312, "memory(GiB)": 21.48, "step": 9364, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.9559 }, { "epoch": 0.3042263587044797, "grad_norm": 0.39967459440231323, "learning_rate": 8.33491440969405e-06, "loss": 0.032682791352272034, "memory(GiB)": 21.48, "step": 9365, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955916 }, { "epoch": 0.30425884416723514, "grad_norm": 0.3605444133281708, "learning_rate": 8.334514172280086e-06, "loss": 0.023015091195702553, "memory(GiB)": 21.48, "step": 9366, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955934 }, { "epoch": 0.30429132962999056, "grad_norm": 0.6290711164474487, "learning_rate": 8.334113896381388e-06, "loss": 0.030467387288808823, "memory(GiB)": 21.48, "step": 9367, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.304323815092746, "grad_norm": 0.5242014527320862, "learning_rate": 8.333713582002579e-06, "loss": 0.02930334210395813, "memory(GiB)": 21.48, "step": 9368, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.3043563005555014, "grad_norm": 0.4764302372932434, "learning_rate": 8.333313229148276e-06, "loss": 0.032368097454309464, "memory(GiB)": 21.48, "step": 9369, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.3043887860182568, "grad_norm": 0.5924933552742004, "learning_rate": 8.332912837823104e-06, "loss": 0.03342921659350395, "memory(GiB)": 21.48, "step": 9370, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956 }, { "epoch": 0.3044212714810122, "grad_norm": 0.40242692828178406, "learning_rate": 8.332512408031681e-06, "loss": 0.026359939947724342, "memory(GiB)": 21.48, "step": 9371, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956016 }, { "epoch": 0.30445375694376764, "grad_norm": 0.34698423743247986, "learning_rate": 8.33211193977863e-06, "loss": 0.02201003208756447, "memory(GiB)": 21.48, "step": 9372, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956032 }, { "epoch": 0.30448624240652306, "grad_norm": 0.8226525783538818, "learning_rate": 8.331711433068573e-06, "loss": 0.03983820974826813, "memory(GiB)": 21.48, "step": 9373, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.30451872786927847, "grad_norm": 0.5664886236190796, "learning_rate": 8.331310887906129e-06, "loss": 0.03145008161664009, "memory(GiB)": 21.48, "step": 9374, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956068 }, { "epoch": 0.3045512133320339, "grad_norm": 0.6100418567657471, "learning_rate": 8.330910304295925e-06, "loss": 0.036661528050899506, "memory(GiB)": 21.48, "step": 9375, "token_acc": 0.975, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.30458369879478936, "grad_norm": 0.6616998910903931, "learning_rate": 8.330509682242583e-06, "loss": 0.034609779715538025, "memory(GiB)": 21.48, "step": 9376, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.956104 }, { "epoch": 0.3046161842575448, "grad_norm": 0.5183879733085632, "learning_rate": 8.330109021750725e-06, "loss": 0.03080862946808338, "memory(GiB)": 21.48, "step": 9377, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.3046486697203002, "grad_norm": 0.6538926362991333, "learning_rate": 8.329708322824976e-06, "loss": 0.04185260087251663, "memory(GiB)": 21.48, "step": 9378, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956136 }, { "epoch": 0.3046811551830556, "grad_norm": 1.0855481624603271, "learning_rate": 8.329307585469963e-06, "loss": 0.029462266713380814, "memory(GiB)": 21.48, "step": 9379, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.304713640645811, "grad_norm": 0.5329586863517761, "learning_rate": 8.328906809690308e-06, "loss": 0.03078259900212288, "memory(GiB)": 21.48, "step": 9380, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.30474612610856644, "grad_norm": 0.6408154368400574, "learning_rate": 8.328505995490639e-06, "loss": 0.029582342132925987, "memory(GiB)": 21.48, "step": 9381, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956179 }, { "epoch": 0.30477861157132186, "grad_norm": 0.4841061234474182, "learning_rate": 8.328105142875579e-06, "loss": 0.029692433774471283, "memory(GiB)": 21.48, "step": 9382, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.30481109703407727, "grad_norm": 0.42436185479164124, "learning_rate": 8.327704251849757e-06, "loss": 0.03114272654056549, "memory(GiB)": 21.48, "step": 9383, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956209 }, { "epoch": 0.3048435824968327, "grad_norm": 0.41473624110221863, "learning_rate": 8.327303322417796e-06, "loss": 0.034851156175136566, "memory(GiB)": 21.48, "step": 9384, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.3048760679595881, "grad_norm": 0.40515798330307007, "learning_rate": 8.32690235458433e-06, "loss": 0.02778611332178116, "memory(GiB)": 21.48, "step": 9385, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.3049085534223435, "grad_norm": 0.6839832067489624, "learning_rate": 8.326501348353982e-06, "loss": 0.0391034260392189, "memory(GiB)": 21.48, "step": 9386, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.956256 }, { "epoch": 0.30494103888509894, "grad_norm": 0.49313753843307495, "learning_rate": 8.32610030373138e-06, "loss": 0.033015117049217224, "memory(GiB)": 21.48, "step": 9387, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.30497352434785435, "grad_norm": 0.4108031094074249, "learning_rate": 8.325699220721153e-06, "loss": 0.02802901715040207, "memory(GiB)": 21.48, "step": 9388, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.30500600981060977, "grad_norm": 0.34879860281944275, "learning_rate": 8.32529809932793e-06, "loss": 0.031019221991300583, "memory(GiB)": 21.48, "step": 9389, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956301 }, { "epoch": 0.3050384952733652, "grad_norm": 0.6558551788330078, "learning_rate": 8.324896939556343e-06, "loss": 0.02457970380783081, "memory(GiB)": 21.48, "step": 9390, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.3050709807361206, "grad_norm": 0.8846299648284912, "learning_rate": 8.324495741411018e-06, "loss": 0.04569586366415024, "memory(GiB)": 21.48, "step": 9391, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.305103466198876, "grad_norm": 0.4089967906475067, "learning_rate": 8.32409450489659e-06, "loss": 0.025556884706020355, "memory(GiB)": 21.48, "step": 9392, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956348 }, { "epoch": 0.30513595166163143, "grad_norm": 0.31618475914001465, "learning_rate": 8.323693230017684e-06, "loss": 0.029981687664985657, "memory(GiB)": 21.48, "step": 9393, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.30516843712438685, "grad_norm": 0.3677648603916168, "learning_rate": 8.323291916778935e-06, "loss": 0.031005673110485077, "memory(GiB)": 21.48, "step": 9394, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.30520092258714226, "grad_norm": 0.4472907483577728, "learning_rate": 8.322890565184974e-06, "loss": 0.029523443430662155, "memory(GiB)": 21.48, "step": 9395, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.3052334080498977, "grad_norm": 0.4040749669075012, "learning_rate": 8.322489175240434e-06, "loss": 0.03000808134675026, "memory(GiB)": 21.48, "step": 9396, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.3052658935126531, "grad_norm": 0.406850665807724, "learning_rate": 8.322087746949945e-06, "loss": 0.03168734908103943, "memory(GiB)": 21.48, "step": 9397, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956425 }, { "epoch": 0.3052983789754085, "grad_norm": 0.41515934467315674, "learning_rate": 8.321686280318144e-06, "loss": 0.03334960713982582, "memory(GiB)": 21.48, "step": 9398, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.30533086443816393, "grad_norm": 0.5427894592285156, "learning_rate": 8.321284775349661e-06, "loss": 0.033494189381599426, "memory(GiB)": 21.48, "step": 9399, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.30536334990091935, "grad_norm": 0.41021451354026794, "learning_rate": 8.32088323204913e-06, "loss": 0.027501305565238, "memory(GiB)": 21.48, "step": 9400, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.30539583536367476, "grad_norm": 0.3759754002094269, "learning_rate": 8.320481650421188e-06, "loss": 0.030475107952952385, "memory(GiB)": 21.48, "step": 9401, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.3054283208264302, "grad_norm": 0.38232800364494324, "learning_rate": 8.32008003047047e-06, "loss": 0.031599801033735275, "memory(GiB)": 21.48, "step": 9402, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.3054608062891856, "grad_norm": 0.3870866298675537, "learning_rate": 8.319678372201606e-06, "loss": 0.026125594973564148, "memory(GiB)": 21.48, "step": 9403, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.305493291751941, "grad_norm": 0.4701650142669678, "learning_rate": 8.319276675619236e-06, "loss": 0.032302044332027435, "memory(GiB)": 21.48, "step": 9404, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.3055257772146964, "grad_norm": 0.3871432840824127, "learning_rate": 8.318874940727996e-06, "loss": 0.026470664888620377, "memory(GiB)": 21.48, "step": 9405, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.30555826267745184, "grad_norm": 0.8007247447967529, "learning_rate": 8.318473167532522e-06, "loss": 0.027632692828774452, "memory(GiB)": 21.48, "step": 9406, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.30559074814020726, "grad_norm": 0.5001876354217529, "learning_rate": 8.31807135603745e-06, "loss": 0.031905077397823334, "memory(GiB)": 21.48, "step": 9407, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.956625 }, { "epoch": 0.3056232336029627, "grad_norm": 0.427610844373703, "learning_rate": 8.317669506247418e-06, "loss": 0.03150435909628868, "memory(GiB)": 21.48, "step": 9408, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.3056557190657181, "grad_norm": 0.7549891471862793, "learning_rate": 8.317267618167065e-06, "loss": 0.031253378838300705, "memory(GiB)": 21.48, "step": 9409, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956665 }, { "epoch": 0.3056882045284735, "grad_norm": 0.3864400088787079, "learning_rate": 8.316865691801026e-06, "loss": 0.029597271233797073, "memory(GiB)": 21.48, "step": 9410, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.3057206899912289, "grad_norm": 0.40079665184020996, "learning_rate": 8.316463727153946e-06, "loss": 0.029736321419477463, "memory(GiB)": 21.48, "step": 9411, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.30575317545398434, "grad_norm": 0.5588581562042236, "learning_rate": 8.316061724230458e-06, "loss": 0.03782260790467262, "memory(GiB)": 21.48, "step": 9412, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.30578566091673975, "grad_norm": 0.40744125843048096, "learning_rate": 8.315659683035204e-06, "loss": 0.02943427860736847, "memory(GiB)": 21.48, "step": 9413, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.956748 }, { "epoch": 0.30581814637949517, "grad_norm": 0.40518367290496826, "learning_rate": 8.315257603572824e-06, "loss": 0.02486908994615078, "memory(GiB)": 21.48, "step": 9414, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.3058506318422506, "grad_norm": 0.5470868945121765, "learning_rate": 8.314855485847959e-06, "loss": 0.026506859809160233, "memory(GiB)": 21.48, "step": 9415, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956792 }, { "epoch": 0.305883117305006, "grad_norm": 0.36792486906051636, "learning_rate": 8.31445332986525e-06, "loss": 0.03087504208087921, "memory(GiB)": 21.48, "step": 9416, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.3059156027677614, "grad_norm": 0.5674799680709839, "learning_rate": 8.314051135629338e-06, "loss": 0.04009329900145531, "memory(GiB)": 21.48, "step": 9417, "token_acc": 0.9875, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.30594808823051683, "grad_norm": 0.6832297444343567, "learning_rate": 8.313648903144864e-06, "loss": 0.04322528839111328, "memory(GiB)": 21.48, "step": 9418, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.30598057369327225, "grad_norm": 0.554923415184021, "learning_rate": 8.313246632416473e-06, "loss": 0.037594228982925415, "memory(GiB)": 21.48, "step": 9419, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956878 }, { "epoch": 0.30601305915602767, "grad_norm": 0.5641316771507263, "learning_rate": 8.312844323448806e-06, "loss": 0.03746679797768593, "memory(GiB)": 21.48, "step": 9420, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.3060455446187831, "grad_norm": 0.5265404582023621, "learning_rate": 8.312441976246504e-06, "loss": 0.03401022404432297, "memory(GiB)": 21.48, "step": 9421, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956918 }, { "epoch": 0.3060780300815385, "grad_norm": 0.5619698762893677, "learning_rate": 8.312039590814213e-06, "loss": 0.023568790405988693, "memory(GiB)": 21.48, "step": 9422, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.3061105155442939, "grad_norm": 0.4358346462249756, "learning_rate": 8.311637167156578e-06, "loss": 0.032586999237537384, "memory(GiB)": 21.48, "step": 9423, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.30614300100704933, "grad_norm": 0.3729427754878998, "learning_rate": 8.311234705278242e-06, "loss": 0.03071765787899494, "memory(GiB)": 21.48, "step": 9424, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.30617548646980475, "grad_norm": 0.4486158788204193, "learning_rate": 8.31083220518385e-06, "loss": 0.03360161930322647, "memory(GiB)": 21.48, "step": 9425, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956985 }, { "epoch": 0.30620797193256016, "grad_norm": 0.538351833820343, "learning_rate": 8.310429666878049e-06, "loss": 0.03723070025444031, "memory(GiB)": 21.48, "step": 9426, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.3062404573953156, "grad_norm": 0.3546011447906494, "learning_rate": 8.31002709036548e-06, "loss": 0.030955493450164795, "memory(GiB)": 21.48, "step": 9427, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.306272942858071, "grad_norm": 0.5067906379699707, "learning_rate": 8.309624475650796e-06, "loss": 0.040534697473049164, "memory(GiB)": 21.48, "step": 9428, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.3063054283208264, "grad_norm": 0.48028764128685, "learning_rate": 8.309221822738639e-06, "loss": 0.027302801609039307, "memory(GiB)": 21.48, "step": 9429, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.30633791378358183, "grad_norm": 0.4521423578262329, "learning_rate": 8.30881913163366e-06, "loss": 0.02699628472328186, "memory(GiB)": 21.48, "step": 9430, "token_acc": 0.9828571428571429, "train_speed(iter/s)": 0.957067 }, { "epoch": 0.30637039924633724, "grad_norm": 0.4350569546222687, "learning_rate": 8.3084164023405e-06, "loss": 0.03878898546099663, "memory(GiB)": 21.48, "step": 9431, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.30640288470909266, "grad_norm": 0.39631325006484985, "learning_rate": 8.308013634863814e-06, "loss": 0.03670087456703186, "memory(GiB)": 21.48, "step": 9432, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957099 }, { "epoch": 0.3064353701718481, "grad_norm": 0.41189172863960266, "learning_rate": 8.307610829208247e-06, "loss": 0.027974268421530724, "memory(GiB)": 21.48, "step": 9433, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.3064678556346035, "grad_norm": 0.32297593355178833, "learning_rate": 8.307207985378449e-06, "loss": 0.024251483380794525, "memory(GiB)": 21.48, "step": 9434, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.3065003410973589, "grad_norm": 0.4050484001636505, "learning_rate": 8.30680510337907e-06, "loss": 0.0363960862159729, "memory(GiB)": 21.48, "step": 9435, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.3065328265601143, "grad_norm": 0.33630630373954773, "learning_rate": 8.306402183214757e-06, "loss": 0.026627440005540848, "memory(GiB)": 21.48, "step": 9436, "token_acc": 1.0, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.30656531202286974, "grad_norm": 0.4029000997543335, "learning_rate": 8.305999224890162e-06, "loss": 0.02797425538301468, "memory(GiB)": 21.48, "step": 9437, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.30659779748562516, "grad_norm": 0.34034815430641174, "learning_rate": 8.305596228409935e-06, "loss": 0.027873145416378975, "memory(GiB)": 21.48, "step": 9438, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.3066302829483806, "grad_norm": 0.3583248257637024, "learning_rate": 8.30519319377873e-06, "loss": 0.028549563139677048, "memory(GiB)": 21.48, "step": 9439, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957213 }, { "epoch": 0.30666276841113604, "grad_norm": 0.9903751015663147, "learning_rate": 8.304790121001196e-06, "loss": 0.0268859826028347, "memory(GiB)": 21.48, "step": 9440, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.30669525387389146, "grad_norm": 0.4465359151363373, "learning_rate": 8.304387010081985e-06, "loss": 0.03369123488664627, "memory(GiB)": 21.48, "step": 9441, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.3067277393366469, "grad_norm": 0.3704632520675659, "learning_rate": 8.30398386102575e-06, "loss": 0.025615617632865906, "memory(GiB)": 21.48, "step": 9442, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.3067602247994023, "grad_norm": 0.397686243057251, "learning_rate": 8.303580673837142e-06, "loss": 0.02735746279358864, "memory(GiB)": 21.48, "step": 9443, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.3067927102621577, "grad_norm": 0.36114567518234253, "learning_rate": 8.303177448520816e-06, "loss": 0.0230049230158329, "memory(GiB)": 21.48, "step": 9444, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957285 }, { "epoch": 0.3068251957249131, "grad_norm": 1.6633107662200928, "learning_rate": 8.302774185081427e-06, "loss": 0.03091835416853428, "memory(GiB)": 21.48, "step": 9445, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.30685768118766854, "grad_norm": 0.46343445777893066, "learning_rate": 8.302370883523628e-06, "loss": 0.03104986622929573, "memory(GiB)": 21.48, "step": 9446, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.30689016665042396, "grad_norm": 0.532843291759491, "learning_rate": 8.301967543852074e-06, "loss": 0.04535893350839615, "memory(GiB)": 21.48, "step": 9447, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.3069226521131794, "grad_norm": 0.30582183599472046, "learning_rate": 8.30156416607142e-06, "loss": 0.02091021090745926, "memory(GiB)": 21.48, "step": 9448, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957345 }, { "epoch": 0.3069551375759348, "grad_norm": 0.42050230503082275, "learning_rate": 8.301160750186319e-06, "loss": 0.02339138835668564, "memory(GiB)": 21.48, "step": 9449, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.3069876230386902, "grad_norm": 0.353684663772583, "learning_rate": 8.300757296201429e-06, "loss": 0.0291594248265028, "memory(GiB)": 21.48, "step": 9450, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.3070201085014456, "grad_norm": 0.4784444570541382, "learning_rate": 8.300353804121409e-06, "loss": 0.03568996489048004, "memory(GiB)": 21.48, "step": 9451, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.30705259396420104, "grad_norm": 0.49667057394981384, "learning_rate": 8.299950273950911e-06, "loss": 0.031115585938096046, "memory(GiB)": 21.48, "step": 9452, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957405 }, { "epoch": 0.30708507942695645, "grad_norm": 0.5180239081382751, "learning_rate": 8.299546705694595e-06, "loss": 0.03686988726258278, "memory(GiB)": 21.48, "step": 9453, "token_acc": 0.9728682170542635, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.30711756488971187, "grad_norm": 0.44452592730522156, "learning_rate": 8.299143099357118e-06, "loss": 0.028803981840610504, "memory(GiB)": 21.48, "step": 9454, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.3071500503524673, "grad_norm": 0.37835177779197693, "learning_rate": 8.298739454943142e-06, "loss": 0.025671672075986862, "memory(GiB)": 21.48, "step": 9455, "token_acc": 0.9932203389830508, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.3071825358152227, "grad_norm": 0.43820247054100037, "learning_rate": 8.29833577245732e-06, "loss": 0.030607659369707108, "memory(GiB)": 21.48, "step": 9456, "token_acc": 1.0, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.3072150212779781, "grad_norm": 0.4240992069244385, "learning_rate": 8.297932051904312e-06, "loss": 0.03265438601374626, "memory(GiB)": 21.48, "step": 9457, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957485 }, { "epoch": 0.30724750674073353, "grad_norm": 0.4343909025192261, "learning_rate": 8.297528293288776e-06, "loss": 0.025472572073340416, "memory(GiB)": 21.48, "step": 9458, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.30727999220348895, "grad_norm": 0.8390397429466248, "learning_rate": 8.29712449661538e-06, "loss": 0.03593984246253967, "memory(GiB)": 21.48, "step": 9459, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.30731247766624437, "grad_norm": 0.5149137377738953, "learning_rate": 8.296720661888775e-06, "loss": 0.02965802326798439, "memory(GiB)": 21.48, "step": 9460, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.95754 }, { "epoch": 0.3073449631289998, "grad_norm": 0.3205263018608093, "learning_rate": 8.296316789113628e-06, "loss": 0.024162663146853447, "memory(GiB)": 21.48, "step": 9461, "token_acc": 1.0, "train_speed(iter/s)": 0.957562 }, { "epoch": 0.3073774485917552, "grad_norm": 0.4786006808280945, "learning_rate": 8.295912878294598e-06, "loss": 0.03260865807533264, "memory(GiB)": 21.48, "step": 9462, "token_acc": 0.9887005649717514, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.3074099340545106, "grad_norm": 0.6377846598625183, "learning_rate": 8.295508929436345e-06, "loss": 0.04005908593535423, "memory(GiB)": 21.48, "step": 9463, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.957606 }, { "epoch": 0.30744241951726603, "grad_norm": 0.4169785678386688, "learning_rate": 8.295104942543532e-06, "loss": 0.03777982294559479, "memory(GiB)": 21.48, "step": 9464, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.957627 }, { "epoch": 0.30747490498002145, "grad_norm": 0.40491023659706116, "learning_rate": 8.294700917620825e-06, "loss": 0.02560686692595482, "memory(GiB)": 21.48, "step": 9465, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.30750739044277686, "grad_norm": 0.4466521441936493, "learning_rate": 8.294296854672881e-06, "loss": 0.02381570264697075, "memory(GiB)": 21.48, "step": 9466, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.3075398759055323, "grad_norm": 0.4904211461544037, "learning_rate": 8.293892753704369e-06, "loss": 0.030747544020414352, "memory(GiB)": 21.48, "step": 9467, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.3075723613682877, "grad_norm": 0.6118881702423096, "learning_rate": 8.293488614719948e-06, "loss": 0.022805538028478622, "memory(GiB)": 21.48, "step": 9468, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.3076048468310431, "grad_norm": 0.4609227776527405, "learning_rate": 8.293084437724287e-06, "loss": 0.027065381407737732, "memory(GiB)": 21.48, "step": 9469, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.3076373322937985, "grad_norm": 0.40365639328956604, "learning_rate": 8.29268022272205e-06, "loss": 0.027224689722061157, "memory(GiB)": 21.48, "step": 9470, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.30766981775655394, "grad_norm": 0.6639640927314758, "learning_rate": 8.292275969717897e-06, "loss": 0.03237082064151764, "memory(GiB)": 21.48, "step": 9471, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.30770230321930936, "grad_norm": 0.3048665225505829, "learning_rate": 8.2918716787165e-06, "loss": 0.025299139320850372, "memory(GiB)": 21.48, "step": 9472, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.3077347886820648, "grad_norm": 0.4790857136249542, "learning_rate": 8.29146734972252e-06, "loss": 0.036308713257312775, "memory(GiB)": 21.48, "step": 9473, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.957816 }, { "epoch": 0.3077672741448202, "grad_norm": 0.4425654709339142, "learning_rate": 8.291062982740628e-06, "loss": 0.033991407603025436, "memory(GiB)": 21.48, "step": 9474, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.3077997596075756, "grad_norm": 0.3796405494213104, "learning_rate": 8.290658577775488e-06, "loss": 0.023284954950213432, "memory(GiB)": 21.48, "step": 9475, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.307832245070331, "grad_norm": 0.47507157921791077, "learning_rate": 8.290254134831768e-06, "loss": 0.02813337743282318, "memory(GiB)": 21.48, "step": 9476, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957881 }, { "epoch": 0.30786473053308644, "grad_norm": 0.528435230255127, "learning_rate": 8.289849653914135e-06, "loss": 0.03794204816222191, "memory(GiB)": 21.48, "step": 9477, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.30789721599584186, "grad_norm": 0.41200706362724304, "learning_rate": 8.28944513502726e-06, "loss": 0.03415529057383537, "memory(GiB)": 21.48, "step": 9478, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.30792970145859727, "grad_norm": 0.9818488359451294, "learning_rate": 8.289040578175809e-06, "loss": 0.0274585522711277, "memory(GiB)": 21.48, "step": 9479, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.3079621869213527, "grad_norm": 0.5320882201194763, "learning_rate": 8.288635983364453e-06, "loss": 0.03656875342130661, "memory(GiB)": 21.48, "step": 9480, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.3079946723841081, "grad_norm": 0.3850202262401581, "learning_rate": 8.288231350597858e-06, "loss": 0.03089931234717369, "memory(GiB)": 21.48, "step": 9481, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.3080271578468635, "grad_norm": 0.42674750089645386, "learning_rate": 8.287826679880699e-06, "loss": 0.036566950380802155, "memory(GiB)": 21.48, "step": 9482, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.30805964330961894, "grad_norm": 0.34919053316116333, "learning_rate": 8.287421971217642e-06, "loss": 0.02926248125731945, "memory(GiB)": 21.48, "step": 9483, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.30809212877237435, "grad_norm": 0.31207075715065, "learning_rate": 8.287017224613363e-06, "loss": 0.02245873399078846, "memory(GiB)": 21.48, "step": 9484, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.30812461423512977, "grad_norm": 0.45263704657554626, "learning_rate": 8.286612440072528e-06, "loss": 0.04105766862630844, "memory(GiB)": 21.48, "step": 9485, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.3081570996978852, "grad_norm": 0.38210129737854004, "learning_rate": 8.286207617599812e-06, "loss": 0.025807589292526245, "memory(GiB)": 21.48, "step": 9486, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.3081895851606406, "grad_norm": 0.637737512588501, "learning_rate": 8.285802757199885e-06, "loss": 0.031145455315709114, "memory(GiB)": 21.48, "step": 9487, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.95809 }, { "epoch": 0.308222070623396, "grad_norm": 0.5909556746482849, "learning_rate": 8.285397858877421e-06, "loss": 0.03398797661066055, "memory(GiB)": 21.48, "step": 9488, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.958105 }, { "epoch": 0.30825455608615143, "grad_norm": 0.475212961435318, "learning_rate": 8.284992922637095e-06, "loss": 0.03833902254700661, "memory(GiB)": 21.48, "step": 9489, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.30828704154890685, "grad_norm": 0.26929575204849243, "learning_rate": 8.284587948483576e-06, "loss": 0.02887287363409996, "memory(GiB)": 21.48, "step": 9490, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.30831952701166226, "grad_norm": 0.31902745366096497, "learning_rate": 8.28418293642154e-06, "loss": 0.03148861974477768, "memory(GiB)": 21.48, "step": 9491, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958154 }, { "epoch": 0.3083520124744177, "grad_norm": 0.5347422361373901, "learning_rate": 8.283777886455662e-06, "loss": 0.0315483920276165, "memory(GiB)": 21.48, "step": 9492, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958169 }, { "epoch": 0.3083844979371731, "grad_norm": 1.0711535215377808, "learning_rate": 8.283372798590616e-06, "loss": 0.03997679799795151, "memory(GiB)": 21.48, "step": 9493, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958185 }, { "epoch": 0.3084169833999285, "grad_norm": 0.351917028427124, "learning_rate": 8.282967672831079e-06, "loss": 0.03489537537097931, "memory(GiB)": 21.48, "step": 9494, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.9582 }, { "epoch": 0.30844946886268393, "grad_norm": 0.4525497853755951, "learning_rate": 8.282562509181727e-06, "loss": 0.03078635409474373, "memory(GiB)": 21.48, "step": 9495, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.30848195432543934, "grad_norm": 0.36541640758514404, "learning_rate": 8.282157307647231e-06, "loss": 0.03024294599890709, "memory(GiB)": 21.48, "step": 9496, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.30851443978819476, "grad_norm": 0.4431873857975006, "learning_rate": 8.281752068232274e-06, "loss": 0.03313130885362625, "memory(GiB)": 21.48, "step": 9497, "token_acc": 0.972, "train_speed(iter/s)": 0.958249 }, { "epoch": 0.3085469252509502, "grad_norm": 0.4653094410896301, "learning_rate": 8.281346790941528e-06, "loss": 0.028209783136844635, "memory(GiB)": 21.48, "step": 9498, "token_acc": 0.984, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.3085794107137056, "grad_norm": 0.7100604772567749, "learning_rate": 8.280941475779676e-06, "loss": 0.036549657583236694, "memory(GiB)": 21.48, "step": 9499, "token_acc": 0.97, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.308611896176461, "grad_norm": 0.5109920501708984, "learning_rate": 8.28053612275139e-06, "loss": 0.03252408653497696, "memory(GiB)": 21.48, "step": 9500, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958298 }, { "epoch": 0.308611896176461, "eval_loss": 0.03205224126577377, "eval_runtime": 81.4166, "eval_samples_per_second": 122.211, "eval_steps_per_second": 3.82, "eval_token_acc": 0.9874965034485007, "step": 9500 }, { "epoch": 0.3086443816392164, "grad_norm": 0.8264499306678772, "learning_rate": 8.28013073186135e-06, "loss": 0.037958718836307526, "memory(GiB)": 21.48, "step": 9501, "token_acc": 0.9869343705489207, "train_speed(iter/s)": 0.949458 }, { "epoch": 0.30867686710197184, "grad_norm": 0.37927278876304626, "learning_rate": 8.279725303114236e-06, "loss": 0.033848125487565994, "memory(GiB)": 21.48, "step": 9502, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.949474 }, { "epoch": 0.30870935256472726, "grad_norm": 0.40975356101989746, "learning_rate": 8.279319836514728e-06, "loss": 0.029715951532125473, "memory(GiB)": 21.48, "step": 9503, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.949491 }, { "epoch": 0.30874183802748273, "grad_norm": 0.45304709672927856, "learning_rate": 8.278914332067503e-06, "loss": 0.026944737881422043, "memory(GiB)": 21.48, "step": 9504, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.949508 }, { "epoch": 0.30877432349023815, "grad_norm": 0.31510934233665466, "learning_rate": 8.278508789777243e-06, "loss": 0.0262899287045002, "memory(GiB)": 21.48, "step": 9505, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.949525 }, { "epoch": 0.30880680895299356, "grad_norm": 0.34953129291534424, "learning_rate": 8.278103209648626e-06, "loss": 0.025017447769641876, "memory(GiB)": 21.48, "step": 9506, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.949543 }, { "epoch": 0.308839294415749, "grad_norm": 0.6795195937156677, "learning_rate": 8.277697591686336e-06, "loss": 0.03520815074443817, "memory(GiB)": 21.48, "step": 9507, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.94956 }, { "epoch": 0.3088717798785044, "grad_norm": 0.6794294714927673, "learning_rate": 8.277291935895055e-06, "loss": 0.034768033772706985, "memory(GiB)": 21.48, "step": 9508, "token_acc": 0.99609375, "train_speed(iter/s)": 0.949575 }, { "epoch": 0.3089042653412598, "grad_norm": 0.749573290348053, "learning_rate": 8.27688624227946e-06, "loss": 0.03014538064599037, "memory(GiB)": 21.48, "step": 9509, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.949591 }, { "epoch": 0.3089367508040152, "grad_norm": 0.43505722284317017, "learning_rate": 8.276480510844239e-06, "loss": 0.04137583076953888, "memory(GiB)": 21.48, "step": 9510, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.94961 }, { "epoch": 0.30896923626677064, "grad_norm": 0.5201494693756104, "learning_rate": 8.276074741594072e-06, "loss": 0.0284645464271307, "memory(GiB)": 21.48, "step": 9511, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.949628 }, { "epoch": 0.30900172172952606, "grad_norm": 1.2984700202941895, "learning_rate": 8.275668934533641e-06, "loss": 0.03488650545477867, "memory(GiB)": 21.48, "step": 9512, "token_acc": 0.98828125, "train_speed(iter/s)": 0.949646 }, { "epoch": 0.3090342071922815, "grad_norm": 0.45925217866897583, "learning_rate": 8.275263089667628e-06, "loss": 0.02822633646428585, "memory(GiB)": 21.48, "step": 9513, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.949663 }, { "epoch": 0.3090666926550369, "grad_norm": 0.48758435249328613, "learning_rate": 8.274857207000724e-06, "loss": 0.0372302420437336, "memory(GiB)": 21.48, "step": 9514, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.949684 }, { "epoch": 0.3090991781177923, "grad_norm": 0.5113537311553955, "learning_rate": 8.274451286537607e-06, "loss": 0.025704635307192802, "memory(GiB)": 21.48, "step": 9515, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.949706 }, { "epoch": 0.3091316635805477, "grad_norm": 0.3175378441810608, "learning_rate": 8.274045328282962e-06, "loss": 0.021690353751182556, "memory(GiB)": 21.48, "step": 9516, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.949728 }, { "epoch": 0.30916414904330314, "grad_norm": 0.46150052547454834, "learning_rate": 8.273639332241479e-06, "loss": 0.028026588261127472, "memory(GiB)": 21.48, "step": 9517, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.94975 }, { "epoch": 0.30919663450605855, "grad_norm": 0.4811279773712158, "learning_rate": 8.27323329841784e-06, "loss": 0.03777947649359703, "memory(GiB)": 21.48, "step": 9518, "token_acc": 0.9716981132075472, "train_speed(iter/s)": 0.949771 }, { "epoch": 0.30922911996881397, "grad_norm": 0.5275608897209167, "learning_rate": 8.272827226816731e-06, "loss": 0.027758602052927017, "memory(GiB)": 21.48, "step": 9519, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.949793 }, { "epoch": 0.3092616054315694, "grad_norm": 0.4525096118450165, "learning_rate": 8.27242111744284e-06, "loss": 0.03421313315629959, "memory(GiB)": 21.48, "step": 9520, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.949815 }, { "epoch": 0.3092940908943248, "grad_norm": 0.6258348822593689, "learning_rate": 8.272014970300856e-06, "loss": 0.0383908674120903, "memory(GiB)": 21.48, "step": 9521, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.949837 }, { "epoch": 0.3093265763570802, "grad_norm": 0.5371283888816833, "learning_rate": 8.271608785395462e-06, "loss": 0.039321400225162506, "memory(GiB)": 21.48, "step": 9522, "token_acc": 0.9725274725274725, "train_speed(iter/s)": 0.949858 }, { "epoch": 0.30935906181983563, "grad_norm": 0.44750526547431946, "learning_rate": 8.271202562731348e-06, "loss": 0.027025369927287102, "memory(GiB)": 21.48, "step": 9523, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.949879 }, { "epoch": 0.30939154728259105, "grad_norm": 0.41284996271133423, "learning_rate": 8.270796302313204e-06, "loss": 0.032106913626194, "memory(GiB)": 21.48, "step": 9524, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.9499 }, { "epoch": 0.30942403274534647, "grad_norm": 0.5579303503036499, "learning_rate": 8.270390004145717e-06, "loss": 0.031086202710866928, "memory(GiB)": 21.48, "step": 9525, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.949921 }, { "epoch": 0.3094565182081019, "grad_norm": 0.4053260087966919, "learning_rate": 8.269983668233574e-06, "loss": 0.03424444422125816, "memory(GiB)": 21.48, "step": 9526, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.949941 }, { "epoch": 0.3094890036708573, "grad_norm": 0.42802536487579346, "learning_rate": 8.26957729458147e-06, "loss": 0.030198033899068832, "memory(GiB)": 21.48, "step": 9527, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.949962 }, { "epoch": 0.3095214891336127, "grad_norm": 0.3091951906681061, "learning_rate": 8.269170883194092e-06, "loss": 0.02752608433365822, "memory(GiB)": 21.48, "step": 9528, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.949983 }, { "epoch": 0.30955397459636813, "grad_norm": 1.6441401243209839, "learning_rate": 8.26876443407613e-06, "loss": 0.03156955540180206, "memory(GiB)": 21.48, "step": 9529, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.950004 }, { "epoch": 0.30958646005912355, "grad_norm": 0.3643653392791748, "learning_rate": 8.268357947232275e-06, "loss": 0.02468913234770298, "memory(GiB)": 21.48, "step": 9530, "token_acc": 0.99609375, "train_speed(iter/s)": 0.950026 }, { "epoch": 0.30961894552187896, "grad_norm": 0.477963387966156, "learning_rate": 8.26795142266722e-06, "loss": 0.032818082720041275, "memory(GiB)": 21.48, "step": 9531, "token_acc": 0.9875, "train_speed(iter/s)": 0.950045 }, { "epoch": 0.3096514309846344, "grad_norm": 0.3415721356868744, "learning_rate": 8.267544860385656e-06, "loss": 0.035722553730010986, "memory(GiB)": 21.48, "step": 9532, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.950068 }, { "epoch": 0.3096839164473898, "grad_norm": 0.45432716608047485, "learning_rate": 8.267138260392275e-06, "loss": 0.03304041177034378, "memory(GiB)": 21.48, "step": 9533, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.950089 }, { "epoch": 0.3097164019101452, "grad_norm": 0.3377320468425751, "learning_rate": 8.26673162269177e-06, "loss": 0.02318522334098816, "memory(GiB)": 21.48, "step": 9534, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.950111 }, { "epoch": 0.3097488873729006, "grad_norm": 0.39661556482315063, "learning_rate": 8.266324947288832e-06, "loss": 0.025890231132507324, "memory(GiB)": 21.48, "step": 9535, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95013 }, { "epoch": 0.30978137283565604, "grad_norm": 0.47833430767059326, "learning_rate": 8.26591823418816e-06, "loss": 0.02674790844321251, "memory(GiB)": 21.48, "step": 9536, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.950147 }, { "epoch": 0.30981385829841146, "grad_norm": 0.41526147723197937, "learning_rate": 8.265511483394443e-06, "loss": 0.030031166970729828, "memory(GiB)": 21.48, "step": 9537, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950132 }, { "epoch": 0.3098463437611669, "grad_norm": 0.768379807472229, "learning_rate": 8.265104694912378e-06, "loss": 0.04153720661997795, "memory(GiB)": 21.48, "step": 9538, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.950148 }, { "epoch": 0.3098788292239223, "grad_norm": 0.483910471200943, "learning_rate": 8.264697868746658e-06, "loss": 0.028471939265727997, "memory(GiB)": 21.48, "step": 9539, "token_acc": 0.996, "train_speed(iter/s)": 0.950164 }, { "epoch": 0.3099113146866777, "grad_norm": 0.6561548113822937, "learning_rate": 8.26429100490198e-06, "loss": 0.03435517102479935, "memory(GiB)": 21.48, "step": 9540, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.950179 }, { "epoch": 0.3099438001494331, "grad_norm": 0.5327438116073608, "learning_rate": 8.263884103383038e-06, "loss": 0.029541444033384323, "memory(GiB)": 21.48, "step": 9541, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950196 }, { "epoch": 0.30997628561218854, "grad_norm": 0.2991526424884796, "learning_rate": 8.263477164194532e-06, "loss": 0.021830685436725616, "memory(GiB)": 21.48, "step": 9542, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.950212 }, { "epoch": 0.31000877107494396, "grad_norm": 0.40182265639305115, "learning_rate": 8.263070187341152e-06, "loss": 0.02692762389779091, "memory(GiB)": 21.48, "step": 9543, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.950228 }, { "epoch": 0.3100412565376994, "grad_norm": 0.5364314913749695, "learning_rate": 8.262663172827602e-06, "loss": 0.030887002125382423, "memory(GiB)": 21.48, "step": 9544, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.950243 }, { "epoch": 0.3100737420004548, "grad_norm": 0.5616424083709717, "learning_rate": 8.262256120658575e-06, "loss": 0.034328922629356384, "memory(GiB)": 21.48, "step": 9545, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950259 }, { "epoch": 0.3101062274632102, "grad_norm": 0.4672033190727234, "learning_rate": 8.261849030838769e-06, "loss": 0.030515272170305252, "memory(GiB)": 21.48, "step": 9546, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950276 }, { "epoch": 0.3101387129259656, "grad_norm": 0.5479970574378967, "learning_rate": 8.261441903372885e-06, "loss": 0.03643073886632919, "memory(GiB)": 21.48, "step": 9547, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.950292 }, { "epoch": 0.31017119838872104, "grad_norm": 0.4484871029853821, "learning_rate": 8.261034738265622e-06, "loss": 0.030011679977178574, "memory(GiB)": 21.48, "step": 9548, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.950308 }, { "epoch": 0.31020368385147645, "grad_norm": 0.36677396297454834, "learning_rate": 8.260627535521676e-06, "loss": 0.03280767798423767, "memory(GiB)": 21.48, "step": 9549, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.950323 }, { "epoch": 0.31023616931423187, "grad_norm": 0.3848593235015869, "learning_rate": 8.260220295145746e-06, "loss": 0.02493700385093689, "memory(GiB)": 21.48, "step": 9550, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.950341 }, { "epoch": 0.3102686547769873, "grad_norm": 0.6375542879104614, "learning_rate": 8.259813017142538e-06, "loss": 0.04105573892593384, "memory(GiB)": 21.48, "step": 9551, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.950356 }, { "epoch": 0.3103011402397427, "grad_norm": 0.3494105041027069, "learning_rate": 8.259405701516749e-06, "loss": 0.02565736509859562, "memory(GiB)": 21.48, "step": 9552, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95037 }, { "epoch": 0.3103336257024981, "grad_norm": 0.462943971157074, "learning_rate": 8.258998348273078e-06, "loss": 0.036596886813640594, "memory(GiB)": 21.48, "step": 9553, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.950385 }, { "epoch": 0.31036611116525353, "grad_norm": 0.43353551626205444, "learning_rate": 8.25859095741623e-06, "loss": 0.03185155615210533, "memory(GiB)": 21.48, "step": 9554, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.950399 }, { "epoch": 0.31039859662800895, "grad_norm": 0.516871452331543, "learning_rate": 8.258183528950905e-06, "loss": 0.03272106871008873, "memory(GiB)": 21.48, "step": 9555, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950415 }, { "epoch": 0.31043108209076437, "grad_norm": 0.480386346578598, "learning_rate": 8.257776062881803e-06, "loss": 0.03257529065012932, "memory(GiB)": 21.48, "step": 9556, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.950431 }, { "epoch": 0.3104635675535198, "grad_norm": 0.4355912506580353, "learning_rate": 8.257368559213633e-06, "loss": 0.03918861225247383, "memory(GiB)": 21.48, "step": 9557, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.950448 }, { "epoch": 0.3104960530162752, "grad_norm": 0.4605962336063385, "learning_rate": 8.25696101795109e-06, "loss": 0.03630690276622772, "memory(GiB)": 21.48, "step": 9558, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950466 }, { "epoch": 0.3105285384790306, "grad_norm": 0.7679468989372253, "learning_rate": 8.256553439098886e-06, "loss": 0.031064793467521667, "memory(GiB)": 21.48, "step": 9559, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950482 }, { "epoch": 0.31056102394178603, "grad_norm": 0.3735743761062622, "learning_rate": 8.256145822661719e-06, "loss": 0.030024299398064613, "memory(GiB)": 21.48, "step": 9560, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.950499 }, { "epoch": 0.31059350940454145, "grad_norm": 0.5241806507110596, "learning_rate": 8.255738168644296e-06, "loss": 0.027380434796214104, "memory(GiB)": 21.48, "step": 9561, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.950516 }, { "epoch": 0.31062599486729686, "grad_norm": 0.36898863315582275, "learning_rate": 8.25533047705132e-06, "loss": 0.026179304346442223, "memory(GiB)": 21.48, "step": 9562, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.950531 }, { "epoch": 0.3106584803300523, "grad_norm": 0.40517550706863403, "learning_rate": 8.254922747887497e-06, "loss": 0.030745871365070343, "memory(GiB)": 21.48, "step": 9563, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.950547 }, { "epoch": 0.3106909657928077, "grad_norm": 3.911062717437744, "learning_rate": 8.254514981157535e-06, "loss": 0.025611963123083115, "memory(GiB)": 21.48, "step": 9564, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950564 }, { "epoch": 0.3107234512555631, "grad_norm": 0.3747401833534241, "learning_rate": 8.254107176866138e-06, "loss": 0.02562112919986248, "memory(GiB)": 21.48, "step": 9565, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.950581 }, { "epoch": 0.3107559367183185, "grad_norm": 0.4676370620727539, "learning_rate": 8.253699335018011e-06, "loss": 0.029884416610002518, "memory(GiB)": 21.48, "step": 9566, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.950599 }, { "epoch": 0.31078842218107394, "grad_norm": 0.5333181619644165, "learning_rate": 8.253291455617865e-06, "loss": 0.0339893102645874, "memory(GiB)": 21.48, "step": 9567, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.950617 }, { "epoch": 0.3108209076438294, "grad_norm": 0.36934036016464233, "learning_rate": 8.252883538670405e-06, "loss": 0.03201325982809067, "memory(GiB)": 21.48, "step": 9568, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.950634 }, { "epoch": 0.31085339310658483, "grad_norm": 0.33800601959228516, "learning_rate": 8.252475584180339e-06, "loss": 0.027214469388127327, "memory(GiB)": 21.48, "step": 9569, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.950651 }, { "epoch": 0.31088587856934025, "grad_norm": 0.3956739902496338, "learning_rate": 8.252067592152375e-06, "loss": 0.0254257433116436, "memory(GiB)": 21.48, "step": 9570, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.950667 }, { "epoch": 0.31091836403209566, "grad_norm": 0.46856772899627686, "learning_rate": 8.251659562591224e-06, "loss": 0.036331821233034134, "memory(GiB)": 21.48, "step": 9571, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.950679 }, { "epoch": 0.3109508494948511, "grad_norm": 0.6199150085449219, "learning_rate": 8.251251495501593e-06, "loss": 0.03674229979515076, "memory(GiB)": 21.48, "step": 9572, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.950694 }, { "epoch": 0.3109833349576065, "grad_norm": 0.3810937702655792, "learning_rate": 8.250843390888192e-06, "loss": 0.02753915637731552, "memory(GiB)": 21.48, "step": 9573, "token_acc": 1.0, "train_speed(iter/s)": 0.950712 }, { "epoch": 0.3110158204203619, "grad_norm": 0.5077351927757263, "learning_rate": 8.250435248755732e-06, "loss": 0.03768976777791977, "memory(GiB)": 21.48, "step": 9574, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.950729 }, { "epoch": 0.3110483058831173, "grad_norm": 0.36901476979255676, "learning_rate": 8.250027069108923e-06, "loss": 0.02597108855843544, "memory(GiB)": 21.48, "step": 9575, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.950748 }, { "epoch": 0.31108079134587274, "grad_norm": 0.3774934411048889, "learning_rate": 8.249618851952476e-06, "loss": 0.027723422273993492, "memory(GiB)": 21.48, "step": 9576, "token_acc": 1.0, "train_speed(iter/s)": 0.950767 }, { "epoch": 0.31111327680862816, "grad_norm": 0.5990731120109558, "learning_rate": 8.249210597291101e-06, "loss": 0.03737039119005203, "memory(GiB)": 21.48, "step": 9577, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.950787 }, { "epoch": 0.3111457622713836, "grad_norm": 0.48840487003326416, "learning_rate": 8.248802305129515e-06, "loss": 0.03338243067264557, "memory(GiB)": 21.48, "step": 9578, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.950808 }, { "epoch": 0.311178247734139, "grad_norm": 0.4787289500236511, "learning_rate": 8.248393975472422e-06, "loss": 0.02818339318037033, "memory(GiB)": 21.48, "step": 9579, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.950829 }, { "epoch": 0.3112107331968944, "grad_norm": 0.5654789805412292, "learning_rate": 8.24798560832454e-06, "loss": 0.03296701982617378, "memory(GiB)": 21.48, "step": 9580, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.95085 }, { "epoch": 0.3112432186596498, "grad_norm": 0.5831627249717712, "learning_rate": 8.24757720369058e-06, "loss": 0.03131307661533356, "memory(GiB)": 21.48, "step": 9581, "token_acc": 1.0, "train_speed(iter/s)": 0.950872 }, { "epoch": 0.31127570412240524, "grad_norm": 0.2816547155380249, "learning_rate": 8.247168761575258e-06, "loss": 0.02392040379345417, "memory(GiB)": 21.48, "step": 9582, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.950894 }, { "epoch": 0.31130818958516066, "grad_norm": 0.4923821985721588, "learning_rate": 8.246760281983286e-06, "loss": 0.032113030552864075, "memory(GiB)": 21.48, "step": 9583, "token_acc": 0.995, "train_speed(iter/s)": 0.950913 }, { "epoch": 0.31134067504791607, "grad_norm": 0.3682733476161957, "learning_rate": 8.24635176491938e-06, "loss": 0.030172839760780334, "memory(GiB)": 21.48, "step": 9584, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.950934 }, { "epoch": 0.3113731605106715, "grad_norm": 2.187335252761841, "learning_rate": 8.245943210388254e-06, "loss": 0.031014813110232353, "memory(GiB)": 21.48, "step": 9585, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.950956 }, { "epoch": 0.3114056459734269, "grad_norm": 0.3580102324485779, "learning_rate": 8.245534618394622e-06, "loss": 0.02935958467423916, "memory(GiB)": 21.48, "step": 9586, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.950978 }, { "epoch": 0.3114381314361823, "grad_norm": 0.5550360083580017, "learning_rate": 8.245125988943199e-06, "loss": 0.027388446033000946, "memory(GiB)": 21.48, "step": 9587, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.950999 }, { "epoch": 0.31147061689893774, "grad_norm": 0.2972431182861328, "learning_rate": 8.244717322038705e-06, "loss": 0.02398439310491085, "memory(GiB)": 21.48, "step": 9588, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95102 }, { "epoch": 0.31150310236169315, "grad_norm": 0.5030562877655029, "learning_rate": 8.244308617685855e-06, "loss": 0.025631334632635117, "memory(GiB)": 21.48, "step": 9589, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951041 }, { "epoch": 0.31153558782444857, "grad_norm": 0.6672141551971436, "learning_rate": 8.243899875889362e-06, "loss": 0.028459105640649796, "memory(GiB)": 21.48, "step": 9590, "token_acc": 1.0, "train_speed(iter/s)": 0.951062 }, { "epoch": 0.311568073287204, "grad_norm": 0.526606023311615, "learning_rate": 8.24349109665395e-06, "loss": 0.03028196282684803, "memory(GiB)": 21.48, "step": 9591, "token_acc": 0.9819494584837545, "train_speed(iter/s)": 0.951084 }, { "epoch": 0.3116005587499594, "grad_norm": 0.44412028789520264, "learning_rate": 8.243082279984333e-06, "loss": 0.02652796357870102, "memory(GiB)": 21.48, "step": 9592, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.951103 }, { "epoch": 0.3116330442127148, "grad_norm": 0.45028945803642273, "learning_rate": 8.242673425885231e-06, "loss": 0.0352875217795372, "memory(GiB)": 21.48, "step": 9593, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.951112 }, { "epoch": 0.31166552967547023, "grad_norm": 4.66597843170166, "learning_rate": 8.242264534361359e-06, "loss": 0.04341413825750351, "memory(GiB)": 21.48, "step": 9594, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.951129 }, { "epoch": 0.31169801513822565, "grad_norm": 0.6599665880203247, "learning_rate": 8.241855605417441e-06, "loss": 0.03793463110923767, "memory(GiB)": 21.48, "step": 9595, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.951142 }, { "epoch": 0.31173050060098106, "grad_norm": 0.4945369362831116, "learning_rate": 8.241446639058193e-06, "loss": 0.034665755927562714, "memory(GiB)": 21.48, "step": 9596, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.951154 }, { "epoch": 0.3117629860637365, "grad_norm": 0.5340588092803955, "learning_rate": 8.241037635288337e-06, "loss": 0.03576168045401573, "memory(GiB)": 21.48, "step": 9597, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.951164 }, { "epoch": 0.3117954715264919, "grad_norm": 0.4168926179409027, "learning_rate": 8.240628594112592e-06, "loss": 0.025223243981599808, "memory(GiB)": 21.48, "step": 9598, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.951176 }, { "epoch": 0.3118279569892473, "grad_norm": 0.4318320155143738, "learning_rate": 8.240219515535681e-06, "loss": 0.03546861559152603, "memory(GiB)": 21.48, "step": 9599, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.951187 }, { "epoch": 0.31186044245200273, "grad_norm": 0.37141579389572144, "learning_rate": 8.239810399562324e-06, "loss": 0.023248087614774704, "memory(GiB)": 21.48, "step": 9600, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.951198 }, { "epoch": 0.31189292791475814, "grad_norm": 0.46321073174476624, "learning_rate": 8.239401246197243e-06, "loss": 0.03577937185764313, "memory(GiB)": 21.48, "step": 9601, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.951208 }, { "epoch": 0.31192541337751356, "grad_norm": 0.42461153864860535, "learning_rate": 8.238992055445159e-06, "loss": 0.02470581792294979, "memory(GiB)": 21.48, "step": 9602, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.951221 }, { "epoch": 0.311957898840269, "grad_norm": 0.4989231526851654, "learning_rate": 8.238582827310797e-06, "loss": 0.027387253940105438, "memory(GiB)": 21.48, "step": 9603, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.951238 }, { "epoch": 0.3119903843030244, "grad_norm": 0.38562750816345215, "learning_rate": 8.238173561798876e-06, "loss": 0.02520092763006687, "memory(GiB)": 21.48, "step": 9604, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.951254 }, { "epoch": 0.3120228697657798, "grad_norm": 0.5945953130722046, "learning_rate": 8.237764258914125e-06, "loss": 0.03603237494826317, "memory(GiB)": 21.48, "step": 9605, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.951271 }, { "epoch": 0.3120553552285352, "grad_norm": 0.3914027810096741, "learning_rate": 8.237354918661264e-06, "loss": 0.037335045635700226, "memory(GiB)": 21.48, "step": 9606, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.951287 }, { "epoch": 0.31208784069129064, "grad_norm": 0.4555443227291107, "learning_rate": 8.236945541045018e-06, "loss": 0.030037399381399155, "memory(GiB)": 21.48, "step": 9607, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.951302 }, { "epoch": 0.31212032615404606, "grad_norm": 0.47616928815841675, "learning_rate": 8.236536126070112e-06, "loss": 0.03681676834821701, "memory(GiB)": 21.48, "step": 9608, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.951319 }, { "epoch": 0.3121528116168015, "grad_norm": 0.5612488985061646, "learning_rate": 8.236126673741272e-06, "loss": 0.0306481271982193, "memory(GiB)": 21.48, "step": 9609, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.951336 }, { "epoch": 0.3121852970795569, "grad_norm": 0.4077099561691284, "learning_rate": 8.235717184063222e-06, "loss": 0.03628804534673691, "memory(GiB)": 21.48, "step": 9610, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.951353 }, { "epoch": 0.3122177825423123, "grad_norm": 0.42313888669013977, "learning_rate": 8.23530765704069e-06, "loss": 0.028869567438960075, "memory(GiB)": 21.48, "step": 9611, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.951369 }, { "epoch": 0.3122502680050677, "grad_norm": 0.3603302240371704, "learning_rate": 8.2348980926784e-06, "loss": 0.024756796658039093, "memory(GiB)": 21.48, "step": 9612, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951385 }, { "epoch": 0.31228275346782314, "grad_norm": 2.373899459838867, "learning_rate": 8.23448849098108e-06, "loss": 0.028288576751947403, "memory(GiB)": 21.48, "step": 9613, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.951402 }, { "epoch": 0.31231523893057855, "grad_norm": 0.8876637816429138, "learning_rate": 8.23407885195346e-06, "loss": 0.06083950400352478, "memory(GiB)": 21.48, "step": 9614, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951416 }, { "epoch": 0.31234772439333397, "grad_norm": 0.6205797791481018, "learning_rate": 8.233669175600264e-06, "loss": 0.03681530803442001, "memory(GiB)": 21.48, "step": 9615, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.951434 }, { "epoch": 0.3123802098560894, "grad_norm": 0.3709196448326111, "learning_rate": 8.233259461926222e-06, "loss": 0.027585651725530624, "memory(GiB)": 21.48, "step": 9616, "token_acc": 1.0, "train_speed(iter/s)": 0.951451 }, { "epoch": 0.3124126953188448, "grad_norm": 0.5528913140296936, "learning_rate": 8.232849710936063e-06, "loss": 0.03557967022061348, "memory(GiB)": 21.48, "step": 9617, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.951468 }, { "epoch": 0.3124451807816002, "grad_norm": 0.40381473302841187, "learning_rate": 8.232439922634515e-06, "loss": 0.027434328570961952, "memory(GiB)": 21.48, "step": 9618, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.951484 }, { "epoch": 0.31247766624435563, "grad_norm": 0.8389521241188049, "learning_rate": 8.232030097026308e-06, "loss": 0.04500153660774231, "memory(GiB)": 21.48, "step": 9619, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.9515 }, { "epoch": 0.31251015170711105, "grad_norm": 0.38941943645477295, "learning_rate": 8.231620234116172e-06, "loss": 0.02499416656792164, "memory(GiB)": 21.48, "step": 9620, "token_acc": 0.9921875, "train_speed(iter/s)": 0.951515 }, { "epoch": 0.31254263716986647, "grad_norm": 0.4357315003871918, "learning_rate": 8.231210333908837e-06, "loss": 0.03548457473516464, "memory(GiB)": 21.48, "step": 9621, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.951531 }, { "epoch": 0.3125751226326219, "grad_norm": 2.3755006790161133, "learning_rate": 8.230800396409033e-06, "loss": 0.04403964802622795, "memory(GiB)": 21.48, "step": 9622, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.951548 }, { "epoch": 0.3126076080953773, "grad_norm": 0.39497873187065125, "learning_rate": 8.230390421621494e-06, "loss": 0.03688672184944153, "memory(GiB)": 21.48, "step": 9623, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.951564 }, { "epoch": 0.3126400935581327, "grad_norm": 0.38583511114120483, "learning_rate": 8.22998040955095e-06, "loss": 0.028765203431248665, "memory(GiB)": 21.48, "step": 9624, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.95158 }, { "epoch": 0.31267257902088813, "grad_norm": 0.42756935954093933, "learning_rate": 8.229570360202131e-06, "loss": 0.033708322793245316, "memory(GiB)": 21.48, "step": 9625, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.951596 }, { "epoch": 0.31270506448364355, "grad_norm": 0.36180585622787476, "learning_rate": 8.229160273579773e-06, "loss": 0.028188077732920647, "memory(GiB)": 21.48, "step": 9626, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.951613 }, { "epoch": 0.31273754994639896, "grad_norm": 0.3107985854148865, "learning_rate": 8.228750149688608e-06, "loss": 0.02845538966357708, "memory(GiB)": 21.48, "step": 9627, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.95163 }, { "epoch": 0.3127700354091544, "grad_norm": 0.5933916568756104, "learning_rate": 8.228339988533368e-06, "loss": 0.02805698662996292, "memory(GiB)": 21.48, "step": 9628, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.951647 }, { "epoch": 0.3128025208719098, "grad_norm": 0.4856196939945221, "learning_rate": 8.227929790118785e-06, "loss": 0.03619586303830147, "memory(GiB)": 21.48, "step": 9629, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951665 }, { "epoch": 0.3128350063346652, "grad_norm": 0.4262484610080719, "learning_rate": 8.2275195544496e-06, "loss": 0.03831550478935242, "memory(GiB)": 21.48, "step": 9630, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.951682 }, { "epoch": 0.3128674917974206, "grad_norm": 0.6958052515983582, "learning_rate": 8.22710928153054e-06, "loss": 0.03256630897521973, "memory(GiB)": 21.48, "step": 9631, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.9517 }, { "epoch": 0.3128999772601761, "grad_norm": 0.518683135509491, "learning_rate": 8.226698971366347e-06, "loss": 0.03177502378821373, "memory(GiB)": 21.48, "step": 9632, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.951716 }, { "epoch": 0.3129324627229315, "grad_norm": 0.367037832736969, "learning_rate": 8.226288623961752e-06, "loss": 0.027095511555671692, "memory(GiB)": 21.48, "step": 9633, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.951732 }, { "epoch": 0.31296494818568693, "grad_norm": 0.40760675072669983, "learning_rate": 8.22587823932149e-06, "loss": 0.034647852182388306, "memory(GiB)": 21.48, "step": 9634, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951745 }, { "epoch": 0.31299743364844235, "grad_norm": 0.5407686829566956, "learning_rate": 8.225467817450302e-06, "loss": 0.04291301593184471, "memory(GiB)": 21.48, "step": 9635, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.951762 }, { "epoch": 0.31302991911119776, "grad_norm": 0.4422573447227478, "learning_rate": 8.225057358352922e-06, "loss": 0.03090585581958294, "memory(GiB)": 21.48, "step": 9636, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.951779 }, { "epoch": 0.3130624045739532, "grad_norm": 0.8835350275039673, "learning_rate": 8.224646862034087e-06, "loss": 0.03445178642868996, "memory(GiB)": 21.48, "step": 9637, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.951794 }, { "epoch": 0.3130948900367086, "grad_norm": 0.4786650240421295, "learning_rate": 8.224236328498535e-06, "loss": 0.03155869245529175, "memory(GiB)": 21.48, "step": 9638, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.951812 }, { "epoch": 0.313127375499464, "grad_norm": 0.34428101778030396, "learning_rate": 8.223825757751005e-06, "loss": 0.029027707874774933, "memory(GiB)": 21.48, "step": 9639, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.951832 }, { "epoch": 0.3131598609622194, "grad_norm": 0.3199554979801178, "learning_rate": 8.223415149796234e-06, "loss": 0.02826869487762451, "memory(GiB)": 21.48, "step": 9640, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.951853 }, { "epoch": 0.31319234642497484, "grad_norm": 0.32743269205093384, "learning_rate": 8.223004504638962e-06, "loss": 0.023122821003198624, "memory(GiB)": 21.48, "step": 9641, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.951874 }, { "epoch": 0.31322483188773026, "grad_norm": 1.0811525583267212, "learning_rate": 8.22259382228393e-06, "loss": 0.03688894957304001, "memory(GiB)": 21.48, "step": 9642, "token_acc": 0.9715909090909091, "train_speed(iter/s)": 0.951896 }, { "epoch": 0.3132573173504857, "grad_norm": 0.3538627028465271, "learning_rate": 8.222183102735871e-06, "loss": 0.02560230903327465, "memory(GiB)": 21.48, "step": 9643, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.951918 }, { "epoch": 0.3132898028132411, "grad_norm": 0.4156228303909302, "learning_rate": 8.221772345999535e-06, "loss": 0.027548804879188538, "memory(GiB)": 21.48, "step": 9644, "token_acc": 0.988, "train_speed(iter/s)": 0.95194 }, { "epoch": 0.3133222882759965, "grad_norm": 0.2610287368297577, "learning_rate": 8.221361552079657e-06, "loss": 0.018337607383728027, "memory(GiB)": 21.48, "step": 9645, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.951961 }, { "epoch": 0.3133547737387519, "grad_norm": 0.370842844247818, "learning_rate": 8.220950720980978e-06, "loss": 0.029328444972634315, "memory(GiB)": 21.48, "step": 9646, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.951981 }, { "epoch": 0.31338725920150734, "grad_norm": 0.45855075120925903, "learning_rate": 8.220539852708243e-06, "loss": 0.04061812534928322, "memory(GiB)": 21.48, "step": 9647, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952002 }, { "epoch": 0.31341974466426276, "grad_norm": 0.6568278074264526, "learning_rate": 8.220128947266189e-06, "loss": 0.029368754476308823, "memory(GiB)": 21.48, "step": 9648, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.952024 }, { "epoch": 0.31345223012701817, "grad_norm": 0.4467460513114929, "learning_rate": 8.219718004659563e-06, "loss": 0.02934001386165619, "memory(GiB)": 21.48, "step": 9649, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.952044 }, { "epoch": 0.3134847155897736, "grad_norm": 0.5751178860664368, "learning_rate": 8.219307024893104e-06, "loss": 0.03705665096640587, "memory(GiB)": 21.48, "step": 9650, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.952066 }, { "epoch": 0.313517201052529, "grad_norm": 0.4723166227340698, "learning_rate": 8.218896007971557e-06, "loss": 0.03516478091478348, "memory(GiB)": 21.48, "step": 9651, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.952088 }, { "epoch": 0.3135496865152844, "grad_norm": 0.4166044294834137, "learning_rate": 8.218484953899668e-06, "loss": 0.030187848955392838, "memory(GiB)": 21.48, "step": 9652, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.952108 }, { "epoch": 0.31358217197803984, "grad_norm": 0.7691404819488525, "learning_rate": 8.218073862682177e-06, "loss": 0.033934593200683594, "memory(GiB)": 21.48, "step": 9653, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.952126 }, { "epoch": 0.31361465744079525, "grad_norm": 0.3365958034992218, "learning_rate": 8.21766273432383e-06, "loss": 0.024155184626579285, "memory(GiB)": 21.48, "step": 9654, "token_acc": 0.9875776397515528, "train_speed(iter/s)": 0.952143 }, { "epoch": 0.31364714290355067, "grad_norm": 0.486350953578949, "learning_rate": 8.217251568829373e-06, "loss": 0.034886252135038376, "memory(GiB)": 21.48, "step": 9655, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.95216 }, { "epoch": 0.3136796283663061, "grad_norm": 0.4343497157096863, "learning_rate": 8.216840366203551e-06, "loss": 0.025443807244300842, "memory(GiB)": 21.48, "step": 9656, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.952176 }, { "epoch": 0.3137121138290615, "grad_norm": 0.48236510157585144, "learning_rate": 8.216429126451108e-06, "loss": 0.038044996559619904, "memory(GiB)": 21.48, "step": 9657, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.952193 }, { "epoch": 0.3137445992918169, "grad_norm": 0.4511639177799225, "learning_rate": 8.216017849576794e-06, "loss": 0.029255656525492668, "memory(GiB)": 21.48, "step": 9658, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.952208 }, { "epoch": 0.31377708475457233, "grad_norm": 0.474417507648468, "learning_rate": 8.215606535585353e-06, "loss": 0.03881823271512985, "memory(GiB)": 21.48, "step": 9659, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.952222 }, { "epoch": 0.31380957021732775, "grad_norm": 0.45915427803993225, "learning_rate": 8.215195184481533e-06, "loss": 0.031228814274072647, "memory(GiB)": 21.48, "step": 9660, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.95224 }, { "epoch": 0.31384205568008317, "grad_norm": 0.6056409478187561, "learning_rate": 8.21478379627008e-06, "loss": 0.038057416677474976, "memory(GiB)": 21.48, "step": 9661, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.952257 }, { "epoch": 0.3138745411428386, "grad_norm": 0.31502199172973633, "learning_rate": 8.214372370955744e-06, "loss": 0.020908329635858536, "memory(GiB)": 21.48, "step": 9662, "token_acc": 1.0, "train_speed(iter/s)": 0.952273 }, { "epoch": 0.313907026605594, "grad_norm": 0.39117202162742615, "learning_rate": 8.213960908543272e-06, "loss": 0.026351310312747955, "memory(GiB)": 21.48, "step": 9663, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.952289 }, { "epoch": 0.3139395120683494, "grad_norm": 0.481781929731369, "learning_rate": 8.213549409037412e-06, "loss": 0.030672438442707062, "memory(GiB)": 21.48, "step": 9664, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952306 }, { "epoch": 0.31397199753110483, "grad_norm": 0.3937588334083557, "learning_rate": 8.213137872442916e-06, "loss": 0.023437218740582466, "memory(GiB)": 21.48, "step": 9665, "token_acc": 1.0, "train_speed(iter/s)": 0.952321 }, { "epoch": 0.31400448299386025, "grad_norm": 0.5128741264343262, "learning_rate": 8.212726298764532e-06, "loss": 0.03739699721336365, "memory(GiB)": 21.48, "step": 9666, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.952336 }, { "epoch": 0.31403696845661566, "grad_norm": 0.33960017561912537, "learning_rate": 8.212314688007012e-06, "loss": 0.024144228547811508, "memory(GiB)": 21.48, "step": 9667, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952353 }, { "epoch": 0.3140694539193711, "grad_norm": 0.555142343044281, "learning_rate": 8.211903040175103e-06, "loss": 0.02943049743771553, "memory(GiB)": 21.48, "step": 9668, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.952368 }, { "epoch": 0.3141019393821265, "grad_norm": 0.437411904335022, "learning_rate": 8.211491355273558e-06, "loss": 0.025622859597206116, "memory(GiB)": 21.48, "step": 9669, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.952382 }, { "epoch": 0.3141344248448819, "grad_norm": 0.41771018505096436, "learning_rate": 8.211079633307128e-06, "loss": 0.03423226997256279, "memory(GiB)": 21.48, "step": 9670, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.952397 }, { "epoch": 0.3141669103076373, "grad_norm": 0.3355371952056885, "learning_rate": 8.210667874280566e-06, "loss": 0.02210341952741146, "memory(GiB)": 21.48, "step": 9671, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.952413 }, { "epoch": 0.31419939577039274, "grad_norm": 0.4596092700958252, "learning_rate": 8.210256078198623e-06, "loss": 0.033109381794929504, "memory(GiB)": 21.48, "step": 9672, "token_acc": 1.0, "train_speed(iter/s)": 0.95243 }, { "epoch": 0.31423188123314816, "grad_norm": 0.4372895359992981, "learning_rate": 8.209844245066055e-06, "loss": 0.026070933789014816, "memory(GiB)": 21.48, "step": 9673, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95245 }, { "epoch": 0.3142643666959036, "grad_norm": 0.3685052990913391, "learning_rate": 8.209432374887606e-06, "loss": 0.023374654352664948, "memory(GiB)": 21.48, "step": 9674, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.95247 }, { "epoch": 0.314296852158659, "grad_norm": 0.7741453051567078, "learning_rate": 8.20902046766804e-06, "loss": 0.03346765786409378, "memory(GiB)": 21.48, "step": 9675, "token_acc": 0.9968652037617555, "train_speed(iter/s)": 0.952491 }, { "epoch": 0.3143293376214144, "grad_norm": 0.8351407051086426, "learning_rate": 8.208608523412105e-06, "loss": 0.032210346311330795, "memory(GiB)": 21.48, "step": 9676, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.952512 }, { "epoch": 0.3143618230841698, "grad_norm": 0.3983357846736908, "learning_rate": 8.208196542124559e-06, "loss": 0.029618585482239723, "memory(GiB)": 21.48, "step": 9677, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95253 }, { "epoch": 0.31439430854692524, "grad_norm": 0.5722705721855164, "learning_rate": 8.207784523810154e-06, "loss": 0.0335114561021328, "memory(GiB)": 21.48, "step": 9678, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.952546 }, { "epoch": 0.31442679400968065, "grad_norm": 1.704095721244812, "learning_rate": 8.207372468473644e-06, "loss": 0.01956193894147873, "memory(GiB)": 21.48, "step": 9679, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.952561 }, { "epoch": 0.31445927947243607, "grad_norm": 0.5208066701889038, "learning_rate": 8.20696037611979e-06, "loss": 0.026782188564538956, "memory(GiB)": 21.48, "step": 9680, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.952578 }, { "epoch": 0.3144917649351915, "grad_norm": 0.5610830187797546, "learning_rate": 8.206548246753342e-06, "loss": 0.04481511563062668, "memory(GiB)": 21.48, "step": 9681, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.952595 }, { "epoch": 0.3145242503979469, "grad_norm": 0.5040581226348877, "learning_rate": 8.20613608037906e-06, "loss": 0.03708527982234955, "memory(GiB)": 21.48, "step": 9682, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.952611 }, { "epoch": 0.3145567358607023, "grad_norm": 0.43454477190971375, "learning_rate": 8.205723877001701e-06, "loss": 0.029071614146232605, "memory(GiB)": 21.48, "step": 9683, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.952628 }, { "epoch": 0.31458922132345774, "grad_norm": 0.40810832381248474, "learning_rate": 8.20531163662602e-06, "loss": 0.033580511808395386, "memory(GiB)": 21.48, "step": 9684, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.952645 }, { "epoch": 0.31462170678621315, "grad_norm": 0.36394190788269043, "learning_rate": 8.204899359256779e-06, "loss": 0.025635283440351486, "memory(GiB)": 21.48, "step": 9685, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.952662 }, { "epoch": 0.31465419224896857, "grad_norm": 0.5399007797241211, "learning_rate": 8.204487044898732e-06, "loss": 0.027574174106121063, "memory(GiB)": 21.48, "step": 9686, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95268 }, { "epoch": 0.314686677711724, "grad_norm": 0.35283347964286804, "learning_rate": 8.20407469355664e-06, "loss": 0.028689615428447723, "memory(GiB)": 21.48, "step": 9687, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.952696 }, { "epoch": 0.3147191631744794, "grad_norm": 0.4386613965034485, "learning_rate": 8.20366230523526e-06, "loss": 0.038513343781232834, "memory(GiB)": 21.48, "step": 9688, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.952712 }, { "epoch": 0.3147516486372348, "grad_norm": 0.4331371486186981, "learning_rate": 8.203249879939354e-06, "loss": 0.029704460874199867, "memory(GiB)": 21.48, "step": 9689, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.952728 }, { "epoch": 0.31478413409999023, "grad_norm": 0.3718836307525635, "learning_rate": 8.202837417673681e-06, "loss": 0.02809077501296997, "memory(GiB)": 21.48, "step": 9690, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.952744 }, { "epoch": 0.31481661956274565, "grad_norm": 0.46022018790245056, "learning_rate": 8.202424918443002e-06, "loss": 0.0385233536362648, "memory(GiB)": 21.48, "step": 9691, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.952762 }, { "epoch": 0.31484910502550106, "grad_norm": 0.4092395603656769, "learning_rate": 8.202012382252076e-06, "loss": 0.03258819878101349, "memory(GiB)": 21.48, "step": 9692, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.952781 }, { "epoch": 0.3148815904882565, "grad_norm": 0.267702579498291, "learning_rate": 8.201599809105665e-06, "loss": 0.019803987815976143, "memory(GiB)": 21.48, "step": 9693, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.952798 }, { "epoch": 0.3149140759510119, "grad_norm": 0.5017937421798706, "learning_rate": 8.20118719900853e-06, "loss": 0.03778839856386185, "memory(GiB)": 21.48, "step": 9694, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.952815 }, { "epoch": 0.3149465614137673, "grad_norm": 0.4515353739261627, "learning_rate": 8.200774551965435e-06, "loss": 0.033894021064043045, "memory(GiB)": 21.48, "step": 9695, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.952831 }, { "epoch": 0.3149790468765228, "grad_norm": 0.5507225394248962, "learning_rate": 8.200361867981143e-06, "loss": 0.03450474515557289, "memory(GiB)": 21.48, "step": 9696, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.952845 }, { "epoch": 0.3150115323392782, "grad_norm": 0.32544073462486267, "learning_rate": 8.199949147060413e-06, "loss": 0.025435645133256912, "memory(GiB)": 21.48, "step": 9697, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.952862 }, { "epoch": 0.3150440178020336, "grad_norm": 0.44102540612220764, "learning_rate": 8.199536389208012e-06, "loss": 0.028445778414607048, "memory(GiB)": 21.48, "step": 9698, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.952878 }, { "epoch": 0.31507650326478903, "grad_norm": 0.8545432686805725, "learning_rate": 8.199123594428704e-06, "loss": 0.04345221072435379, "memory(GiB)": 21.48, "step": 9699, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.952894 }, { "epoch": 0.31510898872754445, "grad_norm": 0.4029851257801056, "learning_rate": 8.198710762727249e-06, "loss": 0.033282794058322906, "memory(GiB)": 21.48, "step": 9700, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.952911 }, { "epoch": 0.31514147419029986, "grad_norm": 0.7110683917999268, "learning_rate": 8.198297894108415e-06, "loss": 0.03195023536682129, "memory(GiB)": 21.48, "step": 9701, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952933 }, { "epoch": 0.3151739596530553, "grad_norm": 0.42025795578956604, "learning_rate": 8.197884988576967e-06, "loss": 0.028059404343366623, "memory(GiB)": 21.48, "step": 9702, "token_acc": 0.98, "train_speed(iter/s)": 0.952953 }, { "epoch": 0.3152064451158107, "grad_norm": 0.45946380496025085, "learning_rate": 8.19747204613767e-06, "loss": 0.030897732824087143, "memory(GiB)": 21.48, "step": 9703, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.952975 }, { "epoch": 0.3152389305785661, "grad_norm": 0.4355272948741913, "learning_rate": 8.19705906679529e-06, "loss": 0.02763361483812332, "memory(GiB)": 21.48, "step": 9704, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.952994 }, { "epoch": 0.31527141604132153, "grad_norm": 0.3482096195220947, "learning_rate": 8.19664605055459e-06, "loss": 0.029177598655223846, "memory(GiB)": 21.48, "step": 9705, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.953016 }, { "epoch": 0.31530390150407694, "grad_norm": 0.4027702808380127, "learning_rate": 8.196232997420344e-06, "loss": 0.026652373373508453, "memory(GiB)": 21.48, "step": 9706, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953038 }, { "epoch": 0.31533638696683236, "grad_norm": 0.268744558095932, "learning_rate": 8.195819907397313e-06, "loss": 0.018139956519007683, "memory(GiB)": 21.48, "step": 9707, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.953059 }, { "epoch": 0.3153688724295878, "grad_norm": 0.40668153762817383, "learning_rate": 8.195406780490267e-06, "loss": 0.027434632182121277, "memory(GiB)": 21.48, "step": 9708, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95308 }, { "epoch": 0.3154013578923432, "grad_norm": 0.3795859217643738, "learning_rate": 8.194993616703972e-06, "loss": 0.03292904049158096, "memory(GiB)": 21.48, "step": 9709, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953101 }, { "epoch": 0.3154338433550986, "grad_norm": 0.4034896492958069, "learning_rate": 8.1945804160432e-06, "loss": 0.022791756317019463, "memory(GiB)": 21.48, "step": 9710, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.953122 }, { "epoch": 0.315466328817854, "grad_norm": 0.3957640528678894, "learning_rate": 8.194167178512719e-06, "loss": 0.035861823707818985, "memory(GiB)": 21.48, "step": 9711, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.953142 }, { "epoch": 0.31549881428060944, "grad_norm": 0.3575497269630432, "learning_rate": 8.193753904117295e-06, "loss": 0.02412748895585537, "memory(GiB)": 21.48, "step": 9712, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.953162 }, { "epoch": 0.31553129974336486, "grad_norm": 0.27887651324272156, "learning_rate": 8.193340592861702e-06, "loss": 0.024061068892478943, "memory(GiB)": 21.48, "step": 9713, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953178 }, { "epoch": 0.3155637852061203, "grad_norm": 0.5066457390785217, "learning_rate": 8.192927244750708e-06, "loss": 0.02793233096599579, "memory(GiB)": 21.48, "step": 9714, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.953194 }, { "epoch": 0.3155962706688757, "grad_norm": 0.5100566148757935, "learning_rate": 8.192513859789083e-06, "loss": 0.026976197957992554, "memory(GiB)": 21.48, "step": 9715, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953211 }, { "epoch": 0.3156287561316311, "grad_norm": 0.6418341994285583, "learning_rate": 8.192100437981598e-06, "loss": 0.03270271420478821, "memory(GiB)": 21.48, "step": 9716, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.953227 }, { "epoch": 0.3156612415943865, "grad_norm": 0.4426013231277466, "learning_rate": 8.191686979333027e-06, "loss": 0.03630140423774719, "memory(GiB)": 21.48, "step": 9717, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953241 }, { "epoch": 0.31569372705714194, "grad_norm": 0.43895649909973145, "learning_rate": 8.191273483848139e-06, "loss": 0.029640233144164085, "memory(GiB)": 21.48, "step": 9718, "token_acc": 1.0, "train_speed(iter/s)": 0.953258 }, { "epoch": 0.31572621251989735, "grad_norm": 0.4198799729347229, "learning_rate": 8.19085995153171e-06, "loss": 0.03122381493449211, "memory(GiB)": 21.48, "step": 9719, "token_acc": 1.0, "train_speed(iter/s)": 0.953274 }, { "epoch": 0.31575869798265277, "grad_norm": 0.7738915681838989, "learning_rate": 8.190446382388506e-06, "loss": 0.03229570388793945, "memory(GiB)": 21.48, "step": 9720, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.95329 }, { "epoch": 0.3157911834454082, "grad_norm": 0.4047498106956482, "learning_rate": 8.190032776423305e-06, "loss": 0.028155246749520302, "memory(GiB)": 21.48, "step": 9721, "token_acc": 1.0, "train_speed(iter/s)": 0.953306 }, { "epoch": 0.3158236689081636, "grad_norm": 0.4670776128768921, "learning_rate": 8.189619133640883e-06, "loss": 0.030131883919239044, "memory(GiB)": 21.48, "step": 9722, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.95332 }, { "epoch": 0.315856154370919, "grad_norm": 0.49480265378952026, "learning_rate": 8.189205454046007e-06, "loss": 0.024188503623008728, "memory(GiB)": 21.48, "step": 9723, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953335 }, { "epoch": 0.31588863983367443, "grad_norm": 0.3819616436958313, "learning_rate": 8.188791737643457e-06, "loss": 0.024838417768478394, "memory(GiB)": 21.48, "step": 9724, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.953352 }, { "epoch": 0.31592112529642985, "grad_norm": 0.4973022937774658, "learning_rate": 8.188377984438007e-06, "loss": 0.02084936574101448, "memory(GiB)": 21.48, "step": 9725, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.953368 }, { "epoch": 0.31595361075918527, "grad_norm": 0.33085721731185913, "learning_rate": 8.18796419443443e-06, "loss": 0.02810312621295452, "memory(GiB)": 21.48, "step": 9726, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953381 }, { "epoch": 0.3159860962219407, "grad_norm": 0.4484933018684387, "learning_rate": 8.187550367637504e-06, "loss": 0.025895915925502777, "memory(GiB)": 21.48, "step": 9727, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.953397 }, { "epoch": 0.3160185816846961, "grad_norm": 0.5897852778434753, "learning_rate": 8.187136504052004e-06, "loss": 0.0297713503241539, "memory(GiB)": 21.48, "step": 9728, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953413 }, { "epoch": 0.3160510671474515, "grad_norm": 0.6018635034561157, "learning_rate": 8.186722603682707e-06, "loss": 0.038413580507040024, "memory(GiB)": 21.48, "step": 9729, "token_acc": 0.984, "train_speed(iter/s)": 0.953428 }, { "epoch": 0.31608355261020693, "grad_norm": 0.3597981333732605, "learning_rate": 8.186308666534389e-06, "loss": 0.03229484707117081, "memory(GiB)": 21.48, "step": 9730, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.953444 }, { "epoch": 0.31611603807296235, "grad_norm": 0.6542682647705078, "learning_rate": 8.185894692611828e-06, "loss": 0.04341452196240425, "memory(GiB)": 21.48, "step": 9731, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953461 }, { "epoch": 0.31614852353571776, "grad_norm": 0.4033401608467102, "learning_rate": 8.185480681919802e-06, "loss": 0.02540535479784012, "memory(GiB)": 21.48, "step": 9732, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.953476 }, { "epoch": 0.3161810089984732, "grad_norm": 0.6544093489646912, "learning_rate": 8.185066634463088e-06, "loss": 0.036131296306848526, "memory(GiB)": 21.48, "step": 9733, "token_acc": 0.984, "train_speed(iter/s)": 0.953497 }, { "epoch": 0.3162134944612286, "grad_norm": 0.46812593936920166, "learning_rate": 8.184652550246469e-06, "loss": 0.03088330291211605, "memory(GiB)": 21.48, "step": 9734, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.953518 }, { "epoch": 0.316245979923984, "grad_norm": 0.419889897108078, "learning_rate": 8.184238429274716e-06, "loss": 0.03044566698372364, "memory(GiB)": 21.48, "step": 9735, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.95354 }, { "epoch": 0.3162784653867394, "grad_norm": 0.5210464000701904, "learning_rate": 8.183824271552617e-06, "loss": 0.03371027112007141, "memory(GiB)": 21.48, "step": 9736, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95356 }, { "epoch": 0.31631095084949484, "grad_norm": 0.3201403021812439, "learning_rate": 8.183410077084946e-06, "loss": 0.02415030263364315, "memory(GiB)": 21.48, "step": 9737, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953581 }, { "epoch": 0.31634343631225026, "grad_norm": 0.6673988699913025, "learning_rate": 8.182995845876487e-06, "loss": 0.02658659778535366, "memory(GiB)": 21.48, "step": 9738, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953601 }, { "epoch": 0.3163759217750057, "grad_norm": 0.3317866921424866, "learning_rate": 8.182581577932018e-06, "loss": 0.02808869630098343, "memory(GiB)": 21.48, "step": 9739, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953618 }, { "epoch": 0.3164084072377611, "grad_norm": 0.44269856810569763, "learning_rate": 8.182167273256323e-06, "loss": 0.03211558610200882, "memory(GiB)": 21.48, "step": 9740, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.953635 }, { "epoch": 0.3164408927005165, "grad_norm": 0.4726293087005615, "learning_rate": 8.181752931854182e-06, "loss": 0.029541634023189545, "memory(GiB)": 21.48, "step": 9741, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.953651 }, { "epoch": 0.3164733781632719, "grad_norm": 0.43523985147476196, "learning_rate": 8.181338553730377e-06, "loss": 0.034322697669267654, "memory(GiB)": 21.48, "step": 9742, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.953666 }, { "epoch": 0.31650586362602734, "grad_norm": 0.6132363080978394, "learning_rate": 8.180924138889692e-06, "loss": 0.029693543910980225, "memory(GiB)": 21.48, "step": 9743, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.953684 }, { "epoch": 0.31653834908878276, "grad_norm": 0.40610840916633606, "learning_rate": 8.180509687336906e-06, "loss": 0.026083892211318016, "memory(GiB)": 21.48, "step": 9744, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.953701 }, { "epoch": 0.31657083455153817, "grad_norm": 0.40840351581573486, "learning_rate": 8.180095199076808e-06, "loss": 0.027469530701637268, "memory(GiB)": 21.48, "step": 9745, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.953717 }, { "epoch": 0.3166033200142936, "grad_norm": 0.45788443088531494, "learning_rate": 8.179680674114177e-06, "loss": 0.021881982684135437, "memory(GiB)": 21.48, "step": 9746, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953733 }, { "epoch": 0.316635805477049, "grad_norm": 0.27580714225769043, "learning_rate": 8.179266112453798e-06, "loss": 0.021037813276052475, "memory(GiB)": 21.48, "step": 9747, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.95375 }, { "epoch": 0.3166682909398044, "grad_norm": 0.4520718455314636, "learning_rate": 8.178851514100457e-06, "loss": 0.03260968625545502, "memory(GiB)": 21.48, "step": 9748, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.953766 }, { "epoch": 0.31670077640255984, "grad_norm": 0.35144907236099243, "learning_rate": 8.178436879058939e-06, "loss": 0.021614111959934235, "memory(GiB)": 21.48, "step": 9749, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.95378 }, { "epoch": 0.31673326186531525, "grad_norm": 0.3690536320209503, "learning_rate": 8.17802220733403e-06, "loss": 0.028220295906066895, "memory(GiB)": 21.48, "step": 9750, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953795 }, { "epoch": 0.31676574732807067, "grad_norm": 0.3337858319282532, "learning_rate": 8.177607498930512e-06, "loss": 0.02150803804397583, "memory(GiB)": 21.48, "step": 9751, "token_acc": 1.0, "train_speed(iter/s)": 0.95381 }, { "epoch": 0.3167982327908261, "grad_norm": 0.44146522879600525, "learning_rate": 8.177192753853175e-06, "loss": 0.029654474928975105, "memory(GiB)": 21.48, "step": 9752, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.953825 }, { "epoch": 0.3168307182535815, "grad_norm": 0.42960473895072937, "learning_rate": 8.176777972106806e-06, "loss": 0.027984974905848503, "memory(GiB)": 21.48, "step": 9753, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.953842 }, { "epoch": 0.3168632037163369, "grad_norm": 0.416737824678421, "learning_rate": 8.176363153696192e-06, "loss": 0.027164235711097717, "memory(GiB)": 21.48, "step": 9754, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.953858 }, { "epoch": 0.31689568917909233, "grad_norm": 0.40026041865348816, "learning_rate": 8.175948298626118e-06, "loss": 0.031291622668504715, "memory(GiB)": 21.48, "step": 9755, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953875 }, { "epoch": 0.31692817464184775, "grad_norm": 0.8327081799507141, "learning_rate": 8.175533406901372e-06, "loss": 0.036593250930309296, "memory(GiB)": 21.48, "step": 9756, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95389 }, { "epoch": 0.31696066010460316, "grad_norm": 0.45054206252098083, "learning_rate": 8.175118478526747e-06, "loss": 0.03507992625236511, "memory(GiB)": 21.48, "step": 9757, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.953903 }, { "epoch": 0.3169931455673586, "grad_norm": 0.35447174310684204, "learning_rate": 8.174703513507026e-06, "loss": 0.026459548622369766, "memory(GiB)": 21.48, "step": 9758, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.953918 }, { "epoch": 0.317025631030114, "grad_norm": 0.4249759018421173, "learning_rate": 8.174288511847002e-06, "loss": 0.02078600600361824, "memory(GiB)": 21.48, "step": 9759, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953933 }, { "epoch": 0.31705811649286947, "grad_norm": 0.4715133309364319, "learning_rate": 8.173873473551463e-06, "loss": 0.03009769320487976, "memory(GiB)": 21.48, "step": 9760, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.3170906019556249, "grad_norm": 1.2737170457839966, "learning_rate": 8.1734583986252e-06, "loss": 0.03037889674305916, "memory(GiB)": 21.48, "step": 9761, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953967 }, { "epoch": 0.3171230874183803, "grad_norm": 0.5387337803840637, "learning_rate": 8.173043287073004e-06, "loss": 0.02801309898495674, "memory(GiB)": 21.48, "step": 9762, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.953984 }, { "epoch": 0.3171555728811357, "grad_norm": 0.4320281445980072, "learning_rate": 8.172628138899665e-06, "loss": 0.023214198648929596, "memory(GiB)": 21.48, "step": 9763, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954003 }, { "epoch": 0.31718805834389113, "grad_norm": 0.4048115313053131, "learning_rate": 8.172212954109974e-06, "loss": 0.026905115693807602, "memory(GiB)": 21.48, "step": 9764, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954024 }, { "epoch": 0.31722054380664655, "grad_norm": 0.43471503257751465, "learning_rate": 8.171797732708723e-06, "loss": 0.029569922015070915, "memory(GiB)": 21.48, "step": 9765, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954045 }, { "epoch": 0.31725302926940196, "grad_norm": 0.3836513161659241, "learning_rate": 8.171382474700705e-06, "loss": 0.027509376406669617, "memory(GiB)": 21.48, "step": 9766, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.954066 }, { "epoch": 0.3172855147321574, "grad_norm": 0.4885663390159607, "learning_rate": 8.170967180090712e-06, "loss": 0.03826539218425751, "memory(GiB)": 21.48, "step": 9767, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.954087 }, { "epoch": 0.3173180001949128, "grad_norm": 0.49159568548202515, "learning_rate": 8.170551848883537e-06, "loss": 0.028684690594673157, "memory(GiB)": 21.48, "step": 9768, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.954108 }, { "epoch": 0.3173504856576682, "grad_norm": 0.38373109698295593, "learning_rate": 8.170136481083974e-06, "loss": 0.026156924664974213, "memory(GiB)": 21.48, "step": 9769, "token_acc": 1.0, "train_speed(iter/s)": 0.954128 }, { "epoch": 0.31738297112042363, "grad_norm": 0.41070637106895447, "learning_rate": 8.169721076696816e-06, "loss": 0.03551100566983223, "memory(GiB)": 21.48, "step": 9770, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.954148 }, { "epoch": 0.31741545658317905, "grad_norm": 0.306860089302063, "learning_rate": 8.169305635726859e-06, "loss": 0.01955944113433361, "memory(GiB)": 21.48, "step": 9771, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95417 }, { "epoch": 0.31744794204593446, "grad_norm": 0.400784432888031, "learning_rate": 8.168890158178895e-06, "loss": 0.02371685579419136, "memory(GiB)": 21.48, "step": 9772, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.954185 }, { "epoch": 0.3174804275086899, "grad_norm": 0.7133158445358276, "learning_rate": 8.168474644057722e-06, "loss": 0.03607473522424698, "memory(GiB)": 21.48, "step": 9773, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.954199 }, { "epoch": 0.3175129129714453, "grad_norm": 0.4384042024612427, "learning_rate": 8.168059093368135e-06, "loss": 0.029520487412810326, "memory(GiB)": 21.48, "step": 9774, "token_acc": 0.9898305084745763, "train_speed(iter/s)": 0.954216 }, { "epoch": 0.3175453984342007, "grad_norm": 0.6568424105644226, "learning_rate": 8.167643506114928e-06, "loss": 0.03782098740339279, "memory(GiB)": 21.48, "step": 9775, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.954231 }, { "epoch": 0.3175778838969561, "grad_norm": 0.5746302604675293, "learning_rate": 8.167227882302899e-06, "loss": 0.03318299725651741, "memory(GiB)": 21.48, "step": 9776, "token_acc": 0.9678899082568807, "train_speed(iter/s)": 0.954247 }, { "epoch": 0.31761036935971154, "grad_norm": 0.32808393239974976, "learning_rate": 8.166812221936844e-06, "loss": 0.020459556952118874, "memory(GiB)": 21.48, "step": 9777, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954262 }, { "epoch": 0.31764285482246696, "grad_norm": 0.40709617733955383, "learning_rate": 8.166396525021562e-06, "loss": 0.02520645782351494, "memory(GiB)": 21.48, "step": 9778, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.954279 }, { "epoch": 0.3176753402852224, "grad_norm": 0.407612144947052, "learning_rate": 8.16598079156185e-06, "loss": 0.03011774644255638, "memory(GiB)": 21.48, "step": 9779, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.954294 }, { "epoch": 0.3177078257479778, "grad_norm": 0.6002683639526367, "learning_rate": 8.165565021562507e-06, "loss": 0.03591877967119217, "memory(GiB)": 21.48, "step": 9780, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.95431 }, { "epoch": 0.3177403112107332, "grad_norm": 0.5098933577537537, "learning_rate": 8.165149215028328e-06, "loss": 0.031639765948057175, "memory(GiB)": 21.48, "step": 9781, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954325 }, { "epoch": 0.3177727966734886, "grad_norm": 0.5127363204956055, "learning_rate": 8.164733371964114e-06, "loss": 0.03619644045829773, "memory(GiB)": 21.48, "step": 9782, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.31780528213624404, "grad_norm": 1.703630805015564, "learning_rate": 8.164317492374664e-06, "loss": 0.02780817449092865, "memory(GiB)": 21.48, "step": 9783, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954358 }, { "epoch": 0.31783776759899945, "grad_norm": 0.4191155731678009, "learning_rate": 8.16390157626478e-06, "loss": 0.029447447508573532, "memory(GiB)": 21.48, "step": 9784, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954371 }, { "epoch": 0.31787025306175487, "grad_norm": 0.4873400628566742, "learning_rate": 8.163485623639262e-06, "loss": 0.03792005777359009, "memory(GiB)": 21.48, "step": 9785, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954386 }, { "epoch": 0.3179027385245103, "grad_norm": 0.49642106890678406, "learning_rate": 8.163069634502908e-06, "loss": 0.03441696614027023, "memory(GiB)": 21.48, "step": 9786, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.954401 }, { "epoch": 0.3179352239872657, "grad_norm": 0.5678386688232422, "learning_rate": 8.16265360886052e-06, "loss": 0.04265905171632767, "memory(GiB)": 21.48, "step": 9787, "token_acc": 1.0, "train_speed(iter/s)": 0.954417 }, { "epoch": 0.3179677094500211, "grad_norm": 0.46593743562698364, "learning_rate": 8.1622375467169e-06, "loss": 0.03285884112119675, "memory(GiB)": 21.48, "step": 9788, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954434 }, { "epoch": 0.31800019491277653, "grad_norm": 0.4197118282318115, "learning_rate": 8.16182144807685e-06, "loss": 0.028525155037641525, "memory(GiB)": 21.48, "step": 9789, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954451 }, { "epoch": 0.31803268037553195, "grad_norm": 0.9047430157661438, "learning_rate": 8.161405312945172e-06, "loss": 0.05915476381778717, "memory(GiB)": 21.48, "step": 9790, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.954468 }, { "epoch": 0.31806516583828737, "grad_norm": 0.5285710692405701, "learning_rate": 8.16098914132667e-06, "loss": 0.02876110002398491, "memory(GiB)": 21.48, "step": 9791, "token_acc": 1.0, "train_speed(iter/s)": 0.954484 }, { "epoch": 0.3180976513010428, "grad_norm": 0.45270174741744995, "learning_rate": 8.160572933226145e-06, "loss": 0.028056535869836807, "memory(GiB)": 21.48, "step": 9792, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954505 }, { "epoch": 0.3181301367637982, "grad_norm": 0.3388358950614929, "learning_rate": 8.160156688648401e-06, "loss": 0.03113328292965889, "memory(GiB)": 21.48, "step": 9793, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954525 }, { "epoch": 0.3181626222265536, "grad_norm": 0.5878937840461731, "learning_rate": 8.159740407598245e-06, "loss": 0.032965559512376785, "memory(GiB)": 21.48, "step": 9794, "token_acc": 0.988, "train_speed(iter/s)": 0.954545 }, { "epoch": 0.31819510768930903, "grad_norm": 0.7012346982955933, "learning_rate": 8.159324090080476e-06, "loss": 0.03643391653895378, "memory(GiB)": 21.48, "step": 9795, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.954567 }, { "epoch": 0.31822759315206445, "grad_norm": 0.3688318431377411, "learning_rate": 8.158907736099903e-06, "loss": 0.03683733195066452, "memory(GiB)": 21.48, "step": 9796, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954586 }, { "epoch": 0.31826007861481986, "grad_norm": 0.5890375375747681, "learning_rate": 8.15849134566133e-06, "loss": 0.03333783522248268, "memory(GiB)": 21.48, "step": 9797, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954608 }, { "epoch": 0.3182925640775753, "grad_norm": 0.3906516432762146, "learning_rate": 8.158074918769565e-06, "loss": 0.02852454036474228, "memory(GiB)": 21.48, "step": 9798, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954626 }, { "epoch": 0.3183250495403307, "grad_norm": 0.511263906955719, "learning_rate": 8.157658455429409e-06, "loss": 0.032157283276319504, "memory(GiB)": 21.48, "step": 9799, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954647 }, { "epoch": 0.3183575350030861, "grad_norm": 0.4529666602611542, "learning_rate": 8.157241955645674e-06, "loss": 0.033081844449043274, "memory(GiB)": 21.48, "step": 9800, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.954665 }, { "epoch": 0.31839002046584153, "grad_norm": 2.392648696899414, "learning_rate": 8.156825419423163e-06, "loss": 0.0380069836974144, "memory(GiB)": 21.48, "step": 9801, "token_acc": 1.0, "train_speed(iter/s)": 0.954683 }, { "epoch": 0.31842250592859694, "grad_norm": 0.44355642795562744, "learning_rate": 8.156408846766686e-06, "loss": 0.03385445848107338, "memory(GiB)": 21.48, "step": 9802, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954699 }, { "epoch": 0.31845499139135236, "grad_norm": 0.6318718791007996, "learning_rate": 8.15599223768105e-06, "loss": 0.03422671929001808, "memory(GiB)": 21.48, "step": 9803, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.3184874768541078, "grad_norm": 0.35919108986854553, "learning_rate": 8.15557559217106e-06, "loss": 0.03306283801794052, "memory(GiB)": 21.48, "step": 9804, "token_acc": 1.0, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.3185199623168632, "grad_norm": 0.47202834486961365, "learning_rate": 8.155158910241532e-06, "loss": 0.03134226053953171, "memory(GiB)": 21.48, "step": 9805, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954746 }, { "epoch": 0.3185524477796186, "grad_norm": 0.4026883840560913, "learning_rate": 8.154742191897267e-06, "loss": 0.03125781565904617, "memory(GiB)": 21.48, "step": 9806, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.318584933242374, "grad_norm": 0.4555715322494507, "learning_rate": 8.15432543714308e-06, "loss": 0.03443554788827896, "memory(GiB)": 21.48, "step": 9807, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954778 }, { "epoch": 0.31861741870512944, "grad_norm": 0.3843652307987213, "learning_rate": 8.15390864598378e-06, "loss": 0.02905115857720375, "memory(GiB)": 21.48, "step": 9808, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954793 }, { "epoch": 0.31864990416788486, "grad_norm": 0.40988218784332275, "learning_rate": 8.153491818424175e-06, "loss": 0.030967451632022858, "memory(GiB)": 21.48, "step": 9809, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.954808 }, { "epoch": 0.3186823896306403, "grad_norm": 0.4184420108795166, "learning_rate": 8.153074954469079e-06, "loss": 0.023315206170082092, "memory(GiB)": 21.48, "step": 9810, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.954821 }, { "epoch": 0.3187148750933957, "grad_norm": 0.36501753330230713, "learning_rate": 8.152658054123299e-06, "loss": 0.030998367816209793, "memory(GiB)": 21.48, "step": 9811, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954837 }, { "epoch": 0.3187473605561511, "grad_norm": 0.4165824055671692, "learning_rate": 8.152241117391648e-06, "loss": 0.027836160734295845, "memory(GiB)": 21.48, "step": 9812, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.954852 }, { "epoch": 0.3187798460189065, "grad_norm": 0.3957630395889282, "learning_rate": 8.15182414427894e-06, "loss": 0.026142515242099762, "memory(GiB)": 21.48, "step": 9813, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954869 }, { "epoch": 0.31881233148166194, "grad_norm": 0.503193736076355, "learning_rate": 8.151407134789986e-06, "loss": 0.03596919775009155, "memory(GiB)": 21.48, "step": 9814, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.31884481694441735, "grad_norm": 0.529948353767395, "learning_rate": 8.150990088929601e-06, "loss": 0.027157504111528397, "memory(GiB)": 21.48, "step": 9815, "token_acc": 1.0, "train_speed(iter/s)": 0.9549 }, { "epoch": 0.31887730240717277, "grad_norm": 0.408944308757782, "learning_rate": 8.150573006702596e-06, "loss": 0.023675380274653435, "memory(GiB)": 21.48, "step": 9816, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.954917 }, { "epoch": 0.3189097878699282, "grad_norm": 0.40378338098526, "learning_rate": 8.150155888113783e-06, "loss": 0.027182474732398987, "memory(GiB)": 21.48, "step": 9817, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.3189422733326836, "grad_norm": 0.5435056090354919, "learning_rate": 8.14973873316798e-06, "loss": 0.03925398737192154, "memory(GiB)": 21.48, "step": 9818, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.95495 }, { "epoch": 0.318974758795439, "grad_norm": 0.45155447721481323, "learning_rate": 8.14932154187e-06, "loss": 0.029742315411567688, "memory(GiB)": 21.48, "step": 9819, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.31900724425819443, "grad_norm": 0.49154552817344666, "learning_rate": 8.148904314224656e-06, "loss": 0.03798816353082657, "memory(GiB)": 21.48, "step": 9820, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954978 }, { "epoch": 0.31903972972094985, "grad_norm": 0.3367651700973511, "learning_rate": 8.148487050236766e-06, "loss": 0.020349351689219475, "memory(GiB)": 21.48, "step": 9821, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954995 }, { "epoch": 0.31907221518370527, "grad_norm": 0.839312732219696, "learning_rate": 8.148069749911144e-06, "loss": 0.03634054213762283, "memory(GiB)": 21.48, "step": 9822, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955011 }, { "epoch": 0.3191047006464607, "grad_norm": 0.8467754125595093, "learning_rate": 8.14765241325261e-06, "loss": 0.04230741411447525, "memory(GiB)": 21.48, "step": 9823, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955028 }, { "epoch": 0.31913718610921615, "grad_norm": 0.47379666566848755, "learning_rate": 8.147235040265976e-06, "loss": 0.026166606694459915, "memory(GiB)": 21.48, "step": 9824, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.31916967157197157, "grad_norm": 0.30191972851753235, "learning_rate": 8.146817630956058e-06, "loss": 0.024925433099269867, "memory(GiB)": 21.48, "step": 9825, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955067 }, { "epoch": 0.319202157034727, "grad_norm": 0.5091876983642578, "learning_rate": 8.146400185327679e-06, "loss": 0.03520255163311958, "memory(GiB)": 21.48, "step": 9826, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.955087 }, { "epoch": 0.3192346424974824, "grad_norm": 0.37445440888404846, "learning_rate": 8.145982703385653e-06, "loss": 0.031466737389564514, "memory(GiB)": 21.48, "step": 9827, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955108 }, { "epoch": 0.3192671279602378, "grad_norm": 1.3044263124465942, "learning_rate": 8.145565185134798e-06, "loss": 0.033663712441921234, "memory(GiB)": 21.48, "step": 9828, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.31929961342299323, "grad_norm": 0.35718485713005066, "learning_rate": 8.145147630579937e-06, "loss": 0.024743910878896713, "memory(GiB)": 21.48, "step": 9829, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955146 }, { "epoch": 0.31933209888574865, "grad_norm": 0.4192262887954712, "learning_rate": 8.144730039725883e-06, "loss": 0.026609469205141068, "memory(GiB)": 21.48, "step": 9830, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.31936458434850407, "grad_norm": 0.3980703353881836, "learning_rate": 8.144312412577462e-06, "loss": 0.032639965415000916, "memory(GiB)": 21.48, "step": 9831, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.955185 }, { "epoch": 0.3193970698112595, "grad_norm": 0.3990008234977722, "learning_rate": 8.14389474913949e-06, "loss": 0.034064508974552155, "memory(GiB)": 21.48, "step": 9832, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.3194295552740149, "grad_norm": 0.3687365651130676, "learning_rate": 8.143477049416786e-06, "loss": 0.026946118101477623, "memory(GiB)": 21.48, "step": 9833, "token_acc": 1.0, "train_speed(iter/s)": 0.955215 }, { "epoch": 0.3194620407367703, "grad_norm": 0.3634113669395447, "learning_rate": 8.143059313414174e-06, "loss": 0.028612608090043068, "memory(GiB)": 21.48, "step": 9834, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.955232 }, { "epoch": 0.31949452619952573, "grad_norm": 0.312270849943161, "learning_rate": 8.142641541136473e-06, "loss": 0.02429429069161415, "memory(GiB)": 21.48, "step": 9835, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955248 }, { "epoch": 0.31952701166228115, "grad_norm": 0.49551400542259216, "learning_rate": 8.142223732588507e-06, "loss": 0.031020091846585274, "memory(GiB)": 21.48, "step": 9836, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955263 }, { "epoch": 0.31955949712503656, "grad_norm": 0.42383891344070435, "learning_rate": 8.141805887775099e-06, "loss": 0.03172393888235092, "memory(GiB)": 21.48, "step": 9837, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.319591982587792, "grad_norm": 0.3245263695716858, "learning_rate": 8.141388006701066e-06, "loss": 0.025295637547969818, "memory(GiB)": 21.48, "step": 9838, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.3196244680505474, "grad_norm": 0.32802703976631165, "learning_rate": 8.140970089371236e-06, "loss": 0.025195065885782242, "memory(GiB)": 21.48, "step": 9839, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955305 }, { "epoch": 0.3196569535133028, "grad_norm": 0.32733985781669617, "learning_rate": 8.14055213579043e-06, "loss": 0.025017403066158295, "memory(GiB)": 21.48, "step": 9840, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955322 }, { "epoch": 0.3196894389760582, "grad_norm": 0.859927773475647, "learning_rate": 8.140134145963472e-06, "loss": 0.03582904115319252, "memory(GiB)": 21.48, "step": 9841, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.31972192443881364, "grad_norm": 0.5935431122779846, "learning_rate": 8.139716119895188e-06, "loss": 0.038070909678936005, "memory(GiB)": 21.48, "step": 9842, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955353 }, { "epoch": 0.31975440990156906, "grad_norm": 0.3041337728500366, "learning_rate": 8.139298057590398e-06, "loss": 0.027736622840166092, "memory(GiB)": 21.48, "step": 9843, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.3197868953643245, "grad_norm": 0.37737804651260376, "learning_rate": 8.13887995905393e-06, "loss": 0.027941809967160225, "memory(GiB)": 21.48, "step": 9844, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.3198193808270799, "grad_norm": 0.8233505487442017, "learning_rate": 8.138461824290612e-06, "loss": 0.041771143674850464, "memory(GiB)": 21.48, "step": 9845, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.3198518662898353, "grad_norm": 0.9072263836860657, "learning_rate": 8.138043653305267e-06, "loss": 0.025772493332624435, "memory(GiB)": 21.48, "step": 9846, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955411 }, { "epoch": 0.3198843517525907, "grad_norm": 0.32535985112190247, "learning_rate": 8.137625446102722e-06, "loss": 0.02316686138510704, "memory(GiB)": 21.48, "step": 9847, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.31991683721534614, "grad_norm": 0.36601635813713074, "learning_rate": 8.1372072026878e-06, "loss": 0.023090243339538574, "memory(GiB)": 21.48, "step": 9848, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955438 }, { "epoch": 0.31994932267810156, "grad_norm": 0.3607010841369629, "learning_rate": 8.136788923065334e-06, "loss": 0.022665001451969147, "memory(GiB)": 21.48, "step": 9849, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.31998180814085697, "grad_norm": 0.4653746485710144, "learning_rate": 8.136370607240148e-06, "loss": 0.02198454737663269, "memory(GiB)": 21.48, "step": 9850, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.3200142936036124, "grad_norm": 0.5112541317939758, "learning_rate": 8.135952255217071e-06, "loss": 0.030929600819945335, "memory(GiB)": 21.48, "step": 9851, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.3200467790663678, "grad_norm": 0.43726420402526855, "learning_rate": 8.135533867000931e-06, "loss": 0.030358051881194115, "memory(GiB)": 21.48, "step": 9852, "token_acc": 1.0, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.3200792645291232, "grad_norm": 0.46619531512260437, "learning_rate": 8.135115442596557e-06, "loss": 0.03773866593837738, "memory(GiB)": 21.48, "step": 9853, "token_acc": 0.995, "train_speed(iter/s)": 0.955531 }, { "epoch": 0.32011174999187864, "grad_norm": 0.32617121934890747, "learning_rate": 8.134696982008778e-06, "loss": 0.027970055118203163, "memory(GiB)": 21.48, "step": 9854, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.32014423545463405, "grad_norm": 0.4408057928085327, "learning_rate": 8.134278485242423e-06, "loss": 0.03490216284990311, "memory(GiB)": 21.48, "step": 9855, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.32017672091738947, "grad_norm": 0.3034104108810425, "learning_rate": 8.133859952302324e-06, "loss": 0.02983551099896431, "memory(GiB)": 21.48, "step": 9856, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.3202092063801449, "grad_norm": 0.3890407979488373, "learning_rate": 8.133441383193307e-06, "loss": 0.026807349175214767, "memory(GiB)": 21.48, "step": 9857, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955612 }, { "epoch": 0.3202416918429003, "grad_norm": 0.6376892924308777, "learning_rate": 8.133022777920207e-06, "loss": 0.036088645458221436, "memory(GiB)": 21.48, "step": 9858, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.3202741773056557, "grad_norm": 0.4870535433292389, "learning_rate": 8.132604136487855e-06, "loss": 0.023690663278102875, "memory(GiB)": 21.48, "step": 9859, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.32030666276841113, "grad_norm": 0.5318349003791809, "learning_rate": 8.13218545890108e-06, "loss": 0.03631246089935303, "memory(GiB)": 21.48, "step": 9860, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955672 }, { "epoch": 0.32033914823116655, "grad_norm": 0.5806205868721008, "learning_rate": 8.131766745164719e-06, "loss": 0.03334299474954605, "memory(GiB)": 21.48, "step": 9861, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955692 }, { "epoch": 0.32037163369392196, "grad_norm": 0.4513632655143738, "learning_rate": 8.1313479952836e-06, "loss": 0.025012336671352386, "memory(GiB)": 21.48, "step": 9862, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955709 }, { "epoch": 0.3204041191566774, "grad_norm": 0.704988420009613, "learning_rate": 8.130929209262557e-06, "loss": 0.03416242450475693, "memory(GiB)": 21.48, "step": 9863, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.3204366046194328, "grad_norm": 0.3224993944168091, "learning_rate": 8.130510387106422e-06, "loss": 0.023457685485482216, "memory(GiB)": 21.48, "step": 9864, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.3204690900821882, "grad_norm": 0.40637755393981934, "learning_rate": 8.130091528820034e-06, "loss": 0.026580732315778732, "memory(GiB)": 21.48, "step": 9865, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.32050157554494363, "grad_norm": 0.41991910338401794, "learning_rate": 8.12967263440822e-06, "loss": 0.03094261884689331, "memory(GiB)": 21.48, "step": 9866, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.32053406100769904, "grad_norm": 0.40069201588630676, "learning_rate": 8.12925370387582e-06, "loss": 0.030598634853959084, "memory(GiB)": 21.48, "step": 9867, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.32056654647045446, "grad_norm": 0.6199387907981873, "learning_rate": 8.128834737227669e-06, "loss": 0.029184428974986076, "memory(GiB)": 21.48, "step": 9868, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.3205990319332099, "grad_norm": 0.3684335947036743, "learning_rate": 8.128415734468597e-06, "loss": 0.02122100442647934, "memory(GiB)": 21.48, "step": 9869, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955823 }, { "epoch": 0.3206315173959653, "grad_norm": 0.4247221052646637, "learning_rate": 8.127996695603446e-06, "loss": 0.03533243387937546, "memory(GiB)": 21.48, "step": 9870, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95584 }, { "epoch": 0.3206640028587207, "grad_norm": 0.4330621361732483, "learning_rate": 8.12757762063705e-06, "loss": 0.023214250802993774, "memory(GiB)": 21.48, "step": 9871, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.3206964883214761, "grad_norm": 0.5426686406135559, "learning_rate": 8.127158509574244e-06, "loss": 0.03726246953010559, "memory(GiB)": 21.48, "step": 9872, "token_acc": 0.9863945578231292, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.32072897378423154, "grad_norm": 0.41975072026252747, "learning_rate": 8.126739362419868e-06, "loss": 0.03664124011993408, "memory(GiB)": 21.48, "step": 9873, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.32076145924698696, "grad_norm": 0.5034934282302856, "learning_rate": 8.126320179178758e-06, "loss": 0.02895563468337059, "memory(GiB)": 21.48, "step": 9874, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.3207939447097424, "grad_norm": 0.45635706186294556, "learning_rate": 8.12590095985575e-06, "loss": 0.028638701885938644, "memory(GiB)": 21.48, "step": 9875, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.3208264301724978, "grad_norm": 0.38338860869407654, "learning_rate": 8.125481704455684e-06, "loss": 0.028022753074765205, "memory(GiB)": 21.48, "step": 9876, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955936 }, { "epoch": 0.3208589156352532, "grad_norm": 0.3720480799674988, "learning_rate": 8.125062412983403e-06, "loss": 0.02500678040087223, "memory(GiB)": 21.48, "step": 9877, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.3208914010980086, "grad_norm": 0.44938924908638, "learning_rate": 8.124643085443738e-06, "loss": 0.035285305231809616, "memory(GiB)": 21.48, "step": 9878, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.32092388656076404, "grad_norm": 10.499588966369629, "learning_rate": 8.124223721841536e-06, "loss": 0.04751816764473915, "memory(GiB)": 21.48, "step": 9879, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.32095637202351945, "grad_norm": 0.4295942783355713, "learning_rate": 8.123804322181634e-06, "loss": 0.026451386511325836, "memory(GiB)": 21.48, "step": 9880, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.32098885748627487, "grad_norm": 0.5509657859802246, "learning_rate": 8.12338488646887e-06, "loss": 0.033070020377635956, "memory(GiB)": 21.48, "step": 9881, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.95601 }, { "epoch": 0.3210213429490303, "grad_norm": 0.6647478342056274, "learning_rate": 8.122965414708089e-06, "loss": 0.03596561402082443, "memory(GiB)": 21.48, "step": 9882, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956026 }, { "epoch": 0.3210538284117857, "grad_norm": 0.38847866654396057, "learning_rate": 8.12254590690413e-06, "loss": 0.0278792567551136, "memory(GiB)": 21.48, "step": 9883, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.3210863138745411, "grad_norm": 0.9422544836997986, "learning_rate": 8.122126363061834e-06, "loss": 0.0268023032695055, "memory(GiB)": 21.48, "step": 9884, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.32111879933729653, "grad_norm": 0.36420631408691406, "learning_rate": 8.121706783186045e-06, "loss": 0.025215355679392815, "memory(GiB)": 21.48, "step": 9885, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.32115128480005195, "grad_norm": 0.969941258430481, "learning_rate": 8.121287167281605e-06, "loss": 0.026433121412992477, "memory(GiB)": 21.48, "step": 9886, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.32118377026280737, "grad_norm": 0.35640937089920044, "learning_rate": 8.120867515353356e-06, "loss": 0.026830321177840233, "memory(GiB)": 21.48, "step": 9887, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.32121625572556284, "grad_norm": 0.4231891334056854, "learning_rate": 8.120447827406143e-06, "loss": 0.02489851601421833, "memory(GiB)": 21.48, "step": 9888, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.32124874118831825, "grad_norm": 0.3804308772087097, "learning_rate": 8.120028103444809e-06, "loss": 0.027693722397089005, "memory(GiB)": 21.48, "step": 9889, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.956152 }, { "epoch": 0.32128122665107367, "grad_norm": 1.461167573928833, "learning_rate": 8.119608343474197e-06, "loss": 0.041375331580638885, "memory(GiB)": 21.48, "step": 9890, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956169 }, { "epoch": 0.3213137121138291, "grad_norm": 0.43377387523651123, "learning_rate": 8.119188547499153e-06, "loss": 0.025999870151281357, "memory(GiB)": 21.48, "step": 9891, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.3213461975765845, "grad_norm": 0.5134474039077759, "learning_rate": 8.118768715524521e-06, "loss": 0.03321480751037598, "memory(GiB)": 21.48, "step": 9892, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.3213786830393399, "grad_norm": 0.3416484296321869, "learning_rate": 8.11834884755515e-06, "loss": 0.025253739207983017, "memory(GiB)": 21.48, "step": 9893, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956215 }, { "epoch": 0.32141116850209533, "grad_norm": 0.4421111047267914, "learning_rate": 8.117928943595878e-06, "loss": 0.032687753438949585, "memory(GiB)": 21.48, "step": 9894, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.956229 }, { "epoch": 0.32144365396485075, "grad_norm": 0.7123860120773315, "learning_rate": 8.117509003651558e-06, "loss": 0.025533080101013184, "memory(GiB)": 21.48, "step": 9895, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.32147613942760617, "grad_norm": 0.3673846423625946, "learning_rate": 8.117089027727035e-06, "loss": 0.03436508774757385, "memory(GiB)": 21.48, "step": 9896, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.956257 }, { "epoch": 0.3215086248903616, "grad_norm": 0.8989688754081726, "learning_rate": 8.116669015827156e-06, "loss": 0.030720729380846024, "memory(GiB)": 21.48, "step": 9897, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.321541110353117, "grad_norm": 0.3958907127380371, "learning_rate": 8.116248967956768e-06, "loss": 0.02502916008234024, "memory(GiB)": 21.48, "step": 9898, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.3215735958158724, "grad_norm": 0.5691653490066528, "learning_rate": 8.115828884120718e-06, "loss": 0.033667564392089844, "memory(GiB)": 21.48, "step": 9899, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.32160608127862783, "grad_norm": 0.4909275770187378, "learning_rate": 8.115408764323858e-06, "loss": 0.03962064906954765, "memory(GiB)": 21.48, "step": 9900, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.32163856674138325, "grad_norm": 0.4183766841888428, "learning_rate": 8.114988608571033e-06, "loss": 0.02343425713479519, "memory(GiB)": 21.48, "step": 9901, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.32167105220413866, "grad_norm": 0.4153788089752197, "learning_rate": 8.114568416867093e-06, "loss": 0.027671126648783684, "memory(GiB)": 21.48, "step": 9902, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.3217035376668941, "grad_norm": 0.25970548391342163, "learning_rate": 8.114148189216888e-06, "loss": 0.02186085283756256, "memory(GiB)": 21.48, "step": 9903, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.3217360231296495, "grad_norm": 0.4466026723384857, "learning_rate": 8.113727925625267e-06, "loss": 0.031643398106098175, "memory(GiB)": 21.48, "step": 9904, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.3217685085924049, "grad_norm": 0.7476081848144531, "learning_rate": 8.113307626097085e-06, "loss": 0.03732059523463249, "memory(GiB)": 21.48, "step": 9905, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956391 }, { "epoch": 0.3218009940551603, "grad_norm": 0.32445019483566284, "learning_rate": 8.112887290637185e-06, "loss": 0.01994863897562027, "memory(GiB)": 21.48, "step": 9906, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.32183347951791574, "grad_norm": 0.4566553831100464, "learning_rate": 8.112466919250425e-06, "loss": 0.028418995440006256, "memory(GiB)": 21.48, "step": 9907, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.32186596498067116, "grad_norm": 0.45598918199539185, "learning_rate": 8.112046511941654e-06, "loss": 0.033057183027267456, "memory(GiB)": 21.48, "step": 9908, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.3218984504434266, "grad_norm": 0.34080106019973755, "learning_rate": 8.111626068715722e-06, "loss": 0.025583181530237198, "memory(GiB)": 21.48, "step": 9909, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.321930935906182, "grad_norm": 0.610568106174469, "learning_rate": 8.111205589577486e-06, "loss": 0.0350358784198761, "memory(GiB)": 21.48, "step": 9910, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.3219634213689374, "grad_norm": 0.47706595063209534, "learning_rate": 8.110785074531795e-06, "loss": 0.026506103575229645, "memory(GiB)": 21.48, "step": 9911, "token_acc": 0.98046875, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.3219959068316928, "grad_norm": 0.5768751502037048, "learning_rate": 8.110364523583505e-06, "loss": 0.03125199303030968, "memory(GiB)": 21.48, "step": 9912, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.32202839229444824, "grad_norm": 0.43458348512649536, "learning_rate": 8.109943936737468e-06, "loss": 0.027164284139871597, "memory(GiB)": 21.48, "step": 9913, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.956531 }, { "epoch": 0.32206087775720366, "grad_norm": 0.3241264224052429, "learning_rate": 8.109523313998538e-06, "loss": 0.019918251782655716, "memory(GiB)": 21.48, "step": 9914, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.3220933632199591, "grad_norm": 0.4439769983291626, "learning_rate": 8.10910265537157e-06, "loss": 0.03314508497714996, "memory(GiB)": 21.48, "step": 9915, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956573 }, { "epoch": 0.3221258486827145, "grad_norm": 0.4147644340991974, "learning_rate": 8.10868196086142e-06, "loss": 0.03422785550355911, "memory(GiB)": 21.48, "step": 9916, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.3221583341454699, "grad_norm": 0.518925666809082, "learning_rate": 8.108261230472942e-06, "loss": 0.03171220421791077, "memory(GiB)": 21.48, "step": 9917, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.3221908196082253, "grad_norm": 0.5617534518241882, "learning_rate": 8.107840464210994e-06, "loss": 0.035386763513088226, "memory(GiB)": 21.48, "step": 9918, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.32222330507098074, "grad_norm": 0.5151195526123047, "learning_rate": 8.107419662080428e-06, "loss": 0.030549637973308563, "memory(GiB)": 21.48, "step": 9919, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.32225579053373615, "grad_norm": 0.5038355588912964, "learning_rate": 8.106998824086105e-06, "loss": 0.03770271688699722, "memory(GiB)": 21.48, "step": 9920, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956673 }, { "epoch": 0.32228827599649157, "grad_norm": 0.3867034614086151, "learning_rate": 8.106577950232878e-06, "loss": 0.026516079902648926, "memory(GiB)": 21.48, "step": 9921, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.322320761459247, "grad_norm": 0.46737101674079895, "learning_rate": 8.106157040525608e-06, "loss": 0.026378542184829712, "memory(GiB)": 21.48, "step": 9922, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956713 }, { "epoch": 0.3223532469220024, "grad_norm": 0.7227214574813843, "learning_rate": 8.105736094969152e-06, "loss": 0.026028206571936607, "memory(GiB)": 21.48, "step": 9923, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.3223857323847578, "grad_norm": 0.6878476142883301, "learning_rate": 8.105315113568368e-06, "loss": 0.03900788351893425, "memory(GiB)": 21.48, "step": 9924, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.956749 }, { "epoch": 0.32241821784751323, "grad_norm": 0.3400048315525055, "learning_rate": 8.104894096328113e-06, "loss": 0.03352980688214302, "memory(GiB)": 21.48, "step": 9925, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956764 }, { "epoch": 0.32245070331026865, "grad_norm": 0.3528919517993927, "learning_rate": 8.104473043253249e-06, "loss": 0.02479127235710621, "memory(GiB)": 21.48, "step": 9926, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.32248318877302407, "grad_norm": 0.40754395723342896, "learning_rate": 8.104051954348634e-06, "loss": 0.03152313455939293, "memory(GiB)": 21.48, "step": 9927, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.3225156742357795, "grad_norm": 0.34077712893486023, "learning_rate": 8.103630829619127e-06, "loss": 0.030959326773881912, "memory(GiB)": 21.48, "step": 9928, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.3225481596985349, "grad_norm": 0.6907799243927002, "learning_rate": 8.103209669069589e-06, "loss": 0.04169343039393425, "memory(GiB)": 21.48, "step": 9929, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.3225806451612903, "grad_norm": 0.3681354224681854, "learning_rate": 8.102788472704882e-06, "loss": 0.034260183572769165, "memory(GiB)": 21.48, "step": 9930, "token_acc": 1.0, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.32261313062404573, "grad_norm": 0.42368465662002563, "learning_rate": 8.102367240529866e-06, "loss": 0.02686510980129242, "memory(GiB)": 21.48, "step": 9931, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.32264561608680115, "grad_norm": 0.37604913115501404, "learning_rate": 8.101945972549404e-06, "loss": 0.032536983489990234, "memory(GiB)": 21.48, "step": 9932, "token_acc": 1.0, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.32267810154955656, "grad_norm": 0.6840289831161499, "learning_rate": 8.101524668768354e-06, "loss": 0.03406515344977379, "memory(GiB)": 21.48, "step": 9933, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.956891 }, { "epoch": 0.322710587012312, "grad_norm": 0.6190574169158936, "learning_rate": 8.101103329191582e-06, "loss": 0.03426145017147064, "memory(GiB)": 21.48, "step": 9934, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.3227430724750674, "grad_norm": 0.4167444109916687, "learning_rate": 8.10068195382395e-06, "loss": 0.02547454461455345, "memory(GiB)": 21.48, "step": 9935, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.3227755579378228, "grad_norm": 0.38117700815200806, "learning_rate": 8.100260542670323e-06, "loss": 0.02873009629547596, "memory(GiB)": 21.48, "step": 9936, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.3228080434005782, "grad_norm": 0.3410254120826721, "learning_rate": 8.099839095735562e-06, "loss": 0.03350347280502319, "memory(GiB)": 21.48, "step": 9937, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95695 }, { "epoch": 0.32284052886333364, "grad_norm": 0.8012109994888306, "learning_rate": 8.099417613024531e-06, "loss": 0.03130698576569557, "memory(GiB)": 21.48, "step": 9938, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.32287301432608906, "grad_norm": 0.5190257430076599, "learning_rate": 8.098996094542096e-06, "loss": 0.027865616604685783, "memory(GiB)": 21.48, "step": 9939, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.3229054997888445, "grad_norm": 0.3689897358417511, "learning_rate": 8.098574540293123e-06, "loss": 0.028723785653710365, "memory(GiB)": 21.48, "step": 9940, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957 }, { "epoch": 0.3229379852515999, "grad_norm": 0.9027321934700012, "learning_rate": 8.098152950282474e-06, "loss": 0.025737229734659195, "memory(GiB)": 21.48, "step": 9941, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957014 }, { "epoch": 0.3229704707143553, "grad_norm": 0.4770963788032532, "learning_rate": 8.097731324515016e-06, "loss": 0.02837328054010868, "memory(GiB)": 21.48, "step": 9942, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.957026 }, { "epoch": 0.3230029561771107, "grad_norm": 0.5742515325546265, "learning_rate": 8.097309662995616e-06, "loss": 0.028155187144875526, "memory(GiB)": 21.48, "step": 9943, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.32303544163986614, "grad_norm": 0.6612903475761414, "learning_rate": 8.09688796572914e-06, "loss": 0.034657374024391174, "memory(GiB)": 21.48, "step": 9944, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.32306792710262155, "grad_norm": 0.3572060167789459, "learning_rate": 8.096466232720454e-06, "loss": 0.02994396910071373, "memory(GiB)": 21.48, "step": 9945, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.32310041256537697, "grad_norm": 0.6970921158790588, "learning_rate": 8.096044463974429e-06, "loss": 0.03676750883460045, "memory(GiB)": 21.48, "step": 9946, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.3231328980281324, "grad_norm": 0.43787217140197754, "learning_rate": 8.095622659495927e-06, "loss": 0.03130539506673813, "memory(GiB)": 21.48, "step": 9947, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.3231653834908878, "grad_norm": 0.3774779736995697, "learning_rate": 8.09520081928982e-06, "loss": 0.025304503738880157, "memory(GiB)": 21.48, "step": 9948, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957129 }, { "epoch": 0.3231978689536432, "grad_norm": 0.6060919165611267, "learning_rate": 8.094778943360977e-06, "loss": 0.022778570652008057, "memory(GiB)": 21.48, "step": 9949, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957148 }, { "epoch": 0.32323035441639864, "grad_norm": 0.3419344127178192, "learning_rate": 8.094357031714264e-06, "loss": 0.026320364326238632, "memory(GiB)": 21.48, "step": 9950, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.32326283987915405, "grad_norm": 1.5102839469909668, "learning_rate": 8.093935084354554e-06, "loss": 0.03089657425880432, "memory(GiB)": 21.48, "step": 9951, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957177 }, { "epoch": 0.3232953253419095, "grad_norm": 0.6384379863739014, "learning_rate": 8.093513101286714e-06, "loss": 0.03626260161399841, "memory(GiB)": 21.48, "step": 9952, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957191 }, { "epoch": 0.32332781080466494, "grad_norm": 0.44301819801330566, "learning_rate": 8.093091082515616e-06, "loss": 0.03640066832304001, "memory(GiB)": 21.48, "step": 9953, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957206 }, { "epoch": 0.32336029626742036, "grad_norm": 0.5053354501724243, "learning_rate": 8.09266902804613e-06, "loss": 0.03365842625498772, "memory(GiB)": 21.48, "step": 9954, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.32339278173017577, "grad_norm": 0.2712515592575073, "learning_rate": 8.092246937883126e-06, "loss": 0.022518988698720932, "memory(GiB)": 21.48, "step": 9955, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957236 }, { "epoch": 0.3234252671929312, "grad_norm": 0.5756895542144775, "learning_rate": 8.091824812031477e-06, "loss": 0.03160662204027176, "memory(GiB)": 21.48, "step": 9956, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.957252 }, { "epoch": 0.3234577526556866, "grad_norm": 0.3212936818599701, "learning_rate": 8.091402650496056e-06, "loss": 0.026330068707466125, "memory(GiB)": 21.48, "step": 9957, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957267 }, { "epoch": 0.323490238118442, "grad_norm": 0.3521415889263153, "learning_rate": 8.090980453281733e-06, "loss": 0.025727521628141403, "memory(GiB)": 21.48, "step": 9958, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.95728 }, { "epoch": 0.32352272358119744, "grad_norm": 0.406965970993042, "learning_rate": 8.09055822039338e-06, "loss": 0.028367727994918823, "memory(GiB)": 21.48, "step": 9959, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.957294 }, { "epoch": 0.32355520904395285, "grad_norm": 0.33506226539611816, "learning_rate": 8.090135951835873e-06, "loss": 0.03524432331323624, "memory(GiB)": 21.48, "step": 9960, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.32358769450670827, "grad_norm": 0.36671143770217896, "learning_rate": 8.089713647614085e-06, "loss": 0.022168578580021858, "memory(GiB)": 21.48, "step": 9961, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957325 }, { "epoch": 0.3236201799694637, "grad_norm": 0.4558671712875366, "learning_rate": 8.089291307732888e-06, "loss": 0.031586140394210815, "memory(GiB)": 21.48, "step": 9962, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.3236526654322191, "grad_norm": 0.43277475237846375, "learning_rate": 8.088868932197159e-06, "loss": 0.02284472994506359, "memory(GiB)": 21.48, "step": 9963, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.3236851508949745, "grad_norm": 0.384355366230011, "learning_rate": 8.088446521011768e-06, "loss": 0.02867656759917736, "memory(GiB)": 21.48, "step": 9964, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.32371763635772993, "grad_norm": 0.3594668209552765, "learning_rate": 8.088024074181596e-06, "loss": 0.029204268008470535, "memory(GiB)": 21.48, "step": 9965, "token_acc": 0.996, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.32375012182048535, "grad_norm": 0.34534651041030884, "learning_rate": 8.087601591711514e-06, "loss": 0.025525588542222977, "memory(GiB)": 21.48, "step": 9966, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.32378260728324076, "grad_norm": 0.49869564175605774, "learning_rate": 8.087179073606401e-06, "loss": 0.03117934614419937, "memory(GiB)": 21.48, "step": 9967, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.957413 }, { "epoch": 0.3238150927459962, "grad_norm": 0.39805737137794495, "learning_rate": 8.086756519871133e-06, "loss": 0.034675151109695435, "memory(GiB)": 21.48, "step": 9968, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.3238475782087516, "grad_norm": 0.3644495904445648, "learning_rate": 8.086333930510586e-06, "loss": 0.020846135914325714, "memory(GiB)": 21.48, "step": 9969, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.323880063671507, "grad_norm": 0.5969082713127136, "learning_rate": 8.085911305529638e-06, "loss": 0.03345348313450813, "memory(GiB)": 21.48, "step": 9970, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957465 }, { "epoch": 0.32391254913426243, "grad_norm": 0.49828824400901794, "learning_rate": 8.085488644933166e-06, "loss": 0.030091581866145134, "memory(GiB)": 21.48, "step": 9971, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957484 }, { "epoch": 0.32394503459701784, "grad_norm": 0.5194729566574097, "learning_rate": 8.085065948726047e-06, "loss": 0.03098166733980179, "memory(GiB)": 21.48, "step": 9972, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.32397752005977326, "grad_norm": 0.3885248899459839, "learning_rate": 8.084643216913164e-06, "loss": 0.03247877582907677, "memory(GiB)": 21.48, "step": 9973, "token_acc": 0.9867986798679867, "train_speed(iter/s)": 0.957525 }, { "epoch": 0.3240100055225287, "grad_norm": 0.624809980392456, "learning_rate": 8.08422044949939e-06, "loss": 0.0291790459305048, "memory(GiB)": 21.48, "step": 9974, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957545 }, { "epoch": 0.3240424909852841, "grad_norm": 0.49879586696624756, "learning_rate": 8.083797646489607e-06, "loss": 0.030480848625302315, "memory(GiB)": 21.48, "step": 9975, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.3240749764480395, "grad_norm": 1.2694288492202759, "learning_rate": 8.083374807888695e-06, "loss": 0.044326819479465485, "memory(GiB)": 21.48, "step": 9976, "token_acc": 0.985, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.3241074619107949, "grad_norm": 0.4345356822013855, "learning_rate": 8.082951933701534e-06, "loss": 0.03284135460853577, "memory(GiB)": 21.48, "step": 9977, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.32413994737355034, "grad_norm": 0.3433155119419098, "learning_rate": 8.082529023933003e-06, "loss": 0.0311041921377182, "memory(GiB)": 21.48, "step": 9978, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957624 }, { "epoch": 0.32417243283630576, "grad_norm": 0.47821706533432007, "learning_rate": 8.082106078587986e-06, "loss": 0.0269741490483284, "memory(GiB)": 21.48, "step": 9979, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.3242049182990612, "grad_norm": 0.44246336817741394, "learning_rate": 8.081683097671362e-06, "loss": 0.03380604833364487, "memory(GiB)": 21.48, "step": 9980, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.3242374037618166, "grad_norm": 0.3767743706703186, "learning_rate": 8.081260081188012e-06, "loss": 0.028091754764318466, "memory(GiB)": 21.48, "step": 9981, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.324269889224572, "grad_norm": 0.5194573402404785, "learning_rate": 8.080837029142821e-06, "loss": 0.0334988534450531, "memory(GiB)": 21.48, "step": 9982, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.3243023746873274, "grad_norm": 0.5754404664039612, "learning_rate": 8.08041394154067e-06, "loss": 0.03592299297451973, "memory(GiB)": 21.48, "step": 9983, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.32433486015008284, "grad_norm": 0.3608699440956116, "learning_rate": 8.079990818386442e-06, "loss": 0.02614063024520874, "memory(GiB)": 21.48, "step": 9984, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.32436734561283825, "grad_norm": 0.36423516273498535, "learning_rate": 8.079567659685019e-06, "loss": 0.02512838877737522, "memory(GiB)": 21.48, "step": 9985, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957758 }, { "epoch": 0.32439983107559367, "grad_norm": 0.5736536383628845, "learning_rate": 8.079144465441288e-06, "loss": 0.041458193212747574, "memory(GiB)": 21.48, "step": 9986, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.3244323165383491, "grad_norm": 0.6426624655723572, "learning_rate": 8.078721235660131e-06, "loss": 0.031395815312862396, "memory(GiB)": 21.48, "step": 9987, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.95779 }, { "epoch": 0.3244648020011045, "grad_norm": 0.4753882586956024, "learning_rate": 8.078297970346435e-06, "loss": 0.03451603278517723, "memory(GiB)": 21.48, "step": 9988, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.3244972874638599, "grad_norm": 0.5391085147857666, "learning_rate": 8.077874669505082e-06, "loss": 0.03034297376871109, "memory(GiB)": 21.48, "step": 9989, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.957822 }, { "epoch": 0.32452977292661533, "grad_norm": 0.3952469527721405, "learning_rate": 8.077451333140958e-06, "loss": 0.020633123815059662, "memory(GiB)": 21.48, "step": 9990, "token_acc": 1.0, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.32456225838937075, "grad_norm": 0.5519003868103027, "learning_rate": 8.07702796125895e-06, "loss": 0.04324038326740265, "memory(GiB)": 21.48, "step": 9991, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.32459474385212617, "grad_norm": 1.113969326019287, "learning_rate": 8.076604553863942e-06, "loss": 0.026617705821990967, "memory(GiB)": 21.48, "step": 9992, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.3246272293148816, "grad_norm": 0.4713553488254547, "learning_rate": 8.076181110960825e-06, "loss": 0.02126966416835785, "memory(GiB)": 21.48, "step": 9993, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957884 }, { "epoch": 0.324659714777637, "grad_norm": 0.3038505017757416, "learning_rate": 8.075757632554483e-06, "loss": 0.025322366505861282, "memory(GiB)": 21.48, "step": 9994, "token_acc": 0.9942857142857143, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.3246922002403924, "grad_norm": 0.5136074423789978, "learning_rate": 8.075334118649804e-06, "loss": 0.03148685768246651, "memory(GiB)": 21.48, "step": 9995, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957915 }, { "epoch": 0.32472468570314783, "grad_norm": 0.3117813169956207, "learning_rate": 8.074910569251676e-06, "loss": 0.023304563015699387, "memory(GiB)": 21.48, "step": 9996, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.32475717116590325, "grad_norm": 0.4372539818286896, "learning_rate": 8.074486984364986e-06, "loss": 0.03053920529782772, "memory(GiB)": 21.48, "step": 9997, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957947 }, { "epoch": 0.32478965662865866, "grad_norm": 0.4023326337337494, "learning_rate": 8.074063363994625e-06, "loss": 0.031007908284664154, "memory(GiB)": 21.48, "step": 9998, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.957964 }, { "epoch": 0.3248221420914141, "grad_norm": 0.3150951862335205, "learning_rate": 8.073639708145482e-06, "loss": 0.020444996654987335, "memory(GiB)": 21.48, "step": 9999, "token_acc": 1.0, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.3248546275541695, "grad_norm": 0.3171684741973877, "learning_rate": 8.073216016822445e-06, "loss": 0.02658364549279213, "memory(GiB)": 21.48, "step": 10000, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957995 }, { "epoch": 0.3248546275541695, "eval_loss": 0.030256494879722595, "eval_runtime": 80.9827, "eval_samples_per_second": 122.866, "eval_steps_per_second": 3.84, "eval_token_acc": 0.9882327098746047, "step": 10000 }, { "epoch": 0.3248871130169249, "grad_norm": 0.2613978981971741, "learning_rate": 8.072792290030405e-06, "loss": 0.019371893256902695, "memory(GiB)": 21.48, "step": 10001, "token_acc": 0.9878481498225857, "train_speed(iter/s)": 0.949669 }, { "epoch": 0.3249195984796803, "grad_norm": 0.3667328953742981, "learning_rate": 8.072368527774252e-06, "loss": 0.028897244483232498, "memory(GiB)": 21.48, "step": 10002, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.949682 }, { "epoch": 0.32495208394243574, "grad_norm": 0.3800068497657776, "learning_rate": 8.071944730058876e-06, "loss": 0.02935890480875969, "memory(GiB)": 21.48, "step": 10003, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.949697 }, { "epoch": 0.32498456940519116, "grad_norm": 0.3157484531402588, "learning_rate": 8.071520896889171e-06, "loss": 0.025720560923218727, "memory(GiB)": 21.48, "step": 10004, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.94971 }, { "epoch": 0.3250170548679466, "grad_norm": 0.4129450023174286, "learning_rate": 8.071097028270026e-06, "loss": 0.030223535373806953, "memory(GiB)": 21.48, "step": 10005, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.949722 }, { "epoch": 0.325049540330702, "grad_norm": 0.3472897708415985, "learning_rate": 8.070673124206333e-06, "loss": 0.0199049711227417, "memory(GiB)": 21.48, "step": 10006, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.949737 }, { "epoch": 0.3250820257934574, "grad_norm": 0.3476102948188782, "learning_rate": 8.070249184702985e-06, "loss": 0.029105231165885925, "memory(GiB)": 21.48, "step": 10007, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.949752 }, { "epoch": 0.3251145112562128, "grad_norm": 0.5548304915428162, "learning_rate": 8.069825209764877e-06, "loss": 0.029268348589539528, "memory(GiB)": 21.48, "step": 10008, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.949767 }, { "epoch": 0.32514699671896824, "grad_norm": 0.41625455021858215, "learning_rate": 8.069401199396898e-06, "loss": 0.020504437386989594, "memory(GiB)": 21.48, "step": 10009, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.94978 }, { "epoch": 0.32517948218172366, "grad_norm": 0.42214375734329224, "learning_rate": 8.068977153603944e-06, "loss": 0.03657679259777069, "memory(GiB)": 21.48, "step": 10010, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.949794 }, { "epoch": 0.32521196764447907, "grad_norm": 0.4831744134426117, "learning_rate": 8.06855307239091e-06, "loss": 0.029896847903728485, "memory(GiB)": 21.48, "step": 10011, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.949808 }, { "epoch": 0.3252444531072345, "grad_norm": 0.32464081048965454, "learning_rate": 8.068128955762687e-06, "loss": 0.017547812312841415, "memory(GiB)": 21.48, "step": 10012, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.949822 }, { "epoch": 0.3252769385699899, "grad_norm": 0.477226585149765, "learning_rate": 8.067704803724175e-06, "loss": 0.026964392513036728, "memory(GiB)": 21.48, "step": 10013, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.949835 }, { "epoch": 0.3253094240327453, "grad_norm": 0.48007500171661377, "learning_rate": 8.067280616280263e-06, "loss": 0.03000831790268421, "memory(GiB)": 21.48, "step": 10014, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.949848 }, { "epoch": 0.32534190949550074, "grad_norm": 0.42313531041145325, "learning_rate": 8.066856393435854e-06, "loss": 0.024240516126155853, "memory(GiB)": 21.48, "step": 10015, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.949864 }, { "epoch": 0.3253743949582562, "grad_norm": 0.4397059381008148, "learning_rate": 8.066432135195837e-06, "loss": 0.02685669995844364, "memory(GiB)": 21.48, "step": 10016, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.949881 }, { "epoch": 0.3254068804210116, "grad_norm": 0.5386457443237305, "learning_rate": 8.066007841565114e-06, "loss": 0.04486976936459541, "memory(GiB)": 21.48, "step": 10017, "token_acc": 0.9875, "train_speed(iter/s)": 0.9499 }, { "epoch": 0.32543936588376704, "grad_norm": 0.6938819289207458, "learning_rate": 8.065583512548579e-06, "loss": 0.030747447162866592, "memory(GiB)": 21.48, "step": 10018, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.949921 }, { "epoch": 0.32547185134652246, "grad_norm": 0.4243863523006439, "learning_rate": 8.06515914815113e-06, "loss": 0.030437717214226723, "memory(GiB)": 21.48, "step": 10019, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.949942 }, { "epoch": 0.32550433680927787, "grad_norm": 0.5104816555976868, "learning_rate": 8.064734748377664e-06, "loss": 0.031769976019859314, "memory(GiB)": 21.48, "step": 10020, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.949963 }, { "epoch": 0.3255368222720333, "grad_norm": 0.3998767137527466, "learning_rate": 8.06431031323308e-06, "loss": 0.0294533371925354, "memory(GiB)": 21.48, "step": 10021, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.949983 }, { "epoch": 0.3255693077347887, "grad_norm": 0.40912190079689026, "learning_rate": 8.063885842722276e-06, "loss": 0.02298206090927124, "memory(GiB)": 21.48, "step": 10022, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.950004 }, { "epoch": 0.3256017931975441, "grad_norm": 0.37879306077957153, "learning_rate": 8.063461336850153e-06, "loss": 0.026201345026493073, "memory(GiB)": 21.48, "step": 10023, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.950024 }, { "epoch": 0.32563427866029954, "grad_norm": 0.6895400881767273, "learning_rate": 8.063036795621609e-06, "loss": 0.029276052489876747, "memory(GiB)": 21.48, "step": 10024, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.950045 }, { "epoch": 0.32566676412305495, "grad_norm": 0.3087156414985657, "learning_rate": 8.062612219041542e-06, "loss": 0.020992878824472427, "memory(GiB)": 21.48, "step": 10025, "token_acc": 1.0, "train_speed(iter/s)": 0.950067 }, { "epoch": 0.32569924958581037, "grad_norm": 0.47097277641296387, "learning_rate": 8.062187607114853e-06, "loss": 0.027740981429815292, "memory(GiB)": 21.48, "step": 10026, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.950086 }, { "epoch": 0.3257317350485658, "grad_norm": 0.5936729311943054, "learning_rate": 8.061762959846444e-06, "loss": 0.040045060217380524, "memory(GiB)": 21.48, "step": 10027, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.950107 }, { "epoch": 0.3257642205113212, "grad_norm": 0.4015229046344757, "learning_rate": 8.061338277241216e-06, "loss": 0.03041246347129345, "memory(GiB)": 21.48, "step": 10028, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.950128 }, { "epoch": 0.3257967059740766, "grad_norm": 0.3757028877735138, "learning_rate": 8.06091355930407e-06, "loss": 0.021567583084106445, "memory(GiB)": 21.48, "step": 10029, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95015 }, { "epoch": 0.32582919143683203, "grad_norm": 0.6002678275108337, "learning_rate": 8.060488806039906e-06, "loss": 0.033455319702625275, "memory(GiB)": 21.48, "step": 10030, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.95017 }, { "epoch": 0.32586167689958745, "grad_norm": 0.43223893642425537, "learning_rate": 8.060064017453628e-06, "loss": 0.03045252338051796, "memory(GiB)": 21.48, "step": 10031, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.950191 }, { "epoch": 0.32589416236234287, "grad_norm": 0.7176334261894226, "learning_rate": 8.059639193550139e-06, "loss": 0.027070114389061928, "memory(GiB)": 21.48, "step": 10032, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.950212 }, { "epoch": 0.3259266478250983, "grad_norm": 0.3804642856121063, "learning_rate": 8.05921433433434e-06, "loss": 0.030085459351539612, "memory(GiB)": 21.48, "step": 10033, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.950231 }, { "epoch": 0.3259591332878537, "grad_norm": 0.36962929368019104, "learning_rate": 8.058789439811136e-06, "loss": 0.02618090808391571, "memory(GiB)": 21.48, "step": 10034, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95025 }, { "epoch": 0.3259916187506091, "grad_norm": 0.47029340267181396, "learning_rate": 8.058364509985433e-06, "loss": 0.02950439602136612, "memory(GiB)": 21.48, "step": 10035, "token_acc": 1.0, "train_speed(iter/s)": 0.950271 }, { "epoch": 0.32602410421336453, "grad_norm": 0.5307018756866455, "learning_rate": 8.05793954486213e-06, "loss": 0.04249339550733566, "memory(GiB)": 21.48, "step": 10036, "token_acc": 0.972, "train_speed(iter/s)": 0.950291 }, { "epoch": 0.32605658967611995, "grad_norm": 0.5470523834228516, "learning_rate": 8.057514544446136e-06, "loss": 0.036393530666828156, "memory(GiB)": 21.48, "step": 10037, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.950312 }, { "epoch": 0.32608907513887536, "grad_norm": 0.3772933781147003, "learning_rate": 8.057089508742355e-06, "loss": 0.025202173739671707, "memory(GiB)": 21.48, "step": 10038, "token_acc": 1.0, "train_speed(iter/s)": 0.950331 }, { "epoch": 0.3261215606016308, "grad_norm": 0.38067030906677246, "learning_rate": 8.056664437755692e-06, "loss": 0.024190586060285568, "memory(GiB)": 21.48, "step": 10039, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.950352 }, { "epoch": 0.3261540460643862, "grad_norm": 0.44639748334884644, "learning_rate": 8.056239331491051e-06, "loss": 0.03014206513762474, "memory(GiB)": 21.48, "step": 10040, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.950373 }, { "epoch": 0.3261865315271416, "grad_norm": 0.4437883794307709, "learning_rate": 8.055814189953342e-06, "loss": 0.027605608105659485, "memory(GiB)": 21.48, "step": 10041, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.950394 }, { "epoch": 0.326219016989897, "grad_norm": 0.44586536288261414, "learning_rate": 8.05538901314747e-06, "loss": 0.02687448263168335, "memory(GiB)": 21.48, "step": 10042, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.950415 }, { "epoch": 0.32625150245265244, "grad_norm": 0.34100836515426636, "learning_rate": 8.054963801078342e-06, "loss": 0.029947832226753235, "memory(GiB)": 21.48, "step": 10043, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950435 }, { "epoch": 0.32628398791540786, "grad_norm": 0.34567439556121826, "learning_rate": 8.054538553750865e-06, "loss": 0.024654043838381767, "memory(GiB)": 21.48, "step": 10044, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.950456 }, { "epoch": 0.3263164733781633, "grad_norm": 0.5464537143707275, "learning_rate": 8.054113271169948e-06, "loss": 0.025513898581266403, "memory(GiB)": 21.48, "step": 10045, "token_acc": 1.0, "train_speed(iter/s)": 0.950476 }, { "epoch": 0.3263489588409187, "grad_norm": 0.44134077429771423, "learning_rate": 8.053687953340498e-06, "loss": 0.03956976532936096, "memory(GiB)": 21.48, "step": 10046, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.950497 }, { "epoch": 0.3263814443036741, "grad_norm": 0.3929961025714874, "learning_rate": 8.053262600267426e-06, "loss": 0.027002256363630295, "memory(GiB)": 21.48, "step": 10047, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.950518 }, { "epoch": 0.3264139297664295, "grad_norm": 0.4212287664413452, "learning_rate": 8.05283721195564e-06, "loss": 0.03111334890127182, "memory(GiB)": 21.48, "step": 10048, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.950538 }, { "epoch": 0.32644641522918494, "grad_norm": 0.4359557330608368, "learning_rate": 8.052411788410047e-06, "loss": 0.029412884265184402, "memory(GiB)": 21.48, "step": 10049, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.950557 }, { "epoch": 0.32647890069194035, "grad_norm": 0.5429575443267822, "learning_rate": 8.05198632963556e-06, "loss": 0.03175903856754303, "memory(GiB)": 21.48, "step": 10050, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.950578 }, { "epoch": 0.32651138615469577, "grad_norm": 0.3530282974243164, "learning_rate": 8.05156083563709e-06, "loss": 0.025613747537136078, "memory(GiB)": 21.48, "step": 10051, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.950599 }, { "epoch": 0.3265438716174512, "grad_norm": 0.3864651322364807, "learning_rate": 8.051135306419545e-06, "loss": 0.021110696718096733, "memory(GiB)": 21.48, "step": 10052, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.950619 }, { "epoch": 0.3265763570802066, "grad_norm": 0.39571768045425415, "learning_rate": 8.050709741987838e-06, "loss": 0.030231695622205734, "memory(GiB)": 21.48, "step": 10053, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.950639 }, { "epoch": 0.326608842542962, "grad_norm": 0.7935106754302979, "learning_rate": 8.05028414234688e-06, "loss": 0.0575902983546257, "memory(GiB)": 21.48, "step": 10054, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.950659 }, { "epoch": 0.32664132800571744, "grad_norm": 2.068131923675537, "learning_rate": 8.049858507501582e-06, "loss": 0.041253961622714996, "memory(GiB)": 21.48, "step": 10055, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.950677 }, { "epoch": 0.32667381346847285, "grad_norm": 0.4588269293308258, "learning_rate": 8.049432837456859e-06, "loss": 0.02608121931552887, "memory(GiB)": 21.48, "step": 10056, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.950694 }, { "epoch": 0.32670629893122827, "grad_norm": 0.7054109573364258, "learning_rate": 8.049007132217623e-06, "loss": 0.03888777643442154, "memory(GiB)": 21.48, "step": 10057, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.95071 }, { "epoch": 0.3267387843939837, "grad_norm": 0.6617637872695923, "learning_rate": 8.048581391788785e-06, "loss": 0.03756193444132805, "memory(GiB)": 21.48, "step": 10058, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.950724 }, { "epoch": 0.3267712698567391, "grad_norm": 0.5126659274101257, "learning_rate": 8.04815561617526e-06, "loss": 0.029033344238996506, "memory(GiB)": 21.48, "step": 10059, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.950736 }, { "epoch": 0.3268037553194945, "grad_norm": 0.4604129493236542, "learning_rate": 8.047729805381962e-06, "loss": 0.026710093021392822, "memory(GiB)": 21.48, "step": 10060, "token_acc": 1.0, "train_speed(iter/s)": 0.95075 }, { "epoch": 0.32683624078224993, "grad_norm": 0.36237016320228577, "learning_rate": 8.047303959413806e-06, "loss": 0.029443494975566864, "memory(GiB)": 21.48, "step": 10061, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.950765 }, { "epoch": 0.32686872624500535, "grad_norm": 0.3900322914123535, "learning_rate": 8.046878078275706e-06, "loss": 0.024437472224235535, "memory(GiB)": 21.48, "step": 10062, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.950778 }, { "epoch": 0.32690121170776076, "grad_norm": 0.4880862832069397, "learning_rate": 8.046452161972578e-06, "loss": 0.03300204128026962, "memory(GiB)": 21.48, "step": 10063, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.950793 }, { "epoch": 0.3269336971705162, "grad_norm": 0.47868236899375916, "learning_rate": 8.046026210509338e-06, "loss": 0.031936630606651306, "memory(GiB)": 21.48, "step": 10064, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.950808 }, { "epoch": 0.3269661826332716, "grad_norm": 1.524198293685913, "learning_rate": 8.0456002238909e-06, "loss": 0.035482995212078094, "memory(GiB)": 21.48, "step": 10065, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.950821 }, { "epoch": 0.326998668096027, "grad_norm": 0.3787189722061157, "learning_rate": 8.045174202122181e-06, "loss": 0.02976445108652115, "memory(GiB)": 21.48, "step": 10066, "token_acc": 0.9936908517350158, "train_speed(iter/s)": 0.950835 }, { "epoch": 0.32703115355878243, "grad_norm": 0.6505484580993652, "learning_rate": 8.0447481452081e-06, "loss": 0.02609899267554283, "memory(GiB)": 21.48, "step": 10067, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.950849 }, { "epoch": 0.32706363902153784, "grad_norm": 0.4331992566585541, "learning_rate": 8.044322053153573e-06, "loss": 0.0360441580414772, "memory(GiB)": 21.48, "step": 10068, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.950863 }, { "epoch": 0.32709612448429326, "grad_norm": 0.3991973102092743, "learning_rate": 8.043895925963517e-06, "loss": 0.02696661278605461, "memory(GiB)": 21.48, "step": 10069, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.950877 }, { "epoch": 0.3271286099470487, "grad_norm": 0.4110618829727173, "learning_rate": 8.04346976364285e-06, "loss": 0.030445970594882965, "memory(GiB)": 21.48, "step": 10070, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.950889 }, { "epoch": 0.3271610954098041, "grad_norm": 0.4670374393463135, "learning_rate": 8.043043566196494e-06, "loss": 0.0355309396982193, "memory(GiB)": 21.48, "step": 10071, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.950899 }, { "epoch": 0.3271935808725595, "grad_norm": 0.45984500646591187, "learning_rate": 8.04261733362936e-06, "loss": 0.02176794968545437, "memory(GiB)": 21.48, "step": 10072, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.950914 }, { "epoch": 0.3272260663353149, "grad_norm": 0.3540344536304474, "learning_rate": 8.042191065946376e-06, "loss": 0.02691083960235119, "memory(GiB)": 21.48, "step": 10073, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.950928 }, { "epoch": 0.32725855179807034, "grad_norm": 0.4450633227825165, "learning_rate": 8.041764763152458e-06, "loss": 0.033921968191862106, "memory(GiB)": 21.48, "step": 10074, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.950942 }, { "epoch": 0.32729103726082576, "grad_norm": 0.34024709463119507, "learning_rate": 8.041338425252525e-06, "loss": 0.023317718878388405, "memory(GiB)": 21.48, "step": 10075, "token_acc": 1.0, "train_speed(iter/s)": 0.950955 }, { "epoch": 0.3273235227235812, "grad_norm": 0.32277002930641174, "learning_rate": 8.040912052251496e-06, "loss": 0.02474392205476761, "memory(GiB)": 21.48, "step": 10076, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.950969 }, { "epoch": 0.3273560081863366, "grad_norm": 0.48324379324913025, "learning_rate": 8.040485644154298e-06, "loss": 0.0347944051027298, "memory(GiB)": 21.48, "step": 10077, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.950984 }, { "epoch": 0.327388493649092, "grad_norm": 0.3080741763114929, "learning_rate": 8.040059200965849e-06, "loss": 0.022601677104830742, "memory(GiB)": 21.48, "step": 10078, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.951001 }, { "epoch": 0.3274209791118474, "grad_norm": 0.37046101689338684, "learning_rate": 8.039632722691068e-06, "loss": 0.03181663900613785, "memory(GiB)": 21.48, "step": 10079, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.951018 }, { "epoch": 0.3274534645746029, "grad_norm": 0.4527333676815033, "learning_rate": 8.039206209334883e-06, "loss": 0.02631952613592148, "memory(GiB)": 21.48, "step": 10080, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.951035 }, { "epoch": 0.3274859500373583, "grad_norm": 0.631578266620636, "learning_rate": 8.03877966090221e-06, "loss": 0.041659265756607056, "memory(GiB)": 21.48, "step": 10081, "token_acc": 0.9693486590038314, "train_speed(iter/s)": 0.951053 }, { "epoch": 0.3275184355001137, "grad_norm": 0.29668325185775757, "learning_rate": 8.038353077397977e-06, "loss": 0.026756776496767998, "memory(GiB)": 21.48, "step": 10082, "token_acc": 0.979381443298969, "train_speed(iter/s)": 0.951074 }, { "epoch": 0.32755092096286914, "grad_norm": 0.4440913796424866, "learning_rate": 8.037926458827105e-06, "loss": 0.021244902163743973, "memory(GiB)": 21.48, "step": 10083, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.951067 }, { "epoch": 0.32758340642562456, "grad_norm": 0.369294136762619, "learning_rate": 8.037499805194517e-06, "loss": 0.02914787083864212, "memory(GiB)": 21.48, "step": 10084, "token_acc": 0.9865319865319865, "train_speed(iter/s)": 0.951087 }, { "epoch": 0.32761589188838, "grad_norm": 0.5706946849822998, "learning_rate": 8.037073116505139e-06, "loss": 0.03587549924850464, "memory(GiB)": 21.48, "step": 10085, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.951108 }, { "epoch": 0.3276483773511354, "grad_norm": 0.5909658074378967, "learning_rate": 8.036646392763897e-06, "loss": 0.030818277969956398, "memory(GiB)": 21.48, "step": 10086, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951128 }, { "epoch": 0.3276808628138908, "grad_norm": 0.4537467658519745, "learning_rate": 8.03621963397571e-06, "loss": 0.030298452824354172, "memory(GiB)": 21.48, "step": 10087, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.951148 }, { "epoch": 0.3277133482766462, "grad_norm": 0.35579702258110046, "learning_rate": 8.03579284014551e-06, "loss": 0.02062634751200676, "memory(GiB)": 21.48, "step": 10088, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.951167 }, { "epoch": 0.32774583373940164, "grad_norm": 0.3663405478000641, "learning_rate": 8.035366011278218e-06, "loss": 0.02264397032558918, "memory(GiB)": 21.48, "step": 10089, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.951186 }, { "epoch": 0.32777831920215705, "grad_norm": 0.5259213447570801, "learning_rate": 8.034939147378766e-06, "loss": 0.02878226898610592, "memory(GiB)": 21.48, "step": 10090, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.951204 }, { "epoch": 0.32781080466491247, "grad_norm": 0.3215564787387848, "learning_rate": 8.034512248452072e-06, "loss": 0.02516436018049717, "memory(GiB)": 21.48, "step": 10091, "token_acc": 1.0, "train_speed(iter/s)": 0.951223 }, { "epoch": 0.3278432901276679, "grad_norm": 0.3488664925098419, "learning_rate": 8.034085314503071e-06, "loss": 0.02615690603852272, "memory(GiB)": 21.48, "step": 10092, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.951242 }, { "epoch": 0.3278757755904233, "grad_norm": 0.38652777671813965, "learning_rate": 8.033658345536685e-06, "loss": 0.028551239520311356, "memory(GiB)": 21.48, "step": 10093, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.951261 }, { "epoch": 0.3279082610531787, "grad_norm": 0.3982064425945282, "learning_rate": 8.033231341557846e-06, "loss": 0.02520499750971794, "memory(GiB)": 21.48, "step": 10094, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95128 }, { "epoch": 0.32794074651593413, "grad_norm": 0.27949172258377075, "learning_rate": 8.032804302571478e-06, "loss": 0.026940537616610527, "memory(GiB)": 21.48, "step": 10095, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.951299 }, { "epoch": 0.32797323197868955, "grad_norm": 0.5735905766487122, "learning_rate": 8.032377228582513e-06, "loss": 0.03355616703629494, "memory(GiB)": 21.48, "step": 10096, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.951317 }, { "epoch": 0.32800571744144497, "grad_norm": 0.44953641295433044, "learning_rate": 8.031950119595879e-06, "loss": 0.03337319195270538, "memory(GiB)": 21.48, "step": 10097, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.951336 }, { "epoch": 0.3280382029042004, "grad_norm": 0.5426738858222961, "learning_rate": 8.031522975616504e-06, "loss": 0.032199934124946594, "memory(GiB)": 21.48, "step": 10098, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 0.951354 }, { "epoch": 0.3280706883669558, "grad_norm": 0.27374109625816345, "learning_rate": 8.03109579664932e-06, "loss": 0.02363809384405613, "memory(GiB)": 21.48, "step": 10099, "token_acc": 1.0, "train_speed(iter/s)": 0.951367 }, { "epoch": 0.3281031738297112, "grad_norm": 0.36410757899284363, "learning_rate": 8.030668582699255e-06, "loss": 0.030553266406059265, "memory(GiB)": 21.48, "step": 10100, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.951384 }, { "epoch": 0.32813565929246663, "grad_norm": 0.38622230291366577, "learning_rate": 8.030241333771241e-06, "loss": 0.02915235236287117, "memory(GiB)": 21.48, "step": 10101, "token_acc": 1.0, "train_speed(iter/s)": 0.951402 }, { "epoch": 0.32816814475522205, "grad_norm": 0.330215185880661, "learning_rate": 8.029814049870209e-06, "loss": 0.026696976274251938, "memory(GiB)": 21.48, "step": 10102, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.951421 }, { "epoch": 0.32820063021797746, "grad_norm": 0.6109236478805542, "learning_rate": 8.02938673100109e-06, "loss": 0.034340597689151764, "memory(GiB)": 21.48, "step": 10103, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.95144 }, { "epoch": 0.3282331156807329, "grad_norm": 0.404526948928833, "learning_rate": 8.028959377168816e-06, "loss": 0.02315029874444008, "memory(GiB)": 21.48, "step": 10104, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.951458 }, { "epoch": 0.3282656011434883, "grad_norm": 1.3177601099014282, "learning_rate": 8.02853198837832e-06, "loss": 0.02673310972750187, "memory(GiB)": 21.48, "step": 10105, "token_acc": 1.0, "train_speed(iter/s)": 0.951475 }, { "epoch": 0.3282980866062437, "grad_norm": 0.44994640350341797, "learning_rate": 8.028104564634534e-06, "loss": 0.022265546023845673, "memory(GiB)": 21.48, "step": 10106, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.951494 }, { "epoch": 0.3283305720689991, "grad_norm": 0.496559739112854, "learning_rate": 8.027677105942388e-06, "loss": 0.03673771396279335, "memory(GiB)": 21.48, "step": 10107, "token_acc": 0.98046875, "train_speed(iter/s)": 0.951512 }, { "epoch": 0.32836305753175454, "grad_norm": 0.43190473318099976, "learning_rate": 8.027249612306822e-06, "loss": 0.03325825184583664, "memory(GiB)": 21.48, "step": 10108, "token_acc": 1.0, "train_speed(iter/s)": 0.951529 }, { "epoch": 0.32839554299450996, "grad_norm": 0.6836791038513184, "learning_rate": 8.026822083732765e-06, "loss": 0.028613757342100143, "memory(GiB)": 21.48, "step": 10109, "token_acc": 0.9707602339181286, "train_speed(iter/s)": 0.951547 }, { "epoch": 0.3284280284572654, "grad_norm": 0.34780701994895935, "learning_rate": 8.026394520225153e-06, "loss": 0.023083733394742012, "memory(GiB)": 21.48, "step": 10110, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.951566 }, { "epoch": 0.3284605139200208, "grad_norm": 0.4834097623825073, "learning_rate": 8.025966921788919e-06, "loss": 0.029667671769857407, "memory(GiB)": 21.48, "step": 10111, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.951584 }, { "epoch": 0.3284929993827762, "grad_norm": 0.4905932545661926, "learning_rate": 8.025539288429e-06, "loss": 0.03448028117418289, "memory(GiB)": 21.48, "step": 10112, "token_acc": 1.0, "train_speed(iter/s)": 0.951604 }, { "epoch": 0.3285254848455316, "grad_norm": 0.6547068953514099, "learning_rate": 8.02511162015033e-06, "loss": 0.03895902261137962, "memory(GiB)": 21.48, "step": 10113, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.951623 }, { "epoch": 0.32855797030828704, "grad_norm": 0.4505687654018402, "learning_rate": 8.024683916957844e-06, "loss": 0.03293188661336899, "memory(GiB)": 21.48, "step": 10114, "token_acc": 0.975, "train_speed(iter/s)": 0.951643 }, { "epoch": 0.32859045577104246, "grad_norm": 0.46591341495513916, "learning_rate": 8.024256178856482e-06, "loss": 0.031218713149428368, "memory(GiB)": 21.48, "step": 10115, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95166 }, { "epoch": 0.32862294123379787, "grad_norm": 0.355051726102829, "learning_rate": 8.023828405851179e-06, "loss": 0.02766009047627449, "memory(GiB)": 21.48, "step": 10116, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.951676 }, { "epoch": 0.3286554266965533, "grad_norm": 0.35042765736579895, "learning_rate": 8.023400597946872e-06, "loss": 0.02394649013876915, "memory(GiB)": 21.48, "step": 10117, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.951691 }, { "epoch": 0.3286879121593087, "grad_norm": 0.3670377731323242, "learning_rate": 8.022972755148495e-06, "loss": 0.02485927753150463, "memory(GiB)": 21.48, "step": 10118, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.951705 }, { "epoch": 0.3287203976220641, "grad_norm": 0.2816208600997925, "learning_rate": 8.02254487746099e-06, "loss": 0.024508707225322723, "memory(GiB)": 21.48, "step": 10119, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.951718 }, { "epoch": 0.32875288308481954, "grad_norm": 0.5129764676094055, "learning_rate": 8.022116964889296e-06, "loss": 0.0276978500187397, "memory(GiB)": 21.48, "step": 10120, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.951731 }, { "epoch": 0.32878536854757495, "grad_norm": 0.40434446930885315, "learning_rate": 8.021689017438346e-06, "loss": 0.023903781548142433, "memory(GiB)": 21.48, "step": 10121, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.951744 }, { "epoch": 0.32881785401033037, "grad_norm": 0.5248753428459167, "learning_rate": 8.021261035113086e-06, "loss": 0.028193462640047073, "memory(GiB)": 21.48, "step": 10122, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.951757 }, { "epoch": 0.3288503394730858, "grad_norm": 0.39031580090522766, "learning_rate": 8.02083301791845e-06, "loss": 0.02567656897008419, "memory(GiB)": 21.48, "step": 10123, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951771 }, { "epoch": 0.3288828249358412, "grad_norm": 0.45030099153518677, "learning_rate": 8.020404965859382e-06, "loss": 0.03771362453699112, "memory(GiB)": 21.48, "step": 10124, "token_acc": 0.973404255319149, "train_speed(iter/s)": 0.951783 }, { "epoch": 0.3289153103985966, "grad_norm": 0.536398708820343, "learning_rate": 8.019976878940821e-06, "loss": 0.04590459167957306, "memory(GiB)": 21.48, "step": 10125, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.951795 }, { "epoch": 0.32894779586135203, "grad_norm": 0.4518507719039917, "learning_rate": 8.019548757167706e-06, "loss": 0.02904558926820755, "memory(GiB)": 21.48, "step": 10126, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.951809 }, { "epoch": 0.32898028132410745, "grad_norm": 0.9905980825424194, "learning_rate": 8.01912060054498e-06, "loss": 0.034288156777620316, "memory(GiB)": 21.48, "step": 10127, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.951822 }, { "epoch": 0.32901276678686286, "grad_norm": 0.3760160803794861, "learning_rate": 8.018692409077584e-06, "loss": 0.030277002602815628, "memory(GiB)": 21.48, "step": 10128, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.951834 }, { "epoch": 0.3290452522496183, "grad_norm": 0.40006428956985474, "learning_rate": 8.01826418277046e-06, "loss": 0.02334406226873398, "memory(GiB)": 21.48, "step": 10129, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.951846 }, { "epoch": 0.3290777377123737, "grad_norm": 0.3587343096733093, "learning_rate": 8.017835921628547e-06, "loss": 0.025073500350117683, "memory(GiB)": 21.48, "step": 10130, "token_acc": 0.9921875, "train_speed(iter/s)": 0.951859 }, { "epoch": 0.3291102231751291, "grad_norm": 0.44025006890296936, "learning_rate": 8.017407625656793e-06, "loss": 0.027473479509353638, "memory(GiB)": 21.48, "step": 10131, "token_acc": 0.9809160305343512, "train_speed(iter/s)": 0.951872 }, { "epoch": 0.32914270863788453, "grad_norm": 0.4105129837989807, "learning_rate": 8.01697929486014e-06, "loss": 0.03257320076227188, "memory(GiB)": 21.48, "step": 10132, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.951884 }, { "epoch": 0.32917519410063995, "grad_norm": 0.37307873368263245, "learning_rate": 8.01655092924353e-06, "loss": 0.02979045733809471, "memory(GiB)": 21.48, "step": 10133, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.951897 }, { "epoch": 0.32920767956339536, "grad_norm": 0.47520944476127625, "learning_rate": 8.016122528811905e-06, "loss": 0.029929660260677338, "memory(GiB)": 21.48, "step": 10134, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.95191 }, { "epoch": 0.3292401650261508, "grad_norm": 0.46023866534233093, "learning_rate": 8.015694093570214e-06, "loss": 0.031196443364024162, "memory(GiB)": 21.48, "step": 10135, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.951924 }, { "epoch": 0.3292726504889062, "grad_norm": 0.5270617604255676, "learning_rate": 8.015265623523399e-06, "loss": 0.02693229727447033, "memory(GiB)": 21.48, "step": 10136, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.951938 }, { "epoch": 0.3293051359516616, "grad_norm": 0.3945518136024475, "learning_rate": 8.014837118676403e-06, "loss": 0.02697010338306427, "memory(GiB)": 21.48, "step": 10137, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.951953 }, { "epoch": 0.329337621414417, "grad_norm": 0.36792314052581787, "learning_rate": 8.014408579034177e-06, "loss": 0.028294391930103302, "memory(GiB)": 21.48, "step": 10138, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951966 }, { "epoch": 0.32937010687717244, "grad_norm": 0.37122881412506104, "learning_rate": 8.01398000460166e-06, "loss": 0.022669242694973946, "memory(GiB)": 21.48, "step": 10139, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.95198 }, { "epoch": 0.32940259233992786, "grad_norm": 0.3875056505203247, "learning_rate": 8.013551395383806e-06, "loss": 0.024939367547631264, "memory(GiB)": 21.48, "step": 10140, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.951996 }, { "epoch": 0.3294350778026833, "grad_norm": 0.5786216855049133, "learning_rate": 8.013122751385556e-06, "loss": 0.04425302892923355, "memory(GiB)": 21.48, "step": 10141, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.952013 }, { "epoch": 0.3294675632654387, "grad_norm": 0.4130276143550873, "learning_rate": 8.01269407261186e-06, "loss": 0.023875046521425247, "memory(GiB)": 21.48, "step": 10142, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.95203 }, { "epoch": 0.3295000487281941, "grad_norm": 0.42064428329467773, "learning_rate": 8.012265359067663e-06, "loss": 0.0359489843249321, "memory(GiB)": 21.48, "step": 10143, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.952049 }, { "epoch": 0.3295325341909496, "grad_norm": 0.3352641761302948, "learning_rate": 8.011836610757916e-06, "loss": 0.028666727244853973, "memory(GiB)": 21.48, "step": 10144, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.952066 }, { "epoch": 0.329565019653705, "grad_norm": 0.3820074200630188, "learning_rate": 8.011407827687565e-06, "loss": 0.03105270490050316, "memory(GiB)": 21.48, "step": 10145, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.952085 }, { "epoch": 0.3295975051164604, "grad_norm": 0.5826160311698914, "learning_rate": 8.01097900986156e-06, "loss": 0.03794161230325699, "memory(GiB)": 21.48, "step": 10146, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952104 }, { "epoch": 0.3296299905792158, "grad_norm": 0.3615303337574005, "learning_rate": 8.010550157284849e-06, "loss": 0.0310981385409832, "memory(GiB)": 21.48, "step": 10147, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.952125 }, { "epoch": 0.32966247604197124, "grad_norm": 0.558358907699585, "learning_rate": 8.010121269962382e-06, "loss": 0.02797781117260456, "memory(GiB)": 21.48, "step": 10148, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.952145 }, { "epoch": 0.32969496150472666, "grad_norm": 0.32523736357688904, "learning_rate": 8.009692347899112e-06, "loss": 0.0201270654797554, "memory(GiB)": 21.48, "step": 10149, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952165 }, { "epoch": 0.3297274469674821, "grad_norm": 0.3904837667942047, "learning_rate": 8.009263391099982e-06, "loss": 0.034453488886356354, "memory(GiB)": 21.48, "step": 10150, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952184 }, { "epoch": 0.3297599324302375, "grad_norm": 0.641438901424408, "learning_rate": 8.008834399569951e-06, "loss": 0.029476463794708252, "memory(GiB)": 21.48, "step": 10151, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.952203 }, { "epoch": 0.3297924178929929, "grad_norm": 0.4574827253818512, "learning_rate": 8.008405373313964e-06, "loss": 0.027786292135715485, "memory(GiB)": 21.48, "step": 10152, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.952222 }, { "epoch": 0.3298249033557483, "grad_norm": 0.5724259614944458, "learning_rate": 8.007976312336973e-06, "loss": 0.024932095780968666, "memory(GiB)": 21.48, "step": 10153, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.952243 }, { "epoch": 0.32985738881850374, "grad_norm": 0.4009682834148407, "learning_rate": 8.007547216643936e-06, "loss": 0.028725869953632355, "memory(GiB)": 21.48, "step": 10154, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.952261 }, { "epoch": 0.32988987428125915, "grad_norm": 0.6692596673965454, "learning_rate": 8.007118086239798e-06, "loss": 0.03811701759696007, "memory(GiB)": 21.48, "step": 10155, "token_acc": 0.98046875, "train_speed(iter/s)": 0.952281 }, { "epoch": 0.32992235974401457, "grad_norm": 0.3324235677719116, "learning_rate": 8.006688921129515e-06, "loss": 0.02177157998085022, "memory(GiB)": 21.48, "step": 10156, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.952298 }, { "epoch": 0.32995484520677, "grad_norm": 0.3096984922885895, "learning_rate": 8.006259721318041e-06, "loss": 0.02896920219063759, "memory(GiB)": 21.48, "step": 10157, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.952314 }, { "epoch": 0.3299873306695254, "grad_norm": 0.26207873225212097, "learning_rate": 8.005830486810329e-06, "loss": 0.02017202414572239, "memory(GiB)": 21.48, "step": 10158, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.952329 }, { "epoch": 0.3300198161322808, "grad_norm": 0.4219531714916229, "learning_rate": 8.005401217611333e-06, "loss": 0.029358595609664917, "memory(GiB)": 21.48, "step": 10159, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.952344 }, { "epoch": 0.33005230159503623, "grad_norm": 0.3707182705402374, "learning_rate": 8.004971913726005e-06, "loss": 0.023857129737734795, "memory(GiB)": 21.48, "step": 10160, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.952361 }, { "epoch": 0.33008478705779165, "grad_norm": 0.9175756573677063, "learning_rate": 8.004542575159302e-06, "loss": 0.0357109010219574, "memory(GiB)": 21.48, "step": 10161, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.952377 }, { "epoch": 0.33011727252054707, "grad_norm": 0.35879218578338623, "learning_rate": 8.004113201916179e-06, "loss": 0.025199055671691895, "memory(GiB)": 21.48, "step": 10162, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.952392 }, { "epoch": 0.3301497579833025, "grad_norm": 0.6552633047103882, "learning_rate": 8.003683794001591e-06, "loss": 0.041509270668029785, "memory(GiB)": 21.48, "step": 10163, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.952408 }, { "epoch": 0.3301822434460579, "grad_norm": 0.5760850310325623, "learning_rate": 8.003254351420493e-06, "loss": 0.03819813206791878, "memory(GiB)": 21.48, "step": 10164, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.952423 }, { "epoch": 0.3302147289088133, "grad_norm": 0.3388506770133972, "learning_rate": 8.002824874177844e-06, "loss": 0.0339219756424427, "memory(GiB)": 21.48, "step": 10165, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.952437 }, { "epoch": 0.33024721437156873, "grad_norm": 0.5427300930023193, "learning_rate": 8.002395362278598e-06, "loss": 0.04047999158501625, "memory(GiB)": 21.48, "step": 10166, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.952452 }, { "epoch": 0.33027969983432415, "grad_norm": 0.5646286606788635, "learning_rate": 8.001965815727714e-06, "loss": 0.03348172828555107, "memory(GiB)": 21.48, "step": 10167, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.952467 }, { "epoch": 0.33031218529707956, "grad_norm": 0.43181630969047546, "learning_rate": 8.00153623453015e-06, "loss": 0.031851258128881454, "memory(GiB)": 21.48, "step": 10168, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952484 }, { "epoch": 0.330344670759835, "grad_norm": 0.6177889108657837, "learning_rate": 8.00110661869086e-06, "loss": 0.021951787173748016, "memory(GiB)": 21.48, "step": 10169, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.9525 }, { "epoch": 0.3303771562225904, "grad_norm": 0.44730815291404724, "learning_rate": 8.000676968214807e-06, "loss": 0.03615545481443405, "memory(GiB)": 21.48, "step": 10170, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.952515 }, { "epoch": 0.3304096416853458, "grad_norm": 0.41972026228904724, "learning_rate": 8.000247283106948e-06, "loss": 0.03242050111293793, "memory(GiB)": 21.48, "step": 10171, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.952532 }, { "epoch": 0.33044212714810123, "grad_norm": 0.3261862099170685, "learning_rate": 7.999817563372243e-06, "loss": 0.023191239684820175, "memory(GiB)": 21.48, "step": 10172, "token_acc": 1.0, "train_speed(iter/s)": 0.952548 }, { "epoch": 0.33047461261085664, "grad_norm": 0.39539188146591187, "learning_rate": 7.999387809015648e-06, "loss": 0.023823462426662445, "memory(GiB)": 21.48, "step": 10173, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.952565 }, { "epoch": 0.33050709807361206, "grad_norm": 0.3671109676361084, "learning_rate": 7.998958020042127e-06, "loss": 0.03204456344246864, "memory(GiB)": 21.48, "step": 10174, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95258 }, { "epoch": 0.3305395835363675, "grad_norm": 0.40228456258773804, "learning_rate": 7.998528196456639e-06, "loss": 0.029952913522720337, "memory(GiB)": 21.48, "step": 10175, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.952592 }, { "epoch": 0.3305720689991229, "grad_norm": 0.42965418100357056, "learning_rate": 7.998098338264144e-06, "loss": 0.029991010203957558, "memory(GiB)": 21.48, "step": 10176, "token_acc": 1.0, "train_speed(iter/s)": 0.952606 }, { "epoch": 0.3306045544618783, "grad_norm": 0.33527669310569763, "learning_rate": 7.997668445469605e-06, "loss": 0.03222379460930824, "memory(GiB)": 21.48, "step": 10177, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.95262 }, { "epoch": 0.3306370399246337, "grad_norm": 0.5134705901145935, "learning_rate": 7.99723851807798e-06, "loss": 0.03918010741472244, "memory(GiB)": 21.48, "step": 10178, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.952634 }, { "epoch": 0.33066952538738914, "grad_norm": 0.46085551381111145, "learning_rate": 7.996808556094236e-06, "loss": 0.0428592711687088, "memory(GiB)": 21.48, "step": 10179, "token_acc": 0.961038961038961, "train_speed(iter/s)": 0.952648 }, { "epoch": 0.33070201085014456, "grad_norm": 0.28610536456108093, "learning_rate": 7.996378559523331e-06, "loss": 0.025829287245869637, "memory(GiB)": 21.48, "step": 10180, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952665 }, { "epoch": 0.3307344963129, "grad_norm": 0.5208475589752197, "learning_rate": 7.995948528370227e-06, "loss": 0.030206847935914993, "memory(GiB)": 21.48, "step": 10181, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952681 }, { "epoch": 0.3307669817756554, "grad_norm": 0.3995157480239868, "learning_rate": 7.995518462639893e-06, "loss": 0.029641708359122276, "memory(GiB)": 21.48, "step": 10182, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.952694 }, { "epoch": 0.3307994672384108, "grad_norm": 0.4721303880214691, "learning_rate": 7.995088362337286e-06, "loss": 0.038269296288490295, "memory(GiB)": 21.48, "step": 10183, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.95271 }, { "epoch": 0.3308319527011662, "grad_norm": 0.4754701256752014, "learning_rate": 7.994658227467376e-06, "loss": 0.027662191540002823, "memory(GiB)": 21.48, "step": 10184, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.952727 }, { "epoch": 0.33086443816392164, "grad_norm": 0.31445884704589844, "learning_rate": 7.994228058035122e-06, "loss": 0.024032261222600937, "memory(GiB)": 21.48, "step": 10185, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.952743 }, { "epoch": 0.33089692362667705, "grad_norm": 0.2847309708595276, "learning_rate": 7.99379785404549e-06, "loss": 0.031074965372681618, "memory(GiB)": 21.48, "step": 10186, "token_acc": 0.984375, "train_speed(iter/s)": 0.95276 }, { "epoch": 0.33092940908943247, "grad_norm": 0.4917895793914795, "learning_rate": 7.993367615503449e-06, "loss": 0.03337422013282776, "memory(GiB)": 21.48, "step": 10187, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.952776 }, { "epoch": 0.3309618945521879, "grad_norm": 0.36422157287597656, "learning_rate": 7.99293734241396e-06, "loss": 0.03273510932922363, "memory(GiB)": 21.48, "step": 10188, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.95279 }, { "epoch": 0.3309943800149433, "grad_norm": 0.37205585837364197, "learning_rate": 7.99250703478199e-06, "loss": 0.030319655314087868, "memory(GiB)": 21.48, "step": 10189, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.952806 }, { "epoch": 0.3310268654776987, "grad_norm": 0.4936736226081848, "learning_rate": 7.992076692612507e-06, "loss": 0.030323367565870285, "memory(GiB)": 21.48, "step": 10190, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.952822 }, { "epoch": 0.33105935094045413, "grad_norm": 0.49115169048309326, "learning_rate": 7.991646315910476e-06, "loss": 0.027644557878375053, "memory(GiB)": 21.48, "step": 10191, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.952836 }, { "epoch": 0.33109183640320955, "grad_norm": 0.32350507378578186, "learning_rate": 7.991215904680863e-06, "loss": 0.027168750762939453, "memory(GiB)": 21.48, "step": 10192, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.952852 }, { "epoch": 0.33112432186596497, "grad_norm": 0.438536137342453, "learning_rate": 7.99078545892864e-06, "loss": 0.0465942807495594, "memory(GiB)": 21.48, "step": 10193, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.952869 }, { "epoch": 0.3311568073287204, "grad_norm": 0.3566072881221771, "learning_rate": 7.99035497865877e-06, "loss": 0.029735438525676727, "memory(GiB)": 21.48, "step": 10194, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.952884 }, { "epoch": 0.3311892927914758, "grad_norm": 0.4252229332923889, "learning_rate": 7.989924463876225e-06, "loss": 0.03256534785032272, "memory(GiB)": 21.48, "step": 10195, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.952903 }, { "epoch": 0.3312217782542312, "grad_norm": 0.4238647222518921, "learning_rate": 7.98949391458597e-06, "loss": 0.030182739719748497, "memory(GiB)": 21.48, "step": 10196, "token_acc": 1.0, "train_speed(iter/s)": 0.952923 }, { "epoch": 0.33125426371698663, "grad_norm": 0.5650337934494019, "learning_rate": 7.989063330792978e-06, "loss": 0.036001354455947876, "memory(GiB)": 21.48, "step": 10197, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.952943 }, { "epoch": 0.33128674917974205, "grad_norm": 0.40796923637390137, "learning_rate": 7.988632712502215e-06, "loss": 0.02864256501197815, "memory(GiB)": 21.48, "step": 10198, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.952963 }, { "epoch": 0.33131923464249746, "grad_norm": 0.35449421405792236, "learning_rate": 7.988202059718653e-06, "loss": 0.024107199162244797, "memory(GiB)": 21.48, "step": 10199, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952982 }, { "epoch": 0.3313517201052529, "grad_norm": 0.3834356367588043, "learning_rate": 7.987771372447264e-06, "loss": 0.03387578949332237, "memory(GiB)": 21.48, "step": 10200, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.953 }, { "epoch": 0.3313842055680083, "grad_norm": 0.4001714587211609, "learning_rate": 7.987340650693016e-06, "loss": 0.025825724005699158, "memory(GiB)": 21.48, "step": 10201, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953019 }, { "epoch": 0.3314166910307637, "grad_norm": 0.3644557297229767, "learning_rate": 7.98690989446088e-06, "loss": 0.03254967927932739, "memory(GiB)": 21.48, "step": 10202, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.953039 }, { "epoch": 0.3314491764935191, "grad_norm": 0.5063753128051758, "learning_rate": 7.986479103755827e-06, "loss": 0.040201712399721146, "memory(GiB)": 21.48, "step": 10203, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.953059 }, { "epoch": 0.33148166195627454, "grad_norm": 0.3052685558795929, "learning_rate": 7.986048278582832e-06, "loss": 0.0270032100379467, "memory(GiB)": 21.48, "step": 10204, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.953079 }, { "epoch": 0.33151414741902996, "grad_norm": 0.38480791449546814, "learning_rate": 7.985617418946865e-06, "loss": 0.030538663268089294, "memory(GiB)": 21.48, "step": 10205, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.953098 }, { "epoch": 0.3315466328817854, "grad_norm": 0.34601691365242004, "learning_rate": 7.9851865248529e-06, "loss": 0.02609112486243248, "memory(GiB)": 21.48, "step": 10206, "token_acc": 1.0, "train_speed(iter/s)": 0.953118 }, { "epoch": 0.3315791183445408, "grad_norm": 0.37246790528297424, "learning_rate": 7.984755596305908e-06, "loss": 0.027322513982653618, "memory(GiB)": 21.48, "step": 10207, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953139 }, { "epoch": 0.33161160380729626, "grad_norm": 0.4644742012023926, "learning_rate": 7.984324633310864e-06, "loss": 0.033421751111745834, "memory(GiB)": 21.48, "step": 10208, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.953158 }, { "epoch": 0.3316440892700517, "grad_norm": 0.45032811164855957, "learning_rate": 7.983893635872742e-06, "loss": 0.02699907124042511, "memory(GiB)": 21.48, "step": 10209, "token_acc": 0.985, "train_speed(iter/s)": 0.953176 }, { "epoch": 0.3316765747328071, "grad_norm": 0.5315701961517334, "learning_rate": 7.983462603996515e-06, "loss": 0.04385031759738922, "memory(GiB)": 21.48, "step": 10210, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.953195 }, { "epoch": 0.3317090601955625, "grad_norm": 0.3976476192474365, "learning_rate": 7.983031537687158e-06, "loss": 0.03537449240684509, "memory(GiB)": 21.48, "step": 10211, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.953215 }, { "epoch": 0.3317415456583179, "grad_norm": 0.5321736335754395, "learning_rate": 7.98260043694965e-06, "loss": 0.036147747188806534, "memory(GiB)": 21.48, "step": 10212, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953235 }, { "epoch": 0.33177403112107334, "grad_norm": 0.7486823201179504, "learning_rate": 7.98216930178896e-06, "loss": 0.027975089848041534, "memory(GiB)": 21.48, "step": 10213, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953255 }, { "epoch": 0.33180651658382876, "grad_norm": 0.4350135326385498, "learning_rate": 7.981738132210068e-06, "loss": 0.03536378964781761, "memory(GiB)": 21.48, "step": 10214, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.953276 }, { "epoch": 0.3318390020465842, "grad_norm": 0.43472158908843994, "learning_rate": 7.981306928217947e-06, "loss": 0.029883144423365593, "memory(GiB)": 21.48, "step": 10215, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.953295 }, { "epoch": 0.3318714875093396, "grad_norm": 0.4669830799102783, "learning_rate": 7.98087568981758e-06, "loss": 0.025067836046218872, "memory(GiB)": 21.48, "step": 10216, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953315 }, { "epoch": 0.331903972972095, "grad_norm": 0.3528406620025635, "learning_rate": 7.980444417013937e-06, "loss": 0.02625749632716179, "memory(GiB)": 21.48, "step": 10217, "token_acc": 0.984375, "train_speed(iter/s)": 0.953336 }, { "epoch": 0.3319364584348504, "grad_norm": 0.5551584959030151, "learning_rate": 7.980013109812e-06, "loss": 0.03465304896235466, "memory(GiB)": 21.48, "step": 10218, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.953355 }, { "epoch": 0.33196894389760584, "grad_norm": 0.4778534770011902, "learning_rate": 7.979581768216744e-06, "loss": 0.03426875174045563, "memory(GiB)": 21.48, "step": 10219, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.953371 }, { "epoch": 0.33200142936036126, "grad_norm": 0.4019845724105835, "learning_rate": 7.979150392233151e-06, "loss": 0.03357718884944916, "memory(GiB)": 21.48, "step": 10220, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953387 }, { "epoch": 0.33203391482311667, "grad_norm": 0.3941722810268402, "learning_rate": 7.978718981866194e-06, "loss": 0.030983854085206985, "memory(GiB)": 21.48, "step": 10221, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.953404 }, { "epoch": 0.3320664002858721, "grad_norm": 0.3738035261631012, "learning_rate": 7.978287537120858e-06, "loss": 0.02856791950762272, "memory(GiB)": 21.48, "step": 10222, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953418 }, { "epoch": 0.3320988857486275, "grad_norm": 0.43502551317214966, "learning_rate": 7.977856058002118e-06, "loss": 0.030257200822234154, "memory(GiB)": 21.48, "step": 10223, "token_acc": 0.985, "train_speed(iter/s)": 0.953435 }, { "epoch": 0.3321313712113829, "grad_norm": 0.8512302041053772, "learning_rate": 7.977424544514957e-06, "loss": 0.027633801102638245, "memory(GiB)": 21.48, "step": 10224, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953451 }, { "epoch": 0.33216385667413834, "grad_norm": 0.35583260655403137, "learning_rate": 7.976992996664353e-06, "loss": 0.02319503203034401, "memory(GiB)": 21.48, "step": 10225, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953468 }, { "epoch": 0.33219634213689375, "grad_norm": 0.6897823810577393, "learning_rate": 7.976561414455289e-06, "loss": 0.04950707405805588, "memory(GiB)": 21.48, "step": 10226, "token_acc": 0.9610894941634242, "train_speed(iter/s)": 0.953485 }, { "epoch": 0.33222882759964917, "grad_norm": 0.5107985138893127, "learning_rate": 7.976129797892742e-06, "loss": 0.045620545744895935, "memory(GiB)": 21.48, "step": 10227, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.953501 }, { "epoch": 0.3322613130624046, "grad_norm": 0.5744169354438782, "learning_rate": 7.975698146981695e-06, "loss": 0.03515968844294548, "memory(GiB)": 21.48, "step": 10228, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953514 }, { "epoch": 0.33229379852516, "grad_norm": 0.3712060749530792, "learning_rate": 7.975266461727133e-06, "loss": 0.02825463004410267, "memory(GiB)": 21.48, "step": 10229, "token_acc": 1.0, "train_speed(iter/s)": 0.953528 }, { "epoch": 0.3323262839879154, "grad_norm": 0.4789191484451294, "learning_rate": 7.974834742134037e-06, "loss": 0.023153336718678474, "memory(GiB)": 21.48, "step": 10230, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.953545 }, { "epoch": 0.33235876945067083, "grad_norm": 0.33178332448005676, "learning_rate": 7.974402988207386e-06, "loss": 0.01675761118531227, "memory(GiB)": 21.48, "step": 10231, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953561 }, { "epoch": 0.33239125491342625, "grad_norm": 0.42443543672561646, "learning_rate": 7.973971199952167e-06, "loss": 0.03129984438419342, "memory(GiB)": 21.48, "step": 10232, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953577 }, { "epoch": 0.33242374037618166, "grad_norm": 0.5575288534164429, "learning_rate": 7.973539377373362e-06, "loss": 0.03204548358917236, "memory(GiB)": 21.48, "step": 10233, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.953594 }, { "epoch": 0.3324562258389371, "grad_norm": 0.49965962767601013, "learning_rate": 7.973107520475956e-06, "loss": 0.02577708661556244, "memory(GiB)": 21.48, "step": 10234, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.95361 }, { "epoch": 0.3324887113016925, "grad_norm": 0.9872062802314758, "learning_rate": 7.97267562926493e-06, "loss": 0.036661297082901, "memory(GiB)": 21.48, "step": 10235, "token_acc": 0.9752475247524752, "train_speed(iter/s)": 0.953624 }, { "epoch": 0.3325211967644479, "grad_norm": 0.4104130268096924, "learning_rate": 7.97224370374527e-06, "loss": 0.024153444916009903, "memory(GiB)": 21.48, "step": 10236, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.953638 }, { "epoch": 0.33255368222720333, "grad_norm": 0.3408689498901367, "learning_rate": 7.971811743921964e-06, "loss": 0.027457136660814285, "memory(GiB)": 21.48, "step": 10237, "token_acc": 0.9921875, "train_speed(iter/s)": 0.953649 }, { "epoch": 0.33258616768995874, "grad_norm": 0.3857289254665375, "learning_rate": 7.971379749799993e-06, "loss": 0.025686191394925117, "memory(GiB)": 21.48, "step": 10238, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.953662 }, { "epoch": 0.33261865315271416, "grad_norm": 0.3776460587978363, "learning_rate": 7.970947721384346e-06, "loss": 0.028842784464359283, "memory(GiB)": 21.48, "step": 10239, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.953675 }, { "epoch": 0.3326511386154696, "grad_norm": 0.38327425718307495, "learning_rate": 7.970515658680008e-06, "loss": 0.03231140226125717, "memory(GiB)": 21.48, "step": 10240, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95369 }, { "epoch": 0.332683624078225, "grad_norm": 2.283156633377075, "learning_rate": 7.970083561691966e-06, "loss": 0.03457285091280937, "memory(GiB)": 21.48, "step": 10241, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.953703 }, { "epoch": 0.3327161095409804, "grad_norm": 0.3292212188243866, "learning_rate": 7.969651430425205e-06, "loss": 0.024664971977472305, "memory(GiB)": 21.48, "step": 10242, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953717 }, { "epoch": 0.3327485950037358, "grad_norm": 0.36876180768013, "learning_rate": 7.969219264884714e-06, "loss": 0.03370979055762291, "memory(GiB)": 21.48, "step": 10243, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953732 }, { "epoch": 0.33278108046649124, "grad_norm": 0.3372304439544678, "learning_rate": 7.968787065075481e-06, "loss": 0.02437458746135235, "memory(GiB)": 21.48, "step": 10244, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.953747 }, { "epoch": 0.33281356592924666, "grad_norm": 0.5006992220878601, "learning_rate": 7.968354831002494e-06, "loss": 0.03161109238862991, "memory(GiB)": 21.48, "step": 10245, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953762 }, { "epoch": 0.3328460513920021, "grad_norm": 0.4766351580619812, "learning_rate": 7.967922562670742e-06, "loss": 0.039734043180942535, "memory(GiB)": 21.48, "step": 10246, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953777 }, { "epoch": 0.3328785368547575, "grad_norm": 0.3829915225505829, "learning_rate": 7.967490260085213e-06, "loss": 0.03207229822874069, "memory(GiB)": 21.48, "step": 10247, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953792 }, { "epoch": 0.3329110223175129, "grad_norm": 0.48385748267173767, "learning_rate": 7.967057923250896e-06, "loss": 0.03351125493645668, "memory(GiB)": 21.48, "step": 10248, "token_acc": 1.0, "train_speed(iter/s)": 0.953807 }, { "epoch": 0.3329435077802683, "grad_norm": 0.5458787083625793, "learning_rate": 7.966625552172782e-06, "loss": 0.03145105764269829, "memory(GiB)": 21.48, "step": 10249, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.953822 }, { "epoch": 0.33297599324302374, "grad_norm": 0.38095980882644653, "learning_rate": 7.966193146855861e-06, "loss": 0.029937081038951874, "memory(GiB)": 21.48, "step": 10250, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953838 }, { "epoch": 0.33300847870577915, "grad_norm": 0.4818257987499237, "learning_rate": 7.965760707305122e-06, "loss": 0.029252050444483757, "memory(GiB)": 21.48, "step": 10251, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953853 }, { "epoch": 0.33304096416853457, "grad_norm": 0.3065193295478821, "learning_rate": 7.965328233525559e-06, "loss": 0.026339421048760414, "memory(GiB)": 21.48, "step": 10252, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953868 }, { "epoch": 0.33307344963129, "grad_norm": 0.5796827077865601, "learning_rate": 7.964895725522158e-06, "loss": 0.041373111307621, "memory(GiB)": 21.48, "step": 10253, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953884 }, { "epoch": 0.3331059350940454, "grad_norm": 0.49493107199668884, "learning_rate": 7.964463183299917e-06, "loss": 0.025361064821481705, "memory(GiB)": 21.48, "step": 10254, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953901 }, { "epoch": 0.3331384205568008, "grad_norm": 0.43729153275489807, "learning_rate": 7.964030606863823e-06, "loss": 0.026319021359086037, "memory(GiB)": 21.48, "step": 10255, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.33317090601955623, "grad_norm": 0.5707586407661438, "learning_rate": 7.963597996218873e-06, "loss": 0.034590404480695724, "memory(GiB)": 21.48, "step": 10256, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.95394 }, { "epoch": 0.33320339148231165, "grad_norm": 0.2702835202217102, "learning_rate": 7.963165351370056e-06, "loss": 0.02599395252764225, "memory(GiB)": 21.48, "step": 10257, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95396 }, { "epoch": 0.33323587694506707, "grad_norm": 0.6767525672912598, "learning_rate": 7.962732672322368e-06, "loss": 0.04305806756019592, "memory(GiB)": 21.48, "step": 10258, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.953979 }, { "epoch": 0.3332683624078225, "grad_norm": 0.4532412588596344, "learning_rate": 7.9622999590808e-06, "loss": 0.027145911008119583, "memory(GiB)": 21.48, "step": 10259, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.953999 }, { "epoch": 0.3333008478705779, "grad_norm": 0.5172255039215088, "learning_rate": 7.961867211650349e-06, "loss": 0.02634412981569767, "memory(GiB)": 21.48, "step": 10260, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.954019 }, { "epoch": 0.3333333333333333, "grad_norm": 0.41767656803131104, "learning_rate": 7.961434430036008e-06, "loss": 0.031040243804454803, "memory(GiB)": 21.48, "step": 10261, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.954038 }, { "epoch": 0.33336581879608873, "grad_norm": 0.4513767659664154, "learning_rate": 7.96100161424277e-06, "loss": 0.029792428016662598, "memory(GiB)": 21.48, "step": 10262, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.33339830425884415, "grad_norm": 0.3643594980239868, "learning_rate": 7.960568764275633e-06, "loss": 0.03113347478210926, "memory(GiB)": 21.48, "step": 10263, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954077 }, { "epoch": 0.33343078972159956, "grad_norm": 0.38641616702079773, "learning_rate": 7.960135880139595e-06, "loss": 0.034109484404325485, "memory(GiB)": 21.48, "step": 10264, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.954096 }, { "epoch": 0.333463275184355, "grad_norm": 0.32519224286079407, "learning_rate": 7.959702961839645e-06, "loss": 0.02145715057849884, "memory(GiB)": 21.48, "step": 10265, "token_acc": 1.0, "train_speed(iter/s)": 0.954116 }, { "epoch": 0.3334957606471104, "grad_norm": 0.4274165630340576, "learning_rate": 7.959270009380786e-06, "loss": 0.025640983134508133, "memory(GiB)": 21.48, "step": 10266, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954136 }, { "epoch": 0.3335282461098658, "grad_norm": 0.4031006097793579, "learning_rate": 7.958837022768012e-06, "loss": 0.023957455530762672, "memory(GiB)": 21.48, "step": 10267, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.954156 }, { "epoch": 0.3335607315726212, "grad_norm": 0.6139973402023315, "learning_rate": 7.95840400200632e-06, "loss": 0.03832776099443436, "memory(GiB)": 21.48, "step": 10268, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.954175 }, { "epoch": 0.33359321703537664, "grad_norm": 0.3698813319206238, "learning_rate": 7.95797094710071e-06, "loss": 0.02386811561882496, "memory(GiB)": 21.48, "step": 10269, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.954194 }, { "epoch": 0.33362570249813206, "grad_norm": 0.5330790877342224, "learning_rate": 7.957537858056176e-06, "loss": 0.033487603068351746, "memory(GiB)": 21.48, "step": 10270, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954213 }, { "epoch": 0.3336581879608875, "grad_norm": 0.395985871553421, "learning_rate": 7.95710473487772e-06, "loss": 0.034776460379362106, "memory(GiB)": 21.48, "step": 10271, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.33369067342364295, "grad_norm": 0.3555343747138977, "learning_rate": 7.956671577570339e-06, "loss": 0.02929886244237423, "memory(GiB)": 21.48, "step": 10272, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.33372315888639836, "grad_norm": 0.39871200919151306, "learning_rate": 7.956238386139031e-06, "loss": 0.03175583481788635, "memory(GiB)": 21.48, "step": 10273, "token_acc": 0.9898648648648649, "train_speed(iter/s)": 0.954271 }, { "epoch": 0.3337556443491538, "grad_norm": 0.33219316601753235, "learning_rate": 7.955805160588798e-06, "loss": 0.020803041756153107, "memory(GiB)": 21.48, "step": 10274, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.95429 }, { "epoch": 0.3337881298119092, "grad_norm": 0.45426973700523376, "learning_rate": 7.955371900924642e-06, "loss": 0.026013806462287903, "memory(GiB)": 21.48, "step": 10275, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954309 }, { "epoch": 0.3338206152746646, "grad_norm": 0.5162678360939026, "learning_rate": 7.954938607151556e-06, "loss": 0.033236064016819, "memory(GiB)": 21.48, "step": 10276, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954326 }, { "epoch": 0.33385310073742, "grad_norm": 0.46517413854599, "learning_rate": 7.954505279274549e-06, "loss": 0.02895556204020977, "memory(GiB)": 21.48, "step": 10277, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.954347 }, { "epoch": 0.33388558620017544, "grad_norm": 0.4023717939853668, "learning_rate": 7.954071917298618e-06, "loss": 0.02041114680469036, "memory(GiB)": 21.48, "step": 10278, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954365 }, { "epoch": 0.33391807166293086, "grad_norm": 0.5968534350395203, "learning_rate": 7.953638521228766e-06, "loss": 0.023513849824666977, "memory(GiB)": 21.48, "step": 10279, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954383 }, { "epoch": 0.3339505571256863, "grad_norm": 0.46543067693710327, "learning_rate": 7.953205091069991e-06, "loss": 0.03361460939049721, "memory(GiB)": 21.48, "step": 10280, "token_acc": 0.98, "train_speed(iter/s)": 0.9544 }, { "epoch": 0.3339830425884417, "grad_norm": 0.41089069843292236, "learning_rate": 7.952771626827302e-06, "loss": 0.02613939717411995, "memory(GiB)": 21.48, "step": 10281, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954417 }, { "epoch": 0.3340155280511971, "grad_norm": 0.5266710519790649, "learning_rate": 7.952338128505697e-06, "loss": 0.036971237510442734, "memory(GiB)": 21.48, "step": 10282, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.954432 }, { "epoch": 0.3340480135139525, "grad_norm": 0.5115914344787598, "learning_rate": 7.95190459611018e-06, "loss": 0.024482477456331253, "memory(GiB)": 21.48, "step": 10283, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954449 }, { "epoch": 0.33408049897670794, "grad_norm": 0.40780535340309143, "learning_rate": 7.951471029645756e-06, "loss": 0.0278845876455307, "memory(GiB)": 21.48, "step": 10284, "token_acc": 0.9819277108433735, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.33411298443946336, "grad_norm": 0.4773639440536499, "learning_rate": 7.951037429117428e-06, "loss": 0.03181306645274162, "memory(GiB)": 21.48, "step": 10285, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.95448 }, { "epoch": 0.3341454699022188, "grad_norm": 0.6182274222373962, "learning_rate": 7.9506037945302e-06, "loss": 0.02903849445283413, "memory(GiB)": 21.48, "step": 10286, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954497 }, { "epoch": 0.3341779553649742, "grad_norm": 0.8895683288574219, "learning_rate": 7.950170125889076e-06, "loss": 0.023292137309908867, "memory(GiB)": 21.48, "step": 10287, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954513 }, { "epoch": 0.3342104408277296, "grad_norm": 0.6419587135314941, "learning_rate": 7.949736423199063e-06, "loss": 0.03522587940096855, "memory(GiB)": 21.48, "step": 10288, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.954529 }, { "epoch": 0.334242926290485, "grad_norm": 0.3809210956096649, "learning_rate": 7.949302686465164e-06, "loss": 0.02667154371738434, "memory(GiB)": 21.48, "step": 10289, "token_acc": 0.9941860465116279, "train_speed(iter/s)": 0.954545 }, { "epoch": 0.33427541175324044, "grad_norm": 0.3287021219730377, "learning_rate": 7.948868915692388e-06, "loss": 0.020458519458770752, "memory(GiB)": 21.48, "step": 10290, "token_acc": 1.0, "train_speed(iter/s)": 0.954561 }, { "epoch": 0.33430789721599585, "grad_norm": 0.4068848490715027, "learning_rate": 7.94843511088574e-06, "loss": 0.03330878168344498, "memory(GiB)": 21.48, "step": 10291, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954576 }, { "epoch": 0.33434038267875127, "grad_norm": 0.4509143531322479, "learning_rate": 7.948001272050225e-06, "loss": 0.03375566750764847, "memory(GiB)": 21.48, "step": 10292, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95459 }, { "epoch": 0.3343728681415067, "grad_norm": 0.3812488913536072, "learning_rate": 7.947567399190853e-06, "loss": 0.028676148504018784, "memory(GiB)": 21.48, "step": 10293, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954607 }, { "epoch": 0.3344053536042621, "grad_norm": 0.3387533128261566, "learning_rate": 7.94713349231263e-06, "loss": 0.020425088703632355, "memory(GiB)": 21.48, "step": 10294, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954622 }, { "epoch": 0.3344378390670175, "grad_norm": 0.4659932255744934, "learning_rate": 7.946699551420563e-06, "loss": 0.027036739513278008, "memory(GiB)": 21.48, "step": 10295, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954635 }, { "epoch": 0.33447032452977293, "grad_norm": 0.3622230887413025, "learning_rate": 7.94626557651966e-06, "loss": 0.030726265162229538, "memory(GiB)": 21.48, "step": 10296, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954648 }, { "epoch": 0.33450280999252835, "grad_norm": 0.3645392060279846, "learning_rate": 7.945831567614934e-06, "loss": 0.030156690627336502, "memory(GiB)": 21.48, "step": 10297, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.33453529545528377, "grad_norm": 0.40829193592071533, "learning_rate": 7.945397524711388e-06, "loss": 0.030588964000344276, "memory(GiB)": 21.48, "step": 10298, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954675 }, { "epoch": 0.3345677809180392, "grad_norm": 0.47109925746917725, "learning_rate": 7.944963447814035e-06, "loss": 0.040170956403017044, "memory(GiB)": 21.48, "step": 10299, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954686 }, { "epoch": 0.3346002663807946, "grad_norm": 0.3017159104347229, "learning_rate": 7.944529336927886e-06, "loss": 0.023795470595359802, "memory(GiB)": 21.48, "step": 10300, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.9547 }, { "epoch": 0.33463275184355, "grad_norm": 0.42543333768844604, "learning_rate": 7.944095192057948e-06, "loss": 0.02836945280432701, "memory(GiB)": 21.48, "step": 10301, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.954714 }, { "epoch": 0.33466523730630543, "grad_norm": 0.4579046070575714, "learning_rate": 7.943661013209231e-06, "loss": 0.03157118707895279, "memory(GiB)": 21.48, "step": 10302, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.954727 }, { "epoch": 0.33469772276906085, "grad_norm": 0.338305801153183, "learning_rate": 7.94322680038675e-06, "loss": 0.025051046162843704, "memory(GiB)": 21.48, "step": 10303, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954739 }, { "epoch": 0.33473020823181626, "grad_norm": 0.41323211789131165, "learning_rate": 7.942792553595514e-06, "loss": 0.02307754009962082, "memory(GiB)": 21.48, "step": 10304, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.954754 }, { "epoch": 0.3347626936945717, "grad_norm": 0.4158734083175659, "learning_rate": 7.942358272840536e-06, "loss": 0.03576761484146118, "memory(GiB)": 21.48, "step": 10305, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95477 }, { "epoch": 0.3347951791573271, "grad_norm": 0.2991582751274109, "learning_rate": 7.941923958126825e-06, "loss": 0.02341800555586815, "memory(GiB)": 21.48, "step": 10306, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954785 }, { "epoch": 0.3348276646200825, "grad_norm": 0.3969500660896301, "learning_rate": 7.941489609459396e-06, "loss": 0.026735913008451462, "memory(GiB)": 21.48, "step": 10307, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954801 }, { "epoch": 0.3348601500828379, "grad_norm": 0.29983842372894287, "learning_rate": 7.941055226843265e-06, "loss": 0.022494632750749588, "memory(GiB)": 21.48, "step": 10308, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954817 }, { "epoch": 0.33489263554559334, "grad_norm": 0.4866504371166229, "learning_rate": 7.940620810283438e-06, "loss": 0.03166350722312927, "memory(GiB)": 21.48, "step": 10309, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.33492512100834876, "grad_norm": 0.29862889647483826, "learning_rate": 7.940186359784936e-06, "loss": 0.030283797532320023, "memory(GiB)": 21.48, "step": 10310, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.954845 }, { "epoch": 0.3349576064711042, "grad_norm": 0.40897127985954285, "learning_rate": 7.939751875352768e-06, "loss": 0.0306216012686491, "memory(GiB)": 21.48, "step": 10311, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95486 }, { "epoch": 0.3349900919338596, "grad_norm": 0.5111988186836243, "learning_rate": 7.939317356991951e-06, "loss": 0.04091034084558487, "memory(GiB)": 21.48, "step": 10312, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.954875 }, { "epoch": 0.335022577396615, "grad_norm": 0.3637152910232544, "learning_rate": 7.9388828047075e-06, "loss": 0.0316278375685215, "memory(GiB)": 21.48, "step": 10313, "token_acc": 0.992, "train_speed(iter/s)": 0.954891 }, { "epoch": 0.3350550628593704, "grad_norm": 1.0704576969146729, "learning_rate": 7.93844821850443e-06, "loss": 0.031116556376218796, "memory(GiB)": 21.48, "step": 10314, "token_acc": 1.0, "train_speed(iter/s)": 0.954908 }, { "epoch": 0.33508754832212584, "grad_norm": 0.4852985739707947, "learning_rate": 7.938013598387757e-06, "loss": 0.04184756800532341, "memory(GiB)": 21.48, "step": 10315, "token_acc": 0.9825783972125436, "train_speed(iter/s)": 0.954927 }, { "epoch": 0.33512003378488125, "grad_norm": 0.3463760316371918, "learning_rate": 7.937578944362494e-06, "loss": 0.02419731393456459, "memory(GiB)": 21.48, "step": 10316, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.954947 }, { "epoch": 0.33515251924763667, "grad_norm": 0.5018832683563232, "learning_rate": 7.937144256433662e-06, "loss": 0.03246961534023285, "memory(GiB)": 21.48, "step": 10317, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.954968 }, { "epoch": 0.3351850047103921, "grad_norm": 0.41263818740844727, "learning_rate": 7.936709534606277e-06, "loss": 0.03130210191011429, "memory(GiB)": 21.48, "step": 10318, "token_acc": 0.988, "train_speed(iter/s)": 0.954987 }, { "epoch": 0.3352174901731475, "grad_norm": 0.4259748160839081, "learning_rate": 7.936274778885353e-06, "loss": 0.033997856080532074, "memory(GiB)": 21.48, "step": 10319, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.3352499756359029, "grad_norm": 0.45321956276893616, "learning_rate": 7.935839989275911e-06, "loss": 0.028997743502259254, "memory(GiB)": 21.48, "step": 10320, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955025 }, { "epoch": 0.33528246109865834, "grad_norm": 0.38962802290916443, "learning_rate": 7.93540516578297e-06, "loss": 0.030626680701971054, "memory(GiB)": 21.48, "step": 10321, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.33531494656141375, "grad_norm": 0.42641595005989075, "learning_rate": 7.934970308411543e-06, "loss": 0.021801887080073357, "memory(GiB)": 21.48, "step": 10322, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955064 }, { "epoch": 0.33534743202416917, "grad_norm": 0.4293976426124573, "learning_rate": 7.934535417166653e-06, "loss": 0.028681235387921333, "memory(GiB)": 21.48, "step": 10323, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955083 }, { "epoch": 0.3353799174869246, "grad_norm": 0.36985814571380615, "learning_rate": 7.934100492053319e-06, "loss": 0.027683820575475693, "memory(GiB)": 21.48, "step": 10324, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955102 }, { "epoch": 0.33541240294968, "grad_norm": 0.47090402245521545, "learning_rate": 7.93366553307656e-06, "loss": 0.02837255969643593, "memory(GiB)": 21.48, "step": 10325, "token_acc": 1.0, "train_speed(iter/s)": 0.955122 }, { "epoch": 0.3354448884124354, "grad_norm": 0.33559688925743103, "learning_rate": 7.933230540241396e-06, "loss": 0.02153116650879383, "memory(GiB)": 21.48, "step": 10326, "token_acc": 1.0, "train_speed(iter/s)": 0.955141 }, { "epoch": 0.33547737387519083, "grad_norm": 0.2960459887981415, "learning_rate": 7.932795513552846e-06, "loss": 0.01785794273018837, "memory(GiB)": 21.48, "step": 10327, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955161 }, { "epoch": 0.33550985933794625, "grad_norm": 0.46961522102355957, "learning_rate": 7.932360453015935e-06, "loss": 0.03097359463572502, "memory(GiB)": 21.48, "step": 10328, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.33554234480070166, "grad_norm": 0.42264094948768616, "learning_rate": 7.931925358635679e-06, "loss": 0.033218443393707275, "memory(GiB)": 21.48, "step": 10329, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.3355748302634571, "grad_norm": 0.5200034976005554, "learning_rate": 7.931490230417103e-06, "loss": 0.03413345292210579, "memory(GiB)": 21.48, "step": 10330, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.3356073157262125, "grad_norm": 0.4049120545387268, "learning_rate": 7.931055068365228e-06, "loss": 0.02420908212661743, "memory(GiB)": 21.48, "step": 10331, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95524 }, { "epoch": 0.3356398011889679, "grad_norm": 0.5381325483322144, "learning_rate": 7.930619872485076e-06, "loss": 0.03223930299282074, "memory(GiB)": 21.48, "step": 10332, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955259 }, { "epoch": 0.33567228665172333, "grad_norm": 0.3819485902786255, "learning_rate": 7.93018464278167e-06, "loss": 0.02242402359843254, "memory(GiB)": 21.48, "step": 10333, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.955277 }, { "epoch": 0.33570477211447874, "grad_norm": 0.7965050935745239, "learning_rate": 7.929749379260034e-06, "loss": 0.04174291342496872, "memory(GiB)": 21.48, "step": 10334, "token_acc": 0.9529411764705882, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.33573725757723416, "grad_norm": 0.44047293066978455, "learning_rate": 7.92931408192519e-06, "loss": 0.0157547015696764, "memory(GiB)": 21.48, "step": 10335, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.33576974303998963, "grad_norm": 0.3682403862476349, "learning_rate": 7.928878750782164e-06, "loss": 0.02879720740020275, "memory(GiB)": 21.48, "step": 10336, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.33580222850274505, "grad_norm": 0.4488724172115326, "learning_rate": 7.928443385835977e-06, "loss": 0.038090065121650696, "memory(GiB)": 21.48, "step": 10337, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955353 }, { "epoch": 0.33583471396550046, "grad_norm": 0.5002060532569885, "learning_rate": 7.928007987091654e-06, "loss": 0.027659602463245392, "memory(GiB)": 21.48, "step": 10338, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.3358671994282559, "grad_norm": 0.3799339532852173, "learning_rate": 7.927572554554224e-06, "loss": 0.03212672844529152, "memory(GiB)": 21.48, "step": 10339, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.95539 }, { "epoch": 0.3358996848910113, "grad_norm": 0.4244193136692047, "learning_rate": 7.92713708822871e-06, "loss": 0.02639879286289215, "memory(GiB)": 21.48, "step": 10340, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.3359321703537667, "grad_norm": 0.5734627842903137, "learning_rate": 7.926701588120138e-06, "loss": 0.03757404536008835, "memory(GiB)": 21.48, "step": 10341, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.955428 }, { "epoch": 0.33596465581652213, "grad_norm": 0.33601856231689453, "learning_rate": 7.926266054233535e-06, "loss": 0.022826705127954483, "memory(GiB)": 21.48, "step": 10342, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.33599714127927754, "grad_norm": 0.43008172512054443, "learning_rate": 7.925830486573926e-06, "loss": 0.02747933566570282, "memory(GiB)": 21.48, "step": 10343, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.33602962674203296, "grad_norm": 0.37995314598083496, "learning_rate": 7.925394885146339e-06, "loss": 0.022616297006607056, "memory(GiB)": 21.48, "step": 10344, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.3360621122047884, "grad_norm": 0.34808778762817383, "learning_rate": 7.924959249955802e-06, "loss": 0.02451249584555626, "memory(GiB)": 21.48, "step": 10345, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955492 }, { "epoch": 0.3360945976675438, "grad_norm": 0.5034744143486023, "learning_rate": 7.92452358100734e-06, "loss": 0.0407332107424736, "memory(GiB)": 21.48, "step": 10346, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.3361270831302992, "grad_norm": 0.3589349091053009, "learning_rate": 7.924087878305985e-06, "loss": 0.02515276148915291, "memory(GiB)": 21.48, "step": 10347, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.3361595685930546, "grad_norm": 0.39984196424484253, "learning_rate": 7.923652141856764e-06, "loss": 0.03669501096010208, "memory(GiB)": 21.48, "step": 10348, "token_acc": 1.0, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.33619205405581004, "grad_norm": 0.32434821128845215, "learning_rate": 7.923216371664706e-06, "loss": 0.026776213198900223, "memory(GiB)": 21.48, "step": 10349, "token_acc": 0.984, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.33622453951856546, "grad_norm": 0.3360882103443146, "learning_rate": 7.922780567734841e-06, "loss": 0.0272649098187685, "memory(GiB)": 21.48, "step": 10350, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.955569 }, { "epoch": 0.3362570249813209, "grad_norm": 0.5077940225601196, "learning_rate": 7.922344730072196e-06, "loss": 0.03937792032957077, "memory(GiB)": 21.48, "step": 10351, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.3362895104440763, "grad_norm": 0.33279600739479065, "learning_rate": 7.921908858681803e-06, "loss": 0.025258179754018784, "memory(GiB)": 21.48, "step": 10352, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.3363219959068317, "grad_norm": 0.4816793203353882, "learning_rate": 7.921472953568694e-06, "loss": 0.037427373230457306, "memory(GiB)": 21.48, "step": 10353, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.3363544813695871, "grad_norm": 0.466672420501709, "learning_rate": 7.921037014737898e-06, "loss": 0.024245237931609154, "memory(GiB)": 21.48, "step": 10354, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.33638696683234254, "grad_norm": 0.39853397011756897, "learning_rate": 7.920601042194448e-06, "loss": 0.02673444151878357, "memory(GiB)": 21.48, "step": 10355, "token_acc": 1.0, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.33641945229509795, "grad_norm": 0.33301037549972534, "learning_rate": 7.920165035943374e-06, "loss": 0.027564186602830887, "memory(GiB)": 21.48, "step": 10356, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.33645193775785337, "grad_norm": 0.30776524543762207, "learning_rate": 7.919728995989708e-06, "loss": 0.02490447275340557, "memory(GiB)": 21.48, "step": 10357, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955667 }, { "epoch": 0.3364844232206088, "grad_norm": 0.25732213258743286, "learning_rate": 7.919292922338485e-06, "loss": 0.01897202618420124, "memory(GiB)": 21.48, "step": 10358, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955679 }, { "epoch": 0.3365169086833642, "grad_norm": 0.521870493888855, "learning_rate": 7.918856814994735e-06, "loss": 0.03069119155406952, "memory(GiB)": 21.48, "step": 10359, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.3365493941461196, "grad_norm": 0.4539661705493927, "learning_rate": 7.918420673963491e-06, "loss": 0.02838503196835518, "memory(GiB)": 21.48, "step": 10360, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.33658187960887503, "grad_norm": 0.23427797853946686, "learning_rate": 7.917984499249792e-06, "loss": 0.022078847512602806, "memory(GiB)": 21.48, "step": 10361, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.33661436507163045, "grad_norm": 0.4468926787376404, "learning_rate": 7.917548290858664e-06, "loss": 0.026712799444794655, "memory(GiB)": 21.48, "step": 10362, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955728 }, { "epoch": 0.33664685053438587, "grad_norm": 0.4122355878353119, "learning_rate": 7.917112048795148e-06, "loss": 0.03205185383558273, "memory(GiB)": 21.48, "step": 10363, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.3366793359971413, "grad_norm": 0.5119988322257996, "learning_rate": 7.916675773064275e-06, "loss": 0.03502852842211723, "memory(GiB)": 21.48, "step": 10364, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955754 }, { "epoch": 0.3367118214598967, "grad_norm": 0.4514819383621216, "learning_rate": 7.916239463671082e-06, "loss": 0.029720641672611237, "memory(GiB)": 21.48, "step": 10365, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955766 }, { "epoch": 0.3367443069226521, "grad_norm": 0.383086621761322, "learning_rate": 7.915803120620605e-06, "loss": 0.01780989021062851, "memory(GiB)": 21.48, "step": 10366, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95578 }, { "epoch": 0.33677679238540753, "grad_norm": 0.749556839466095, "learning_rate": 7.915366743917876e-06, "loss": 0.026384232565760612, "memory(GiB)": 21.48, "step": 10367, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.955794 }, { "epoch": 0.33680927784816295, "grad_norm": 0.4272979497909546, "learning_rate": 7.914930333567937e-06, "loss": 0.036391906440258026, "memory(GiB)": 21.48, "step": 10368, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.33684176331091836, "grad_norm": 0.3577825129032135, "learning_rate": 7.91449388957582e-06, "loss": 0.026213377714157104, "memory(GiB)": 21.48, "step": 10369, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955824 }, { "epoch": 0.3368742487736738, "grad_norm": 0.32014819979667664, "learning_rate": 7.914057411946568e-06, "loss": 0.02143140509724617, "memory(GiB)": 21.48, "step": 10370, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.95584 }, { "epoch": 0.3369067342364292, "grad_norm": 0.41386979818344116, "learning_rate": 7.913620900685213e-06, "loss": 0.02946014702320099, "memory(GiB)": 21.48, "step": 10371, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.955855 }, { "epoch": 0.3369392196991846, "grad_norm": 0.284742146730423, "learning_rate": 7.913184355796793e-06, "loss": 0.02275683358311653, "memory(GiB)": 21.48, "step": 10372, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955869 }, { "epoch": 0.33697170516194, "grad_norm": 0.6903049349784851, "learning_rate": 7.91274777728635e-06, "loss": 0.038272738456726074, "memory(GiB)": 21.48, "step": 10373, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.33700419062469544, "grad_norm": 0.29589343070983887, "learning_rate": 7.912311165158921e-06, "loss": 0.020151615142822266, "memory(GiB)": 21.48, "step": 10374, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955902 }, { "epoch": 0.33703667608745086, "grad_norm": 0.3943657875061035, "learning_rate": 7.911874519419545e-06, "loss": 0.02265963889658451, "memory(GiB)": 21.48, "step": 10375, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.3370691615502063, "grad_norm": 0.37339821457862854, "learning_rate": 7.91143784007326e-06, "loss": 0.02710978500545025, "memory(GiB)": 21.48, "step": 10376, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.3371016470129617, "grad_norm": 0.4353907108306885, "learning_rate": 7.911001127125109e-06, "loss": 0.026641570031642914, "memory(GiB)": 21.48, "step": 10377, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955961 }, { "epoch": 0.3371341324757171, "grad_norm": 0.4375638961791992, "learning_rate": 7.910564380580128e-06, "loss": 0.03366084396839142, "memory(GiB)": 21.48, "step": 10378, "token_acc": 0.9645669291338582, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.3371666179384725, "grad_norm": 0.5000256299972534, "learning_rate": 7.910127600443362e-06, "loss": 0.04174149036407471, "memory(GiB)": 21.48, "step": 10379, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956 }, { "epoch": 0.33719910340122794, "grad_norm": 0.5719005465507507, "learning_rate": 7.90969078671985e-06, "loss": 0.04004976153373718, "memory(GiB)": 21.48, "step": 10380, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956019 }, { "epoch": 0.33723158886398336, "grad_norm": 0.4559250473976135, "learning_rate": 7.909253939414633e-06, "loss": 0.02977951243519783, "memory(GiB)": 21.48, "step": 10381, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.33726407432673877, "grad_norm": 0.5804769992828369, "learning_rate": 7.908817058532753e-06, "loss": 0.02892262488603592, "memory(GiB)": 21.48, "step": 10382, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.3372965597894942, "grad_norm": 0.554139256477356, "learning_rate": 7.908380144079253e-06, "loss": 0.03577524423599243, "memory(GiB)": 21.48, "step": 10383, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.3373290452522496, "grad_norm": 0.43917205929756165, "learning_rate": 7.907943196059174e-06, "loss": 0.028159942477941513, "memory(GiB)": 21.48, "step": 10384, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956096 }, { "epoch": 0.337361530715005, "grad_norm": 0.4410748779773712, "learning_rate": 7.907506214477561e-06, "loss": 0.030687052756547928, "memory(GiB)": 21.48, "step": 10385, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.33739401617776044, "grad_norm": 0.3407430350780487, "learning_rate": 7.907069199339456e-06, "loss": 0.024139240384101868, "memory(GiB)": 21.48, "step": 10386, "token_acc": 1.0, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.33742650164051585, "grad_norm": 0.3494225740432739, "learning_rate": 7.906632150649904e-06, "loss": 0.023467952385544777, "memory(GiB)": 21.48, "step": 10387, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.956153 }, { "epoch": 0.33745898710327127, "grad_norm": 0.5153074264526367, "learning_rate": 7.906195068413949e-06, "loss": 0.027894604951143265, "memory(GiB)": 21.48, "step": 10388, "token_acc": 0.9900332225913622, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.3374914725660267, "grad_norm": 0.3206203579902649, "learning_rate": 7.905757952636633e-06, "loss": 0.030862335115671158, "memory(GiB)": 21.48, "step": 10389, "token_acc": 1.0, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.3375239580287821, "grad_norm": 0.3976799249649048, "learning_rate": 7.905320803323005e-06, "loss": 0.026927383616566658, "memory(GiB)": 21.48, "step": 10390, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.3375564434915375, "grad_norm": 0.2144218236207962, "learning_rate": 7.904883620478105e-06, "loss": 0.018567703664302826, "memory(GiB)": 21.48, "step": 10391, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.33758892895429293, "grad_norm": 0.3473387360572815, "learning_rate": 7.904446404106984e-06, "loss": 0.03577708452939987, "memory(GiB)": 21.48, "step": 10392, "token_acc": 0.9758454106280193, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.33762141441704835, "grad_norm": 0.4157499372959137, "learning_rate": 7.904009154214685e-06, "loss": 0.026089482009410858, "memory(GiB)": 21.48, "step": 10393, "token_acc": 0.996, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.33765389987980376, "grad_norm": 0.7360461354255676, "learning_rate": 7.903571870806254e-06, "loss": 0.03394324332475662, "memory(GiB)": 21.48, "step": 10394, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.3376863853425592, "grad_norm": 0.4538920521736145, "learning_rate": 7.90313455388674e-06, "loss": 0.03535197675228119, "memory(GiB)": 21.48, "step": 10395, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.956309 }, { "epoch": 0.3377188708053146, "grad_norm": 0.5186126232147217, "learning_rate": 7.902697203461188e-06, "loss": 0.03102244809269905, "memory(GiB)": 21.48, "step": 10396, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.33775135626807, "grad_norm": 0.5825423002243042, "learning_rate": 7.902259819534648e-06, "loss": 0.02489236369729042, "memory(GiB)": 21.48, "step": 10397, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956347 }, { "epoch": 0.33778384173082543, "grad_norm": 0.33894994854927063, "learning_rate": 7.901822402112168e-06, "loss": 0.023622801527380943, "memory(GiB)": 21.48, "step": 10398, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.33781632719358085, "grad_norm": 0.322292298078537, "learning_rate": 7.901384951198792e-06, "loss": 0.027830209583044052, "memory(GiB)": 21.48, "step": 10399, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.3378488126563363, "grad_norm": 0.3448601961135864, "learning_rate": 7.900947466799574e-06, "loss": 0.029164990410208702, "memory(GiB)": 21.48, "step": 10400, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.33788129811909173, "grad_norm": 2.1080398559570312, "learning_rate": 7.900509948919559e-06, "loss": 0.03449476510286331, "memory(GiB)": 21.48, "step": 10401, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.33791378358184715, "grad_norm": 0.5170996785163879, "learning_rate": 7.9000723975638e-06, "loss": 0.03127779811620712, "memory(GiB)": 21.48, "step": 10402, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956444 }, { "epoch": 0.33794626904460257, "grad_norm": 0.428270548582077, "learning_rate": 7.899634812737347e-06, "loss": 0.0327431745827198, "memory(GiB)": 21.48, "step": 10403, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.337978754507358, "grad_norm": 0.3122745454311371, "learning_rate": 7.899197194445247e-06, "loss": 0.023545704782009125, "memory(GiB)": 21.48, "step": 10404, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.3380112399701134, "grad_norm": 0.4037235379219055, "learning_rate": 7.898759542692555e-06, "loss": 0.022390170022845268, "memory(GiB)": 21.48, "step": 10405, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.3380437254328688, "grad_norm": 0.4404600262641907, "learning_rate": 7.898321857484317e-06, "loss": 0.0264140497893095, "memory(GiB)": 21.48, "step": 10406, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.33807621089562423, "grad_norm": 0.2933236062526703, "learning_rate": 7.897884138825589e-06, "loss": 0.023257508873939514, "memory(GiB)": 21.48, "step": 10407, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.33810869635837965, "grad_norm": 0.5053012371063232, "learning_rate": 7.897446386721419e-06, "loss": 0.03241444751620293, "memory(GiB)": 21.48, "step": 10408, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956545 }, { "epoch": 0.33814118182113506, "grad_norm": 0.4142659306526184, "learning_rate": 7.897008601176861e-06, "loss": 0.0272449292242527, "memory(GiB)": 21.48, "step": 10409, "token_acc": 1.0, "train_speed(iter/s)": 0.95656 }, { "epoch": 0.3381736672838905, "grad_norm": 0.2648409605026245, "learning_rate": 7.89657078219697e-06, "loss": 0.018787603825330734, "memory(GiB)": 21.48, "step": 10410, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.3382061527466459, "grad_norm": 0.4822988510131836, "learning_rate": 7.896132929786795e-06, "loss": 0.03160472586750984, "memory(GiB)": 21.48, "step": 10411, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.3382386382094013, "grad_norm": 0.34564515948295593, "learning_rate": 7.895695043951392e-06, "loss": 0.025372343137860298, "memory(GiB)": 21.48, "step": 10412, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.3382711236721567, "grad_norm": 0.3309903144836426, "learning_rate": 7.895257124695814e-06, "loss": 0.026879625394940376, "memory(GiB)": 21.48, "step": 10413, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.33830360913491214, "grad_norm": 0.40301838517189026, "learning_rate": 7.894819172025117e-06, "loss": 0.024878470227122307, "memory(GiB)": 21.48, "step": 10414, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.33833609459766756, "grad_norm": 0.42702096700668335, "learning_rate": 7.89438118594435e-06, "loss": 0.03294360637664795, "memory(GiB)": 21.48, "step": 10415, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956649 }, { "epoch": 0.338368580060423, "grad_norm": 0.4308903217315674, "learning_rate": 7.893943166458576e-06, "loss": 0.03544235602021217, "memory(GiB)": 21.48, "step": 10416, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.3384010655231784, "grad_norm": 0.3708943724632263, "learning_rate": 7.893505113572843e-06, "loss": 0.027204398065805435, "memory(GiB)": 21.48, "step": 10417, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956673 }, { "epoch": 0.3384335509859338, "grad_norm": 0.6284059286117554, "learning_rate": 7.89306702729221e-06, "loss": 0.044898129999637604, "memory(GiB)": 21.48, "step": 10418, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.956683 }, { "epoch": 0.3384660364486892, "grad_norm": 0.4955936372280121, "learning_rate": 7.892628907621734e-06, "loss": 0.03243805840611458, "memory(GiB)": 21.48, "step": 10419, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.33849852191144464, "grad_norm": 0.43570253252983093, "learning_rate": 7.892190754566468e-06, "loss": 0.03010738641023636, "memory(GiB)": 21.48, "step": 10420, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.33853100737420005, "grad_norm": 0.3795779049396515, "learning_rate": 7.891752568131474e-06, "loss": 0.029321039095520973, "memory(GiB)": 21.48, "step": 10421, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.33856349283695547, "grad_norm": 0.370713472366333, "learning_rate": 7.891314348321807e-06, "loss": 0.02673972398042679, "memory(GiB)": 21.48, "step": 10422, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.3385959782997109, "grad_norm": 0.3476389944553375, "learning_rate": 7.89087609514252e-06, "loss": 0.03103158250451088, "memory(GiB)": 21.48, "step": 10423, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95674 }, { "epoch": 0.3386284637624663, "grad_norm": 0.638239324092865, "learning_rate": 7.89043780859868e-06, "loss": 0.04224562644958496, "memory(GiB)": 21.48, "step": 10424, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956754 }, { "epoch": 0.3386609492252217, "grad_norm": 0.43435144424438477, "learning_rate": 7.889999488695337e-06, "loss": 0.03579463064670563, "memory(GiB)": 21.48, "step": 10425, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.33869343468797714, "grad_norm": 0.3483901619911194, "learning_rate": 7.889561135437553e-06, "loss": 0.02284495159983635, "memory(GiB)": 21.48, "step": 10426, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.33872592015073255, "grad_norm": 0.3517124354839325, "learning_rate": 7.88912274883039e-06, "loss": 0.026395585387945175, "memory(GiB)": 21.48, "step": 10427, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.33875840561348797, "grad_norm": 0.4397192597389221, "learning_rate": 7.888684328878905e-06, "loss": 0.037715595215559006, "memory(GiB)": 21.48, "step": 10428, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956805 }, { "epoch": 0.3387908910762434, "grad_norm": 0.25490641593933105, "learning_rate": 7.888245875588156e-06, "loss": 0.021798335015773773, "memory(GiB)": 21.48, "step": 10429, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.3388233765389988, "grad_norm": 0.5757359266281128, "learning_rate": 7.887807388963206e-06, "loss": 0.028841881081461906, "memory(GiB)": 21.48, "step": 10430, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.3388558620017542, "grad_norm": 0.7154537439346313, "learning_rate": 7.887368869009116e-06, "loss": 0.022484127432107925, "memory(GiB)": 21.48, "step": 10431, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.33888834746450963, "grad_norm": 0.41883155703544617, "learning_rate": 7.886930315730944e-06, "loss": 0.03155384957790375, "memory(GiB)": 21.48, "step": 10432, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.95686 }, { "epoch": 0.33892083292726505, "grad_norm": 0.36478057503700256, "learning_rate": 7.886491729133756e-06, "loss": 0.030930830165743828, "memory(GiB)": 21.48, "step": 10433, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.33895331839002046, "grad_norm": 0.39314478635787964, "learning_rate": 7.886053109222612e-06, "loss": 0.025898413732647896, "memory(GiB)": 21.48, "step": 10434, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.3389858038527759, "grad_norm": 0.4647548198699951, "learning_rate": 7.885614456002572e-06, "loss": 0.0327354297041893, "memory(GiB)": 21.48, "step": 10435, "token_acc": 0.9875, "train_speed(iter/s)": 0.956915 }, { "epoch": 0.3390182893155313, "grad_norm": 0.3845522105693817, "learning_rate": 7.8851757694787e-06, "loss": 0.02412102371454239, "memory(GiB)": 21.48, "step": 10436, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.3390507747782867, "grad_norm": 0.3810161054134369, "learning_rate": 7.884737049656062e-06, "loss": 0.033311229199171066, "memory(GiB)": 21.48, "step": 10437, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.33908326024104213, "grad_norm": 0.4320412278175354, "learning_rate": 7.884298296539716e-06, "loss": 0.026982728391885757, "memory(GiB)": 21.48, "step": 10438, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956971 }, { "epoch": 0.33911574570379754, "grad_norm": 0.4298391044139862, "learning_rate": 7.88385951013473e-06, "loss": 0.0264514721930027, "memory(GiB)": 21.48, "step": 10439, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956989 }, { "epoch": 0.33914823116655296, "grad_norm": 0.6274273991584778, "learning_rate": 7.883420690446167e-06, "loss": 0.02378908358514309, "memory(GiB)": 21.48, "step": 10440, "token_acc": 1.0, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.3391807166293084, "grad_norm": 0.44265803694725037, "learning_rate": 7.882981837479092e-06, "loss": 0.023478776216506958, "memory(GiB)": 21.48, "step": 10441, "token_acc": 1.0, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.3392132020920638, "grad_norm": 0.3771474063396454, "learning_rate": 7.882542951238568e-06, "loss": 0.024190109223127365, "memory(GiB)": 21.48, "step": 10442, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.3392456875548192, "grad_norm": 0.4102053940296173, "learning_rate": 7.882104031729662e-06, "loss": 0.03287763148546219, "memory(GiB)": 21.48, "step": 10443, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957065 }, { "epoch": 0.3392781730175746, "grad_norm": 0.4579169750213623, "learning_rate": 7.88166507895744e-06, "loss": 0.022527361288666725, "memory(GiB)": 21.48, "step": 10444, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.33931065848033004, "grad_norm": 0.4296916127204895, "learning_rate": 7.881226092926968e-06, "loss": 0.038268059492111206, "memory(GiB)": 21.48, "step": 10445, "token_acc": 0.9739583333333334, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.33934314394308546, "grad_norm": 0.43360161781311035, "learning_rate": 7.88078707364331e-06, "loss": 0.03171434998512268, "memory(GiB)": 21.48, "step": 10446, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.3393756294058409, "grad_norm": 0.5286714434623718, "learning_rate": 7.880348021111535e-06, "loss": 0.03876180946826935, "memory(GiB)": 21.48, "step": 10447, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.3394081148685963, "grad_norm": 0.7002393007278442, "learning_rate": 7.879908935336714e-06, "loss": 0.03133445605635643, "memory(GiB)": 21.48, "step": 10448, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957164 }, { "epoch": 0.3394406003313517, "grad_norm": 0.3642616271972656, "learning_rate": 7.879469816323907e-06, "loss": 0.021534575149416924, "memory(GiB)": 21.48, "step": 10449, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957184 }, { "epoch": 0.3394730857941071, "grad_norm": 0.42841440439224243, "learning_rate": 7.879030664078187e-06, "loss": 0.021798977628350258, "memory(GiB)": 21.48, "step": 10450, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957204 }, { "epoch": 0.33950557125686254, "grad_norm": 0.3443288207054138, "learning_rate": 7.87859147860462e-06, "loss": 0.029094599187374115, "memory(GiB)": 21.48, "step": 10451, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.33953805671961795, "grad_norm": 0.452008455991745, "learning_rate": 7.878152259908276e-06, "loss": 0.030405867844820023, "memory(GiB)": 21.48, "step": 10452, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.33957054218237337, "grad_norm": 0.5480612516403198, "learning_rate": 7.877713007994226e-06, "loss": 0.029757626354694366, "memory(GiB)": 21.48, "step": 10453, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957263 }, { "epoch": 0.3396030276451288, "grad_norm": 0.4329110383987427, "learning_rate": 7.877273722867536e-06, "loss": 0.02863231860101223, "memory(GiB)": 21.48, "step": 10454, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957283 }, { "epoch": 0.3396355131078842, "grad_norm": 0.4303956925868988, "learning_rate": 7.876834404533278e-06, "loss": 0.031006118282675743, "memory(GiB)": 21.48, "step": 10455, "token_acc": 0.984375, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.3396679985706396, "grad_norm": 0.37072598934173584, "learning_rate": 7.876395052996521e-06, "loss": 0.026131970807909966, "memory(GiB)": 21.48, "step": 10456, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.33970048403339503, "grad_norm": 0.37997087836265564, "learning_rate": 7.875955668262338e-06, "loss": 0.03174585476517677, "memory(GiB)": 21.48, "step": 10457, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.33973296949615045, "grad_norm": 0.3415567874908447, "learning_rate": 7.8755162503358e-06, "loss": 0.029644185677170753, "memory(GiB)": 21.48, "step": 10458, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.33976545495890587, "grad_norm": 0.35759231448173523, "learning_rate": 7.875076799221974e-06, "loss": 0.019876856356859207, "memory(GiB)": 21.48, "step": 10459, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.3397979404216613, "grad_norm": 0.3800238072872162, "learning_rate": 7.874637314925937e-06, "loss": 0.023654714226722717, "memory(GiB)": 21.48, "step": 10460, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.3398304258844167, "grad_norm": 0.44439032673835754, "learning_rate": 7.874197797452758e-06, "loss": 0.028349189087748528, "memory(GiB)": 21.48, "step": 10461, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.3398629113471721, "grad_norm": 0.33871281147003174, "learning_rate": 7.873758246807512e-06, "loss": 0.02881556749343872, "memory(GiB)": 21.48, "step": 10462, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.957439 }, { "epoch": 0.33989539680992753, "grad_norm": 0.37767818570137024, "learning_rate": 7.87331866299527e-06, "loss": 0.02841731533408165, "memory(GiB)": 21.48, "step": 10463, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957457 }, { "epoch": 0.339927882272683, "grad_norm": 0.5559654235839844, "learning_rate": 7.872879046021107e-06, "loss": 0.03428032994270325, "memory(GiB)": 21.48, "step": 10464, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.3399603677354384, "grad_norm": 0.9324043989181519, "learning_rate": 7.872439395890095e-06, "loss": 0.027961192652583122, "memory(GiB)": 21.48, "step": 10465, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.957497 }, { "epoch": 0.33999285319819383, "grad_norm": 0.43339335918426514, "learning_rate": 7.87199971260731e-06, "loss": 0.02179894968867302, "memory(GiB)": 21.48, "step": 10466, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.34002533866094925, "grad_norm": 0.5619242787361145, "learning_rate": 7.871559996177826e-06, "loss": 0.035693325102329254, "memory(GiB)": 21.48, "step": 10467, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.34005782412370467, "grad_norm": 0.39663371443748474, "learning_rate": 7.871120246606717e-06, "loss": 0.03126504272222519, "memory(GiB)": 21.48, "step": 10468, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957556 }, { "epoch": 0.3400903095864601, "grad_norm": 0.34209880232810974, "learning_rate": 7.870680463899058e-06, "loss": 0.02995988540351391, "memory(GiB)": 21.48, "step": 10469, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957573 }, { "epoch": 0.3401227950492155, "grad_norm": 0.5160295963287354, "learning_rate": 7.870240648059928e-06, "loss": 0.031661126762628555, "memory(GiB)": 21.48, "step": 10470, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.3401552805119709, "grad_norm": 1.342725157737732, "learning_rate": 7.8698007990944e-06, "loss": 0.05003473907709122, "memory(GiB)": 21.48, "step": 10471, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.34018776597472633, "grad_norm": 0.4952698349952698, "learning_rate": 7.86936091700755e-06, "loss": 0.027934391051530838, "memory(GiB)": 21.48, "step": 10472, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.957619 }, { "epoch": 0.34022025143748175, "grad_norm": 0.4451085925102234, "learning_rate": 7.868921001804457e-06, "loss": 0.027427103370428085, "memory(GiB)": 21.48, "step": 10473, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.34025273690023716, "grad_norm": 0.3989076018333435, "learning_rate": 7.868481053490196e-06, "loss": 0.024672213941812515, "memory(GiB)": 21.48, "step": 10474, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.957647 }, { "epoch": 0.3402852223629926, "grad_norm": 0.39662638306617737, "learning_rate": 7.868041072069844e-06, "loss": 0.021733928471803665, "memory(GiB)": 21.48, "step": 10475, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.340317707825748, "grad_norm": 0.6181864142417908, "learning_rate": 7.867601057548484e-06, "loss": 0.030113091692328453, "memory(GiB)": 21.48, "step": 10476, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.3403501932885034, "grad_norm": 0.5112954378128052, "learning_rate": 7.86716100993119e-06, "loss": 0.03532523661851883, "memory(GiB)": 21.48, "step": 10477, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957683 }, { "epoch": 0.3403826787512588, "grad_norm": 0.45279660820961, "learning_rate": 7.866720929223041e-06, "loss": 0.0279054194688797, "memory(GiB)": 21.48, "step": 10478, "token_acc": 0.9759450171821306, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.34041516421401424, "grad_norm": 0.33962157368659973, "learning_rate": 7.866280815429118e-06, "loss": 0.033125318586826324, "memory(GiB)": 21.48, "step": 10479, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.34044764967676966, "grad_norm": 0.45114436745643616, "learning_rate": 7.865840668554499e-06, "loss": 0.029910612851381302, "memory(GiB)": 21.48, "step": 10480, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.3404801351395251, "grad_norm": 0.42934650182724, "learning_rate": 7.865400488604265e-06, "loss": 0.022924885153770447, "memory(GiB)": 21.48, "step": 10481, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.3405126206022805, "grad_norm": 0.38419580459594727, "learning_rate": 7.864960275583493e-06, "loss": 0.03199899569153786, "memory(GiB)": 21.48, "step": 10482, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.3405451060650359, "grad_norm": 0.37765100598335266, "learning_rate": 7.864520029497269e-06, "loss": 0.028349339962005615, "memory(GiB)": 21.48, "step": 10483, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957755 }, { "epoch": 0.3405775915277913, "grad_norm": 0.5066224336624146, "learning_rate": 7.86407975035067e-06, "loss": 0.03375301510095596, "memory(GiB)": 21.48, "step": 10484, "token_acc": 1.0, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.34061007699054674, "grad_norm": 0.6549086570739746, "learning_rate": 7.863639438148778e-06, "loss": 0.026226066052913666, "memory(GiB)": 21.48, "step": 10485, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.34064256245330216, "grad_norm": 0.30734190344810486, "learning_rate": 7.863199092896677e-06, "loss": 0.02113322727382183, "memory(GiB)": 21.48, "step": 10486, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.34067504791605757, "grad_norm": 0.374446302652359, "learning_rate": 7.862758714599446e-06, "loss": 0.02815905027091503, "memory(GiB)": 21.48, "step": 10487, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.9578 }, { "epoch": 0.340707533378813, "grad_norm": 0.5331777334213257, "learning_rate": 7.86231830326217e-06, "loss": 0.025753315538167953, "memory(GiB)": 21.48, "step": 10488, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.3407400188415684, "grad_norm": 0.5561439394950867, "learning_rate": 7.86187785888993e-06, "loss": 0.028857402503490448, "memory(GiB)": 21.48, "step": 10489, "token_acc": 1.0, "train_speed(iter/s)": 0.957823 }, { "epoch": 0.3407725043043238, "grad_norm": 0.3686617314815521, "learning_rate": 7.86143738148781e-06, "loss": 0.027378063648939133, "memory(GiB)": 21.48, "step": 10490, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957836 }, { "epoch": 0.34080498976707924, "grad_norm": 0.43268248438835144, "learning_rate": 7.860996871060896e-06, "loss": 0.030883699655532837, "memory(GiB)": 21.48, "step": 10491, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.34083747522983465, "grad_norm": 0.6513492465019226, "learning_rate": 7.86055632761427e-06, "loss": 0.03327450156211853, "memory(GiB)": 21.48, "step": 10492, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.34086996069259007, "grad_norm": 0.5535101294517517, "learning_rate": 7.860115751153014e-06, "loss": 0.04070749878883362, "memory(GiB)": 21.48, "step": 10493, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.957878 }, { "epoch": 0.3409024461553455, "grad_norm": 0.24473190307617188, "learning_rate": 7.859675141682219e-06, "loss": 0.019364727661013603, "memory(GiB)": 21.48, "step": 10494, "token_acc": 1.0, "train_speed(iter/s)": 0.957896 }, { "epoch": 0.3409349316181009, "grad_norm": 0.3980211317539215, "learning_rate": 7.859234499206964e-06, "loss": 0.03123488649725914, "memory(GiB)": 21.48, "step": 10495, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957915 }, { "epoch": 0.3409674170808563, "grad_norm": 0.7866989970207214, "learning_rate": 7.858793823732337e-06, "loss": 0.03455109894275665, "memory(GiB)": 21.48, "step": 10496, "token_acc": 0.975, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.34099990254361173, "grad_norm": 0.4027816951274872, "learning_rate": 7.858353115263426e-06, "loss": 0.028269900009036064, "memory(GiB)": 21.48, "step": 10497, "token_acc": 0.9831460674157303, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.34103238800636715, "grad_norm": 0.5122796893119812, "learning_rate": 7.857912373805315e-06, "loss": 0.029706677421927452, "memory(GiB)": 21.48, "step": 10498, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.34106487346912256, "grad_norm": 0.41183605790138245, "learning_rate": 7.857471599363093e-06, "loss": 0.028651943430304527, "memory(GiB)": 21.48, "step": 10499, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957993 }, { "epoch": 0.341097358931878, "grad_norm": 0.4145294725894928, "learning_rate": 7.857030791941843e-06, "loss": 0.03040282428264618, "memory(GiB)": 21.48, "step": 10500, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.341097358931878, "eval_loss": 0.03112662583589554, "eval_runtime": 79.1637, "eval_samples_per_second": 125.689, "eval_steps_per_second": 3.929, "eval_token_acc": 0.9879049349855934, "step": 10500 }, { "epoch": 0.3411298443946334, "grad_norm": 1.5153419971466064, "learning_rate": 7.856589951546656e-06, "loss": 0.03150281310081482, "memory(GiB)": 21.48, "step": 10501, "token_acc": 0.987662773342827, "train_speed(iter/s)": 0.950247 }, { "epoch": 0.3411623298573888, "grad_norm": 0.5088366866111755, "learning_rate": 7.856149078182618e-06, "loss": 0.028831303119659424, "memory(GiB)": 21.48, "step": 10502, "token_acc": 1.0, "train_speed(iter/s)": 0.950261 }, { "epoch": 0.34119481532014423, "grad_norm": 0.3929094076156616, "learning_rate": 7.855708171854818e-06, "loss": 0.03214564174413681, "memory(GiB)": 21.48, "step": 10503, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.950276 }, { "epoch": 0.34122730078289965, "grad_norm": 0.7595739960670471, "learning_rate": 7.855267232568346e-06, "loss": 0.0373850017786026, "memory(GiB)": 21.48, "step": 10504, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.950291 }, { "epoch": 0.34125978624565506, "grad_norm": 0.45908868312835693, "learning_rate": 7.85482626032829e-06, "loss": 0.021771950647234917, "memory(GiB)": 21.48, "step": 10505, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.950305 }, { "epoch": 0.3412922717084105, "grad_norm": 0.43535932898521423, "learning_rate": 7.854385255139739e-06, "loss": 0.034053102135658264, "memory(GiB)": 21.48, "step": 10506, "token_acc": 0.98046875, "train_speed(iter/s)": 0.950318 }, { "epoch": 0.3413247571711659, "grad_norm": 0.3744204342365265, "learning_rate": 7.853944217007781e-06, "loss": 0.029523391276597977, "memory(GiB)": 21.48, "step": 10507, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.950331 }, { "epoch": 0.3413572426339213, "grad_norm": 1.604637861251831, "learning_rate": 7.853503145937509e-06, "loss": 0.03341321647167206, "memory(GiB)": 21.48, "step": 10508, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.950344 }, { "epoch": 0.3413897280966767, "grad_norm": 0.4929189383983612, "learning_rate": 7.853062041934015e-06, "loss": 0.034760624170303345, "memory(GiB)": 21.48, "step": 10509, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.950356 }, { "epoch": 0.34142221355943214, "grad_norm": 0.42960837483406067, "learning_rate": 7.852620905002384e-06, "loss": 0.02926088124513626, "memory(GiB)": 21.48, "step": 10510, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.950371 }, { "epoch": 0.34145469902218756, "grad_norm": 0.516743004322052, "learning_rate": 7.852179735147713e-06, "loss": 0.030167164281010628, "memory(GiB)": 21.48, "step": 10511, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.950384 }, { "epoch": 0.341487184484943, "grad_norm": 0.515290379524231, "learning_rate": 7.851738532375092e-06, "loss": 0.04142200946807861, "memory(GiB)": 21.48, "step": 10512, "token_acc": 0.976, "train_speed(iter/s)": 0.950398 }, { "epoch": 0.3415196699476984, "grad_norm": 0.34192657470703125, "learning_rate": 7.851297296689611e-06, "loss": 0.022167561575770378, "memory(GiB)": 21.48, "step": 10513, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95041 }, { "epoch": 0.3415521554104538, "grad_norm": 0.31485462188720703, "learning_rate": 7.850856028096367e-06, "loss": 0.02210117317736149, "memory(GiB)": 21.48, "step": 10514, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.950424 }, { "epoch": 0.3415846408732092, "grad_norm": 0.46605169773101807, "learning_rate": 7.850414726600448e-06, "loss": 0.02535872906446457, "memory(GiB)": 21.48, "step": 10515, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.950437 }, { "epoch": 0.34161712633596464, "grad_norm": 0.6347101330757141, "learning_rate": 7.849973392206948e-06, "loss": 0.036506883800029755, "memory(GiB)": 21.48, "step": 10516, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.950451 }, { "epoch": 0.34164961179872005, "grad_norm": 0.32625800371170044, "learning_rate": 7.849532024920962e-06, "loss": 0.02029954455792904, "memory(GiB)": 21.48, "step": 10517, "token_acc": 1.0, "train_speed(iter/s)": 0.950464 }, { "epoch": 0.34168209726147547, "grad_norm": 0.407676100730896, "learning_rate": 7.849090624747587e-06, "loss": 0.027530957013368607, "memory(GiB)": 21.48, "step": 10518, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.950477 }, { "epoch": 0.3417145827242309, "grad_norm": 0.34961438179016113, "learning_rate": 7.84864919169191e-06, "loss": 0.021255621686577797, "memory(GiB)": 21.48, "step": 10519, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.950491 }, { "epoch": 0.3417470681869863, "grad_norm": 0.45692384243011475, "learning_rate": 7.848207725759033e-06, "loss": 0.03559694066643715, "memory(GiB)": 21.48, "step": 10520, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.950506 }, { "epoch": 0.3417795536497417, "grad_norm": 0.4325966536998749, "learning_rate": 7.847766226954045e-06, "loss": 0.021780602633953094, "memory(GiB)": 21.48, "step": 10521, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.950522 }, { "epoch": 0.34181203911249713, "grad_norm": 0.6301698088645935, "learning_rate": 7.847324695282047e-06, "loss": 0.02902970090508461, "memory(GiB)": 21.48, "step": 10522, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.950539 }, { "epoch": 0.34184452457525255, "grad_norm": 0.3813784420490265, "learning_rate": 7.84688313074813e-06, "loss": 0.02013454958796501, "memory(GiB)": 21.48, "step": 10523, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.950551 }, { "epoch": 0.34187701003800797, "grad_norm": 0.36472398042678833, "learning_rate": 7.846441533357394e-06, "loss": 0.019237075001001358, "memory(GiB)": 21.48, "step": 10524, "token_acc": 0.9901639344262295, "train_speed(iter/s)": 0.950564 }, { "epoch": 0.3419094955007634, "grad_norm": 0.2946547269821167, "learning_rate": 7.845999903114934e-06, "loss": 0.025624610483646393, "memory(GiB)": 21.48, "step": 10525, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.950577 }, { "epoch": 0.3419419809635188, "grad_norm": 0.3824152648448944, "learning_rate": 7.845558240025847e-06, "loss": 0.025681406259536743, "memory(GiB)": 21.48, "step": 10526, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.950593 }, { "epoch": 0.3419744664262742, "grad_norm": 0.31765472888946533, "learning_rate": 7.84511654409523e-06, "loss": 0.02022716775536537, "memory(GiB)": 21.48, "step": 10527, "token_acc": 1.0, "train_speed(iter/s)": 0.95061 }, { "epoch": 0.3420069518890297, "grad_norm": 0.3066459000110626, "learning_rate": 7.844674815328182e-06, "loss": 0.02069413661956787, "memory(GiB)": 21.48, "step": 10528, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.950626 }, { "epoch": 0.3420394373517851, "grad_norm": 0.404807984828949, "learning_rate": 7.844233053729799e-06, "loss": 0.020842300727963448, "memory(GiB)": 21.48, "step": 10529, "token_acc": 1.0, "train_speed(iter/s)": 0.950642 }, { "epoch": 0.3420719228145405, "grad_norm": 0.3900739848613739, "learning_rate": 7.843791259305183e-06, "loss": 0.031223520636558533, "memory(GiB)": 21.48, "step": 10530, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.950657 }, { "epoch": 0.34210440827729593, "grad_norm": 0.775568962097168, "learning_rate": 7.843349432059428e-06, "loss": 0.0374334454536438, "memory(GiB)": 21.48, "step": 10531, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.950672 }, { "epoch": 0.34213689374005135, "grad_norm": 0.356584370136261, "learning_rate": 7.84290757199764e-06, "loss": 0.023663144558668137, "memory(GiB)": 21.48, "step": 10532, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.950691 }, { "epoch": 0.34216937920280677, "grad_norm": 0.637854278087616, "learning_rate": 7.84246567912491e-06, "loss": 0.03850990906357765, "memory(GiB)": 21.48, "step": 10533, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.950708 }, { "epoch": 0.3422018646655622, "grad_norm": 0.6798805594444275, "learning_rate": 7.842023753446345e-06, "loss": 0.03624966740608215, "memory(GiB)": 21.48, "step": 10534, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.950727 }, { "epoch": 0.3422343501283176, "grad_norm": 0.632546067237854, "learning_rate": 7.841581794967044e-06, "loss": 0.03112415224313736, "memory(GiB)": 21.48, "step": 10535, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.950747 }, { "epoch": 0.342266835591073, "grad_norm": 0.42299389839172363, "learning_rate": 7.841139803692108e-06, "loss": 0.02762019820511341, "memory(GiB)": 21.48, "step": 10536, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.950765 }, { "epoch": 0.34229932105382843, "grad_norm": 0.4393637776374817, "learning_rate": 7.840697779626634e-06, "loss": 0.02957465499639511, "memory(GiB)": 21.48, "step": 10537, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950784 }, { "epoch": 0.34233180651658385, "grad_norm": 0.46929195523262024, "learning_rate": 7.840255722775728e-06, "loss": 0.02678774483501911, "memory(GiB)": 21.48, "step": 10538, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.950803 }, { "epoch": 0.34236429197933926, "grad_norm": 0.5348209738731384, "learning_rate": 7.83981363314449e-06, "loss": 0.03405188396573067, "memory(GiB)": 21.48, "step": 10539, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.950824 }, { "epoch": 0.3423967774420947, "grad_norm": 0.6496565937995911, "learning_rate": 7.839371510738023e-06, "loss": 0.0336863212287426, "memory(GiB)": 21.48, "step": 10540, "token_acc": 0.9865319865319865, "train_speed(iter/s)": 0.950844 }, { "epoch": 0.3424292629048501, "grad_norm": 0.5826122760772705, "learning_rate": 7.83892935556143e-06, "loss": 0.03949294611811638, "memory(GiB)": 21.48, "step": 10541, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.950862 }, { "epoch": 0.3424617483676055, "grad_norm": 0.5338467359542847, "learning_rate": 7.838487167619815e-06, "loss": 0.02597871609032154, "memory(GiB)": 21.48, "step": 10542, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.950881 }, { "epoch": 0.34249423383036093, "grad_norm": 0.526077389717102, "learning_rate": 7.838044946918278e-06, "loss": 0.029058299958705902, "memory(GiB)": 21.48, "step": 10543, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.9509 }, { "epoch": 0.34252671929311634, "grad_norm": 0.42920824885368347, "learning_rate": 7.837602693461925e-06, "loss": 0.030858049169182777, "memory(GiB)": 21.48, "step": 10544, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.950919 }, { "epoch": 0.34255920475587176, "grad_norm": 0.48019880056381226, "learning_rate": 7.837160407255861e-06, "loss": 0.030632108449935913, "memory(GiB)": 21.48, "step": 10545, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.950939 }, { "epoch": 0.3425916902186272, "grad_norm": 0.44047796726226807, "learning_rate": 7.83671808830519e-06, "loss": 0.030436357483267784, "memory(GiB)": 21.48, "step": 10546, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.950934 }, { "epoch": 0.3426241756813826, "grad_norm": 0.4893595278263092, "learning_rate": 7.836275736615016e-06, "loss": 0.026306135579943657, "memory(GiB)": 21.48, "step": 10547, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.950954 }, { "epoch": 0.342656661144138, "grad_norm": 0.4057465195655823, "learning_rate": 7.835833352190445e-06, "loss": 0.027550622820854187, "memory(GiB)": 21.48, "step": 10548, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.950972 }, { "epoch": 0.3426891466068934, "grad_norm": 0.3833303451538086, "learning_rate": 7.835390935036583e-06, "loss": 0.036455199122428894, "memory(GiB)": 21.48, "step": 10549, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.95099 }, { "epoch": 0.34272163206964884, "grad_norm": 0.4514599144458771, "learning_rate": 7.834948485158534e-06, "loss": 0.03160626441240311, "memory(GiB)": 21.48, "step": 10550, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.95101 }, { "epoch": 0.34275411753240426, "grad_norm": 0.6313471794128418, "learning_rate": 7.834506002561408e-06, "loss": 0.03601297736167908, "memory(GiB)": 21.48, "step": 10551, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.95103 }, { "epoch": 0.3427866029951597, "grad_norm": 0.3297739028930664, "learning_rate": 7.83406348725031e-06, "loss": 0.025415267795324326, "memory(GiB)": 21.48, "step": 10552, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.95105 }, { "epoch": 0.3428190884579151, "grad_norm": 0.7257143259048462, "learning_rate": 7.833620939230346e-06, "loss": 0.03162665292620659, "memory(GiB)": 21.48, "step": 10553, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95107 }, { "epoch": 0.3428515739206705, "grad_norm": 1.0363717079162598, "learning_rate": 7.833178358506625e-06, "loss": 0.02232883684337139, "memory(GiB)": 21.48, "step": 10554, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.951089 }, { "epoch": 0.3428840593834259, "grad_norm": 0.35770314931869507, "learning_rate": 7.832735745084254e-06, "loss": 0.027369409799575806, "memory(GiB)": 21.48, "step": 10555, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.951107 }, { "epoch": 0.34291654484618134, "grad_norm": 0.34664708375930786, "learning_rate": 7.832293098968345e-06, "loss": 0.027098961174488068, "memory(GiB)": 21.48, "step": 10556, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.951125 }, { "epoch": 0.34294903030893675, "grad_norm": 0.5675469636917114, "learning_rate": 7.831850420164002e-06, "loss": 0.03819043189287186, "memory(GiB)": 21.48, "step": 10557, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.951144 }, { "epoch": 0.34298151577169217, "grad_norm": 0.5663265585899353, "learning_rate": 7.831407708676336e-06, "loss": 0.036777786910533905, "memory(GiB)": 21.48, "step": 10558, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.951164 }, { "epoch": 0.3430140012344476, "grad_norm": 0.4498383402824402, "learning_rate": 7.830964964510457e-06, "loss": 0.04390848055481911, "memory(GiB)": 21.48, "step": 10559, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.951181 }, { "epoch": 0.343046486697203, "grad_norm": 0.34498417377471924, "learning_rate": 7.830522187671475e-06, "loss": 0.02724478766322136, "memory(GiB)": 21.48, "step": 10560, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.951193 }, { "epoch": 0.3430789721599584, "grad_norm": 0.3369826078414917, "learning_rate": 7.830079378164497e-06, "loss": 0.02339676022529602, "memory(GiB)": 21.48, "step": 10561, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951208 }, { "epoch": 0.34311145762271383, "grad_norm": 0.4555431604385376, "learning_rate": 7.829636535994638e-06, "loss": 0.029001206159591675, "memory(GiB)": 21.48, "step": 10562, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.951224 }, { "epoch": 0.34314394308546925, "grad_norm": 0.5409367084503174, "learning_rate": 7.829193661167007e-06, "loss": 0.03554593771696091, "memory(GiB)": 21.48, "step": 10563, "token_acc": 0.9875, "train_speed(iter/s)": 0.951239 }, { "epoch": 0.34317642854822467, "grad_norm": 0.3943725824356079, "learning_rate": 7.828750753686715e-06, "loss": 0.034463658928871155, "memory(GiB)": 21.48, "step": 10564, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.951251 }, { "epoch": 0.3432089140109801, "grad_norm": 0.3644789159297943, "learning_rate": 7.828307813558876e-06, "loss": 0.030508939176797867, "memory(GiB)": 21.48, "step": 10565, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.3432413994737355, "grad_norm": 0.44410380721092224, "learning_rate": 7.827864840788597e-06, "loss": 0.029764937236905098, "memory(GiB)": 21.48, "step": 10566, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.951282 }, { "epoch": 0.3432738849364909, "grad_norm": 0.3050605356693268, "learning_rate": 7.827421835380996e-06, "loss": 0.028425609692931175, "memory(GiB)": 21.48, "step": 10567, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.951297 }, { "epoch": 0.34330637039924633, "grad_norm": 0.4498586356639862, "learning_rate": 7.826978797341183e-06, "loss": 0.03616601973772049, "memory(GiB)": 21.48, "step": 10568, "token_acc": 0.985, "train_speed(iter/s)": 0.951311 }, { "epoch": 0.34333885586200175, "grad_norm": 0.4364263713359833, "learning_rate": 7.826535726674274e-06, "loss": 0.028161566704511642, "memory(GiB)": 21.48, "step": 10569, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.951325 }, { "epoch": 0.34337134132475716, "grad_norm": 0.321120947599411, "learning_rate": 7.826092623385377e-06, "loss": 0.024590399116277695, "memory(GiB)": 21.48, "step": 10570, "token_acc": 1.0, "train_speed(iter/s)": 0.951341 }, { "epoch": 0.3434038267875126, "grad_norm": 0.44893911480903625, "learning_rate": 7.825649487479612e-06, "loss": 0.025952108204364777, "memory(GiB)": 21.48, "step": 10571, "token_acc": 1.0, "train_speed(iter/s)": 0.951354 }, { "epoch": 0.343436312250268, "grad_norm": 0.3999670147895813, "learning_rate": 7.82520631896209e-06, "loss": 0.03859787434339523, "memory(GiB)": 21.48, "step": 10572, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.951367 }, { "epoch": 0.3434687977130234, "grad_norm": 0.5411379337310791, "learning_rate": 7.824763117837928e-06, "loss": 0.031539157032966614, "memory(GiB)": 21.48, "step": 10573, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95138 }, { "epoch": 0.3435012831757788, "grad_norm": 0.4365778863430023, "learning_rate": 7.824319884112239e-06, "loss": 0.028169456869363785, "memory(GiB)": 21.48, "step": 10574, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.951394 }, { "epoch": 0.34353376863853424, "grad_norm": 0.41942042112350464, "learning_rate": 7.82387661779014e-06, "loss": 0.0321626253426075, "memory(GiB)": 21.48, "step": 10575, "token_acc": 1.0, "train_speed(iter/s)": 0.951408 }, { "epoch": 0.34356625410128966, "grad_norm": 0.3316068947315216, "learning_rate": 7.823433318876745e-06, "loss": 0.026085589081048965, "memory(GiB)": 21.48, "step": 10576, "token_acc": 1.0, "train_speed(iter/s)": 0.951422 }, { "epoch": 0.3435987395640451, "grad_norm": 0.367612361907959, "learning_rate": 7.822989987377175e-06, "loss": 0.03377637267112732, "memory(GiB)": 21.48, "step": 10577, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.951436 }, { "epoch": 0.3436312250268005, "grad_norm": 0.3942994773387909, "learning_rate": 7.822546623296539e-06, "loss": 0.025030992925167084, "memory(GiB)": 21.48, "step": 10578, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.951449 }, { "epoch": 0.3436637104895559, "grad_norm": 0.8611204028129578, "learning_rate": 7.82210322663996e-06, "loss": 0.03319723159074783, "memory(GiB)": 21.48, "step": 10579, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951464 }, { "epoch": 0.3436961959523113, "grad_norm": 1.1139932870864868, "learning_rate": 7.821659797412552e-06, "loss": 0.032523173838853836, "memory(GiB)": 21.48, "step": 10580, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95148 }, { "epoch": 0.34372868141506674, "grad_norm": 0.4879244863986969, "learning_rate": 7.821216335619437e-06, "loss": 0.03095146454870701, "memory(GiB)": 21.48, "step": 10581, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.951494 }, { "epoch": 0.34376116687782216, "grad_norm": 0.46051865816116333, "learning_rate": 7.820772841265728e-06, "loss": 0.035437583923339844, "memory(GiB)": 21.48, "step": 10582, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.951508 }, { "epoch": 0.34379365234057757, "grad_norm": 0.35929811000823975, "learning_rate": 7.820329314356547e-06, "loss": 0.026635266840457916, "memory(GiB)": 21.48, "step": 10583, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.951523 }, { "epoch": 0.343826137803333, "grad_norm": 0.36421939730644226, "learning_rate": 7.819885754897013e-06, "loss": 0.023419629782438278, "memory(GiB)": 21.48, "step": 10584, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.951538 }, { "epoch": 0.3438586232660884, "grad_norm": 0.42321130633354187, "learning_rate": 7.819442162892242e-06, "loss": 0.029724933207035065, "memory(GiB)": 21.48, "step": 10585, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.951554 }, { "epoch": 0.3438911087288438, "grad_norm": 0.4140467643737793, "learning_rate": 7.818998538347358e-06, "loss": 0.02326164022088051, "memory(GiB)": 21.48, "step": 10586, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.95157 }, { "epoch": 0.34392359419159924, "grad_norm": 0.3783416152000427, "learning_rate": 7.818554881267477e-06, "loss": 0.026073787361383438, "memory(GiB)": 21.48, "step": 10587, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.951586 }, { "epoch": 0.34395607965435465, "grad_norm": 0.33107998967170715, "learning_rate": 7.81811119165772e-06, "loss": 0.026279117912054062, "memory(GiB)": 21.48, "step": 10588, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.9516 }, { "epoch": 0.34398856511711007, "grad_norm": 0.3221156597137451, "learning_rate": 7.817667469523212e-06, "loss": 0.019969679415225983, "memory(GiB)": 21.48, "step": 10589, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.951615 }, { "epoch": 0.3440210505798655, "grad_norm": 0.4722348153591156, "learning_rate": 7.81722371486907e-06, "loss": 0.03351961821317673, "memory(GiB)": 21.48, "step": 10590, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.951628 }, { "epoch": 0.3440535360426209, "grad_norm": 0.3705870807170868, "learning_rate": 7.816779927700417e-06, "loss": 0.023462794721126556, "memory(GiB)": 21.48, "step": 10591, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.951642 }, { "epoch": 0.34408602150537637, "grad_norm": 0.4614645540714264, "learning_rate": 7.816336108022373e-06, "loss": 0.031509168446063995, "memory(GiB)": 21.48, "step": 10592, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.951657 }, { "epoch": 0.3441185069681318, "grad_norm": 0.549877941608429, "learning_rate": 7.815892255840063e-06, "loss": 0.04021592438220978, "memory(GiB)": 21.48, "step": 10593, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.951671 }, { "epoch": 0.3441509924308872, "grad_norm": 0.38877952098846436, "learning_rate": 7.815448371158609e-06, "loss": 0.02369549497961998, "memory(GiB)": 21.48, "step": 10594, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.951686 }, { "epoch": 0.3441834778936426, "grad_norm": 0.4002464711666107, "learning_rate": 7.815004453983132e-06, "loss": 0.023532867431640625, "memory(GiB)": 21.48, "step": 10595, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.951702 }, { "epoch": 0.34421596335639804, "grad_norm": 0.44288283586502075, "learning_rate": 7.814560504318758e-06, "loss": 0.029275167733430862, "memory(GiB)": 21.48, "step": 10596, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.951721 }, { "epoch": 0.34424844881915345, "grad_norm": 0.5232624411582947, "learning_rate": 7.814116522170608e-06, "loss": 0.03089210018515587, "memory(GiB)": 21.48, "step": 10597, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.95174 }, { "epoch": 0.34428093428190887, "grad_norm": 0.626531183719635, "learning_rate": 7.813672507543808e-06, "loss": 0.03556641936302185, "memory(GiB)": 21.48, "step": 10598, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95176 }, { "epoch": 0.3443134197446643, "grad_norm": 0.4371359646320343, "learning_rate": 7.813228460443485e-06, "loss": 0.02835153229534626, "memory(GiB)": 21.48, "step": 10599, "token_acc": 0.9896551724137931, "train_speed(iter/s)": 0.95178 }, { "epoch": 0.3443459052074197, "grad_norm": 0.40639594197273254, "learning_rate": 7.812784380874759e-06, "loss": 0.026348203420639038, "memory(GiB)": 21.48, "step": 10600, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.951798 }, { "epoch": 0.3443783906701751, "grad_norm": 0.36692774295806885, "learning_rate": 7.812340268842758e-06, "loss": 0.025924496352672577, "memory(GiB)": 21.48, "step": 10601, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.951816 }, { "epoch": 0.34441087613293053, "grad_norm": 0.7093736529350281, "learning_rate": 7.811896124352605e-06, "loss": 0.028757132589817047, "memory(GiB)": 21.48, "step": 10602, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.951836 }, { "epoch": 0.34444336159568595, "grad_norm": 0.4267081618309021, "learning_rate": 7.81145194740943e-06, "loss": 0.026744477450847626, "memory(GiB)": 21.48, "step": 10603, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.951855 }, { "epoch": 0.34447584705844136, "grad_norm": 0.671462893486023, "learning_rate": 7.811007738018356e-06, "loss": 0.03561133146286011, "memory(GiB)": 21.48, "step": 10604, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.951874 }, { "epoch": 0.3445083325211968, "grad_norm": 0.48340466618537903, "learning_rate": 7.810563496184513e-06, "loss": 0.03167914226651192, "memory(GiB)": 21.48, "step": 10605, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.951893 }, { "epoch": 0.3445408179839522, "grad_norm": 0.4648934006690979, "learning_rate": 7.810119221913025e-06, "loss": 0.023419804871082306, "memory(GiB)": 21.48, "step": 10606, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.951913 }, { "epoch": 0.3445733034467076, "grad_norm": 0.5325967073440552, "learning_rate": 7.809674915209025e-06, "loss": 0.0302170030772686, "memory(GiB)": 21.48, "step": 10607, "token_acc": 0.9746192893401016, "train_speed(iter/s)": 0.951932 }, { "epoch": 0.34460578890946303, "grad_norm": 0.8177430033683777, "learning_rate": 7.809230576077633e-06, "loss": 0.02482481859624386, "memory(GiB)": 21.48, "step": 10608, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.951952 }, { "epoch": 0.34463827437221844, "grad_norm": 0.40893852710723877, "learning_rate": 7.808786204523981e-06, "loss": 0.026434408500790596, "memory(GiB)": 21.48, "step": 10609, "token_acc": 0.9856115107913669, "train_speed(iter/s)": 0.951972 }, { "epoch": 0.34467075983497386, "grad_norm": 0.815460205078125, "learning_rate": 7.808341800553198e-06, "loss": 0.03618498891592026, "memory(GiB)": 21.48, "step": 10610, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.951991 }, { "epoch": 0.3447032452977293, "grad_norm": 0.5225691199302673, "learning_rate": 7.807897364170415e-06, "loss": 0.03256797045469284, "memory(GiB)": 21.48, "step": 10611, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.95201 }, { "epoch": 0.3447357307604847, "grad_norm": 0.5053731799125671, "learning_rate": 7.807452895380756e-06, "loss": 0.032703712582588196, "memory(GiB)": 21.48, "step": 10612, "token_acc": 1.0, "train_speed(iter/s)": 0.95203 }, { "epoch": 0.3447682162232401, "grad_norm": 0.39437082409858704, "learning_rate": 7.807008394189356e-06, "loss": 0.02123936638236046, "memory(GiB)": 21.48, "step": 10613, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.952048 }, { "epoch": 0.3448007016859955, "grad_norm": 0.3831726908683777, "learning_rate": 7.806563860601342e-06, "loss": 0.023857353255152702, "memory(GiB)": 21.48, "step": 10614, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952068 }, { "epoch": 0.34483318714875094, "grad_norm": 0.3405223786830902, "learning_rate": 7.806119294621845e-06, "loss": 0.026002120226621628, "memory(GiB)": 21.48, "step": 10615, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.952087 }, { "epoch": 0.34486567261150636, "grad_norm": 0.4430062174797058, "learning_rate": 7.805674696255998e-06, "loss": 0.027600305154919624, "memory(GiB)": 21.48, "step": 10616, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.952105 }, { "epoch": 0.3448981580742618, "grad_norm": 0.47690415382385254, "learning_rate": 7.80523006550893e-06, "loss": 0.038270287215709686, "memory(GiB)": 21.48, "step": 10617, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.952124 }, { "epoch": 0.3449306435370172, "grad_norm": 0.5421208143234253, "learning_rate": 7.804785402385772e-06, "loss": 0.036823008209466934, "memory(GiB)": 21.48, "step": 10618, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.952142 }, { "epoch": 0.3449631289997726, "grad_norm": 0.5350589156150818, "learning_rate": 7.804340706891658e-06, "loss": 0.0310111865401268, "memory(GiB)": 21.48, "step": 10619, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.952158 }, { "epoch": 0.344995614462528, "grad_norm": 0.4335222840309143, "learning_rate": 7.803895979031722e-06, "loss": 0.028277236968278885, "memory(GiB)": 21.48, "step": 10620, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.952171 }, { "epoch": 0.34502809992528344, "grad_norm": 0.43188926577568054, "learning_rate": 7.80345121881109e-06, "loss": 0.03241275995969772, "memory(GiB)": 21.48, "step": 10621, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.952186 }, { "epoch": 0.34506058538803885, "grad_norm": 0.4060593843460083, "learning_rate": 7.803006426234905e-06, "loss": 0.028429651632905006, "memory(GiB)": 21.48, "step": 10622, "token_acc": 1.0, "train_speed(iter/s)": 0.9522 }, { "epoch": 0.34509307085079427, "grad_norm": 0.5128370523452759, "learning_rate": 7.802561601308292e-06, "loss": 0.027606654912233353, "memory(GiB)": 21.48, "step": 10623, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.952215 }, { "epoch": 0.3451255563135497, "grad_norm": 0.7374517917633057, "learning_rate": 7.802116744036387e-06, "loss": 0.024761933833360672, "memory(GiB)": 21.48, "step": 10624, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.95223 }, { "epoch": 0.3451580417763051, "grad_norm": 1.063231110572815, "learning_rate": 7.801671854424327e-06, "loss": 0.026235435158014297, "memory(GiB)": 21.48, "step": 10625, "token_acc": 0.992, "train_speed(iter/s)": 0.952244 }, { "epoch": 0.3451905272390605, "grad_norm": 0.7728230357170105, "learning_rate": 7.801226932477244e-06, "loss": 0.033666349947452545, "memory(GiB)": 21.48, "step": 10626, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.952258 }, { "epoch": 0.34522301270181593, "grad_norm": 0.5426203608512878, "learning_rate": 7.800781978200273e-06, "loss": 0.024464115500450134, "memory(GiB)": 21.48, "step": 10627, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.952272 }, { "epoch": 0.34525549816457135, "grad_norm": 0.36233216524124146, "learning_rate": 7.800336991598551e-06, "loss": 0.022468149662017822, "memory(GiB)": 21.48, "step": 10628, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.952287 }, { "epoch": 0.34528798362732677, "grad_norm": 0.43241894245147705, "learning_rate": 7.799891972677212e-06, "loss": 0.02662528306245804, "memory(GiB)": 21.48, "step": 10629, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.952301 }, { "epoch": 0.3453204690900822, "grad_norm": 0.4940487742424011, "learning_rate": 7.799446921441394e-06, "loss": 0.035103339701890945, "memory(GiB)": 21.48, "step": 10630, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.952315 }, { "epoch": 0.3453529545528376, "grad_norm": 0.45289236307144165, "learning_rate": 7.799001837896234e-06, "loss": 0.0379418320953846, "memory(GiB)": 21.48, "step": 10631, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.95233 }, { "epoch": 0.345385440015593, "grad_norm": 0.6851629018783569, "learning_rate": 7.798556722046864e-06, "loss": 0.048804063349962234, "memory(GiB)": 21.48, "step": 10632, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.952345 }, { "epoch": 0.34541792547834843, "grad_norm": 0.4557182788848877, "learning_rate": 7.798111573898426e-06, "loss": 0.03242582082748413, "memory(GiB)": 21.48, "step": 10633, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.95236 }, { "epoch": 0.34545041094110385, "grad_norm": 0.31174367666244507, "learning_rate": 7.797666393456056e-06, "loss": 0.025959651917219162, "memory(GiB)": 21.48, "step": 10634, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952376 }, { "epoch": 0.34548289640385926, "grad_norm": 0.42864924669265747, "learning_rate": 7.797221180724892e-06, "loss": 0.026222478598356247, "memory(GiB)": 21.48, "step": 10635, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.952389 }, { "epoch": 0.3455153818666147, "grad_norm": 0.5167312026023865, "learning_rate": 7.796775935710073e-06, "loss": 0.03467301279306412, "memory(GiB)": 21.48, "step": 10636, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.952402 }, { "epoch": 0.3455478673293701, "grad_norm": 0.5082632899284363, "learning_rate": 7.796330658416737e-06, "loss": 0.03862415999174118, "memory(GiB)": 21.48, "step": 10637, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.952415 }, { "epoch": 0.3455803527921255, "grad_norm": 0.4149068295955658, "learning_rate": 7.795885348850025e-06, "loss": 0.03129131719470024, "memory(GiB)": 21.48, "step": 10638, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.952428 }, { "epoch": 0.3456128382548809, "grad_norm": 0.4806230366230011, "learning_rate": 7.795440007015072e-06, "loss": 0.033291056752204895, "memory(GiB)": 21.48, "step": 10639, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.952443 }, { "epoch": 0.34564532371763634, "grad_norm": 0.8630151152610779, "learning_rate": 7.79499463291702e-06, "loss": 0.022904042154550552, "memory(GiB)": 21.48, "step": 10640, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.952458 }, { "epoch": 0.34567780918039176, "grad_norm": 0.47611597180366516, "learning_rate": 7.794549226561014e-06, "loss": 0.024967540055513382, "memory(GiB)": 21.48, "step": 10641, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.952459 }, { "epoch": 0.3457102946431472, "grad_norm": 0.4174019396305084, "learning_rate": 7.794103787952188e-06, "loss": 0.019865557551383972, "memory(GiB)": 21.48, "step": 10642, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.952474 }, { "epoch": 0.3457427801059026, "grad_norm": 0.41230475902557373, "learning_rate": 7.793658317095684e-06, "loss": 0.025425981730222702, "memory(GiB)": 21.48, "step": 10643, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95249 }, { "epoch": 0.345775265568658, "grad_norm": 0.4577416181564331, "learning_rate": 7.793212813996646e-06, "loss": 0.031321875751018524, "memory(GiB)": 21.48, "step": 10644, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.952506 }, { "epoch": 0.3458077510314134, "grad_norm": 0.43459439277648926, "learning_rate": 7.792767278660215e-06, "loss": 0.0214717835187912, "memory(GiB)": 21.48, "step": 10645, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95252 }, { "epoch": 0.34584023649416884, "grad_norm": 0.683751106262207, "learning_rate": 7.792321711091533e-06, "loss": 0.03212258219718933, "memory(GiB)": 21.48, "step": 10646, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.952534 }, { "epoch": 0.34587272195692426, "grad_norm": 0.3614245653152466, "learning_rate": 7.791876111295738e-06, "loss": 0.030163463205099106, "memory(GiB)": 21.48, "step": 10647, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.952549 }, { "epoch": 0.34590520741967967, "grad_norm": 0.585665225982666, "learning_rate": 7.791430479277979e-06, "loss": 0.03609849512577057, "memory(GiB)": 21.48, "step": 10648, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.952562 }, { "epoch": 0.3459376928824351, "grad_norm": 0.3967894911766052, "learning_rate": 7.790984815043398e-06, "loss": 0.02452205866575241, "memory(GiB)": 21.48, "step": 10649, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952579 }, { "epoch": 0.3459701783451905, "grad_norm": 0.4564524292945862, "learning_rate": 7.790539118597138e-06, "loss": 0.03463727608323097, "memory(GiB)": 21.48, "step": 10650, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.952595 }, { "epoch": 0.3460026638079459, "grad_norm": 0.44598594307899475, "learning_rate": 7.790093389944339e-06, "loss": 0.02650836668908596, "memory(GiB)": 21.48, "step": 10651, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.952611 }, { "epoch": 0.34603514927070134, "grad_norm": 1.0413658618927002, "learning_rate": 7.789647629090151e-06, "loss": 0.035873278975486755, "memory(GiB)": 21.48, "step": 10652, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.952625 }, { "epoch": 0.34606763473345675, "grad_norm": 0.3939465582370758, "learning_rate": 7.789201836039718e-06, "loss": 0.024402104318141937, "memory(GiB)": 21.48, "step": 10653, "token_acc": 1.0, "train_speed(iter/s)": 0.95264 }, { "epoch": 0.34610012019621217, "grad_norm": 0.7067056894302368, "learning_rate": 7.78875601079818e-06, "loss": 0.03151317685842514, "memory(GiB)": 21.48, "step": 10654, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.952653 }, { "epoch": 0.3461326056589676, "grad_norm": 0.6596706509590149, "learning_rate": 7.788310153370687e-06, "loss": 0.029517177492380142, "memory(GiB)": 21.48, "step": 10655, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952668 }, { "epoch": 0.34616509112172306, "grad_norm": 0.4594031572341919, "learning_rate": 7.787864263762384e-06, "loss": 0.03355002403259277, "memory(GiB)": 21.48, "step": 10656, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.952683 }, { "epoch": 0.3461975765844785, "grad_norm": 0.4429301619529724, "learning_rate": 7.787418341978416e-06, "loss": 0.03461487963795662, "memory(GiB)": 21.48, "step": 10657, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952694 }, { "epoch": 0.3462300620472339, "grad_norm": 0.2966492176055908, "learning_rate": 7.786972388023932e-06, "loss": 0.023284070193767548, "memory(GiB)": 21.48, "step": 10658, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.952709 }, { "epoch": 0.3462625475099893, "grad_norm": 0.3616361618041992, "learning_rate": 7.786526401904076e-06, "loss": 0.02363239973783493, "memory(GiB)": 21.48, "step": 10659, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.952727 }, { "epoch": 0.3462950329727447, "grad_norm": 0.29670554399490356, "learning_rate": 7.786080383623997e-06, "loss": 0.02352163754403591, "memory(GiB)": 21.48, "step": 10660, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.952745 }, { "epoch": 0.34632751843550014, "grad_norm": 0.43641340732574463, "learning_rate": 7.785634333188841e-06, "loss": 0.02847258746623993, "memory(GiB)": 21.48, "step": 10661, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.952764 }, { "epoch": 0.34636000389825555, "grad_norm": 0.5219308137893677, "learning_rate": 7.785188250603759e-06, "loss": 0.04081806167960167, "memory(GiB)": 21.48, "step": 10662, "token_acc": 0.9680365296803652, "train_speed(iter/s)": 0.952781 }, { "epoch": 0.34639248936101097, "grad_norm": 0.3754110634326935, "learning_rate": 7.784742135873895e-06, "loss": 0.029410891234874725, "memory(GiB)": 21.48, "step": 10663, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.952797 }, { "epoch": 0.3464249748237664, "grad_norm": 0.4087541997432709, "learning_rate": 7.784295989004403e-06, "loss": 0.02767735905945301, "memory(GiB)": 21.48, "step": 10664, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.952816 }, { "epoch": 0.3464574602865218, "grad_norm": 0.5023855566978455, "learning_rate": 7.783849810000427e-06, "loss": 0.035979628562927246, "memory(GiB)": 21.48, "step": 10665, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.952835 }, { "epoch": 0.3464899457492772, "grad_norm": 0.5263110995292664, "learning_rate": 7.783403598867119e-06, "loss": 0.029816536232829094, "memory(GiB)": 21.48, "step": 10666, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.952853 }, { "epoch": 0.34652243121203263, "grad_norm": 0.3266545832157135, "learning_rate": 7.782957355609631e-06, "loss": 0.02944124862551689, "memory(GiB)": 21.48, "step": 10667, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.952871 }, { "epoch": 0.34655491667478805, "grad_norm": 0.392622709274292, "learning_rate": 7.782511080233108e-06, "loss": 0.03734280914068222, "memory(GiB)": 21.48, "step": 10668, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.952891 }, { "epoch": 0.34658740213754347, "grad_norm": 0.6189818382263184, "learning_rate": 7.782064772742706e-06, "loss": 0.03772248327732086, "memory(GiB)": 21.48, "step": 10669, "token_acc": 1.0, "train_speed(iter/s)": 0.95291 }, { "epoch": 0.3466198876002989, "grad_norm": 0.36135947704315186, "learning_rate": 7.781618433143572e-06, "loss": 0.0319327674806118, "memory(GiB)": 21.48, "step": 10670, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95293 }, { "epoch": 0.3466523730630543, "grad_norm": 0.3842368721961975, "learning_rate": 7.78117206144086e-06, "loss": 0.035440877079963684, "memory(GiB)": 21.48, "step": 10671, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.952948 }, { "epoch": 0.3466848585258097, "grad_norm": 0.555090069770813, "learning_rate": 7.780725657639721e-06, "loss": 0.024886921048164368, "memory(GiB)": 21.48, "step": 10672, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.952967 }, { "epoch": 0.34671734398856513, "grad_norm": 0.3463706374168396, "learning_rate": 7.780279221745306e-06, "loss": 0.025133026763796806, "memory(GiB)": 21.48, "step": 10673, "token_acc": 0.9921875, "train_speed(iter/s)": 0.952987 }, { "epoch": 0.34674982945132055, "grad_norm": 0.3590995967388153, "learning_rate": 7.779832753762769e-06, "loss": 0.027384210377931595, "memory(GiB)": 21.48, "step": 10674, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.953007 }, { "epoch": 0.34678231491407596, "grad_norm": 0.3457823395729065, "learning_rate": 7.779386253697261e-06, "loss": 0.02125609666109085, "memory(GiB)": 21.48, "step": 10675, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953025 }, { "epoch": 0.3468148003768314, "grad_norm": 0.3715829849243164, "learning_rate": 7.778939721553938e-06, "loss": 0.029690835624933243, "memory(GiB)": 21.48, "step": 10676, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.953043 }, { "epoch": 0.3468472858395868, "grad_norm": 0.4402425289154053, "learning_rate": 7.77849315733795e-06, "loss": 0.037680186331272125, "memory(GiB)": 21.48, "step": 10677, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.953062 }, { "epoch": 0.3468797713023422, "grad_norm": 0.31462353467941284, "learning_rate": 7.778046561054453e-06, "loss": 0.03022419661283493, "memory(GiB)": 21.48, "step": 10678, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.953077 }, { "epoch": 0.3469122567650976, "grad_norm": 0.36858850717544556, "learning_rate": 7.777599932708602e-06, "loss": 0.030572891235351562, "memory(GiB)": 21.48, "step": 10679, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.953092 }, { "epoch": 0.34694474222785304, "grad_norm": 0.38134267926216125, "learning_rate": 7.77715327230555e-06, "loss": 0.033839963376522064, "memory(GiB)": 21.48, "step": 10680, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953104 }, { "epoch": 0.34697722769060846, "grad_norm": 0.3907892405986786, "learning_rate": 7.776706579850452e-06, "loss": 0.032841045409440994, "memory(GiB)": 21.48, "step": 10681, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.953119 }, { "epoch": 0.3470097131533639, "grad_norm": 1.4973739385604858, "learning_rate": 7.776259855348466e-06, "loss": 0.03169366717338562, "memory(GiB)": 21.48, "step": 10682, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953134 }, { "epoch": 0.3470421986161193, "grad_norm": 0.6611727476119995, "learning_rate": 7.775813098804747e-06, "loss": 0.03767239674925804, "memory(GiB)": 21.48, "step": 10683, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953148 }, { "epoch": 0.3470746840788747, "grad_norm": 0.2973240315914154, "learning_rate": 7.775366310224449e-06, "loss": 0.025022640824317932, "memory(GiB)": 21.48, "step": 10684, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.953164 }, { "epoch": 0.3471071695416301, "grad_norm": 0.4313606321811676, "learning_rate": 7.774919489612729e-06, "loss": 0.025364873930811882, "memory(GiB)": 21.48, "step": 10685, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.953178 }, { "epoch": 0.34713965500438554, "grad_norm": 1.4540314674377441, "learning_rate": 7.774472636974746e-06, "loss": 0.025082919746637344, "memory(GiB)": 21.48, "step": 10686, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.953192 }, { "epoch": 0.34717214046714095, "grad_norm": 0.41189077496528625, "learning_rate": 7.774025752315655e-06, "loss": 0.03340790420770645, "memory(GiB)": 21.48, "step": 10687, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.953205 }, { "epoch": 0.34720462592989637, "grad_norm": 0.32232585549354553, "learning_rate": 7.773578835640614e-06, "loss": 0.020992940291762352, "memory(GiB)": 21.48, "step": 10688, "token_acc": 0.98828125, "train_speed(iter/s)": 0.953219 }, { "epoch": 0.3472371113926518, "grad_norm": 0.32864469289779663, "learning_rate": 7.773131886954784e-06, "loss": 0.025442561134696007, "memory(GiB)": 21.48, "step": 10689, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.953234 }, { "epoch": 0.3472695968554072, "grad_norm": 0.35422322154045105, "learning_rate": 7.772684906263319e-06, "loss": 0.026408568024635315, "memory(GiB)": 21.48, "step": 10690, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.953249 }, { "epoch": 0.3473020823181626, "grad_norm": 0.43738844990730286, "learning_rate": 7.77223789357138e-06, "loss": 0.03208281844854355, "memory(GiB)": 21.48, "step": 10691, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953264 }, { "epoch": 0.34733456778091804, "grad_norm": 0.3411817252635956, "learning_rate": 7.771790848884126e-06, "loss": 0.021166112273931503, "memory(GiB)": 21.48, "step": 10692, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953279 }, { "epoch": 0.34736705324367345, "grad_norm": 0.2888248860836029, "learning_rate": 7.771343772206716e-06, "loss": 0.02070610597729683, "memory(GiB)": 21.48, "step": 10693, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.953294 }, { "epoch": 0.34739953870642887, "grad_norm": 0.46242108941078186, "learning_rate": 7.770896663544311e-06, "loss": 0.032546304166316986, "memory(GiB)": 21.48, "step": 10694, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.953307 }, { "epoch": 0.3474320241691843, "grad_norm": 0.4457707107067108, "learning_rate": 7.77044952290207e-06, "loss": 0.028181787580251694, "memory(GiB)": 21.48, "step": 10695, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.953322 }, { "epoch": 0.3474645096319397, "grad_norm": 0.4130929410457611, "learning_rate": 7.770002350285153e-06, "loss": 0.034415196627378464, "memory(GiB)": 21.48, "step": 10696, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.953337 }, { "epoch": 0.3474969950946951, "grad_norm": 0.3820735216140747, "learning_rate": 7.769555145698725e-06, "loss": 0.023520898073911667, "memory(GiB)": 21.48, "step": 10697, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.95335 }, { "epoch": 0.34752948055745053, "grad_norm": 0.8718360662460327, "learning_rate": 7.76910790914794e-06, "loss": 0.025874491780996323, "memory(GiB)": 21.48, "step": 10698, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953362 }, { "epoch": 0.34756196602020595, "grad_norm": 0.4810046851634979, "learning_rate": 7.768660640637968e-06, "loss": 0.03486199676990509, "memory(GiB)": 21.48, "step": 10699, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.953361 }, { "epoch": 0.34759445148296136, "grad_norm": 0.5286234617233276, "learning_rate": 7.768213340173966e-06, "loss": 0.02844792790710926, "memory(GiB)": 21.48, "step": 10700, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953376 }, { "epoch": 0.3476269369457168, "grad_norm": 0.524597704410553, "learning_rate": 7.767766007761098e-06, "loss": 0.03847348690032959, "memory(GiB)": 21.48, "step": 10701, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953391 }, { "epoch": 0.3476594224084722, "grad_norm": 0.5236040949821472, "learning_rate": 7.767318643404524e-06, "loss": 0.030680641531944275, "memory(GiB)": 21.48, "step": 10702, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.953407 }, { "epoch": 0.3476919078712276, "grad_norm": 0.2532540559768677, "learning_rate": 7.766871247109412e-06, "loss": 0.01381718274205923, "memory(GiB)": 21.48, "step": 10703, "token_acc": 1.0, "train_speed(iter/s)": 0.953423 }, { "epoch": 0.34772439333398303, "grad_norm": 0.4895780384540558, "learning_rate": 7.766423818880923e-06, "loss": 0.031339921057224274, "memory(GiB)": 21.48, "step": 10704, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953437 }, { "epoch": 0.34775687879673844, "grad_norm": 0.42420119047164917, "learning_rate": 7.76597635872422e-06, "loss": 0.029463009908795357, "memory(GiB)": 21.48, "step": 10705, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.953451 }, { "epoch": 0.34778936425949386, "grad_norm": 0.3485671877861023, "learning_rate": 7.76552886664447e-06, "loss": 0.02481923997402191, "memory(GiB)": 21.48, "step": 10706, "token_acc": 1.0, "train_speed(iter/s)": 0.953466 }, { "epoch": 0.3478218497222493, "grad_norm": 0.4767734408378601, "learning_rate": 7.765081342646832e-06, "loss": 0.024177756160497665, "memory(GiB)": 21.48, "step": 10707, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953481 }, { "epoch": 0.3478543351850047, "grad_norm": 0.7085104584693909, "learning_rate": 7.76463378673648e-06, "loss": 0.028536513447761536, "memory(GiB)": 21.48, "step": 10708, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.953493 }, { "epoch": 0.3478868206477601, "grad_norm": 0.42800337076187134, "learning_rate": 7.76418619891857e-06, "loss": 0.03349890559911728, "memory(GiB)": 21.48, "step": 10709, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953508 }, { "epoch": 0.3479193061105155, "grad_norm": 0.42733773589134216, "learning_rate": 7.763738579198274e-06, "loss": 0.02600822225213051, "memory(GiB)": 21.48, "step": 10710, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953522 }, { "epoch": 0.34795179157327094, "grad_norm": 0.47711846232414246, "learning_rate": 7.763290927580758e-06, "loss": 0.027306046336889267, "memory(GiB)": 21.48, "step": 10711, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953538 }, { "epoch": 0.34798427703602636, "grad_norm": 0.5242344737052917, "learning_rate": 7.762843244071184e-06, "loss": 0.036518394947052, "memory(GiB)": 21.48, "step": 10712, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.953553 }, { "epoch": 0.3480167624987818, "grad_norm": 0.48584285378456116, "learning_rate": 7.762395528674721e-06, "loss": 0.024456091225147247, "memory(GiB)": 21.48, "step": 10713, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953569 }, { "epoch": 0.3480492479615372, "grad_norm": 0.521788477897644, "learning_rate": 7.76194778139654e-06, "loss": 0.026707682758569717, "memory(GiB)": 21.48, "step": 10714, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.953582 }, { "epoch": 0.3480817334242926, "grad_norm": 0.46289947628974915, "learning_rate": 7.761500002241805e-06, "loss": 0.02596474625170231, "memory(GiB)": 21.48, "step": 10715, "token_acc": 1.0, "train_speed(iter/s)": 0.953592 }, { "epoch": 0.348114218887048, "grad_norm": 0.47585031390190125, "learning_rate": 7.761052191215683e-06, "loss": 0.02668006345629692, "memory(GiB)": 21.48, "step": 10716, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953605 }, { "epoch": 0.34814670434980344, "grad_norm": 0.6544293165206909, "learning_rate": 7.760604348323345e-06, "loss": 0.03358251973986626, "memory(GiB)": 21.48, "step": 10717, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95362 }, { "epoch": 0.34817918981255885, "grad_norm": 0.36000409722328186, "learning_rate": 7.760156473569958e-06, "loss": 0.026282500475645065, "memory(GiB)": 21.48, "step": 10718, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953634 }, { "epoch": 0.34821167527531427, "grad_norm": 0.5888490676879883, "learning_rate": 7.759708566960692e-06, "loss": 0.034813784062862396, "memory(GiB)": 21.48, "step": 10719, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95365 }, { "epoch": 0.34824416073806974, "grad_norm": 0.5448222756385803, "learning_rate": 7.759260628500715e-06, "loss": 0.029320448637008667, "memory(GiB)": 21.48, "step": 10720, "token_acc": 1.0, "train_speed(iter/s)": 0.953667 }, { "epoch": 0.34827664620082516, "grad_norm": 0.3918994963169098, "learning_rate": 7.758812658195197e-06, "loss": 0.026084911078214645, "memory(GiB)": 21.48, "step": 10721, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953685 }, { "epoch": 0.3483091316635806, "grad_norm": 0.35129135847091675, "learning_rate": 7.758364656049312e-06, "loss": 0.03261588513851166, "memory(GiB)": 21.48, "step": 10722, "token_acc": 1.0, "train_speed(iter/s)": 0.953704 }, { "epoch": 0.348341617126336, "grad_norm": 0.4837682545185089, "learning_rate": 7.757916622068225e-06, "loss": 0.028169948607683182, "memory(GiB)": 21.48, "step": 10723, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.953722 }, { "epoch": 0.3483741025890914, "grad_norm": 0.5360048413276672, "learning_rate": 7.757468556257108e-06, "loss": 0.034880731254816055, "memory(GiB)": 21.48, "step": 10724, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.95374 }, { "epoch": 0.3484065880518468, "grad_norm": 0.5201460123062134, "learning_rate": 7.757020458621136e-06, "loss": 0.032729893922805786, "memory(GiB)": 21.48, "step": 10725, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953759 }, { "epoch": 0.34843907351460224, "grad_norm": 0.5537164807319641, "learning_rate": 7.756572329165477e-06, "loss": 0.03221654146909714, "memory(GiB)": 21.48, "step": 10726, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953776 }, { "epoch": 0.34847155897735765, "grad_norm": 0.3886707127094269, "learning_rate": 7.756124167895306e-06, "loss": 0.028832346200942993, "memory(GiB)": 21.48, "step": 10727, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.953794 }, { "epoch": 0.34850404444011307, "grad_norm": 0.3303893506526947, "learning_rate": 7.755675974815793e-06, "loss": 0.02506772056221962, "memory(GiB)": 21.48, "step": 10728, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.953814 }, { "epoch": 0.3485365299028685, "grad_norm": 0.5139846205711365, "learning_rate": 7.755227749932113e-06, "loss": 0.03338482603430748, "memory(GiB)": 21.48, "step": 10729, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.953832 }, { "epoch": 0.3485690153656239, "grad_norm": 0.3603745996952057, "learning_rate": 7.754779493249435e-06, "loss": 0.032827988266944885, "memory(GiB)": 21.48, "step": 10730, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.953851 }, { "epoch": 0.3486015008283793, "grad_norm": 0.37004536390304565, "learning_rate": 7.754331204772935e-06, "loss": 0.022532518953084946, "memory(GiB)": 21.48, "step": 10731, "token_acc": 1.0, "train_speed(iter/s)": 0.95387 }, { "epoch": 0.34863398629113473, "grad_norm": 0.43115198612213135, "learning_rate": 7.753882884507788e-06, "loss": 0.03222924470901489, "memory(GiB)": 21.48, "step": 10732, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.953886 }, { "epoch": 0.34866647175389015, "grad_norm": 0.3598038852214813, "learning_rate": 7.753434532459167e-06, "loss": 0.033309757709503174, "memory(GiB)": 21.48, "step": 10733, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.953904 }, { "epoch": 0.34869895721664557, "grad_norm": 0.3546343147754669, "learning_rate": 7.752986148632246e-06, "loss": 0.027799837291240692, "memory(GiB)": 21.48, "step": 10734, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953922 }, { "epoch": 0.348731442679401, "grad_norm": 0.5671650171279907, "learning_rate": 7.752537733032203e-06, "loss": 0.030355816707015038, "memory(GiB)": 21.48, "step": 10735, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.953942 }, { "epoch": 0.3487639281421564, "grad_norm": 0.4482945203781128, "learning_rate": 7.752089285664208e-06, "loss": 0.03089597448706627, "memory(GiB)": 21.48, "step": 10736, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953961 }, { "epoch": 0.3487964136049118, "grad_norm": 0.35966283082962036, "learning_rate": 7.751640806533442e-06, "loss": 0.02592906728386879, "memory(GiB)": 21.48, "step": 10737, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953975 }, { "epoch": 0.34882889906766723, "grad_norm": 0.5001254081726074, "learning_rate": 7.751192295645078e-06, "loss": 0.024357818067073822, "memory(GiB)": 21.48, "step": 10738, "token_acc": 0.9901315789473685, "train_speed(iter/s)": 0.953989 }, { "epoch": 0.34886138453042265, "grad_norm": 0.44877350330352783, "learning_rate": 7.750743753004294e-06, "loss": 0.026983458548784256, "memory(GiB)": 21.48, "step": 10739, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.954003 }, { "epoch": 0.34889386999317806, "grad_norm": 0.3603377342224121, "learning_rate": 7.750295178616262e-06, "loss": 0.034009844064712524, "memory(GiB)": 21.48, "step": 10740, "token_acc": 1.0, "train_speed(iter/s)": 0.954017 }, { "epoch": 0.3489263554559335, "grad_norm": 0.582271158695221, "learning_rate": 7.749846572486166e-06, "loss": 0.04765985906124115, "memory(GiB)": 21.48, "step": 10741, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.954032 }, { "epoch": 0.3489588409186889, "grad_norm": 0.4045930504798889, "learning_rate": 7.749397934619179e-06, "loss": 0.02770180255174637, "memory(GiB)": 21.48, "step": 10742, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.954047 }, { "epoch": 0.3489913263814443, "grad_norm": 0.37371671199798584, "learning_rate": 7.748949265020482e-06, "loss": 0.017150606960058212, "memory(GiB)": 21.48, "step": 10743, "token_acc": 1.0, "train_speed(iter/s)": 0.95406 }, { "epoch": 0.3490238118441997, "grad_norm": 0.48955291509628296, "learning_rate": 7.748500563695251e-06, "loss": 0.02850412018597126, "memory(GiB)": 21.48, "step": 10744, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.954073 }, { "epoch": 0.34905629730695514, "grad_norm": 0.5192198753356934, "learning_rate": 7.748051830648666e-06, "loss": 0.04128376394510269, "memory(GiB)": 21.48, "step": 10745, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.954087 }, { "epoch": 0.34908878276971056, "grad_norm": 0.7277706861495972, "learning_rate": 7.747603065885904e-06, "loss": 0.01882404088973999, "memory(GiB)": 21.48, "step": 10746, "token_acc": 1.0, "train_speed(iter/s)": 0.954101 }, { "epoch": 0.349121268232466, "grad_norm": 0.44489404559135437, "learning_rate": 7.747154269412148e-06, "loss": 0.022790929302573204, "memory(GiB)": 21.48, "step": 10747, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.954117 }, { "epoch": 0.3491537536952214, "grad_norm": 0.4841320514678955, "learning_rate": 7.746705441232571e-06, "loss": 0.028538178652524948, "memory(GiB)": 21.48, "step": 10748, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.954132 }, { "epoch": 0.3491862391579768, "grad_norm": 0.8134359121322632, "learning_rate": 7.746256581352361e-06, "loss": 0.028290022164583206, "memory(GiB)": 21.48, "step": 10749, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.954144 }, { "epoch": 0.3492187246207322, "grad_norm": 0.4705566167831421, "learning_rate": 7.745807689776692e-06, "loss": 0.029754184186458588, "memory(GiB)": 21.48, "step": 10750, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954157 }, { "epoch": 0.34925121008348764, "grad_norm": 0.6625228524208069, "learning_rate": 7.74535876651075e-06, "loss": 0.034255292266607285, "memory(GiB)": 21.48, "step": 10751, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.954172 }, { "epoch": 0.34928369554624306, "grad_norm": 0.527618944644928, "learning_rate": 7.744909811559713e-06, "loss": 0.03543412685394287, "memory(GiB)": 21.48, "step": 10752, "token_acc": 1.0, "train_speed(iter/s)": 0.954187 }, { "epoch": 0.34931618100899847, "grad_norm": 0.3802686333656311, "learning_rate": 7.744460824928763e-06, "loss": 0.03206997737288475, "memory(GiB)": 21.48, "step": 10753, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954201 }, { "epoch": 0.3493486664717539, "grad_norm": 0.32274219393730164, "learning_rate": 7.744011806623083e-06, "loss": 0.020563172176480293, "memory(GiB)": 21.48, "step": 10754, "token_acc": 1.0, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.3493811519345093, "grad_norm": 0.39588698744773865, "learning_rate": 7.743562756647853e-06, "loss": 0.028244705870747566, "memory(GiB)": 21.48, "step": 10755, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.95423 }, { "epoch": 0.3494136373972647, "grad_norm": 0.3779073655605316, "learning_rate": 7.743113675008258e-06, "loss": 0.026283659040927887, "memory(GiB)": 21.48, "step": 10756, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954245 }, { "epoch": 0.34944612286002014, "grad_norm": 0.41233018040657043, "learning_rate": 7.74266456170948e-06, "loss": 0.03272264450788498, "memory(GiB)": 21.48, "step": 10757, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.954263 }, { "epoch": 0.34947860832277555, "grad_norm": 0.4473940432071686, "learning_rate": 7.742215416756703e-06, "loss": 0.025696884840726852, "memory(GiB)": 21.48, "step": 10758, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.954278 }, { "epoch": 0.34951109378553097, "grad_norm": 0.344458669424057, "learning_rate": 7.74176624015511e-06, "loss": 0.025106290355324745, "memory(GiB)": 21.48, "step": 10759, "token_acc": 1.0, "train_speed(iter/s)": 0.954293 }, { "epoch": 0.3495435792482864, "grad_norm": 0.33118218183517456, "learning_rate": 7.741317031909886e-06, "loss": 0.026709601283073425, "memory(GiB)": 21.48, "step": 10760, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.954308 }, { "epoch": 0.3495760647110418, "grad_norm": 0.44912445545196533, "learning_rate": 7.740867792026211e-06, "loss": 0.030616503208875656, "memory(GiB)": 21.48, "step": 10761, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.954323 }, { "epoch": 0.3496085501737972, "grad_norm": 0.402749240398407, "learning_rate": 7.740418520509278e-06, "loss": 0.02804817631840706, "memory(GiB)": 21.48, "step": 10762, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954338 }, { "epoch": 0.34964103563655263, "grad_norm": 0.30075156688690186, "learning_rate": 7.739969217364267e-06, "loss": 0.024218428879976273, "memory(GiB)": 21.48, "step": 10763, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.954353 }, { "epoch": 0.34967352109930805, "grad_norm": 0.4026164412498474, "learning_rate": 7.739519882596362e-06, "loss": 0.03700777143239975, "memory(GiB)": 21.48, "step": 10764, "token_acc": 0.9644444444444444, "train_speed(iter/s)": 0.954368 }, { "epoch": 0.34970600656206347, "grad_norm": 0.5953713059425354, "learning_rate": 7.739070516210753e-06, "loss": 0.024134550243616104, "memory(GiB)": 21.48, "step": 10765, "token_acc": 1.0, "train_speed(iter/s)": 0.954384 }, { "epoch": 0.3497384920248189, "grad_norm": 0.4250725209712982, "learning_rate": 7.738621118212626e-06, "loss": 0.03556482493877411, "memory(GiB)": 21.48, "step": 10766, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.954398 }, { "epoch": 0.3497709774875743, "grad_norm": 0.2889023721218109, "learning_rate": 7.738171688607163e-06, "loss": 0.023382840678095818, "memory(GiB)": 21.48, "step": 10767, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954412 }, { "epoch": 0.3498034629503297, "grad_norm": 0.30140960216522217, "learning_rate": 7.737722227399558e-06, "loss": 0.026019079610705376, "memory(GiB)": 21.48, "step": 10768, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.34983594841308513, "grad_norm": 0.4692995846271515, "learning_rate": 7.737272734594992e-06, "loss": 0.026417601853609085, "memory(GiB)": 21.48, "step": 10769, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954439 }, { "epoch": 0.34986843387584055, "grad_norm": 0.3529652953147888, "learning_rate": 7.736823210198656e-06, "loss": 0.018507931381464005, "memory(GiB)": 21.48, "step": 10770, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954453 }, { "epoch": 0.34990091933859596, "grad_norm": 0.5143450498580933, "learning_rate": 7.736373654215735e-06, "loss": 0.03142939507961273, "memory(GiB)": 21.48, "step": 10771, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.954468 }, { "epoch": 0.3499334048013514, "grad_norm": 0.3979962170124054, "learning_rate": 7.735924066651424e-06, "loss": 0.027232376858592033, "memory(GiB)": 21.48, "step": 10772, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954483 }, { "epoch": 0.3499658902641068, "grad_norm": 0.3875163495540619, "learning_rate": 7.735474447510904e-06, "loss": 0.027126777917146683, "memory(GiB)": 21.48, "step": 10773, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.954497 }, { "epoch": 0.3499983757268622, "grad_norm": 0.4426608085632324, "learning_rate": 7.73502479679937e-06, "loss": 0.03016538918018341, "memory(GiB)": 21.48, "step": 10774, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.954511 }, { "epoch": 0.3500308611896176, "grad_norm": 0.530219316482544, "learning_rate": 7.73457511452201e-06, "loss": 0.030661841854453087, "memory(GiB)": 21.48, "step": 10775, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954526 }, { "epoch": 0.35006334665237304, "grad_norm": 0.3934277892112732, "learning_rate": 7.734125400684012e-06, "loss": 0.02245938777923584, "memory(GiB)": 21.48, "step": 10776, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.954539 }, { "epoch": 0.35009583211512846, "grad_norm": 0.3990241587162018, "learning_rate": 7.733675655290568e-06, "loss": 0.029845358803868294, "memory(GiB)": 21.48, "step": 10777, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.954552 }, { "epoch": 0.3501283175778839, "grad_norm": 0.7988870143890381, "learning_rate": 7.73322587834687e-06, "loss": 0.02882225066423416, "memory(GiB)": 21.48, "step": 10778, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954567 }, { "epoch": 0.3501608030406393, "grad_norm": 0.3962644040584564, "learning_rate": 7.732776069858108e-06, "loss": 0.02452327311038971, "memory(GiB)": 21.48, "step": 10779, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.954581 }, { "epoch": 0.3501932885033947, "grad_norm": 0.5772324204444885, "learning_rate": 7.73232622982947e-06, "loss": 0.03555051609873772, "memory(GiB)": 21.48, "step": 10780, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.3502257739661501, "grad_norm": 0.5144574642181396, "learning_rate": 7.731876358266153e-06, "loss": 0.0317254513502121, "memory(GiB)": 21.48, "step": 10781, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.954613 }, { "epoch": 0.35025825942890554, "grad_norm": 0.39978957176208496, "learning_rate": 7.731426455173345e-06, "loss": 0.02934623882174492, "memory(GiB)": 21.48, "step": 10782, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.954632 }, { "epoch": 0.35029074489166095, "grad_norm": 0.4346611797809601, "learning_rate": 7.730976520556241e-06, "loss": 0.02770286798477173, "memory(GiB)": 21.48, "step": 10783, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.95465 }, { "epoch": 0.3503232303544164, "grad_norm": 0.41743144392967224, "learning_rate": 7.730526554420033e-06, "loss": 0.03463156893849373, "memory(GiB)": 21.48, "step": 10784, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954668 }, { "epoch": 0.35035571581717184, "grad_norm": 0.49757811427116394, "learning_rate": 7.730076556769915e-06, "loss": 0.030817706137895584, "memory(GiB)": 21.48, "step": 10785, "token_acc": 1.0, "train_speed(iter/s)": 0.954687 }, { "epoch": 0.35038820127992726, "grad_norm": 0.33253970742225647, "learning_rate": 7.729626527611079e-06, "loss": 0.01752946339547634, "memory(GiB)": 21.48, "step": 10786, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954705 }, { "epoch": 0.3504206867426827, "grad_norm": 0.49203717708587646, "learning_rate": 7.729176466948718e-06, "loss": 0.036042921245098114, "memory(GiB)": 21.48, "step": 10787, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954724 }, { "epoch": 0.3504531722054381, "grad_norm": 0.7290068864822388, "learning_rate": 7.728726374788029e-06, "loss": 0.04188218712806702, "memory(GiB)": 21.48, "step": 10788, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954743 }, { "epoch": 0.3504856576681935, "grad_norm": 0.5543737411499023, "learning_rate": 7.728276251134208e-06, "loss": 0.028296422213315964, "memory(GiB)": 21.48, "step": 10789, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.3505181431309489, "grad_norm": 0.363384872674942, "learning_rate": 7.727826095992445e-06, "loss": 0.02881113439798355, "memory(GiB)": 21.48, "step": 10790, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954781 }, { "epoch": 0.35055062859370434, "grad_norm": 0.2621569037437439, "learning_rate": 7.72737590936794e-06, "loss": 0.020308762788772583, "memory(GiB)": 21.48, "step": 10791, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.954799 }, { "epoch": 0.35058311405645975, "grad_norm": 0.5193933248519897, "learning_rate": 7.726925691265886e-06, "loss": 0.02706943079829216, "memory(GiB)": 21.48, "step": 10792, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954817 }, { "epoch": 0.35061559951921517, "grad_norm": 0.8761346936225891, "learning_rate": 7.72647544169148e-06, "loss": 0.03286323696374893, "memory(GiB)": 21.48, "step": 10793, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.954836 }, { "epoch": 0.3506480849819706, "grad_norm": 0.43791115283966064, "learning_rate": 7.726025160649917e-06, "loss": 0.027157319709658623, "memory(GiB)": 21.48, "step": 10794, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954855 }, { "epoch": 0.350680570444726, "grad_norm": 0.585879921913147, "learning_rate": 7.725574848146399e-06, "loss": 0.032897546887397766, "memory(GiB)": 21.48, "step": 10795, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954873 }, { "epoch": 0.3507130559074814, "grad_norm": 0.3497033715248108, "learning_rate": 7.725124504186117e-06, "loss": 0.025185134261846542, "memory(GiB)": 21.48, "step": 10796, "token_acc": 1.0, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.35074554137023684, "grad_norm": 0.39260244369506836, "learning_rate": 7.724674128774273e-06, "loss": 0.03053528256714344, "memory(GiB)": 21.48, "step": 10797, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.954903 }, { "epoch": 0.35077802683299225, "grad_norm": 0.3976282477378845, "learning_rate": 7.72422372191606e-06, "loss": 0.023989584296941757, "memory(GiB)": 21.48, "step": 10798, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.35081051229574767, "grad_norm": 0.6291771531105042, "learning_rate": 7.723773283616682e-06, "loss": 0.040519244968891144, "memory(GiB)": 21.48, "step": 10799, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.3508429977585031, "grad_norm": 0.44940757751464844, "learning_rate": 7.723322813881333e-06, "loss": 0.022546999156475067, "memory(GiB)": 21.48, "step": 10800, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.954945 }, { "epoch": 0.3508754832212585, "grad_norm": 0.4577507972717285, "learning_rate": 7.722872312715215e-06, "loss": 0.03882227838039398, "memory(GiB)": 21.48, "step": 10801, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.954955 }, { "epoch": 0.3509079686840139, "grad_norm": 0.5287162065505981, "learning_rate": 7.722421780123526e-06, "loss": 0.025009339675307274, "memory(GiB)": 21.48, "step": 10802, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.35094045414676933, "grad_norm": 0.42424312233924866, "learning_rate": 7.721971216111465e-06, "loss": 0.024268876761198044, "memory(GiB)": 21.48, "step": 10803, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954983 }, { "epoch": 0.35097293960952475, "grad_norm": 0.368915319442749, "learning_rate": 7.721520620684234e-06, "loss": 0.025461038574576378, "memory(GiB)": 21.48, "step": 10804, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954998 }, { "epoch": 0.35100542507228016, "grad_norm": 0.5018189549446106, "learning_rate": 7.721069993847035e-06, "loss": 0.03210245817899704, "memory(GiB)": 21.48, "step": 10805, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955011 }, { "epoch": 0.3510379105350356, "grad_norm": 0.6042431592941284, "learning_rate": 7.720619335605064e-06, "loss": 0.0390624925494194, "memory(GiB)": 21.48, "step": 10806, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.955026 }, { "epoch": 0.351070395997791, "grad_norm": 0.3539992868900299, "learning_rate": 7.720168645963526e-06, "loss": 0.02490106225013733, "memory(GiB)": 21.48, "step": 10807, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955041 }, { "epoch": 0.3511028814605464, "grad_norm": 0.441827654838562, "learning_rate": 7.719717924927622e-06, "loss": 0.029699038714170456, "memory(GiB)": 21.48, "step": 10808, "token_acc": 0.99, "train_speed(iter/s)": 0.955052 }, { "epoch": 0.35113536692330183, "grad_norm": 0.48450803756713867, "learning_rate": 7.719267172502551e-06, "loss": 0.02364898845553398, "memory(GiB)": 21.48, "step": 10809, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955065 }, { "epoch": 0.35116785238605724, "grad_norm": 0.41539332270622253, "learning_rate": 7.718816388693518e-06, "loss": 0.026473883539438248, "memory(GiB)": 21.48, "step": 10810, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.35120033784881266, "grad_norm": 0.44773077964782715, "learning_rate": 7.718365573505725e-06, "loss": 0.026661623269319534, "memory(GiB)": 21.48, "step": 10811, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955093 }, { "epoch": 0.3512328233115681, "grad_norm": 0.344108521938324, "learning_rate": 7.717914726944378e-06, "loss": 0.017845170572400093, "memory(GiB)": 21.48, "step": 10812, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.3512653087743235, "grad_norm": 0.45527881383895874, "learning_rate": 7.717463849014675e-06, "loss": 0.029279150068759918, "memory(GiB)": 21.48, "step": 10813, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.3512977942370789, "grad_norm": 0.5383312702178955, "learning_rate": 7.717012939721822e-06, "loss": 0.034552399069070816, "memory(GiB)": 21.48, "step": 10814, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.955133 }, { "epoch": 0.3513302796998343, "grad_norm": 0.3302672803401947, "learning_rate": 7.716561999071024e-06, "loss": 0.02126610279083252, "memory(GiB)": 21.48, "step": 10815, "token_acc": 1.0, "train_speed(iter/s)": 0.955147 }, { "epoch": 0.35136276516258974, "grad_norm": 0.3808920085430145, "learning_rate": 7.716111027067484e-06, "loss": 0.02581608295440674, "memory(GiB)": 21.48, "step": 10816, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.955166 }, { "epoch": 0.35139525062534516, "grad_norm": 0.3657219111919403, "learning_rate": 7.715660023716408e-06, "loss": 0.032603342086076736, "memory(GiB)": 21.48, "step": 10817, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.3514277360881006, "grad_norm": 1.1420789957046509, "learning_rate": 7.715208989023002e-06, "loss": 0.040885068476200104, "memory(GiB)": 21.48, "step": 10818, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955204 }, { "epoch": 0.351460221550856, "grad_norm": 0.2824156582355499, "learning_rate": 7.714757922992468e-06, "loss": 0.027082402259111404, "memory(GiB)": 21.48, "step": 10819, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.3514927070136114, "grad_norm": 0.6249030232429504, "learning_rate": 7.714306825630015e-06, "loss": 0.05094388872385025, "memory(GiB)": 21.48, "step": 10820, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955236 }, { "epoch": 0.3515251924763668, "grad_norm": 0.4299030601978302, "learning_rate": 7.713855696940851e-06, "loss": 0.026054752990603447, "memory(GiB)": 21.48, "step": 10821, "token_acc": 0.992, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.35155767793912224, "grad_norm": 0.36540162563323975, "learning_rate": 7.713404536930178e-06, "loss": 0.03260550647974014, "memory(GiB)": 21.48, "step": 10822, "token_acc": 0.9771241830065359, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.35159016340187765, "grad_norm": 0.2808256149291992, "learning_rate": 7.712953345603205e-06, "loss": 0.023535214364528656, "memory(GiB)": 21.48, "step": 10823, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.35162264886463307, "grad_norm": 0.3359907567501068, "learning_rate": 7.71250212296514e-06, "loss": 0.02583441324532032, "memory(GiB)": 21.48, "step": 10824, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.3516551343273885, "grad_norm": 0.9029760360717773, "learning_rate": 7.712050869021191e-06, "loss": 0.03453132510185242, "memory(GiB)": 21.48, "step": 10825, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955307 }, { "epoch": 0.3516876197901439, "grad_norm": 0.6444351673126221, "learning_rate": 7.711599583776564e-06, "loss": 0.029391523450613022, "memory(GiB)": 21.48, "step": 10826, "token_acc": 1.0, "train_speed(iter/s)": 0.955322 }, { "epoch": 0.3517201052528993, "grad_norm": 0.28058120608329773, "learning_rate": 7.711148267236468e-06, "loss": 0.022143220528960228, "memory(GiB)": 21.48, "step": 10827, "token_acc": 1.0, "train_speed(iter/s)": 0.955336 }, { "epoch": 0.35175259071565473, "grad_norm": 0.46334242820739746, "learning_rate": 7.710696919406115e-06, "loss": 0.02525767683982849, "memory(GiB)": 21.48, "step": 10828, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.955351 }, { "epoch": 0.35178507617841015, "grad_norm": 0.7818986773490906, "learning_rate": 7.710245540290708e-06, "loss": 0.020596368238329887, "memory(GiB)": 21.48, "step": 10829, "token_acc": 1.0, "train_speed(iter/s)": 0.955366 }, { "epoch": 0.35181756164116557, "grad_norm": 0.4950014352798462, "learning_rate": 7.709794129895463e-06, "loss": 0.03036351129412651, "memory(GiB)": 21.48, "step": 10830, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.95538 }, { "epoch": 0.351850047103921, "grad_norm": 0.48461171984672546, "learning_rate": 7.709342688225587e-06, "loss": 0.02975972555577755, "memory(GiB)": 21.48, "step": 10831, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955393 }, { "epoch": 0.3518825325666764, "grad_norm": 0.35336264967918396, "learning_rate": 7.708891215286289e-06, "loss": 0.025773655623197556, "memory(GiB)": 21.48, "step": 10832, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.3519150180294318, "grad_norm": 0.49627920985221863, "learning_rate": 7.708439711082781e-06, "loss": 0.03485426306724548, "memory(GiB)": 21.48, "step": 10833, "token_acc": 0.992, "train_speed(iter/s)": 0.955422 }, { "epoch": 0.35194750349218723, "grad_norm": 0.3273678421974182, "learning_rate": 7.707988175620273e-06, "loss": 0.025577060878276825, "memory(GiB)": 21.48, "step": 10834, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.955437 }, { "epoch": 0.35197998895494265, "grad_norm": 0.6685433387756348, "learning_rate": 7.707536608903978e-06, "loss": 0.03161657601594925, "memory(GiB)": 21.48, "step": 10835, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.35201247441769806, "grad_norm": 1.159238338470459, "learning_rate": 7.707085010939109e-06, "loss": 0.024248024448752403, "memory(GiB)": 21.48, "step": 10836, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955466 }, { "epoch": 0.3520449598804535, "grad_norm": 0.28737854957580566, "learning_rate": 7.706633381730873e-06, "loss": 0.02347070351243019, "memory(GiB)": 21.48, "step": 10837, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95548 }, { "epoch": 0.3520774453432089, "grad_norm": 0.5018813014030457, "learning_rate": 7.706181721284486e-06, "loss": 0.03914584219455719, "memory(GiB)": 21.48, "step": 10838, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.3521099308059643, "grad_norm": 1.3362990617752075, "learning_rate": 7.70573002960516e-06, "loss": 0.03403792902827263, "memory(GiB)": 21.48, "step": 10839, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.3521424162687197, "grad_norm": 0.43810510635375977, "learning_rate": 7.705278306698108e-06, "loss": 0.023981675505638123, "memory(GiB)": 21.48, "step": 10840, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955522 }, { "epoch": 0.35217490173147514, "grad_norm": 0.34450283646583557, "learning_rate": 7.704826552568546e-06, "loss": 0.022079775109887123, "memory(GiB)": 21.48, "step": 10841, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.35220738719423056, "grad_norm": 0.38716191053390503, "learning_rate": 7.704374767221682e-06, "loss": 0.02731982246041298, "memory(GiB)": 21.48, "step": 10842, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95555 }, { "epoch": 0.352239872656986, "grad_norm": 0.4673691391944885, "learning_rate": 7.703922950662735e-06, "loss": 0.0248948335647583, "memory(GiB)": 21.48, "step": 10843, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955567 }, { "epoch": 0.3522723581197414, "grad_norm": 2.1322362422943115, "learning_rate": 7.703471102896918e-06, "loss": 0.048176683485507965, "memory(GiB)": 21.48, "step": 10844, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.3523048435824968, "grad_norm": 0.3957056999206543, "learning_rate": 7.703019223929447e-06, "loss": 0.029732273891568184, "memory(GiB)": 21.48, "step": 10845, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.955604 }, { "epoch": 0.3523373290452522, "grad_norm": 0.6094580292701721, "learning_rate": 7.702567313765537e-06, "loss": 0.03722139075398445, "memory(GiB)": 21.48, "step": 10846, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.35236981450800764, "grad_norm": 0.34242498874664307, "learning_rate": 7.702115372410402e-06, "loss": 0.026091523468494415, "memory(GiB)": 21.48, "step": 10847, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.3524022999707631, "grad_norm": 0.41180017590522766, "learning_rate": 7.701663399869257e-06, "loss": 0.027984246611595154, "memory(GiB)": 21.48, "step": 10848, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.3524347854335185, "grad_norm": 0.4461733102798462, "learning_rate": 7.701211396147323e-06, "loss": 0.03033558651804924, "memory(GiB)": 21.48, "step": 10849, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.35246727089627394, "grad_norm": 0.4088067412376404, "learning_rate": 7.700759361249813e-06, "loss": 0.027805782854557037, "memory(GiB)": 21.48, "step": 10850, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.35249975635902936, "grad_norm": 0.33741524815559387, "learning_rate": 7.700307295181947e-06, "loss": 0.029550783336162567, "memory(GiB)": 21.48, "step": 10851, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955714 }, { "epoch": 0.3525322418217848, "grad_norm": 0.3551430106163025, "learning_rate": 7.69985519794894e-06, "loss": 0.01946127414703369, "memory(GiB)": 21.48, "step": 10852, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955733 }, { "epoch": 0.3525647272845402, "grad_norm": 0.4078314006328583, "learning_rate": 7.699403069556009e-06, "loss": 0.02349865809082985, "memory(GiB)": 21.48, "step": 10853, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95575 }, { "epoch": 0.3525972127472956, "grad_norm": 0.5047953128814697, "learning_rate": 7.698950910008375e-06, "loss": 0.02479289099574089, "memory(GiB)": 21.48, "step": 10854, "token_acc": 1.0, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.352629698210051, "grad_norm": 0.5189399123191833, "learning_rate": 7.698498719311255e-06, "loss": 0.030589820817112923, "memory(GiB)": 21.48, "step": 10855, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.35266218367280644, "grad_norm": 0.28440430760383606, "learning_rate": 7.698046497469869e-06, "loss": 0.019930893555283546, "memory(GiB)": 21.48, "step": 10856, "token_acc": 1.0, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.35269466913556186, "grad_norm": 0.3631044328212738, "learning_rate": 7.697594244489434e-06, "loss": 0.01787682995200157, "memory(GiB)": 21.48, "step": 10857, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.35272715459831727, "grad_norm": 0.341114342212677, "learning_rate": 7.697141960375171e-06, "loss": 0.026397472247481346, "memory(GiB)": 21.48, "step": 10858, "token_acc": 0.979933110367893, "train_speed(iter/s)": 0.955828 }, { "epoch": 0.3527596400610727, "grad_norm": 1.8985849618911743, "learning_rate": 7.6966896451323e-06, "loss": 0.042056821286678314, "memory(GiB)": 21.48, "step": 10859, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.3527921255238281, "grad_norm": 0.5082286596298218, "learning_rate": 7.696237298766043e-06, "loss": 0.03253413364291191, "memory(GiB)": 21.48, "step": 10860, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955854 }, { "epoch": 0.3528246109865835, "grad_norm": 0.5752450823783875, "learning_rate": 7.695784921281617e-06, "loss": 0.03647247701883316, "memory(GiB)": 21.48, "step": 10861, "token_acc": 0.9703703703703703, "train_speed(iter/s)": 0.955869 }, { "epoch": 0.35285709644933894, "grad_norm": 0.450374573469162, "learning_rate": 7.695332512684247e-06, "loss": 0.02790069207549095, "memory(GiB)": 21.48, "step": 10862, "token_acc": 1.0, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.35288958191209435, "grad_norm": 0.4809708595275879, "learning_rate": 7.69488007297915e-06, "loss": 0.032362230122089386, "memory(GiB)": 21.48, "step": 10863, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.35292206737484977, "grad_norm": 0.46797385811805725, "learning_rate": 7.694427602171549e-06, "loss": 0.032192766666412354, "memory(GiB)": 21.48, "step": 10864, "token_acc": 1.0, "train_speed(iter/s)": 0.95591 }, { "epoch": 0.3529545528376052, "grad_norm": 0.4754331409931183, "learning_rate": 7.693975100266669e-06, "loss": 0.019792238250374794, "memory(GiB)": 21.48, "step": 10865, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.3529870383003606, "grad_norm": 0.4019908010959625, "learning_rate": 7.69352256726973e-06, "loss": 0.027330107986927032, "memory(GiB)": 21.48, "step": 10866, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.955939 }, { "epoch": 0.353019523763116, "grad_norm": 0.3813822865486145, "learning_rate": 7.693070003185957e-06, "loss": 0.028157416731119156, "memory(GiB)": 21.48, "step": 10867, "token_acc": 0.97265625, "train_speed(iter/s)": 0.955951 }, { "epoch": 0.35305200922587143, "grad_norm": 0.7209815979003906, "learning_rate": 7.692617408020571e-06, "loss": 0.0279838927090168, "memory(GiB)": 21.48, "step": 10868, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.35308449468862685, "grad_norm": 0.5608509182929993, "learning_rate": 7.692164781778797e-06, "loss": 0.03472871705889702, "memory(GiB)": 21.48, "step": 10869, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955979 }, { "epoch": 0.35311698015138226, "grad_norm": 0.5099664926528931, "learning_rate": 7.691712124465856e-06, "loss": 0.04245544224977493, "memory(GiB)": 21.48, "step": 10870, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.95599 }, { "epoch": 0.3531494656141377, "grad_norm": 0.617923378944397, "learning_rate": 7.691259436086976e-06, "loss": 0.02847425825893879, "memory(GiB)": 21.48, "step": 10871, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956002 }, { "epoch": 0.3531819510768931, "grad_norm": 0.6387350559234619, "learning_rate": 7.690806716647381e-06, "loss": 0.03807520866394043, "memory(GiB)": 21.48, "step": 10872, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956016 }, { "epoch": 0.3532144365396485, "grad_norm": 0.40366530418395996, "learning_rate": 7.690353966152293e-06, "loss": 0.028602514415979385, "memory(GiB)": 21.48, "step": 10873, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.956032 }, { "epoch": 0.35324692200240393, "grad_norm": 0.44004321098327637, "learning_rate": 7.68990118460694e-06, "loss": 0.027293527498841286, "memory(GiB)": 21.48, "step": 10874, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.35327940746515935, "grad_norm": 0.4452245533466339, "learning_rate": 7.689448372016546e-06, "loss": 0.030756337568163872, "memory(GiB)": 21.48, "step": 10875, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956066 }, { "epoch": 0.35331189292791476, "grad_norm": 0.5151558518409729, "learning_rate": 7.68899552838634e-06, "loss": 0.02959917113184929, "memory(GiB)": 21.48, "step": 10876, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956085 }, { "epoch": 0.3533443783906702, "grad_norm": 0.35285061597824097, "learning_rate": 7.688542653721546e-06, "loss": 0.024856925010681152, "memory(GiB)": 21.48, "step": 10877, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.3533768638534256, "grad_norm": 0.3367255628108978, "learning_rate": 7.688089748027391e-06, "loss": 0.03592928498983383, "memory(GiB)": 21.48, "step": 10878, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.353409349316181, "grad_norm": 0.3145098090171814, "learning_rate": 7.687636811309103e-06, "loss": 0.020968075841665268, "memory(GiB)": 21.48, "step": 10879, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956139 }, { "epoch": 0.3534418347789364, "grad_norm": 0.2843942642211914, "learning_rate": 7.687183843571908e-06, "loss": 0.019206304103136063, "memory(GiB)": 21.48, "step": 10880, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.35347432024169184, "grad_norm": 0.37476491928100586, "learning_rate": 7.686730844821035e-06, "loss": 0.029393833130598068, "memory(GiB)": 21.48, "step": 10881, "token_acc": 0.984, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.35350680570444726, "grad_norm": 0.8494888544082642, "learning_rate": 7.686277815061714e-06, "loss": 0.02537551522254944, "memory(GiB)": 21.48, "step": 10882, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956189 }, { "epoch": 0.3535392911672027, "grad_norm": 0.6184162497520447, "learning_rate": 7.68582475429917e-06, "loss": 0.035026583820581436, "memory(GiB)": 21.48, "step": 10883, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.3535717766299581, "grad_norm": 0.42946717143058777, "learning_rate": 7.685371662538633e-06, "loss": 0.034541547298431396, "memory(GiB)": 21.48, "step": 10884, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.956218 }, { "epoch": 0.3536042620927135, "grad_norm": 0.5170892477035522, "learning_rate": 7.684918539785333e-06, "loss": 0.034164026379585266, "memory(GiB)": 21.48, "step": 10885, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956234 }, { "epoch": 0.3536367475554689, "grad_norm": 0.4666387736797333, "learning_rate": 7.6844653860445e-06, "loss": 0.02774379774928093, "memory(GiB)": 21.48, "step": 10886, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956249 }, { "epoch": 0.35366923301822434, "grad_norm": 0.23682767152786255, "learning_rate": 7.684012201321362e-06, "loss": 0.020668715238571167, "memory(GiB)": 21.48, "step": 10887, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.35370171848097975, "grad_norm": 0.4954715073108673, "learning_rate": 7.683558985621153e-06, "loss": 0.02795453555881977, "memory(GiB)": 21.48, "step": 10888, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956279 }, { "epoch": 0.35373420394373517, "grad_norm": 0.48149335384368896, "learning_rate": 7.683105738949102e-06, "loss": 0.032291486859321594, "memory(GiB)": 21.48, "step": 10889, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.956293 }, { "epoch": 0.3537666894064906, "grad_norm": 0.38130804896354675, "learning_rate": 7.682652461310439e-06, "loss": 0.024760577827692032, "memory(GiB)": 21.48, "step": 10890, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956307 }, { "epoch": 0.353799174869246, "grad_norm": 0.302087664604187, "learning_rate": 7.682199152710392e-06, "loss": 0.026963375508785248, "memory(GiB)": 21.48, "step": 10891, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.3538316603320014, "grad_norm": 0.43823835253715515, "learning_rate": 7.6817458131542e-06, "loss": 0.03319019824266434, "memory(GiB)": 21.48, "step": 10892, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.956335 }, { "epoch": 0.35386414579475683, "grad_norm": 0.6317775249481201, "learning_rate": 7.681292442647094e-06, "loss": 0.02583964914083481, "memory(GiB)": 21.48, "step": 10893, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.35389663125751225, "grad_norm": 0.43442338705062866, "learning_rate": 7.6808390411943e-06, "loss": 0.030935630202293396, "memory(GiB)": 21.48, "step": 10894, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.956364 }, { "epoch": 0.35392911672026767, "grad_norm": 0.459421306848526, "learning_rate": 7.680385608801057e-06, "loss": 0.024121083319187164, "memory(GiB)": 21.48, "step": 10895, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.3539616021830231, "grad_norm": 0.3822435438632965, "learning_rate": 7.679932145472598e-06, "loss": 0.03147195652127266, "memory(GiB)": 21.48, "step": 10896, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.3539940876457785, "grad_norm": 0.3756318986415863, "learning_rate": 7.679478651214154e-06, "loss": 0.02684718929231167, "memory(GiB)": 21.48, "step": 10897, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.3540265731085339, "grad_norm": 0.4685058295726776, "learning_rate": 7.679025126030961e-06, "loss": 0.022185854613780975, "memory(GiB)": 21.48, "step": 10898, "token_acc": 1.0, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.35405905857128933, "grad_norm": 0.3925265073776245, "learning_rate": 7.67857156992825e-06, "loss": 0.029626572504639626, "memory(GiB)": 21.48, "step": 10899, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.35409154403404475, "grad_norm": 0.35224777460098267, "learning_rate": 7.678117982911259e-06, "loss": 0.02957046404480934, "memory(GiB)": 21.48, "step": 10900, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.35412402949680016, "grad_norm": 0.37184444069862366, "learning_rate": 7.677664364985222e-06, "loss": 0.027673061937093735, "memory(GiB)": 21.48, "step": 10901, "token_acc": 0.9873817034700315, "train_speed(iter/s)": 0.956464 }, { "epoch": 0.3541565149595556, "grad_norm": 0.24393858015537262, "learning_rate": 7.677210716155375e-06, "loss": 0.01702994480729103, "memory(GiB)": 21.48, "step": 10902, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.354189000422311, "grad_norm": 0.37665948271751404, "learning_rate": 7.676757036426952e-06, "loss": 0.02367672510445118, "memory(GiB)": 21.48, "step": 10903, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.3542214858850664, "grad_norm": 0.4787886142730713, "learning_rate": 7.676303325805192e-06, "loss": 0.028601566329598427, "memory(GiB)": 21.48, "step": 10904, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956507 }, { "epoch": 0.3542539713478218, "grad_norm": 0.4225689470767975, "learning_rate": 7.675849584295327e-06, "loss": 0.0317719429731369, "memory(GiB)": 21.48, "step": 10905, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.35428645681057724, "grad_norm": 0.37537166476249695, "learning_rate": 7.675395811902597e-06, "loss": 0.026596784591674805, "memory(GiB)": 21.48, "step": 10906, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.35431894227333266, "grad_norm": 0.4871596097946167, "learning_rate": 7.67494200863224e-06, "loss": 0.03239702805876732, "memory(GiB)": 21.48, "step": 10907, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.956562 }, { "epoch": 0.3543514277360881, "grad_norm": 0.45091739296913147, "learning_rate": 7.67448817448949e-06, "loss": 0.03442990407347679, "memory(GiB)": 21.48, "step": 10908, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.3543839131988435, "grad_norm": 0.6486580967903137, "learning_rate": 7.674034309479585e-06, "loss": 0.025516562163829803, "memory(GiB)": 21.48, "step": 10909, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.9566 }, { "epoch": 0.3544163986615989, "grad_norm": 0.4599006474018097, "learning_rate": 7.673580413607766e-06, "loss": 0.026860486716032028, "memory(GiB)": 21.48, "step": 10910, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956619 }, { "epoch": 0.3544488841243543, "grad_norm": 0.4550369083881378, "learning_rate": 7.673126486879274e-06, "loss": 0.024882476776838303, "memory(GiB)": 21.48, "step": 10911, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.3544813695871098, "grad_norm": 0.4658457636833191, "learning_rate": 7.67267252929934e-06, "loss": 0.03606399893760681, "memory(GiB)": 21.48, "step": 10912, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.3545138550498652, "grad_norm": 0.3779941201210022, "learning_rate": 7.67221854087321e-06, "loss": 0.0261223167181015, "memory(GiB)": 21.48, "step": 10913, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956675 }, { "epoch": 0.35454634051262063, "grad_norm": 0.4479268491268158, "learning_rate": 7.671764521606123e-06, "loss": 0.038637444376945496, "memory(GiB)": 21.48, "step": 10914, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.35457882597537604, "grad_norm": 0.3666123151779175, "learning_rate": 7.671310471503314e-06, "loss": 0.02372199296951294, "memory(GiB)": 21.48, "step": 10915, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.35461131143813146, "grad_norm": 0.4111616313457489, "learning_rate": 7.670856390570027e-06, "loss": 0.034359049052000046, "memory(GiB)": 21.48, "step": 10916, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.3546437969008869, "grad_norm": 0.4436253607273102, "learning_rate": 7.670402278811504e-06, "loss": 0.027909675613045692, "memory(GiB)": 21.48, "step": 10917, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.3546762823636423, "grad_norm": 0.5062596201896667, "learning_rate": 7.669948136232983e-06, "loss": 0.03524389490485191, "memory(GiB)": 21.48, "step": 10918, "token_acc": 0.9723320158102767, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.3547087678263977, "grad_norm": 0.4976561963558197, "learning_rate": 7.669493962839708e-06, "loss": 0.028998296707868576, "memory(GiB)": 21.48, "step": 10919, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.3547412532891531, "grad_norm": 0.4292343556880951, "learning_rate": 7.66903975863692e-06, "loss": 0.025413982570171356, "memory(GiB)": 21.48, "step": 10920, "token_acc": 1.0, "train_speed(iter/s)": 0.956783 }, { "epoch": 0.35477373875190854, "grad_norm": 0.35394713282585144, "learning_rate": 7.66858552362986e-06, "loss": 0.02875036559998989, "memory(GiB)": 21.48, "step": 10921, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.956798 }, { "epoch": 0.35480622421466396, "grad_norm": 0.4585725963115692, "learning_rate": 7.66813125782377e-06, "loss": 0.033167481422424316, "memory(GiB)": 21.48, "step": 10922, "token_acc": 0.9875, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.3548387096774194, "grad_norm": 0.3986288905143738, "learning_rate": 7.667676961223895e-06, "loss": 0.027072910219430923, "memory(GiB)": 21.48, "step": 10923, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.3548711951401748, "grad_norm": 0.45934799313545227, "learning_rate": 7.667222633835476e-06, "loss": 0.03020884096622467, "memory(GiB)": 21.48, "step": 10924, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.3549036806029302, "grad_norm": 0.4804922044277191, "learning_rate": 7.666768275663759e-06, "loss": 0.02567155845463276, "memory(GiB)": 21.48, "step": 10925, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.956852 }, { "epoch": 0.3549361660656856, "grad_norm": 0.43932005763053894, "learning_rate": 7.666313886713988e-06, "loss": 0.03780381381511688, "memory(GiB)": 21.48, "step": 10926, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.956867 }, { "epoch": 0.35496865152844104, "grad_norm": 0.2916545569896698, "learning_rate": 7.665859466991404e-06, "loss": 0.01950274407863617, "memory(GiB)": 21.48, "step": 10927, "token_acc": 1.0, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.35500113699119645, "grad_norm": 0.3743179738521576, "learning_rate": 7.665405016501255e-06, "loss": 0.02184930257499218, "memory(GiB)": 21.48, "step": 10928, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.35503362245395187, "grad_norm": 0.5176491141319275, "learning_rate": 7.664950535248781e-06, "loss": 0.027392808347940445, "memory(GiB)": 21.48, "step": 10929, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.95691 }, { "epoch": 0.3550661079167073, "grad_norm": 0.36906465888023376, "learning_rate": 7.664496023239235e-06, "loss": 0.030868232250213623, "memory(GiB)": 21.48, "step": 10930, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.3550985933794627, "grad_norm": 1.1021581888198853, "learning_rate": 7.664041480477856e-06, "loss": 0.028503337875008583, "memory(GiB)": 21.48, "step": 10931, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956937 }, { "epoch": 0.3551310788422181, "grad_norm": 0.3450291156768799, "learning_rate": 7.663586906969894e-06, "loss": 0.0272795632481575, "memory(GiB)": 21.48, "step": 10932, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956952 }, { "epoch": 0.35516356430497353, "grad_norm": 0.3194494843482971, "learning_rate": 7.663132302720592e-06, "loss": 0.02893037348985672, "memory(GiB)": 21.48, "step": 10933, "token_acc": 1.0, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.35519604976772895, "grad_norm": 0.35190442204475403, "learning_rate": 7.6626776677352e-06, "loss": 0.034650981426239014, "memory(GiB)": 21.48, "step": 10934, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.956982 }, { "epoch": 0.35522853523048437, "grad_norm": 0.4495460093021393, "learning_rate": 7.662223002018963e-06, "loss": 0.02731042355298996, "memory(GiB)": 21.48, "step": 10935, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.3552610206932398, "grad_norm": 0.4454302191734314, "learning_rate": 7.66176830557713e-06, "loss": 0.030881743878126144, "memory(GiB)": 21.48, "step": 10936, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957017 }, { "epoch": 0.3552935061559952, "grad_norm": 0.45636439323425293, "learning_rate": 7.661313578414948e-06, "loss": 0.02205364778637886, "memory(GiB)": 21.48, "step": 10937, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.3553259916187506, "grad_norm": 0.4486130177974701, "learning_rate": 7.660858820537665e-06, "loss": 0.03195519000291824, "memory(GiB)": 21.48, "step": 10938, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.35535847708150603, "grad_norm": 0.38883569836616516, "learning_rate": 7.660404031950528e-06, "loss": 0.025365905836224556, "memory(GiB)": 21.48, "step": 10939, "token_acc": 1.0, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.35539096254426145, "grad_norm": 0.4715557098388672, "learning_rate": 7.659949212658789e-06, "loss": 0.033476322889328, "memory(GiB)": 21.48, "step": 10940, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.35542344800701686, "grad_norm": 0.5126826763153076, "learning_rate": 7.659494362667694e-06, "loss": 0.03765387833118439, "memory(GiB)": 21.48, "step": 10941, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.3554559334697723, "grad_norm": 0.35351940989494324, "learning_rate": 7.659039481982495e-06, "loss": 0.02618401125073433, "memory(GiB)": 21.48, "step": 10942, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.3554884189325277, "grad_norm": 0.37876462936401367, "learning_rate": 7.658584570608442e-06, "loss": 0.03056926280260086, "memory(GiB)": 21.48, "step": 10943, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.3555209043952831, "grad_norm": 0.3542659878730774, "learning_rate": 7.658129628550785e-06, "loss": 0.027408115565776825, "memory(GiB)": 21.48, "step": 10944, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957157 }, { "epoch": 0.3555533898580385, "grad_norm": 0.9038196802139282, "learning_rate": 7.657674655814773e-06, "loss": 0.03485830873250961, "memory(GiB)": 21.48, "step": 10945, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.957171 }, { "epoch": 0.35558587532079394, "grad_norm": 0.48858776688575745, "learning_rate": 7.65721965240566e-06, "loss": 0.032342419028282166, "memory(GiB)": 21.48, "step": 10946, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957186 }, { "epoch": 0.35561836078354936, "grad_norm": 0.54612797498703, "learning_rate": 7.656764618328692e-06, "loss": 0.0336858406662941, "memory(GiB)": 21.48, "step": 10947, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.3556508462463048, "grad_norm": 0.5052803754806519, "learning_rate": 7.656309553589128e-06, "loss": 0.02056276798248291, "memory(GiB)": 21.48, "step": 10948, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.3556833317090602, "grad_norm": 0.35642868280410767, "learning_rate": 7.655854458192213e-06, "loss": 0.02588534727692604, "memory(GiB)": 21.48, "step": 10949, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.3557158171718156, "grad_norm": 0.5497080683708191, "learning_rate": 7.655399332143205e-06, "loss": 0.021284054964780807, "memory(GiB)": 21.48, "step": 10950, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.355748302634571, "grad_norm": 0.3167911171913147, "learning_rate": 7.654944175447353e-06, "loss": 0.022991053760051727, "memory(GiB)": 21.48, "step": 10951, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957256 }, { "epoch": 0.35578078809732644, "grad_norm": 0.51435786485672, "learning_rate": 7.654488988109911e-06, "loss": 0.030851805582642555, "memory(GiB)": 21.48, "step": 10952, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.35581327356008186, "grad_norm": 0.8148927688598633, "learning_rate": 7.654033770136134e-06, "loss": 0.03167266026139259, "memory(GiB)": 21.48, "step": 10953, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957283 }, { "epoch": 0.35584575902283727, "grad_norm": 0.35617756843566895, "learning_rate": 7.653578521531275e-06, "loss": 0.03333020955324173, "memory(GiB)": 21.48, "step": 10954, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.957295 }, { "epoch": 0.3558782444855927, "grad_norm": 0.348919540643692, "learning_rate": 7.653123242300587e-06, "loss": 0.018218670040369034, "memory(GiB)": 21.48, "step": 10955, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.3559107299483481, "grad_norm": 0.40637072920799255, "learning_rate": 7.652667932449327e-06, "loss": 0.030197277665138245, "memory(GiB)": 21.48, "step": 10956, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.3559432154111035, "grad_norm": 0.3370477259159088, "learning_rate": 7.652212591982747e-06, "loss": 0.02672586962580681, "memory(GiB)": 21.48, "step": 10957, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.35597570087385894, "grad_norm": 0.39397993683815, "learning_rate": 7.651757220906104e-06, "loss": 0.034173958003520966, "memory(GiB)": 21.48, "step": 10958, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957351 }, { "epoch": 0.35600818633661435, "grad_norm": 0.3835132420063019, "learning_rate": 7.651301819224652e-06, "loss": 0.029663924127817154, "memory(GiB)": 21.48, "step": 10959, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.35604067179936977, "grad_norm": 0.31592419743537903, "learning_rate": 7.65084638694365e-06, "loss": 0.027041401714086533, "memory(GiB)": 21.48, "step": 10960, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957378 }, { "epoch": 0.3560731572621252, "grad_norm": 0.5045382976531982, "learning_rate": 7.650390924068352e-06, "loss": 0.03425600752234459, "memory(GiB)": 21.48, "step": 10961, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.957392 }, { "epoch": 0.3561056427248806, "grad_norm": 0.8330345153808594, "learning_rate": 7.649935430604013e-06, "loss": 0.03754650056362152, "memory(GiB)": 21.48, "step": 10962, "token_acc": 0.9669811320754716, "train_speed(iter/s)": 0.957403 }, { "epoch": 0.356138128187636, "grad_norm": 0.27528682351112366, "learning_rate": 7.649479906555895e-06, "loss": 0.019671205431222916, "memory(GiB)": 21.48, "step": 10963, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.35617061365039143, "grad_norm": 0.33302217721939087, "learning_rate": 7.649024351929252e-06, "loss": 0.01910337619483471, "memory(GiB)": 21.48, "step": 10964, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957431 }, { "epoch": 0.35620309911314685, "grad_norm": 0.3386169970035553, "learning_rate": 7.64856876672934e-06, "loss": 0.028210045769810677, "memory(GiB)": 21.48, "step": 10965, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.35623558457590226, "grad_norm": 0.4298696219921112, "learning_rate": 7.648113150961422e-06, "loss": 0.03157052770256996, "memory(GiB)": 21.48, "step": 10966, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.3562680700386577, "grad_norm": 0.4333726763725281, "learning_rate": 7.647657504630753e-06, "loss": 0.02163652330636978, "memory(GiB)": 21.48, "step": 10967, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.3563005555014131, "grad_norm": 0.30566349625587463, "learning_rate": 7.64720182774259e-06, "loss": 0.01961025409400463, "memory(GiB)": 21.48, "step": 10968, "token_acc": 1.0, "train_speed(iter/s)": 0.957497 }, { "epoch": 0.3563330409641685, "grad_norm": 0.4229561686515808, "learning_rate": 7.646746120302197e-06, "loss": 0.03321462869644165, "memory(GiB)": 21.48, "step": 10969, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.35636552642692393, "grad_norm": 0.4486484229564667, "learning_rate": 7.646290382314829e-06, "loss": 0.028863294050097466, "memory(GiB)": 21.48, "step": 10970, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.35639801188967934, "grad_norm": 0.3876534104347229, "learning_rate": 7.645834613785749e-06, "loss": 0.029168643057346344, "memory(GiB)": 21.48, "step": 10971, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.35643049735243476, "grad_norm": 0.32033777236938477, "learning_rate": 7.645378814720217e-06, "loss": 0.028543047606945038, "memory(GiB)": 21.48, "step": 10972, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95757 }, { "epoch": 0.3564629828151902, "grad_norm": 0.4455365836620331, "learning_rate": 7.64492298512349e-06, "loss": 0.027389898896217346, "memory(GiB)": 21.48, "step": 10973, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.3564954682779456, "grad_norm": 0.38538655638694763, "learning_rate": 7.644467125000832e-06, "loss": 0.023869261145591736, "memory(GiB)": 21.48, "step": 10974, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.356527953740701, "grad_norm": 0.32522469758987427, "learning_rate": 7.644011234357505e-06, "loss": 0.02486211061477661, "memory(GiB)": 21.48, "step": 10975, "token_acc": 0.995, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.3565604392034565, "grad_norm": 0.4011068642139435, "learning_rate": 7.64355531319877e-06, "loss": 0.026418842375278473, "memory(GiB)": 21.48, "step": 10976, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957634 }, { "epoch": 0.3565929246662119, "grad_norm": 0.4836727976799011, "learning_rate": 7.643099361529885e-06, "loss": 0.026848018169403076, "memory(GiB)": 21.48, "step": 10977, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.3566254101289673, "grad_norm": 0.41383787989616394, "learning_rate": 7.642643379356117e-06, "loss": 0.0219903364777565, "memory(GiB)": 21.48, "step": 10978, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957662 }, { "epoch": 0.35665789559172273, "grad_norm": 0.44097667932510376, "learning_rate": 7.642187366682727e-06, "loss": 0.031225092709064484, "memory(GiB)": 21.48, "step": 10979, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.35669038105447815, "grad_norm": 0.5371465086936951, "learning_rate": 7.641731323514978e-06, "loss": 0.02701360173523426, "memory(GiB)": 21.48, "step": 10980, "token_acc": 1.0, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.35672286651723356, "grad_norm": 0.3512941002845764, "learning_rate": 7.641275249858134e-06, "loss": 0.02649892494082451, "memory(GiB)": 21.48, "step": 10981, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.356755351979989, "grad_norm": 0.4561433792114258, "learning_rate": 7.640819145717458e-06, "loss": 0.02904825285077095, "memory(GiB)": 21.48, "step": 10982, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.3567878374427444, "grad_norm": 0.3921779990196228, "learning_rate": 7.640363011098213e-06, "loss": 0.03187406435608864, "memory(GiB)": 21.48, "step": 10983, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.3568203229054998, "grad_norm": 0.49647459387779236, "learning_rate": 7.639906846005665e-06, "loss": 0.026172930374741554, "memory(GiB)": 21.48, "step": 10984, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957747 }, { "epoch": 0.3568528083682552, "grad_norm": 0.355624794960022, "learning_rate": 7.63945065044508e-06, "loss": 0.02637164294719696, "memory(GiB)": 21.48, "step": 10985, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.35688529383101064, "grad_norm": 0.3410505950450897, "learning_rate": 7.63899442442172e-06, "loss": 0.022408248856663704, "memory(GiB)": 21.48, "step": 10986, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.35691777929376606, "grad_norm": 0.3682889938354492, "learning_rate": 7.63853816794085e-06, "loss": 0.027664389461278915, "memory(GiB)": 21.48, "step": 10987, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.3569502647565215, "grad_norm": 0.33007708191871643, "learning_rate": 7.638081881007739e-06, "loss": 0.023285336792469025, "memory(GiB)": 21.48, "step": 10988, "token_acc": 1.0, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.3569827502192769, "grad_norm": 0.4185643494129181, "learning_rate": 7.637625563627652e-06, "loss": 0.025166762992739677, "memory(GiB)": 21.48, "step": 10989, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957817 }, { "epoch": 0.3570152356820323, "grad_norm": 0.4530879855155945, "learning_rate": 7.637169215805857e-06, "loss": 0.028960855677723885, "memory(GiB)": 21.48, "step": 10990, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.3570477211447877, "grad_norm": 1.6732252836227417, "learning_rate": 7.636712837547617e-06, "loss": 0.031625621020793915, "memory(GiB)": 21.48, "step": 10991, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.35708020660754314, "grad_norm": 0.4373568594455719, "learning_rate": 7.636256428858202e-06, "loss": 0.032201237976551056, "memory(GiB)": 21.48, "step": 10992, "token_acc": 0.9877300613496932, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.35711269207029855, "grad_norm": 0.31167668104171753, "learning_rate": 7.63579998974288e-06, "loss": 0.020946886390447617, "memory(GiB)": 21.48, "step": 10993, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.35714517753305397, "grad_norm": 0.49140459299087524, "learning_rate": 7.635343520206916e-06, "loss": 0.030694885179400444, "memory(GiB)": 21.48, "step": 10994, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.3571776629958094, "grad_norm": 0.35965850949287415, "learning_rate": 7.634887020255581e-06, "loss": 0.024624831974506378, "memory(GiB)": 21.48, "step": 10995, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.957909 }, { "epoch": 0.3572101484585648, "grad_norm": 0.5032176971435547, "learning_rate": 7.634430489894142e-06, "loss": 0.026623444631695747, "memory(GiB)": 21.48, "step": 10996, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.3572426339213202, "grad_norm": 0.4107193946838379, "learning_rate": 7.633973929127871e-06, "loss": 0.029199497774243355, "memory(GiB)": 21.48, "step": 10997, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.35727511938407563, "grad_norm": 0.4162710905075073, "learning_rate": 7.633517337962033e-06, "loss": 0.031659457832574844, "memory(GiB)": 21.48, "step": 10998, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957964 }, { "epoch": 0.35730760484683105, "grad_norm": 0.4714907705783844, "learning_rate": 7.6330607164019e-06, "loss": 0.025080187246203423, "memory(GiB)": 21.48, "step": 10999, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.35734009030958647, "grad_norm": 0.39285197854042053, "learning_rate": 7.632604064452742e-06, "loss": 0.03311954811215401, "memory(GiB)": 21.48, "step": 11000, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.958001 }, { "epoch": 0.35734009030958647, "eval_loss": 0.0284825898706913, "eval_runtime": 80.0009, "eval_samples_per_second": 124.374, "eval_steps_per_second": 3.887, "eval_token_acc": 0.9887715649277437, "step": 11000 }, { "epoch": 0.3573725757723419, "grad_norm": 1.143050193786621, "learning_rate": 7.63214738211983e-06, "loss": 0.03660469129681587, "memory(GiB)": 21.48, "step": 11001, "token_acc": 0.9884351877226637, "train_speed(iter/s)": 0.950466 }, { "epoch": 0.3574050612350973, "grad_norm": 0.5698909163475037, "learning_rate": 7.631690669408433e-06, "loss": 0.03985535725951195, "memory(GiB)": 21.48, "step": 11002, "token_acc": 0.99, "train_speed(iter/s)": 0.950479 }, { "epoch": 0.3574375466978527, "grad_norm": 0.329186350107193, "learning_rate": 7.631233926323823e-06, "loss": 0.01855861209332943, "memory(GiB)": 21.48, "step": 11003, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.950491 }, { "epoch": 0.35747003216060813, "grad_norm": 0.36301398277282715, "learning_rate": 7.630777152871272e-06, "loss": 0.023336589336395264, "memory(GiB)": 21.48, "step": 11004, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.950502 }, { "epoch": 0.35750251762336355, "grad_norm": 0.35223710536956787, "learning_rate": 7.630320349056049e-06, "loss": 0.02202000841498375, "memory(GiB)": 21.48, "step": 11005, "token_acc": 1.0, "train_speed(iter/s)": 0.950515 }, { "epoch": 0.35753500308611896, "grad_norm": 0.4670998752117157, "learning_rate": 7.629863514883429e-06, "loss": 0.032351840287446976, "memory(GiB)": 21.48, "step": 11006, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.950529 }, { "epoch": 0.3575674885488744, "grad_norm": 0.5503541231155396, "learning_rate": 7.629406650358687e-06, "loss": 0.03159240633249283, "memory(GiB)": 21.48, "step": 11007, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.950542 }, { "epoch": 0.3575999740116298, "grad_norm": 0.4814968705177307, "learning_rate": 7.628949755487087e-06, "loss": 0.02973175421357155, "memory(GiB)": 21.48, "step": 11008, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.950533 }, { "epoch": 0.3576324594743852, "grad_norm": 0.42891913652420044, "learning_rate": 7.628492830273912e-06, "loss": 0.033435992896556854, "memory(GiB)": 21.48, "step": 11009, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.950547 }, { "epoch": 0.3576649449371406, "grad_norm": 0.334570050239563, "learning_rate": 7.628035874724429e-06, "loss": 0.026591453701257706, "memory(GiB)": 21.48, "step": 11010, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.950563 }, { "epoch": 0.35769743039989604, "grad_norm": 0.4432013928890228, "learning_rate": 7.627578888843916e-06, "loss": 0.02223382517695427, "memory(GiB)": 21.48, "step": 11011, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.950578 }, { "epoch": 0.35772991586265146, "grad_norm": 0.43598800897598267, "learning_rate": 7.627121872637644e-06, "loss": 0.03695176914334297, "memory(GiB)": 21.48, "step": 11012, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.950593 }, { "epoch": 0.3577624013254069, "grad_norm": 0.40350979566574097, "learning_rate": 7.626664826110889e-06, "loss": 0.027296001091599464, "memory(GiB)": 21.48, "step": 11013, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.950607 }, { "epoch": 0.3577948867881623, "grad_norm": 0.3274064362049103, "learning_rate": 7.626207749268925e-06, "loss": 0.02592061087489128, "memory(GiB)": 21.48, "step": 11014, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.950621 }, { "epoch": 0.3578273722509177, "grad_norm": 0.2998270094394684, "learning_rate": 7.625750642117029e-06, "loss": 0.022097446024417877, "memory(GiB)": 21.48, "step": 11015, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.950635 }, { "epoch": 0.3578598577136731, "grad_norm": 0.34306278824806213, "learning_rate": 7.625293504660476e-06, "loss": 0.026477526873350143, "memory(GiB)": 21.48, "step": 11016, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.950651 }, { "epoch": 0.35789234317642854, "grad_norm": 0.9438685774803162, "learning_rate": 7.624836336904542e-06, "loss": 0.0222802571952343, "memory(GiB)": 21.48, "step": 11017, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.950665 }, { "epoch": 0.35792482863918396, "grad_norm": 0.4746946394443512, "learning_rate": 7.624379138854501e-06, "loss": 0.02375534176826477, "memory(GiB)": 21.48, "step": 11018, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.950682 }, { "epoch": 0.3579573141019394, "grad_norm": 0.37974458932876587, "learning_rate": 7.623921910515632e-06, "loss": 0.027104441076517105, "memory(GiB)": 21.48, "step": 11019, "token_acc": 0.98828125, "train_speed(iter/s)": 0.9507 }, { "epoch": 0.3579897995646948, "grad_norm": 0.5650562047958374, "learning_rate": 7.623464651893215e-06, "loss": 0.025090526789426804, "memory(GiB)": 21.48, "step": 11020, "token_acc": 1.0, "train_speed(iter/s)": 0.950718 }, { "epoch": 0.3580222850274502, "grad_norm": 0.5107470154762268, "learning_rate": 7.623007362992522e-06, "loss": 0.02927699312567711, "memory(GiB)": 21.48, "step": 11021, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.950738 }, { "epoch": 0.3580547704902056, "grad_norm": 0.4470890164375305, "learning_rate": 7.6225500438188325e-06, "loss": 0.030601859092712402, "memory(GiB)": 21.48, "step": 11022, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.950757 }, { "epoch": 0.35808725595296104, "grad_norm": 0.41116178035736084, "learning_rate": 7.622092694377425e-06, "loss": 0.03046756610274315, "memory(GiB)": 21.48, "step": 11023, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.950776 }, { "epoch": 0.35811974141571645, "grad_norm": 0.3351248800754547, "learning_rate": 7.6216353146735775e-06, "loss": 0.023643814027309418, "memory(GiB)": 21.48, "step": 11024, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.950793 }, { "epoch": 0.35815222687847187, "grad_norm": 0.25352194905281067, "learning_rate": 7.62117790471257e-06, "loss": 0.021182697266340256, "memory(GiB)": 21.48, "step": 11025, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.950811 }, { "epoch": 0.3581847123412273, "grad_norm": 0.39543694257736206, "learning_rate": 7.62072046449968e-06, "loss": 0.03542063385248184, "memory(GiB)": 21.48, "step": 11026, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95083 }, { "epoch": 0.3582171978039827, "grad_norm": 0.40299782156944275, "learning_rate": 7.620262994040188e-06, "loss": 0.029076609760522842, "memory(GiB)": 21.48, "step": 11027, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.950849 }, { "epoch": 0.3582496832667381, "grad_norm": 0.3371404707431793, "learning_rate": 7.619805493339373e-06, "loss": 0.02214847132563591, "memory(GiB)": 21.48, "step": 11028, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.950868 }, { "epoch": 0.35828216872949353, "grad_norm": 0.7255316972732544, "learning_rate": 7.619347962402517e-06, "loss": 0.02063652127981186, "memory(GiB)": 21.48, "step": 11029, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.950887 }, { "epoch": 0.35831465419224895, "grad_norm": 0.4230078458786011, "learning_rate": 7.618890401234898e-06, "loss": 0.027142606675624847, "memory(GiB)": 21.48, "step": 11030, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.950905 }, { "epoch": 0.35834713965500437, "grad_norm": 0.5519891977310181, "learning_rate": 7.618432809841798e-06, "loss": 0.035625021904706955, "memory(GiB)": 21.48, "step": 11031, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.950923 }, { "epoch": 0.3583796251177598, "grad_norm": 0.417733371257782, "learning_rate": 7.617975188228499e-06, "loss": 0.024271614849567413, "memory(GiB)": 21.48, "step": 11032, "token_acc": 1.0, "train_speed(iter/s)": 0.950942 }, { "epoch": 0.3584121105805152, "grad_norm": 0.8530871272087097, "learning_rate": 7.617517536400279e-06, "loss": 0.0377969965338707, "memory(GiB)": 21.48, "step": 11033, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.950961 }, { "epoch": 0.3584445960432706, "grad_norm": 0.48423680663108826, "learning_rate": 7.617059854362426e-06, "loss": 0.032215945422649384, "memory(GiB)": 21.48, "step": 11034, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95098 }, { "epoch": 0.35847708150602603, "grad_norm": 0.372385710477829, "learning_rate": 7.616602142120219e-06, "loss": 0.02732590027153492, "memory(GiB)": 21.48, "step": 11035, "token_acc": 0.9757575757575757, "train_speed(iter/s)": 0.950999 }, { "epoch": 0.35850956696878145, "grad_norm": 0.5785072445869446, "learning_rate": 7.616144399678939e-06, "loss": 0.04095766320824623, "memory(GiB)": 21.48, "step": 11036, "token_acc": 0.975177304964539, "train_speed(iter/s)": 0.951018 }, { "epoch": 0.35854205243153686, "grad_norm": 0.34240713715553284, "learning_rate": 7.615686627043871e-06, "loss": 0.02606009691953659, "memory(GiB)": 21.48, "step": 11037, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.951036 }, { "epoch": 0.3585745378942923, "grad_norm": 0.6396321058273315, "learning_rate": 7.615228824220298e-06, "loss": 0.036180879920721054, "memory(GiB)": 21.48, "step": 11038, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.951054 }, { "epoch": 0.3586070233570477, "grad_norm": 0.5507401823997498, "learning_rate": 7.614770991213504e-06, "loss": 0.029872676357626915, "memory(GiB)": 21.48, "step": 11039, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.951072 }, { "epoch": 0.35863950881980317, "grad_norm": 0.3620896339416504, "learning_rate": 7.6143131280287715e-06, "loss": 0.02320660650730133, "memory(GiB)": 21.48, "step": 11040, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95109 }, { "epoch": 0.3586719942825586, "grad_norm": 0.5063878893852234, "learning_rate": 7.6138552346713855e-06, "loss": 0.039999715983867645, "memory(GiB)": 21.48, "step": 11041, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.951109 }, { "epoch": 0.358704479745314, "grad_norm": 0.6515899896621704, "learning_rate": 7.613397311146633e-06, "loss": 0.03400138020515442, "memory(GiB)": 21.48, "step": 11042, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951128 }, { "epoch": 0.3587369652080694, "grad_norm": 0.387196809053421, "learning_rate": 7.6129393574597965e-06, "loss": 0.024419348686933517, "memory(GiB)": 21.48, "step": 11043, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.951144 }, { "epoch": 0.35876945067082483, "grad_norm": 0.6343508362770081, "learning_rate": 7.612481373616161e-06, "loss": 0.03449510410428047, "memory(GiB)": 21.48, "step": 11044, "token_acc": 1.0, "train_speed(iter/s)": 0.951161 }, { "epoch": 0.35880193613358025, "grad_norm": 0.315887987613678, "learning_rate": 7.612023359621014e-06, "loss": 0.020957088097929955, "memory(GiB)": 21.48, "step": 11045, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95118 }, { "epoch": 0.35883442159633566, "grad_norm": 0.2901712954044342, "learning_rate": 7.611565315479641e-06, "loss": 0.01763625629246235, "memory(GiB)": 21.48, "step": 11046, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.951199 }, { "epoch": 0.3588669070590911, "grad_norm": 0.45136380195617676, "learning_rate": 7.611107241197327e-06, "loss": 0.02629712037742138, "memory(GiB)": 21.48, "step": 11047, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.951218 }, { "epoch": 0.3588993925218465, "grad_norm": 0.36500781774520874, "learning_rate": 7.610649136779361e-06, "loss": 0.03207406401634216, "memory(GiB)": 21.48, "step": 11048, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.951236 }, { "epoch": 0.3589318779846019, "grad_norm": 0.37120360136032104, "learning_rate": 7.610191002231029e-06, "loss": 0.03517783433198929, "memory(GiB)": 21.48, "step": 11049, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.951251 }, { "epoch": 0.3589643634473573, "grad_norm": 0.5107068419456482, "learning_rate": 7.6097328375576185e-06, "loss": 0.025938253849744797, "memory(GiB)": 21.48, "step": 11050, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.951266 }, { "epoch": 0.35899684891011274, "grad_norm": 0.4090111553668976, "learning_rate": 7.609274642764418e-06, "loss": 0.026072215288877487, "memory(GiB)": 21.48, "step": 11051, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.951281 }, { "epoch": 0.35902933437286816, "grad_norm": 0.29223135113716125, "learning_rate": 7.608816417856715e-06, "loss": 0.02214287593960762, "memory(GiB)": 21.48, "step": 11052, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.951296 }, { "epoch": 0.3590618198356236, "grad_norm": 0.3428633511066437, "learning_rate": 7.608358162839799e-06, "loss": 0.027379417791962624, "memory(GiB)": 21.48, "step": 11053, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.951311 }, { "epoch": 0.359094305298379, "grad_norm": 0.3637349009513855, "learning_rate": 7.607899877718956e-06, "loss": 0.03049425408244133, "memory(GiB)": 21.48, "step": 11054, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.951326 }, { "epoch": 0.3591267907611344, "grad_norm": 0.5261790156364441, "learning_rate": 7.607441562499479e-06, "loss": 0.021107200533151627, "memory(GiB)": 21.48, "step": 11055, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.951341 }, { "epoch": 0.3591592762238898, "grad_norm": 0.2951222062110901, "learning_rate": 7.606983217186655e-06, "loss": 0.015367384068667889, "memory(GiB)": 21.48, "step": 11056, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.951355 }, { "epoch": 0.35919176168664524, "grad_norm": 0.9164537787437439, "learning_rate": 7.606524841785776e-06, "loss": 0.04081445932388306, "memory(GiB)": 21.48, "step": 11057, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.951369 }, { "epoch": 0.35922424714940066, "grad_norm": 0.3752516806125641, "learning_rate": 7.60606643630213e-06, "loss": 0.034097518771886826, "memory(GiB)": 21.48, "step": 11058, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.951382 }, { "epoch": 0.35925673261215607, "grad_norm": 0.48000016808509827, "learning_rate": 7.60560800074101e-06, "loss": 0.03244687616825104, "memory(GiB)": 21.48, "step": 11059, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.951396 }, { "epoch": 0.3592892180749115, "grad_norm": 0.5689947009086609, "learning_rate": 7.6051495351077045e-06, "loss": 0.033620502799749374, "memory(GiB)": 21.48, "step": 11060, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.951409 }, { "epoch": 0.3593217035376669, "grad_norm": 0.46257010102272034, "learning_rate": 7.6046910394075054e-06, "loss": 0.029389681294560432, "memory(GiB)": 21.48, "step": 11061, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.951421 }, { "epoch": 0.3593541890004223, "grad_norm": 2.111809253692627, "learning_rate": 7.604232513645706e-06, "loss": 0.022591162472963333, "memory(GiB)": 21.48, "step": 11062, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.951433 }, { "epoch": 0.35938667446317774, "grad_norm": 0.7127833366394043, "learning_rate": 7.603773957827596e-06, "loss": 0.036800675094127655, "memory(GiB)": 21.48, "step": 11063, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.951446 }, { "epoch": 0.35941915992593315, "grad_norm": 0.3415144085884094, "learning_rate": 7.603315371958468e-06, "loss": 0.026276733726263046, "memory(GiB)": 21.48, "step": 11064, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.951458 }, { "epoch": 0.35945164538868857, "grad_norm": 0.6657503247261047, "learning_rate": 7.602856756043619e-06, "loss": 0.03290325403213501, "memory(GiB)": 21.48, "step": 11065, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.951471 }, { "epoch": 0.359484130851444, "grad_norm": 0.35034510493278503, "learning_rate": 7.602398110088336e-06, "loss": 0.026860959827899933, "memory(GiB)": 21.48, "step": 11066, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.951484 }, { "epoch": 0.3595166163141994, "grad_norm": 0.4547922611236572, "learning_rate": 7.601939434097916e-06, "loss": 0.033104054629802704, "memory(GiB)": 21.48, "step": 11067, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.951496 }, { "epoch": 0.3595491017769548, "grad_norm": 0.479377806186676, "learning_rate": 7.6014807280776504e-06, "loss": 0.028733721002936363, "memory(GiB)": 21.48, "step": 11068, "token_acc": 0.9887005649717514, "train_speed(iter/s)": 0.951507 }, { "epoch": 0.35958158723971023, "grad_norm": 0.46327435970306396, "learning_rate": 7.601021992032838e-06, "loss": 0.03156321495771408, "memory(GiB)": 21.48, "step": 11069, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95152 }, { "epoch": 0.35961407270246565, "grad_norm": 0.41753169894218445, "learning_rate": 7.6005632259687664e-06, "loss": 0.03129507601261139, "memory(GiB)": 21.48, "step": 11070, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.951533 }, { "epoch": 0.35964655816522106, "grad_norm": 0.3685078024864197, "learning_rate": 7.600104429890736e-06, "loss": 0.02704755589365959, "memory(GiB)": 21.48, "step": 11071, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.951546 }, { "epoch": 0.3596790436279765, "grad_norm": 0.6100444793701172, "learning_rate": 7.599645603804038e-06, "loss": 0.027597371488809586, "memory(GiB)": 21.48, "step": 11072, "token_acc": 1.0, "train_speed(iter/s)": 0.95156 }, { "epoch": 0.3597115290907319, "grad_norm": 0.296584814786911, "learning_rate": 7.599186747713971e-06, "loss": 0.025751976296305656, "memory(GiB)": 21.48, "step": 11073, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.951572 }, { "epoch": 0.3597440145534873, "grad_norm": 0.40468335151672363, "learning_rate": 7.598727861625829e-06, "loss": 0.03540603816509247, "memory(GiB)": 21.48, "step": 11074, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.951586 }, { "epoch": 0.35977650001624273, "grad_norm": 0.7909191846847534, "learning_rate": 7.598268945544907e-06, "loss": 0.035128816962242126, "memory(GiB)": 21.48, "step": 11075, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.9516 }, { "epoch": 0.35980898547899814, "grad_norm": 1.1303578615188599, "learning_rate": 7.597809999476507e-06, "loss": 0.029039323329925537, "memory(GiB)": 21.48, "step": 11076, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.951616 }, { "epoch": 0.35984147094175356, "grad_norm": 0.4374248683452606, "learning_rate": 7.597351023425919e-06, "loss": 0.025762418285012245, "memory(GiB)": 21.48, "step": 11077, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.951631 }, { "epoch": 0.359873956404509, "grad_norm": 0.38088518381118774, "learning_rate": 7.596892017398443e-06, "loss": 0.029421472921967506, "memory(GiB)": 21.48, "step": 11078, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.951649 }, { "epoch": 0.3599064418672644, "grad_norm": 0.3659012019634247, "learning_rate": 7.596432981399377e-06, "loss": 0.0254441499710083, "memory(GiB)": 21.48, "step": 11079, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.951668 }, { "epoch": 0.3599389273300198, "grad_norm": 0.3629229962825775, "learning_rate": 7.595973915434018e-06, "loss": 0.03441394492983818, "memory(GiB)": 21.48, "step": 11080, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.951686 }, { "epoch": 0.3599714127927752, "grad_norm": 0.33301055431365967, "learning_rate": 7.595514819507666e-06, "loss": 0.024383854120969772, "memory(GiB)": 21.48, "step": 11081, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.951705 }, { "epoch": 0.36000389825553064, "grad_norm": 0.36391064524650574, "learning_rate": 7.595055693625617e-06, "loss": 0.026150211691856384, "memory(GiB)": 21.48, "step": 11082, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.951722 }, { "epoch": 0.36003638371828606, "grad_norm": 0.33900177478790283, "learning_rate": 7.594596537793173e-06, "loss": 0.021658357232809067, "memory(GiB)": 21.48, "step": 11083, "token_acc": 1.0, "train_speed(iter/s)": 0.951741 }, { "epoch": 0.3600688691810415, "grad_norm": 0.5419313311576843, "learning_rate": 7.59413735201563e-06, "loss": 0.031656067818403244, "memory(GiB)": 21.48, "step": 11084, "token_acc": 0.984, "train_speed(iter/s)": 0.95176 }, { "epoch": 0.3601013546437969, "grad_norm": 0.3152519762516022, "learning_rate": 7.59367813629829e-06, "loss": 0.02194385416805744, "memory(GiB)": 21.48, "step": 11085, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95178 }, { "epoch": 0.3601338401065523, "grad_norm": 0.4415203928947449, "learning_rate": 7.593218890646453e-06, "loss": 0.030373141169548035, "memory(GiB)": 21.48, "step": 11086, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.9518 }, { "epoch": 0.3601663255693077, "grad_norm": 0.2927863895893097, "learning_rate": 7.592759615065417e-06, "loss": 0.01845417357981205, "memory(GiB)": 21.48, "step": 11087, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.951819 }, { "epoch": 0.36019881103206314, "grad_norm": 0.5452991127967834, "learning_rate": 7.592300309560485e-06, "loss": 0.050380218774080276, "memory(GiB)": 21.48, "step": 11088, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.951839 }, { "epoch": 0.36023129649481855, "grad_norm": 0.4566718637943268, "learning_rate": 7.591840974136957e-06, "loss": 0.03533964604139328, "memory(GiB)": 21.48, "step": 11089, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.951857 }, { "epoch": 0.36026378195757397, "grad_norm": 0.3767874836921692, "learning_rate": 7.5913816088001345e-06, "loss": 0.028891516849398613, "memory(GiB)": 21.48, "step": 11090, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951876 }, { "epoch": 0.3602962674203294, "grad_norm": 0.3948855996131897, "learning_rate": 7.590922213555319e-06, "loss": 0.025551171973347664, "memory(GiB)": 21.48, "step": 11091, "token_acc": 1.0, "train_speed(iter/s)": 0.951895 }, { "epoch": 0.3603287528830848, "grad_norm": 2.8533105850219727, "learning_rate": 7.590462788407812e-06, "loss": 0.029673419892787933, "memory(GiB)": 21.48, "step": 11092, "token_acc": 0.984, "train_speed(iter/s)": 0.951913 }, { "epoch": 0.3603612383458402, "grad_norm": 0.5006044507026672, "learning_rate": 7.590003333362917e-06, "loss": 0.03802867978811264, "memory(GiB)": 21.48, "step": 11093, "token_acc": 1.0, "train_speed(iter/s)": 0.951932 }, { "epoch": 0.36039372380859563, "grad_norm": 0.9121105074882507, "learning_rate": 7.589543848425935e-06, "loss": 0.036274854093790054, "memory(GiB)": 21.48, "step": 11094, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.951951 }, { "epoch": 0.36042620927135105, "grad_norm": 0.3902139663696289, "learning_rate": 7.589084333602172e-06, "loss": 0.027610905468463898, "memory(GiB)": 21.48, "step": 11095, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.95197 }, { "epoch": 0.36045869473410647, "grad_norm": 0.32621467113494873, "learning_rate": 7.588624788896928e-06, "loss": 0.023164736106991768, "memory(GiB)": 21.48, "step": 11096, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.951988 }, { "epoch": 0.3604911801968619, "grad_norm": 0.6589730381965637, "learning_rate": 7.58816521431551e-06, "loss": 0.024542029947042465, "memory(GiB)": 21.48, "step": 11097, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.952006 }, { "epoch": 0.3605236656596173, "grad_norm": 0.3179318606853485, "learning_rate": 7.58770560986322e-06, "loss": 0.025545930489897728, "memory(GiB)": 21.48, "step": 11098, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.952023 }, { "epoch": 0.3605561511223727, "grad_norm": 0.5232716798782349, "learning_rate": 7.587245975545364e-06, "loss": 0.024310696870088577, "memory(GiB)": 21.48, "step": 11099, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.952041 }, { "epoch": 0.36058863658512813, "grad_norm": 0.5403839945793152, "learning_rate": 7.5867863113672455e-06, "loss": 0.028096135705709457, "memory(GiB)": 21.48, "step": 11100, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952059 }, { "epoch": 0.36062112204788355, "grad_norm": 0.35653841495513916, "learning_rate": 7.58632661733417e-06, "loss": 0.028286678716540337, "memory(GiB)": 21.48, "step": 11101, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.952077 }, { "epoch": 0.36065360751063896, "grad_norm": 0.36298057436943054, "learning_rate": 7.585866893451442e-06, "loss": 0.026618175208568573, "memory(GiB)": 21.48, "step": 11102, "token_acc": 0.995, "train_speed(iter/s)": 0.952097 }, { "epoch": 0.3606860929733944, "grad_norm": 1.3301150798797607, "learning_rate": 7.585407139724368e-06, "loss": 0.027712497860193253, "memory(GiB)": 21.48, "step": 11103, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.952116 }, { "epoch": 0.36071857843614985, "grad_norm": 0.4244183301925659, "learning_rate": 7.5849473561582565e-06, "loss": 0.020686369389295578, "memory(GiB)": 21.48, "step": 11104, "token_acc": 0.984375, "train_speed(iter/s)": 0.952135 }, { "epoch": 0.36075106389890527, "grad_norm": 0.3749017119407654, "learning_rate": 7.5844875427584095e-06, "loss": 0.026698719710111618, "memory(GiB)": 21.48, "step": 11105, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.952155 }, { "epoch": 0.3607835493616607, "grad_norm": 0.6774434447288513, "learning_rate": 7.584027699530139e-06, "loss": 0.03496461734175682, "memory(GiB)": 21.48, "step": 11106, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.952173 }, { "epoch": 0.3608160348244161, "grad_norm": 0.4564642608165741, "learning_rate": 7.583567826478747e-06, "loss": 0.02728121355175972, "memory(GiB)": 21.48, "step": 11107, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.952191 }, { "epoch": 0.3608485202871715, "grad_norm": 0.5572955012321472, "learning_rate": 7.583107923609545e-06, "loss": 0.030863802880048752, "memory(GiB)": 21.48, "step": 11108, "token_acc": 0.9885714285714285, "train_speed(iter/s)": 0.952211 }, { "epoch": 0.36088100574992693, "grad_norm": 0.4776614308357239, "learning_rate": 7.58264799092784e-06, "loss": 0.029566112905740738, "memory(GiB)": 21.48, "step": 11109, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.952229 }, { "epoch": 0.36091349121268235, "grad_norm": 0.42956262826919556, "learning_rate": 7.582188028438937e-06, "loss": 0.026504982262849808, "memory(GiB)": 21.48, "step": 11110, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.952248 }, { "epoch": 0.36094597667543776, "grad_norm": 0.3650270104408264, "learning_rate": 7.58172803614815e-06, "loss": 0.02410818263888359, "memory(GiB)": 21.48, "step": 11111, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.952266 }, { "epoch": 0.3609784621381932, "grad_norm": 0.48947829008102417, "learning_rate": 7.581268014060783e-06, "loss": 0.029479870572686195, "memory(GiB)": 21.48, "step": 11112, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.952285 }, { "epoch": 0.3610109476009486, "grad_norm": 0.35018137097358704, "learning_rate": 7.580807962182149e-06, "loss": 0.023147491738200188, "memory(GiB)": 21.48, "step": 11113, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.952304 }, { "epoch": 0.361043433063704, "grad_norm": 0.6786619424819946, "learning_rate": 7.580347880517556e-06, "loss": 0.027441412210464478, "memory(GiB)": 21.48, "step": 11114, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.952319 }, { "epoch": 0.3610759185264594, "grad_norm": 0.3257756233215332, "learning_rate": 7.579887769072314e-06, "loss": 0.03097911924123764, "memory(GiB)": 21.48, "step": 11115, "token_acc": 0.9763313609467456, "train_speed(iter/s)": 0.952333 }, { "epoch": 0.36110840398921484, "grad_norm": 0.2911493182182312, "learning_rate": 7.579427627851732e-06, "loss": 0.017558567225933075, "memory(GiB)": 21.48, "step": 11116, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.952349 }, { "epoch": 0.36114088945197026, "grad_norm": 0.45068666338920593, "learning_rate": 7.578967456861124e-06, "loss": 0.02993171662092209, "memory(GiB)": 21.48, "step": 11117, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.952364 }, { "epoch": 0.3611733749147257, "grad_norm": 0.3197453022003174, "learning_rate": 7.578507256105798e-06, "loss": 0.02372811734676361, "memory(GiB)": 21.48, "step": 11118, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.95238 }, { "epoch": 0.3612058603774811, "grad_norm": 0.3565520942211151, "learning_rate": 7.5780470255910654e-06, "loss": 0.01751488819718361, "memory(GiB)": 21.48, "step": 11119, "token_acc": 1.0, "train_speed(iter/s)": 0.952393 }, { "epoch": 0.3612383458402365, "grad_norm": 0.4116060137748718, "learning_rate": 7.57758676532224e-06, "loss": 0.03625015169382095, "memory(GiB)": 21.48, "step": 11120, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.952406 }, { "epoch": 0.3612708313029919, "grad_norm": 0.3623809814453125, "learning_rate": 7.5771264753046314e-06, "loss": 0.028167739510536194, "memory(GiB)": 21.48, "step": 11121, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.95242 }, { "epoch": 0.36130331676574734, "grad_norm": 0.33240216970443726, "learning_rate": 7.576666155543555e-06, "loss": 0.025866083800792694, "memory(GiB)": 21.48, "step": 11122, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.952433 }, { "epoch": 0.36133580222850276, "grad_norm": 0.47346800565719604, "learning_rate": 7.576205806044319e-06, "loss": 0.027321310713887215, "memory(GiB)": 21.48, "step": 11123, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.952447 }, { "epoch": 0.36136828769125817, "grad_norm": 1.2522358894348145, "learning_rate": 7.575745426812242e-06, "loss": 0.036616142839193344, "memory(GiB)": 21.48, "step": 11124, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95246 }, { "epoch": 0.3614007731540136, "grad_norm": 0.9889662265777588, "learning_rate": 7.575285017852634e-06, "loss": 0.020971570163965225, "memory(GiB)": 21.48, "step": 11125, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.952473 }, { "epoch": 0.361433258616769, "grad_norm": 0.5036208629608154, "learning_rate": 7.574824579170805e-06, "loss": 0.025678005069494247, "memory(GiB)": 21.48, "step": 11126, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.952486 }, { "epoch": 0.3614657440795244, "grad_norm": 0.31724804639816284, "learning_rate": 7.574364110772078e-06, "loss": 0.023608747869729996, "memory(GiB)": 21.48, "step": 11127, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.952498 }, { "epoch": 0.36149822954227984, "grad_norm": 0.5883055329322815, "learning_rate": 7.57390361266176e-06, "loss": 0.027265319600701332, "memory(GiB)": 21.48, "step": 11128, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.952511 }, { "epoch": 0.36153071500503525, "grad_norm": 0.4351257085800171, "learning_rate": 7.573443084845171e-06, "loss": 0.026427600532770157, "memory(GiB)": 21.48, "step": 11129, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.952523 }, { "epoch": 0.36156320046779067, "grad_norm": 0.4640214443206787, "learning_rate": 7.572982527327622e-06, "loss": 0.030633237212896347, "memory(GiB)": 21.48, "step": 11130, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.952535 }, { "epoch": 0.3615956859305461, "grad_norm": 0.41428813338279724, "learning_rate": 7.57252194011443e-06, "loss": 0.025370951741933823, "memory(GiB)": 21.48, "step": 11131, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.952548 }, { "epoch": 0.3616281713933015, "grad_norm": 0.47446373105049133, "learning_rate": 7.5720613232109115e-06, "loss": 0.027503132820129395, "memory(GiB)": 21.48, "step": 11132, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95256 }, { "epoch": 0.3616606568560569, "grad_norm": 0.30264338850975037, "learning_rate": 7.571600676622381e-06, "loss": 0.020750388503074646, "memory(GiB)": 21.48, "step": 11133, "token_acc": 0.9966996699669967, "train_speed(iter/s)": 0.952572 }, { "epoch": 0.36169314231881233, "grad_norm": 0.4372638463973999, "learning_rate": 7.571140000354157e-06, "loss": 0.019767167046666145, "memory(GiB)": 21.48, "step": 11134, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.952584 }, { "epoch": 0.36172562778156775, "grad_norm": 0.5707616209983826, "learning_rate": 7.570679294411554e-06, "loss": 0.025213059037923813, "memory(GiB)": 21.48, "step": 11135, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.952598 }, { "epoch": 0.36175811324432317, "grad_norm": 0.24528123438358307, "learning_rate": 7.570218558799891e-06, "loss": 0.01698736846446991, "memory(GiB)": 21.48, "step": 11136, "token_acc": 1.0, "train_speed(iter/s)": 0.95261 }, { "epoch": 0.3617905987070786, "grad_norm": 0.5336721539497375, "learning_rate": 7.569757793524485e-06, "loss": 0.028831878677010536, "memory(GiB)": 21.48, "step": 11137, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.952623 }, { "epoch": 0.361823084169834, "grad_norm": 0.4670853316783905, "learning_rate": 7.569296998590654e-06, "loss": 0.037010565400123596, "memory(GiB)": 21.48, "step": 11138, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.952638 }, { "epoch": 0.3618555696325894, "grad_norm": 0.3229161500930786, "learning_rate": 7.5688361740037155e-06, "loss": 0.016877662390470505, "memory(GiB)": 21.48, "step": 11139, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.952657 }, { "epoch": 0.36188805509534483, "grad_norm": 0.3774798810482025, "learning_rate": 7.568375319768988e-06, "loss": 0.024800773710012436, "memory(GiB)": 21.48, "step": 11140, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.952676 }, { "epoch": 0.36192054055810025, "grad_norm": 0.3847992420196533, "learning_rate": 7.56791443589179e-06, "loss": 0.022385720163583755, "memory(GiB)": 21.48, "step": 11141, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.952695 }, { "epoch": 0.36195302602085566, "grad_norm": 0.3657284379005432, "learning_rate": 7.5674535223774435e-06, "loss": 0.03292207047343254, "memory(GiB)": 21.48, "step": 11142, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.952714 }, { "epoch": 0.3619855114836111, "grad_norm": 0.4594847559928894, "learning_rate": 7.5669925792312646e-06, "loss": 0.03310853987932205, "memory(GiB)": 21.48, "step": 11143, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.952733 }, { "epoch": 0.3620179969463665, "grad_norm": 0.4465893805027008, "learning_rate": 7.566531606458576e-06, "loss": 0.028160300105810165, "memory(GiB)": 21.48, "step": 11144, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.952753 }, { "epoch": 0.3620504824091219, "grad_norm": 0.3356051445007324, "learning_rate": 7.5660706040646955e-06, "loss": 0.026569869369268417, "memory(GiB)": 21.48, "step": 11145, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.952772 }, { "epoch": 0.3620829678718773, "grad_norm": 1.0265003442764282, "learning_rate": 7.565609572054946e-06, "loss": 0.028180228546261787, "memory(GiB)": 21.48, "step": 11146, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.952791 }, { "epoch": 0.36211545333463274, "grad_norm": 0.5466146469116211, "learning_rate": 7.5651485104346454e-06, "loss": 0.030537821352481842, "memory(GiB)": 21.48, "step": 11147, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.952811 }, { "epoch": 0.36214793879738816, "grad_norm": 0.4616595208644867, "learning_rate": 7.564687419209119e-06, "loss": 0.020038725808262825, "memory(GiB)": 21.48, "step": 11148, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.95283 }, { "epoch": 0.3621804242601436, "grad_norm": 0.5039592981338501, "learning_rate": 7.5642262983836846e-06, "loss": 0.030971821397542953, "memory(GiB)": 21.48, "step": 11149, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952849 }, { "epoch": 0.362212909722899, "grad_norm": 0.36375024914741516, "learning_rate": 7.563765147963666e-06, "loss": 0.02909410372376442, "memory(GiB)": 21.48, "step": 11150, "token_acc": 1.0, "train_speed(iter/s)": 0.952868 }, { "epoch": 0.3622453951856544, "grad_norm": 0.3211314082145691, "learning_rate": 7.563303967954385e-06, "loss": 0.023037943989038467, "memory(GiB)": 21.48, "step": 11151, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.952887 }, { "epoch": 0.3622778806484098, "grad_norm": 0.48570194840431213, "learning_rate": 7.5628427583611665e-06, "loss": 0.03566737473011017, "memory(GiB)": 21.48, "step": 11152, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952906 }, { "epoch": 0.36231036611116524, "grad_norm": 0.5142902135848999, "learning_rate": 7.56238151918933e-06, "loss": 0.0295025035738945, "memory(GiB)": 21.48, "step": 11153, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.952926 }, { "epoch": 0.36234285157392065, "grad_norm": 0.5345895886421204, "learning_rate": 7.561920250444199e-06, "loss": 0.03315453976392746, "memory(GiB)": 21.48, "step": 11154, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.952945 }, { "epoch": 0.36237533703667607, "grad_norm": 0.4279131591320038, "learning_rate": 7.5614589521311e-06, "loss": 0.03326743096113205, "memory(GiB)": 21.48, "step": 11155, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.952964 }, { "epoch": 0.3624078224994315, "grad_norm": 0.46130502223968506, "learning_rate": 7.560997624255355e-06, "loss": 0.025270473212003708, "memory(GiB)": 21.48, "step": 11156, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95298 }, { "epoch": 0.3624403079621869, "grad_norm": 0.35581183433532715, "learning_rate": 7.560536266822287e-06, "loss": 0.0282368753105402, "memory(GiB)": 21.48, "step": 11157, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.952999 }, { "epoch": 0.3624727934249423, "grad_norm": 0.2913742661476135, "learning_rate": 7.560074879837225e-06, "loss": 0.024551719427108765, "memory(GiB)": 21.48, "step": 11158, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.953018 }, { "epoch": 0.36250527888769773, "grad_norm": 0.37055420875549316, "learning_rate": 7.559613463305491e-06, "loss": 0.03248785808682442, "memory(GiB)": 21.48, "step": 11159, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.953037 }, { "epoch": 0.36253776435045315, "grad_norm": 0.4958726167678833, "learning_rate": 7.559152017232409e-06, "loss": 0.03228551521897316, "memory(GiB)": 21.48, "step": 11160, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.953056 }, { "epoch": 0.36257024981320857, "grad_norm": 0.4046424925327301, "learning_rate": 7.558690541623308e-06, "loss": 0.02728932723402977, "memory(GiB)": 21.48, "step": 11161, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.953074 }, { "epoch": 0.362602735275964, "grad_norm": 0.5275179147720337, "learning_rate": 7.558229036483513e-06, "loss": 0.03241237252950668, "memory(GiB)": 21.48, "step": 11162, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.953093 }, { "epoch": 0.3626352207387194, "grad_norm": 0.39556875824928284, "learning_rate": 7.5577675018183495e-06, "loss": 0.024404995143413544, "memory(GiB)": 21.48, "step": 11163, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953112 }, { "epoch": 0.3626677062014748, "grad_norm": 0.2850051522254944, "learning_rate": 7.557305937633144e-06, "loss": 0.019638795405626297, "memory(GiB)": 21.48, "step": 11164, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953132 }, { "epoch": 0.36270019166423023, "grad_norm": 0.4365612268447876, "learning_rate": 7.556844343933224e-06, "loss": 0.029109865427017212, "memory(GiB)": 21.48, "step": 11165, "token_acc": 1.0, "train_speed(iter/s)": 0.953151 }, { "epoch": 0.36273267712698565, "grad_norm": 0.4900135397911072, "learning_rate": 7.556382720723919e-06, "loss": 0.03369652107357979, "memory(GiB)": 21.48, "step": 11166, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95317 }, { "epoch": 0.36276516258974106, "grad_norm": 0.28896865248680115, "learning_rate": 7.555921068010553e-06, "loss": 0.019496329128742218, "memory(GiB)": 21.48, "step": 11167, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.953187 }, { "epoch": 0.36279764805249654, "grad_norm": 0.4605247974395752, "learning_rate": 7.555459385798456e-06, "loss": 0.026140807196497917, "memory(GiB)": 21.48, "step": 11168, "token_acc": 1.0, "train_speed(iter/s)": 0.953204 }, { "epoch": 0.36283013351525195, "grad_norm": 0.5025503635406494, "learning_rate": 7.554997674092958e-06, "loss": 0.03362828493118286, "memory(GiB)": 21.48, "step": 11169, "token_acc": 1.0, "train_speed(iter/s)": 0.953223 }, { "epoch": 0.36286261897800737, "grad_norm": 1.2753167152404785, "learning_rate": 7.554535932899385e-06, "loss": 0.01948927342891693, "memory(GiB)": 21.48, "step": 11170, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.953242 }, { "epoch": 0.3628951044407628, "grad_norm": 0.3832559287548065, "learning_rate": 7.554074162223067e-06, "loss": 0.0260603204369545, "memory(GiB)": 21.48, "step": 11171, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.953261 }, { "epoch": 0.3629275899035182, "grad_norm": 0.337427020072937, "learning_rate": 7.553612362069334e-06, "loss": 0.025915788486599922, "memory(GiB)": 21.48, "step": 11172, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95328 }, { "epoch": 0.3629600753662736, "grad_norm": 0.2885228991508484, "learning_rate": 7.553150532443514e-06, "loss": 0.022784272208809853, "memory(GiB)": 21.48, "step": 11173, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.953299 }, { "epoch": 0.36299256082902903, "grad_norm": 0.45286932587623596, "learning_rate": 7.552688673350941e-06, "loss": 0.025748852640390396, "memory(GiB)": 21.48, "step": 11174, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953316 }, { "epoch": 0.36302504629178445, "grad_norm": 0.5537194013595581, "learning_rate": 7.552226784796941e-06, "loss": 0.029697535559535027, "memory(GiB)": 21.48, "step": 11175, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.953335 }, { "epoch": 0.36305753175453986, "grad_norm": 0.35046449303627014, "learning_rate": 7.551764866786849e-06, "loss": 0.025801243260502815, "memory(GiB)": 21.48, "step": 11176, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953352 }, { "epoch": 0.3630900172172953, "grad_norm": 0.5131091475486755, "learning_rate": 7.5513029193259935e-06, "loss": 0.022089969366788864, "memory(GiB)": 21.48, "step": 11177, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.953367 }, { "epoch": 0.3631225026800507, "grad_norm": 0.4035913050174713, "learning_rate": 7.550840942419706e-06, "loss": 0.02593635767698288, "memory(GiB)": 21.48, "step": 11178, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953383 }, { "epoch": 0.3631549881428061, "grad_norm": 0.5422457456588745, "learning_rate": 7.55037893607332e-06, "loss": 0.030187061056494713, "memory(GiB)": 21.48, "step": 11179, "token_acc": 1.0, "train_speed(iter/s)": 0.953397 }, { "epoch": 0.36318747360556153, "grad_norm": 0.5398083925247192, "learning_rate": 7.549916900292165e-06, "loss": 0.03622712939977646, "memory(GiB)": 21.48, "step": 11180, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.953409 }, { "epoch": 0.36321995906831694, "grad_norm": 0.41323405504226685, "learning_rate": 7.549454835081575e-06, "loss": 0.029824845492839813, "memory(GiB)": 21.48, "step": 11181, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.953422 }, { "epoch": 0.36325244453107236, "grad_norm": 0.47536417841911316, "learning_rate": 7.548992740446883e-06, "loss": 0.03662731498479843, "memory(GiB)": 21.48, "step": 11182, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953436 }, { "epoch": 0.3632849299938278, "grad_norm": 0.41147956252098083, "learning_rate": 7.548530616393424e-06, "loss": 0.027114402502775192, "memory(GiB)": 21.48, "step": 11183, "token_acc": 1.0, "train_speed(iter/s)": 0.953449 }, { "epoch": 0.3633174154565832, "grad_norm": 0.4276885390281677, "learning_rate": 7.548068462926528e-06, "loss": 0.022515328601002693, "memory(GiB)": 21.48, "step": 11184, "token_acc": 1.0, "train_speed(iter/s)": 0.953461 }, { "epoch": 0.3633499009193386, "grad_norm": 0.5308518409729004, "learning_rate": 7.547606280051531e-06, "loss": 0.03374404087662697, "memory(GiB)": 21.48, "step": 11185, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953473 }, { "epoch": 0.363382386382094, "grad_norm": 0.4268885850906372, "learning_rate": 7.547144067773766e-06, "loss": 0.025959551334381104, "memory(GiB)": 21.48, "step": 11186, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.953487 }, { "epoch": 0.36341487184484944, "grad_norm": 0.42560508847236633, "learning_rate": 7.546681826098568e-06, "loss": 0.023783039301633835, "memory(GiB)": 21.48, "step": 11187, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.9535 }, { "epoch": 0.36344735730760486, "grad_norm": 0.4236477315425873, "learning_rate": 7.546219555031273e-06, "loss": 0.027062419801950455, "memory(GiB)": 21.48, "step": 11188, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.953513 }, { "epoch": 0.3634798427703603, "grad_norm": 0.2939222753047943, "learning_rate": 7.545757254577214e-06, "loss": 0.01856943592429161, "memory(GiB)": 21.48, "step": 11189, "token_acc": 1.0, "train_speed(iter/s)": 0.953526 }, { "epoch": 0.3635123282331157, "grad_norm": 0.469167023897171, "learning_rate": 7.545294924741729e-06, "loss": 0.0281058456748724, "memory(GiB)": 21.48, "step": 11190, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953539 }, { "epoch": 0.3635448136958711, "grad_norm": 0.5039453506469727, "learning_rate": 7.5448325655301524e-06, "loss": 0.03064846433699131, "memory(GiB)": 21.48, "step": 11191, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.953552 }, { "epoch": 0.3635772991586265, "grad_norm": 0.3684757649898529, "learning_rate": 7.544370176947822e-06, "loss": 0.023671172559261322, "memory(GiB)": 21.48, "step": 11192, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.953565 }, { "epoch": 0.36360978462138194, "grad_norm": 0.42598456144332886, "learning_rate": 7.543907759000071e-06, "loss": 0.026757339015603065, "memory(GiB)": 21.48, "step": 11193, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.953578 }, { "epoch": 0.36364227008413735, "grad_norm": 0.28962862491607666, "learning_rate": 7.543445311692241e-06, "loss": 0.027524927631020546, "memory(GiB)": 21.48, "step": 11194, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953591 }, { "epoch": 0.36367475554689277, "grad_norm": 0.4202868938446045, "learning_rate": 7.542982835029667e-06, "loss": 0.0304437056183815, "memory(GiB)": 21.48, "step": 11195, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.953603 }, { "epoch": 0.3637072410096482, "grad_norm": 0.7268503904342651, "learning_rate": 7.542520329017684e-06, "loss": 0.0317174568772316, "memory(GiB)": 21.48, "step": 11196, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953615 }, { "epoch": 0.3637397264724036, "grad_norm": 0.4685031473636627, "learning_rate": 7.5420577936616325e-06, "loss": 0.027653899043798447, "memory(GiB)": 21.48, "step": 11197, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.953627 }, { "epoch": 0.363772211935159, "grad_norm": 1.6398426294326782, "learning_rate": 7.541595228966852e-06, "loss": 0.02917628362774849, "memory(GiB)": 21.48, "step": 11198, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95364 }, { "epoch": 0.36380469739791443, "grad_norm": 0.39845192432403564, "learning_rate": 7.5411326349386795e-06, "loss": 0.02313256822526455, "memory(GiB)": 21.48, "step": 11199, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.953656 }, { "epoch": 0.36383718286066985, "grad_norm": 0.3394814431667328, "learning_rate": 7.540670011582453e-06, "loss": 0.03061695210635662, "memory(GiB)": 21.48, "step": 11200, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953673 }, { "epoch": 0.36386966832342527, "grad_norm": 0.4085811674594879, "learning_rate": 7.540207358903513e-06, "loss": 0.024289527907967567, "memory(GiB)": 21.48, "step": 11201, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.95369 }, { "epoch": 0.3639021537861807, "grad_norm": 0.40859800577163696, "learning_rate": 7.539744676907199e-06, "loss": 0.032127611339092255, "memory(GiB)": 21.48, "step": 11202, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.953708 }, { "epoch": 0.3639346392489361, "grad_norm": 0.4209331274032593, "learning_rate": 7.539281965598853e-06, "loss": 0.02409069985151291, "memory(GiB)": 21.48, "step": 11203, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.953726 }, { "epoch": 0.3639671247116915, "grad_norm": 0.535330057144165, "learning_rate": 7.538819224983811e-06, "loss": 0.03930323198437691, "memory(GiB)": 21.48, "step": 11204, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.953746 }, { "epoch": 0.36399961017444693, "grad_norm": 0.5755064487457275, "learning_rate": 7.538356455067417e-06, "loss": 0.025164805352687836, "memory(GiB)": 21.48, "step": 11205, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953764 }, { "epoch": 0.36403209563720235, "grad_norm": 0.3585813045501709, "learning_rate": 7.537893655855012e-06, "loss": 0.024670280516147614, "memory(GiB)": 21.48, "step": 11206, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953783 }, { "epoch": 0.36406458109995776, "grad_norm": 0.5115139484405518, "learning_rate": 7.537430827351935e-06, "loss": 0.023066503927111626, "memory(GiB)": 21.48, "step": 11207, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.953802 }, { "epoch": 0.3640970665627132, "grad_norm": 0.34273067116737366, "learning_rate": 7.536967969563529e-06, "loss": 0.02682490274310112, "memory(GiB)": 21.48, "step": 11208, "token_acc": 0.996, "train_speed(iter/s)": 0.953822 }, { "epoch": 0.3641295520254686, "grad_norm": 0.5737704634666443, "learning_rate": 7.536505082495138e-06, "loss": 0.024196717888116837, "memory(GiB)": 21.48, "step": 11209, "token_acc": 0.975, "train_speed(iter/s)": 0.953841 }, { "epoch": 0.364162037488224, "grad_norm": 0.3316200077533722, "learning_rate": 7.5360421661521e-06, "loss": 0.023416686803102493, "memory(GiB)": 21.48, "step": 11210, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.3641945229509794, "grad_norm": 0.37718600034713745, "learning_rate": 7.535579220539759e-06, "loss": 0.027470573782920837, "memory(GiB)": 21.48, "step": 11211, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953881 }, { "epoch": 0.36422700841373484, "grad_norm": 0.42621910572052, "learning_rate": 7.535116245663461e-06, "loss": 0.030802570283412933, "memory(GiB)": 21.48, "step": 11212, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.953899 }, { "epoch": 0.36425949387649026, "grad_norm": 0.372708797454834, "learning_rate": 7.534653241528547e-06, "loss": 0.023571904748678207, "memory(GiB)": 21.48, "step": 11213, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.953917 }, { "epoch": 0.3642919793392457, "grad_norm": 0.328316867351532, "learning_rate": 7.534190208140359e-06, "loss": 0.024204764515161514, "memory(GiB)": 21.48, "step": 11214, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.953936 }, { "epoch": 0.3643244648020011, "grad_norm": 0.47415801882743835, "learning_rate": 7.533727145504246e-06, "loss": 0.02645125798881054, "memory(GiB)": 21.48, "step": 11215, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953955 }, { "epoch": 0.3643569502647565, "grad_norm": 0.46038153767585754, "learning_rate": 7.5332640536255485e-06, "loss": 0.029683206230401993, "memory(GiB)": 21.48, "step": 11216, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.953974 }, { "epoch": 0.3643894357275119, "grad_norm": 0.7651655673980713, "learning_rate": 7.532800932509612e-06, "loss": 0.03059866651892662, "memory(GiB)": 21.48, "step": 11217, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.953992 }, { "epoch": 0.36442192119026734, "grad_norm": 0.429210364818573, "learning_rate": 7.532337782161781e-06, "loss": 0.03156380355358124, "memory(GiB)": 21.48, "step": 11218, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954012 }, { "epoch": 0.36445440665302276, "grad_norm": 0.5222322344779968, "learning_rate": 7.5318746025874e-06, "loss": 0.034869588911533356, "memory(GiB)": 21.48, "step": 11219, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95403 }, { "epoch": 0.36448689211577817, "grad_norm": 0.4198702275753021, "learning_rate": 7.531411393791819e-06, "loss": 0.033163297921419144, "memory(GiB)": 21.48, "step": 11220, "token_acc": 0.9828178694158075, "train_speed(iter/s)": 0.954049 }, { "epoch": 0.3645193775785336, "grad_norm": 0.34670543670654297, "learning_rate": 7.530948155780379e-06, "loss": 0.024318277835845947, "memory(GiB)": 21.48, "step": 11221, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.364551863041289, "grad_norm": 0.4636530876159668, "learning_rate": 7.530484888558429e-06, "loss": 0.03352297469973564, "memory(GiB)": 21.48, "step": 11222, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.954085 }, { "epoch": 0.3645843485040444, "grad_norm": 0.393398255109787, "learning_rate": 7.530021592131317e-06, "loss": 0.03292054682970047, "memory(GiB)": 21.48, "step": 11223, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954104 }, { "epoch": 0.36461683396679984, "grad_norm": 0.39228859543800354, "learning_rate": 7.529558266504387e-06, "loss": 0.027562854811549187, "memory(GiB)": 21.48, "step": 11224, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.954122 }, { "epoch": 0.36464931942955525, "grad_norm": 0.3168638348579407, "learning_rate": 7.529094911682988e-06, "loss": 0.027118651196360588, "memory(GiB)": 21.48, "step": 11225, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954142 }, { "epoch": 0.36468180489231067, "grad_norm": 0.34193018078804016, "learning_rate": 7.528631527672468e-06, "loss": 0.021254394203424454, "memory(GiB)": 21.48, "step": 11226, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95416 }, { "epoch": 0.3647142903550661, "grad_norm": 0.4596995413303375, "learning_rate": 7.528168114478174e-06, "loss": 0.03125233203172684, "memory(GiB)": 21.48, "step": 11227, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.954177 }, { "epoch": 0.3647467758178215, "grad_norm": 0.4073205292224884, "learning_rate": 7.527704672105455e-06, "loss": 0.026804551482200623, "memory(GiB)": 21.48, "step": 11228, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.954196 }, { "epoch": 0.3647792612805769, "grad_norm": 0.3591534197330475, "learning_rate": 7.527241200559661e-06, "loss": 0.02526823617517948, "memory(GiB)": 21.48, "step": 11229, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.36481174674333233, "grad_norm": 0.4188431203365326, "learning_rate": 7.526777699846137e-06, "loss": 0.031804464757442474, "memory(GiB)": 21.48, "step": 11230, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.36484423220608775, "grad_norm": 0.29561948776245117, "learning_rate": 7.526314169970237e-06, "loss": 0.01908676140010357, "memory(GiB)": 21.48, "step": 11231, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.3648767176688432, "grad_norm": 0.3863189220428467, "learning_rate": 7.52585061093731e-06, "loss": 0.028661180287599564, "memory(GiB)": 21.48, "step": 11232, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.95427 }, { "epoch": 0.36490920313159864, "grad_norm": 0.574134349822998, "learning_rate": 7.525387022752703e-06, "loss": 0.040353771299123764, "memory(GiB)": 21.48, "step": 11233, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.954287 }, { "epoch": 0.36494168859435405, "grad_norm": 0.3618174195289612, "learning_rate": 7.524923405421769e-06, "loss": 0.03462981432676315, "memory(GiB)": 21.48, "step": 11234, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.954306 }, { "epoch": 0.36497417405710947, "grad_norm": 0.2921423614025116, "learning_rate": 7.524459758949858e-06, "loss": 0.02029665932059288, "memory(GiB)": 21.48, "step": 11235, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954322 }, { "epoch": 0.3650066595198649, "grad_norm": 0.3711392283439636, "learning_rate": 7.523996083342322e-06, "loss": 0.02930408902466297, "memory(GiB)": 21.48, "step": 11236, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.95434 }, { "epoch": 0.3650391449826203, "grad_norm": 0.43299782276153564, "learning_rate": 7.5235323786045126e-06, "loss": 0.03169926255941391, "memory(GiB)": 21.48, "step": 11237, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.954359 }, { "epoch": 0.3650716304453757, "grad_norm": 0.33019787073135376, "learning_rate": 7.523068644741779e-06, "loss": 0.023111477494239807, "memory(GiB)": 21.48, "step": 11238, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954377 }, { "epoch": 0.36510411590813113, "grad_norm": 0.5998063683509827, "learning_rate": 7.522604881759477e-06, "loss": 0.03029034473001957, "memory(GiB)": 21.48, "step": 11239, "token_acc": 0.9896193771626297, "train_speed(iter/s)": 0.954396 }, { "epoch": 0.36513660137088655, "grad_norm": 0.35789361596107483, "learning_rate": 7.522141089662955e-06, "loss": 0.02817879617214203, "memory(GiB)": 21.48, "step": 11240, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954409 }, { "epoch": 0.36516908683364196, "grad_norm": 0.464205265045166, "learning_rate": 7.521677268457569e-06, "loss": 0.022849712520837784, "memory(GiB)": 21.48, "step": 11241, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95442 }, { "epoch": 0.3652015722963974, "grad_norm": 0.3609946370124817, "learning_rate": 7.52121341814867e-06, "loss": 0.028311103582382202, "memory(GiB)": 21.48, "step": 11242, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954433 }, { "epoch": 0.3652340577591528, "grad_norm": 0.37519198656082153, "learning_rate": 7.520749538741614e-06, "loss": 0.026854882016777992, "memory(GiB)": 21.48, "step": 11243, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.3652665432219082, "grad_norm": 0.40955302119255066, "learning_rate": 7.520285630241751e-06, "loss": 0.02813388593494892, "memory(GiB)": 21.48, "step": 11244, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95446 }, { "epoch": 0.36529902868466363, "grad_norm": 0.5163887739181519, "learning_rate": 7.5198216926544396e-06, "loss": 0.02350451424717903, "memory(GiB)": 21.48, "step": 11245, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954472 }, { "epoch": 0.36533151414741905, "grad_norm": 0.31299322843551636, "learning_rate": 7.519357725985032e-06, "loss": 0.018487513065338135, "memory(GiB)": 21.48, "step": 11246, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.954485 }, { "epoch": 0.36536399961017446, "grad_norm": 0.30990833044052124, "learning_rate": 7.518893730238881e-06, "loss": 0.022349059581756592, "memory(GiB)": 21.48, "step": 11247, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.954498 }, { "epoch": 0.3653964850729299, "grad_norm": 0.42439424991607666, "learning_rate": 7.518429705421346e-06, "loss": 0.027009673416614532, "memory(GiB)": 21.48, "step": 11248, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954509 }, { "epoch": 0.3654289705356853, "grad_norm": 0.35525310039520264, "learning_rate": 7.517965651537778e-06, "loss": 0.024570975452661514, "memory(GiB)": 21.48, "step": 11249, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954522 }, { "epoch": 0.3654614559984407, "grad_norm": 0.3652830719947815, "learning_rate": 7.517501568593535e-06, "loss": 0.029615553095936775, "memory(GiB)": 21.48, "step": 11250, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954534 }, { "epoch": 0.3654939414611961, "grad_norm": 0.3950863778591156, "learning_rate": 7.517037456593974e-06, "loss": 0.027813713997602463, "memory(GiB)": 21.48, "step": 11251, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.954546 }, { "epoch": 0.36552642692395154, "grad_norm": 0.4753406345844269, "learning_rate": 7.516573315544451e-06, "loss": 0.029366418719291687, "memory(GiB)": 21.48, "step": 11252, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95456 }, { "epoch": 0.36555891238670696, "grad_norm": 0.46835649013519287, "learning_rate": 7.516109145450323e-06, "loss": 0.02411707490682602, "memory(GiB)": 21.48, "step": 11253, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954572 }, { "epoch": 0.3655913978494624, "grad_norm": 0.40844422578811646, "learning_rate": 7.515644946316944e-06, "loss": 0.02917690947651863, "memory(GiB)": 21.48, "step": 11254, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954584 }, { "epoch": 0.3656238833122178, "grad_norm": 0.42596662044525146, "learning_rate": 7.515180718149677e-06, "loss": 0.029603034257888794, "memory(GiB)": 21.48, "step": 11255, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.954595 }, { "epoch": 0.3656563687749732, "grad_norm": 0.5928332209587097, "learning_rate": 7.5147164609538745e-06, "loss": 0.027507953345775604, "memory(GiB)": 21.48, "step": 11256, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.954607 }, { "epoch": 0.3656888542377286, "grad_norm": 0.3966730833053589, "learning_rate": 7.514252174734898e-06, "loss": 0.031194912269711494, "memory(GiB)": 21.48, "step": 11257, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.36572133970048404, "grad_norm": 0.3695860803127289, "learning_rate": 7.513787859498105e-06, "loss": 0.0228715930134058, "memory(GiB)": 21.48, "step": 11258, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.954632 }, { "epoch": 0.36575382516323945, "grad_norm": 0.46874111890792847, "learning_rate": 7.513323515248854e-06, "loss": 0.030711840838193893, "memory(GiB)": 21.48, "step": 11259, "token_acc": 1.0, "train_speed(iter/s)": 0.954644 }, { "epoch": 0.36578631062599487, "grad_norm": 0.43414321541786194, "learning_rate": 7.512859141992505e-06, "loss": 0.03256838396191597, "memory(GiB)": 21.48, "step": 11260, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954659 }, { "epoch": 0.3658187960887503, "grad_norm": 0.3409872055053711, "learning_rate": 7.5123947397344165e-06, "loss": 0.030056504532694817, "memory(GiB)": 21.48, "step": 11261, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954674 }, { "epoch": 0.3658512815515057, "grad_norm": 0.3007972240447998, "learning_rate": 7.511930308479949e-06, "loss": 0.019634973257780075, "memory(GiB)": 21.48, "step": 11262, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.954691 }, { "epoch": 0.3658837670142611, "grad_norm": 0.5183420181274414, "learning_rate": 7.51146584823446e-06, "loss": 0.03318540006875992, "memory(GiB)": 21.48, "step": 11263, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954699 }, { "epoch": 0.36591625247701653, "grad_norm": 0.3896534740924835, "learning_rate": 7.511001359003316e-06, "loss": 0.02945708855986595, "memory(GiB)": 21.48, "step": 11264, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954718 }, { "epoch": 0.36594873793977195, "grad_norm": 0.5578521490097046, "learning_rate": 7.510536840791871e-06, "loss": 0.028079260140657425, "memory(GiB)": 21.48, "step": 11265, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.36598122340252737, "grad_norm": 0.3511140048503876, "learning_rate": 7.51007229360549e-06, "loss": 0.0293746255338192, "memory(GiB)": 21.48, "step": 11266, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.954754 }, { "epoch": 0.3660137088652828, "grad_norm": 0.33740970492362976, "learning_rate": 7.5096077174495365e-06, "loss": 0.022323036566376686, "memory(GiB)": 21.48, "step": 11267, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954773 }, { "epoch": 0.3660461943280382, "grad_norm": 0.3712589740753174, "learning_rate": 7.5091431123293666e-06, "loss": 0.02848317101597786, "memory(GiB)": 21.48, "step": 11268, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.954789 }, { "epoch": 0.3660786797907936, "grad_norm": 0.4794325530529022, "learning_rate": 7.508678478250347e-06, "loss": 0.03159052133560181, "memory(GiB)": 21.48, "step": 11269, "token_acc": 1.0, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.36611116525354903, "grad_norm": 0.5874074697494507, "learning_rate": 7.5082138152178384e-06, "loss": 0.027415238320827484, "memory(GiB)": 21.48, "step": 11270, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954825 }, { "epoch": 0.36614365071630445, "grad_norm": 0.47524532675743103, "learning_rate": 7.5077491232372045e-06, "loss": 0.027473796159029007, "memory(GiB)": 21.48, "step": 11271, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954844 }, { "epoch": 0.36617613617905986, "grad_norm": 0.29717519879341125, "learning_rate": 7.507284402313807e-06, "loss": 0.025511017069220543, "memory(GiB)": 21.48, "step": 11272, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.3662086216418153, "grad_norm": 0.47798362374305725, "learning_rate": 7.50681965245301e-06, "loss": 0.024814872071146965, "memory(GiB)": 21.48, "step": 11273, "token_acc": 1.0, "train_speed(iter/s)": 0.954881 }, { "epoch": 0.3662411071045707, "grad_norm": 0.5374142527580261, "learning_rate": 7.50635487366018e-06, "loss": 0.02556092105805874, "memory(GiB)": 21.48, "step": 11274, "token_acc": 1.0, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.3662735925673261, "grad_norm": 0.428378701210022, "learning_rate": 7.505890065940676e-06, "loss": 0.031644150614738464, "memory(GiB)": 21.48, "step": 11275, "token_acc": 0.984, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.36630607803008153, "grad_norm": 0.6564247608184814, "learning_rate": 7.5054252292998676e-06, "loss": 0.02768784947693348, "memory(GiB)": 21.48, "step": 11276, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954932 }, { "epoch": 0.36633856349283694, "grad_norm": 0.5588319897651672, "learning_rate": 7.504960363743116e-06, "loss": 0.04664478078484535, "memory(GiB)": 21.48, "step": 11277, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.36637104895559236, "grad_norm": 0.5525262355804443, "learning_rate": 7.5044954692757874e-06, "loss": 0.03769662603735924, "memory(GiB)": 21.48, "step": 11278, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.3664035344183478, "grad_norm": 0.44358521699905396, "learning_rate": 7.504030545903248e-06, "loss": 0.02802913263440132, "memory(GiB)": 21.48, "step": 11279, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954987 }, { "epoch": 0.3664360198811032, "grad_norm": 0.42207223176956177, "learning_rate": 7.503565593630863e-06, "loss": 0.028920430690050125, "memory(GiB)": 21.48, "step": 11280, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.3664685053438586, "grad_norm": 0.7915191650390625, "learning_rate": 7.503100612463999e-06, "loss": 0.03967508673667908, "memory(GiB)": 21.48, "step": 11281, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.955024 }, { "epoch": 0.366500990806614, "grad_norm": 0.29862168431282043, "learning_rate": 7.502635602408023e-06, "loss": 0.031500935554504395, "memory(GiB)": 21.48, "step": 11282, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.36653347626936944, "grad_norm": 0.3731590509414673, "learning_rate": 7.502170563468301e-06, "loss": 0.024149933829903603, "memory(GiB)": 21.48, "step": 11283, "token_acc": 1.0, "train_speed(iter/s)": 0.95506 }, { "epoch": 0.36656596173212486, "grad_norm": 0.4559820294380188, "learning_rate": 7.501705495650199e-06, "loss": 0.0254482664167881, "memory(GiB)": 21.48, "step": 11284, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.3665984471948803, "grad_norm": 0.38357049226760864, "learning_rate": 7.501240398959087e-06, "loss": 0.026282358914613724, "memory(GiB)": 21.48, "step": 11285, "token_acc": 1.0, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.3666309326576357, "grad_norm": 0.2810610830783844, "learning_rate": 7.500775273400331e-06, "loss": 0.023758091032505035, "memory(GiB)": 21.48, "step": 11286, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.3666634181203911, "grad_norm": 0.31461960077285767, "learning_rate": 7.500310118979302e-06, "loss": 0.023936517536640167, "memory(GiB)": 21.48, "step": 11287, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.3666959035831465, "grad_norm": 0.3805987238883972, "learning_rate": 7.499844935701363e-06, "loss": 0.02806181088089943, "memory(GiB)": 21.48, "step": 11288, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.36672838904590194, "grad_norm": 0.430366575717926, "learning_rate": 7.499379723571888e-06, "loss": 0.029750464484095573, "memory(GiB)": 21.48, "step": 11289, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.36676087450865735, "grad_norm": 0.514606773853302, "learning_rate": 7.498914482596244e-06, "loss": 0.03728075325489044, "memory(GiB)": 21.48, "step": 11290, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.36679335997141277, "grad_norm": 0.3833642899990082, "learning_rate": 7.498449212779802e-06, "loss": 0.02706748992204666, "memory(GiB)": 21.48, "step": 11291, "token_acc": 0.988, "train_speed(iter/s)": 0.955203 }, { "epoch": 0.3668258454341682, "grad_norm": 0.32704782485961914, "learning_rate": 7.497983914127929e-06, "loss": 0.023785244673490524, "memory(GiB)": 21.48, "step": 11292, "token_acc": 1.0, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.3668583308969236, "grad_norm": 0.3357328474521637, "learning_rate": 7.4975185866459965e-06, "loss": 0.03043610416352749, "memory(GiB)": 21.48, "step": 11293, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.366890816359679, "grad_norm": 0.29135861992836, "learning_rate": 7.497053230339377e-06, "loss": 0.024562858045101166, "memory(GiB)": 21.48, "step": 11294, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.36692330182243443, "grad_norm": 0.2953820526599884, "learning_rate": 7.496587845213439e-06, "loss": 0.02523241750895977, "memory(GiB)": 21.48, "step": 11295, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.955274 }, { "epoch": 0.3669557872851899, "grad_norm": 0.3182544410228729, "learning_rate": 7.4961224312735525e-06, "loss": 0.022870037704706192, "memory(GiB)": 21.48, "step": 11296, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955291 }, { "epoch": 0.3669882727479453, "grad_norm": 0.5149585008621216, "learning_rate": 7.495656988525093e-06, "loss": 0.02817184291779995, "memory(GiB)": 21.48, "step": 11297, "token_acc": 1.0, "train_speed(iter/s)": 0.95531 }, { "epoch": 0.36702075821070074, "grad_norm": 0.39205247163772583, "learning_rate": 7.495191516973429e-06, "loss": 0.03883980214595795, "memory(GiB)": 21.48, "step": 11298, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.955328 }, { "epoch": 0.36705324367345615, "grad_norm": 0.41261371970176697, "learning_rate": 7.494726016623935e-06, "loss": 0.02614334225654602, "memory(GiB)": 21.48, "step": 11299, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.36708572913621157, "grad_norm": 0.31908372044563293, "learning_rate": 7.494260487481981e-06, "loss": 0.026127934455871582, "memory(GiB)": 21.48, "step": 11300, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.367118214598967, "grad_norm": 0.37909257411956787, "learning_rate": 7.493794929552942e-06, "loss": 0.02977374568581581, "memory(GiB)": 21.48, "step": 11301, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.3671507000617224, "grad_norm": 0.3346223831176758, "learning_rate": 7.493329342842192e-06, "loss": 0.024201786145567894, "memory(GiB)": 21.48, "step": 11302, "token_acc": 0.985, "train_speed(iter/s)": 0.955392 }, { "epoch": 0.3671831855244778, "grad_norm": 0.47613590955734253, "learning_rate": 7.4928637273551e-06, "loss": 0.03474746644496918, "memory(GiB)": 21.48, "step": 11303, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.955405 }, { "epoch": 0.36721567098723323, "grad_norm": 0.4229261875152588, "learning_rate": 7.492398083097043e-06, "loss": 0.028201701119542122, "memory(GiB)": 21.48, "step": 11304, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955417 }, { "epoch": 0.36724815644998865, "grad_norm": 0.42190223932266235, "learning_rate": 7.491932410073394e-06, "loss": 0.029536809772253036, "memory(GiB)": 21.48, "step": 11305, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955429 }, { "epoch": 0.36728064191274407, "grad_norm": 0.5152167677879333, "learning_rate": 7.491466708289531e-06, "loss": 0.019021617248654366, "memory(GiB)": 21.48, "step": 11306, "token_acc": 1.0, "train_speed(iter/s)": 0.955442 }, { "epoch": 0.3673131273754995, "grad_norm": 0.3853268623352051, "learning_rate": 7.491000977750824e-06, "loss": 0.028851887211203575, "memory(GiB)": 21.48, "step": 11307, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955455 }, { "epoch": 0.3673456128382549, "grad_norm": 0.3886658251285553, "learning_rate": 7.490535218462651e-06, "loss": 0.02685493230819702, "memory(GiB)": 21.48, "step": 11308, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.3673780983010103, "grad_norm": 0.33163076639175415, "learning_rate": 7.490069430430386e-06, "loss": 0.02382257767021656, "memory(GiB)": 21.48, "step": 11309, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.36741058376376573, "grad_norm": 0.3742114305496216, "learning_rate": 7.489603613659406e-06, "loss": 0.017822779715061188, "memory(GiB)": 21.48, "step": 11310, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.36744306922652115, "grad_norm": 0.5342077016830444, "learning_rate": 7.489137768155086e-06, "loss": 0.032276272773742676, "memory(GiB)": 21.48, "step": 11311, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.36747555468927656, "grad_norm": 0.4652078151702881, "learning_rate": 7.488671893922803e-06, "loss": 0.036790795624256134, "memory(GiB)": 21.48, "step": 11312, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.367508040152032, "grad_norm": 0.23933760821819305, "learning_rate": 7.488205990967936e-06, "loss": 0.019806653261184692, "memory(GiB)": 21.48, "step": 11313, "token_acc": 1.0, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.3675405256147874, "grad_norm": 0.5114074349403381, "learning_rate": 7.4877400592958585e-06, "loss": 0.030203934758901596, "memory(GiB)": 21.48, "step": 11314, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.3675730110775428, "grad_norm": 0.5086953639984131, "learning_rate": 7.48727409891195e-06, "loss": 0.03390795737504959, "memory(GiB)": 21.48, "step": 11315, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.3676054965402982, "grad_norm": 0.4984106421470642, "learning_rate": 7.486808109821587e-06, "loss": 0.03536922484636307, "memory(GiB)": 21.48, "step": 11316, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.36763798200305364, "grad_norm": 0.5530914664268494, "learning_rate": 7.486342092030149e-06, "loss": 0.03482482582330704, "memory(GiB)": 21.48, "step": 11317, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.36767046746580906, "grad_norm": 0.41246140003204346, "learning_rate": 7.485876045543014e-06, "loss": 0.02706540748476982, "memory(GiB)": 21.48, "step": 11318, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955589 }, { "epoch": 0.3677029529285645, "grad_norm": 0.32891374826431274, "learning_rate": 7.485409970365561e-06, "loss": 0.026574086397886276, "memory(GiB)": 21.48, "step": 11319, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.955602 }, { "epoch": 0.3677354383913199, "grad_norm": 0.3868054449558258, "learning_rate": 7.484943866503168e-06, "loss": 0.03319842740893364, "memory(GiB)": 21.48, "step": 11320, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.3677679238540753, "grad_norm": 0.40879398584365845, "learning_rate": 7.484477733961215e-06, "loss": 0.029936956241726875, "memory(GiB)": 21.48, "step": 11321, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.955626 }, { "epoch": 0.3678004093168307, "grad_norm": 0.44040513038635254, "learning_rate": 7.484011572745085e-06, "loss": 0.03288378566503525, "memory(GiB)": 21.48, "step": 11322, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.36783289477958614, "grad_norm": 0.31281012296676636, "learning_rate": 7.483545382860151e-06, "loss": 0.02962583675980568, "memory(GiB)": 21.48, "step": 11323, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.36786538024234156, "grad_norm": 0.35831671953201294, "learning_rate": 7.483079164311802e-06, "loss": 0.024429742246866226, "memory(GiB)": 21.48, "step": 11324, "token_acc": 1.0, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.36789786570509697, "grad_norm": 0.4312194287776947, "learning_rate": 7.482612917105412e-06, "loss": 0.02910817787051201, "memory(GiB)": 21.48, "step": 11325, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.3679303511678524, "grad_norm": 0.27484166622161865, "learning_rate": 7.482146641246364e-06, "loss": 0.024113085120916367, "memory(GiB)": 21.48, "step": 11326, "token_acc": 1.0, "train_speed(iter/s)": 0.955698 }, { "epoch": 0.3679628366306078, "grad_norm": 0.43650102615356445, "learning_rate": 7.4816803367400405e-06, "loss": 0.03162505477666855, "memory(GiB)": 21.48, "step": 11327, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.3679953220933632, "grad_norm": 0.3571232557296753, "learning_rate": 7.481214003591823e-06, "loss": 0.033365096896886826, "memory(GiB)": 21.48, "step": 11328, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.955734 }, { "epoch": 0.36802780755611864, "grad_norm": 0.5085047483444214, "learning_rate": 7.480747641807094e-06, "loss": 0.029379678890109062, "memory(GiB)": 21.48, "step": 11329, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.36806029301887405, "grad_norm": 0.38125309348106384, "learning_rate": 7.480281251391235e-06, "loss": 0.031390801072120667, "memory(GiB)": 21.48, "step": 11330, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.36809277848162947, "grad_norm": 0.36534202098846436, "learning_rate": 7.47981483234963e-06, "loss": 0.02318638190627098, "memory(GiB)": 21.48, "step": 11331, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.3681252639443849, "grad_norm": 0.3294358551502228, "learning_rate": 7.47934838468766e-06, "loss": 0.02173847332596779, "memory(GiB)": 21.48, "step": 11332, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.3681577494071403, "grad_norm": 0.2789645195007324, "learning_rate": 7.47888190841071e-06, "loss": 0.02473365142941475, "memory(GiB)": 21.48, "step": 11333, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.3681902348698957, "grad_norm": 0.6903741359710693, "learning_rate": 7.478415403524164e-06, "loss": 0.03354811668395996, "memory(GiB)": 21.48, "step": 11334, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955834 }, { "epoch": 0.36822272033265113, "grad_norm": 0.2825223505496979, "learning_rate": 7.477948870033405e-06, "loss": 0.024419602006673813, "memory(GiB)": 21.48, "step": 11335, "token_acc": 0.988, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.36825520579540655, "grad_norm": 0.3903163969516754, "learning_rate": 7.477482307943818e-06, "loss": 0.017632436007261276, "memory(GiB)": 21.48, "step": 11336, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.36828769125816196, "grad_norm": 0.3821725845336914, "learning_rate": 7.477015717260786e-06, "loss": 0.02590223401784897, "memory(GiB)": 21.48, "step": 11337, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955885 }, { "epoch": 0.3683201767209174, "grad_norm": 0.35018134117126465, "learning_rate": 7.476549097989698e-06, "loss": 0.0239991694688797, "memory(GiB)": 21.48, "step": 11338, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.3683526621836728, "grad_norm": 0.28941604495048523, "learning_rate": 7.476082450135935e-06, "loss": 0.023741338402032852, "memory(GiB)": 21.48, "step": 11339, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.3683851476464282, "grad_norm": 0.3737947344779968, "learning_rate": 7.475615773704888e-06, "loss": 0.021498562768101692, "memory(GiB)": 21.48, "step": 11340, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.36841763310918363, "grad_norm": 0.534296452999115, "learning_rate": 7.475149068701938e-06, "loss": 0.03704327344894409, "memory(GiB)": 21.48, "step": 11341, "token_acc": 0.992, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.36845011857193904, "grad_norm": 0.34859248995780945, "learning_rate": 7.4746823351324715e-06, "loss": 0.031144525855779648, "memory(GiB)": 21.48, "step": 11342, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.36848260403469446, "grad_norm": 0.39424410462379456, "learning_rate": 7.4742155730018796e-06, "loss": 0.020010871812701225, "memory(GiB)": 21.48, "step": 11343, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.3685150894974499, "grad_norm": 1.204293131828308, "learning_rate": 7.473748782315546e-06, "loss": 0.032160162925720215, "memory(GiB)": 21.48, "step": 11344, "token_acc": 1.0, "train_speed(iter/s)": 0.956004 }, { "epoch": 0.3685475749602053, "grad_norm": 0.4662057161331177, "learning_rate": 7.47328196307886e-06, "loss": 0.026297949254512787, "memory(GiB)": 21.48, "step": 11345, "token_acc": 1.0, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.3685800604229607, "grad_norm": 0.45925280451774597, "learning_rate": 7.4728151152972075e-06, "loss": 0.032934676855802536, "memory(GiB)": 21.48, "step": 11346, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.3686125458857161, "grad_norm": 0.5039734840393066, "learning_rate": 7.472348238975976e-06, "loss": 0.028777532279491425, "memory(GiB)": 21.48, "step": 11347, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.956055 }, { "epoch": 0.36864503134847154, "grad_norm": 0.33492350578308105, "learning_rate": 7.471881334120556e-06, "loss": 0.021991468966007233, "memory(GiB)": 21.48, "step": 11348, "token_acc": 1.0, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.36867751681122696, "grad_norm": 0.5684168934822083, "learning_rate": 7.471414400736337e-06, "loss": 0.033330220729112625, "memory(GiB)": 21.48, "step": 11349, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.3687100022739824, "grad_norm": 0.7562024593353271, "learning_rate": 7.470947438828704e-06, "loss": 0.034229379147291183, "memory(GiB)": 21.48, "step": 11350, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.3687424877367378, "grad_norm": 0.4510992169380188, "learning_rate": 7.4704804484030506e-06, "loss": 0.026712041348218918, "memory(GiB)": 21.48, "step": 11351, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956124 }, { "epoch": 0.3687749731994932, "grad_norm": 0.37395867705345154, "learning_rate": 7.470013429464763e-06, "loss": 0.027167711406946182, "memory(GiB)": 21.48, "step": 11352, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956142 }, { "epoch": 0.3688074586622486, "grad_norm": 0.5239317417144775, "learning_rate": 7.469546382019233e-06, "loss": 0.02908455580472946, "memory(GiB)": 21.48, "step": 11353, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956159 }, { "epoch": 0.36883994412500404, "grad_norm": 0.5529485940933228, "learning_rate": 7.4690793060718516e-06, "loss": 0.037366822361946106, "memory(GiB)": 21.48, "step": 11354, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.36887242958775945, "grad_norm": 0.5320346355438232, "learning_rate": 7.468612201628009e-06, "loss": 0.02646634727716446, "memory(GiB)": 21.48, "step": 11355, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.36890491505051487, "grad_norm": 0.49629688262939453, "learning_rate": 7.468145068693094e-06, "loss": 0.026630394160747528, "memory(GiB)": 21.48, "step": 11356, "token_acc": 1.0, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.3689374005132703, "grad_norm": 0.37109076976776123, "learning_rate": 7.4676779072725025e-06, "loss": 0.02699725516140461, "memory(GiB)": 21.48, "step": 11357, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956228 }, { "epoch": 0.3689698859760257, "grad_norm": 0.4862096309661865, "learning_rate": 7.4672107173716205e-06, "loss": 0.029858335852622986, "memory(GiB)": 21.48, "step": 11358, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956245 }, { "epoch": 0.3690023714387811, "grad_norm": 0.5474688410758972, "learning_rate": 7.466743498995846e-06, "loss": 0.030465666204690933, "memory(GiB)": 21.48, "step": 11359, "token_acc": 0.9875, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.3690348569015366, "grad_norm": 0.5056914687156677, "learning_rate": 7.466276252150566e-06, "loss": 0.026384759694337845, "memory(GiB)": 21.48, "step": 11360, "token_acc": 0.9831460674157303, "train_speed(iter/s)": 0.956277 }, { "epoch": 0.369067342364292, "grad_norm": 0.4959923326969147, "learning_rate": 7.465808976841176e-06, "loss": 0.03242628276348114, "memory(GiB)": 21.48, "step": 11361, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.3690998278270474, "grad_norm": 0.3177104890346527, "learning_rate": 7.465341673073069e-06, "loss": 0.0205992441624403, "memory(GiB)": 21.48, "step": 11362, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956303 }, { "epoch": 0.36913231328980284, "grad_norm": 0.7633737921714783, "learning_rate": 7.4648743408516365e-06, "loss": 0.031082285568118095, "memory(GiB)": 21.48, "step": 11363, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.36916479875255825, "grad_norm": 0.6890237331390381, "learning_rate": 7.464406980182275e-06, "loss": 0.031599875539541245, "memory(GiB)": 21.48, "step": 11364, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.36919728421531367, "grad_norm": 0.3985535800457001, "learning_rate": 7.463939591070376e-06, "loss": 0.030404014512896538, "memory(GiB)": 21.48, "step": 11365, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.3692297696780691, "grad_norm": 0.35213303565979004, "learning_rate": 7.463472173521336e-06, "loss": 0.02567056007683277, "memory(GiB)": 21.48, "step": 11366, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.3692622551408245, "grad_norm": 0.352439820766449, "learning_rate": 7.463004727540546e-06, "loss": 0.02764301188290119, "memory(GiB)": 21.48, "step": 11367, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.3692947406035799, "grad_norm": 0.3558160066604614, "learning_rate": 7.462537253133404e-06, "loss": 0.030522409826517105, "memory(GiB)": 21.48, "step": 11368, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956377 }, { "epoch": 0.36932722606633533, "grad_norm": 0.32952702045440674, "learning_rate": 7.462069750305305e-06, "loss": 0.031804561614990234, "memory(GiB)": 21.48, "step": 11369, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.36935971152909075, "grad_norm": 0.5417152643203735, "learning_rate": 7.461602219061643e-06, "loss": 0.043450258672237396, "memory(GiB)": 21.48, "step": 11370, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.36939219699184617, "grad_norm": 0.3316991329193115, "learning_rate": 7.461134659407816e-06, "loss": 0.016518335789442062, "memory(GiB)": 21.48, "step": 11371, "token_acc": 1.0, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.3694246824546016, "grad_norm": 0.6755481362342834, "learning_rate": 7.460667071349219e-06, "loss": 0.04468420147895813, "memory(GiB)": 21.48, "step": 11372, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.369457167917357, "grad_norm": 0.26363635063171387, "learning_rate": 7.460199454891248e-06, "loss": 0.017084721475839615, "memory(GiB)": 21.48, "step": 11373, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956437 }, { "epoch": 0.3694896533801124, "grad_norm": 0.26563483476638794, "learning_rate": 7.459731810039302e-06, "loss": 0.02694479376077652, "memory(GiB)": 21.48, "step": 11374, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.36952213884286783, "grad_norm": 0.4263480603694916, "learning_rate": 7.459264136798776e-06, "loss": 0.026842283084988594, "memory(GiB)": 21.48, "step": 11375, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.36955462430562325, "grad_norm": 0.30301305651664734, "learning_rate": 7.458796435175071e-06, "loss": 0.028451528400182724, "memory(GiB)": 21.48, "step": 11376, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.36958710976837866, "grad_norm": 0.37168723344802856, "learning_rate": 7.4583287051735796e-06, "loss": 0.01945953071117401, "memory(GiB)": 21.48, "step": 11377, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.3696195952311341, "grad_norm": 0.457019567489624, "learning_rate": 7.457860946799704e-06, "loss": 0.03391974791884422, "memory(GiB)": 21.48, "step": 11378, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.3696520806938895, "grad_norm": 0.4080842435359955, "learning_rate": 7.457393160058841e-06, "loss": 0.030054517090320587, "memory(GiB)": 21.48, "step": 11379, "token_acc": 0.992, "train_speed(iter/s)": 0.956497 }, { "epoch": 0.3696845661566449, "grad_norm": 0.32640987634658813, "learning_rate": 7.456925344956392e-06, "loss": 0.027297982946038246, "memory(GiB)": 21.48, "step": 11380, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.3697170516194003, "grad_norm": 0.716627299785614, "learning_rate": 7.4564575014977525e-06, "loss": 0.02675073780119419, "memory(GiB)": 21.48, "step": 11381, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.36974953708215574, "grad_norm": 0.4227887690067291, "learning_rate": 7.455989629688325e-06, "loss": 0.02749025821685791, "memory(GiB)": 21.48, "step": 11382, "token_acc": 1.0, "train_speed(iter/s)": 0.956538 }, { "epoch": 0.36978202254491116, "grad_norm": 0.3912953734397888, "learning_rate": 7.455521729533508e-06, "loss": 0.02083105780184269, "memory(GiB)": 21.48, "step": 11383, "token_acc": 1.0, "train_speed(iter/s)": 0.956548 }, { "epoch": 0.3698145080076666, "grad_norm": 0.4128171503543854, "learning_rate": 7.455053801038702e-06, "loss": 0.030450675636529922, "memory(GiB)": 21.48, "step": 11384, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.369846993470422, "grad_norm": 0.6317918300628662, "learning_rate": 7.454585844209307e-06, "loss": 0.035468894988298416, "memory(GiB)": 21.48, "step": 11385, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.3698794789331774, "grad_norm": 0.32292380928993225, "learning_rate": 7.454117859050725e-06, "loss": 0.021662551909685135, "memory(GiB)": 21.48, "step": 11386, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956589 }, { "epoch": 0.3699119643959328, "grad_norm": 0.5643246173858643, "learning_rate": 7.453649845568356e-06, "loss": 0.021297167986631393, "memory(GiB)": 21.48, "step": 11387, "token_acc": 1.0, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.36994444985868824, "grad_norm": 0.3025934398174286, "learning_rate": 7.453181803767602e-06, "loss": 0.028238175436854362, "memory(GiB)": 21.48, "step": 11388, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.36997693532144366, "grad_norm": 0.5149878263473511, "learning_rate": 7.452713733653863e-06, "loss": 0.02242526412010193, "memory(GiB)": 21.48, "step": 11389, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.3700094207841991, "grad_norm": 0.4396653175354004, "learning_rate": 7.452245635232545e-06, "loss": 0.029064837843179703, "memory(GiB)": 21.48, "step": 11390, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.3700419062469545, "grad_norm": 1.087377905845642, "learning_rate": 7.451777508509048e-06, "loss": 0.03134534880518913, "memory(GiB)": 21.48, "step": 11391, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.3700743917097099, "grad_norm": 0.4005744457244873, "learning_rate": 7.451309353488777e-06, "loss": 0.025691457092761993, "memory(GiB)": 21.48, "step": 11392, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956688 }, { "epoch": 0.3701068771724653, "grad_norm": 0.3601824939250946, "learning_rate": 7.4508411701771315e-06, "loss": 0.027156494557857513, "memory(GiB)": 21.48, "step": 11393, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.956706 }, { "epoch": 0.37013936263522074, "grad_norm": 0.394999623298645, "learning_rate": 7.450372958579517e-06, "loss": 0.027769874781370163, "memory(GiB)": 21.48, "step": 11394, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.37017184809797615, "grad_norm": 0.30618321895599365, "learning_rate": 7.449904718701337e-06, "loss": 0.019357023760676384, "memory(GiB)": 21.48, "step": 11395, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956738 }, { "epoch": 0.37020433356073157, "grad_norm": 0.441760778427124, "learning_rate": 7.4494364505479965e-06, "loss": 0.0378568172454834, "memory(GiB)": 21.48, "step": 11396, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.370236819023487, "grad_norm": 1.106649398803711, "learning_rate": 7.448968154124899e-06, "loss": 0.026593469083309174, "memory(GiB)": 21.48, "step": 11397, "token_acc": 1.0, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.3702693044862424, "grad_norm": 0.3872416913509369, "learning_rate": 7.4484998294374485e-06, "loss": 0.026366591453552246, "memory(GiB)": 21.48, "step": 11398, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.3703017899489978, "grad_norm": 0.5097939968109131, "learning_rate": 7.448031476491052e-06, "loss": 0.028607668355107307, "memory(GiB)": 21.48, "step": 11399, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.37033427541175323, "grad_norm": 0.47417694330215454, "learning_rate": 7.4475630952911135e-06, "loss": 0.02930673584342003, "memory(GiB)": 21.48, "step": 11400, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.37036676087450865, "grad_norm": 0.43750545382499695, "learning_rate": 7.4470946858430395e-06, "loss": 0.027349252253770828, "memory(GiB)": 21.48, "step": 11401, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956842 }, { "epoch": 0.37039924633726407, "grad_norm": 0.384196013212204, "learning_rate": 7.446626248152236e-06, "loss": 0.030685458332300186, "memory(GiB)": 21.48, "step": 11402, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.3704317318000195, "grad_norm": 0.2946685254573822, "learning_rate": 7.446157782224108e-06, "loss": 0.015325237065553665, "memory(GiB)": 21.48, "step": 11403, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.3704642172627749, "grad_norm": 0.2730054557323456, "learning_rate": 7.445689288064064e-06, "loss": 0.021569840610027313, "memory(GiB)": 21.48, "step": 11404, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956894 }, { "epoch": 0.3704967027255303, "grad_norm": 0.38757428526878357, "learning_rate": 7.44522076567751e-06, "loss": 0.023607365787029266, "memory(GiB)": 21.48, "step": 11405, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.37052918818828573, "grad_norm": 0.4223320782184601, "learning_rate": 7.444752215069855e-06, "loss": 0.027414320036768913, "memory(GiB)": 21.48, "step": 11406, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.37056167365104115, "grad_norm": 0.4317869544029236, "learning_rate": 7.444283636246503e-06, "loss": 0.03924812003970146, "memory(GiB)": 21.48, "step": 11407, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.37059415911379656, "grad_norm": 0.3222111761569977, "learning_rate": 7.443815029212868e-06, "loss": 0.021093860268592834, "memory(GiB)": 21.48, "step": 11408, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.370626644576552, "grad_norm": 0.38095712661743164, "learning_rate": 7.443346393974353e-06, "loss": 0.02539847604930401, "memory(GiB)": 21.48, "step": 11409, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.3706591300393074, "grad_norm": 0.3189629316329956, "learning_rate": 7.442877730536369e-06, "loss": 0.026384001597762108, "memory(GiB)": 21.48, "step": 11410, "token_acc": 1.0, "train_speed(iter/s)": 0.957 }, { "epoch": 0.3706916155020628, "grad_norm": 0.38272950053215027, "learning_rate": 7.442409038904324e-06, "loss": 0.026309475302696228, "memory(GiB)": 21.48, "step": 11411, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.3707241009648182, "grad_norm": 1.180943250656128, "learning_rate": 7.441940319083627e-06, "loss": 0.03588538244366646, "memory(GiB)": 21.48, "step": 11412, "token_acc": 0.995, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.37075658642757364, "grad_norm": 0.43953341245651245, "learning_rate": 7.441471571079691e-06, "loss": 0.03595144301652908, "memory(GiB)": 21.48, "step": 11413, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957053 }, { "epoch": 0.37078907189032906, "grad_norm": 0.4027959704399109, "learning_rate": 7.44100279489792e-06, "loss": 0.026151243597269058, "memory(GiB)": 21.48, "step": 11414, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.3708215573530845, "grad_norm": 0.3387700915336609, "learning_rate": 7.44053399054373e-06, "loss": 0.016796398907899857, "memory(GiB)": 21.48, "step": 11415, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.3708540428158399, "grad_norm": 0.42075902223587036, "learning_rate": 7.4400651580225295e-06, "loss": 0.03349975496530533, "memory(GiB)": 21.48, "step": 11416, "token_acc": 1.0, "train_speed(iter/s)": 0.957108 }, { "epoch": 0.3708865282785953, "grad_norm": 0.4201176166534424, "learning_rate": 7.439596297339729e-06, "loss": 0.03144511952996254, "memory(GiB)": 21.48, "step": 11417, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.3709190137413507, "grad_norm": 0.5178898572921753, "learning_rate": 7.4391274085007404e-06, "loss": 0.02470868080854416, "memory(GiB)": 21.48, "step": 11418, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.37095149920410614, "grad_norm": 0.5098183751106262, "learning_rate": 7.438658491510974e-06, "loss": 0.028455737978219986, "memory(GiB)": 21.48, "step": 11419, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.37098398466686155, "grad_norm": 0.29665178060531616, "learning_rate": 7.438189546375844e-06, "loss": 0.014371432363986969, "memory(GiB)": 21.48, "step": 11420, "token_acc": 1.0, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.37101647012961697, "grad_norm": 0.4319189786911011, "learning_rate": 7.4377205731007604e-06, "loss": 0.02582147717475891, "memory(GiB)": 21.48, "step": 11421, "token_acc": 1.0, "train_speed(iter/s)": 0.957182 }, { "epoch": 0.3710489555923724, "grad_norm": 0.4141804277896881, "learning_rate": 7.437251571691138e-06, "loss": 0.02592313662171364, "memory(GiB)": 21.48, "step": 11422, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957197 }, { "epoch": 0.3710814410551278, "grad_norm": 0.36923080682754517, "learning_rate": 7.436782542152387e-06, "loss": 0.03266458958387375, "memory(GiB)": 21.48, "step": 11423, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.3711139265178833, "grad_norm": 0.5286357402801514, "learning_rate": 7.4363134844899235e-06, "loss": 0.02902338281273842, "memory(GiB)": 21.48, "step": 11424, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.3711464119806387, "grad_norm": 0.3942674398422241, "learning_rate": 7.43584439870916e-06, "loss": 0.020545417442917824, "memory(GiB)": 21.48, "step": 11425, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.3711788974433941, "grad_norm": 0.5044000744819641, "learning_rate": 7.435375284815508e-06, "loss": 0.02292974479496479, "memory(GiB)": 21.48, "step": 11426, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.3712113829061495, "grad_norm": 0.4873885214328766, "learning_rate": 7.434906142814385e-06, "loss": 0.033939179033041, "memory(GiB)": 21.48, "step": 11427, "token_acc": 0.976, "train_speed(iter/s)": 0.957262 }, { "epoch": 0.37124386836890494, "grad_norm": 0.686922550201416, "learning_rate": 7.434436972711205e-06, "loss": 0.03165903687477112, "memory(GiB)": 21.48, "step": 11428, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.957274 }, { "epoch": 0.37127635383166036, "grad_norm": 0.3944935202598572, "learning_rate": 7.4339677745113806e-06, "loss": 0.023799706250429153, "memory(GiB)": 21.48, "step": 11429, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.37130883929441577, "grad_norm": 0.6507464051246643, "learning_rate": 7.433498548220328e-06, "loss": 0.03057112917304039, "memory(GiB)": 21.48, "step": 11430, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.3713413247571712, "grad_norm": 0.35935625433921814, "learning_rate": 7.433029293843465e-06, "loss": 0.028896862640976906, "memory(GiB)": 21.48, "step": 11431, "token_acc": 1.0, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.3713738102199266, "grad_norm": 0.35355857014656067, "learning_rate": 7.432560011386205e-06, "loss": 0.02702806144952774, "memory(GiB)": 21.48, "step": 11432, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.371406295682682, "grad_norm": 0.6713797450065613, "learning_rate": 7.432090700853964e-06, "loss": 0.03631042689085007, "memory(GiB)": 21.48, "step": 11433, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.37143878114543744, "grad_norm": 0.6615903973579407, "learning_rate": 7.431621362252161e-06, "loss": 0.03159612417221069, "memory(GiB)": 21.48, "step": 11434, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.957347 }, { "epoch": 0.37147126660819285, "grad_norm": 0.5316032767295837, "learning_rate": 7.431151995586209e-06, "loss": 0.03106570988893509, "memory(GiB)": 21.48, "step": 11435, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.37150375207094827, "grad_norm": 0.5217832922935486, "learning_rate": 7.430682600861528e-06, "loss": 0.029343584552407265, "memory(GiB)": 21.48, "step": 11436, "token_acc": 0.9766666666666667, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.3715362375337037, "grad_norm": 1.0608856678009033, "learning_rate": 7.430213178083534e-06, "loss": 0.028993114829063416, "memory(GiB)": 21.48, "step": 11437, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957383 }, { "epoch": 0.3715687229964591, "grad_norm": 0.43977558612823486, "learning_rate": 7.4297437272576455e-06, "loss": 0.023414678871631622, "memory(GiB)": 21.48, "step": 11438, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.3716012084592145, "grad_norm": 0.40675753355026245, "learning_rate": 7.429274248389282e-06, "loss": 0.025243710726499557, "memory(GiB)": 21.48, "step": 11439, "token_acc": 1.0, "train_speed(iter/s)": 0.957407 }, { "epoch": 0.37163369392196993, "grad_norm": 0.3667863607406616, "learning_rate": 7.428804741483858e-06, "loss": 0.021695444360375404, "memory(GiB)": 21.48, "step": 11440, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.37166617938472535, "grad_norm": 0.39126214385032654, "learning_rate": 7.428335206546796e-06, "loss": 0.036589521914720535, "memory(GiB)": 21.48, "step": 11441, "token_acc": 0.9742647058823529, "train_speed(iter/s)": 0.957438 }, { "epoch": 0.37169866484748076, "grad_norm": 0.41688406467437744, "learning_rate": 7.427865643583513e-06, "loss": 0.030653174966573715, "memory(GiB)": 21.48, "step": 11442, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.3717311503102362, "grad_norm": 0.3602941334247589, "learning_rate": 7.427396052599429e-06, "loss": 0.02692500129342079, "memory(GiB)": 21.48, "step": 11443, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.3717636357729916, "grad_norm": 0.7642953395843506, "learning_rate": 7.426926433599963e-06, "loss": 0.024794239550828934, "memory(GiB)": 21.48, "step": 11444, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.371796121235747, "grad_norm": 1.8293777704238892, "learning_rate": 7.4264567865905375e-06, "loss": 0.030173368752002716, "memory(GiB)": 21.48, "step": 11445, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.37182860669850243, "grad_norm": 0.40926459431648254, "learning_rate": 7.42598711157657e-06, "loss": 0.03282573074102402, "memory(GiB)": 21.48, "step": 11446, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957509 }, { "epoch": 0.37186109216125784, "grad_norm": 0.40027526021003723, "learning_rate": 7.425517408563483e-06, "loss": 0.025967925786972046, "memory(GiB)": 21.48, "step": 11447, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.37189357762401326, "grad_norm": 0.43365147709846497, "learning_rate": 7.425047677556697e-06, "loss": 0.019915815442800522, "memory(GiB)": 21.48, "step": 11448, "token_acc": 1.0, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.3719260630867687, "grad_norm": 0.35261523723602295, "learning_rate": 7.424577918561632e-06, "loss": 0.021911054849624634, "memory(GiB)": 21.48, "step": 11449, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.957551 }, { "epoch": 0.3719585485495241, "grad_norm": 0.4688395857810974, "learning_rate": 7.4241081315837105e-06, "loss": 0.02553136646747589, "memory(GiB)": 21.48, "step": 11450, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.3719910340122795, "grad_norm": 0.3497682213783264, "learning_rate": 7.4236383166283545e-06, "loss": 0.025582529604434967, "memory(GiB)": 21.48, "step": 11451, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.3720235194750349, "grad_norm": 0.43760010600090027, "learning_rate": 7.423168473700988e-06, "loss": 0.026584792882204056, "memory(GiB)": 21.48, "step": 11452, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.37205600493779034, "grad_norm": 0.3891618549823761, "learning_rate": 7.422698602807032e-06, "loss": 0.029542692005634308, "memory(GiB)": 21.48, "step": 11453, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.37208849040054576, "grad_norm": 0.3922017812728882, "learning_rate": 7.422228703951907e-06, "loss": 0.03279689699411392, "memory(GiB)": 21.48, "step": 11454, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.3721209758633012, "grad_norm": 0.2399798035621643, "learning_rate": 7.421758777141042e-06, "loss": 0.019703447818756104, "memory(GiB)": 21.48, "step": 11455, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.3721534613260566, "grad_norm": 1.7080079317092896, "learning_rate": 7.421288822379855e-06, "loss": 0.018638975918293, "memory(GiB)": 21.48, "step": 11456, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.372185946788812, "grad_norm": 0.383391410112381, "learning_rate": 7.420818839673774e-06, "loss": 0.03088577836751938, "memory(GiB)": 21.48, "step": 11457, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.3722184322515674, "grad_norm": 0.8659613132476807, "learning_rate": 7.420348829028219e-06, "loss": 0.022182542830705643, "memory(GiB)": 21.48, "step": 11458, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957701 }, { "epoch": 0.37225091771432284, "grad_norm": 0.373558908700943, "learning_rate": 7.419878790448619e-06, "loss": 0.029058333486318588, "memory(GiB)": 21.48, "step": 11459, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.37228340317707825, "grad_norm": 0.44231846928596497, "learning_rate": 7.419408723940396e-06, "loss": 0.033395349979400635, "memory(GiB)": 21.48, "step": 11460, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.957734 }, { "epoch": 0.37231588863983367, "grad_norm": 0.37305963039398193, "learning_rate": 7.4189386295089775e-06, "loss": 0.02723018452525139, "memory(GiB)": 21.48, "step": 11461, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.3723483741025891, "grad_norm": 0.4035067856311798, "learning_rate": 7.418468507159787e-06, "loss": 0.028941379860043526, "memory(GiB)": 21.48, "step": 11462, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957771 }, { "epoch": 0.3723808595653445, "grad_norm": 0.3517431914806366, "learning_rate": 7.417998356898251e-06, "loss": 0.03043746016919613, "memory(GiB)": 21.48, "step": 11463, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.3724133450280999, "grad_norm": 0.3225865960121155, "learning_rate": 7.417528178729796e-06, "loss": 0.025255199521780014, "memory(GiB)": 21.48, "step": 11464, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.37244583049085533, "grad_norm": 0.4257725775241852, "learning_rate": 7.417057972659847e-06, "loss": 0.02894527092576027, "memory(GiB)": 21.48, "step": 11465, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.37247831595361075, "grad_norm": 0.3388199508190155, "learning_rate": 7.416587738693831e-06, "loss": 0.02731046825647354, "memory(GiB)": 21.48, "step": 11466, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.957841 }, { "epoch": 0.37251080141636617, "grad_norm": 0.3659067451953888, "learning_rate": 7.416117476837178e-06, "loss": 0.02549458108842373, "memory(GiB)": 21.48, "step": 11467, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.3725432868791216, "grad_norm": 0.5879424214363098, "learning_rate": 7.4156471870953115e-06, "loss": 0.029084086418151855, "memory(GiB)": 21.48, "step": 11468, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.372575772341877, "grad_norm": 1.9836344718933105, "learning_rate": 7.415176869473664e-06, "loss": 0.02259504795074463, "memory(GiB)": 21.48, "step": 11469, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957893 }, { "epoch": 0.3726082578046324, "grad_norm": 0.2881026566028595, "learning_rate": 7.4147065239776585e-06, "loss": 0.01886497437953949, "memory(GiB)": 21.48, "step": 11470, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957911 }, { "epoch": 0.37264074326738783, "grad_norm": 0.3615611493587494, "learning_rate": 7.414236150612727e-06, "loss": 0.02934946119785309, "memory(GiB)": 21.48, "step": 11471, "token_acc": 0.9740932642487047, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.37267322873014325, "grad_norm": 0.5311485528945923, "learning_rate": 7.4137657493842965e-06, "loss": 0.02872573956847191, "memory(GiB)": 21.48, "step": 11472, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.37270571419289866, "grad_norm": 0.3184417188167572, "learning_rate": 7.413295320297798e-06, "loss": 0.02666080743074417, "memory(GiB)": 21.48, "step": 11473, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.3727381996556541, "grad_norm": 1.1484615802764893, "learning_rate": 7.412824863358656e-06, "loss": 0.02307535521686077, "memory(GiB)": 21.48, "step": 11474, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.3727706851184095, "grad_norm": 0.4513656497001648, "learning_rate": 7.412354378572306e-06, "loss": 0.035893507301807404, "memory(GiB)": 21.48, "step": 11475, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.3728031705811649, "grad_norm": 0.2829630374908447, "learning_rate": 7.411883865944176e-06, "loss": 0.018021082505583763, "memory(GiB)": 21.48, "step": 11476, "token_acc": 1.0, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.3728356560439203, "grad_norm": 0.4405408799648285, "learning_rate": 7.411413325479695e-06, "loss": 0.02616315707564354, "memory(GiB)": 21.48, "step": 11477, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.37286814150667574, "grad_norm": 0.46756893396377563, "learning_rate": 7.410942757184296e-06, "loss": 0.02487233653664589, "memory(GiB)": 21.48, "step": 11478, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.37290062696943116, "grad_norm": 0.28751641511917114, "learning_rate": 7.410472161063407e-06, "loss": 0.017033690586686134, "memory(GiB)": 21.48, "step": 11479, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.958065 }, { "epoch": 0.3729331124321866, "grad_norm": 0.5113526582717896, "learning_rate": 7.410001537122462e-06, "loss": 0.025648929178714752, "memory(GiB)": 21.48, "step": 11480, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.372965597894942, "grad_norm": 0.436283677816391, "learning_rate": 7.40953088536689e-06, "loss": 0.025852389633655548, "memory(GiB)": 21.48, "step": 11481, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.95809 }, { "epoch": 0.3729980833576974, "grad_norm": 0.29895779490470886, "learning_rate": 7.409060205802125e-06, "loss": 0.024479711428284645, "memory(GiB)": 21.48, "step": 11482, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958105 }, { "epoch": 0.3730305688204528, "grad_norm": 0.5186010599136353, "learning_rate": 7.408589498433601e-06, "loss": 0.020312180742621422, "memory(GiB)": 21.48, "step": 11483, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.37306305428320824, "grad_norm": 0.3073790967464447, "learning_rate": 7.408118763266745e-06, "loss": 0.024867890402674675, "memory(GiB)": 21.48, "step": 11484, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95813 }, { "epoch": 0.37309553974596366, "grad_norm": 0.5059212446212769, "learning_rate": 7.407648000306995e-06, "loss": 0.029801547527313232, "memory(GiB)": 21.48, "step": 11485, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.958144 }, { "epoch": 0.37312802520871907, "grad_norm": 0.42255568504333496, "learning_rate": 7.407177209559783e-06, "loss": 0.025000864639878273, "memory(GiB)": 21.48, "step": 11486, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.3731605106714745, "grad_norm": 0.4612988233566284, "learning_rate": 7.406706391030541e-06, "loss": 0.03160285949707031, "memory(GiB)": 21.48, "step": 11487, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.958171 }, { "epoch": 0.37319299613422996, "grad_norm": 0.35008513927459717, "learning_rate": 7.406235544724705e-06, "loss": 0.02313031256198883, "memory(GiB)": 21.48, "step": 11488, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.3732254815969854, "grad_norm": 0.4352468252182007, "learning_rate": 7.4057646706477086e-06, "loss": 0.029088735580444336, "memory(GiB)": 21.48, "step": 11489, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958194 }, { "epoch": 0.3732579670597408, "grad_norm": 0.3099757134914398, "learning_rate": 7.405293768804984e-06, "loss": 0.02654494158923626, "memory(GiB)": 21.48, "step": 11490, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.958206 }, { "epoch": 0.3732904525224962, "grad_norm": 0.48278361558914185, "learning_rate": 7.404822839201968e-06, "loss": 0.041035138070583344, "memory(GiB)": 21.48, "step": 11491, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958217 }, { "epoch": 0.3733229379852516, "grad_norm": 0.39632031321525574, "learning_rate": 7.404351881844097e-06, "loss": 0.026795916259288788, "memory(GiB)": 21.48, "step": 11492, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.37335542344800704, "grad_norm": 0.37275731563568115, "learning_rate": 7.403880896736804e-06, "loss": 0.023934565484523773, "memory(GiB)": 21.48, "step": 11493, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.37338790891076246, "grad_norm": 0.47850501537323, "learning_rate": 7.403409883885526e-06, "loss": 0.03383517265319824, "memory(GiB)": 21.48, "step": 11494, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.958254 }, { "epoch": 0.37342039437351787, "grad_norm": 0.4960513114929199, "learning_rate": 7.4029388432956995e-06, "loss": 0.028359564021229744, "memory(GiB)": 21.48, "step": 11495, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.3734528798362733, "grad_norm": 0.45177483558654785, "learning_rate": 7.4024677749727615e-06, "loss": 0.03457501530647278, "memory(GiB)": 21.48, "step": 11496, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.958278 }, { "epoch": 0.3734853652990287, "grad_norm": 0.33375057578086853, "learning_rate": 7.401996678922146e-06, "loss": 0.014972390606999397, "memory(GiB)": 21.48, "step": 11497, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958291 }, { "epoch": 0.3735178507617841, "grad_norm": 1.1807323694229126, "learning_rate": 7.401525555149291e-06, "loss": 0.037267837673425674, "memory(GiB)": 21.48, "step": 11498, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.37355033622453954, "grad_norm": 0.2969566583633423, "learning_rate": 7.401054403659636e-06, "loss": 0.02025700733065605, "memory(GiB)": 21.48, "step": 11499, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.37358282168729495, "grad_norm": 0.4405827522277832, "learning_rate": 7.4005832244586176e-06, "loss": 0.03159160166978836, "memory(GiB)": 21.48, "step": 11500, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.958328 }, { "epoch": 0.37358282168729495, "eval_loss": 0.026915492489933968, "eval_runtime": 80.2654, "eval_samples_per_second": 123.964, "eval_steps_per_second": 3.875, "eval_token_acc": 0.9894717332770453, "step": 11500 }, { "epoch": 0.37361530715005037, "grad_norm": 0.4381368160247803, "learning_rate": 7.400112017551673e-06, "loss": 0.026148246601223946, "memory(GiB)": 21.48, "step": 11501, "token_acc": 0.9892618920436647, "train_speed(iter/s)": 0.951118 }, { "epoch": 0.3736477926128058, "grad_norm": 0.3332034647464752, "learning_rate": 7.399640782944242e-06, "loss": 0.021973881870508194, "memory(GiB)": 21.48, "step": 11502, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.95113 }, { "epoch": 0.3736802780755612, "grad_norm": 0.49711835384368896, "learning_rate": 7.399169520641761e-06, "loss": 0.030202489346265793, "memory(GiB)": 21.48, "step": 11503, "token_acc": 1.0, "train_speed(iter/s)": 0.951142 }, { "epoch": 0.3737127635383166, "grad_norm": 0.48205310106277466, "learning_rate": 7.398698230649672e-06, "loss": 0.030770156532526016, "memory(GiB)": 21.48, "step": 11504, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.951156 }, { "epoch": 0.37374524900107203, "grad_norm": 0.4474952816963196, "learning_rate": 7.398226912973412e-06, "loss": 0.027695968747138977, "memory(GiB)": 21.48, "step": 11505, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95117 }, { "epoch": 0.37377773446382745, "grad_norm": 1.557435154914856, "learning_rate": 7.397755567618422e-06, "loss": 0.027323223650455475, "memory(GiB)": 21.48, "step": 11506, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.951184 }, { "epoch": 0.37381021992658287, "grad_norm": 0.4215199053287506, "learning_rate": 7.397284194590141e-06, "loss": 0.023875588551163673, "memory(GiB)": 21.48, "step": 11507, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.951198 }, { "epoch": 0.3738427053893383, "grad_norm": 0.993920087814331, "learning_rate": 7.39681279389401e-06, "loss": 0.036241814494132996, "memory(GiB)": 21.48, "step": 11508, "token_acc": 0.9965753424657534, "train_speed(iter/s)": 0.951212 }, { "epoch": 0.3738751908520937, "grad_norm": 0.39984679222106934, "learning_rate": 7.396341365535468e-06, "loss": 0.02898494526743889, "memory(GiB)": 21.48, "step": 11509, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.951227 }, { "epoch": 0.3739076763148491, "grad_norm": 0.5709941983222961, "learning_rate": 7.395869909519959e-06, "loss": 0.03673045337200165, "memory(GiB)": 21.48, "step": 11510, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95124 }, { "epoch": 0.37394016177760453, "grad_norm": 0.317266047000885, "learning_rate": 7.39539842585292e-06, "loss": 0.022654512897133827, "memory(GiB)": 21.48, "step": 11511, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.951254 }, { "epoch": 0.37397264724035995, "grad_norm": 0.26331573724746704, "learning_rate": 7.394926914539795e-06, "loss": 0.017484594136476517, "memory(GiB)": 21.48, "step": 11512, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.951269 }, { "epoch": 0.37400513270311536, "grad_norm": 0.3031022250652313, "learning_rate": 7.394455375586027e-06, "loss": 0.021397225558757782, "memory(GiB)": 21.48, "step": 11513, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.951284 }, { "epoch": 0.3740376181658708, "grad_norm": 0.6631205081939697, "learning_rate": 7.3939838089970564e-06, "loss": 0.023930009454488754, "memory(GiB)": 21.48, "step": 11514, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.951303 }, { "epoch": 0.3740701036286262, "grad_norm": 0.3893982768058777, "learning_rate": 7.393512214778327e-06, "loss": 0.02649734541773796, "memory(GiB)": 21.48, "step": 11515, "token_acc": 1.0, "train_speed(iter/s)": 0.951322 }, { "epoch": 0.3741025890913816, "grad_norm": 0.5496358871459961, "learning_rate": 7.39304059293528e-06, "loss": 0.03711605817079544, "memory(GiB)": 21.48, "step": 11516, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.95134 }, { "epoch": 0.374135074554137, "grad_norm": 0.35465937852859497, "learning_rate": 7.392568943473361e-06, "loss": 0.026785466820001602, "memory(GiB)": 21.48, "step": 11517, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.951358 }, { "epoch": 0.37416756001689244, "grad_norm": 0.35881295800209045, "learning_rate": 7.3920972663980106e-06, "loss": 0.0286282766610384, "memory(GiB)": 21.48, "step": 11518, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.951376 }, { "epoch": 0.37420004547964786, "grad_norm": 1.182314157485962, "learning_rate": 7.391625561714673e-06, "loss": 0.031209081411361694, "memory(GiB)": 21.48, "step": 11519, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.951394 }, { "epoch": 0.3742325309424033, "grad_norm": 0.42308732867240906, "learning_rate": 7.391153829428796e-06, "loss": 0.030100956559181213, "memory(GiB)": 21.48, "step": 11520, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.951412 }, { "epoch": 0.3742650164051587, "grad_norm": 0.31853458285331726, "learning_rate": 7.390682069545818e-06, "loss": 0.023480691015720367, "memory(GiB)": 21.48, "step": 11521, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.95143 }, { "epoch": 0.3742975018679141, "grad_norm": 0.3943103551864624, "learning_rate": 7.3902102820711895e-06, "loss": 0.02085069939494133, "memory(GiB)": 21.48, "step": 11522, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.951449 }, { "epoch": 0.3743299873306695, "grad_norm": 0.39546817541122437, "learning_rate": 7.389738467010351e-06, "loss": 0.025260616093873978, "memory(GiB)": 21.48, "step": 11523, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.951468 }, { "epoch": 0.37436247279342494, "grad_norm": 0.44288742542266846, "learning_rate": 7.389266624368751e-06, "loss": 0.023480229079723358, "memory(GiB)": 21.48, "step": 11524, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.951487 }, { "epoch": 0.37439495825618035, "grad_norm": 0.366952508687973, "learning_rate": 7.388794754151834e-06, "loss": 0.023582424968481064, "memory(GiB)": 21.48, "step": 11525, "token_acc": 1.0, "train_speed(iter/s)": 0.951505 }, { "epoch": 0.37442744371893577, "grad_norm": 0.47759637236595154, "learning_rate": 7.388322856365047e-06, "loss": 0.02747296169400215, "memory(GiB)": 21.48, "step": 11526, "token_acc": 1.0, "train_speed(iter/s)": 0.951522 }, { "epoch": 0.3744599291816912, "grad_norm": 0.3729872405529022, "learning_rate": 7.387850931013834e-06, "loss": 0.027148019522428513, "memory(GiB)": 21.48, "step": 11527, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.951539 }, { "epoch": 0.3744924146444466, "grad_norm": 0.6536346673965454, "learning_rate": 7.387378978103641e-06, "loss": 0.041229866445064545, "memory(GiB)": 21.48, "step": 11528, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.951557 }, { "epoch": 0.374524900107202, "grad_norm": 0.4765472412109375, "learning_rate": 7.38690699763992e-06, "loss": 0.026772260665893555, "memory(GiB)": 21.48, "step": 11529, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.951576 }, { "epoch": 0.37455738556995744, "grad_norm": 0.40791258215904236, "learning_rate": 7.386434989628114e-06, "loss": 0.026065826416015625, "memory(GiB)": 21.48, "step": 11530, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.951594 }, { "epoch": 0.37458987103271285, "grad_norm": 0.3623366355895996, "learning_rate": 7.385962954073673e-06, "loss": 0.028679704293608665, "memory(GiB)": 21.48, "step": 11531, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.951609 }, { "epoch": 0.37462235649546827, "grad_norm": 0.41351643204689026, "learning_rate": 7.385490890982044e-06, "loss": 0.0268060564994812, "memory(GiB)": 21.48, "step": 11532, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.951625 }, { "epoch": 0.3746548419582237, "grad_norm": 0.3448392152786255, "learning_rate": 7.3850188003586744e-06, "loss": 0.02109473943710327, "memory(GiB)": 21.48, "step": 11533, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95164 }, { "epoch": 0.3746873274209791, "grad_norm": 0.39403510093688965, "learning_rate": 7.384546682209013e-06, "loss": 0.02466687746345997, "memory(GiB)": 21.48, "step": 11534, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.951654 }, { "epoch": 0.3747198128837345, "grad_norm": 0.4354369342327118, "learning_rate": 7.384074536538508e-06, "loss": 0.028005309402942657, "memory(GiB)": 21.48, "step": 11535, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95167 }, { "epoch": 0.37475229834648993, "grad_norm": 0.4430391192436218, "learning_rate": 7.383602363352612e-06, "loss": 0.02795172855257988, "memory(GiB)": 21.48, "step": 11536, "token_acc": 1.0, "train_speed(iter/s)": 0.951686 }, { "epoch": 0.37478478380924535, "grad_norm": 0.39016392827033997, "learning_rate": 7.383130162656771e-06, "loss": 0.030623000115156174, "memory(GiB)": 21.48, "step": 11537, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.951701 }, { "epoch": 0.37481726927200076, "grad_norm": 0.49743226170539856, "learning_rate": 7.382657934456436e-06, "loss": 0.025718096643686295, "memory(GiB)": 21.48, "step": 11538, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.951715 }, { "epoch": 0.3748497547347562, "grad_norm": 0.537737250328064, "learning_rate": 7.382185678757057e-06, "loss": 0.025467906147241592, "memory(GiB)": 21.48, "step": 11539, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.95173 }, { "epoch": 0.3748822401975116, "grad_norm": 0.27730804681777954, "learning_rate": 7.381713395564085e-06, "loss": 0.017636965960264206, "memory(GiB)": 21.48, "step": 11540, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.951745 }, { "epoch": 0.374914725660267, "grad_norm": 0.41382983326911926, "learning_rate": 7.381241084882969e-06, "loss": 0.029966289177536964, "memory(GiB)": 21.48, "step": 11541, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.951761 }, { "epoch": 0.37494721112302243, "grad_norm": 0.4572201073169708, "learning_rate": 7.3807687467191645e-06, "loss": 0.03381292149424553, "memory(GiB)": 21.48, "step": 11542, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.951776 }, { "epoch": 0.37497969658577784, "grad_norm": 0.3542305827140808, "learning_rate": 7.380296381078117e-06, "loss": 0.02501656301319599, "memory(GiB)": 21.48, "step": 11543, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95179 }, { "epoch": 0.37501218204853326, "grad_norm": 0.42880508303642273, "learning_rate": 7.379823987965282e-06, "loss": 0.029786519706249237, "memory(GiB)": 21.48, "step": 11544, "token_acc": 0.9875, "train_speed(iter/s)": 0.951804 }, { "epoch": 0.3750446675112887, "grad_norm": 0.3614870011806488, "learning_rate": 7.379351567386111e-06, "loss": 0.028786197304725647, "memory(GiB)": 21.48, "step": 11545, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.95182 }, { "epoch": 0.3750771529740441, "grad_norm": 0.5540465712547302, "learning_rate": 7.378879119346055e-06, "loss": 0.03010210022330284, "memory(GiB)": 21.48, "step": 11546, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.951835 }, { "epoch": 0.3751096384367995, "grad_norm": 0.4726272523403168, "learning_rate": 7.3784066438505685e-06, "loss": 0.026456940919160843, "memory(GiB)": 21.48, "step": 11547, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95185 }, { "epoch": 0.3751421238995549, "grad_norm": 0.35073626041412354, "learning_rate": 7.3779341409051046e-06, "loss": 0.021406671032309532, "memory(GiB)": 21.48, "step": 11548, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.951865 }, { "epoch": 0.37517460936231034, "grad_norm": 0.44762468338012695, "learning_rate": 7.377461610515114e-06, "loss": 0.028893569484353065, "memory(GiB)": 21.48, "step": 11549, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.95188 }, { "epoch": 0.37520709482506576, "grad_norm": 0.3388930857181549, "learning_rate": 7.376989052686053e-06, "loss": 0.025162022560834885, "memory(GiB)": 21.48, "step": 11550, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.951896 }, { "epoch": 0.3752395802878212, "grad_norm": 0.36083081364631653, "learning_rate": 7.3765164674233735e-06, "loss": 0.027780400589108467, "memory(GiB)": 21.48, "step": 11551, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.951908 }, { "epoch": 0.37527206575057664, "grad_norm": 0.393428236246109, "learning_rate": 7.376043854732533e-06, "loss": 0.024238845333456993, "memory(GiB)": 21.48, "step": 11552, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.951919 }, { "epoch": 0.37530455121333206, "grad_norm": 0.8361708521842957, "learning_rate": 7.375571214618981e-06, "loss": 0.022098232060670853, "memory(GiB)": 21.48, "step": 11553, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95193 }, { "epoch": 0.3753370366760875, "grad_norm": 0.36836808919906616, "learning_rate": 7.375098547088178e-06, "loss": 0.026835298165678978, "memory(GiB)": 21.48, "step": 11554, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95192 }, { "epoch": 0.3753695221388429, "grad_norm": 0.3975188732147217, "learning_rate": 7.374625852145575e-06, "loss": 0.02856956422328949, "memory(GiB)": 21.48, "step": 11555, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.951932 }, { "epoch": 0.3754020076015983, "grad_norm": 0.4603217542171478, "learning_rate": 7.37415312979663e-06, "loss": 0.02341686747968197, "memory(GiB)": 21.48, "step": 11556, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.951944 }, { "epoch": 0.3754344930643537, "grad_norm": 0.48828357458114624, "learning_rate": 7.3736803800467975e-06, "loss": 0.03222755342721939, "memory(GiB)": 21.48, "step": 11557, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.951957 }, { "epoch": 0.37546697852710914, "grad_norm": 1.5448050498962402, "learning_rate": 7.373207602901533e-06, "loss": 0.039421167224645615, "memory(GiB)": 21.48, "step": 11558, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95197 }, { "epoch": 0.37549946398986456, "grad_norm": 0.35618269443511963, "learning_rate": 7.372734798366294e-06, "loss": 0.026553474366664886, "memory(GiB)": 21.48, "step": 11559, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.951984 }, { "epoch": 0.37553194945262, "grad_norm": 0.3955874741077423, "learning_rate": 7.372261966446538e-06, "loss": 0.024008259177207947, "memory(GiB)": 21.48, "step": 11560, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.951997 }, { "epoch": 0.3755644349153754, "grad_norm": 0.7068026661872864, "learning_rate": 7.371789107147722e-06, "loss": 0.04209523648023605, "memory(GiB)": 21.48, "step": 11561, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.95201 }, { "epoch": 0.3755969203781308, "grad_norm": 0.8664793372154236, "learning_rate": 7.371316220475303e-06, "loss": 0.030867334455251694, "memory(GiB)": 21.48, "step": 11562, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.952025 }, { "epoch": 0.3756294058408862, "grad_norm": 0.38475194573402405, "learning_rate": 7.370843306434739e-06, "loss": 0.025461511686444283, "memory(GiB)": 21.48, "step": 11563, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.952039 }, { "epoch": 0.37566189130364164, "grad_norm": 0.6577713489532471, "learning_rate": 7.3703703650314865e-06, "loss": 0.026996847242116928, "memory(GiB)": 21.48, "step": 11564, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952054 }, { "epoch": 0.37569437676639705, "grad_norm": 0.4467509388923645, "learning_rate": 7.369897396271004e-06, "loss": 0.035680074244737625, "memory(GiB)": 21.48, "step": 11565, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.952067 }, { "epoch": 0.37572686222915247, "grad_norm": 0.4066963791847229, "learning_rate": 7.369424400158753e-06, "loss": 0.03620009124279022, "memory(GiB)": 21.48, "step": 11566, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.952082 }, { "epoch": 0.3757593476919079, "grad_norm": 0.22683347761631012, "learning_rate": 7.3689513767001895e-06, "loss": 0.016972418874502182, "memory(GiB)": 21.48, "step": 11567, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.952095 }, { "epoch": 0.3757918331546633, "grad_norm": 0.5503965020179749, "learning_rate": 7.368478325900775e-06, "loss": 0.04071924835443497, "memory(GiB)": 21.48, "step": 11568, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.952108 }, { "epoch": 0.3758243186174187, "grad_norm": 0.35389575362205505, "learning_rate": 7.368005247765968e-06, "loss": 0.02692522294819355, "memory(GiB)": 21.48, "step": 11569, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.952119 }, { "epoch": 0.37585680408017413, "grad_norm": 0.8836485147476196, "learning_rate": 7.367532142301229e-06, "loss": 0.029020998626947403, "memory(GiB)": 21.48, "step": 11570, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.952134 }, { "epoch": 0.37588928954292955, "grad_norm": 0.4668998122215271, "learning_rate": 7.367059009512019e-06, "loss": 0.028534818440675735, "memory(GiB)": 21.48, "step": 11571, "token_acc": 0.9715447154471545, "train_speed(iter/s)": 0.952146 }, { "epoch": 0.37592177500568497, "grad_norm": 0.35720357298851013, "learning_rate": 7.366585849403795e-06, "loss": 0.024628734216094017, "memory(GiB)": 21.48, "step": 11572, "token_acc": 0.9921875, "train_speed(iter/s)": 0.952162 }, { "epoch": 0.3759542604684404, "grad_norm": 0.40015414357185364, "learning_rate": 7.366112661982021e-06, "loss": 0.0263025164604187, "memory(GiB)": 21.48, "step": 11573, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952179 }, { "epoch": 0.3759867459311958, "grad_norm": 0.2870711088180542, "learning_rate": 7.3656394472521565e-06, "loss": 0.01920551247894764, "memory(GiB)": 21.48, "step": 11574, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.952196 }, { "epoch": 0.3760192313939512, "grad_norm": 0.5245904326438904, "learning_rate": 7.365166205219666e-06, "loss": 0.028513234108686447, "memory(GiB)": 21.48, "step": 11575, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.952215 }, { "epoch": 0.37605171685670663, "grad_norm": 0.3060528039932251, "learning_rate": 7.36469293589001e-06, "loss": 0.017680343240499496, "memory(GiB)": 21.48, "step": 11576, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.952233 }, { "epoch": 0.37608420231946205, "grad_norm": 0.3889952301979065, "learning_rate": 7.364219639268648e-06, "loss": 0.020614581182599068, "memory(GiB)": 21.48, "step": 11577, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.952249 }, { "epoch": 0.37611668778221746, "grad_norm": 0.8147252202033997, "learning_rate": 7.363746315361047e-06, "loss": 0.03596215695142746, "memory(GiB)": 21.48, "step": 11578, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.952268 }, { "epoch": 0.3761491732449729, "grad_norm": 0.4114522337913513, "learning_rate": 7.363272964172665e-06, "loss": 0.02387012541294098, "memory(GiB)": 21.48, "step": 11579, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.952285 }, { "epoch": 0.3761816587077283, "grad_norm": 0.33766743540763855, "learning_rate": 7.362799585708969e-06, "loss": 0.025152385234832764, "memory(GiB)": 21.48, "step": 11580, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.952302 }, { "epoch": 0.3762141441704837, "grad_norm": 0.5128849148750305, "learning_rate": 7.3623261799754194e-06, "loss": 0.018912211060523987, "memory(GiB)": 21.48, "step": 11581, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.952319 }, { "epoch": 0.3762466296332391, "grad_norm": 0.4450594186782837, "learning_rate": 7.361852746977484e-06, "loss": 0.028220072388648987, "memory(GiB)": 21.48, "step": 11582, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.952337 }, { "epoch": 0.37627911509599454, "grad_norm": 0.6060247421264648, "learning_rate": 7.3613792867206226e-06, "loss": 0.03161823749542236, "memory(GiB)": 21.48, "step": 11583, "token_acc": 1.0, "train_speed(iter/s)": 0.952355 }, { "epoch": 0.37631160055874996, "grad_norm": 0.2858772277832031, "learning_rate": 7.360905799210302e-06, "loss": 0.017118632793426514, "memory(GiB)": 21.48, "step": 11584, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952373 }, { "epoch": 0.3763440860215054, "grad_norm": 0.43365392088890076, "learning_rate": 7.360432284451986e-06, "loss": 0.026647869497537613, "memory(GiB)": 21.48, "step": 11585, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.952388 }, { "epoch": 0.3763765714842608, "grad_norm": 0.3655840754508972, "learning_rate": 7.3599587424511385e-06, "loss": 0.02825057879090309, "memory(GiB)": 21.48, "step": 11586, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.952406 }, { "epoch": 0.3764090569470162, "grad_norm": 0.48954465985298157, "learning_rate": 7.359485173213227e-06, "loss": 0.02646895870566368, "memory(GiB)": 21.48, "step": 11587, "token_acc": 0.9864406779661017, "train_speed(iter/s)": 0.952423 }, { "epoch": 0.3764415424097716, "grad_norm": 0.4979901611804962, "learning_rate": 7.359011576743715e-06, "loss": 0.021294843405485153, "memory(GiB)": 21.48, "step": 11588, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952441 }, { "epoch": 0.37647402787252704, "grad_norm": 0.7035247683525085, "learning_rate": 7.358537953048071e-06, "loss": 0.0326409637928009, "memory(GiB)": 21.48, "step": 11589, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.952457 }, { "epoch": 0.37650651333528246, "grad_norm": 0.4333694279193878, "learning_rate": 7.358064302131759e-06, "loss": 0.031929634511470795, "memory(GiB)": 21.48, "step": 11590, "token_acc": 1.0, "train_speed(iter/s)": 0.952475 }, { "epoch": 0.37653899879803787, "grad_norm": 0.394965797662735, "learning_rate": 7.357590624000246e-06, "loss": 0.03397826477885246, "memory(GiB)": 21.48, "step": 11591, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.952492 }, { "epoch": 0.3765714842607933, "grad_norm": 0.5045132040977478, "learning_rate": 7.357116918659001e-06, "loss": 0.028579451143741608, "memory(GiB)": 21.48, "step": 11592, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95251 }, { "epoch": 0.3766039697235487, "grad_norm": 0.4142349362373352, "learning_rate": 7.356643186113488e-06, "loss": 0.02966390736401081, "memory(GiB)": 21.48, "step": 11593, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952527 }, { "epoch": 0.3766364551863041, "grad_norm": 0.3637197017669678, "learning_rate": 7.356169426369177e-06, "loss": 0.025914201512932777, "memory(GiB)": 21.48, "step": 11594, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.952545 }, { "epoch": 0.37666894064905954, "grad_norm": 0.32777300477027893, "learning_rate": 7.3556956394315325e-06, "loss": 0.027302173897624016, "memory(GiB)": 21.48, "step": 11595, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95256 }, { "epoch": 0.37670142611181495, "grad_norm": 0.428463339805603, "learning_rate": 7.355221825306027e-06, "loss": 0.03087051585316658, "memory(GiB)": 21.48, "step": 11596, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.952574 }, { "epoch": 0.37673391157457037, "grad_norm": 0.3553681969642639, "learning_rate": 7.354747983998126e-06, "loss": 0.024217048659920692, "memory(GiB)": 21.48, "step": 11597, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.952587 }, { "epoch": 0.3767663970373258, "grad_norm": 0.3674069344997406, "learning_rate": 7.3542741155133005e-06, "loss": 0.029953697696328163, "memory(GiB)": 21.48, "step": 11598, "token_acc": 0.976, "train_speed(iter/s)": 0.9526 }, { "epoch": 0.3767988825000812, "grad_norm": 0.2976568043231964, "learning_rate": 7.353800219857017e-06, "loss": 0.028747636824846268, "memory(GiB)": 21.48, "step": 11599, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.952614 }, { "epoch": 0.3768313679628366, "grad_norm": 0.3228533864021301, "learning_rate": 7.3533262970347465e-06, "loss": 0.02403896674513817, "memory(GiB)": 21.48, "step": 11600, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.95263 }, { "epoch": 0.37686385342559203, "grad_norm": 0.6033848524093628, "learning_rate": 7.352852347051957e-06, "loss": 0.03355919569730759, "memory(GiB)": 21.48, "step": 11601, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.952643 }, { "epoch": 0.37689633888834745, "grad_norm": 0.33639195561408997, "learning_rate": 7.35237836991412e-06, "loss": 0.020400863140821457, "memory(GiB)": 21.48, "step": 11602, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.952658 }, { "epoch": 0.37692882435110286, "grad_norm": 0.313038170337677, "learning_rate": 7.351904365626707e-06, "loss": 0.03294968977570534, "memory(GiB)": 21.48, "step": 11603, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.952673 }, { "epoch": 0.3769613098138583, "grad_norm": 0.5011695623397827, "learning_rate": 7.351430334195187e-06, "loss": 0.03472299873828888, "memory(GiB)": 21.48, "step": 11604, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.952687 }, { "epoch": 0.3769937952766137, "grad_norm": 0.3478795289993286, "learning_rate": 7.350956275625029e-06, "loss": 0.02985745668411255, "memory(GiB)": 21.48, "step": 11605, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.952701 }, { "epoch": 0.3770262807393691, "grad_norm": 0.39714565873146057, "learning_rate": 7.35048218992171e-06, "loss": 0.027701780200004578, "memory(GiB)": 21.48, "step": 11606, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.952716 }, { "epoch": 0.37705876620212453, "grad_norm": 0.25754237174987793, "learning_rate": 7.3500080770906955e-06, "loss": 0.022535286843776703, "memory(GiB)": 21.48, "step": 11607, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95273 }, { "epoch": 0.37709125166487995, "grad_norm": 0.41543054580688477, "learning_rate": 7.349533937137461e-06, "loss": 0.028773020952939987, "memory(GiB)": 21.48, "step": 11608, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.952741 }, { "epoch": 0.37712373712763536, "grad_norm": 0.4101794362068176, "learning_rate": 7.349059770067477e-06, "loss": 0.023936063051223755, "memory(GiB)": 21.48, "step": 11609, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.952756 }, { "epoch": 0.3771562225903908, "grad_norm": 0.28936636447906494, "learning_rate": 7.348585575886217e-06, "loss": 0.0193566232919693, "memory(GiB)": 21.48, "step": 11610, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.95277 }, { "epoch": 0.3771887080531462, "grad_norm": 0.40076300501823425, "learning_rate": 7.348111354599153e-06, "loss": 0.02650853618979454, "memory(GiB)": 21.48, "step": 11611, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.952783 }, { "epoch": 0.3772211935159016, "grad_norm": 0.6547351479530334, "learning_rate": 7.34763710621176e-06, "loss": 0.026545848697423935, "memory(GiB)": 21.48, "step": 11612, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.952794 }, { "epoch": 0.377253678978657, "grad_norm": 0.35747602581977844, "learning_rate": 7.3471628307295094e-06, "loss": 0.026109619066119194, "memory(GiB)": 21.48, "step": 11613, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.952803 }, { "epoch": 0.37728616444141244, "grad_norm": 0.5477601885795593, "learning_rate": 7.3466885281578744e-06, "loss": 0.02903003990650177, "memory(GiB)": 21.48, "step": 11614, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.952814 }, { "epoch": 0.37731864990416786, "grad_norm": 0.23992151021957397, "learning_rate": 7.346214198502333e-06, "loss": 0.018843865022063255, "memory(GiB)": 21.48, "step": 11615, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.952827 }, { "epoch": 0.37735113536692333, "grad_norm": 0.5024315118789673, "learning_rate": 7.345739841768354e-06, "loss": 0.027320213615894318, "memory(GiB)": 21.48, "step": 11616, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.952838 }, { "epoch": 0.37738362082967875, "grad_norm": 0.3220500349998474, "learning_rate": 7.345265457961417e-06, "loss": 0.024678580462932587, "memory(GiB)": 21.48, "step": 11617, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952851 }, { "epoch": 0.37741610629243416, "grad_norm": 0.47897711396217346, "learning_rate": 7.344791047086994e-06, "loss": 0.028805751353502274, "memory(GiB)": 21.48, "step": 11618, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.952862 }, { "epoch": 0.3774485917551896, "grad_norm": 0.4660496413707733, "learning_rate": 7.344316609150562e-06, "loss": 0.025945670902729034, "memory(GiB)": 21.48, "step": 11619, "token_acc": 0.980327868852459, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.377481077217945, "grad_norm": 0.4626166820526123, "learning_rate": 7.343842144157596e-06, "loss": 0.02757713943719864, "memory(GiB)": 21.48, "step": 11620, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.952889 }, { "epoch": 0.3775135626807004, "grad_norm": 0.3712926506996155, "learning_rate": 7.343367652113571e-06, "loss": 0.02865752950310707, "memory(GiB)": 21.48, "step": 11621, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.952904 }, { "epoch": 0.3775460481434558, "grad_norm": 0.40045082569122314, "learning_rate": 7.342893133023966e-06, "loss": 0.026128556579351425, "memory(GiB)": 21.48, "step": 11622, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.952918 }, { "epoch": 0.37757853360621124, "grad_norm": 0.4063807725906372, "learning_rate": 7.3424185868942555e-06, "loss": 0.024469532072544098, "memory(GiB)": 21.48, "step": 11623, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.952932 }, { "epoch": 0.37761101906896666, "grad_norm": 0.41921356320381165, "learning_rate": 7.341944013729916e-06, "loss": 0.023916512727737427, "memory(GiB)": 21.48, "step": 11624, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.952942 }, { "epoch": 0.3776435045317221, "grad_norm": 0.46731188893318176, "learning_rate": 7.341469413536426e-06, "loss": 0.02544144168496132, "memory(GiB)": 21.48, "step": 11625, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.952956 }, { "epoch": 0.3776759899944775, "grad_norm": 0.5087000727653503, "learning_rate": 7.340994786319262e-06, "loss": 0.028938107192516327, "memory(GiB)": 21.48, "step": 11626, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.952971 }, { "epoch": 0.3777084754572329, "grad_norm": 0.7493711113929749, "learning_rate": 7.340520132083904e-06, "loss": 0.03475276008248329, "memory(GiB)": 21.48, "step": 11627, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.952985 }, { "epoch": 0.3777409609199883, "grad_norm": 0.49552932381629944, "learning_rate": 7.340045450835826e-06, "loss": 0.03047131560742855, "memory(GiB)": 21.48, "step": 11628, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.953 }, { "epoch": 0.37777344638274374, "grad_norm": 0.4958624541759491, "learning_rate": 7.339570742580511e-06, "loss": 0.030179476365447044, "memory(GiB)": 21.48, "step": 11629, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953015 }, { "epoch": 0.37780593184549915, "grad_norm": 0.5807962417602539, "learning_rate": 7.339096007323434e-06, "loss": 0.03127952665090561, "memory(GiB)": 21.48, "step": 11630, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953028 }, { "epoch": 0.37783841730825457, "grad_norm": 0.41178059577941895, "learning_rate": 7.338621245070077e-06, "loss": 0.027728606015443802, "memory(GiB)": 21.48, "step": 11631, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953043 }, { "epoch": 0.37787090277101, "grad_norm": 0.39707544445991516, "learning_rate": 7.338146455825916e-06, "loss": 0.03409680724143982, "memory(GiB)": 21.48, "step": 11632, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.953062 }, { "epoch": 0.3779033882337654, "grad_norm": 3.6109211444854736, "learning_rate": 7.337671639596435e-06, "loss": 0.028530612587928772, "memory(GiB)": 21.48, "step": 11633, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.953079 }, { "epoch": 0.3779358736965208, "grad_norm": 0.4690214693546295, "learning_rate": 7.3371967963871115e-06, "loss": 0.02058243192732334, "memory(GiB)": 21.48, "step": 11634, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953098 }, { "epoch": 0.37796835915927623, "grad_norm": 0.4004468321800232, "learning_rate": 7.3367219262034265e-06, "loss": 0.018529396504163742, "memory(GiB)": 21.48, "step": 11635, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.953115 }, { "epoch": 0.37800084462203165, "grad_norm": 0.5560054779052734, "learning_rate": 7.336247029050858e-06, "loss": 0.03630350902676582, "memory(GiB)": 21.48, "step": 11636, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.953133 }, { "epoch": 0.37803333008478707, "grad_norm": 0.3751835823059082, "learning_rate": 7.33577210493489e-06, "loss": 0.03269922733306885, "memory(GiB)": 21.48, "step": 11637, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95315 }, { "epoch": 0.3780658155475425, "grad_norm": 0.25844070315361023, "learning_rate": 7.335297153861005e-06, "loss": 0.018813207745552063, "memory(GiB)": 21.48, "step": 11638, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953168 }, { "epoch": 0.3780983010102979, "grad_norm": 0.3315064013004303, "learning_rate": 7.334822175834682e-06, "loss": 0.02150244265794754, "memory(GiB)": 21.48, "step": 11639, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.953184 }, { "epoch": 0.3781307864730533, "grad_norm": 0.3134723901748657, "learning_rate": 7.334347170861402e-06, "loss": 0.02386663481593132, "memory(GiB)": 21.48, "step": 11640, "token_acc": 1.0, "train_speed(iter/s)": 0.953202 }, { "epoch": 0.37816327193580873, "grad_norm": 0.7981862425804138, "learning_rate": 7.333872138946651e-06, "loss": 0.03552336245775223, "memory(GiB)": 21.48, "step": 11641, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953221 }, { "epoch": 0.37819575739856415, "grad_norm": 0.3925749659538269, "learning_rate": 7.333397080095907e-06, "loss": 0.027387414127588272, "memory(GiB)": 21.48, "step": 11642, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953239 }, { "epoch": 0.37822824286131956, "grad_norm": 0.439528226852417, "learning_rate": 7.332921994314657e-06, "loss": 0.029329393059015274, "memory(GiB)": 21.48, "step": 11643, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.953256 }, { "epoch": 0.378260728324075, "grad_norm": 0.4548157751560211, "learning_rate": 7.332446881608379e-06, "loss": 0.026975709944963455, "memory(GiB)": 21.48, "step": 11644, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953274 }, { "epoch": 0.3782932137868304, "grad_norm": 0.5682544708251953, "learning_rate": 7.331971741982562e-06, "loss": 0.027361437678337097, "memory(GiB)": 21.48, "step": 11645, "token_acc": 0.996, "train_speed(iter/s)": 0.953292 }, { "epoch": 0.3783256992495858, "grad_norm": 0.45013877749443054, "learning_rate": 7.331496575442686e-06, "loss": 0.0370820127427578, "memory(GiB)": 21.48, "step": 11646, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.953312 }, { "epoch": 0.37835818471234123, "grad_norm": 0.33724772930145264, "learning_rate": 7.331021381994238e-06, "loss": 0.024380650371313095, "memory(GiB)": 21.48, "step": 11647, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95333 }, { "epoch": 0.37839067017509664, "grad_norm": 0.3715042769908905, "learning_rate": 7.3305461616426985e-06, "loss": 0.03401676565408707, "memory(GiB)": 21.48, "step": 11648, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.953348 }, { "epoch": 0.37842315563785206, "grad_norm": 0.40474289655685425, "learning_rate": 7.330070914393556e-06, "loss": 0.033585064113140106, "memory(GiB)": 21.48, "step": 11649, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.953366 }, { "epoch": 0.3784556411006075, "grad_norm": 0.46228134632110596, "learning_rate": 7.329595640252294e-06, "loss": 0.03015085682272911, "memory(GiB)": 21.48, "step": 11650, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.953383 }, { "epoch": 0.3784881265633629, "grad_norm": 0.3427586257457733, "learning_rate": 7.329120339224395e-06, "loss": 0.026935569941997528, "memory(GiB)": 21.48, "step": 11651, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.953401 }, { "epoch": 0.3785206120261183, "grad_norm": 0.3558596074581146, "learning_rate": 7.32864501131535e-06, "loss": 0.02627178281545639, "memory(GiB)": 21.48, "step": 11652, "token_acc": 1.0, "train_speed(iter/s)": 0.953419 }, { "epoch": 0.3785530974888737, "grad_norm": 0.3616594672203064, "learning_rate": 7.3281696565306395e-06, "loss": 0.024646807461977005, "memory(GiB)": 21.48, "step": 11653, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.953437 }, { "epoch": 0.37858558295162914, "grad_norm": 0.4060310125350952, "learning_rate": 7.327694274875755e-06, "loss": 0.02672737091779709, "memory(GiB)": 21.48, "step": 11654, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.953455 }, { "epoch": 0.37861806841438456, "grad_norm": 0.41258397698402405, "learning_rate": 7.32721886635618e-06, "loss": 0.024186693131923676, "memory(GiB)": 21.48, "step": 11655, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953472 }, { "epoch": 0.37865055387714, "grad_norm": 0.3425194025039673, "learning_rate": 7.326743430977401e-06, "loss": 0.028087012469768524, "memory(GiB)": 21.48, "step": 11656, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95349 }, { "epoch": 0.3786830393398954, "grad_norm": 0.3919854164123535, "learning_rate": 7.3262679687449065e-06, "loss": 0.022310717031359673, "memory(GiB)": 21.48, "step": 11657, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.953507 }, { "epoch": 0.3787155248026508, "grad_norm": 0.2808564007282257, "learning_rate": 7.325792479664182e-06, "loss": 0.018849872052669525, "memory(GiB)": 21.48, "step": 11658, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.953525 }, { "epoch": 0.3787480102654062, "grad_norm": 0.4723692834377289, "learning_rate": 7.325316963740718e-06, "loss": 0.02779724821448326, "memory(GiB)": 21.48, "step": 11659, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953543 }, { "epoch": 0.37878049572816164, "grad_norm": 0.39203596115112305, "learning_rate": 7.32484142098e-06, "loss": 0.027478065341711044, "memory(GiB)": 21.48, "step": 11660, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.953556 }, { "epoch": 0.37881298119091705, "grad_norm": 0.38518527150154114, "learning_rate": 7.3243658513875206e-06, "loss": 0.02003517374396324, "memory(GiB)": 21.48, "step": 11661, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953571 }, { "epoch": 0.37884546665367247, "grad_norm": 0.4730168879032135, "learning_rate": 7.323890254968762e-06, "loss": 0.025807440280914307, "memory(GiB)": 21.48, "step": 11662, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.3788779521164279, "grad_norm": 0.3535028100013733, "learning_rate": 7.323414631729219e-06, "loss": 0.027889259159564972, "memory(GiB)": 21.48, "step": 11663, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.9536 }, { "epoch": 0.3789104375791833, "grad_norm": 0.3630713224411011, "learning_rate": 7.322938981674379e-06, "loss": 0.028008844703435898, "memory(GiB)": 21.48, "step": 11664, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.953615 }, { "epoch": 0.3789429230419387, "grad_norm": 0.4716375768184662, "learning_rate": 7.322463304809729e-06, "loss": 0.024655593559145927, "memory(GiB)": 21.48, "step": 11665, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.37897540850469413, "grad_norm": 0.37329134345054626, "learning_rate": 7.321987601140763e-06, "loss": 0.021625297144055367, "memory(GiB)": 21.48, "step": 11666, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.953644 }, { "epoch": 0.37900789396744955, "grad_norm": 0.3135523200035095, "learning_rate": 7.321511870672968e-06, "loss": 0.022043641656637192, "memory(GiB)": 21.48, "step": 11667, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.953658 }, { "epoch": 0.37904037943020497, "grad_norm": 0.25590094923973083, "learning_rate": 7.321036113411838e-06, "loss": 0.021966923028230667, "memory(GiB)": 21.48, "step": 11668, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.953672 }, { "epoch": 0.3790728648929604, "grad_norm": 0.4779157042503357, "learning_rate": 7.3205603293628615e-06, "loss": 0.030524756759405136, "memory(GiB)": 21.48, "step": 11669, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.953685 }, { "epoch": 0.3791053503557158, "grad_norm": 0.4280626177787781, "learning_rate": 7.32008451853153e-06, "loss": 0.023105449974536896, "memory(GiB)": 21.48, "step": 11670, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.953698 }, { "epoch": 0.3791378358184712, "grad_norm": 0.3683760464191437, "learning_rate": 7.319608680923335e-06, "loss": 0.020987071096897125, "memory(GiB)": 21.48, "step": 11671, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.953712 }, { "epoch": 0.37917032128122663, "grad_norm": 0.5541747212409973, "learning_rate": 7.319132816543767e-06, "loss": 0.0342421792447567, "memory(GiB)": 21.48, "step": 11672, "token_acc": 1.0, "train_speed(iter/s)": 0.953725 }, { "epoch": 0.37920280674398205, "grad_norm": 0.3121007978916168, "learning_rate": 7.318656925398322e-06, "loss": 0.023207779973745346, "memory(GiB)": 21.48, "step": 11673, "token_acc": 0.9790940766550522, "train_speed(iter/s)": 0.953737 }, { "epoch": 0.37923529220673746, "grad_norm": 0.3817419409751892, "learning_rate": 7.3181810074924886e-06, "loss": 0.017247065901756287, "memory(GiB)": 21.48, "step": 11674, "token_acc": 1.0, "train_speed(iter/s)": 0.95375 }, { "epoch": 0.3792677776694929, "grad_norm": 0.773688554763794, "learning_rate": 7.317705062831761e-06, "loss": 0.028199629858136177, "memory(GiB)": 21.48, "step": 11675, "token_acc": 1.0, "train_speed(iter/s)": 0.953762 }, { "epoch": 0.3793002631322483, "grad_norm": 0.42517951130867004, "learning_rate": 7.317229091421632e-06, "loss": 0.02569866180419922, "memory(GiB)": 21.48, "step": 11676, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953775 }, { "epoch": 0.3793327485950037, "grad_norm": 0.5081735253334045, "learning_rate": 7.3167530932675955e-06, "loss": 0.02990376576781273, "memory(GiB)": 21.48, "step": 11677, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953787 }, { "epoch": 0.3793652340577591, "grad_norm": 0.2790623605251312, "learning_rate": 7.316277068375145e-06, "loss": 0.01931089535355568, "memory(GiB)": 21.48, "step": 11678, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953799 }, { "epoch": 0.37939771952051454, "grad_norm": 0.7127023339271545, "learning_rate": 7.315801016749773e-06, "loss": 0.03641991317272186, "memory(GiB)": 21.48, "step": 11679, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953811 }, { "epoch": 0.37943020498327, "grad_norm": 0.4584178030490875, "learning_rate": 7.315324938396976e-06, "loss": 0.024147681891918182, "memory(GiB)": 21.48, "step": 11680, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.953824 }, { "epoch": 0.37946269044602543, "grad_norm": 0.5593900084495544, "learning_rate": 7.314848833322246e-06, "loss": 0.041680268943309784, "memory(GiB)": 21.48, "step": 11681, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.953837 }, { "epoch": 0.37949517590878085, "grad_norm": 0.3923994302749634, "learning_rate": 7.314372701531081e-06, "loss": 0.026102567091584206, "memory(GiB)": 21.48, "step": 11682, "token_acc": 0.9875, "train_speed(iter/s)": 0.95385 }, { "epoch": 0.37952766137153626, "grad_norm": 0.44381511211395264, "learning_rate": 7.313896543028975e-06, "loss": 0.02760475128889084, "memory(GiB)": 21.48, "step": 11683, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.3795601468342917, "grad_norm": 0.6177757382392883, "learning_rate": 7.313420357821421e-06, "loss": 0.027676407247781754, "memory(GiB)": 21.48, "step": 11684, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.953874 }, { "epoch": 0.3795926322970471, "grad_norm": 0.7394981384277344, "learning_rate": 7.312944145913918e-06, "loss": 0.03689643368124962, "memory(GiB)": 21.48, "step": 11685, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.953887 }, { "epoch": 0.3796251177598025, "grad_norm": 0.4127611815929413, "learning_rate": 7.312467907311962e-06, "loss": 0.025057412683963776, "memory(GiB)": 21.48, "step": 11686, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953902 }, { "epoch": 0.3796576032225579, "grad_norm": 0.3910200297832489, "learning_rate": 7.311991642021048e-06, "loss": 0.02877872623503208, "memory(GiB)": 21.48, "step": 11687, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953913 }, { "epoch": 0.37969008868531334, "grad_norm": 0.8540329933166504, "learning_rate": 7.311515350046672e-06, "loss": 0.044785067439079285, "memory(GiB)": 21.48, "step": 11688, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.953925 }, { "epoch": 0.37972257414806876, "grad_norm": 0.36701810359954834, "learning_rate": 7.3110390313943334e-06, "loss": 0.020500920712947845, "memory(GiB)": 21.48, "step": 11689, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.953939 }, { "epoch": 0.3797550596108242, "grad_norm": 0.6020402908325195, "learning_rate": 7.310562686069528e-06, "loss": 0.023610681295394897, "memory(GiB)": 21.48, "step": 11690, "token_acc": 0.996551724137931, "train_speed(iter/s)": 0.953953 }, { "epoch": 0.3797875450735796, "grad_norm": 0.3459172248840332, "learning_rate": 7.310086314077755e-06, "loss": 0.02765200100839138, "memory(GiB)": 21.48, "step": 11691, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.953971 }, { "epoch": 0.379820030536335, "grad_norm": 0.3881780803203583, "learning_rate": 7.30960991542451e-06, "loss": 0.025483321398496628, "memory(GiB)": 21.48, "step": 11692, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.953988 }, { "epoch": 0.3798525159990904, "grad_norm": 0.42369556427001953, "learning_rate": 7.309133490115292e-06, "loss": 0.02440904825925827, "memory(GiB)": 21.48, "step": 11693, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.954005 }, { "epoch": 0.37988500146184584, "grad_norm": 0.9928306341171265, "learning_rate": 7.308657038155601e-06, "loss": 0.03197856247425079, "memory(GiB)": 21.48, "step": 11694, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.954023 }, { "epoch": 0.37991748692460126, "grad_norm": 0.48669010400772095, "learning_rate": 7.308180559550935e-06, "loss": 0.03922145068645477, "memory(GiB)": 21.48, "step": 11695, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.95404 }, { "epoch": 0.37994997238735667, "grad_norm": 0.3893592059612274, "learning_rate": 7.307704054306792e-06, "loss": 0.028759945183992386, "memory(GiB)": 21.48, "step": 11696, "token_acc": 0.9771428571428571, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.3799824578501121, "grad_norm": 0.47909456491470337, "learning_rate": 7.307227522428674e-06, "loss": 0.02365199476480484, "memory(GiB)": 21.48, "step": 11697, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954075 }, { "epoch": 0.3800149433128675, "grad_norm": 0.3713880479335785, "learning_rate": 7.306750963922078e-06, "loss": 0.02296164631843567, "memory(GiB)": 21.48, "step": 11698, "token_acc": 1.0, "train_speed(iter/s)": 0.954094 }, { "epoch": 0.3800474287756229, "grad_norm": 0.3178280293941498, "learning_rate": 7.306274378792508e-06, "loss": 0.01993134617805481, "memory(GiB)": 21.48, "step": 11699, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.954112 }, { "epoch": 0.38007991423837834, "grad_norm": 0.44563502073287964, "learning_rate": 7.305797767045459e-06, "loss": 0.020029017701745033, "memory(GiB)": 21.48, "step": 11700, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954129 }, { "epoch": 0.38011239970113375, "grad_norm": 0.644352376461029, "learning_rate": 7.305321128686438e-06, "loss": 0.039488740265369415, "memory(GiB)": 21.48, "step": 11701, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954146 }, { "epoch": 0.38014488516388917, "grad_norm": 0.5239213705062866, "learning_rate": 7.30484446372094e-06, "loss": 0.029164951294660568, "memory(GiB)": 21.48, "step": 11702, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954163 }, { "epoch": 0.3801773706266446, "grad_norm": 0.41350582242012024, "learning_rate": 7.304367772154471e-06, "loss": 0.019086088985204697, "memory(GiB)": 21.48, "step": 11703, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954181 }, { "epoch": 0.3802098560894, "grad_norm": 0.45467284321784973, "learning_rate": 7.3038910539925305e-06, "loss": 0.031933922320604324, "memory(GiB)": 21.48, "step": 11704, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.954199 }, { "epoch": 0.3802423415521554, "grad_norm": 0.35905221104621887, "learning_rate": 7.3034143092406196e-06, "loss": 0.021421195939183235, "memory(GiB)": 21.48, "step": 11705, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954217 }, { "epoch": 0.38027482701491083, "grad_norm": 0.32212603092193604, "learning_rate": 7.302937537904243e-06, "loss": 0.018782420083880424, "memory(GiB)": 21.48, "step": 11706, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954235 }, { "epoch": 0.38030731247766625, "grad_norm": 0.45533114671707153, "learning_rate": 7.302460739988902e-06, "loss": 0.0296596959233284, "memory(GiB)": 21.48, "step": 11707, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.38033979794042166, "grad_norm": 0.38745608925819397, "learning_rate": 7.301983915500099e-06, "loss": 0.029805414378643036, "memory(GiB)": 21.48, "step": 11708, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.954269 }, { "epoch": 0.3803722834031771, "grad_norm": 0.5468171238899231, "learning_rate": 7.3015070644433376e-06, "loss": 0.030556010082364082, "memory(GiB)": 21.48, "step": 11709, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.954286 }, { "epoch": 0.3804047688659325, "grad_norm": 0.6072536706924438, "learning_rate": 7.301030186824122e-06, "loss": 0.030238430947065353, "memory(GiB)": 21.48, "step": 11710, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954302 }, { "epoch": 0.3804372543286879, "grad_norm": 0.4885857403278351, "learning_rate": 7.3005532826479546e-06, "loss": 0.02910166233778, "memory(GiB)": 21.48, "step": 11711, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954321 }, { "epoch": 0.38046973979144333, "grad_norm": 0.32104170322418213, "learning_rate": 7.30007635192034e-06, "loss": 0.018955163657665253, "memory(GiB)": 21.48, "step": 11712, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954338 }, { "epoch": 0.38050222525419874, "grad_norm": 0.7899348139762878, "learning_rate": 7.299599394646784e-06, "loss": 0.034725259989500046, "memory(GiB)": 21.48, "step": 11713, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.954357 }, { "epoch": 0.38053471071695416, "grad_norm": 0.33113330602645874, "learning_rate": 7.299122410832788e-06, "loss": 0.0239691361784935, "memory(GiB)": 21.48, "step": 11714, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.954374 }, { "epoch": 0.3805671961797096, "grad_norm": 0.4163735806941986, "learning_rate": 7.298645400483861e-06, "loss": 0.022855620831251144, "memory(GiB)": 21.48, "step": 11715, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.954392 }, { "epoch": 0.380599681642465, "grad_norm": 0.6933635473251343, "learning_rate": 7.298168363605507e-06, "loss": 0.03703318536281586, "memory(GiB)": 21.48, "step": 11716, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95441 }, { "epoch": 0.3806321671052204, "grad_norm": 0.39326342940330505, "learning_rate": 7.297691300203231e-06, "loss": 0.020934315398335457, "memory(GiB)": 21.48, "step": 11717, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.954427 }, { "epoch": 0.3806646525679758, "grad_norm": 0.5291928648948669, "learning_rate": 7.2972142102825396e-06, "loss": 0.030063273385167122, "memory(GiB)": 21.48, "step": 11718, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.38069713803073124, "grad_norm": 0.24011164903640747, "learning_rate": 7.296737093848938e-06, "loss": 0.018905580043792725, "memory(GiB)": 21.48, "step": 11719, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.38072962349348666, "grad_norm": 0.4121619760990143, "learning_rate": 7.2962599509079335e-06, "loss": 0.02018696628510952, "memory(GiB)": 21.48, "step": 11720, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954481 }, { "epoch": 0.3807621089562421, "grad_norm": 0.34174975752830505, "learning_rate": 7.295782781465032e-06, "loss": 0.01572497934103012, "memory(GiB)": 21.48, "step": 11721, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954496 }, { "epoch": 0.3807945944189975, "grad_norm": 0.436093270778656, "learning_rate": 7.295305585525743e-06, "loss": 0.027263497933745384, "memory(GiB)": 21.48, "step": 11722, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.954512 }, { "epoch": 0.3808270798817529, "grad_norm": 0.43756434321403503, "learning_rate": 7.29482836309557e-06, "loss": 0.026699248701334, "memory(GiB)": 21.48, "step": 11723, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.954528 }, { "epoch": 0.3808595653445083, "grad_norm": 0.4437906742095947, "learning_rate": 7.294351114180027e-06, "loss": 0.030511662364006042, "memory(GiB)": 21.48, "step": 11724, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.954542 }, { "epoch": 0.38089205080726374, "grad_norm": 0.34639304876327515, "learning_rate": 7.293873838784615e-06, "loss": 0.023604247719049454, "memory(GiB)": 21.48, "step": 11725, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954557 }, { "epoch": 0.38092453627001915, "grad_norm": 0.349124550819397, "learning_rate": 7.293396536914848e-06, "loss": 0.02452096715569496, "memory(GiB)": 21.48, "step": 11726, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95457 }, { "epoch": 0.38095702173277457, "grad_norm": 0.3633725345134735, "learning_rate": 7.292919208576231e-06, "loss": 0.025304196402430534, "memory(GiB)": 21.48, "step": 11727, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954585 }, { "epoch": 0.38098950719553, "grad_norm": 0.46759214997291565, "learning_rate": 7.2924418537742744e-06, "loss": 0.026629019528627396, "memory(GiB)": 21.48, "step": 11728, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.954599 }, { "epoch": 0.3810219926582854, "grad_norm": 0.508303165435791, "learning_rate": 7.291964472514489e-06, "loss": 0.03630174696445465, "memory(GiB)": 21.48, "step": 11729, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954614 }, { "epoch": 0.3810544781210408, "grad_norm": 0.3538784086704254, "learning_rate": 7.291487064802382e-06, "loss": 0.025037409737706184, "memory(GiB)": 21.48, "step": 11730, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.954628 }, { "epoch": 0.38108696358379623, "grad_norm": 0.3755424916744232, "learning_rate": 7.291009630643464e-06, "loss": 0.02087068371474743, "memory(GiB)": 21.48, "step": 11731, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954642 }, { "epoch": 0.38111944904655165, "grad_norm": 0.3299298584461212, "learning_rate": 7.2905321700432465e-06, "loss": 0.018766911700367928, "memory(GiB)": 21.48, "step": 11732, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954654 }, { "epoch": 0.38115193450930707, "grad_norm": 0.44307827949523926, "learning_rate": 7.2900546830072395e-06, "loss": 0.02662833034992218, "memory(GiB)": 21.48, "step": 11733, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954663 }, { "epoch": 0.3811844199720625, "grad_norm": 0.398999959230423, "learning_rate": 7.289577169540951e-06, "loss": 0.02604554407298565, "memory(GiB)": 21.48, "step": 11734, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954675 }, { "epoch": 0.3812169054348179, "grad_norm": 0.34332606196403503, "learning_rate": 7.2890996296498975e-06, "loss": 0.028773106634616852, "memory(GiB)": 21.48, "step": 11735, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954687 }, { "epoch": 0.3812493908975733, "grad_norm": 0.4448312520980835, "learning_rate": 7.288622063339586e-06, "loss": 0.025338787585496902, "memory(GiB)": 21.48, "step": 11736, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.38128187636032873, "grad_norm": 0.3986634314060211, "learning_rate": 7.288144470615529e-06, "loss": 0.0352184996008873, "memory(GiB)": 21.48, "step": 11737, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.954674 }, { "epoch": 0.38131436182308415, "grad_norm": 0.26673194766044617, "learning_rate": 7.2876668514832414e-06, "loss": 0.011319093406200409, "memory(GiB)": 21.48, "step": 11738, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954685 }, { "epoch": 0.38134684728583956, "grad_norm": 0.28631648421287537, "learning_rate": 7.2871892059482305e-06, "loss": 0.021341415122151375, "memory(GiB)": 21.48, "step": 11739, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954697 }, { "epoch": 0.381379332748595, "grad_norm": 0.2535496950149536, "learning_rate": 7.286711534016015e-06, "loss": 0.013449940830469131, "memory(GiB)": 21.48, "step": 11740, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954709 }, { "epoch": 0.3814118182113504, "grad_norm": 0.3882802128791809, "learning_rate": 7.286233835692103e-06, "loss": 0.025562215596437454, "memory(GiB)": 21.48, "step": 11741, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954721 }, { "epoch": 0.3814443036741058, "grad_norm": 0.41823455691337585, "learning_rate": 7.285756110982009e-06, "loss": 0.029689274728298187, "memory(GiB)": 21.48, "step": 11742, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.954732 }, { "epoch": 0.3814767891368612, "grad_norm": 0.4519977271556854, "learning_rate": 7.285278359891248e-06, "loss": 0.028855986893177032, "memory(GiB)": 21.48, "step": 11743, "token_acc": 0.985, "train_speed(iter/s)": 0.954744 }, { "epoch": 0.3815092745996167, "grad_norm": 0.30330467224121094, "learning_rate": 7.284800582425333e-06, "loss": 0.018024489283561707, "memory(GiB)": 21.48, "step": 11744, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954756 }, { "epoch": 0.3815417600623721, "grad_norm": 0.3985263705253601, "learning_rate": 7.284322778589778e-06, "loss": 0.021791744977235794, "memory(GiB)": 21.48, "step": 11745, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.38157424552512753, "grad_norm": 0.28367936611175537, "learning_rate": 7.283844948390099e-06, "loss": 0.023153571411967278, "memory(GiB)": 21.48, "step": 11746, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.954781 }, { "epoch": 0.38160673098788295, "grad_norm": 0.4182646870613098, "learning_rate": 7.283367091831809e-06, "loss": 0.02822721190750599, "memory(GiB)": 21.48, "step": 11747, "token_acc": 0.9766536964980544, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.38163921645063836, "grad_norm": 0.3715917468070984, "learning_rate": 7.282889208920422e-06, "loss": 0.03287944197654724, "memory(GiB)": 21.48, "step": 11748, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.3816717019133938, "grad_norm": 0.5570895671844482, "learning_rate": 7.282411299661457e-06, "loss": 0.0325138196349144, "memory(GiB)": 21.48, "step": 11749, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.954821 }, { "epoch": 0.3817041873761492, "grad_norm": 0.28504279255867004, "learning_rate": 7.281933364060427e-06, "loss": 0.025534551590681076, "memory(GiB)": 21.48, "step": 11750, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.954836 }, { "epoch": 0.3817366728389046, "grad_norm": 0.3989322781562805, "learning_rate": 7.281455402122848e-06, "loss": 0.03760305792093277, "memory(GiB)": 21.48, "step": 11751, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.954854 }, { "epoch": 0.38176915830166, "grad_norm": 0.42331886291503906, "learning_rate": 7.280977413854239e-06, "loss": 0.022631272673606873, "memory(GiB)": 21.48, "step": 11752, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.954872 }, { "epoch": 0.38180164376441544, "grad_norm": 0.41525283455848694, "learning_rate": 7.280499399260113e-06, "loss": 0.022129584103822708, "memory(GiB)": 21.48, "step": 11753, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954888 }, { "epoch": 0.38183412922717086, "grad_norm": 0.40484508872032166, "learning_rate": 7.280021358345989e-06, "loss": 0.02514931932091713, "memory(GiB)": 21.48, "step": 11754, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.3818666146899263, "grad_norm": 0.34128814935684204, "learning_rate": 7.2795432911173835e-06, "loss": 0.020967502146959305, "memory(GiB)": 21.48, "step": 11755, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954923 }, { "epoch": 0.3818991001526817, "grad_norm": 0.5428133010864258, "learning_rate": 7.2790651975798146e-06, "loss": 0.02620399184525013, "memory(GiB)": 21.48, "step": 11756, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95494 }, { "epoch": 0.3819315856154371, "grad_norm": 0.3687524199485779, "learning_rate": 7.278587077738799e-06, "loss": 0.018346188589930534, "memory(GiB)": 21.48, "step": 11757, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.954957 }, { "epoch": 0.3819640710781925, "grad_norm": 0.42573973536491394, "learning_rate": 7.278108931599858e-06, "loss": 0.02862013876438141, "memory(GiB)": 21.48, "step": 11758, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.38199655654094794, "grad_norm": 1.0734492540359497, "learning_rate": 7.277630759168506e-06, "loss": 0.037810876965522766, "memory(GiB)": 21.48, "step": 11759, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.38202904200370336, "grad_norm": 0.3664838969707489, "learning_rate": 7.277152560450263e-06, "loss": 0.02086522802710533, "memory(GiB)": 21.48, "step": 11760, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.3820615274664588, "grad_norm": 0.4466390013694763, "learning_rate": 7.276674335450649e-06, "loss": 0.03193625062704086, "memory(GiB)": 21.48, "step": 11761, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955027 }, { "epoch": 0.3820940129292142, "grad_norm": 0.3160105049610138, "learning_rate": 7.276196084175183e-06, "loss": 0.02385493367910385, "memory(GiB)": 21.48, "step": 11762, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.3821264983919696, "grad_norm": 0.5211997628211975, "learning_rate": 7.275717806629385e-06, "loss": 0.02863384410738945, "memory(GiB)": 21.48, "step": 11763, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955061 }, { "epoch": 0.382158983854725, "grad_norm": 0.37719711661338806, "learning_rate": 7.2752395028187765e-06, "loss": 0.022415542975068092, "memory(GiB)": 21.48, "step": 11764, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.38219146931748044, "grad_norm": 0.3434808850288391, "learning_rate": 7.274761172748873e-06, "loss": 0.024126499891281128, "memory(GiB)": 21.48, "step": 11765, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.38222395478023585, "grad_norm": 0.43580543994903564, "learning_rate": 7.274282816425199e-06, "loss": 0.028583375737071037, "memory(GiB)": 21.48, "step": 11766, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.38225644024299127, "grad_norm": 0.5048171877861023, "learning_rate": 7.273804433853273e-06, "loss": 0.03484885394573212, "memory(GiB)": 21.48, "step": 11767, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.955129 }, { "epoch": 0.3822889257057467, "grad_norm": 0.4770706295967102, "learning_rate": 7.27332602503862e-06, "loss": 0.03161884844303131, "memory(GiB)": 21.48, "step": 11768, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.955147 }, { "epoch": 0.3823214111685021, "grad_norm": 0.3858013153076172, "learning_rate": 7.272847589986756e-06, "loss": 0.025235377252101898, "memory(GiB)": 21.48, "step": 11769, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955164 }, { "epoch": 0.3823538966312575, "grad_norm": 0.39008989930152893, "learning_rate": 7.2723691287032075e-06, "loss": 0.029263906180858612, "memory(GiB)": 21.48, "step": 11770, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955182 }, { "epoch": 0.38238638209401293, "grad_norm": 0.4691492021083832, "learning_rate": 7.271890641193493e-06, "loss": 0.032218463718891144, "memory(GiB)": 21.48, "step": 11771, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.38241886755676835, "grad_norm": 0.5927140116691589, "learning_rate": 7.271412127463139e-06, "loss": 0.025769304484128952, "memory(GiB)": 21.48, "step": 11772, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.38245135301952377, "grad_norm": 0.3064715266227722, "learning_rate": 7.270933587517664e-06, "loss": 0.021862030029296875, "memory(GiB)": 21.48, "step": 11773, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95523 }, { "epoch": 0.3824838384822792, "grad_norm": 0.30010247230529785, "learning_rate": 7.270455021362593e-06, "loss": 0.03056344762444496, "memory(GiB)": 21.48, "step": 11774, "token_acc": 0.9695652173913043, "train_speed(iter/s)": 0.955246 }, { "epoch": 0.3825163239450346, "grad_norm": 0.3587973117828369, "learning_rate": 7.269976429003449e-06, "loss": 0.026250848546624184, "memory(GiB)": 21.48, "step": 11775, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.38254880940779, "grad_norm": 0.2808908224105835, "learning_rate": 7.269497810445756e-06, "loss": 0.027918599545955658, "memory(GiB)": 21.48, "step": 11776, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.38258129487054543, "grad_norm": 0.2897610068321228, "learning_rate": 7.269019165695038e-06, "loss": 0.02113880217075348, "memory(GiB)": 21.48, "step": 11777, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955299 }, { "epoch": 0.38261378033330085, "grad_norm": 0.28512492775917053, "learning_rate": 7.268540494756819e-06, "loss": 0.024465737864375114, "memory(GiB)": 21.48, "step": 11778, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.38264626579605626, "grad_norm": 0.35210803151130676, "learning_rate": 7.268061797636623e-06, "loss": 0.034325405955314636, "memory(GiB)": 21.48, "step": 11779, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.3826787512588117, "grad_norm": 0.3326505720615387, "learning_rate": 7.267583074339974e-06, "loss": 0.022954583168029785, "memory(GiB)": 21.48, "step": 11780, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.3827112367215671, "grad_norm": 0.31458908319473267, "learning_rate": 7.267104324872399e-06, "loss": 0.021103551611304283, "memory(GiB)": 21.48, "step": 11781, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.3827437221843225, "grad_norm": 0.530625581741333, "learning_rate": 7.2666255492394234e-06, "loss": 0.023297492414712906, "memory(GiB)": 21.48, "step": 11782, "token_acc": 0.9752650176678446, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.3827762076470779, "grad_norm": 0.5317413210868835, "learning_rate": 7.266146747446571e-06, "loss": 0.03414710611104965, "memory(GiB)": 21.48, "step": 11783, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.38280869310983334, "grad_norm": 0.3578430712223053, "learning_rate": 7.265667919499368e-06, "loss": 0.029551107436418533, "memory(GiB)": 21.48, "step": 11784, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.38284117857258876, "grad_norm": 0.31115978956222534, "learning_rate": 7.265189065403343e-06, "loss": 0.016258278861641884, "memory(GiB)": 21.48, "step": 11785, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955428 }, { "epoch": 0.3828736640353442, "grad_norm": 0.28635987639427185, "learning_rate": 7.264710185164021e-06, "loss": 0.02475276216864586, "memory(GiB)": 21.48, "step": 11786, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.3829061494980996, "grad_norm": 0.4396820366382599, "learning_rate": 7.264231278786931e-06, "loss": 0.03301715850830078, "memory(GiB)": 21.48, "step": 11787, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.382938634960855, "grad_norm": 0.38651296496391296, "learning_rate": 7.263752346277595e-06, "loss": 0.022831227630376816, "memory(GiB)": 21.48, "step": 11788, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.3829711204236104, "grad_norm": 0.3796164095401764, "learning_rate": 7.263273387641546e-06, "loss": 0.024127701297402382, "memory(GiB)": 21.48, "step": 11789, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.38300360588636584, "grad_norm": 0.3744927942752838, "learning_rate": 7.2627944028843074e-06, "loss": 0.02046981267631054, "memory(GiB)": 21.48, "step": 11790, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955501 }, { "epoch": 0.38303609134912125, "grad_norm": 0.35590603947639465, "learning_rate": 7.262315392011412e-06, "loss": 0.02366998791694641, "memory(GiB)": 21.48, "step": 11791, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955515 }, { "epoch": 0.38306857681187667, "grad_norm": 0.47874876856803894, "learning_rate": 7.261836355028385e-06, "loss": 0.02318393625319004, "memory(GiB)": 21.48, "step": 11792, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.3831010622746321, "grad_norm": 0.3010430634021759, "learning_rate": 7.261357291940756e-06, "loss": 0.028717154636979103, "memory(GiB)": 21.48, "step": 11793, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.3831335477373875, "grad_norm": 0.4161090552806854, "learning_rate": 7.260878202754055e-06, "loss": 0.023465551435947418, "memory(GiB)": 21.48, "step": 11794, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.3831660332001429, "grad_norm": 1.1289159059524536, "learning_rate": 7.260399087473809e-06, "loss": 0.02437499538064003, "memory(GiB)": 21.48, "step": 11795, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.95556 }, { "epoch": 0.38319851866289834, "grad_norm": 0.49664267897605896, "learning_rate": 7.2599199461055495e-06, "loss": 0.03336348757147789, "memory(GiB)": 21.48, "step": 11796, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.38323100412565375, "grad_norm": 0.3876838982105255, "learning_rate": 7.2594407786548046e-06, "loss": 0.024746250361204147, "memory(GiB)": 21.48, "step": 11797, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.955583 }, { "epoch": 0.38326348958840917, "grad_norm": 0.42129430174827576, "learning_rate": 7.258961585127108e-06, "loss": 0.022855238988995552, "memory(GiB)": 21.48, "step": 11798, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.3832959750511646, "grad_norm": 0.7327600717544556, "learning_rate": 7.258482365527986e-06, "loss": 0.026435676962137222, "memory(GiB)": 21.48, "step": 11799, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955604 }, { "epoch": 0.38332846051392, "grad_norm": 0.42123547196388245, "learning_rate": 7.258003119862973e-06, "loss": 0.02726479060947895, "memory(GiB)": 21.48, "step": 11800, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.3833609459766754, "grad_norm": 0.4903185963630676, "learning_rate": 7.2575238481375985e-06, "loss": 0.029242679476737976, "memory(GiB)": 21.48, "step": 11801, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.38339343143943083, "grad_norm": 0.32892316579818726, "learning_rate": 7.257044550357393e-06, "loss": 0.018330635502934456, "memory(GiB)": 21.48, "step": 11802, "token_acc": 0.988, "train_speed(iter/s)": 0.955637 }, { "epoch": 0.38342591690218625, "grad_norm": 0.5104873776435852, "learning_rate": 7.2565652265278895e-06, "loss": 0.0291171632707119, "memory(GiB)": 21.48, "step": 11803, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.38345840236494166, "grad_norm": 0.35699620842933655, "learning_rate": 7.256085876654621e-06, "loss": 0.03163947910070419, "memory(GiB)": 21.48, "step": 11804, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95566 }, { "epoch": 0.3834908878276971, "grad_norm": 0.44881531596183777, "learning_rate": 7.255606500743117e-06, "loss": 0.025779543444514275, "memory(GiB)": 21.48, "step": 11805, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.3835233732904525, "grad_norm": 0.42159026861190796, "learning_rate": 7.255127098798913e-06, "loss": 0.02842652052640915, "memory(GiB)": 21.48, "step": 11806, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.3835558587532079, "grad_norm": 0.34047049283981323, "learning_rate": 7.25464767082754e-06, "loss": 0.02912808582186699, "memory(GiB)": 21.48, "step": 11807, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.3835883442159634, "grad_norm": 0.3967589735984802, "learning_rate": 7.254168216834531e-06, "loss": 0.0236319862306118, "memory(GiB)": 21.48, "step": 11808, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.3836208296787188, "grad_norm": 0.5278865098953247, "learning_rate": 7.253688736825421e-06, "loss": 0.041625142097473145, "memory(GiB)": 21.48, "step": 11809, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 0.955721 }, { "epoch": 0.3836533151414742, "grad_norm": 0.3271167576313019, "learning_rate": 7.253209230805745e-06, "loss": 0.018207203596830368, "memory(GiB)": 21.48, "step": 11810, "token_acc": 1.0, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.38368580060422963, "grad_norm": 0.43953534960746765, "learning_rate": 7.252729698781035e-06, "loss": 0.028786437585949898, "memory(GiB)": 21.48, "step": 11811, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.38371828606698505, "grad_norm": 0.4363475441932678, "learning_rate": 7.252250140756826e-06, "loss": 0.03737727180123329, "memory(GiB)": 21.48, "step": 11812, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.38375077152974046, "grad_norm": 0.48601627349853516, "learning_rate": 7.251770556738651e-06, "loss": 0.03106123022735119, "memory(GiB)": 21.48, "step": 11813, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955785 }, { "epoch": 0.3837832569924959, "grad_norm": 0.3584989607334137, "learning_rate": 7.251290946732048e-06, "loss": 0.025387460365891457, "memory(GiB)": 21.48, "step": 11814, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.955803 }, { "epoch": 0.3838157424552513, "grad_norm": 0.9091425538063049, "learning_rate": 7.250811310742551e-06, "loss": 0.025521984323859215, "memory(GiB)": 21.48, "step": 11815, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.95582 }, { "epoch": 0.3838482279180067, "grad_norm": 0.43498045206069946, "learning_rate": 7.2503316487756944e-06, "loss": 0.026606474071741104, "memory(GiB)": 21.48, "step": 11816, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955837 }, { "epoch": 0.38388071338076213, "grad_norm": 0.49321264028549194, "learning_rate": 7.249851960837018e-06, "loss": 0.04038720577955246, "memory(GiB)": 21.48, "step": 11817, "token_acc": 0.9965635738831615, "train_speed(iter/s)": 0.955854 }, { "epoch": 0.38391319884351754, "grad_norm": 0.49736452102661133, "learning_rate": 7.249372246932053e-06, "loss": 0.026728320866823196, "memory(GiB)": 21.48, "step": 11818, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.38394568430627296, "grad_norm": 0.41956835985183716, "learning_rate": 7.24889250706634e-06, "loss": 0.036358121782541275, "memory(GiB)": 21.48, "step": 11819, "token_acc": 1.0, "train_speed(iter/s)": 0.955888 }, { "epoch": 0.3839781697690284, "grad_norm": 0.4484880268573761, "learning_rate": 7.248412741245412e-06, "loss": 0.026673737913370132, "memory(GiB)": 21.48, "step": 11820, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.955906 }, { "epoch": 0.3840106552317838, "grad_norm": 0.7016724944114685, "learning_rate": 7.24793294947481e-06, "loss": 0.019785722717642784, "memory(GiB)": 21.48, "step": 11821, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955923 }, { "epoch": 0.3840431406945392, "grad_norm": 0.7564593553543091, "learning_rate": 7.24745313176007e-06, "loss": 0.03396531939506531, "memory(GiB)": 21.48, "step": 11822, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.3840756261572946, "grad_norm": 0.4477465748786926, "learning_rate": 7.246973288106728e-06, "loss": 0.039089374244213104, "memory(GiB)": 21.48, "step": 11823, "token_acc": 1.0, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.38410811162005004, "grad_norm": 0.44326192140579224, "learning_rate": 7.246493418520325e-06, "loss": 0.033041100949048996, "memory(GiB)": 21.48, "step": 11824, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.38414059708280546, "grad_norm": 0.4096095561981201, "learning_rate": 7.2460135230063966e-06, "loss": 0.030265167355537415, "memory(GiB)": 21.48, "step": 11825, "token_acc": 0.984, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.3841730825455609, "grad_norm": 0.3754519820213318, "learning_rate": 7.245533601570483e-06, "loss": 0.028787903487682343, "memory(GiB)": 21.48, "step": 11826, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956006 }, { "epoch": 0.3842055680083163, "grad_norm": 0.4381125867366791, "learning_rate": 7.245053654218124e-06, "loss": 0.03386496752500534, "memory(GiB)": 21.48, "step": 11827, "token_acc": 0.98, "train_speed(iter/s)": 0.956024 }, { "epoch": 0.3842380534710717, "grad_norm": 0.36403709650039673, "learning_rate": 7.244573680954857e-06, "loss": 0.023160845041275024, "memory(GiB)": 21.48, "step": 11828, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956041 }, { "epoch": 0.3842705389338271, "grad_norm": 0.4127162992954254, "learning_rate": 7.244093681786223e-06, "loss": 0.032579705119132996, "memory(GiB)": 21.48, "step": 11829, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.38430302439658254, "grad_norm": 0.4073948562145233, "learning_rate": 7.24361365671776e-06, "loss": 0.02682500332593918, "memory(GiB)": 21.48, "step": 11830, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.38433550985933795, "grad_norm": 0.3071906268596649, "learning_rate": 7.24313360575501e-06, "loss": 0.02188974805176258, "memory(GiB)": 21.48, "step": 11831, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.38436799532209337, "grad_norm": 0.36058956384658813, "learning_rate": 7.242653528903511e-06, "loss": 0.027385467663407326, "memory(GiB)": 21.48, "step": 11832, "token_acc": 1.0, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.3844004807848488, "grad_norm": 0.3547193706035614, "learning_rate": 7.242173426168806e-06, "loss": 0.0221329927444458, "memory(GiB)": 21.48, "step": 11833, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.3844329662476042, "grad_norm": 0.41731852293014526, "learning_rate": 7.241693297556436e-06, "loss": 0.02552010864019394, "memory(GiB)": 21.48, "step": 11834, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.3844654517103596, "grad_norm": 0.36828792095184326, "learning_rate": 7.241213143071941e-06, "loss": 0.02335122600197792, "memory(GiB)": 21.48, "step": 11835, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.38449793717311503, "grad_norm": 0.4301164150238037, "learning_rate": 7.240732962720863e-06, "loss": 0.02220168337225914, "memory(GiB)": 21.48, "step": 11836, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.38453042263587045, "grad_norm": 0.2924172878265381, "learning_rate": 7.240252756508744e-06, "loss": 0.023951057344675064, "memory(GiB)": 21.48, "step": 11837, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.38456290809862587, "grad_norm": 0.389056533575058, "learning_rate": 7.2397725244411275e-06, "loss": 0.026402341201901436, "memory(GiB)": 21.48, "step": 11838, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956208 }, { "epoch": 0.3845953935613813, "grad_norm": 0.5031121969223022, "learning_rate": 7.2392922665235544e-06, "loss": 0.025590764358639717, "memory(GiB)": 21.48, "step": 11839, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956226 }, { "epoch": 0.3846278790241367, "grad_norm": 0.3598119616508484, "learning_rate": 7.238811982761569e-06, "loss": 0.020871449261903763, "memory(GiB)": 21.48, "step": 11840, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.3846603644868921, "grad_norm": 0.5086458325386047, "learning_rate": 7.2383316731607125e-06, "loss": 0.03175592049956322, "memory(GiB)": 21.48, "step": 11841, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.38469284994964753, "grad_norm": 0.3278493881225586, "learning_rate": 7.2378513377265295e-06, "loss": 0.02632143534719944, "memory(GiB)": 21.48, "step": 11842, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.38472533541240295, "grad_norm": 0.4760197699069977, "learning_rate": 7.237370976464564e-06, "loss": 0.025020677596330643, "memory(GiB)": 21.48, "step": 11843, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.38475782087515836, "grad_norm": 0.42940500378608704, "learning_rate": 7.236890589380359e-06, "loss": 0.026064099743962288, "memory(GiB)": 21.48, "step": 11844, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.3847903063379138, "grad_norm": 0.4034448266029358, "learning_rate": 7.236410176479459e-06, "loss": 0.022978629916906357, "memory(GiB)": 21.48, "step": 11845, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.3848227918006692, "grad_norm": 0.49973002076148987, "learning_rate": 7.235929737767409e-06, "loss": 0.030089640989899635, "memory(GiB)": 21.48, "step": 11846, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.3848552772634246, "grad_norm": 0.3593432903289795, "learning_rate": 7.235449273249755e-06, "loss": 0.0262494795024395, "memory(GiB)": 21.48, "step": 11847, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.95636 }, { "epoch": 0.38488776272618, "grad_norm": 0.33786463737487793, "learning_rate": 7.234968782932039e-06, "loss": 0.02694704942405224, "memory(GiB)": 21.48, "step": 11848, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.38492024818893544, "grad_norm": 0.46994200348854065, "learning_rate": 7.2344882668198105e-06, "loss": 0.02469807118177414, "memory(GiB)": 21.48, "step": 11849, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.956388 }, { "epoch": 0.38495273365169086, "grad_norm": 0.38859987258911133, "learning_rate": 7.234007724918612e-06, "loss": 0.027713673189282417, "memory(GiB)": 21.48, "step": 11850, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.3849852191144463, "grad_norm": 0.3878663182258606, "learning_rate": 7.233527157233991e-06, "loss": 0.02767980843782425, "memory(GiB)": 21.48, "step": 11851, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.3850177045772017, "grad_norm": 0.4736846089363098, "learning_rate": 7.233046563771493e-06, "loss": 0.03311657905578613, "memory(GiB)": 21.48, "step": 11852, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.3850501900399571, "grad_norm": 0.3667394816875458, "learning_rate": 7.232565944536667e-06, "loss": 0.01770380511879921, "memory(GiB)": 21.48, "step": 11853, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956437 }, { "epoch": 0.3850826755027125, "grad_norm": 0.26112282276153564, "learning_rate": 7.232085299535058e-06, "loss": 0.019025538116693497, "memory(GiB)": 21.48, "step": 11854, "token_acc": 1.0, "train_speed(iter/s)": 0.956448 }, { "epoch": 0.38511516096546794, "grad_norm": 0.39869385957717896, "learning_rate": 7.231604628772213e-06, "loss": 0.02896156534552574, "memory(GiB)": 21.48, "step": 11855, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.38514764642822336, "grad_norm": 0.5940133929252625, "learning_rate": 7.2311239322536805e-06, "loss": 0.0233200304210186, "memory(GiB)": 21.48, "step": 11856, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.38518013189097877, "grad_norm": 0.3248978555202484, "learning_rate": 7.23064320998501e-06, "loss": 0.021393796429038048, "memory(GiB)": 21.48, "step": 11857, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.3852126173537342, "grad_norm": 0.5501841306686401, "learning_rate": 7.230162461971746e-06, "loss": 0.028837312012910843, "memory(GiB)": 21.48, "step": 11858, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.3852451028164896, "grad_norm": 0.3257637321949005, "learning_rate": 7.22968168821944e-06, "loss": 0.022970642894506454, "memory(GiB)": 21.48, "step": 11859, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.385277588279245, "grad_norm": 0.538662314414978, "learning_rate": 7.229200888733638e-06, "loss": 0.04528260976076126, "memory(GiB)": 21.48, "step": 11860, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.38531007374200044, "grad_norm": 0.4764460623264313, "learning_rate": 7.228720063519893e-06, "loss": 0.02899288758635521, "memory(GiB)": 21.48, "step": 11861, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.956533 }, { "epoch": 0.38534255920475585, "grad_norm": 0.3809884786605835, "learning_rate": 7.22823921258375e-06, "loss": 0.027608197182416916, "memory(GiB)": 21.48, "step": 11862, "token_acc": 0.9770992366412213, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.38537504466751127, "grad_norm": 0.4307730197906494, "learning_rate": 7.227758335930762e-06, "loss": 0.02170237898826599, "memory(GiB)": 21.48, "step": 11863, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.3854075301302667, "grad_norm": 0.4454805552959442, "learning_rate": 7.227277433566476e-06, "loss": 0.0329560860991478, "memory(GiB)": 21.48, "step": 11864, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.3854400155930221, "grad_norm": 0.5812702178955078, "learning_rate": 7.226796505496446e-06, "loss": 0.028905672952532768, "memory(GiB)": 21.48, "step": 11865, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95658 }, { "epoch": 0.3854725010557775, "grad_norm": 0.34221747517585754, "learning_rate": 7.2263155517262206e-06, "loss": 0.030456623062491417, "memory(GiB)": 21.48, "step": 11866, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.38550498651853293, "grad_norm": 0.950217068195343, "learning_rate": 7.225834572261348e-06, "loss": 0.0402691587805748, "memory(GiB)": 21.48, "step": 11867, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.38553747198128835, "grad_norm": 0.425270676612854, "learning_rate": 7.2253535671073836e-06, "loss": 0.021676169708371162, "memory(GiB)": 21.48, "step": 11868, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956615 }, { "epoch": 0.38556995744404376, "grad_norm": 0.3128538727760315, "learning_rate": 7.224872536269876e-06, "loss": 0.02433263137936592, "memory(GiB)": 21.48, "step": 11869, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956628 }, { "epoch": 0.3856024429067992, "grad_norm": 0.36140137910842896, "learning_rate": 7.22439147975438e-06, "loss": 0.033162109553813934, "memory(GiB)": 21.48, "step": 11870, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.95664 }, { "epoch": 0.3856349283695546, "grad_norm": 0.6214622259140015, "learning_rate": 7.223910397566445e-06, "loss": 0.030211610719561577, "memory(GiB)": 21.48, "step": 11871, "token_acc": 1.0, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.38566741383231007, "grad_norm": 0.2860284745693207, "learning_rate": 7.223429289711622e-06, "loss": 0.024579260498285294, "memory(GiB)": 21.48, "step": 11872, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.3856998992950655, "grad_norm": 0.4017906188964844, "learning_rate": 7.222948156195469e-06, "loss": 0.03289223089814186, "memory(GiB)": 21.48, "step": 11873, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.3857323847578209, "grad_norm": 0.5143237709999084, "learning_rate": 7.222466997023532e-06, "loss": 0.029237210750579834, "memory(GiB)": 21.48, "step": 11874, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.3857648702205763, "grad_norm": 0.2597072422504425, "learning_rate": 7.22198581220137e-06, "loss": 0.018253035843372345, "memory(GiB)": 21.48, "step": 11875, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.38579735568333173, "grad_norm": 1.4088364839553833, "learning_rate": 7.221504601734534e-06, "loss": 0.024616315960884094, "memory(GiB)": 21.48, "step": 11876, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956743 }, { "epoch": 0.38582984114608715, "grad_norm": 0.5779117345809937, "learning_rate": 7.221023365628578e-06, "loss": 0.046529605984687805, "memory(GiB)": 21.48, "step": 11877, "token_acc": 0.9718875502008032, "train_speed(iter/s)": 0.956761 }, { "epoch": 0.38586232660884257, "grad_norm": 0.6013123393058777, "learning_rate": 7.220542103889055e-06, "loss": 0.02887009084224701, "memory(GiB)": 21.48, "step": 11878, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956778 }, { "epoch": 0.385894812071598, "grad_norm": 0.3899726867675781, "learning_rate": 7.2200608165215214e-06, "loss": 0.02166062407195568, "memory(GiB)": 21.48, "step": 11879, "token_acc": 1.0, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.3859272975343534, "grad_norm": 0.38090386986732483, "learning_rate": 7.219579503531532e-06, "loss": 0.03290921077132225, "memory(GiB)": 21.48, "step": 11880, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.3859597829971088, "grad_norm": 0.3964523673057556, "learning_rate": 7.21909816492464e-06, "loss": 0.03129517659544945, "memory(GiB)": 21.48, "step": 11881, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.38599226845986423, "grad_norm": 0.37915706634521484, "learning_rate": 7.2186168007064015e-06, "loss": 0.021159738302230835, "memory(GiB)": 21.48, "step": 11882, "token_acc": 1.0, "train_speed(iter/s)": 0.956845 }, { "epoch": 0.38602475392261965, "grad_norm": 0.362537145614624, "learning_rate": 7.218135410882372e-06, "loss": 0.02754363790154457, "memory(GiB)": 21.48, "step": 11883, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.38605723938537506, "grad_norm": 0.4322184920310974, "learning_rate": 7.217653995458107e-06, "loss": 0.0262552909553051, "memory(GiB)": 21.48, "step": 11884, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.3860897248481305, "grad_norm": 0.4136146008968353, "learning_rate": 7.217172554439165e-06, "loss": 0.03164714202284813, "memory(GiB)": 21.48, "step": 11885, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.3861222103108859, "grad_norm": 0.6293635964393616, "learning_rate": 7.216691087831099e-06, "loss": 0.02481655776500702, "memory(GiB)": 21.48, "step": 11886, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.3861546957736413, "grad_norm": 1.298460841178894, "learning_rate": 7.21620959563947e-06, "loss": 0.026002902537584305, "memory(GiB)": 21.48, "step": 11887, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.3861871812363967, "grad_norm": 0.5440792441368103, "learning_rate": 7.2157280778698295e-06, "loss": 0.030670098960399628, "memory(GiB)": 21.48, "step": 11888, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.38621966669915214, "grad_norm": 0.5260557532310486, "learning_rate": 7.215246534527739e-06, "loss": 0.02478136494755745, "memory(GiB)": 21.48, "step": 11889, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.38625215216190756, "grad_norm": 0.4259079098701477, "learning_rate": 7.214764965618755e-06, "loss": 0.0246918722987175, "memory(GiB)": 21.48, "step": 11890, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.386284637624663, "grad_norm": 0.311979740858078, "learning_rate": 7.214283371148436e-06, "loss": 0.021434631198644638, "memory(GiB)": 21.48, "step": 11891, "token_acc": 1.0, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.3863171230874184, "grad_norm": 0.44920969009399414, "learning_rate": 7.213801751122338e-06, "loss": 0.03143572062253952, "memory(GiB)": 21.48, "step": 11892, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.3863496085501738, "grad_norm": 0.40908554196357727, "learning_rate": 7.213320105546023e-06, "loss": 0.029245495796203613, "memory(GiB)": 21.48, "step": 11893, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.3863820940129292, "grad_norm": 0.36557549238204956, "learning_rate": 7.212838434425048e-06, "loss": 0.028462599962949753, "memory(GiB)": 21.48, "step": 11894, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.38641457947568464, "grad_norm": 0.3291287422180176, "learning_rate": 7.212356737764971e-06, "loss": 0.025159472599625587, "memory(GiB)": 21.48, "step": 11895, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.957066 }, { "epoch": 0.38644706493844005, "grad_norm": 0.3961528539657593, "learning_rate": 7.211875015571353e-06, "loss": 0.032348789274692535, "memory(GiB)": 21.48, "step": 11896, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.38647955040119547, "grad_norm": 0.3190504014492035, "learning_rate": 7.211393267849753e-06, "loss": 0.02104959264397621, "memory(GiB)": 21.48, "step": 11897, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.3865120358639509, "grad_norm": 0.27518385648727417, "learning_rate": 7.210911494605732e-06, "loss": 0.01769283413887024, "memory(GiB)": 21.48, "step": 11898, "token_acc": 1.0, "train_speed(iter/s)": 0.957115 }, { "epoch": 0.3865445213267063, "grad_norm": 0.3599061667919159, "learning_rate": 7.2104296958448504e-06, "loss": 0.02735431119799614, "memory(GiB)": 21.48, "step": 11899, "token_acc": 0.9759450171821306, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.3865770067894617, "grad_norm": 0.6500585675239563, "learning_rate": 7.209947871572667e-06, "loss": 0.019797008484601974, "memory(GiB)": 21.48, "step": 11900, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.38660949225221714, "grad_norm": 0.44071224331855774, "learning_rate": 7.209466021794744e-06, "loss": 0.026277896016836166, "memory(GiB)": 21.48, "step": 11901, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.38664197771497255, "grad_norm": 0.6684126853942871, "learning_rate": 7.208984146516641e-06, "loss": 0.03223419934511185, "memory(GiB)": 21.48, "step": 11902, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.957185 }, { "epoch": 0.38667446317772797, "grad_norm": 0.35334306955337524, "learning_rate": 7.208502245743922e-06, "loss": 0.02647210843861103, "memory(GiB)": 21.48, "step": 11903, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.3867069486404834, "grad_norm": 0.422760933637619, "learning_rate": 7.208020319482147e-06, "loss": 0.03143637627363205, "memory(GiB)": 21.48, "step": 11904, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.3867394341032388, "grad_norm": 0.6321749687194824, "learning_rate": 7.2075383677368805e-06, "loss": 0.037375688552856445, "memory(GiB)": 21.48, "step": 11905, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.3867719195659942, "grad_norm": 0.32500582933425903, "learning_rate": 7.207056390513682e-06, "loss": 0.022069266065955162, "memory(GiB)": 21.48, "step": 11906, "token_acc": 0.9739583333333334, "train_speed(iter/s)": 0.957255 }, { "epoch": 0.38680440502874963, "grad_norm": 0.3259343206882477, "learning_rate": 7.2065743878181145e-06, "loss": 0.028362195938825607, "memory(GiB)": 21.48, "step": 11907, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.38683689049150505, "grad_norm": 0.45925000309944153, "learning_rate": 7.206092359655743e-06, "loss": 0.032924994826316833, "memory(GiB)": 21.48, "step": 11908, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957289 }, { "epoch": 0.38686937595426046, "grad_norm": 0.4799860119819641, "learning_rate": 7.205610306032127e-06, "loss": 0.02823076769709587, "memory(GiB)": 21.48, "step": 11909, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.957305 }, { "epoch": 0.3869018614170159, "grad_norm": 0.6039271354675293, "learning_rate": 7.205128226952835e-06, "loss": 0.02645145170390606, "memory(GiB)": 21.48, "step": 11910, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.3869343468797713, "grad_norm": 0.27362528443336487, "learning_rate": 7.2046461224234265e-06, "loss": 0.021849844604730606, "memory(GiB)": 21.48, "step": 11911, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.3869668323425267, "grad_norm": 0.3555663526058197, "learning_rate": 7.204163992449469e-06, "loss": 0.026622842997312546, "memory(GiB)": 21.48, "step": 11912, "token_acc": 0.984, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.38699931780528213, "grad_norm": 0.3489309251308441, "learning_rate": 7.203681837036523e-06, "loss": 0.026485417038202286, "memory(GiB)": 21.48, "step": 11913, "token_acc": 1.0, "train_speed(iter/s)": 0.957363 }, { "epoch": 0.38703180326803754, "grad_norm": 0.3358446955680847, "learning_rate": 7.203199656190156e-06, "loss": 0.027231235057115555, "memory(GiB)": 21.48, "step": 11914, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.38706428873079296, "grad_norm": 0.400149405002594, "learning_rate": 7.202717449915934e-06, "loss": 0.027561280876398087, "memory(GiB)": 21.48, "step": 11915, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.3870967741935484, "grad_norm": 0.2705209255218506, "learning_rate": 7.2022352182194196e-06, "loss": 0.022092249244451523, "memory(GiB)": 21.48, "step": 11916, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.3871292596563038, "grad_norm": 2.916020154953003, "learning_rate": 7.20175296110618e-06, "loss": 0.03867453336715698, "memory(GiB)": 21.48, "step": 11917, "token_acc": 1.0, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.3871617451190592, "grad_norm": 0.8053995966911316, "learning_rate": 7.20127067858178e-06, "loss": 0.02747856266796589, "memory(GiB)": 21.48, "step": 11918, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.3871942305818146, "grad_norm": 0.30147549510002136, "learning_rate": 7.200788370651788e-06, "loss": 0.027360405772924423, "memory(GiB)": 21.48, "step": 11919, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.38722671604457004, "grad_norm": 0.3632841110229492, "learning_rate": 7.2003060373217684e-06, "loss": 0.022159256041049957, "memory(GiB)": 21.48, "step": 11920, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.38725920150732546, "grad_norm": 0.6340256929397583, "learning_rate": 7.1998236785972885e-06, "loss": 0.028628207743167877, "memory(GiB)": 21.48, "step": 11921, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.3872916869700809, "grad_norm": 0.35746878385543823, "learning_rate": 7.199341294483916e-06, "loss": 0.027049411088228226, "memory(GiB)": 21.48, "step": 11922, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.3873241724328363, "grad_norm": 0.4475976526737213, "learning_rate": 7.198858884987215e-06, "loss": 0.02654683217406273, "memory(GiB)": 21.48, "step": 11923, "token_acc": 1.0, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.3873566578955917, "grad_norm": 0.49050650000572205, "learning_rate": 7.198376450112758e-06, "loss": 0.034178074449300766, "memory(GiB)": 21.48, "step": 11924, "token_acc": 1.0, "train_speed(iter/s)": 0.95749 }, { "epoch": 0.3873891433583471, "grad_norm": 0.43209391832351685, "learning_rate": 7.19789398986611e-06, "loss": 0.026735860854387283, "memory(GiB)": 21.48, "step": 11925, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.9575 }, { "epoch": 0.38742162882110254, "grad_norm": 0.41209885478019714, "learning_rate": 7.197411504252841e-06, "loss": 0.02430194616317749, "memory(GiB)": 21.48, "step": 11926, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957511 }, { "epoch": 0.38745411428385795, "grad_norm": 0.3471308648586273, "learning_rate": 7.196928993278518e-06, "loss": 0.021794581785798073, "memory(GiB)": 21.48, "step": 11927, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.38748659974661337, "grad_norm": 0.39273563027381897, "learning_rate": 7.19644645694871e-06, "loss": 0.022623756900429726, "memory(GiB)": 21.48, "step": 11928, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.3875190852093688, "grad_norm": 0.31890761852264404, "learning_rate": 7.195963895268987e-06, "loss": 0.02023974061012268, "memory(GiB)": 21.48, "step": 11929, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957546 }, { "epoch": 0.3875515706721242, "grad_norm": 0.4056374728679657, "learning_rate": 7.195481308244916e-06, "loss": 0.02122386544942856, "memory(GiB)": 21.48, "step": 11930, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.3875840561348796, "grad_norm": 0.38578519225120544, "learning_rate": 7.19499869588207e-06, "loss": 0.02986789122223854, "memory(GiB)": 21.48, "step": 11931, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957568 }, { "epoch": 0.38761654159763503, "grad_norm": 0.43662890791893005, "learning_rate": 7.194516058186018e-06, "loss": 0.02611645683646202, "memory(GiB)": 21.48, "step": 11932, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.957581 }, { "epoch": 0.38764902706039045, "grad_norm": 0.34557437896728516, "learning_rate": 7.19403339516233e-06, "loss": 0.027654066681861877, "memory(GiB)": 21.48, "step": 11933, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.38768151252314587, "grad_norm": 0.3737114667892456, "learning_rate": 7.193550706816574e-06, "loss": 0.029589857906103134, "memory(GiB)": 21.48, "step": 11934, "token_acc": 1.0, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.3877139979859013, "grad_norm": 0.38841360807418823, "learning_rate": 7.193067993154326e-06, "loss": 0.020442117005586624, "memory(GiB)": 21.48, "step": 11935, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.38774648344865675, "grad_norm": 0.24503743648529053, "learning_rate": 7.192585254181154e-06, "loss": 0.016563476994633675, "memory(GiB)": 21.48, "step": 11936, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.38777896891141217, "grad_norm": 0.3312632739543915, "learning_rate": 7.192102489902628e-06, "loss": 0.021857481449842453, "memory(GiB)": 21.48, "step": 11937, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957651 }, { "epoch": 0.3878114543741676, "grad_norm": 0.3874715566635132, "learning_rate": 7.191619700324324e-06, "loss": 0.02485709637403488, "memory(GiB)": 21.48, "step": 11938, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.387843939836923, "grad_norm": 0.521484911441803, "learning_rate": 7.191136885451809e-06, "loss": 0.03484013304114342, "memory(GiB)": 21.48, "step": 11939, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957683 }, { "epoch": 0.3878764252996784, "grad_norm": 0.43497782945632935, "learning_rate": 7.190654045290659e-06, "loss": 0.028267163783311844, "memory(GiB)": 21.48, "step": 11940, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.38790891076243383, "grad_norm": 0.3184722661972046, "learning_rate": 7.190171179846446e-06, "loss": 0.032232411205768585, "memory(GiB)": 21.48, "step": 11941, "token_acc": 0.976303317535545, "train_speed(iter/s)": 0.957716 }, { "epoch": 0.38794139622518925, "grad_norm": 0.5220504999160767, "learning_rate": 7.1896882891247435e-06, "loss": 0.04237721860408783, "memory(GiB)": 21.48, "step": 11942, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.38797388168794467, "grad_norm": 0.4210231304168701, "learning_rate": 7.18920537313112e-06, "loss": 0.02977924793958664, "memory(GiB)": 21.48, "step": 11943, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.3880063671507001, "grad_norm": 0.3944474458694458, "learning_rate": 7.1887224318711565e-06, "loss": 0.03353565186262131, "memory(GiB)": 21.48, "step": 11944, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.3880388526134555, "grad_norm": 0.3797084093093872, "learning_rate": 7.188239465350421e-06, "loss": 0.035608164966106415, "memory(GiB)": 21.48, "step": 11945, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.3880713380762109, "grad_norm": 0.413341224193573, "learning_rate": 7.18775647357449e-06, "loss": 0.029227077960968018, "memory(GiB)": 21.48, "step": 11946, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.38810382353896633, "grad_norm": 0.336290568113327, "learning_rate": 7.1872734565489375e-06, "loss": 0.02578818053007126, "memory(GiB)": 21.48, "step": 11947, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.38813630900172175, "grad_norm": 0.3703747093677521, "learning_rate": 7.186790414279338e-06, "loss": 0.022219780832529068, "memory(GiB)": 21.48, "step": 11948, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.38816879446447716, "grad_norm": 0.3866904675960541, "learning_rate": 7.186307346771266e-06, "loss": 0.015747644007205963, "memory(GiB)": 21.48, "step": 11949, "token_acc": 1.0, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.3882012799272326, "grad_norm": 0.49940773844718933, "learning_rate": 7.185824254030298e-06, "loss": 0.02875984087586403, "memory(GiB)": 21.48, "step": 11950, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.388233765389988, "grad_norm": 0.5145270228385925, "learning_rate": 7.185341136062009e-06, "loss": 0.030368095263838768, "memory(GiB)": 21.48, "step": 11951, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.3882662508527434, "grad_norm": 0.44722115993499756, "learning_rate": 7.184857992871974e-06, "loss": 0.03407195210456848, "memory(GiB)": 21.48, "step": 11952, "token_acc": 0.984, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.3882987363154988, "grad_norm": 0.484666109085083, "learning_rate": 7.184374824465769e-06, "loss": 0.027333790436387062, "memory(GiB)": 21.48, "step": 11953, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.38833122177825424, "grad_norm": 0.4565039575099945, "learning_rate": 7.183891630848974e-06, "loss": 0.027164677157998085, "memory(GiB)": 21.48, "step": 11954, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.38836370724100966, "grad_norm": 0.36119404435157776, "learning_rate": 7.183408412027159e-06, "loss": 0.026777835562825203, "memory(GiB)": 21.48, "step": 11955, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957956 }, { "epoch": 0.3883961927037651, "grad_norm": 0.3510867953300476, "learning_rate": 7.182925168005909e-06, "loss": 0.025474969297647476, "memory(GiB)": 21.48, "step": 11956, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.3884286781665205, "grad_norm": 0.4437721073627472, "learning_rate": 7.182441898790793e-06, "loss": 0.025854777544736862, "memory(GiB)": 21.48, "step": 11957, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.3884611636292759, "grad_norm": 0.3548940122127533, "learning_rate": 7.181958604387396e-06, "loss": 0.025888439267873764, "memory(GiB)": 21.48, "step": 11958, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.3884936490920313, "grad_norm": 0.352899432182312, "learning_rate": 7.181475284801289e-06, "loss": 0.0210496224462986, "memory(GiB)": 21.48, "step": 11959, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.958023 }, { "epoch": 0.38852613455478674, "grad_norm": 0.47084417939186096, "learning_rate": 7.180991940038056e-06, "loss": 0.022383354604244232, "memory(GiB)": 21.48, "step": 11960, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.38855862001754216, "grad_norm": 0.5020252466201782, "learning_rate": 7.1805085701032724e-06, "loss": 0.03236694261431694, "memory(GiB)": 21.48, "step": 11961, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.38859110548029757, "grad_norm": 0.4466695785522461, "learning_rate": 7.1800251750025185e-06, "loss": 0.021418659016489983, "memory(GiB)": 21.48, "step": 11962, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.388623590943053, "grad_norm": 0.6132325530052185, "learning_rate": 7.1795417547413725e-06, "loss": 0.035966210067272186, "memory(GiB)": 21.48, "step": 11963, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.958092 }, { "epoch": 0.3886560764058084, "grad_norm": 0.31017789244651794, "learning_rate": 7.179058309325413e-06, "loss": 0.021795162931084633, "memory(GiB)": 21.48, "step": 11964, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958109 }, { "epoch": 0.3886885618685638, "grad_norm": 0.4248221516609192, "learning_rate": 7.17857483876022e-06, "loss": 0.031241565942764282, "memory(GiB)": 21.48, "step": 11965, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.38872104733131924, "grad_norm": 0.6001425385475159, "learning_rate": 7.178091343051375e-06, "loss": 0.035214703530073166, "memory(GiB)": 21.48, "step": 11966, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.958144 }, { "epoch": 0.38875353279407465, "grad_norm": 0.48563098907470703, "learning_rate": 7.1776078222044554e-06, "loss": 0.029791153967380524, "memory(GiB)": 21.48, "step": 11967, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.958162 }, { "epoch": 0.38878601825683007, "grad_norm": 0.4231630265712738, "learning_rate": 7.177124276225045e-06, "loss": 0.023734495043754578, "memory(GiB)": 21.48, "step": 11968, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.3888185037195855, "grad_norm": 0.4079066514968872, "learning_rate": 7.176640705118721e-06, "loss": 0.01907121017575264, "memory(GiB)": 21.48, "step": 11969, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.3888509891823409, "grad_norm": 0.38538751006126404, "learning_rate": 7.176157108891067e-06, "loss": 0.03232646733522415, "memory(GiB)": 21.48, "step": 11970, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958214 }, { "epoch": 0.3888834746450963, "grad_norm": 0.3564913868904114, "learning_rate": 7.175673487547662e-06, "loss": 0.028993407264351845, "memory(GiB)": 21.48, "step": 11971, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.38891596010785173, "grad_norm": 0.32484498620033264, "learning_rate": 7.17518984109409e-06, "loss": 0.030340436846017838, "memory(GiB)": 21.48, "step": 11972, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.38894844557060715, "grad_norm": 0.9633527994155884, "learning_rate": 7.174706169535932e-06, "loss": 0.03312164545059204, "memory(GiB)": 21.48, "step": 11973, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.38898093103336256, "grad_norm": 0.3767084777355194, "learning_rate": 7.174222472878771e-06, "loss": 0.02773980051279068, "memory(GiB)": 21.48, "step": 11974, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.958272 }, { "epoch": 0.389013416496118, "grad_norm": 0.6441879272460938, "learning_rate": 7.17373875112819e-06, "loss": 0.03468628600239754, "memory(GiB)": 21.48, "step": 11975, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.958285 }, { "epoch": 0.3890459019588734, "grad_norm": 0.3917916715145111, "learning_rate": 7.173255004289768e-06, "loss": 0.031024666503071785, "memory(GiB)": 21.48, "step": 11976, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.958299 }, { "epoch": 0.3890783874216288, "grad_norm": 0.3555976152420044, "learning_rate": 7.172771232369091e-06, "loss": 0.02157895267009735, "memory(GiB)": 21.48, "step": 11977, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.95831 }, { "epoch": 0.38911087288438423, "grad_norm": 0.4231288731098175, "learning_rate": 7.172287435371742e-06, "loss": 0.029760271310806274, "memory(GiB)": 21.48, "step": 11978, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958321 }, { "epoch": 0.38914335834713965, "grad_norm": 0.31679707765579224, "learning_rate": 7.1718036133033055e-06, "loss": 0.019041575491428375, "memory(GiB)": 21.48, "step": 11979, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958333 }, { "epoch": 0.38917584380989506, "grad_norm": 0.36096200346946716, "learning_rate": 7.171319766169365e-06, "loss": 0.017108209431171417, "memory(GiB)": 21.48, "step": 11980, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.3892083292726505, "grad_norm": 0.4168833792209625, "learning_rate": 7.170835893975504e-06, "loss": 0.02009805291891098, "memory(GiB)": 21.48, "step": 11981, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.3892408147354059, "grad_norm": 1.99380362033844, "learning_rate": 7.170351996727307e-06, "loss": 0.02423301711678505, "memory(GiB)": 21.48, "step": 11982, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.958369 }, { "epoch": 0.3892733001981613, "grad_norm": 0.31454846262931824, "learning_rate": 7.169868074430359e-06, "loss": 0.022000160068273544, "memory(GiB)": 21.48, "step": 11983, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.3893057856609167, "grad_norm": 0.2266257405281067, "learning_rate": 7.169384127090246e-06, "loss": 0.01715010218322277, "memory(GiB)": 21.48, "step": 11984, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.958392 }, { "epoch": 0.38933827112367214, "grad_norm": 0.36244627833366394, "learning_rate": 7.1689001547125525e-06, "loss": 0.03377356380224228, "memory(GiB)": 21.48, "step": 11985, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.958403 }, { "epoch": 0.38937075658642756, "grad_norm": 0.4428814947605133, "learning_rate": 7.168416157302866e-06, "loss": 0.02347564324736595, "memory(GiB)": 21.48, "step": 11986, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958415 }, { "epoch": 0.389403242049183, "grad_norm": 0.49083584547042847, "learning_rate": 7.167932134866768e-06, "loss": 0.0343245193362236, "memory(GiB)": 21.48, "step": 11987, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958427 }, { "epoch": 0.3894357275119384, "grad_norm": 0.26737159490585327, "learning_rate": 7.167448087409849e-06, "loss": 0.01879776641726494, "memory(GiB)": 21.48, "step": 11988, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.958438 }, { "epoch": 0.3894682129746938, "grad_norm": 0.46350380778312683, "learning_rate": 7.166964014937695e-06, "loss": 0.029176140204072, "memory(GiB)": 21.48, "step": 11989, "token_acc": 1.0, "train_speed(iter/s)": 0.95845 }, { "epoch": 0.3895006984374492, "grad_norm": 0.3535623848438263, "learning_rate": 7.166479917455891e-06, "loss": 0.02453562058508396, "memory(GiB)": 21.48, "step": 11990, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958459 }, { "epoch": 0.38953318390020464, "grad_norm": 0.28443673253059387, "learning_rate": 7.165995794970024e-06, "loss": 0.019882548600435257, "memory(GiB)": 21.48, "step": 11991, "token_acc": 1.0, "train_speed(iter/s)": 0.958469 }, { "epoch": 0.38956566936296005, "grad_norm": 0.5845165252685547, "learning_rate": 7.165511647485684e-06, "loss": 0.029436323791742325, "memory(GiB)": 21.48, "step": 11992, "token_acc": 1.0, "train_speed(iter/s)": 0.958483 }, { "epoch": 0.38959815482571547, "grad_norm": 0.44620612263679504, "learning_rate": 7.165027475008458e-06, "loss": 0.02755436860024929, "memory(GiB)": 21.48, "step": 11993, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.3896306402884709, "grad_norm": 0.6496707797050476, "learning_rate": 7.164543277543933e-06, "loss": 0.033815380185842514, "memory(GiB)": 21.48, "step": 11994, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958511 }, { "epoch": 0.3896631257512263, "grad_norm": 0.3869825601577759, "learning_rate": 7.164059055097696e-06, "loss": 0.034783460199832916, "memory(GiB)": 21.48, "step": 11995, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.958522 }, { "epoch": 0.3896956112139817, "grad_norm": 0.4242071509361267, "learning_rate": 7.1635748076753395e-06, "loss": 0.02481781877577305, "memory(GiB)": 21.48, "step": 11996, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.958536 }, { "epoch": 0.38972809667673713, "grad_norm": 0.30511611700057983, "learning_rate": 7.163090535282448e-06, "loss": 0.025371547788381577, "memory(GiB)": 21.48, "step": 11997, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95855 }, { "epoch": 0.38976058213949255, "grad_norm": 0.4279342591762543, "learning_rate": 7.162606237924614e-06, "loss": 0.021829472854733467, "memory(GiB)": 21.48, "step": 11998, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.958565 }, { "epoch": 0.38979306760224797, "grad_norm": 0.5319545269012451, "learning_rate": 7.162121915607425e-06, "loss": 0.02766594849526882, "memory(GiB)": 21.48, "step": 11999, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958578 }, { "epoch": 0.38982555306500344, "grad_norm": 0.5159096121788025, "learning_rate": 7.161637568336471e-06, "loss": 0.02814764901995659, "memory(GiB)": 21.48, "step": 12000, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.958593 }, { "epoch": 0.38982555306500344, "eval_loss": 0.02622402273118496, "eval_runtime": 80.2011, "eval_samples_per_second": 124.063, "eval_steps_per_second": 3.878, "eval_token_acc": 0.9896982583312312, "step": 12000 }, { "epoch": 0.38985803852775885, "grad_norm": 0.3211595416069031, "learning_rate": 7.161153196117343e-06, "loss": 0.021948523819446564, "memory(GiB)": 21.48, "step": 12001, "token_acc": 0.9893717557147357, "train_speed(iter/s)": 0.951672 }, { "epoch": 0.38989052399051427, "grad_norm": 0.31613224744796753, "learning_rate": 7.16066879895563e-06, "loss": 0.024646498262882233, "memory(GiB)": 21.48, "step": 12002, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.951685 }, { "epoch": 0.3899230094532697, "grad_norm": 0.3205096423625946, "learning_rate": 7.160184376856924e-06, "loss": 0.021464642137289047, "memory(GiB)": 21.48, "step": 12003, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.951697 }, { "epoch": 0.3899554949160251, "grad_norm": 0.27432286739349365, "learning_rate": 7.1596999298268154e-06, "loss": 0.021469835191965103, "memory(GiB)": 21.48, "step": 12004, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.95171 }, { "epoch": 0.3899879803787805, "grad_norm": 0.39755910634994507, "learning_rate": 7.1592154578708935e-06, "loss": 0.021561643108725548, "memory(GiB)": 21.48, "step": 12005, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.951724 }, { "epoch": 0.39002046584153593, "grad_norm": 0.6393887400627136, "learning_rate": 7.158730960994751e-06, "loss": 0.034550122916698456, "memory(GiB)": 21.48, "step": 12006, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.951742 }, { "epoch": 0.39005295130429135, "grad_norm": 0.8328821659088135, "learning_rate": 7.1582464392039816e-06, "loss": 0.039219461381435394, "memory(GiB)": 21.48, "step": 12007, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.951757 }, { "epoch": 0.39008543676704677, "grad_norm": 0.7886665463447571, "learning_rate": 7.157761892504176e-06, "loss": 0.03014436550438404, "memory(GiB)": 21.48, "step": 12008, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.951774 }, { "epoch": 0.3901179222298022, "grad_norm": 0.39423444867134094, "learning_rate": 7.157277320900925e-06, "loss": 0.027874823659658432, "memory(GiB)": 21.48, "step": 12009, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.95179 }, { "epoch": 0.3901504076925576, "grad_norm": 0.3650679886341095, "learning_rate": 7.156792724399823e-06, "loss": 0.02582928165793419, "memory(GiB)": 21.48, "step": 12010, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.951807 }, { "epoch": 0.390182893155313, "grad_norm": 0.42717933654785156, "learning_rate": 7.156308103006462e-06, "loss": 0.027210291475057602, "memory(GiB)": 21.48, "step": 12011, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.951825 }, { "epoch": 0.39021537861806843, "grad_norm": 0.42118826508522034, "learning_rate": 7.1558234567264355e-06, "loss": 0.03309483826160431, "memory(GiB)": 21.48, "step": 12012, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.951841 }, { "epoch": 0.39024786408082385, "grad_norm": 0.35629889369010925, "learning_rate": 7.1553387855653355e-06, "loss": 0.0231475867331028, "memory(GiB)": 21.48, "step": 12013, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.951858 }, { "epoch": 0.39028034954357926, "grad_norm": 0.452655166387558, "learning_rate": 7.15485408952876e-06, "loss": 0.02997700124979019, "memory(GiB)": 21.48, "step": 12014, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.951875 }, { "epoch": 0.3903128350063347, "grad_norm": 0.3418913781642914, "learning_rate": 7.1543693686222985e-06, "loss": 0.02227913774549961, "memory(GiB)": 21.48, "step": 12015, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.951891 }, { "epoch": 0.3903453204690901, "grad_norm": 0.37052640318870544, "learning_rate": 7.1538846228515465e-06, "loss": 0.02823452651500702, "memory(GiB)": 21.48, "step": 12016, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.951906 }, { "epoch": 0.3903778059318455, "grad_norm": 0.9162788987159729, "learning_rate": 7.1533998522221e-06, "loss": 0.03765229135751724, "memory(GiB)": 21.48, "step": 12017, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.9519 }, { "epoch": 0.39041029139460093, "grad_norm": 0.3058721721172333, "learning_rate": 7.152915056739553e-06, "loss": 0.02181248739361763, "memory(GiB)": 21.48, "step": 12018, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.951915 }, { "epoch": 0.39044277685735634, "grad_norm": 0.5808916687965393, "learning_rate": 7.152430236409502e-06, "loss": 0.027300599962472916, "memory(GiB)": 21.48, "step": 12019, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.951927 }, { "epoch": 0.39047526232011176, "grad_norm": 0.4981369972229004, "learning_rate": 7.1519453912375395e-06, "loss": 0.03414887934923172, "memory(GiB)": 21.48, "step": 12020, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.951942 }, { "epoch": 0.3905077477828672, "grad_norm": 0.294134259223938, "learning_rate": 7.151460521229263e-06, "loss": 0.022856716066598892, "memory(GiB)": 21.48, "step": 12021, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.951956 }, { "epoch": 0.3905402332456226, "grad_norm": 0.5564053058624268, "learning_rate": 7.150975626390269e-06, "loss": 0.03133027255535126, "memory(GiB)": 21.48, "step": 12022, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.951971 }, { "epoch": 0.390572718708378, "grad_norm": 0.4020495116710663, "learning_rate": 7.150490706726152e-06, "loss": 0.026479128748178482, "memory(GiB)": 21.48, "step": 12023, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.951985 }, { "epoch": 0.3906052041711334, "grad_norm": 0.47918036580085754, "learning_rate": 7.150005762242511e-06, "loss": 0.028999146074056625, "memory(GiB)": 21.48, "step": 12024, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.951998 }, { "epoch": 0.39063768963388884, "grad_norm": 0.38150373101234436, "learning_rate": 7.1495207929449414e-06, "loss": 0.03325551748275757, "memory(GiB)": 21.48, "step": 12025, "token_acc": 0.984, "train_speed(iter/s)": 0.952011 }, { "epoch": 0.39067017509664426, "grad_norm": 0.4106820225715637, "learning_rate": 7.149035798839042e-06, "loss": 0.028959836810827255, "memory(GiB)": 21.48, "step": 12026, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.952024 }, { "epoch": 0.3907026605593997, "grad_norm": 0.34548813104629517, "learning_rate": 7.148550779930409e-06, "loss": 0.024936050176620483, "memory(GiB)": 21.48, "step": 12027, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.952037 }, { "epoch": 0.3907351460221551, "grad_norm": 0.345380038022995, "learning_rate": 7.1480657362246395e-06, "loss": 0.028571801260113716, "memory(GiB)": 21.48, "step": 12028, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.952051 }, { "epoch": 0.3907676314849105, "grad_norm": 0.5250491499900818, "learning_rate": 7.147580667727332e-06, "loss": 0.0385856069624424, "memory(GiB)": 21.48, "step": 12029, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.952065 }, { "epoch": 0.3908001169476659, "grad_norm": 0.4344976544380188, "learning_rate": 7.147095574444086e-06, "loss": 0.02819230780005455, "memory(GiB)": 21.48, "step": 12030, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95208 }, { "epoch": 0.39083260241042134, "grad_norm": 0.4902648329734802, "learning_rate": 7.146610456380501e-06, "loss": 0.03789235278964043, "memory(GiB)": 21.48, "step": 12031, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.952093 }, { "epoch": 0.39086508787317675, "grad_norm": 0.7984194159507751, "learning_rate": 7.1461253135421704e-06, "loss": 0.02340519428253174, "memory(GiB)": 21.48, "step": 12032, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.952107 }, { "epoch": 0.39089757333593217, "grad_norm": 0.3991502821445465, "learning_rate": 7.1456401459347e-06, "loss": 0.02105966955423355, "memory(GiB)": 21.48, "step": 12033, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.952121 }, { "epoch": 0.3909300587986876, "grad_norm": 0.34577634930610657, "learning_rate": 7.1451549535636845e-06, "loss": 0.02632615901529789, "memory(GiB)": 21.48, "step": 12034, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.952134 }, { "epoch": 0.390962544261443, "grad_norm": 0.527667760848999, "learning_rate": 7.144669736434728e-06, "loss": 0.03838974982500076, "memory(GiB)": 21.48, "step": 12035, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.952145 }, { "epoch": 0.3909950297241984, "grad_norm": 0.5398395657539368, "learning_rate": 7.144184494553426e-06, "loss": 0.028387561440467834, "memory(GiB)": 21.48, "step": 12036, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.952158 }, { "epoch": 0.39102751518695383, "grad_norm": 0.4322368800640106, "learning_rate": 7.1436992279253815e-06, "loss": 0.031615033745765686, "memory(GiB)": 21.48, "step": 12037, "token_acc": 0.984, "train_speed(iter/s)": 0.952173 }, { "epoch": 0.39106000064970925, "grad_norm": 0.4629628658294678, "learning_rate": 7.143213936556195e-06, "loss": 0.03527066856622696, "memory(GiB)": 21.48, "step": 12038, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.952185 }, { "epoch": 0.39109248611246467, "grad_norm": 0.3011372685432434, "learning_rate": 7.142728620451467e-06, "loss": 0.02420281618833542, "memory(GiB)": 21.48, "step": 12039, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.9522 }, { "epoch": 0.3911249715752201, "grad_norm": 0.4460855722427368, "learning_rate": 7.142243279616799e-06, "loss": 0.02019769698381424, "memory(GiB)": 21.48, "step": 12040, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.952215 }, { "epoch": 0.3911574570379755, "grad_norm": 0.38020944595336914, "learning_rate": 7.141757914057792e-06, "loss": 0.02357875183224678, "memory(GiB)": 21.48, "step": 12041, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.952231 }, { "epoch": 0.3911899425007309, "grad_norm": 0.41887950897216797, "learning_rate": 7.141272523780049e-06, "loss": 0.022407177835702896, "memory(GiB)": 21.48, "step": 12042, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.952248 }, { "epoch": 0.39122242796348633, "grad_norm": 0.4681752920150757, "learning_rate": 7.140787108789169e-06, "loss": 0.02882222831249237, "memory(GiB)": 21.48, "step": 12043, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.952266 }, { "epoch": 0.39125491342624175, "grad_norm": 0.4249557852745056, "learning_rate": 7.140301669090758e-06, "loss": 0.02790970727801323, "memory(GiB)": 21.48, "step": 12044, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.952279 }, { "epoch": 0.39128739888899716, "grad_norm": 0.3989163041114807, "learning_rate": 7.139816204690417e-06, "loss": 0.02385825291275978, "memory(GiB)": 21.48, "step": 12045, "token_acc": 1.0, "train_speed(iter/s)": 0.952291 }, { "epoch": 0.3913198843517526, "grad_norm": 0.40528106689453125, "learning_rate": 7.139330715593748e-06, "loss": 0.0276519563049078, "memory(GiB)": 21.48, "step": 12046, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952304 }, { "epoch": 0.391352369814508, "grad_norm": 0.3965646028518677, "learning_rate": 7.138845201806356e-06, "loss": 0.027114704251289368, "memory(GiB)": 21.48, "step": 12047, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.952318 }, { "epoch": 0.3913848552772634, "grad_norm": 0.74223393201828, "learning_rate": 7.138359663333843e-06, "loss": 0.026500793173909187, "memory(GiB)": 21.48, "step": 12048, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95233 }, { "epoch": 0.3914173407400188, "grad_norm": 0.402994841337204, "learning_rate": 7.137874100181814e-06, "loss": 0.025436967611312866, "memory(GiB)": 21.48, "step": 12049, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.952342 }, { "epoch": 0.39144982620277424, "grad_norm": 0.5047913789749146, "learning_rate": 7.137388512355872e-06, "loss": 0.029084328562021255, "memory(GiB)": 21.48, "step": 12050, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.952355 }, { "epoch": 0.39148231166552966, "grad_norm": 0.3073522746562958, "learning_rate": 7.13690289986162e-06, "loss": 0.021838214248418808, "memory(GiB)": 21.48, "step": 12051, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.952368 }, { "epoch": 0.3915147971282851, "grad_norm": 0.387207567691803, "learning_rate": 7.136417262704667e-06, "loss": 0.026228923350572586, "memory(GiB)": 21.48, "step": 12052, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.952381 }, { "epoch": 0.3915472825910405, "grad_norm": 0.702877402305603, "learning_rate": 7.135931600890613e-06, "loss": 0.027348864823579788, "memory(GiB)": 21.48, "step": 12053, "token_acc": 1.0, "train_speed(iter/s)": 0.952395 }, { "epoch": 0.3915797680537959, "grad_norm": 0.6143993735313416, "learning_rate": 7.135445914425068e-06, "loss": 0.025966022163629532, "memory(GiB)": 21.48, "step": 12054, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952408 }, { "epoch": 0.3916122535165513, "grad_norm": 0.22632503509521484, "learning_rate": 7.134960203313633e-06, "loss": 0.02317938581109047, "memory(GiB)": 21.48, "step": 12055, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.952419 }, { "epoch": 0.39164473897930674, "grad_norm": 0.36986586451530457, "learning_rate": 7.134474467561916e-06, "loss": 0.024267688393592834, "memory(GiB)": 21.48, "step": 12056, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.952433 }, { "epoch": 0.39167722444206216, "grad_norm": 0.4085307717323303, "learning_rate": 7.133988707175521e-06, "loss": 0.02998536452651024, "memory(GiB)": 21.48, "step": 12057, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.952448 }, { "epoch": 0.39170970990481757, "grad_norm": 0.36740782856941223, "learning_rate": 7.1335029221600585e-06, "loss": 0.029293086379766464, "memory(GiB)": 21.48, "step": 12058, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.952461 }, { "epoch": 0.391742195367573, "grad_norm": 0.4767114520072937, "learning_rate": 7.133017112521132e-06, "loss": 0.027175821363925934, "memory(GiB)": 21.48, "step": 12059, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.952474 }, { "epoch": 0.3917746808303284, "grad_norm": 0.39970657229423523, "learning_rate": 7.132531278264349e-06, "loss": 0.03331909328699112, "memory(GiB)": 21.48, "step": 12060, "token_acc": 0.9966216216216216, "train_speed(iter/s)": 0.952488 }, { "epoch": 0.3918071662930838, "grad_norm": 0.2549106776714325, "learning_rate": 7.132045419395317e-06, "loss": 0.015028133988380432, "memory(GiB)": 21.48, "step": 12061, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.9525 }, { "epoch": 0.39183965175583924, "grad_norm": 0.46371519565582275, "learning_rate": 7.131559535919641e-06, "loss": 0.03481991961598396, "memory(GiB)": 21.48, "step": 12062, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.952514 }, { "epoch": 0.39187213721859465, "grad_norm": 0.3301302194595337, "learning_rate": 7.131073627842933e-06, "loss": 0.023925790563225746, "memory(GiB)": 21.48, "step": 12063, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.952529 }, { "epoch": 0.3919046226813501, "grad_norm": 0.4230920672416687, "learning_rate": 7.130587695170799e-06, "loss": 0.019912034273147583, "memory(GiB)": 21.48, "step": 12064, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.952543 }, { "epoch": 0.39193710814410554, "grad_norm": 0.3773188591003418, "learning_rate": 7.130101737908846e-06, "loss": 0.028004631400108337, "memory(GiB)": 21.48, "step": 12065, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.95256 }, { "epoch": 0.39196959360686096, "grad_norm": 0.44587182998657227, "learning_rate": 7.129615756062683e-06, "loss": 0.02449464052915573, "memory(GiB)": 21.48, "step": 12066, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.952577 }, { "epoch": 0.39200207906961637, "grad_norm": 0.36866000294685364, "learning_rate": 7.129129749637922e-06, "loss": 0.022375287488102913, "memory(GiB)": 21.48, "step": 12067, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.952594 }, { "epoch": 0.3920345645323718, "grad_norm": 0.3106241524219513, "learning_rate": 7.128643718640168e-06, "loss": 0.021169910207390785, "memory(GiB)": 21.48, "step": 12068, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95261 }, { "epoch": 0.3920670499951272, "grad_norm": 0.8222217559814453, "learning_rate": 7.128157663075033e-06, "loss": 0.02216382697224617, "memory(GiB)": 21.48, "step": 12069, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952627 }, { "epoch": 0.3920995354578826, "grad_norm": 0.4137743413448334, "learning_rate": 7.1276715829481255e-06, "loss": 0.026414688676595688, "memory(GiB)": 21.48, "step": 12070, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.952644 }, { "epoch": 0.39213202092063804, "grad_norm": 0.5065029859542847, "learning_rate": 7.127185478265056e-06, "loss": 0.023483198136091232, "memory(GiB)": 21.48, "step": 12071, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95266 }, { "epoch": 0.39216450638339345, "grad_norm": 0.44904765486717224, "learning_rate": 7.126699349031436e-06, "loss": 0.03956924378871918, "memory(GiB)": 21.48, "step": 12072, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.952676 }, { "epoch": 0.39219699184614887, "grad_norm": 0.387216180562973, "learning_rate": 7.126213195252876e-06, "loss": 0.027904290705919266, "memory(GiB)": 21.48, "step": 12073, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.952693 }, { "epoch": 0.3922294773089043, "grad_norm": 0.5224136114120483, "learning_rate": 7.125727016934982e-06, "loss": 0.0267182644456625, "memory(GiB)": 21.48, "step": 12074, "token_acc": 0.9751243781094527, "train_speed(iter/s)": 0.95271 }, { "epoch": 0.3922619627716597, "grad_norm": 0.44543036818504333, "learning_rate": 7.125240814083372e-06, "loss": 0.03255157172679901, "memory(GiB)": 21.48, "step": 12075, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.952727 }, { "epoch": 0.3922944482344151, "grad_norm": 0.5762690305709839, "learning_rate": 7.1247545867036525e-06, "loss": 0.02681819535791874, "memory(GiB)": 21.48, "step": 12076, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.952744 }, { "epoch": 0.39232693369717053, "grad_norm": 0.4414564371109009, "learning_rate": 7.124268334801438e-06, "loss": 0.027887709438800812, "memory(GiB)": 21.48, "step": 12077, "token_acc": 0.9941176470588236, "train_speed(iter/s)": 0.952759 }, { "epoch": 0.39235941915992595, "grad_norm": 0.31308436393737793, "learning_rate": 7.123782058382338e-06, "loss": 0.01599757932126522, "memory(GiB)": 21.48, "step": 12078, "token_acc": 0.992, "train_speed(iter/s)": 0.952777 }, { "epoch": 0.39239190462268136, "grad_norm": 0.4574466943740845, "learning_rate": 7.123295757451969e-06, "loss": 0.027254361659288406, "memory(GiB)": 21.48, "step": 12079, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.952793 }, { "epoch": 0.3924243900854368, "grad_norm": 0.46755295991897583, "learning_rate": 7.122809432015938e-06, "loss": 0.021190758794546127, "memory(GiB)": 21.48, "step": 12080, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.952806 }, { "epoch": 0.3924568755481922, "grad_norm": 0.39017006754875183, "learning_rate": 7.1223230820798626e-06, "loss": 0.03358679637312889, "memory(GiB)": 21.48, "step": 12081, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.95282 }, { "epoch": 0.3924893610109476, "grad_norm": 0.40581628680229187, "learning_rate": 7.1218367076493545e-06, "loss": 0.027517620474100113, "memory(GiB)": 21.48, "step": 12082, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.952833 }, { "epoch": 0.39252184647370303, "grad_norm": 0.4132474660873413, "learning_rate": 7.121350308730024e-06, "loss": 0.026532985270023346, "memory(GiB)": 21.48, "step": 12083, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.952846 }, { "epoch": 0.39255433193645844, "grad_norm": 0.4139474630355835, "learning_rate": 7.120863885327489e-06, "loss": 0.03622420132160187, "memory(GiB)": 21.48, "step": 12084, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95286 }, { "epoch": 0.39258681739921386, "grad_norm": 0.4260737895965576, "learning_rate": 7.120377437447361e-06, "loss": 0.023890117183327675, "memory(GiB)": 21.48, "step": 12085, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.952874 }, { "epoch": 0.3926193028619693, "grad_norm": 0.4118371903896332, "learning_rate": 7.119890965095258e-06, "loss": 0.022564100101590157, "memory(GiB)": 21.48, "step": 12086, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952888 }, { "epoch": 0.3926517883247247, "grad_norm": 0.3568600118160248, "learning_rate": 7.11940446827679e-06, "loss": 0.018749570474028587, "memory(GiB)": 21.48, "step": 12087, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.952901 }, { "epoch": 0.3926842737874801, "grad_norm": 0.667026162147522, "learning_rate": 7.118917946997572e-06, "loss": 0.026406768709421158, "memory(GiB)": 21.48, "step": 12088, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.952914 }, { "epoch": 0.3927167592502355, "grad_norm": 0.39632582664489746, "learning_rate": 7.1184314012632225e-06, "loss": 0.026628417894244194, "memory(GiB)": 21.48, "step": 12089, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.952927 }, { "epoch": 0.39274924471299094, "grad_norm": 0.5701071619987488, "learning_rate": 7.117944831079354e-06, "loss": 0.02864106185734272, "memory(GiB)": 21.48, "step": 12090, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.952939 }, { "epoch": 0.39278173017574636, "grad_norm": 0.42801252007484436, "learning_rate": 7.117458236451584e-06, "loss": 0.029673432931303978, "memory(GiB)": 21.48, "step": 12091, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.952952 }, { "epoch": 0.3928142156385018, "grad_norm": 0.6753338575363159, "learning_rate": 7.116971617385527e-06, "loss": 0.03548844903707504, "memory(GiB)": 21.48, "step": 12092, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.952966 }, { "epoch": 0.3928467011012572, "grad_norm": 0.6385966539382935, "learning_rate": 7.1164849738868e-06, "loss": 0.038555875420570374, "memory(GiB)": 21.48, "step": 12093, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.952979 }, { "epoch": 0.3928791865640126, "grad_norm": 0.39603713154792786, "learning_rate": 7.115998305961019e-06, "loss": 0.028015445917844772, "memory(GiB)": 21.48, "step": 12094, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.952992 }, { "epoch": 0.392911672026768, "grad_norm": 0.4409857988357544, "learning_rate": 7.115511613613801e-06, "loss": 0.03480282053351402, "memory(GiB)": 21.48, "step": 12095, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.953005 }, { "epoch": 0.39294415748952344, "grad_norm": 0.6036827564239502, "learning_rate": 7.115024896850765e-06, "loss": 0.041288577020168304, "memory(GiB)": 21.48, "step": 12096, "token_acc": 0.984375, "train_speed(iter/s)": 0.953018 }, { "epoch": 0.39297664295227885, "grad_norm": 0.33424708247184753, "learning_rate": 7.114538155677525e-06, "loss": 0.027829274535179138, "memory(GiB)": 21.48, "step": 12097, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.953031 }, { "epoch": 0.39300912841503427, "grad_norm": 0.3701252341270447, "learning_rate": 7.114051390099701e-06, "loss": 0.03200945258140564, "memory(GiB)": 21.48, "step": 12098, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.953043 }, { "epoch": 0.3930416138777897, "grad_norm": 0.26599177718162537, "learning_rate": 7.113564600122909e-06, "loss": 0.017934519797563553, "memory(GiB)": 21.48, "step": 12099, "token_acc": 0.9939393939393939, "train_speed(iter/s)": 0.953055 }, { "epoch": 0.3930740993405451, "grad_norm": 0.36132851243019104, "learning_rate": 7.113077785752769e-06, "loss": 0.023234069347381592, "memory(GiB)": 21.48, "step": 12100, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953068 }, { "epoch": 0.3931065848033005, "grad_norm": 0.4382087290287018, "learning_rate": 7.1125909469949e-06, "loss": 0.02489183098077774, "memory(GiB)": 21.48, "step": 12101, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.953082 }, { "epoch": 0.39313907026605593, "grad_norm": 0.670595645904541, "learning_rate": 7.1121040838549186e-06, "loss": 0.027653027325868607, "memory(GiB)": 21.48, "step": 12102, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953095 }, { "epoch": 0.39317155572881135, "grad_norm": 0.4954470992088318, "learning_rate": 7.111617196338445e-06, "loss": 0.03870845586061478, "memory(GiB)": 21.48, "step": 12103, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953108 }, { "epoch": 0.39320404119156677, "grad_norm": 1.4294767379760742, "learning_rate": 7.111130284451097e-06, "loss": 0.034001849591732025, "memory(GiB)": 21.48, "step": 12104, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953119 }, { "epoch": 0.3932365266543222, "grad_norm": 0.33123302459716797, "learning_rate": 7.110643348198498e-06, "loss": 0.024836622178554535, "memory(GiB)": 21.48, "step": 12105, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.95313 }, { "epoch": 0.3932690121170776, "grad_norm": 0.39209067821502686, "learning_rate": 7.110156387586264e-06, "loss": 0.029448455199599266, "memory(GiB)": 21.48, "step": 12106, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953144 }, { "epoch": 0.393301497579833, "grad_norm": 0.2813032567501068, "learning_rate": 7.109669402620018e-06, "loss": 0.021775508299469948, "memory(GiB)": 21.48, "step": 12107, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953156 }, { "epoch": 0.39333398304258843, "grad_norm": 0.36510294675827026, "learning_rate": 7.109182393305377e-06, "loss": 0.024664513766765594, "memory(GiB)": 21.48, "step": 12108, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.953168 }, { "epoch": 0.39336646850534385, "grad_norm": 0.3074241578578949, "learning_rate": 7.108695359647966e-06, "loss": 0.022830767557024956, "memory(GiB)": 21.48, "step": 12109, "token_acc": 1.0, "train_speed(iter/s)": 0.953182 }, { "epoch": 0.39339895396809926, "grad_norm": 0.41530585289001465, "learning_rate": 7.108208301653403e-06, "loss": 0.03427411615848541, "memory(GiB)": 21.48, "step": 12110, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.953194 }, { "epoch": 0.3934314394308547, "grad_norm": 0.380446195602417, "learning_rate": 7.10772121932731e-06, "loss": 0.02412855625152588, "memory(GiB)": 21.48, "step": 12111, "token_acc": 1.0, "train_speed(iter/s)": 0.953204 }, { "epoch": 0.3934639248936101, "grad_norm": 0.3518041670322418, "learning_rate": 7.10723411267531e-06, "loss": 0.03196016699075699, "memory(GiB)": 21.48, "step": 12112, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.953217 }, { "epoch": 0.3934964103563655, "grad_norm": 0.30795446038246155, "learning_rate": 7.106746981703021e-06, "loss": 0.0262556504458189, "memory(GiB)": 21.48, "step": 12113, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.953227 }, { "epoch": 0.3935288958191209, "grad_norm": 0.29281798005104065, "learning_rate": 7.10625982641607e-06, "loss": 0.02136111445724964, "memory(GiB)": 21.48, "step": 12114, "token_acc": 1.0, "train_speed(iter/s)": 0.953239 }, { "epoch": 0.39356138128187634, "grad_norm": 0.3331003785133362, "learning_rate": 7.105772646820076e-06, "loss": 0.023379765450954437, "memory(GiB)": 21.48, "step": 12115, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.953252 }, { "epoch": 0.39359386674463176, "grad_norm": 0.6092410683631897, "learning_rate": 7.105285442920663e-06, "loss": 0.0289461687207222, "memory(GiB)": 21.48, "step": 12116, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.953262 }, { "epoch": 0.3936263522073872, "grad_norm": 1.5707156658172607, "learning_rate": 7.104798214723454e-06, "loss": 0.02693558856844902, "memory(GiB)": 21.48, "step": 12117, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.953274 }, { "epoch": 0.3936588376701426, "grad_norm": 0.3694809675216675, "learning_rate": 7.104310962234072e-06, "loss": 0.023184046149253845, "memory(GiB)": 21.48, "step": 12118, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.953288 }, { "epoch": 0.393691323132898, "grad_norm": 0.39041635394096375, "learning_rate": 7.103823685458142e-06, "loss": 0.021903742104768753, "memory(GiB)": 21.48, "step": 12119, "token_acc": 1.0, "train_speed(iter/s)": 0.953301 }, { "epoch": 0.3937238085956534, "grad_norm": 0.6627378463745117, "learning_rate": 7.1033363844012845e-06, "loss": 0.024233970791101456, "memory(GiB)": 21.48, "step": 12120, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953313 }, { "epoch": 0.39375629405840884, "grad_norm": 0.29527080059051514, "learning_rate": 7.102849059069128e-06, "loss": 0.019889499992132187, "memory(GiB)": 21.48, "step": 12121, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953327 }, { "epoch": 0.39378877952116426, "grad_norm": 0.8913042545318604, "learning_rate": 7.102361709467291e-06, "loss": 0.02527264691889286, "memory(GiB)": 21.48, "step": 12122, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.953341 }, { "epoch": 0.39382126498391967, "grad_norm": 0.5349574089050293, "learning_rate": 7.1018743356014056e-06, "loss": 0.03191416338086128, "memory(GiB)": 21.48, "step": 12123, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.953357 }, { "epoch": 0.3938537504466751, "grad_norm": 1.435676097869873, "learning_rate": 7.10138693747709e-06, "loss": 0.04175713658332825, "memory(GiB)": 21.48, "step": 12124, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.953372 }, { "epoch": 0.3938862359094305, "grad_norm": 0.876263439655304, "learning_rate": 7.100899515099973e-06, "loss": 0.021963102743029594, "memory(GiB)": 21.48, "step": 12125, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.953389 }, { "epoch": 0.3939187213721859, "grad_norm": 0.3197370767593384, "learning_rate": 7.100412068475679e-06, "loss": 0.022992853075265884, "memory(GiB)": 21.48, "step": 12126, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953405 }, { "epoch": 0.39395120683494134, "grad_norm": 0.41534024477005005, "learning_rate": 7.0999245976098355e-06, "loss": 0.022275160998106003, "memory(GiB)": 21.48, "step": 12127, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953422 }, { "epoch": 0.3939836922976968, "grad_norm": 0.440641850233078, "learning_rate": 7.099437102508067e-06, "loss": 0.029111389070749283, "memory(GiB)": 21.48, "step": 12128, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953439 }, { "epoch": 0.3940161777604522, "grad_norm": 0.47764137387275696, "learning_rate": 7.098949583175998e-06, "loss": 0.0281188003718853, "memory(GiB)": 21.48, "step": 12129, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.953455 }, { "epoch": 0.39404866322320764, "grad_norm": 0.7430843114852905, "learning_rate": 7.098462039619257e-06, "loss": 0.03476255014538765, "memory(GiB)": 21.48, "step": 12130, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.953472 }, { "epoch": 0.39408114868596306, "grad_norm": 0.4084171950817108, "learning_rate": 7.097974471843473e-06, "loss": 0.020653126761317253, "memory(GiB)": 21.48, "step": 12131, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953489 }, { "epoch": 0.3941136341487185, "grad_norm": 0.34021082520484924, "learning_rate": 7.09748687985427e-06, "loss": 0.026888223364949226, "memory(GiB)": 21.48, "step": 12132, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953505 }, { "epoch": 0.3941461196114739, "grad_norm": 0.336016982793808, "learning_rate": 7.096999263657277e-06, "loss": 0.022795289754867554, "memory(GiB)": 21.48, "step": 12133, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.953521 }, { "epoch": 0.3941786050742293, "grad_norm": 0.33540457487106323, "learning_rate": 7.096511623258122e-06, "loss": 0.025068923830986023, "memory(GiB)": 21.48, "step": 12134, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953538 }, { "epoch": 0.3942110905369847, "grad_norm": 0.45660272240638733, "learning_rate": 7.0960239586624325e-06, "loss": 0.022671518847346306, "memory(GiB)": 21.48, "step": 12135, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953552 }, { "epoch": 0.39424357599974014, "grad_norm": 0.33430901169776917, "learning_rate": 7.095536269875835e-06, "loss": 0.021460801362991333, "memory(GiB)": 21.48, "step": 12136, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953569 }, { "epoch": 0.39427606146249555, "grad_norm": 0.2796076834201813, "learning_rate": 7.0950485569039605e-06, "loss": 0.018504858016967773, "memory(GiB)": 21.48, "step": 12137, "token_acc": 0.995, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.39430854692525097, "grad_norm": 0.45981982350349426, "learning_rate": 7.094560819752438e-06, "loss": 0.019070647656917572, "memory(GiB)": 21.48, "step": 12138, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.953603 }, { "epoch": 0.3943410323880064, "grad_norm": 0.4540422260761261, "learning_rate": 7.094073058426896e-06, "loss": 0.033655762672424316, "memory(GiB)": 21.48, "step": 12139, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95362 }, { "epoch": 0.3943735178507618, "grad_norm": 0.3234941065311432, "learning_rate": 7.093585272932964e-06, "loss": 0.02329087257385254, "memory(GiB)": 21.48, "step": 12140, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.953637 }, { "epoch": 0.3944060033135172, "grad_norm": 0.43187904357910156, "learning_rate": 7.09309746327627e-06, "loss": 0.02447614073753357, "memory(GiB)": 21.48, "step": 12141, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.953653 }, { "epoch": 0.39443848877627263, "grad_norm": 0.416151225566864, "learning_rate": 7.092609629462446e-06, "loss": 0.025816719979047775, "memory(GiB)": 21.48, "step": 12142, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95367 }, { "epoch": 0.39447097423902805, "grad_norm": 0.43418434262275696, "learning_rate": 7.09212177149712e-06, "loss": 0.026148075237870216, "memory(GiB)": 21.48, "step": 12143, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.953687 }, { "epoch": 0.39450345970178347, "grad_norm": 0.27357396483421326, "learning_rate": 7.0916338893859264e-06, "loss": 0.02210225909948349, "memory(GiB)": 21.48, "step": 12144, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.9537 }, { "epoch": 0.3945359451645389, "grad_norm": 0.35945844650268555, "learning_rate": 7.091145983134492e-06, "loss": 0.02162584289908409, "memory(GiB)": 21.48, "step": 12145, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.953713 }, { "epoch": 0.3945684306272943, "grad_norm": 0.5949070453643799, "learning_rate": 7.090658052748451e-06, "loss": 0.030862832441926003, "memory(GiB)": 21.48, "step": 12146, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.953725 }, { "epoch": 0.3946009160900497, "grad_norm": 0.5649858713150024, "learning_rate": 7.090170098233433e-06, "loss": 0.02887861244380474, "memory(GiB)": 21.48, "step": 12147, "token_acc": 1.0, "train_speed(iter/s)": 0.953738 }, { "epoch": 0.39463340155280513, "grad_norm": 0.42601168155670166, "learning_rate": 7.0896821195950696e-06, "loss": 0.021223561838269234, "memory(GiB)": 21.48, "step": 12148, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.953751 }, { "epoch": 0.39466588701556055, "grad_norm": 0.4345252811908722, "learning_rate": 7.089194116838993e-06, "loss": 0.022395581007003784, "memory(GiB)": 21.48, "step": 12149, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953763 }, { "epoch": 0.39469837247831596, "grad_norm": 0.28289690613746643, "learning_rate": 7.088706089970836e-06, "loss": 0.011992032639682293, "memory(GiB)": 21.48, "step": 12150, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.953776 }, { "epoch": 0.3947308579410714, "grad_norm": 0.26536598801612854, "learning_rate": 7.088218038996232e-06, "loss": 0.015861468389630318, "memory(GiB)": 21.48, "step": 12151, "token_acc": 1.0, "train_speed(iter/s)": 0.95379 }, { "epoch": 0.3947633434038268, "grad_norm": 0.3739505410194397, "learning_rate": 7.087729963920811e-06, "loss": 0.024446511641144753, "memory(GiB)": 21.48, "step": 12152, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953802 }, { "epoch": 0.3947958288665822, "grad_norm": 0.5115154981613159, "learning_rate": 7.087241864750208e-06, "loss": 0.0308493934571743, "memory(GiB)": 21.48, "step": 12153, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.3948283143293376, "grad_norm": 0.39412933588027954, "learning_rate": 7.0867537414900554e-06, "loss": 0.02249690517783165, "memory(GiB)": 21.48, "step": 12154, "token_acc": 1.0, "train_speed(iter/s)": 0.953826 }, { "epoch": 0.39486079979209304, "grad_norm": 0.6790791749954224, "learning_rate": 7.086265594145987e-06, "loss": 0.04316636919975281, "memory(GiB)": 21.48, "step": 12155, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.953839 }, { "epoch": 0.39489328525484846, "grad_norm": 0.8113204836845398, "learning_rate": 7.085777422723638e-06, "loss": 0.02811357006430626, "memory(GiB)": 21.48, "step": 12156, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.953852 }, { "epoch": 0.3949257707176039, "grad_norm": 0.501863420009613, "learning_rate": 7.08528922722864e-06, "loss": 0.02465960569679737, "memory(GiB)": 21.48, "step": 12157, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.953865 }, { "epoch": 0.3949582561803593, "grad_norm": 0.46334701776504517, "learning_rate": 7.08480100766663e-06, "loss": 0.028767181560397148, "memory(GiB)": 21.48, "step": 12158, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.953878 }, { "epoch": 0.3949907416431147, "grad_norm": 0.8707190155982971, "learning_rate": 7.0843127640432396e-06, "loss": 0.024017758667469025, "memory(GiB)": 21.48, "step": 12159, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953891 }, { "epoch": 0.3950232271058701, "grad_norm": 0.399340957403183, "learning_rate": 7.083824496364108e-06, "loss": 0.028007831424474716, "memory(GiB)": 21.48, "step": 12160, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.953904 }, { "epoch": 0.39505571256862554, "grad_norm": 0.5503575205802917, "learning_rate": 7.0833362046348665e-06, "loss": 0.024958591908216476, "memory(GiB)": 21.48, "step": 12161, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.953915 }, { "epoch": 0.39508819803138095, "grad_norm": 0.2745204269886017, "learning_rate": 7.0828478888611516e-06, "loss": 0.01897435076534748, "memory(GiB)": 21.48, "step": 12162, "token_acc": 1.0, "train_speed(iter/s)": 0.953928 }, { "epoch": 0.39512068349413637, "grad_norm": 0.5601378679275513, "learning_rate": 7.082359549048602e-06, "loss": 0.025463547557592392, "memory(GiB)": 21.48, "step": 12163, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.953936 }, { "epoch": 0.3951531689568918, "grad_norm": 0.3839031457901001, "learning_rate": 7.08187118520285e-06, "loss": 0.021858833730220795, "memory(GiB)": 21.48, "step": 12164, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.953947 }, { "epoch": 0.3951856544196472, "grad_norm": 0.41188284754753113, "learning_rate": 7.081382797329535e-06, "loss": 0.022854167968034744, "memory(GiB)": 21.48, "step": 12165, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.953958 }, { "epoch": 0.3952181398824026, "grad_norm": 0.5109811425209045, "learning_rate": 7.08089438543429e-06, "loss": 0.03728729486465454, "memory(GiB)": 21.48, "step": 12166, "token_acc": 0.984, "train_speed(iter/s)": 0.95397 }, { "epoch": 0.39525062534515804, "grad_norm": 0.4362393021583557, "learning_rate": 7.080405949522756e-06, "loss": 0.0345761775970459, "memory(GiB)": 21.48, "step": 12167, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.953981 }, { "epoch": 0.39528311080791345, "grad_norm": 0.34656867384910583, "learning_rate": 7.079917489600568e-06, "loss": 0.021357029676437378, "memory(GiB)": 21.48, "step": 12168, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953994 }, { "epoch": 0.39531559627066887, "grad_norm": 0.39414843916893005, "learning_rate": 7.079429005673364e-06, "loss": 0.02945391647517681, "memory(GiB)": 21.48, "step": 12169, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954008 }, { "epoch": 0.3953480817334243, "grad_norm": 0.47791600227355957, "learning_rate": 7.0789404977467815e-06, "loss": 0.032168351113796234, "memory(GiB)": 21.48, "step": 12170, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954021 }, { "epoch": 0.3953805671961797, "grad_norm": 0.4409172534942627, "learning_rate": 7.078451965826457e-06, "loss": 0.029446516185998917, "memory(GiB)": 21.48, "step": 12171, "token_acc": 0.9745454545454545, "train_speed(iter/s)": 0.954033 }, { "epoch": 0.3954130526589351, "grad_norm": 0.5608239769935608, "learning_rate": 7.0779634099180325e-06, "loss": 0.023957908153533936, "memory(GiB)": 21.48, "step": 12172, "token_acc": 1.0, "train_speed(iter/s)": 0.954046 }, { "epoch": 0.39544553812169053, "grad_norm": 0.4213269054889679, "learning_rate": 7.077474830027143e-06, "loss": 0.02291596494615078, "memory(GiB)": 21.48, "step": 12173, "token_acc": 0.992, "train_speed(iter/s)": 0.95406 }, { "epoch": 0.39547802358444595, "grad_norm": 0.4355717897415161, "learning_rate": 7.07698622615943e-06, "loss": 0.025202903896570206, "memory(GiB)": 21.48, "step": 12174, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954073 }, { "epoch": 0.39551050904720136, "grad_norm": 0.39411523938179016, "learning_rate": 7.076497598320531e-06, "loss": 0.0275130532681942, "memory(GiB)": 21.48, "step": 12175, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.954085 }, { "epoch": 0.3955429945099568, "grad_norm": 0.40616580843925476, "learning_rate": 7.076008946516085e-06, "loss": 0.02092600427567959, "memory(GiB)": 21.48, "step": 12176, "token_acc": 1.0, "train_speed(iter/s)": 0.954098 }, { "epoch": 0.3955754799727122, "grad_norm": 0.2811466455459595, "learning_rate": 7.0755202707517344e-06, "loss": 0.02018585614860058, "memory(GiB)": 21.48, "step": 12177, "token_acc": 1.0, "train_speed(iter/s)": 0.954111 }, { "epoch": 0.3956079654354676, "grad_norm": 0.8171178698539734, "learning_rate": 7.075031571033115e-06, "loss": 0.040871839970350266, "memory(GiB)": 21.48, "step": 12178, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.954125 }, { "epoch": 0.39564045089822303, "grad_norm": 0.5626968145370483, "learning_rate": 7.0745428473658715e-06, "loss": 0.027148079127073288, "memory(GiB)": 21.48, "step": 12179, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.954137 }, { "epoch": 0.39567293636097844, "grad_norm": 0.35265418887138367, "learning_rate": 7.074054099755642e-06, "loss": 0.02583823725581169, "memory(GiB)": 21.48, "step": 12180, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.95415 }, { "epoch": 0.39570542182373386, "grad_norm": 0.518265426158905, "learning_rate": 7.073565328208068e-06, "loss": 0.030243244022130966, "memory(GiB)": 21.48, "step": 12181, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954162 }, { "epoch": 0.3957379072864893, "grad_norm": 0.4910613000392914, "learning_rate": 7.07307653272879e-06, "loss": 0.030091652646660805, "memory(GiB)": 21.48, "step": 12182, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954178 }, { "epoch": 0.3957703927492447, "grad_norm": 0.5285410284996033, "learning_rate": 7.072587713323448e-06, "loss": 0.02200855314731598, "memory(GiB)": 21.48, "step": 12183, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954195 }, { "epoch": 0.3958028782120001, "grad_norm": 0.3656441867351532, "learning_rate": 7.072098869997687e-06, "loss": 0.026623453944921494, "memory(GiB)": 21.48, "step": 12184, "token_acc": 0.996, "train_speed(iter/s)": 0.954211 }, { "epoch": 0.3958353636747555, "grad_norm": 0.4129357635974884, "learning_rate": 7.071610002757146e-06, "loss": 0.024452587589621544, "memory(GiB)": 21.48, "step": 12185, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954228 }, { "epoch": 0.39586784913751094, "grad_norm": 0.3701748847961426, "learning_rate": 7.071121111607468e-06, "loss": 0.026035122573375702, "memory(GiB)": 21.48, "step": 12186, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.954245 }, { "epoch": 0.39590033460026636, "grad_norm": 0.48494482040405273, "learning_rate": 7.070632196554297e-06, "loss": 0.01950259506702423, "memory(GiB)": 21.48, "step": 12187, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954262 }, { "epoch": 0.3959328200630218, "grad_norm": 0.3681076467037201, "learning_rate": 7.070143257603274e-06, "loss": 0.02860139310359955, "memory(GiB)": 21.48, "step": 12188, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.954278 }, { "epoch": 0.3959653055257772, "grad_norm": 0.3933594822883606, "learning_rate": 7.0696542947600425e-06, "loss": 0.02604242041707039, "memory(GiB)": 21.48, "step": 12189, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954295 }, { "epoch": 0.3959977909885326, "grad_norm": 0.2667981684207916, "learning_rate": 7.069165308030245e-06, "loss": 0.017489057034254074, "memory(GiB)": 21.48, "step": 12190, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.954312 }, { "epoch": 0.396030276451288, "grad_norm": 0.462070494890213, "learning_rate": 7.068676297419527e-06, "loss": 0.025154653936624527, "memory(GiB)": 21.48, "step": 12191, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954329 }, { "epoch": 0.3960627619140435, "grad_norm": 0.4050668179988861, "learning_rate": 7.068187262933528e-06, "loss": 0.02258576825261116, "memory(GiB)": 21.48, "step": 12192, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954346 }, { "epoch": 0.3960952473767989, "grad_norm": 0.5976259708404541, "learning_rate": 7.067698204577898e-06, "loss": 0.0343245267868042, "memory(GiB)": 21.48, "step": 12193, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.954362 }, { "epoch": 0.3961277328395543, "grad_norm": 0.3891558349132538, "learning_rate": 7.0672091223582785e-06, "loss": 0.020420990884304047, "memory(GiB)": 21.48, "step": 12194, "token_acc": 1.0, "train_speed(iter/s)": 0.954379 }, { "epoch": 0.39616021830230974, "grad_norm": 0.32538774609565735, "learning_rate": 7.066720016280314e-06, "loss": 0.023959282785654068, "memory(GiB)": 21.48, "step": 12195, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.954396 }, { "epoch": 0.39619270376506516, "grad_norm": 0.3403509557247162, "learning_rate": 7.066230886349651e-06, "loss": 0.02922341413795948, "memory(GiB)": 21.48, "step": 12196, "token_acc": 0.9901315789473685, "train_speed(iter/s)": 0.954413 }, { "epoch": 0.3962251892278206, "grad_norm": 0.4971226453781128, "learning_rate": 7.065741732571931e-06, "loss": 0.020330872386693954, "memory(GiB)": 21.48, "step": 12197, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.954429 }, { "epoch": 0.396257674690576, "grad_norm": 0.3972166180610657, "learning_rate": 7.065252554952804e-06, "loss": 0.025890763849020004, "memory(GiB)": 21.48, "step": 12198, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.3962901601533314, "grad_norm": 0.9359214901924133, "learning_rate": 7.064763353497912e-06, "loss": 0.03296111896634102, "memory(GiB)": 21.48, "step": 12199, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954462 }, { "epoch": 0.3963226456160868, "grad_norm": 0.3247208297252655, "learning_rate": 7.064274128212904e-06, "loss": 0.02224704995751381, "memory(GiB)": 21.48, "step": 12200, "token_acc": 1.0, "train_speed(iter/s)": 0.954479 }, { "epoch": 0.39635513107884224, "grad_norm": 0.34945014119148254, "learning_rate": 7.0637848791034235e-06, "loss": 0.021568406373262405, "memory(GiB)": 21.48, "step": 12201, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954496 }, { "epoch": 0.39638761654159765, "grad_norm": 0.4944807291030884, "learning_rate": 7.06329560617512e-06, "loss": 0.025301963090896606, "memory(GiB)": 21.48, "step": 12202, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.954513 }, { "epoch": 0.39642010200435307, "grad_norm": 0.32324787974357605, "learning_rate": 7.062806309433639e-06, "loss": 0.02464192360639572, "memory(GiB)": 21.48, "step": 12203, "token_acc": 0.985663082437276, "train_speed(iter/s)": 0.95453 }, { "epoch": 0.3964525874671085, "grad_norm": 0.4122086763381958, "learning_rate": 7.062316988884626e-06, "loss": 0.030086718499660492, "memory(GiB)": 21.48, "step": 12204, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954548 }, { "epoch": 0.3964850729298639, "grad_norm": 0.43406763672828674, "learning_rate": 7.061827644533732e-06, "loss": 0.02241365611553192, "memory(GiB)": 21.48, "step": 12205, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.954564 }, { "epoch": 0.3965175583926193, "grad_norm": 0.5210382342338562, "learning_rate": 7.061338276386602e-06, "loss": 0.044479455798864365, "memory(GiB)": 21.48, "step": 12206, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.95458 }, { "epoch": 0.39655004385537473, "grad_norm": 0.3683170676231384, "learning_rate": 7.0608488844488835e-06, "loss": 0.02280350774526596, "memory(GiB)": 21.48, "step": 12207, "token_acc": 0.9873015873015873, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.39658252931813015, "grad_norm": 0.28010067343711853, "learning_rate": 7.060359468726227e-06, "loss": 0.0185465719550848, "memory(GiB)": 21.48, "step": 12208, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954611 }, { "epoch": 0.39661501478088557, "grad_norm": 0.4920230209827423, "learning_rate": 7.05987002922428e-06, "loss": 0.025341790169477463, "memory(GiB)": 21.48, "step": 12209, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.954625 }, { "epoch": 0.396647500243641, "grad_norm": 0.3362456262111664, "learning_rate": 7.0593805659486915e-06, "loss": 0.016705967485904694, "memory(GiB)": 21.48, "step": 12210, "token_acc": 1.0, "train_speed(iter/s)": 0.954638 }, { "epoch": 0.3966799857063964, "grad_norm": 0.4072887897491455, "learning_rate": 7.058891078905109e-06, "loss": 0.021422861143946648, "memory(GiB)": 21.48, "step": 12211, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.954651 }, { "epoch": 0.3967124711691518, "grad_norm": 0.3988072872161865, "learning_rate": 7.058401568099185e-06, "loss": 0.030367359519004822, "memory(GiB)": 21.48, "step": 12212, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954663 }, { "epoch": 0.39674495663190723, "grad_norm": 0.42868298292160034, "learning_rate": 7.057912033536565e-06, "loss": 0.027665622532367706, "memory(GiB)": 21.48, "step": 12213, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954677 }, { "epoch": 0.39677744209466265, "grad_norm": 0.2742624580860138, "learning_rate": 7.0574224752229025e-06, "loss": 0.01850040629506111, "memory(GiB)": 21.48, "step": 12214, "token_acc": 1.0, "train_speed(iter/s)": 0.954689 }, { "epoch": 0.39680992755741806, "grad_norm": 0.46738380193710327, "learning_rate": 7.056932893163845e-06, "loss": 0.03127625584602356, "memory(GiB)": 21.48, "step": 12215, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.954703 }, { "epoch": 0.3968424130201735, "grad_norm": 0.40620362758636475, "learning_rate": 7.056443287365045e-06, "loss": 0.01883796602487564, "memory(GiB)": 21.48, "step": 12216, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.3968748984829289, "grad_norm": 0.4459097385406494, "learning_rate": 7.055953657832152e-06, "loss": 0.02064048871397972, "memory(GiB)": 21.48, "step": 12217, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.954727 }, { "epoch": 0.3969073839456843, "grad_norm": 0.32520005106925964, "learning_rate": 7.055464004570818e-06, "loss": 0.02476370707154274, "memory(GiB)": 21.48, "step": 12218, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954738 }, { "epoch": 0.3969398694084397, "grad_norm": 0.2860698699951172, "learning_rate": 7.054974327586693e-06, "loss": 0.018298504874110222, "memory(GiB)": 21.48, "step": 12219, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.95475 }, { "epoch": 0.39697235487119514, "grad_norm": 0.31224673986434937, "learning_rate": 7.05448462688543e-06, "loss": 0.021329987794160843, "memory(GiB)": 21.48, "step": 12220, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.39700484033395056, "grad_norm": 0.3620968163013458, "learning_rate": 7.053994902472678e-06, "loss": 0.02568764053285122, "memory(GiB)": 21.48, "step": 12221, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.397037325796706, "grad_norm": 0.42278802394866943, "learning_rate": 7.053505154354092e-06, "loss": 0.0206881295889616, "memory(GiB)": 21.48, "step": 12222, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.954787 }, { "epoch": 0.3970698112594614, "grad_norm": 0.3381281793117523, "learning_rate": 7.053015382535323e-06, "loss": 0.01697348989546299, "memory(GiB)": 21.48, "step": 12223, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954799 }, { "epoch": 0.3971022967222168, "grad_norm": 0.3669818341732025, "learning_rate": 7.052525587022026e-06, "loss": 0.02581828087568283, "memory(GiB)": 21.48, "step": 12224, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954811 }, { "epoch": 0.3971347821849722, "grad_norm": 0.4542175233364105, "learning_rate": 7.0520357678198485e-06, "loss": 0.02950572967529297, "memory(GiB)": 21.48, "step": 12225, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.39716726764772764, "grad_norm": 0.44092103838920593, "learning_rate": 7.051545924934449e-06, "loss": 0.026412364095449448, "memory(GiB)": 21.48, "step": 12226, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.954834 }, { "epoch": 0.39719975311048306, "grad_norm": 0.37722301483154297, "learning_rate": 7.051056058371478e-06, "loss": 0.014011847786605358, "memory(GiB)": 21.48, "step": 12227, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.954843 }, { "epoch": 0.39723223857323847, "grad_norm": 0.46161800622940063, "learning_rate": 7.050566168136591e-06, "loss": 0.024418095126748085, "memory(GiB)": 21.48, "step": 12228, "token_acc": 1.0, "train_speed(iter/s)": 0.954854 }, { "epoch": 0.3972647240359939, "grad_norm": 0.4258866608142853, "learning_rate": 7.0500762542354405e-06, "loss": 0.025928538292646408, "memory(GiB)": 21.48, "step": 12229, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954866 }, { "epoch": 0.3972972094987493, "grad_norm": 0.38409748673439026, "learning_rate": 7.049586316673682e-06, "loss": 0.019060958176851273, "memory(GiB)": 21.48, "step": 12230, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.954876 }, { "epoch": 0.3973296949615047, "grad_norm": 0.373248815536499, "learning_rate": 7.049096355456968e-06, "loss": 0.022211603820323944, "memory(GiB)": 21.48, "step": 12231, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954886 }, { "epoch": 0.39736218042426014, "grad_norm": 0.3517046868801117, "learning_rate": 7.048606370590957e-06, "loss": 0.025309083983302116, "memory(GiB)": 21.48, "step": 12232, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.39739466588701555, "grad_norm": 0.6561799049377441, "learning_rate": 7.048116362081301e-06, "loss": 0.03001369908452034, "memory(GiB)": 21.48, "step": 12233, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.95491 }, { "epoch": 0.39742715134977097, "grad_norm": 0.4190131425857544, "learning_rate": 7.047626329933654e-06, "loss": 0.024583932012319565, "memory(GiB)": 21.48, "step": 12234, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.954923 }, { "epoch": 0.3974596368125264, "grad_norm": 0.41168907284736633, "learning_rate": 7.047136274153676e-06, "loss": 0.026386737823486328, "memory(GiB)": 21.48, "step": 12235, "token_acc": 0.9937106918238994, "train_speed(iter/s)": 0.954935 }, { "epoch": 0.3974921222752818, "grad_norm": 0.4697502851486206, "learning_rate": 7.046646194747021e-06, "loss": 0.021471574902534485, "memory(GiB)": 21.48, "step": 12236, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.954948 }, { "epoch": 0.3975246077380372, "grad_norm": 0.6061764359474182, "learning_rate": 7.046156091719344e-06, "loss": 0.023919716477394104, "memory(GiB)": 21.48, "step": 12237, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954962 }, { "epoch": 0.39755709320079263, "grad_norm": 0.45775777101516724, "learning_rate": 7.045665965076302e-06, "loss": 0.036029666662216187, "memory(GiB)": 21.48, "step": 12238, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.39758957866354805, "grad_norm": 3.3036956787109375, "learning_rate": 7.045175814823552e-06, "loss": 0.027833521366119385, "memory(GiB)": 21.48, "step": 12239, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.954987 }, { "epoch": 0.39762206412630346, "grad_norm": 0.45834237337112427, "learning_rate": 7.044685640966751e-06, "loss": 0.024586210027337074, "memory(GiB)": 21.48, "step": 12240, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955 }, { "epoch": 0.3976545495890589, "grad_norm": 0.4421902298927307, "learning_rate": 7.044195443511557e-06, "loss": 0.023110052570700645, "memory(GiB)": 21.48, "step": 12241, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.955014 }, { "epoch": 0.3976870350518143, "grad_norm": 0.36080867052078247, "learning_rate": 7.043705222463626e-06, "loss": 0.022505730390548706, "memory(GiB)": 21.48, "step": 12242, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.3977195205145697, "grad_norm": 0.4549231231212616, "learning_rate": 7.043214977828616e-06, "loss": 0.029913542792201042, "memory(GiB)": 21.48, "step": 12243, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.39775200597732513, "grad_norm": 0.46627286076545715, "learning_rate": 7.042724709612187e-06, "loss": 0.023236846551299095, "memory(GiB)": 21.48, "step": 12244, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955065 }, { "epoch": 0.39778449144008055, "grad_norm": 0.37273362278938293, "learning_rate": 7.0422344178199956e-06, "loss": 0.024510473012924194, "memory(GiB)": 21.48, "step": 12245, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955081 }, { "epoch": 0.39781697690283596, "grad_norm": 0.44163790345191956, "learning_rate": 7.0417441024577e-06, "loss": 0.026423918083310127, "memory(GiB)": 21.48, "step": 12246, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955097 }, { "epoch": 0.3978494623655914, "grad_norm": 0.38110753893852234, "learning_rate": 7.041253763530962e-06, "loss": 0.025114350020885468, "memory(GiB)": 21.48, "step": 12247, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.3978819478283468, "grad_norm": 0.40513816475868225, "learning_rate": 7.040763401045436e-06, "loss": 0.02530902437865734, "memory(GiB)": 21.48, "step": 12248, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955131 }, { "epoch": 0.3979144332911022, "grad_norm": 1.046845555305481, "learning_rate": 7.040273015006787e-06, "loss": 0.03615814447402954, "memory(GiB)": 21.48, "step": 12249, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955147 }, { "epoch": 0.3979469187538576, "grad_norm": 0.4822240173816681, "learning_rate": 7.039782605420669e-06, "loss": 0.02849021553993225, "memory(GiB)": 21.48, "step": 12250, "token_acc": 0.9662447257383966, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.39797940421661304, "grad_norm": 0.616226851940155, "learning_rate": 7.039292172292747e-06, "loss": 0.03369913995265961, "memory(GiB)": 21.48, "step": 12251, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.39801188967936846, "grad_norm": 0.31428077816963196, "learning_rate": 7.038801715628676e-06, "loss": 0.024621929973363876, "memory(GiB)": 21.48, "step": 12252, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955197 }, { "epoch": 0.3980443751421239, "grad_norm": 0.2683366537094116, "learning_rate": 7.038311235434122e-06, "loss": 0.02099960297346115, "memory(GiB)": 21.48, "step": 12253, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.3980768606048793, "grad_norm": 0.46827730536460876, "learning_rate": 7.037820731714743e-06, "loss": 0.031122956424951553, "memory(GiB)": 21.48, "step": 12254, "token_acc": 1.0, "train_speed(iter/s)": 0.95523 }, { "epoch": 0.3981093460676347, "grad_norm": 0.38282859325408936, "learning_rate": 7.0373302044762e-06, "loss": 0.021693196147680283, "memory(GiB)": 21.48, "step": 12255, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.3981418315303902, "grad_norm": 0.4061136841773987, "learning_rate": 7.036839653724155e-06, "loss": 0.025595301762223244, "memory(GiB)": 21.48, "step": 12256, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.3981743169931456, "grad_norm": 0.5642168521881104, "learning_rate": 7.036349079464269e-06, "loss": 0.03085929900407791, "memory(GiB)": 21.48, "step": 12257, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95528 }, { "epoch": 0.398206802455901, "grad_norm": 0.5602965354919434, "learning_rate": 7.035858481702205e-06, "loss": 0.036194685846567154, "memory(GiB)": 21.48, "step": 12258, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955298 }, { "epoch": 0.3982392879186564, "grad_norm": 0.3554702401161194, "learning_rate": 7.035367860443624e-06, "loss": 0.025987330824136734, "memory(GiB)": 21.48, "step": 12259, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.39827177338141184, "grad_norm": 0.3795657455921173, "learning_rate": 7.0348772156941895e-06, "loss": 0.027138469740748405, "memory(GiB)": 21.48, "step": 12260, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.955331 }, { "epoch": 0.39830425884416726, "grad_norm": 0.30716776847839355, "learning_rate": 7.034386547459563e-06, "loss": 0.021763769909739494, "memory(GiB)": 21.48, "step": 12261, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.3983367443069227, "grad_norm": 0.4010687470436096, "learning_rate": 7.033895855745407e-06, "loss": 0.02183469384908676, "memory(GiB)": 21.48, "step": 12262, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.3983692297696781, "grad_norm": 0.4063096046447754, "learning_rate": 7.033405140557387e-06, "loss": 0.023679427802562714, "memory(GiB)": 21.48, "step": 12263, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.95538 }, { "epoch": 0.3984017152324335, "grad_norm": 0.43297314643859863, "learning_rate": 7.032914401901164e-06, "loss": 0.03155228868126869, "memory(GiB)": 21.48, "step": 12264, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.3984342006951889, "grad_norm": 0.33017295598983765, "learning_rate": 7.032423639782404e-06, "loss": 0.02089468203485012, "memory(GiB)": 21.48, "step": 12265, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.39846668615794434, "grad_norm": 0.31124603748321533, "learning_rate": 7.0319328542067696e-06, "loss": 0.027074584737420082, "memory(GiB)": 21.48, "step": 12266, "token_acc": 0.995, "train_speed(iter/s)": 0.95543 }, { "epoch": 0.39849917162069975, "grad_norm": 0.7956988215446472, "learning_rate": 7.031442045179927e-06, "loss": 0.03369463235139847, "memory(GiB)": 21.48, "step": 12267, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955446 }, { "epoch": 0.39853165708345517, "grad_norm": 0.49594107270240784, "learning_rate": 7.030951212707537e-06, "loss": 0.02474125288426876, "memory(GiB)": 21.48, "step": 12268, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955463 }, { "epoch": 0.3985641425462106, "grad_norm": 0.39912858605384827, "learning_rate": 7.030460356795268e-06, "loss": 0.02532402239739895, "memory(GiB)": 21.48, "step": 12269, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.955479 }, { "epoch": 0.398596628008966, "grad_norm": 0.4265696108341217, "learning_rate": 7.029969477448785e-06, "loss": 0.025408942252397537, "memory(GiB)": 21.48, "step": 12270, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.3986291134717214, "grad_norm": 0.32809656858444214, "learning_rate": 7.02947857467375e-06, "loss": 0.027114950120449066, "memory(GiB)": 21.48, "step": 12271, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.39866159893447684, "grad_norm": 0.9657846093177795, "learning_rate": 7.0289876484758335e-06, "loss": 0.02318086475133896, "memory(GiB)": 21.48, "step": 12272, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.39869408439723225, "grad_norm": 0.29217520356178284, "learning_rate": 7.028496698860696e-06, "loss": 0.021915355697274208, "memory(GiB)": 21.48, "step": 12273, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.39872656985998767, "grad_norm": 2.2182533740997314, "learning_rate": 7.02800572583401e-06, "loss": 0.027824275195598602, "memory(GiB)": 21.48, "step": 12274, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.3987590553227431, "grad_norm": 0.3245150148868561, "learning_rate": 7.0275147294014365e-06, "loss": 0.0208713561296463, "memory(GiB)": 21.48, "step": 12275, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.3987915407854985, "grad_norm": 0.5069010257720947, "learning_rate": 7.0270237095686456e-06, "loss": 0.03209071606397629, "memory(GiB)": 21.48, "step": 12276, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.3988240262482539, "grad_norm": 0.3979334831237793, "learning_rate": 7.026532666341303e-06, "loss": 0.021317940205335617, "memory(GiB)": 21.48, "step": 12277, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.39885651171100933, "grad_norm": 0.3913573920726776, "learning_rate": 7.026041599725075e-06, "loss": 0.0321846678853035, "memory(GiB)": 21.48, "step": 12278, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.39888899717376475, "grad_norm": 0.3593733608722687, "learning_rate": 7.025550509725631e-06, "loss": 0.031090212985873222, "memory(GiB)": 21.48, "step": 12279, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955618 }, { "epoch": 0.39892148263652016, "grad_norm": 0.4687696695327759, "learning_rate": 7.025059396348638e-06, "loss": 0.027484118938446045, "memory(GiB)": 21.48, "step": 12280, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.955632 }, { "epoch": 0.3989539680992756, "grad_norm": 0.6168137192726135, "learning_rate": 7.024568259599764e-06, "loss": 0.028897633776068687, "memory(GiB)": 21.48, "step": 12281, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955644 }, { "epoch": 0.398986453562031, "grad_norm": 0.5044901371002197, "learning_rate": 7.024077099484678e-06, "loss": 0.0270796287804842, "memory(GiB)": 21.48, "step": 12282, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.3990189390247864, "grad_norm": 0.4082792401313782, "learning_rate": 7.023585916009048e-06, "loss": 0.025036033242940903, "memory(GiB)": 21.48, "step": 12283, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955666 }, { "epoch": 0.39905142448754183, "grad_norm": 0.4352076053619385, "learning_rate": 7.023094709178544e-06, "loss": 0.027100734412670135, "memory(GiB)": 21.48, "step": 12284, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.39908390995029724, "grad_norm": 0.3502980172634125, "learning_rate": 7.022603478998832e-06, "loss": 0.025532830506563187, "memory(GiB)": 21.48, "step": 12285, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.39911639541305266, "grad_norm": 0.43301716446876526, "learning_rate": 7.022112225475586e-06, "loss": 0.022245274856686592, "memory(GiB)": 21.48, "step": 12286, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.3991488808758081, "grad_norm": 0.6508142352104187, "learning_rate": 7.021620948614472e-06, "loss": 0.02566320076584816, "memory(GiB)": 21.48, "step": 12287, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955709 }, { "epoch": 0.3991813663385635, "grad_norm": 0.3835337460041046, "learning_rate": 7.021129648421163e-06, "loss": 0.022249551489949226, "memory(GiB)": 21.48, "step": 12288, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.3992138518013189, "grad_norm": 0.41135331988334656, "learning_rate": 7.020638324901328e-06, "loss": 0.020536545664072037, "memory(GiB)": 21.48, "step": 12289, "token_acc": 1.0, "train_speed(iter/s)": 0.95573 }, { "epoch": 0.3992463372640743, "grad_norm": 0.34478652477264404, "learning_rate": 7.0201469780606356e-06, "loss": 0.024415146559476852, "memory(GiB)": 21.48, "step": 12290, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.39927882272682974, "grad_norm": 0.33458980917930603, "learning_rate": 7.019655607904759e-06, "loss": 0.027375927194952965, "memory(GiB)": 21.48, "step": 12291, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.39931130818958516, "grad_norm": 0.3782828748226166, "learning_rate": 7.0191642144393675e-06, "loss": 0.029943374916911125, "memory(GiB)": 21.48, "step": 12292, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.955761 }, { "epoch": 0.3993437936523406, "grad_norm": 0.5054112076759338, "learning_rate": 7.018672797670135e-06, "loss": 0.038029566407203674, "memory(GiB)": 21.48, "step": 12293, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955772 }, { "epoch": 0.399376279115096, "grad_norm": 0.36259105801582336, "learning_rate": 7.018181357602732e-06, "loss": 0.02338816598057747, "memory(GiB)": 21.48, "step": 12294, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.955784 }, { "epoch": 0.3994087645778514, "grad_norm": 0.4112624228000641, "learning_rate": 7.017689894242829e-06, "loss": 0.022576577961444855, "memory(GiB)": 21.48, "step": 12295, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.955795 }, { "epoch": 0.3994412500406068, "grad_norm": 0.3734525740146637, "learning_rate": 7.017198407596098e-06, "loss": 0.02018628641963005, "memory(GiB)": 21.48, "step": 12296, "token_acc": 1.0, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.39947373550336224, "grad_norm": 0.3492636978626251, "learning_rate": 7.016706897668214e-06, "loss": 0.029414458200335503, "memory(GiB)": 21.48, "step": 12297, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.39950622096611765, "grad_norm": 0.4429316222667694, "learning_rate": 7.016215364464849e-06, "loss": 0.02144312672317028, "memory(GiB)": 21.48, "step": 12298, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.955831 }, { "epoch": 0.39953870642887307, "grad_norm": 0.40957286953926086, "learning_rate": 7.015723807991672e-06, "loss": 0.02806146629154682, "memory(GiB)": 21.48, "step": 12299, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.3995711918916285, "grad_norm": 0.2826067805290222, "learning_rate": 7.015232228254362e-06, "loss": 0.02241835743188858, "memory(GiB)": 21.48, "step": 12300, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.955857 }, { "epoch": 0.3996036773543839, "grad_norm": 0.3929429054260254, "learning_rate": 7.014740625258588e-06, "loss": 0.02591150626540184, "memory(GiB)": 21.48, "step": 12301, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.3996361628171393, "grad_norm": 0.5054710507392883, "learning_rate": 7.014248999010027e-06, "loss": 0.035681772977113724, "memory(GiB)": 21.48, "step": 12302, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.39966864827989473, "grad_norm": 0.4622182846069336, "learning_rate": 7.013757349514352e-06, "loss": 0.0273494441062212, "memory(GiB)": 21.48, "step": 12303, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955893 }, { "epoch": 0.39970113374265015, "grad_norm": 0.3839699625968933, "learning_rate": 7.013265676777235e-06, "loss": 0.021570563316345215, "memory(GiB)": 21.48, "step": 12304, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95591 }, { "epoch": 0.39973361920540557, "grad_norm": 0.41413089632987976, "learning_rate": 7.0127739808043536e-06, "loss": 0.017307059839367867, "memory(GiB)": 21.48, "step": 12305, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.399766104668161, "grad_norm": 0.3352544605731964, "learning_rate": 7.012282261601381e-06, "loss": 0.025232573971152306, "memory(GiB)": 21.48, "step": 12306, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955943 }, { "epoch": 0.3997985901309164, "grad_norm": 0.4626902937889099, "learning_rate": 7.011790519173994e-06, "loss": 0.03298722952604294, "memory(GiB)": 21.48, "step": 12307, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955959 }, { "epoch": 0.3998310755936718, "grad_norm": 0.3835380971431732, "learning_rate": 7.011298753527865e-06, "loss": 0.022264566272497177, "memory(GiB)": 21.48, "step": 12308, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.39986356105642723, "grad_norm": 0.3317253291606903, "learning_rate": 7.0108069646686725e-06, "loss": 0.01958431862294674, "memory(GiB)": 21.48, "step": 12309, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.39989604651918265, "grad_norm": 0.47086623311042786, "learning_rate": 7.010315152602091e-06, "loss": 0.02272617071866989, "memory(GiB)": 21.48, "step": 12310, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.956008 }, { "epoch": 0.39992853198193806, "grad_norm": 0.4060508906841278, "learning_rate": 7.009823317333797e-06, "loss": 0.022832170128822327, "memory(GiB)": 21.48, "step": 12311, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956024 }, { "epoch": 0.3999610174446935, "grad_norm": 0.4442629814147949, "learning_rate": 7.009331458869467e-06, "loss": 0.020790591835975647, "memory(GiB)": 21.48, "step": 12312, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.3999935029074489, "grad_norm": 0.3166247606277466, "learning_rate": 7.008839577214777e-06, "loss": 0.01995762623846531, "memory(GiB)": 21.48, "step": 12313, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.4000259883702043, "grad_norm": 0.37782466411590576, "learning_rate": 7.0083476723754054e-06, "loss": 0.025963187217712402, "memory(GiB)": 21.48, "step": 12314, "token_acc": 0.988, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.4000584738329597, "grad_norm": 0.5583402514457703, "learning_rate": 7.007855744357026e-06, "loss": 0.040342800319194794, "memory(GiB)": 21.48, "step": 12315, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.40009095929571514, "grad_norm": 0.6278669834136963, "learning_rate": 7.007363793165322e-06, "loss": 0.03666889667510986, "memory(GiB)": 21.48, "step": 12316, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956106 }, { "epoch": 0.40012344475847056, "grad_norm": 0.3258536159992218, "learning_rate": 7.006871818805967e-06, "loss": 0.023838777095079422, "memory(GiB)": 21.48, "step": 12317, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.400155930221226, "grad_norm": 0.40803903341293335, "learning_rate": 7.006379821284639e-06, "loss": 0.030206825584173203, "memory(GiB)": 21.48, "step": 12318, "token_acc": 0.9766355140186916, "train_speed(iter/s)": 0.956139 }, { "epoch": 0.4001884156839814, "grad_norm": 0.4375273883342743, "learning_rate": 7.0058878006070185e-06, "loss": 0.027974043041467667, "memory(GiB)": 21.48, "step": 12319, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.40022090114673686, "grad_norm": 0.3784838914871216, "learning_rate": 7.0053957567787826e-06, "loss": 0.023323915898799896, "memory(GiB)": 21.48, "step": 12320, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.4002533866094923, "grad_norm": 0.38965845108032227, "learning_rate": 7.004903689805611e-06, "loss": 0.022436916828155518, "memory(GiB)": 21.48, "step": 12321, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.4002858720722477, "grad_norm": 0.349600225687027, "learning_rate": 7.004411599693182e-06, "loss": 0.03076651319861412, "memory(GiB)": 21.48, "step": 12322, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.4003183575350031, "grad_norm": 1.4470075368881226, "learning_rate": 7.003919486447175e-06, "loss": 0.02609066665172577, "memory(GiB)": 21.48, "step": 12323, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.4003508429977585, "grad_norm": 0.3965972661972046, "learning_rate": 7.00342735007327e-06, "loss": 0.022239159792661667, "memory(GiB)": 21.48, "step": 12324, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.40038332846051394, "grad_norm": 0.42398130893707275, "learning_rate": 7.002935190577147e-06, "loss": 0.025144342333078384, "memory(GiB)": 21.48, "step": 12325, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.40041581392326936, "grad_norm": 0.47256892919540405, "learning_rate": 7.002443007964486e-06, "loss": 0.029597897082567215, "memory(GiB)": 21.48, "step": 12326, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.4004482993860248, "grad_norm": 0.3435487747192383, "learning_rate": 7.0019508022409666e-06, "loss": 0.021709192544221878, "memory(GiB)": 21.48, "step": 12327, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956286 }, { "epoch": 0.4004807848487802, "grad_norm": 0.4491632878780365, "learning_rate": 7.001458573412272e-06, "loss": 0.03092299774289131, "memory(GiB)": 21.48, "step": 12328, "token_acc": 0.995, "train_speed(iter/s)": 0.956301 }, { "epoch": 0.4005132703115356, "grad_norm": 0.3937053084373474, "learning_rate": 7.00096632148408e-06, "loss": 0.02553264982998371, "memory(GiB)": 21.48, "step": 12329, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956317 }, { "epoch": 0.400545755774291, "grad_norm": 0.395913690328598, "learning_rate": 7.0004740464620745e-06, "loss": 0.02359219267964363, "memory(GiB)": 21.48, "step": 12330, "token_acc": 1.0, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.40057824123704644, "grad_norm": 0.3581584095954895, "learning_rate": 6.999981748351935e-06, "loss": 0.018787231296300888, "memory(GiB)": 21.48, "step": 12331, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.40061072669980186, "grad_norm": 0.3593294620513916, "learning_rate": 6.999489427159344e-06, "loss": 0.022542238235473633, "memory(GiB)": 21.48, "step": 12332, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.40064321216255727, "grad_norm": 0.3226136863231659, "learning_rate": 6.998997082889985e-06, "loss": 0.028688274323940277, "memory(GiB)": 21.48, "step": 12333, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956384 }, { "epoch": 0.4006756976253127, "grad_norm": 0.5298824906349182, "learning_rate": 6.998504715549537e-06, "loss": 0.02173277549445629, "memory(GiB)": 21.48, "step": 12334, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.9564 }, { "epoch": 0.4007081830880681, "grad_norm": 0.41499483585357666, "learning_rate": 6.998012325143686e-06, "loss": 0.02726595103740692, "memory(GiB)": 21.48, "step": 12335, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956417 }, { "epoch": 0.4007406685508235, "grad_norm": 0.4411229193210602, "learning_rate": 6.997519911678114e-06, "loss": 0.024384452030062675, "memory(GiB)": 21.48, "step": 12336, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.956433 }, { "epoch": 0.40077315401357894, "grad_norm": 0.36139625310897827, "learning_rate": 6.997027475158502e-06, "loss": 0.016615338623523712, "memory(GiB)": 21.48, "step": 12337, "token_acc": 1.0, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.40080563947633435, "grad_norm": 0.36446452140808105, "learning_rate": 6.996535015590536e-06, "loss": 0.025256266817450523, "memory(GiB)": 21.48, "step": 12338, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.40083812493908977, "grad_norm": 0.29905810952186584, "learning_rate": 6.996042532979899e-06, "loss": 0.020007966086268425, "memory(GiB)": 21.48, "step": 12339, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956475 }, { "epoch": 0.4008706104018452, "grad_norm": 0.32678014039993286, "learning_rate": 6.995550027332273e-06, "loss": 0.023988468572497368, "memory(GiB)": 21.48, "step": 12340, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.4009030958646006, "grad_norm": 0.33967575430870056, "learning_rate": 6.9950574986533436e-06, "loss": 0.017348118126392365, "memory(GiB)": 21.48, "step": 12341, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.9565 }, { "epoch": 0.400935581327356, "grad_norm": 0.3222584128379822, "learning_rate": 6.994564946948797e-06, "loss": 0.02526647225022316, "memory(GiB)": 21.48, "step": 12342, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95651 }, { "epoch": 0.40096806679011143, "grad_norm": 0.3503456711769104, "learning_rate": 6.9940723722243145e-06, "loss": 0.026587065309286118, "memory(GiB)": 21.48, "step": 12343, "token_acc": 1.0, "train_speed(iter/s)": 0.956522 }, { "epoch": 0.40100055225286685, "grad_norm": 0.3370315432548523, "learning_rate": 6.993579774485584e-06, "loss": 0.024125024676322937, "memory(GiB)": 21.48, "step": 12344, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.40103303771562226, "grad_norm": 0.32557809352874756, "learning_rate": 6.993087153738287e-06, "loss": 0.020698146894574165, "memory(GiB)": 21.48, "step": 12345, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956545 }, { "epoch": 0.4010655231783777, "grad_norm": 0.3740778863430023, "learning_rate": 6.992594509988113e-06, "loss": 0.026195615530014038, "memory(GiB)": 21.48, "step": 12346, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.4010980086411331, "grad_norm": 0.4002883732318878, "learning_rate": 6.9921018432407475e-06, "loss": 0.021458275616168976, "memory(GiB)": 21.48, "step": 12347, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956566 }, { "epoch": 0.4011304941038885, "grad_norm": 0.28186243772506714, "learning_rate": 6.9916091535018735e-06, "loss": 0.017899956554174423, "memory(GiB)": 21.48, "step": 12348, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.40116297956664393, "grad_norm": 0.8373296856880188, "learning_rate": 6.99111644077718e-06, "loss": 0.03707621246576309, "memory(GiB)": 21.48, "step": 12349, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956587 }, { "epoch": 0.40119546502939935, "grad_norm": 0.47510313987731934, "learning_rate": 6.990623705072352e-06, "loss": 0.028333870694041252, "memory(GiB)": 21.48, "step": 12350, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.40122795049215476, "grad_norm": 0.4793369472026825, "learning_rate": 6.990130946393078e-06, "loss": 0.030555395409464836, "memory(GiB)": 21.48, "step": 12351, "token_acc": 0.9936102236421726, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.4012604359549102, "grad_norm": 0.38455772399902344, "learning_rate": 6.9896381647450425e-06, "loss": 0.02389506809413433, "memory(GiB)": 21.48, "step": 12352, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.4012929214176656, "grad_norm": 0.5356584787368774, "learning_rate": 6.989145360133935e-06, "loss": 0.03271123766899109, "memory(GiB)": 21.48, "step": 12353, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.401325406880421, "grad_norm": 0.35213786363601685, "learning_rate": 6.9886525325654435e-06, "loss": 0.017101015895605087, "memory(GiB)": 21.48, "step": 12354, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.4013578923431764, "grad_norm": 0.588604211807251, "learning_rate": 6.988159682045254e-06, "loss": 0.03218362480401993, "memory(GiB)": 21.48, "step": 12355, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.956648 }, { "epoch": 0.40139037780593184, "grad_norm": 0.39994511008262634, "learning_rate": 6.987666808579055e-06, "loss": 0.025765197351574898, "memory(GiB)": 21.48, "step": 12356, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.40142286326868726, "grad_norm": 0.3001660704612732, "learning_rate": 6.9871739121725355e-06, "loss": 0.02556317113339901, "memory(GiB)": 21.48, "step": 12357, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956668 }, { "epoch": 0.4014553487314427, "grad_norm": 0.3548559844493866, "learning_rate": 6.986680992831385e-06, "loss": 0.025760222226381302, "memory(GiB)": 21.48, "step": 12358, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.4014878341941981, "grad_norm": 0.3061484396457672, "learning_rate": 6.986188050561291e-06, "loss": 0.02524113468825817, "memory(GiB)": 21.48, "step": 12359, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.4015203196569535, "grad_norm": 0.35326769948005676, "learning_rate": 6.985695085367944e-06, "loss": 0.024409931153059006, "memory(GiB)": 21.48, "step": 12360, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.4015528051197089, "grad_norm": 0.3480999171733856, "learning_rate": 6.985202097257032e-06, "loss": 0.019207436591386795, "memory(GiB)": 21.48, "step": 12361, "token_acc": 1.0, "train_speed(iter/s)": 0.95671 }, { "epoch": 0.40158529058246434, "grad_norm": 0.3150005042552948, "learning_rate": 6.984709086234245e-06, "loss": 0.02859370969235897, "memory(GiB)": 21.48, "step": 12362, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.40161777604521975, "grad_norm": 0.5548555254936218, "learning_rate": 6.984216052305275e-06, "loss": 0.027084091678261757, "memory(GiB)": 21.48, "step": 12363, "token_acc": 1.0, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.40165026150797517, "grad_norm": 0.5924116373062134, "learning_rate": 6.983722995475809e-06, "loss": 0.02830968238413334, "memory(GiB)": 21.48, "step": 12364, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956758 }, { "epoch": 0.4016827469707306, "grad_norm": 0.4408022463321686, "learning_rate": 6.9832299157515415e-06, "loss": 0.03588952124118805, "memory(GiB)": 21.48, "step": 12365, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.401715232433486, "grad_norm": 0.28070881962776184, "learning_rate": 6.982736813138159e-06, "loss": 0.023041438311338425, "memory(GiB)": 21.48, "step": 12366, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.4017477178962414, "grad_norm": 0.3248308002948761, "learning_rate": 6.982243687641356e-06, "loss": 0.016136784106492996, "memory(GiB)": 21.48, "step": 12367, "token_acc": 1.0, "train_speed(iter/s)": 0.956806 }, { "epoch": 0.40178020335899683, "grad_norm": 0.35886067152023315, "learning_rate": 6.981750539266821e-06, "loss": 0.024474509060382843, "memory(GiB)": 21.48, "step": 12368, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.40181268882175225, "grad_norm": 0.5858771204948425, "learning_rate": 6.981257368020249e-06, "loss": 0.025879234075546265, "memory(GiB)": 21.48, "step": 12369, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.956838 }, { "epoch": 0.40184517428450767, "grad_norm": 0.3660150170326233, "learning_rate": 6.9807641739073295e-06, "loss": 0.02616049349308014, "memory(GiB)": 21.48, "step": 12370, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.4018776597472631, "grad_norm": 0.4374943673610687, "learning_rate": 6.980270956933755e-06, "loss": 0.028397761285305023, "memory(GiB)": 21.48, "step": 12371, "token_acc": 0.988, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.4019101452100185, "grad_norm": 0.500383734703064, "learning_rate": 6.979777717105218e-06, "loss": 0.026899591088294983, "memory(GiB)": 21.48, "step": 12372, "token_acc": 0.9726775956284153, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.4019426306727739, "grad_norm": 0.3898746371269226, "learning_rate": 6.979284454427409e-06, "loss": 0.027305010706186295, "memory(GiB)": 21.48, "step": 12373, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.40197511613552933, "grad_norm": 0.2694602310657501, "learning_rate": 6.978791168906025e-06, "loss": 0.02038617618381977, "memory(GiB)": 21.48, "step": 12374, "token_acc": 1.0, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.40200760159828475, "grad_norm": 0.6583669781684875, "learning_rate": 6.978297860546756e-06, "loss": 0.021776940673589706, "memory(GiB)": 21.48, "step": 12375, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956932 }, { "epoch": 0.40204008706104016, "grad_norm": 0.48282590508461, "learning_rate": 6.977804529355299e-06, "loss": 0.02254335954785347, "memory(GiB)": 21.48, "step": 12376, "token_acc": 0.966542750929368, "train_speed(iter/s)": 0.956949 }, { "epoch": 0.4020725725237956, "grad_norm": 0.4757465720176697, "learning_rate": 6.977311175337342e-06, "loss": 0.025895802304148674, "memory(GiB)": 21.48, "step": 12377, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.402105057986551, "grad_norm": 0.8442367315292358, "learning_rate": 6.976817798498583e-06, "loss": 0.028923358768224716, "memory(GiB)": 21.48, "step": 12378, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956981 }, { "epoch": 0.4021375434493064, "grad_norm": 0.38946568965911865, "learning_rate": 6.976324398844717e-06, "loss": 0.03090069070458412, "memory(GiB)": 21.48, "step": 12379, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.4021700289120618, "grad_norm": 0.3356223702430725, "learning_rate": 6.975830976381436e-06, "loss": 0.02496296353638172, "memory(GiB)": 21.48, "step": 12380, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957014 }, { "epoch": 0.40220251437481724, "grad_norm": 0.4230514168739319, "learning_rate": 6.9753375311144365e-06, "loss": 0.026385366916656494, "memory(GiB)": 21.48, "step": 12381, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.957029 }, { "epoch": 0.40223499983757266, "grad_norm": 0.3326576352119446, "learning_rate": 6.974844063049412e-06, "loss": 0.02803438901901245, "memory(GiB)": 21.48, "step": 12382, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.4022674853003281, "grad_norm": 0.28209927678108215, "learning_rate": 6.974350572192059e-06, "loss": 0.02080863155424595, "memory(GiB)": 21.48, "step": 12383, "token_acc": 1.0, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.40229997076308355, "grad_norm": 0.8177801966667175, "learning_rate": 6.973857058548073e-06, "loss": 0.029865581542253494, "memory(GiB)": 21.48, "step": 12384, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957078 }, { "epoch": 0.40233245622583896, "grad_norm": 0.3348255157470703, "learning_rate": 6.973363522123148e-06, "loss": 0.02169634774327278, "memory(GiB)": 21.48, "step": 12385, "token_acc": 1.0, "train_speed(iter/s)": 0.957095 }, { "epoch": 0.4023649416885944, "grad_norm": 0.6706414818763733, "learning_rate": 6.972869962922981e-06, "loss": 0.030703052878379822, "memory(GiB)": 21.48, "step": 12386, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.4023974271513498, "grad_norm": 0.3837454617023468, "learning_rate": 6.972376380953269e-06, "loss": 0.03011668473482132, "memory(GiB)": 21.48, "step": 12387, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.4024299126141052, "grad_norm": 0.36219102144241333, "learning_rate": 6.9718827762197105e-06, "loss": 0.028481770306825638, "memory(GiB)": 21.48, "step": 12388, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.40246239807686063, "grad_norm": 0.6607905626296997, "learning_rate": 6.9713891487279984e-06, "loss": 0.027175361290574074, "memory(GiB)": 21.48, "step": 12389, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.40249488353961604, "grad_norm": 0.38975757360458374, "learning_rate": 6.970895498483832e-06, "loss": 0.02252396196126938, "memory(GiB)": 21.48, "step": 12390, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.40252736900237146, "grad_norm": 0.3955380320549011, "learning_rate": 6.970401825492908e-06, "loss": 0.029517071321606636, "memory(GiB)": 21.48, "step": 12391, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957192 }, { "epoch": 0.4025598544651269, "grad_norm": 0.51917964220047, "learning_rate": 6.969908129760925e-06, "loss": 0.03540516272187233, "memory(GiB)": 21.48, "step": 12392, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.957208 }, { "epoch": 0.4025923399278823, "grad_norm": 0.5333631634712219, "learning_rate": 6.9694144112935795e-06, "loss": 0.025211934000253677, "memory(GiB)": 21.48, "step": 12393, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.4026248253906377, "grad_norm": 0.6810558438301086, "learning_rate": 6.96892067009657e-06, "loss": 0.027568267658352852, "memory(GiB)": 21.48, "step": 12394, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.95724 }, { "epoch": 0.4026573108533931, "grad_norm": 0.34920692443847656, "learning_rate": 6.968426906175596e-06, "loss": 0.017949622124433517, "memory(GiB)": 21.48, "step": 12395, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957256 }, { "epoch": 0.40268979631614854, "grad_norm": 0.41527873277664185, "learning_rate": 6.967933119536355e-06, "loss": 0.029447557404637337, "memory(GiB)": 21.48, "step": 12396, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.40272228177890396, "grad_norm": 0.3568645119667053, "learning_rate": 6.967439310184548e-06, "loss": 0.019802717491984367, "memory(GiB)": 21.48, "step": 12397, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.4027547672416594, "grad_norm": 0.37084880471229553, "learning_rate": 6.966945478125873e-06, "loss": 0.02399563044309616, "memory(GiB)": 21.48, "step": 12398, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957304 }, { "epoch": 0.4027872527044148, "grad_norm": 1.19921875, "learning_rate": 6.966451623366027e-06, "loss": 0.040320299565792084, "memory(GiB)": 21.48, "step": 12399, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957321 }, { "epoch": 0.4028197381671702, "grad_norm": 0.4639432728290558, "learning_rate": 6.965957745910713e-06, "loss": 0.03700145334005356, "memory(GiB)": 21.48, "step": 12400, "token_acc": 0.9826989619377162, "train_speed(iter/s)": 0.957334 }, { "epoch": 0.4028522236299256, "grad_norm": 0.3475033640861511, "learning_rate": 6.96546384576563e-06, "loss": 0.019966505467891693, "memory(GiB)": 21.48, "step": 12401, "token_acc": 1.0, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.40288470909268104, "grad_norm": 0.6139343976974487, "learning_rate": 6.964969922936477e-06, "loss": 0.02926214598119259, "memory(GiB)": 21.48, "step": 12402, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.40291719455543645, "grad_norm": 0.41458964347839355, "learning_rate": 6.964475977428957e-06, "loss": 0.029444221407175064, "memory(GiB)": 21.48, "step": 12403, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95737 }, { "epoch": 0.40294968001819187, "grad_norm": 0.4272242784500122, "learning_rate": 6.963982009248769e-06, "loss": 0.024888545274734497, "memory(GiB)": 21.48, "step": 12404, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.4029821654809473, "grad_norm": 0.35365721583366394, "learning_rate": 6.9634880184016154e-06, "loss": 0.022539764642715454, "memory(GiB)": 21.48, "step": 12405, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957392 }, { "epoch": 0.4030146509437027, "grad_norm": 0.46244344115257263, "learning_rate": 6.962994004893196e-06, "loss": 0.018517933785915375, "memory(GiB)": 21.48, "step": 12406, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957403 }, { "epoch": 0.4030471364064581, "grad_norm": 0.47764328122138977, "learning_rate": 6.962499968729214e-06, "loss": 0.02215540036559105, "memory(GiB)": 21.48, "step": 12407, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957414 }, { "epoch": 0.40307962186921353, "grad_norm": 0.5936877727508545, "learning_rate": 6.962005909915368e-06, "loss": 0.03986496850848198, "memory(GiB)": 21.48, "step": 12408, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957426 }, { "epoch": 0.40311210733196895, "grad_norm": 0.3850381076335907, "learning_rate": 6.961511828457363e-06, "loss": 0.028023669496178627, "memory(GiB)": 21.48, "step": 12409, "token_acc": 1.0, "train_speed(iter/s)": 0.957437 }, { "epoch": 0.40314459279472437, "grad_norm": 0.42603883147239685, "learning_rate": 6.961017724360902e-06, "loss": 0.026973329484462738, "memory(GiB)": 21.48, "step": 12410, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.957441 }, { "epoch": 0.4031770782574798, "grad_norm": 0.5835192799568176, "learning_rate": 6.960523597631686e-06, "loss": 0.02650647982954979, "memory(GiB)": 21.48, "step": 12411, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957451 }, { "epoch": 0.4032095637202352, "grad_norm": 0.39045920968055725, "learning_rate": 6.960029448275419e-06, "loss": 0.021748322993516922, "memory(GiB)": 21.48, "step": 12412, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957462 }, { "epoch": 0.4032420491829906, "grad_norm": 0.4638848900794983, "learning_rate": 6.959535276297802e-06, "loss": 0.031072251498699188, "memory(GiB)": 21.48, "step": 12413, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957473 }, { "epoch": 0.40327453464574603, "grad_norm": 0.40702834725379944, "learning_rate": 6.959041081704541e-06, "loss": 0.031025119125843048, "memory(GiB)": 21.48, "step": 12414, "token_acc": 1.0, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.40330702010850145, "grad_norm": 0.4017650783061981, "learning_rate": 6.958546864501337e-06, "loss": 0.02906932681798935, "memory(GiB)": 21.48, "step": 12415, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957493 }, { "epoch": 0.40333950557125686, "grad_norm": 0.4355776011943817, "learning_rate": 6.958052624693897e-06, "loss": 0.031615324318408966, "memory(GiB)": 21.48, "step": 12416, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.4033719910340123, "grad_norm": 0.37826287746429443, "learning_rate": 6.957558362287921e-06, "loss": 0.03228690102696419, "memory(GiB)": 21.48, "step": 12417, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.4034044764967677, "grad_norm": 0.357075035572052, "learning_rate": 6.9570640772891185e-06, "loss": 0.024862773716449738, "memory(GiB)": 21.48, "step": 12418, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957525 }, { "epoch": 0.4034369619595231, "grad_norm": 0.30973920226097107, "learning_rate": 6.956569769703189e-06, "loss": 0.022816043347120285, "memory(GiB)": 21.48, "step": 12419, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.4034694474222785, "grad_norm": 0.4200742840766907, "learning_rate": 6.956075439535843e-06, "loss": 0.032329559326171875, "memory(GiB)": 21.48, "step": 12420, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.40350193288503394, "grad_norm": 0.33621731400489807, "learning_rate": 6.955581086792783e-06, "loss": 0.023540478199720383, "memory(GiB)": 21.48, "step": 12421, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.40353441834778936, "grad_norm": 0.456988662481308, "learning_rate": 6.955086711479712e-06, "loss": 0.032882608473300934, "memory(GiB)": 21.48, "step": 12422, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.4035669038105448, "grad_norm": 0.9387011528015137, "learning_rate": 6.954592313602341e-06, "loss": 0.03544950857758522, "memory(GiB)": 21.48, "step": 12423, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.4035993892733002, "grad_norm": 0.5059340000152588, "learning_rate": 6.954097893166369e-06, "loss": 0.04061930626630783, "memory(GiB)": 21.48, "step": 12424, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.4036318747360556, "grad_norm": 0.39710816740989685, "learning_rate": 6.95360345017751e-06, "loss": 0.026372313499450684, "memory(GiB)": 21.48, "step": 12425, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.403664360198811, "grad_norm": 0.35764896869659424, "learning_rate": 6.9531089846414665e-06, "loss": 0.024942778050899506, "memory(GiB)": 21.48, "step": 12426, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.40369684566156644, "grad_norm": 0.43902984261512756, "learning_rate": 6.952614496563944e-06, "loss": 0.028572680428624153, "memory(GiB)": 21.48, "step": 12427, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.40372933112432186, "grad_norm": 0.2508884072303772, "learning_rate": 6.952119985950654e-06, "loss": 0.02242119610309601, "memory(GiB)": 21.48, "step": 12428, "token_acc": 1.0, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.40376181658707727, "grad_norm": 0.25824153423309326, "learning_rate": 6.951625452807298e-06, "loss": 0.012929876334965229, "memory(GiB)": 21.48, "step": 12429, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.4037943020498327, "grad_norm": 0.4948320984840393, "learning_rate": 6.9511308971395885e-06, "loss": 0.02447446435689926, "memory(GiB)": 21.48, "step": 12430, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957695 }, { "epoch": 0.4038267875125881, "grad_norm": 0.29993292689323425, "learning_rate": 6.950636318953231e-06, "loss": 0.022482171654701233, "memory(GiB)": 21.48, "step": 12431, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.4038592729753435, "grad_norm": 0.3295130729675293, "learning_rate": 6.950141718253935e-06, "loss": 0.02321546897292137, "memory(GiB)": 21.48, "step": 12432, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.40389175843809894, "grad_norm": 0.4887259602546692, "learning_rate": 6.949647095047407e-06, "loss": 0.025063324719667435, "memory(GiB)": 21.48, "step": 12433, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.40392424390085435, "grad_norm": 0.44130226969718933, "learning_rate": 6.949152449339357e-06, "loss": 0.022240933030843735, "memory(GiB)": 21.48, "step": 12434, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.40395672936360977, "grad_norm": 0.3110162913799286, "learning_rate": 6.948657781135494e-06, "loss": 0.02488688752055168, "memory(GiB)": 21.48, "step": 12435, "token_acc": 1.0, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.4039892148263652, "grad_norm": 0.3612479269504547, "learning_rate": 6.948163090441524e-06, "loss": 0.03126208856701851, "memory(GiB)": 21.48, "step": 12436, "token_acc": 1.0, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.4040217002891206, "grad_norm": 0.30579259991645813, "learning_rate": 6.947668377263162e-06, "loss": 0.023612376302480698, "memory(GiB)": 21.48, "step": 12437, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.404054185751876, "grad_norm": 0.5517910122871399, "learning_rate": 6.947173641606113e-06, "loss": 0.0311169121414423, "memory(GiB)": 21.48, "step": 12438, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957823 }, { "epoch": 0.40408667121463143, "grad_norm": 0.329453706741333, "learning_rate": 6.946678883476088e-06, "loss": 0.017216268926858902, "memory(GiB)": 21.48, "step": 12439, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957839 }, { "epoch": 0.40411915667738685, "grad_norm": 0.5688583850860596, "learning_rate": 6.946184102878799e-06, "loss": 0.0259549617767334, "memory(GiB)": 21.48, "step": 12440, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.40415164214014226, "grad_norm": 0.3320659399032593, "learning_rate": 6.945689299819953e-06, "loss": 0.028717372566461563, "memory(GiB)": 21.48, "step": 12441, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.4041841276028977, "grad_norm": 0.46343719959259033, "learning_rate": 6.945194474305266e-06, "loss": 0.02829909138381481, "memory(GiB)": 21.48, "step": 12442, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957885 }, { "epoch": 0.4042166130656531, "grad_norm": 0.803247332572937, "learning_rate": 6.944699626340443e-06, "loss": 0.02209247276186943, "memory(GiB)": 21.48, "step": 12443, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.9579 }, { "epoch": 0.4042490985284085, "grad_norm": 0.3310604989528656, "learning_rate": 6.944204755931198e-06, "loss": 0.022625332698225975, "memory(GiB)": 21.48, "step": 12444, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957915 }, { "epoch": 0.40428158399116393, "grad_norm": 0.38684383034706116, "learning_rate": 6.943709863083243e-06, "loss": 0.0289301835000515, "memory(GiB)": 21.48, "step": 12445, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.40431406945391934, "grad_norm": 0.4866624176502228, "learning_rate": 6.94321494780229e-06, "loss": 0.017477724701166153, "memory(GiB)": 21.48, "step": 12446, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.40434655491667476, "grad_norm": 0.3285345733165741, "learning_rate": 6.9427200100940494e-06, "loss": 0.01719348505139351, "memory(GiB)": 21.48, "step": 12447, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957961 }, { "epoch": 0.40437904037943023, "grad_norm": 0.23996391892433167, "learning_rate": 6.942225049964235e-06, "loss": 0.01686840131878853, "memory(GiB)": 21.48, "step": 12448, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.40441152584218565, "grad_norm": 0.270881325006485, "learning_rate": 6.941730067418557e-06, "loss": 0.01369132474064827, "memory(GiB)": 21.48, "step": 12449, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.40444401130494106, "grad_norm": 0.3161953389644623, "learning_rate": 6.94123506246273e-06, "loss": 0.022099606692790985, "memory(GiB)": 21.48, "step": 12450, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.4044764967676965, "grad_norm": 0.704790472984314, "learning_rate": 6.940740035102468e-06, "loss": 0.021325550973415375, "memory(GiB)": 21.48, "step": 12451, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.4045089822304519, "grad_norm": 0.3931627571582794, "learning_rate": 6.94024498534348e-06, "loss": 0.02467331662774086, "memory(GiB)": 21.48, "step": 12452, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.4045414676932073, "grad_norm": 0.5604182481765747, "learning_rate": 6.939749913191486e-06, "loss": 0.02437639608979225, "memory(GiB)": 21.48, "step": 12453, "token_acc": 1.0, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.40457395315596273, "grad_norm": 0.41358599066734314, "learning_rate": 6.939254818652193e-06, "loss": 0.018227238208055496, "memory(GiB)": 21.48, "step": 12454, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.40460643861871814, "grad_norm": 0.5261097550392151, "learning_rate": 6.938759701731319e-06, "loss": 0.02168627642095089, "memory(GiB)": 21.48, "step": 12455, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.958092 }, { "epoch": 0.40463892408147356, "grad_norm": 0.3330460786819458, "learning_rate": 6.938264562434579e-06, "loss": 0.018285511061549187, "memory(GiB)": 21.48, "step": 12456, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958108 }, { "epoch": 0.404671409544229, "grad_norm": 0.46851038932800293, "learning_rate": 6.9377694007676844e-06, "loss": 0.025860179215669632, "memory(GiB)": 21.48, "step": 12457, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.4047038950069844, "grad_norm": 0.3409734070301056, "learning_rate": 6.937274216736354e-06, "loss": 0.02491983026266098, "memory(GiB)": 21.48, "step": 12458, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.4047363804697398, "grad_norm": 0.5811756253242493, "learning_rate": 6.9367790103462985e-06, "loss": 0.040415652096271515, "memory(GiB)": 21.48, "step": 12459, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.4047688659324952, "grad_norm": 0.40170109272003174, "learning_rate": 6.936283781603236e-06, "loss": 0.017423823475837708, "memory(GiB)": 21.48, "step": 12460, "token_acc": 1.0, "train_speed(iter/s)": 0.958174 }, { "epoch": 0.40480135139525064, "grad_norm": 0.519656777381897, "learning_rate": 6.935788530512881e-06, "loss": 0.02532053366303444, "memory(GiB)": 21.48, "step": 12461, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.40483383685800606, "grad_norm": 0.552262544631958, "learning_rate": 6.935293257080951e-06, "loss": 0.030034344643354416, "memory(GiB)": 21.48, "step": 12462, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958202 }, { "epoch": 0.4048663223207615, "grad_norm": 0.5158554315567017, "learning_rate": 6.934797961313161e-06, "loss": 0.028719361871480942, "memory(GiB)": 21.48, "step": 12463, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958211 }, { "epoch": 0.4048988077835169, "grad_norm": 0.29091688990592957, "learning_rate": 6.934302643215226e-06, "loss": 0.020816579461097717, "memory(GiB)": 21.48, "step": 12464, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958222 }, { "epoch": 0.4049312932462723, "grad_norm": 0.4832039475440979, "learning_rate": 6.933807302792865e-06, "loss": 0.029874147847294807, "memory(GiB)": 21.48, "step": 12465, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.958233 }, { "epoch": 0.4049637787090277, "grad_norm": 0.7009689807891846, "learning_rate": 6.9333119400517925e-06, "loss": 0.02797367423772812, "memory(GiB)": 21.48, "step": 12466, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.40499626417178314, "grad_norm": 0.4747963547706604, "learning_rate": 6.932816554997729e-06, "loss": 0.03241352736949921, "memory(GiB)": 21.48, "step": 12467, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.958256 }, { "epoch": 0.40502874963453855, "grad_norm": 0.47717443108558655, "learning_rate": 6.932321147636388e-06, "loss": 0.01943284459412098, "memory(GiB)": 21.48, "step": 12468, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.958267 }, { "epoch": 0.40506123509729397, "grad_norm": 0.38751471042633057, "learning_rate": 6.93182571797349e-06, "loss": 0.02475586161017418, "memory(GiB)": 21.48, "step": 12469, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958277 }, { "epoch": 0.4050937205600494, "grad_norm": 0.4057597219944, "learning_rate": 6.931330266014752e-06, "loss": 0.020768793299794197, "memory(GiB)": 21.48, "step": 12470, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.4051262060228048, "grad_norm": 0.3900315761566162, "learning_rate": 6.930834791765892e-06, "loss": 0.02228240668773651, "memory(GiB)": 21.48, "step": 12471, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.9583 }, { "epoch": 0.4051586914855602, "grad_norm": 0.3449859619140625, "learning_rate": 6.930339295232629e-06, "loss": 0.02254543825984001, "memory(GiB)": 21.48, "step": 12472, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.958311 }, { "epoch": 0.40519117694831563, "grad_norm": 0.33969515562057495, "learning_rate": 6.9298437764206794e-06, "loss": 0.02445639669895172, "memory(GiB)": 21.48, "step": 12473, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958322 }, { "epoch": 0.40522366241107105, "grad_norm": 0.2774886190891266, "learning_rate": 6.929348235335765e-06, "loss": 0.01708921790122986, "memory(GiB)": 21.48, "step": 12474, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958333 }, { "epoch": 0.40525614787382647, "grad_norm": 0.5430199503898621, "learning_rate": 6.928852671983605e-06, "loss": 0.03032253310084343, "memory(GiB)": 21.48, "step": 12475, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958343 }, { "epoch": 0.4052886333365819, "grad_norm": 0.3329002559185028, "learning_rate": 6.928357086369917e-06, "loss": 0.023369982838630676, "memory(GiB)": 21.48, "step": 12476, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958353 }, { "epoch": 0.4053211187993373, "grad_norm": 0.3608773946762085, "learning_rate": 6.927861478500422e-06, "loss": 0.01976807788014412, "memory(GiB)": 21.48, "step": 12477, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958364 }, { "epoch": 0.4053536042620927, "grad_norm": 0.6045159697532654, "learning_rate": 6.927365848380839e-06, "loss": 0.034585967659950256, "memory(GiB)": 21.48, "step": 12478, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.40538608972484813, "grad_norm": 0.4119260013103485, "learning_rate": 6.92687019601689e-06, "loss": 0.03302321583032608, "memory(GiB)": 21.48, "step": 12479, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.958384 }, { "epoch": 0.40541857518760355, "grad_norm": 0.42210379242897034, "learning_rate": 6.926374521414293e-06, "loss": 0.03201374411582947, "memory(GiB)": 21.48, "step": 12480, "token_acc": 0.9624413145539906, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.40545106065035896, "grad_norm": 0.6988522410392761, "learning_rate": 6.925878824578771e-06, "loss": 0.023392755538225174, "memory(GiB)": 21.48, "step": 12481, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.958404 }, { "epoch": 0.4054835461131144, "grad_norm": 0.5087921619415283, "learning_rate": 6.925383105516043e-06, "loss": 0.027587242424488068, "memory(GiB)": 21.48, "step": 12482, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958417 }, { "epoch": 0.4055160315758698, "grad_norm": 1.6129637956619263, "learning_rate": 6.924887364231831e-06, "loss": 0.03409590199589729, "memory(GiB)": 21.48, "step": 12483, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.958431 }, { "epoch": 0.4055485170386252, "grad_norm": 0.3966664969921112, "learning_rate": 6.924391600731858e-06, "loss": 0.030708394944667816, "memory(GiB)": 21.48, "step": 12484, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.958444 }, { "epoch": 0.4055810025013806, "grad_norm": 0.4266510605812073, "learning_rate": 6.923895815021844e-06, "loss": 0.031528543680906296, "memory(GiB)": 21.48, "step": 12485, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958457 }, { "epoch": 0.40561348796413604, "grad_norm": 0.33583176136016846, "learning_rate": 6.923400007107512e-06, "loss": 0.021315941587090492, "memory(GiB)": 21.48, "step": 12486, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.958473 }, { "epoch": 0.40564597342689146, "grad_norm": 0.4281918704509735, "learning_rate": 6.922904176994583e-06, "loss": 0.029412386938929558, "memory(GiB)": 21.48, "step": 12487, "token_acc": 0.984, "train_speed(iter/s)": 0.958489 }, { "epoch": 0.4056784588896469, "grad_norm": 0.5879517793655396, "learning_rate": 6.922408324688782e-06, "loss": 0.02251788228750229, "memory(GiB)": 21.48, "step": 12488, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.958506 }, { "epoch": 0.4057109443524023, "grad_norm": 0.4452474117279053, "learning_rate": 6.921912450195829e-06, "loss": 0.029187435284256935, "memory(GiB)": 21.48, "step": 12489, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958522 }, { "epoch": 0.4057434298151577, "grad_norm": 0.4978885352611542, "learning_rate": 6.9214165535214484e-06, "loss": 0.030978821218013763, "memory(GiB)": 21.48, "step": 12490, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958537 }, { "epoch": 0.4057759152779131, "grad_norm": 0.356125146150589, "learning_rate": 6.9209206346713644e-06, "loss": 0.022935641929507256, "memory(GiB)": 21.48, "step": 12491, "token_acc": 1.0, "train_speed(iter/s)": 0.958554 }, { "epoch": 0.40580840074066854, "grad_norm": 0.352603018283844, "learning_rate": 6.9204246936512976e-06, "loss": 0.0320369154214859, "memory(GiB)": 21.48, "step": 12492, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.958569 }, { "epoch": 0.40584088620342396, "grad_norm": 0.5362686514854431, "learning_rate": 6.919928730466976e-06, "loss": 0.027839824557304382, "memory(GiB)": 21.48, "step": 12493, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.958585 }, { "epoch": 0.40587337166617937, "grad_norm": 0.3525438606739044, "learning_rate": 6.91943274512412e-06, "loss": 0.024280238896608353, "memory(GiB)": 21.48, "step": 12494, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.958601 }, { "epoch": 0.4059058571289348, "grad_norm": 0.29697751998901367, "learning_rate": 6.918936737628454e-06, "loss": 0.02454303205013275, "memory(GiB)": 21.48, "step": 12495, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958617 }, { "epoch": 0.4059383425916902, "grad_norm": 0.5259214639663696, "learning_rate": 6.918440707985707e-06, "loss": 0.02606244757771492, "memory(GiB)": 21.48, "step": 12496, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.958633 }, { "epoch": 0.4059708280544456, "grad_norm": 0.8765596747398376, "learning_rate": 6.9179446562015986e-06, "loss": 0.022745590656995773, "memory(GiB)": 21.48, "step": 12497, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.958648 }, { "epoch": 0.40600331351720104, "grad_norm": 0.3895418047904968, "learning_rate": 6.917448582281856e-06, "loss": 0.03026691824197769, "memory(GiB)": 21.48, "step": 12498, "token_acc": 1.0, "train_speed(iter/s)": 0.958664 }, { "epoch": 0.40603579897995645, "grad_norm": 0.5929280519485474, "learning_rate": 6.916952486232207e-06, "loss": 0.026378072798252106, "memory(GiB)": 21.48, "step": 12499, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.95868 }, { "epoch": 0.40606828444271187, "grad_norm": 0.24407385289669037, "learning_rate": 6.916456368058373e-06, "loss": 0.019969042390584946, "memory(GiB)": 21.48, "step": 12500, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.958696 }, { "epoch": 0.40606828444271187, "eval_loss": 0.026207538321614265, "eval_runtime": 80.7127, "eval_samples_per_second": 123.277, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9898681521218705, "step": 12500 }, { "epoch": 0.4061007699054673, "grad_norm": 0.2578938603401184, "learning_rate": 6.9159602277660805e-06, "loss": 0.014196018688380718, "memory(GiB)": 21.48, "step": 12501, "token_acc": 0.9894559546469114, "train_speed(iter/s)": 0.951943 }, { "epoch": 0.4061332553682227, "grad_norm": 0.33609557151794434, "learning_rate": 6.915464065361059e-06, "loss": 0.027758169919252396, "memory(GiB)": 21.48, "step": 12502, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.951953 }, { "epoch": 0.4061657408309781, "grad_norm": 0.5040923953056335, "learning_rate": 6.914967880849031e-06, "loss": 0.03221602365374565, "memory(GiB)": 21.48, "step": 12503, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.951963 }, { "epoch": 0.40619822629373353, "grad_norm": 0.40631917119026184, "learning_rate": 6.9144716742357276e-06, "loss": 0.02661263197660446, "memory(GiB)": 21.48, "step": 12504, "token_acc": 1.0, "train_speed(iter/s)": 0.951974 }, { "epoch": 0.40623071175648895, "grad_norm": 0.4665051996707916, "learning_rate": 6.9139754455268706e-06, "loss": 0.025097191333770752, "memory(GiB)": 21.48, "step": 12505, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.951986 }, { "epoch": 0.40626319721924437, "grad_norm": 0.35988569259643555, "learning_rate": 6.91347919472819e-06, "loss": 0.03124620020389557, "memory(GiB)": 21.48, "step": 12506, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.951996 }, { "epoch": 0.4062956826819998, "grad_norm": 0.3703390061855316, "learning_rate": 6.912982921845413e-06, "loss": 0.019006598740816116, "memory(GiB)": 21.48, "step": 12507, "token_acc": 1.0, "train_speed(iter/s)": 0.952007 }, { "epoch": 0.4063281681447552, "grad_norm": 0.2617507874965668, "learning_rate": 6.912486626884266e-06, "loss": 0.02064875140786171, "memory(GiB)": 21.48, "step": 12508, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952018 }, { "epoch": 0.4063606536075106, "grad_norm": 0.3740350008010864, "learning_rate": 6.911990309850479e-06, "loss": 0.018371393904089928, "memory(GiB)": 21.48, "step": 12509, "token_acc": 1.0, "train_speed(iter/s)": 0.95203 }, { "epoch": 0.40639313907026603, "grad_norm": 0.4824446737766266, "learning_rate": 6.911493970749779e-06, "loss": 0.024682026356458664, "memory(GiB)": 21.48, "step": 12510, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.952041 }, { "epoch": 0.40642562453302145, "grad_norm": 0.4146524667739868, "learning_rate": 6.910997609587894e-06, "loss": 0.028663024306297302, "memory(GiB)": 21.48, "step": 12511, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.952052 }, { "epoch": 0.4064581099957769, "grad_norm": 0.3161047101020813, "learning_rate": 6.910501226370552e-06, "loss": 0.02162344940006733, "memory(GiB)": 21.48, "step": 12512, "token_acc": 1.0, "train_speed(iter/s)": 0.952063 }, { "epoch": 0.40649059545853233, "grad_norm": 0.3179706335067749, "learning_rate": 6.910004821103484e-06, "loss": 0.020866991952061653, "memory(GiB)": 21.48, "step": 12513, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.952074 }, { "epoch": 0.40652308092128775, "grad_norm": 0.3477620482444763, "learning_rate": 6.909508393792419e-06, "loss": 0.02326522395014763, "memory(GiB)": 21.48, "step": 12514, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.952085 }, { "epoch": 0.40655556638404317, "grad_norm": 0.4812907576560974, "learning_rate": 6.909011944443083e-06, "loss": 0.021449415013194084, "memory(GiB)": 21.48, "step": 12515, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.952097 }, { "epoch": 0.4065880518467986, "grad_norm": 0.33802178502082825, "learning_rate": 6.9085154730612105e-06, "loss": 0.026359044015407562, "memory(GiB)": 21.48, "step": 12516, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.952111 }, { "epoch": 0.406620537309554, "grad_norm": 0.3766777217388153, "learning_rate": 6.908018979652528e-06, "loss": 0.025272246450185776, "memory(GiB)": 21.48, "step": 12517, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.952125 }, { "epoch": 0.4066530227723094, "grad_norm": 0.4970840513706207, "learning_rate": 6.907522464222768e-06, "loss": 0.04035298526287079, "memory(GiB)": 21.48, "step": 12518, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.952139 }, { "epoch": 0.40668550823506483, "grad_norm": 0.45631328225135803, "learning_rate": 6.90702592677766e-06, "loss": 0.028403468430042267, "memory(GiB)": 21.48, "step": 12519, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.952151 }, { "epoch": 0.40671799369782025, "grad_norm": 0.2904373109340668, "learning_rate": 6.906529367322933e-06, "loss": 0.019534191116690636, "memory(GiB)": 21.48, "step": 12520, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.952161 }, { "epoch": 0.40675047916057566, "grad_norm": 0.47548919916152954, "learning_rate": 6.90603278586432e-06, "loss": 0.0316944383084774, "memory(GiB)": 21.48, "step": 12521, "token_acc": 1.0, "train_speed(iter/s)": 0.952176 }, { "epoch": 0.4067829646233311, "grad_norm": 0.4163305461406708, "learning_rate": 6.905536182407553e-06, "loss": 0.036979541182518005, "memory(GiB)": 21.48, "step": 12522, "token_acc": 0.9724409448818898, "train_speed(iter/s)": 0.952189 }, { "epoch": 0.4068154500860865, "grad_norm": 0.3812284469604492, "learning_rate": 6.905039556958361e-06, "loss": 0.023495085537433624, "memory(GiB)": 21.48, "step": 12523, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.952202 }, { "epoch": 0.4068479355488419, "grad_norm": 0.5456402897834778, "learning_rate": 6.904542909522477e-06, "loss": 0.025419730693101883, "memory(GiB)": 21.48, "step": 12524, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.952215 }, { "epoch": 0.4068804210115973, "grad_norm": 0.29287439584732056, "learning_rate": 6.904046240105633e-06, "loss": 0.024422721937298775, "memory(GiB)": 21.48, "step": 12525, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.952228 }, { "epoch": 0.40691290647435274, "grad_norm": 0.45694953203201294, "learning_rate": 6.903549548713561e-06, "loss": 0.03475744649767876, "memory(GiB)": 21.48, "step": 12526, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.952244 }, { "epoch": 0.40694539193710816, "grad_norm": 0.8215309381484985, "learning_rate": 6.903052835351994e-06, "loss": 0.0444217212498188, "memory(GiB)": 21.48, "step": 12527, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.95226 }, { "epoch": 0.4069778773998636, "grad_norm": 0.36729303002357483, "learning_rate": 6.902556100026665e-06, "loss": 0.02992478758096695, "memory(GiB)": 21.48, "step": 12528, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.952276 }, { "epoch": 0.407010362862619, "grad_norm": 0.4429605305194855, "learning_rate": 6.902059342743305e-06, "loss": 0.03408820927143097, "memory(GiB)": 21.48, "step": 12529, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.952292 }, { "epoch": 0.4070428483253744, "grad_norm": 0.431753009557724, "learning_rate": 6.901562563507649e-06, "loss": 0.02181783691048622, "memory(GiB)": 21.48, "step": 12530, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.952308 }, { "epoch": 0.4070753337881298, "grad_norm": 0.3230399787425995, "learning_rate": 6.901065762325429e-06, "loss": 0.026545614004135132, "memory(GiB)": 21.48, "step": 12531, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.952324 }, { "epoch": 0.40710781925088524, "grad_norm": 0.3631261885166168, "learning_rate": 6.900568939202381e-06, "loss": 0.026284223422408104, "memory(GiB)": 21.48, "step": 12532, "token_acc": 0.988, "train_speed(iter/s)": 0.95234 }, { "epoch": 0.40714030471364065, "grad_norm": 0.8291481733322144, "learning_rate": 6.900072094144238e-06, "loss": 0.04216988384723663, "memory(GiB)": 21.48, "step": 12533, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.952356 }, { "epoch": 0.40717279017639607, "grad_norm": 0.3473012149333954, "learning_rate": 6.899575227156734e-06, "loss": 0.02651720494031906, "memory(GiB)": 21.48, "step": 12534, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952372 }, { "epoch": 0.4072052756391515, "grad_norm": 0.5540883541107178, "learning_rate": 6.899078338245603e-06, "loss": 0.034429073333740234, "memory(GiB)": 21.48, "step": 12535, "token_acc": 0.9761904761904762, "train_speed(iter/s)": 0.95239 }, { "epoch": 0.4072377611019069, "grad_norm": 0.39946091175079346, "learning_rate": 6.898581427416579e-06, "loss": 0.022596366703510284, "memory(GiB)": 21.48, "step": 12536, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.952406 }, { "epoch": 0.4072702465646623, "grad_norm": 0.4022013545036316, "learning_rate": 6.898084494675401e-06, "loss": 0.027063149958848953, "memory(GiB)": 21.48, "step": 12537, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.952423 }, { "epoch": 0.40730273202741774, "grad_norm": 0.6288347840309143, "learning_rate": 6.8975875400278e-06, "loss": 0.023624617606401443, "memory(GiB)": 21.48, "step": 12538, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.952439 }, { "epoch": 0.40733521749017315, "grad_norm": 0.31554853916168213, "learning_rate": 6.897090563479514e-06, "loss": 0.01802605763077736, "memory(GiB)": 21.48, "step": 12539, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952454 }, { "epoch": 0.40736770295292857, "grad_norm": 0.3301674723625183, "learning_rate": 6.896593565036277e-06, "loss": 0.025467492640018463, "memory(GiB)": 21.48, "step": 12540, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952471 }, { "epoch": 0.407400188415684, "grad_norm": 0.33996790647506714, "learning_rate": 6.896096544703828e-06, "loss": 0.028696026653051376, "memory(GiB)": 21.48, "step": 12541, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.952487 }, { "epoch": 0.4074326738784394, "grad_norm": 0.2667141258716583, "learning_rate": 6.8955995024879e-06, "loss": 0.016611818224191666, "memory(GiB)": 21.48, "step": 12542, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.952504 }, { "epoch": 0.4074651593411948, "grad_norm": 0.4132024049758911, "learning_rate": 6.895102438394229e-06, "loss": 0.02527238056063652, "memory(GiB)": 21.48, "step": 12543, "token_acc": 1.0, "train_speed(iter/s)": 0.95252 }, { "epoch": 0.40749764480395023, "grad_norm": 1.4184919595718384, "learning_rate": 6.894605352428555e-06, "loss": 0.025362472981214523, "memory(GiB)": 21.48, "step": 12544, "token_acc": 1.0, "train_speed(iter/s)": 0.952534 }, { "epoch": 0.40753013026670565, "grad_norm": 0.2776193618774414, "learning_rate": 6.894108244596612e-06, "loss": 0.024378448724746704, "memory(GiB)": 21.48, "step": 12545, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95255 }, { "epoch": 0.40756261572946106, "grad_norm": 0.3783416152000427, "learning_rate": 6.8936111149041406e-06, "loss": 0.030766615644097328, "memory(GiB)": 21.48, "step": 12546, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.952565 }, { "epoch": 0.4075951011922165, "grad_norm": 0.833358108997345, "learning_rate": 6.893113963356875e-06, "loss": 0.021582067012786865, "memory(GiB)": 21.48, "step": 12547, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.952582 }, { "epoch": 0.4076275866549719, "grad_norm": 0.351666122674942, "learning_rate": 6.892616789960555e-06, "loss": 0.026868466287851334, "memory(GiB)": 21.48, "step": 12548, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.952599 }, { "epoch": 0.4076600721177273, "grad_norm": 0.3197287619113922, "learning_rate": 6.892119594720919e-06, "loss": 0.026778079569339752, "memory(GiB)": 21.48, "step": 12549, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.952615 }, { "epoch": 0.40769255758048273, "grad_norm": 0.287110835313797, "learning_rate": 6.891622377643704e-06, "loss": 0.018946412950754166, "memory(GiB)": 21.48, "step": 12550, "token_acc": 1.0, "train_speed(iter/s)": 0.952631 }, { "epoch": 0.40772504304323814, "grad_norm": 0.3790658712387085, "learning_rate": 6.89112513873465e-06, "loss": 0.024154625833034515, "memory(GiB)": 21.48, "step": 12551, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952647 }, { "epoch": 0.40775752850599356, "grad_norm": 0.36063647270202637, "learning_rate": 6.890627877999494e-06, "loss": 0.023103587329387665, "memory(GiB)": 21.48, "step": 12552, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.952664 }, { "epoch": 0.407790013968749, "grad_norm": 0.4367527961730957, "learning_rate": 6.890130595443975e-06, "loss": 0.03670906275510788, "memory(GiB)": 21.48, "step": 12553, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.952682 }, { "epoch": 0.4078224994315044, "grad_norm": 0.4938969612121582, "learning_rate": 6.889633291073834e-06, "loss": 0.02927623689174652, "memory(GiB)": 21.48, "step": 12554, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.952699 }, { "epoch": 0.4078549848942598, "grad_norm": 0.3792512118816376, "learning_rate": 6.889135964894811e-06, "loss": 0.028809139505028725, "memory(GiB)": 21.48, "step": 12555, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.952712 }, { "epoch": 0.4078874703570152, "grad_norm": 0.34544894099235535, "learning_rate": 6.888638616912643e-06, "loss": 0.015944665297865868, "memory(GiB)": 21.48, "step": 12556, "token_acc": 1.0, "train_speed(iter/s)": 0.952724 }, { "epoch": 0.40791995581977064, "grad_norm": 0.4865983724594116, "learning_rate": 6.888141247133071e-06, "loss": 0.0243646539747715, "memory(GiB)": 21.48, "step": 12557, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.952737 }, { "epoch": 0.40795244128252606, "grad_norm": 0.54993736743927, "learning_rate": 6.887643855561837e-06, "loss": 0.029160616919398308, "memory(GiB)": 21.48, "step": 12558, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.952748 }, { "epoch": 0.4079849267452815, "grad_norm": 0.6551170349121094, "learning_rate": 6.887146442204681e-06, "loss": 0.03302035480737686, "memory(GiB)": 21.48, "step": 12559, "token_acc": 0.976, "train_speed(iter/s)": 0.952761 }, { "epoch": 0.4080174122080369, "grad_norm": 0.4421779215335846, "learning_rate": 6.886649007067343e-06, "loss": 0.028268057852983475, "memory(GiB)": 21.48, "step": 12560, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.952773 }, { "epoch": 0.4080498976707923, "grad_norm": 0.3739538788795471, "learning_rate": 6.886151550155562e-06, "loss": 0.023029016330838203, "memory(GiB)": 21.48, "step": 12561, "token_acc": 1.0, "train_speed(iter/s)": 0.952785 }, { "epoch": 0.4080823831335477, "grad_norm": 0.2936466336250305, "learning_rate": 6.8856540714750855e-06, "loss": 0.021326083689928055, "memory(GiB)": 21.48, "step": 12562, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.952797 }, { "epoch": 0.40811486859630314, "grad_norm": 0.480695515871048, "learning_rate": 6.885156571031648e-06, "loss": 0.017379233613610268, "memory(GiB)": 21.48, "step": 12563, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.95279 }, { "epoch": 0.40814735405905855, "grad_norm": 0.47453001141548157, "learning_rate": 6.884659048830995e-06, "loss": 0.026891443878412247, "memory(GiB)": 21.48, "step": 12564, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.952802 }, { "epoch": 0.40817983952181397, "grad_norm": 0.4284614026546478, "learning_rate": 6.884161504878869e-06, "loss": 0.02334560640156269, "memory(GiB)": 21.48, "step": 12565, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.952815 }, { "epoch": 0.4082123249845694, "grad_norm": 0.6722145080566406, "learning_rate": 6.883663939181011e-06, "loss": 0.030032213777303696, "memory(GiB)": 21.48, "step": 12566, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.952826 }, { "epoch": 0.4082448104473248, "grad_norm": 0.440090537071228, "learning_rate": 6.883166351743165e-06, "loss": 0.021622132509946823, "memory(GiB)": 21.48, "step": 12567, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.952836 }, { "epoch": 0.4082772959100802, "grad_norm": 0.3383011817932129, "learning_rate": 6.882668742571072e-06, "loss": 0.02216409333050251, "memory(GiB)": 21.48, "step": 12568, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.952848 }, { "epoch": 0.40830978137283563, "grad_norm": 0.39673569798469543, "learning_rate": 6.882171111670475e-06, "loss": 0.02301686257123947, "memory(GiB)": 21.48, "step": 12569, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952859 }, { "epoch": 0.40834226683559105, "grad_norm": 0.473145991563797, "learning_rate": 6.881673459047119e-06, "loss": 0.03577857092022896, "memory(GiB)": 21.48, "step": 12570, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95287 }, { "epoch": 0.40837475229834647, "grad_norm": 0.28126251697540283, "learning_rate": 6.881175784706747e-06, "loss": 0.01818399876356125, "memory(GiB)": 21.48, "step": 12571, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.952881 }, { "epoch": 0.4084072377611019, "grad_norm": 0.6020501255989075, "learning_rate": 6.880678088655104e-06, "loss": 0.03642772138118744, "memory(GiB)": 21.48, "step": 12572, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.952894 }, { "epoch": 0.4084397232238573, "grad_norm": 0.2945112884044647, "learning_rate": 6.88018037089793e-06, "loss": 0.016831841319799423, "memory(GiB)": 21.48, "step": 12573, "token_acc": 1.0, "train_speed(iter/s)": 0.952906 }, { "epoch": 0.4084722086866127, "grad_norm": 0.27904370427131653, "learning_rate": 6.8796826314409736e-06, "loss": 0.01981993392109871, "memory(GiB)": 21.48, "step": 12574, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.952919 }, { "epoch": 0.40850469414936813, "grad_norm": 0.35172808170318604, "learning_rate": 6.879184870289976e-06, "loss": 0.023611808195710182, "memory(GiB)": 21.48, "step": 12575, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.952933 }, { "epoch": 0.4085371796121236, "grad_norm": 0.5816140174865723, "learning_rate": 6.878687087450686e-06, "loss": 0.036822713911533356, "memory(GiB)": 21.48, "step": 12576, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.952946 }, { "epoch": 0.408569665074879, "grad_norm": 0.5481429100036621, "learning_rate": 6.878189282928846e-06, "loss": 0.02381882630288601, "memory(GiB)": 21.48, "step": 12577, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.952959 }, { "epoch": 0.40860215053763443, "grad_norm": 0.35140714049339294, "learning_rate": 6.8776914567302014e-06, "loss": 0.026558492332696915, "memory(GiB)": 21.48, "step": 12578, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.952972 }, { "epoch": 0.40863463600038985, "grad_norm": 0.3798210322856903, "learning_rate": 6.877193608860499e-06, "loss": 0.030185652896761894, "memory(GiB)": 21.48, "step": 12579, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.952986 }, { "epoch": 0.40866712146314527, "grad_norm": 0.33045321702957153, "learning_rate": 6.876695739325482e-06, "loss": 0.023856543004512787, "memory(GiB)": 21.48, "step": 12580, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.953 }, { "epoch": 0.4086996069259007, "grad_norm": 0.44351068139076233, "learning_rate": 6.876197848130899e-06, "loss": 0.027668550610542297, "memory(GiB)": 21.48, "step": 12581, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.953015 }, { "epoch": 0.4087320923886561, "grad_norm": 0.39210328459739685, "learning_rate": 6.875699935282496e-06, "loss": 0.025803372263908386, "memory(GiB)": 21.48, "step": 12582, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.953029 }, { "epoch": 0.4087645778514115, "grad_norm": 0.47459933161735535, "learning_rate": 6.87520200078602e-06, "loss": 0.030033450573682785, "memory(GiB)": 21.48, "step": 12583, "token_acc": 0.98, "train_speed(iter/s)": 0.953042 }, { "epoch": 0.40879706331416693, "grad_norm": 0.43503814935684204, "learning_rate": 6.874704044647216e-06, "loss": 0.025948071852326393, "memory(GiB)": 21.48, "step": 12584, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.953054 }, { "epoch": 0.40882954877692235, "grad_norm": 0.372884064912796, "learning_rate": 6.874206066871832e-06, "loss": 0.027332013472914696, "memory(GiB)": 21.48, "step": 12585, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953065 }, { "epoch": 0.40886203423967776, "grad_norm": 0.35782602429389954, "learning_rate": 6.873708067465616e-06, "loss": 0.022204220294952393, "memory(GiB)": 21.48, "step": 12586, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953079 }, { "epoch": 0.4088945197024332, "grad_norm": 0.41753625869750977, "learning_rate": 6.873210046434314e-06, "loss": 0.025160839781165123, "memory(GiB)": 21.48, "step": 12587, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.953093 }, { "epoch": 0.4089270051651886, "grad_norm": 1.2736965417861938, "learning_rate": 6.872712003783677e-06, "loss": 0.032906126230955124, "memory(GiB)": 21.48, "step": 12588, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.953107 }, { "epoch": 0.408959490627944, "grad_norm": 0.38457170128822327, "learning_rate": 6.872213939519449e-06, "loss": 0.028635496273636818, "memory(GiB)": 21.48, "step": 12589, "token_acc": 1.0, "train_speed(iter/s)": 0.953121 }, { "epoch": 0.4089919760906994, "grad_norm": 0.3579843044281006, "learning_rate": 6.87171585364738e-06, "loss": 0.02593369223177433, "memory(GiB)": 21.48, "step": 12590, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953135 }, { "epoch": 0.40902446155345484, "grad_norm": 0.47280046343803406, "learning_rate": 6.87121774617322e-06, "loss": 0.028578532859683037, "memory(GiB)": 21.48, "step": 12591, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.953149 }, { "epoch": 0.40905694701621026, "grad_norm": 0.4546339809894562, "learning_rate": 6.870719617102717e-06, "loss": 0.03485129773616791, "memory(GiB)": 21.48, "step": 12592, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.953164 }, { "epoch": 0.4090894324789657, "grad_norm": 0.3883022964000702, "learning_rate": 6.870221466441618e-06, "loss": 0.027868833392858505, "memory(GiB)": 21.48, "step": 12593, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95318 }, { "epoch": 0.4091219179417211, "grad_norm": 0.4082944989204407, "learning_rate": 6.8697232941956736e-06, "loss": 0.02909005433320999, "memory(GiB)": 21.48, "step": 12594, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953197 }, { "epoch": 0.4091544034044765, "grad_norm": 0.5243545174598694, "learning_rate": 6.869225100370636e-06, "loss": 0.032101914286613464, "memory(GiB)": 21.48, "step": 12595, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.953212 }, { "epoch": 0.4091868888672319, "grad_norm": 0.45000576972961426, "learning_rate": 6.868726884972251e-06, "loss": 0.03337017819285393, "memory(GiB)": 21.48, "step": 12596, "token_acc": 0.9747474747474747, "train_speed(iter/s)": 0.953228 }, { "epoch": 0.40921937432998734, "grad_norm": 0.3035622537136078, "learning_rate": 6.868228648006272e-06, "loss": 0.01852412335574627, "memory(GiB)": 21.48, "step": 12597, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953245 }, { "epoch": 0.40925185979274276, "grad_norm": 0.376496285200119, "learning_rate": 6.867730389478446e-06, "loss": 0.023816432803869247, "memory(GiB)": 21.48, "step": 12598, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95326 }, { "epoch": 0.40928434525549817, "grad_norm": 0.5168724656105042, "learning_rate": 6.867232109394527e-06, "loss": 0.03176383674144745, "memory(GiB)": 21.48, "step": 12599, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953275 }, { "epoch": 0.4093168307182536, "grad_norm": 0.40907078981399536, "learning_rate": 6.866733807760264e-06, "loss": 0.036586564034223557, "memory(GiB)": 21.48, "step": 12600, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95329 }, { "epoch": 0.409349316181009, "grad_norm": 0.3536503314971924, "learning_rate": 6.866235484581408e-06, "loss": 0.02523605152964592, "memory(GiB)": 21.48, "step": 12601, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953306 }, { "epoch": 0.4093818016437644, "grad_norm": 0.27951768040657043, "learning_rate": 6.8657371398637105e-06, "loss": 0.02272622287273407, "memory(GiB)": 21.48, "step": 12602, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.953321 }, { "epoch": 0.40941428710651984, "grad_norm": 0.3687175512313843, "learning_rate": 6.865238773612922e-06, "loss": 0.026156673207879066, "memory(GiB)": 21.48, "step": 12603, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.953338 }, { "epoch": 0.40944677256927525, "grad_norm": 0.4292220175266266, "learning_rate": 6.8647403858347975e-06, "loss": 0.030392782762646675, "memory(GiB)": 21.48, "step": 12604, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.953355 }, { "epoch": 0.40947925803203067, "grad_norm": 0.2734363377094269, "learning_rate": 6.864241976535085e-06, "loss": 0.020770393311977386, "memory(GiB)": 21.48, "step": 12605, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.953372 }, { "epoch": 0.4095117434947861, "grad_norm": 0.3316398561000824, "learning_rate": 6.863743545719541e-06, "loss": 0.024567674845457077, "memory(GiB)": 21.48, "step": 12606, "token_acc": 1.0, "train_speed(iter/s)": 0.953388 }, { "epoch": 0.4095442289575415, "grad_norm": 0.2241629809141159, "learning_rate": 6.863245093393913e-06, "loss": 0.017616327852010727, "memory(GiB)": 21.48, "step": 12607, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953405 }, { "epoch": 0.4095767144202969, "grad_norm": 0.3738003671169281, "learning_rate": 6.862746619563959e-06, "loss": 0.021938912570476532, "memory(GiB)": 21.48, "step": 12608, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953421 }, { "epoch": 0.40960919988305233, "grad_norm": 0.3726353347301483, "learning_rate": 6.862248124235429e-06, "loss": 0.023705847561359406, "memory(GiB)": 21.48, "step": 12609, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.953437 }, { "epoch": 0.40964168534580775, "grad_norm": 0.338672012090683, "learning_rate": 6.861749607414076e-06, "loss": 0.02009686827659607, "memory(GiB)": 21.48, "step": 12610, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953454 }, { "epoch": 0.40967417080856317, "grad_norm": 0.32448527216911316, "learning_rate": 6.8612510691056565e-06, "loss": 0.02169281244277954, "memory(GiB)": 21.48, "step": 12611, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.953471 }, { "epoch": 0.4097066562713186, "grad_norm": 0.34417134523391724, "learning_rate": 6.86075250931592e-06, "loss": 0.02074952982366085, "memory(GiB)": 21.48, "step": 12612, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953487 }, { "epoch": 0.409739141734074, "grad_norm": 0.4503189027309418, "learning_rate": 6.860253928050625e-06, "loss": 0.025920569896697998, "memory(GiB)": 21.48, "step": 12613, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953501 }, { "epoch": 0.4097716271968294, "grad_norm": 0.31822535395622253, "learning_rate": 6.859755325315524e-06, "loss": 0.019912663847208023, "memory(GiB)": 21.48, "step": 12614, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.953513 }, { "epoch": 0.40980411265958483, "grad_norm": 0.377607524394989, "learning_rate": 6.859256701116369e-06, "loss": 0.02657947689294815, "memory(GiB)": 21.48, "step": 12615, "token_acc": 0.992, "train_speed(iter/s)": 0.953525 }, { "epoch": 0.40983659812234025, "grad_norm": 0.35393205285072327, "learning_rate": 6.8587580554589185e-06, "loss": 0.022093819454312325, "memory(GiB)": 21.48, "step": 12616, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953537 }, { "epoch": 0.40986908358509566, "grad_norm": 0.48024803400039673, "learning_rate": 6.858259388348926e-06, "loss": 0.02543976716697216, "memory(GiB)": 21.48, "step": 12617, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95355 }, { "epoch": 0.4099015690478511, "grad_norm": 0.49510428309440613, "learning_rate": 6.8577606997921465e-06, "loss": 0.021678876131772995, "memory(GiB)": 21.48, "step": 12618, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.953563 }, { "epoch": 0.4099340545106065, "grad_norm": 0.3672857880592346, "learning_rate": 6.857261989794335e-06, "loss": 0.025719141587615013, "memory(GiB)": 21.48, "step": 12619, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.953575 }, { "epoch": 0.4099665399733619, "grad_norm": 0.40197229385375977, "learning_rate": 6.85676325836125e-06, "loss": 0.0243307426571846, "memory(GiB)": 21.48, "step": 12620, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953587 }, { "epoch": 0.4099990254361173, "grad_norm": 0.30611497163772583, "learning_rate": 6.856264505498644e-06, "loss": 0.017564117908477783, "memory(GiB)": 21.48, "step": 12621, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.9536 }, { "epoch": 0.41003151089887274, "grad_norm": 0.5897549390792847, "learning_rate": 6.855765731212276e-06, "loss": 0.022212523967027664, "memory(GiB)": 21.48, "step": 12622, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953613 }, { "epoch": 0.41006399636162816, "grad_norm": 0.516876757144928, "learning_rate": 6.8552669355079015e-06, "loss": 0.02573554217815399, "memory(GiB)": 21.48, "step": 12623, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.953626 }, { "epoch": 0.4100964818243836, "grad_norm": 0.3826930522918701, "learning_rate": 6.854768118391275e-06, "loss": 0.027103375643491745, "memory(GiB)": 21.48, "step": 12624, "token_acc": 1.0, "train_speed(iter/s)": 0.953638 }, { "epoch": 0.410128967287139, "grad_norm": 0.5495727062225342, "learning_rate": 6.854269279868158e-06, "loss": 0.029896177351474762, "memory(GiB)": 21.48, "step": 12625, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.95365 }, { "epoch": 0.4101614527498944, "grad_norm": 0.3357589840888977, "learning_rate": 6.853770419944304e-06, "loss": 0.02030474692583084, "memory(GiB)": 21.48, "step": 12626, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.953662 }, { "epoch": 0.4101939382126498, "grad_norm": 0.35915830731391907, "learning_rate": 6.853271538625474e-06, "loss": 0.024569671601057053, "memory(GiB)": 21.48, "step": 12627, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953675 }, { "epoch": 0.41022642367540524, "grad_norm": 0.37859201431274414, "learning_rate": 6.852772635917423e-06, "loss": 0.031396668404340744, "memory(GiB)": 21.48, "step": 12628, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.953688 }, { "epoch": 0.41025890913816065, "grad_norm": 0.3665357530117035, "learning_rate": 6.8522737118259084e-06, "loss": 0.02119060792028904, "memory(GiB)": 21.48, "step": 12629, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.953701 }, { "epoch": 0.41029139460091607, "grad_norm": 0.40368762612342834, "learning_rate": 6.851774766356691e-06, "loss": 0.02403521165251732, "memory(GiB)": 21.48, "step": 12630, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.953713 }, { "epoch": 0.4103238800636715, "grad_norm": 0.3011608123779297, "learning_rate": 6.851275799515528e-06, "loss": 0.01516571082174778, "memory(GiB)": 21.48, "step": 12631, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.953725 }, { "epoch": 0.4103563655264269, "grad_norm": 0.29995930194854736, "learning_rate": 6.850776811308179e-06, "loss": 0.015127791091799736, "memory(GiB)": 21.48, "step": 12632, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.953736 }, { "epoch": 0.4103888509891823, "grad_norm": 0.4508040249347687, "learning_rate": 6.8502778017404005e-06, "loss": 0.03111989051103592, "memory(GiB)": 21.48, "step": 12633, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.953746 }, { "epoch": 0.41042133645193773, "grad_norm": 0.35462701320648193, "learning_rate": 6.849778770817954e-06, "loss": 0.02290978655219078, "memory(GiB)": 21.48, "step": 12634, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953757 }, { "epoch": 0.41045382191469315, "grad_norm": 0.39246538281440735, "learning_rate": 6.849279718546599e-06, "loss": 0.02679331600666046, "memory(GiB)": 21.48, "step": 12635, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953769 }, { "epoch": 0.41048630737744857, "grad_norm": 0.2778951823711395, "learning_rate": 6.848780644932094e-06, "loss": 0.021884657442569733, "memory(GiB)": 21.48, "step": 12636, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953782 }, { "epoch": 0.410518792840204, "grad_norm": 0.5086390376091003, "learning_rate": 6.8482815499802e-06, "loss": 0.03242575004696846, "memory(GiB)": 21.48, "step": 12637, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.953793 }, { "epoch": 0.4105512783029594, "grad_norm": 0.4279949963092804, "learning_rate": 6.847782433696677e-06, "loss": 0.01708459109067917, "memory(GiB)": 21.48, "step": 12638, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953805 }, { "epoch": 0.4105837637657148, "grad_norm": 0.49384263157844543, "learning_rate": 6.847283296087286e-06, "loss": 0.02729225531220436, "memory(GiB)": 21.48, "step": 12639, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953818 }, { "epoch": 0.4106162492284703, "grad_norm": 0.406363308429718, "learning_rate": 6.846784137157785e-06, "loss": 0.023009195923805237, "memory(GiB)": 21.48, "step": 12640, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953829 }, { "epoch": 0.4106487346912257, "grad_norm": 0.3374292254447937, "learning_rate": 6.846284956913938e-06, "loss": 0.01997619867324829, "memory(GiB)": 21.48, "step": 12641, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953841 }, { "epoch": 0.4106812201539811, "grad_norm": 0.4653995633125305, "learning_rate": 6.845785755361506e-06, "loss": 0.03156533092260361, "memory(GiB)": 21.48, "step": 12642, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953853 }, { "epoch": 0.41071370561673654, "grad_norm": 0.5108234286308289, "learning_rate": 6.845286532506248e-06, "loss": 0.03501749783754349, "memory(GiB)": 21.48, "step": 12643, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953866 }, { "epoch": 0.41074619107949195, "grad_norm": 0.49593859910964966, "learning_rate": 6.844787288353929e-06, "loss": 0.034142524003982544, "memory(GiB)": 21.48, "step": 12644, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.953879 }, { "epoch": 0.41077867654224737, "grad_norm": 0.3898411989212036, "learning_rate": 6.844288022910308e-06, "loss": 0.029014524072408676, "memory(GiB)": 21.48, "step": 12645, "token_acc": 1.0, "train_speed(iter/s)": 0.953892 }, { "epoch": 0.4108111620050028, "grad_norm": 0.4218144416809082, "learning_rate": 6.843788736181149e-06, "loss": 0.029413601383566856, "memory(GiB)": 21.48, "step": 12646, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.953905 }, { "epoch": 0.4108436474677582, "grad_norm": 0.3856370151042938, "learning_rate": 6.843289428172215e-06, "loss": 0.025577832013368607, "memory(GiB)": 21.48, "step": 12647, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.953917 }, { "epoch": 0.4108761329305136, "grad_norm": 0.3088783025741577, "learning_rate": 6.842790098889267e-06, "loss": 0.02004048228263855, "memory(GiB)": 21.48, "step": 12648, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953929 }, { "epoch": 0.41090861839326903, "grad_norm": 0.6952904462814331, "learning_rate": 6.842290748338067e-06, "loss": 0.03675629198551178, "memory(GiB)": 21.48, "step": 12649, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.95394 }, { "epoch": 0.41094110385602445, "grad_norm": 0.38068127632141113, "learning_rate": 6.8417913765243815e-06, "loss": 0.025661706924438477, "memory(GiB)": 21.48, "step": 12650, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.953953 }, { "epoch": 0.41097358931877986, "grad_norm": 0.3383786976337433, "learning_rate": 6.841291983453973e-06, "loss": 0.02681996487081051, "memory(GiB)": 21.48, "step": 12651, "token_acc": 1.0, "train_speed(iter/s)": 0.953964 }, { "epoch": 0.4110060747815353, "grad_norm": 0.3469776511192322, "learning_rate": 6.8407925691326016e-06, "loss": 0.030007854104042053, "memory(GiB)": 21.48, "step": 12652, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.953976 }, { "epoch": 0.4110385602442907, "grad_norm": 0.4555474817752838, "learning_rate": 6.840293133566035e-06, "loss": 0.021737370640039444, "memory(GiB)": 21.48, "step": 12653, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.953989 }, { "epoch": 0.4110710457070461, "grad_norm": 0.33638861775398254, "learning_rate": 6.8397936767600355e-06, "loss": 0.021585330367088318, "memory(GiB)": 21.48, "step": 12654, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954002 }, { "epoch": 0.41110353116980153, "grad_norm": 0.42022061347961426, "learning_rate": 6.839294198720369e-06, "loss": 0.02345334179699421, "memory(GiB)": 21.48, "step": 12655, "token_acc": 1.0, "train_speed(iter/s)": 0.954019 }, { "epoch": 0.41113601663255694, "grad_norm": 0.4214019775390625, "learning_rate": 6.8387946994528e-06, "loss": 0.01747995801270008, "memory(GiB)": 21.48, "step": 12656, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.954035 }, { "epoch": 0.41116850209531236, "grad_norm": 0.3527398407459259, "learning_rate": 6.83829517896309e-06, "loss": 0.026319101452827454, "memory(GiB)": 21.48, "step": 12657, "token_acc": 1.0, "train_speed(iter/s)": 0.954051 }, { "epoch": 0.4112009875580678, "grad_norm": 0.45561134815216064, "learning_rate": 6.8377956372570094e-06, "loss": 0.028014708310365677, "memory(GiB)": 21.48, "step": 12658, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.4112334730208232, "grad_norm": 0.4391842186450958, "learning_rate": 6.837296074340321e-06, "loss": 0.02117897756397724, "memory(GiB)": 21.48, "step": 12659, "token_acc": 1.0, "train_speed(iter/s)": 0.954083 }, { "epoch": 0.4112659584835786, "grad_norm": 0.48585784435272217, "learning_rate": 6.83679649021879e-06, "loss": 0.03016357682645321, "memory(GiB)": 21.48, "step": 12660, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.954098 }, { "epoch": 0.411298443946334, "grad_norm": 0.3187272250652313, "learning_rate": 6.836296884898181e-06, "loss": 0.02389792539179325, "memory(GiB)": 21.48, "step": 12661, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954114 }, { "epoch": 0.41133092940908944, "grad_norm": 0.3531225621700287, "learning_rate": 6.835797258384265e-06, "loss": 0.021613694727420807, "memory(GiB)": 21.48, "step": 12662, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.95413 }, { "epoch": 0.41136341487184486, "grad_norm": 0.5630180239677429, "learning_rate": 6.835297610682803e-06, "loss": 0.033446669578552246, "memory(GiB)": 21.48, "step": 12663, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954146 }, { "epoch": 0.4113959003346003, "grad_norm": 0.6847231388092041, "learning_rate": 6.834797941799565e-06, "loss": 0.0413549542427063, "memory(GiB)": 21.48, "step": 12664, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.954162 }, { "epoch": 0.4114283857973557, "grad_norm": 0.5975587368011475, "learning_rate": 6.834298251740317e-06, "loss": 0.031860873103141785, "memory(GiB)": 21.48, "step": 12665, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954179 }, { "epoch": 0.4114608712601111, "grad_norm": 0.515470027923584, "learning_rate": 6.833798540510824e-06, "loss": 0.025910893455147743, "memory(GiB)": 21.48, "step": 12666, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954194 }, { "epoch": 0.4114933567228665, "grad_norm": 0.3602025508880615, "learning_rate": 6.833298808116857e-06, "loss": 0.02563808672130108, "memory(GiB)": 21.48, "step": 12667, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954211 }, { "epoch": 0.41152584218562194, "grad_norm": 0.5273259878158569, "learning_rate": 6.83279905456418e-06, "loss": 0.03163588047027588, "memory(GiB)": 21.48, "step": 12668, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.954227 }, { "epoch": 0.41155832764837735, "grad_norm": 0.33339008688926697, "learning_rate": 6.832299279858563e-06, "loss": 0.02421596273779869, "memory(GiB)": 21.48, "step": 12669, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.41159081311113277, "grad_norm": 0.46004873514175415, "learning_rate": 6.831799484005773e-06, "loss": 0.028136616572737694, "memory(GiB)": 21.48, "step": 12670, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954258 }, { "epoch": 0.4116232985738882, "grad_norm": 0.3054220974445343, "learning_rate": 6.83129966701158e-06, "loss": 0.018507760018110275, "memory(GiB)": 21.48, "step": 12671, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.954273 }, { "epoch": 0.4116557840366436, "grad_norm": 0.43279609084129333, "learning_rate": 6.830799828881751e-06, "loss": 0.029579242691397667, "memory(GiB)": 21.48, "step": 12672, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.954289 }, { "epoch": 0.411688269499399, "grad_norm": 0.5498020052909851, "learning_rate": 6.830299969622054e-06, "loss": 0.02806282415986061, "memory(GiB)": 21.48, "step": 12673, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954304 }, { "epoch": 0.41172075496215443, "grad_norm": 0.3686963617801666, "learning_rate": 6.829800089238261e-06, "loss": 0.025031642988324165, "memory(GiB)": 21.48, "step": 12674, "token_acc": 1.0, "train_speed(iter/s)": 0.954317 }, { "epoch": 0.41175324042490985, "grad_norm": 0.42557215690612793, "learning_rate": 6.829300187736138e-06, "loss": 0.024665728211402893, "memory(GiB)": 21.48, "step": 12675, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95433 }, { "epoch": 0.41178572588766527, "grad_norm": 0.38279828429222107, "learning_rate": 6.828800265121457e-06, "loss": 0.027724990621209145, "memory(GiB)": 21.48, "step": 12676, "token_acc": 0.9756944444444444, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.4118182113504207, "grad_norm": 0.4290882349014282, "learning_rate": 6.828300321399985e-06, "loss": 0.02798493579030037, "memory(GiB)": 21.48, "step": 12677, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954354 }, { "epoch": 0.4118506968131761, "grad_norm": 0.3544834554195404, "learning_rate": 6.827800356577496e-06, "loss": 0.023435762152075768, "memory(GiB)": 21.48, "step": 12678, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954366 }, { "epoch": 0.4118831822759315, "grad_norm": 0.30789774656295776, "learning_rate": 6.82730037065976e-06, "loss": 0.017357777804136276, "memory(GiB)": 21.48, "step": 12679, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.954379 }, { "epoch": 0.41191566773868693, "grad_norm": 0.3758745491504669, "learning_rate": 6.826800363652542e-06, "loss": 0.026349011808633804, "memory(GiB)": 21.48, "step": 12680, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95439 }, { "epoch": 0.41194815320144235, "grad_norm": 0.4356979429721832, "learning_rate": 6.826300335561618e-06, "loss": 0.025822678580880165, "memory(GiB)": 21.48, "step": 12681, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954403 }, { "epoch": 0.41198063866419776, "grad_norm": 0.4444079101085663, "learning_rate": 6.825800286392755e-06, "loss": 0.022591616958379745, "memory(GiB)": 21.48, "step": 12682, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954413 }, { "epoch": 0.4120131241269532, "grad_norm": 0.3264819383621216, "learning_rate": 6.825300216151729e-06, "loss": 0.03070923686027527, "memory(GiB)": 21.48, "step": 12683, "token_acc": 1.0, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.4120456095897086, "grad_norm": 0.47488933801651, "learning_rate": 6.824800124844308e-06, "loss": 0.026234891265630722, "memory(GiB)": 21.48, "step": 12684, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.954436 }, { "epoch": 0.412078095052464, "grad_norm": 0.542023241519928, "learning_rate": 6.824300012476266e-06, "loss": 0.026320550590753555, "memory(GiB)": 21.48, "step": 12685, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.4121105805152194, "grad_norm": 0.31745365262031555, "learning_rate": 6.823799879053372e-06, "loss": 0.03073020651936531, "memory(GiB)": 21.48, "step": 12686, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954457 }, { "epoch": 0.41214306597797484, "grad_norm": 0.39563611149787903, "learning_rate": 6.823299724581401e-06, "loss": 0.03235336020588875, "memory(GiB)": 21.48, "step": 12687, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95447 }, { "epoch": 0.41217555144073026, "grad_norm": 0.27919715642929077, "learning_rate": 6.822799549066124e-06, "loss": 0.020417647436261177, "memory(GiB)": 21.48, "step": 12688, "token_acc": 1.0, "train_speed(iter/s)": 0.954483 }, { "epoch": 0.4122080369034857, "grad_norm": 0.2574913501739502, "learning_rate": 6.822299352513313e-06, "loss": 0.015369201079010963, "memory(GiB)": 21.48, "step": 12689, "token_acc": 1.0, "train_speed(iter/s)": 0.954493 }, { "epoch": 0.4122405223662411, "grad_norm": 0.37153056263923645, "learning_rate": 6.821799134928744e-06, "loss": 0.029687630012631416, "memory(GiB)": 21.48, "step": 12690, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.954506 }, { "epoch": 0.4122730078289965, "grad_norm": 0.4128410816192627, "learning_rate": 6.821298896318187e-06, "loss": 0.02538606896996498, "memory(GiB)": 21.48, "step": 12691, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954519 }, { "epoch": 0.4123054932917519, "grad_norm": 0.43195006251335144, "learning_rate": 6.820798636687418e-06, "loss": 0.023402832448482513, "memory(GiB)": 21.48, "step": 12692, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.95453 }, { "epoch": 0.41233797875450734, "grad_norm": 0.24511617422103882, "learning_rate": 6.820298356042208e-06, "loss": 0.020079627633094788, "memory(GiB)": 21.48, "step": 12693, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954542 }, { "epoch": 0.41237046421726276, "grad_norm": 0.2863159477710724, "learning_rate": 6.819798054388331e-06, "loss": 0.018044356256723404, "memory(GiB)": 21.48, "step": 12694, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954554 }, { "epoch": 0.41240294968001817, "grad_norm": 0.28814634680747986, "learning_rate": 6.819297731731564e-06, "loss": 0.01639021746814251, "memory(GiB)": 21.48, "step": 12695, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954568 }, { "epoch": 0.4124354351427736, "grad_norm": 0.3356078863143921, "learning_rate": 6.818797388077681e-06, "loss": 0.019227217882871628, "memory(GiB)": 21.48, "step": 12696, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95458 }, { "epoch": 0.412467920605529, "grad_norm": 0.40757426619529724, "learning_rate": 6.818297023432453e-06, "loss": 0.028235379606485367, "memory(GiB)": 21.48, "step": 12697, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.954593 }, { "epoch": 0.4125004060682844, "grad_norm": 0.6471906900405884, "learning_rate": 6.817796637801657e-06, "loss": 0.020958947017788887, "memory(GiB)": 21.48, "step": 12698, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.954606 }, { "epoch": 0.41253289153103984, "grad_norm": 0.3132939338684082, "learning_rate": 6.817296231191071e-06, "loss": 0.02456684783101082, "memory(GiB)": 21.48, "step": 12699, "token_acc": 1.0, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.41256537699379525, "grad_norm": 0.3982532024383545, "learning_rate": 6.816795803606464e-06, "loss": 0.021985553205013275, "memory(GiB)": 21.48, "step": 12700, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954631 }, { "epoch": 0.41259786245655067, "grad_norm": 0.48545747995376587, "learning_rate": 6.8162953550536185e-06, "loss": 0.029705777764320374, "memory(GiB)": 21.48, "step": 12701, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.954644 }, { "epoch": 0.4126303479193061, "grad_norm": 0.46050405502319336, "learning_rate": 6.815794885538305e-06, "loss": 0.031663667410612106, "memory(GiB)": 21.48, "step": 12702, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954655 }, { "epoch": 0.4126628333820615, "grad_norm": 0.6290549635887146, "learning_rate": 6.815294395066302e-06, "loss": 0.02192136086523533, "memory(GiB)": 21.48, "step": 12703, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954667 }, { "epoch": 0.41269531884481697, "grad_norm": 0.33863112330436707, "learning_rate": 6.814793883643387e-06, "loss": 0.018005486577749252, "memory(GiB)": 21.48, "step": 12704, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.95468 }, { "epoch": 0.4127278043075724, "grad_norm": 2.179760456085205, "learning_rate": 6.8142933512753326e-06, "loss": 0.033713772892951965, "memory(GiB)": 21.48, "step": 12705, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.4127602897703278, "grad_norm": 0.42224013805389404, "learning_rate": 6.8137927979679205e-06, "loss": 0.025350650772452354, "memory(GiB)": 21.48, "step": 12706, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.954705 }, { "epoch": 0.4127927752330832, "grad_norm": 0.32288527488708496, "learning_rate": 6.813292223726925e-06, "loss": 0.0198298878967762, "memory(GiB)": 21.48, "step": 12707, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954718 }, { "epoch": 0.41282526069583864, "grad_norm": 0.3408307731151581, "learning_rate": 6.812791628558122e-06, "loss": 0.016667846590280533, "memory(GiB)": 21.48, "step": 12708, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.954731 }, { "epoch": 0.41285774615859405, "grad_norm": 0.3823012709617615, "learning_rate": 6.812291012467294e-06, "loss": 0.029407326132059097, "memory(GiB)": 21.48, "step": 12709, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.954743 }, { "epoch": 0.41289023162134947, "grad_norm": 0.40058964490890503, "learning_rate": 6.8117903754602126e-06, "loss": 0.02395160123705864, "memory(GiB)": 21.48, "step": 12710, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954755 }, { "epoch": 0.4129227170841049, "grad_norm": 0.4553889036178589, "learning_rate": 6.8112897175426595e-06, "loss": 0.022863788530230522, "memory(GiB)": 21.48, "step": 12711, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954765 }, { "epoch": 0.4129552025468603, "grad_norm": 0.6502983570098877, "learning_rate": 6.810789038720412e-06, "loss": 0.03720981255173683, "memory(GiB)": 21.48, "step": 12712, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954777 }, { "epoch": 0.4129876880096157, "grad_norm": 0.3414027988910675, "learning_rate": 6.81028833899925e-06, "loss": 0.01981954649090767, "memory(GiB)": 21.48, "step": 12713, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95479 }, { "epoch": 0.41302017347237113, "grad_norm": 0.37976107001304626, "learning_rate": 6.8097876183849496e-06, "loss": 0.02188543602824211, "memory(GiB)": 21.48, "step": 12714, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.954802 }, { "epoch": 0.41305265893512655, "grad_norm": 0.44298604130744934, "learning_rate": 6.809286876883293e-06, "loss": 0.030219666659832, "memory(GiB)": 21.48, "step": 12715, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.954815 }, { "epoch": 0.41308514439788196, "grad_norm": 0.39293381571769714, "learning_rate": 6.808786114500057e-06, "loss": 0.028987957164645195, "memory(GiB)": 21.48, "step": 12716, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.4131176298606374, "grad_norm": 0.437549352645874, "learning_rate": 6.808285331241021e-06, "loss": 0.023757681250572205, "memory(GiB)": 21.48, "step": 12717, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.954846 }, { "epoch": 0.4131501153233928, "grad_norm": 0.39943358302116394, "learning_rate": 6.807784527111966e-06, "loss": 0.028588097542524338, "memory(GiB)": 21.48, "step": 12718, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954862 }, { "epoch": 0.4131826007861482, "grad_norm": 0.3276243805885315, "learning_rate": 6.80728370211867e-06, "loss": 0.02071845903992653, "memory(GiB)": 21.48, "step": 12719, "token_acc": 1.0, "train_speed(iter/s)": 0.954878 }, { "epoch": 0.41321508624890363, "grad_norm": 0.3755037486553192, "learning_rate": 6.806782856266917e-06, "loss": 0.02707984670996666, "memory(GiB)": 21.48, "step": 12720, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.954894 }, { "epoch": 0.41324757171165905, "grad_norm": 0.3238063156604767, "learning_rate": 6.806281989562484e-06, "loss": 0.023816652595996857, "memory(GiB)": 21.48, "step": 12721, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95491 }, { "epoch": 0.41328005717441446, "grad_norm": 0.3344348073005676, "learning_rate": 6.805781102011152e-06, "loss": 0.026683727279305458, "memory(GiB)": 21.48, "step": 12722, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954926 }, { "epoch": 0.4133125426371699, "grad_norm": 0.39723554253578186, "learning_rate": 6.805280193618704e-06, "loss": 0.025110483169555664, "memory(GiB)": 21.48, "step": 12723, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.4133450280999253, "grad_norm": 1.209377646446228, "learning_rate": 6.804779264390918e-06, "loss": 0.03701664134860039, "memory(GiB)": 21.48, "step": 12724, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.4133775135626807, "grad_norm": 0.3728778660297394, "learning_rate": 6.804278314333579e-06, "loss": 0.033295273780822754, "memory(GiB)": 21.48, "step": 12725, "token_acc": 0.984375, "train_speed(iter/s)": 0.954974 }, { "epoch": 0.4134099990254361, "grad_norm": 0.46057209372520447, "learning_rate": 6.803777343452466e-06, "loss": 0.01760121062397957, "memory(GiB)": 21.48, "step": 12726, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954989 }, { "epoch": 0.41344248448819154, "grad_norm": 0.26461347937583923, "learning_rate": 6.803276351753362e-06, "loss": 0.01923864707350731, "memory(GiB)": 21.48, "step": 12727, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.41347496995094696, "grad_norm": 0.44309982657432556, "learning_rate": 6.802775339242047e-06, "loss": 0.019016630947589874, "memory(GiB)": 21.48, "step": 12728, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955022 }, { "epoch": 0.4135074554137024, "grad_norm": 0.32262173295021057, "learning_rate": 6.8022743059243064e-06, "loss": 0.026901692152023315, "memory(GiB)": 21.48, "step": 12729, "token_acc": 0.972, "train_speed(iter/s)": 0.955038 }, { "epoch": 0.4135399408764578, "grad_norm": 0.29458361864089966, "learning_rate": 6.801773251805923e-06, "loss": 0.017562907189130783, "memory(GiB)": 21.48, "step": 12730, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.955054 }, { "epoch": 0.4135724263392132, "grad_norm": 0.40547049045562744, "learning_rate": 6.801272176892676e-06, "loss": 0.021690823137760162, "memory(GiB)": 21.48, "step": 12731, "token_acc": 1.0, "train_speed(iter/s)": 0.955069 }, { "epoch": 0.4136049118019686, "grad_norm": 0.4088034927845001, "learning_rate": 6.800771081190352e-06, "loss": 0.026991959661245346, "memory(GiB)": 21.48, "step": 12732, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955086 }, { "epoch": 0.41363739726472404, "grad_norm": 0.29395878314971924, "learning_rate": 6.8002699647047295e-06, "loss": 0.02093624696135521, "memory(GiB)": 21.48, "step": 12733, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955099 }, { "epoch": 0.41366988272747945, "grad_norm": 0.5005411505699158, "learning_rate": 6.799768827441598e-06, "loss": 0.02813822031021118, "memory(GiB)": 21.48, "step": 12734, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.41370236819023487, "grad_norm": 0.8354775905609131, "learning_rate": 6.799267669406739e-06, "loss": 0.031270142644643784, "memory(GiB)": 21.48, "step": 12735, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955124 }, { "epoch": 0.4137348536529903, "grad_norm": 0.47096285223960876, "learning_rate": 6.798766490605937e-06, "loss": 0.03662529215216637, "memory(GiB)": 21.48, "step": 12736, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.955137 }, { "epoch": 0.4137673391157457, "grad_norm": 0.36610642075538635, "learning_rate": 6.798265291044974e-06, "loss": 0.021866925060749054, "memory(GiB)": 21.48, "step": 12737, "token_acc": 0.996, "train_speed(iter/s)": 0.955148 }, { "epoch": 0.4137998245785011, "grad_norm": 0.35781481862068176, "learning_rate": 6.7977640707296366e-06, "loss": 0.02047424390912056, "memory(GiB)": 21.48, "step": 12738, "token_acc": 1.0, "train_speed(iter/s)": 0.955159 }, { "epoch": 0.41383231004125653, "grad_norm": 0.4529794454574585, "learning_rate": 6.797262829665709e-06, "loss": 0.03492162376642227, "memory(GiB)": 21.48, "step": 12739, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955172 }, { "epoch": 0.41386479550401195, "grad_norm": 0.48357757925987244, "learning_rate": 6.796761567858976e-06, "loss": 0.026871122419834137, "memory(GiB)": 21.48, "step": 12740, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.41389728096676737, "grad_norm": 0.3767905831336975, "learning_rate": 6.796260285315224e-06, "loss": 0.02433077245950699, "memory(GiB)": 21.48, "step": 12741, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955194 }, { "epoch": 0.4139297664295228, "grad_norm": 0.39322295784950256, "learning_rate": 6.795758982040237e-06, "loss": 0.027985189110040665, "memory(GiB)": 21.48, "step": 12742, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.4139622518922782, "grad_norm": 0.33021682500839233, "learning_rate": 6.795257658039801e-06, "loss": 0.020556963980197906, "memory(GiB)": 21.48, "step": 12743, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.955218 }, { "epoch": 0.4139947373550336, "grad_norm": 0.3025646507740021, "learning_rate": 6.794756313319702e-06, "loss": 0.021034372970461845, "memory(GiB)": 21.48, "step": 12744, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95523 }, { "epoch": 0.41402722281778903, "grad_norm": 0.30992281436920166, "learning_rate": 6.794254947885726e-06, "loss": 0.018328599631786346, "memory(GiB)": 21.48, "step": 12745, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955241 }, { "epoch": 0.41405970828054445, "grad_norm": 0.29384705424308777, "learning_rate": 6.7937535617436615e-06, "loss": 0.029346715658903122, "memory(GiB)": 21.48, "step": 12746, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955252 }, { "epoch": 0.41409219374329986, "grad_norm": 0.4942488670349121, "learning_rate": 6.79325215489929e-06, "loss": 0.028308387845754623, "memory(GiB)": 21.48, "step": 12747, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.4141246792060553, "grad_norm": 0.4629698693752289, "learning_rate": 6.7927507273584034e-06, "loss": 0.03145166486501694, "memory(GiB)": 21.48, "step": 12748, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.955277 }, { "epoch": 0.4141571646688107, "grad_norm": 0.4746515154838562, "learning_rate": 6.7922492791267895e-06, "loss": 0.022506816312670708, "memory(GiB)": 21.48, "step": 12749, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.955289 }, { "epoch": 0.4141896501315661, "grad_norm": 0.3183838129043579, "learning_rate": 6.791747810210231e-06, "loss": 0.022036418318748474, "memory(GiB)": 21.48, "step": 12750, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.4142221355943215, "grad_norm": 0.4664495885372162, "learning_rate": 6.79124632061452e-06, "loss": 0.03625521808862686, "memory(GiB)": 21.48, "step": 12751, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.41425462105707694, "grad_norm": 0.2986292839050293, "learning_rate": 6.79074481034544e-06, "loss": 0.022372422739863396, "memory(GiB)": 21.48, "step": 12752, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.41428710651983236, "grad_norm": 0.24791480600833893, "learning_rate": 6.790243279408782e-06, "loss": 0.020039523020386696, "memory(GiB)": 21.48, "step": 12753, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955342 }, { "epoch": 0.4143195919825878, "grad_norm": 0.42005395889282227, "learning_rate": 6.789741727810334e-06, "loss": 0.025060251355171204, "memory(GiB)": 21.48, "step": 12754, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.4143520774453432, "grad_norm": 0.43083909153938293, "learning_rate": 6.789240155555885e-06, "loss": 0.018505264073610306, "memory(GiB)": 21.48, "step": 12755, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955366 }, { "epoch": 0.4143845629080986, "grad_norm": 0.3975731432437897, "learning_rate": 6.7887385626512225e-06, "loss": 0.032411906868219376, "memory(GiB)": 21.48, "step": 12756, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.414417048370854, "grad_norm": 0.5068339109420776, "learning_rate": 6.788236949102136e-06, "loss": 0.028927650302648544, "memory(GiB)": 21.48, "step": 12757, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.41444953383360944, "grad_norm": 0.3924030363559723, "learning_rate": 6.787735314914416e-06, "loss": 0.02760452963411808, "memory(GiB)": 21.48, "step": 12758, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.955403 }, { "epoch": 0.41448201929636486, "grad_norm": 0.4395853281021118, "learning_rate": 6.787233660093849e-06, "loss": 0.024590561166405678, "memory(GiB)": 21.48, "step": 12759, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.955416 }, { "epoch": 0.4145145047591203, "grad_norm": 0.4504311978816986, "learning_rate": 6.786731984646229e-06, "loss": 0.020840266719460487, "memory(GiB)": 21.48, "step": 12760, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955426 }, { "epoch": 0.4145469902218757, "grad_norm": 0.4176081418991089, "learning_rate": 6.786230288577342e-06, "loss": 0.02079939655959606, "memory(GiB)": 21.48, "step": 12761, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955439 }, { "epoch": 0.4145794756846311, "grad_norm": 0.3184390664100647, "learning_rate": 6.7857285718929825e-06, "loss": 0.021652713418006897, "memory(GiB)": 21.48, "step": 12762, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.4146119611473865, "grad_norm": 0.40634575486183167, "learning_rate": 6.785226834598935e-06, "loss": 0.03127095475792885, "memory(GiB)": 21.48, "step": 12763, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.41464444661014194, "grad_norm": 0.5852968096733093, "learning_rate": 6.7847250767009955e-06, "loss": 0.02924462966620922, "memory(GiB)": 21.48, "step": 12764, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.41467693207289735, "grad_norm": 0.39338549971580505, "learning_rate": 6.784223298204953e-06, "loss": 0.021356966346502304, "memory(GiB)": 21.48, "step": 12765, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.41470941753565277, "grad_norm": 0.4237165153026581, "learning_rate": 6.783721499116599e-06, "loss": 0.029342129826545715, "memory(GiB)": 21.48, "step": 12766, "token_acc": 0.9707112970711297, "train_speed(iter/s)": 0.955501 }, { "epoch": 0.4147419029984082, "grad_norm": 0.4874185621738434, "learning_rate": 6.783219679441725e-06, "loss": 0.03050251118838787, "memory(GiB)": 21.48, "step": 12767, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955513 }, { "epoch": 0.41477438846116366, "grad_norm": 0.5330098867416382, "learning_rate": 6.782717839186121e-06, "loss": 0.029023993760347366, "memory(GiB)": 21.48, "step": 12768, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.4148068739239191, "grad_norm": 0.4367115795612335, "learning_rate": 6.782215978355582e-06, "loss": 0.02242167294025421, "memory(GiB)": 21.48, "step": 12769, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.4148393593866745, "grad_norm": 0.3296544551849365, "learning_rate": 6.781714096955898e-06, "loss": 0.020414313301444054, "memory(GiB)": 21.48, "step": 12770, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.4148718448494299, "grad_norm": 0.4707582890987396, "learning_rate": 6.781212194992862e-06, "loss": 0.028955042362213135, "memory(GiB)": 21.48, "step": 12771, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.4149043303121853, "grad_norm": 1.5871446132659912, "learning_rate": 6.780710272472266e-06, "loss": 0.03107621893286705, "memory(GiB)": 21.48, "step": 12772, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955579 }, { "epoch": 0.41493681577494074, "grad_norm": 0.41405656933784485, "learning_rate": 6.7802083293999045e-06, "loss": 0.030684003606438637, "memory(GiB)": 21.48, "step": 12773, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95559 }, { "epoch": 0.41496930123769615, "grad_norm": 0.353322833776474, "learning_rate": 6.779706365781569e-06, "loss": 0.021355804055929184, "memory(GiB)": 21.48, "step": 12774, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.41500178670045157, "grad_norm": 0.3062724173069, "learning_rate": 6.779204381623053e-06, "loss": 0.018252665176987648, "memory(GiB)": 21.48, "step": 12775, "token_acc": 1.0, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.415034272163207, "grad_norm": 0.340509831905365, "learning_rate": 6.77870237693015e-06, "loss": 0.0310934130102396, "memory(GiB)": 21.48, "step": 12776, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.4150667576259624, "grad_norm": 0.46293529868125916, "learning_rate": 6.778200351708654e-06, "loss": 0.03277340158820152, "memory(GiB)": 21.48, "step": 12777, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.4150992430887178, "grad_norm": 0.634568989276886, "learning_rate": 6.777698305964359e-06, "loss": 0.035142142325639725, "memory(GiB)": 21.48, "step": 12778, "token_acc": 0.9819494584837545, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.41513172855147323, "grad_norm": 0.396028995513916, "learning_rate": 6.77719623970306e-06, "loss": 0.017522525042295456, "memory(GiB)": 21.48, "step": 12779, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.41516421401422865, "grad_norm": 0.3774532377719879, "learning_rate": 6.776694152930549e-06, "loss": 0.019436538219451904, "memory(GiB)": 21.48, "step": 12780, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.41519669947698407, "grad_norm": 0.42456862330436707, "learning_rate": 6.7761920456526265e-06, "loss": 0.024339135736227036, "memory(GiB)": 21.48, "step": 12781, "token_acc": 0.984375, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.4152291849397395, "grad_norm": 0.47389915585517883, "learning_rate": 6.7756899178750815e-06, "loss": 0.03239265829324722, "memory(GiB)": 21.48, "step": 12782, "token_acc": 0.99, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.4152616704024949, "grad_norm": 0.45659834146499634, "learning_rate": 6.775187769603712e-06, "loss": 0.02779385820031166, "memory(GiB)": 21.48, "step": 12783, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.4152941558652503, "grad_norm": 0.3685843050479889, "learning_rate": 6.774685600844311e-06, "loss": 0.02921377122402191, "memory(GiB)": 21.48, "step": 12784, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.41532664132800573, "grad_norm": 0.36682623624801636, "learning_rate": 6.774183411602679e-06, "loss": 0.024799931794404984, "memory(GiB)": 21.48, "step": 12785, "token_acc": 1.0, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.41535912679076115, "grad_norm": 0.30978918075561523, "learning_rate": 6.773681201884606e-06, "loss": 0.02086222730576992, "memory(GiB)": 21.48, "step": 12786, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.41539161225351656, "grad_norm": 0.5234740972518921, "learning_rate": 6.773178971695894e-06, "loss": 0.026104170829057693, "memory(GiB)": 21.48, "step": 12787, "token_acc": 0.9699570815450643, "train_speed(iter/s)": 0.955798 }, { "epoch": 0.415424097716272, "grad_norm": 0.32297447323799133, "learning_rate": 6.772676721042335e-06, "loss": 0.020679116249084473, "memory(GiB)": 21.48, "step": 12788, "token_acc": 1.0, "train_speed(iter/s)": 0.955814 }, { "epoch": 0.4154565831790274, "grad_norm": 0.40372949838638306, "learning_rate": 6.772174449929727e-06, "loss": 0.027513910084962845, "memory(GiB)": 21.48, "step": 12789, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.4154890686417828, "grad_norm": 0.4109477400779724, "learning_rate": 6.771672158363869e-06, "loss": 0.029857950285077095, "memory(GiB)": 21.48, "step": 12790, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.4155215541045382, "grad_norm": 0.3776818513870239, "learning_rate": 6.771169846350554e-06, "loss": 0.020351141691207886, "memory(GiB)": 21.48, "step": 12791, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.41555403956729364, "grad_norm": 0.2785986065864563, "learning_rate": 6.770667513895583e-06, "loss": 0.020259607583284378, "memory(GiB)": 21.48, "step": 12792, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.41558652503004906, "grad_norm": 0.4118165075778961, "learning_rate": 6.770165161004751e-06, "loss": 0.024996906518936157, "memory(GiB)": 21.48, "step": 12793, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.4156190104928045, "grad_norm": 0.3870069086551666, "learning_rate": 6.7696627876838575e-06, "loss": 0.02965398132801056, "memory(GiB)": 21.48, "step": 12794, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.4156514959555599, "grad_norm": 0.3055121898651123, "learning_rate": 6.7691603939387e-06, "loss": 0.01587362214922905, "memory(GiB)": 21.48, "step": 12795, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.955915 }, { "epoch": 0.4156839814183153, "grad_norm": 0.5966975688934326, "learning_rate": 6.768657979775076e-06, "loss": 0.04409012943506241, "memory(GiB)": 21.48, "step": 12796, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.4157164668810707, "grad_norm": 0.44660642743110657, "learning_rate": 6.768155545198786e-06, "loss": 0.014855614863336086, "memory(GiB)": 21.48, "step": 12797, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.41574895234382614, "grad_norm": 0.3424852192401886, "learning_rate": 6.767653090215627e-06, "loss": 0.024390146136283875, "memory(GiB)": 21.48, "step": 12798, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955952 }, { "epoch": 0.41578143780658156, "grad_norm": 0.27946561574935913, "learning_rate": 6.767150614831399e-06, "loss": 0.01734255626797676, "memory(GiB)": 21.48, "step": 12799, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955965 }, { "epoch": 0.41581392326933697, "grad_norm": 0.2767590880393982, "learning_rate": 6.7666481190519e-06, "loss": 0.02320127934217453, "memory(GiB)": 21.48, "step": 12800, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955977 }, { "epoch": 0.4158464087320924, "grad_norm": 0.32726362347602844, "learning_rate": 6.766145602882932e-06, "loss": 0.023831097409129143, "memory(GiB)": 21.48, "step": 12801, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.4158788941948478, "grad_norm": 0.4121706187725067, "learning_rate": 6.765643066330292e-06, "loss": 0.02439272217452526, "memory(GiB)": 21.48, "step": 12802, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956 }, { "epoch": 0.4159113796576032, "grad_norm": 0.3493373394012451, "learning_rate": 6.76514050939978e-06, "loss": 0.016272801905870438, "memory(GiB)": 21.48, "step": 12803, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.41594386512035864, "grad_norm": 0.4511028826236725, "learning_rate": 6.764637932097199e-06, "loss": 0.020424775779247284, "memory(GiB)": 21.48, "step": 12804, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.41597635058311405, "grad_norm": 0.38308560848236084, "learning_rate": 6.764135334428345e-06, "loss": 0.02065502479672432, "memory(GiB)": 21.48, "step": 12805, "token_acc": 1.0, "train_speed(iter/s)": 0.956036 }, { "epoch": 0.41600883604586947, "grad_norm": 0.3779330551624298, "learning_rate": 6.763632716399024e-06, "loss": 0.02453090250492096, "memory(GiB)": 21.48, "step": 12806, "token_acc": 0.9788359788359788, "train_speed(iter/s)": 0.956049 }, { "epoch": 0.4160413215086249, "grad_norm": 0.4686840772628784, "learning_rate": 6.763130078015032e-06, "loss": 0.027626918628811836, "memory(GiB)": 21.48, "step": 12807, "token_acc": 1.0, "train_speed(iter/s)": 0.956061 }, { "epoch": 0.4160738069713803, "grad_norm": 0.3636173605918884, "learning_rate": 6.762627419282174e-06, "loss": 0.021997570991516113, "memory(GiB)": 21.48, "step": 12808, "token_acc": 1.0, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.4161062924341357, "grad_norm": 0.609150230884552, "learning_rate": 6.762124740206249e-06, "loss": 0.022995539009571075, "memory(GiB)": 21.48, "step": 12809, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.41613877789689113, "grad_norm": 0.28293466567993164, "learning_rate": 6.761622040793057e-06, "loss": 0.021421559154987335, "memory(GiB)": 21.48, "step": 12810, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.41617126335964655, "grad_norm": 0.2989822030067444, "learning_rate": 6.761119321048404e-06, "loss": 0.019379159435629845, "memory(GiB)": 21.48, "step": 12811, "token_acc": 1.0, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.41620374882240196, "grad_norm": 0.40879425406455994, "learning_rate": 6.760616580978089e-06, "loss": 0.030115369707345963, "memory(GiB)": 21.48, "step": 12812, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.4162362342851574, "grad_norm": 0.4060088098049164, "learning_rate": 6.760113820587917e-06, "loss": 0.025221755728125572, "memory(GiB)": 21.48, "step": 12813, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.4162687197479128, "grad_norm": 0.43513891100883484, "learning_rate": 6.759611039883687e-06, "loss": 0.026473652571439743, "memory(GiB)": 21.48, "step": 12814, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.956152 }, { "epoch": 0.4163012052106682, "grad_norm": 0.3000466823577881, "learning_rate": 6.759108238871204e-06, "loss": 0.016460346058011055, "memory(GiB)": 21.48, "step": 12815, "token_acc": 1.0, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.41633369067342363, "grad_norm": 0.2936707139015198, "learning_rate": 6.758605417556271e-06, "loss": 0.020395779982209206, "memory(GiB)": 21.48, "step": 12816, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.41636617613617904, "grad_norm": 0.509436309337616, "learning_rate": 6.75810257594469e-06, "loss": 0.024017034098505974, "memory(GiB)": 21.48, "step": 12817, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.41639866159893446, "grad_norm": 0.48988986015319824, "learning_rate": 6.757599714042267e-06, "loss": 0.03234532102942467, "memory(GiB)": 21.48, "step": 12818, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.4164311470616899, "grad_norm": 0.47515779733657837, "learning_rate": 6.757096831854801e-06, "loss": 0.028285138309001923, "memory(GiB)": 21.48, "step": 12819, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.4164636325244453, "grad_norm": 0.35475853085517883, "learning_rate": 6.756593929388102e-06, "loss": 0.021521978080272675, "memory(GiB)": 21.48, "step": 12820, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956228 }, { "epoch": 0.4164961179872007, "grad_norm": 0.6315789222717285, "learning_rate": 6.756091006647969e-06, "loss": 0.017973685637116432, "memory(GiB)": 21.48, "step": 12821, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956239 }, { "epoch": 0.4165286034499561, "grad_norm": 0.3480144441127777, "learning_rate": 6.755588063640209e-06, "loss": 0.0214216411113739, "memory(GiB)": 21.48, "step": 12822, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.41656108891271154, "grad_norm": 0.4468110203742981, "learning_rate": 6.7550851003706265e-06, "loss": 0.028545675799250603, "memory(GiB)": 21.48, "step": 12823, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.41659357437546696, "grad_norm": 0.3174546957015991, "learning_rate": 6.754582116845025e-06, "loss": 0.017560653388500214, "memory(GiB)": 21.48, "step": 12824, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.4166260598382224, "grad_norm": 0.4135478436946869, "learning_rate": 6.75407911306921e-06, "loss": 0.026048697531223297, "memory(GiB)": 21.48, "step": 12825, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.4166585453009778, "grad_norm": 0.5107070803642273, "learning_rate": 6.753576089048989e-06, "loss": 0.030125372111797333, "memory(GiB)": 21.48, "step": 12826, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.4166910307637332, "grad_norm": 0.4176843464374542, "learning_rate": 6.753073044790166e-06, "loss": 0.035768844187259674, "memory(GiB)": 21.48, "step": 12827, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.956308 }, { "epoch": 0.4167235162264886, "grad_norm": 0.3248235881328583, "learning_rate": 6.7525699802985455e-06, "loss": 0.03709767013788223, "memory(GiB)": 21.48, "step": 12828, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.41675600168924404, "grad_norm": 0.3334468603134155, "learning_rate": 6.752066895579936e-06, "loss": 0.017608128488063812, "memory(GiB)": 21.48, "step": 12829, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.41678848715199945, "grad_norm": 0.292082816362381, "learning_rate": 6.751563790640142e-06, "loss": 0.023566341027617455, "memory(GiB)": 21.48, "step": 12830, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.41682097261475487, "grad_norm": 0.38813552260398865, "learning_rate": 6.75106066548497e-06, "loss": 0.0257358830422163, "memory(GiB)": 21.48, "step": 12831, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.41685345807751034, "grad_norm": 0.5022813677787781, "learning_rate": 6.750557520120228e-06, "loss": 0.026361875236034393, "memory(GiB)": 21.48, "step": 12832, "token_acc": 0.992, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.41688594354026576, "grad_norm": 0.3586970269680023, "learning_rate": 6.750054354551722e-06, "loss": 0.014366899617016315, "memory(GiB)": 21.48, "step": 12833, "token_acc": 1.0, "train_speed(iter/s)": 0.956384 }, { "epoch": 0.4169184290030212, "grad_norm": 0.525244951248169, "learning_rate": 6.74955116878526e-06, "loss": 0.03645291551947594, "memory(GiB)": 21.48, "step": 12834, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.4169509144657766, "grad_norm": 0.3622131645679474, "learning_rate": 6.749047962826647e-06, "loss": 0.01978808455169201, "memory(GiB)": 21.48, "step": 12835, "token_acc": 0.992, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.416983399928532, "grad_norm": 0.37265917658805847, "learning_rate": 6.748544736681695e-06, "loss": 0.02224062941968441, "memory(GiB)": 21.48, "step": 12836, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.4170158853912874, "grad_norm": 0.29557275772094727, "learning_rate": 6.748041490356208e-06, "loss": 0.022385910153388977, "memory(GiB)": 21.48, "step": 12837, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956432 }, { "epoch": 0.41704837085404284, "grad_norm": 0.34169551730155945, "learning_rate": 6.747538223855995e-06, "loss": 0.029322056099772453, "memory(GiB)": 21.48, "step": 12838, "token_acc": 1.0, "train_speed(iter/s)": 0.956444 }, { "epoch": 0.41708085631679825, "grad_norm": 0.35591185092926025, "learning_rate": 6.7470349371868656e-06, "loss": 0.02664880082011223, "memory(GiB)": 21.48, "step": 12839, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956458 }, { "epoch": 0.41711334177955367, "grad_norm": 0.3153087794780731, "learning_rate": 6.746531630354628e-06, "loss": 0.022923659533262253, "memory(GiB)": 21.48, "step": 12840, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.4171458272423091, "grad_norm": 0.463408499956131, "learning_rate": 6.74602830336509e-06, "loss": 0.02634315937757492, "memory(GiB)": 21.48, "step": 12841, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.4171783127050645, "grad_norm": 0.4710252583026886, "learning_rate": 6.745524956224061e-06, "loss": 0.023163046687841415, "memory(GiB)": 21.48, "step": 12842, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.4172107981678199, "grad_norm": 0.4876828193664551, "learning_rate": 6.745021588937353e-06, "loss": 0.025463907048106194, "memory(GiB)": 21.48, "step": 12843, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.41724328363057533, "grad_norm": 0.36745357513427734, "learning_rate": 6.7445182015107725e-06, "loss": 0.025865353643894196, "memory(GiB)": 21.48, "step": 12844, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.956539 }, { "epoch": 0.41727576909333075, "grad_norm": 0.47143325209617615, "learning_rate": 6.744014793950129e-06, "loss": 0.02857714146375656, "memory(GiB)": 21.48, "step": 12845, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.41730825455608617, "grad_norm": 0.33669859170913696, "learning_rate": 6.743511366261234e-06, "loss": 0.027865435928106308, "memory(GiB)": 21.48, "step": 12846, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956572 }, { "epoch": 0.4173407400188416, "grad_norm": 0.40976738929748535, "learning_rate": 6.743007918449896e-06, "loss": 0.026764357462525368, "memory(GiB)": 21.48, "step": 12847, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.417373225481597, "grad_norm": 0.4187588691711426, "learning_rate": 6.742504450521929e-06, "loss": 0.02047678828239441, "memory(GiB)": 21.48, "step": 12848, "token_acc": 0.99, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.4174057109443524, "grad_norm": 0.3777407705783844, "learning_rate": 6.74200096248314e-06, "loss": 0.03205958381295204, "memory(GiB)": 21.48, "step": 12849, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.41743819640710783, "grad_norm": 0.3445516526699066, "learning_rate": 6.741497454339342e-06, "loss": 0.0324053019285202, "memory(GiB)": 21.48, "step": 12850, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.41747068186986325, "grad_norm": 0.5061295628547668, "learning_rate": 6.740993926096344e-06, "loss": 0.03526455909013748, "memory(GiB)": 21.48, "step": 12851, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.41750316733261866, "grad_norm": 0.34569451212882996, "learning_rate": 6.740490377759961e-06, "loss": 0.025892313569784164, "memory(GiB)": 21.48, "step": 12852, "token_acc": 1.0, "train_speed(iter/s)": 0.956665 }, { "epoch": 0.4175356527953741, "grad_norm": 0.48646053671836853, "learning_rate": 6.739986809336001e-06, "loss": 0.0329413115978241, "memory(GiB)": 21.48, "step": 12853, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.4175681382581295, "grad_norm": 0.38623738288879395, "learning_rate": 6.739483220830276e-06, "loss": 0.0271975789219141, "memory(GiB)": 21.48, "step": 12854, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.4176006237208849, "grad_norm": 0.4306538999080658, "learning_rate": 6.738979612248602e-06, "loss": 0.030204761773347855, "memory(GiB)": 21.48, "step": 12855, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956704 }, { "epoch": 0.4176331091836403, "grad_norm": 0.33544421195983887, "learning_rate": 6.738475983596786e-06, "loss": 0.027719862759113312, "memory(GiB)": 21.48, "step": 12856, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.41766559464639574, "grad_norm": 0.3429052233695984, "learning_rate": 6.737972334880646e-06, "loss": 0.01981641724705696, "memory(GiB)": 21.48, "step": 12857, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.41769808010915116, "grad_norm": 0.37804168462753296, "learning_rate": 6.737468666105991e-06, "loss": 0.03110814094543457, "memory(GiB)": 21.48, "step": 12858, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956741 }, { "epoch": 0.4177305655719066, "grad_norm": 0.8292710781097412, "learning_rate": 6.736964977278635e-06, "loss": 0.03528684377670288, "memory(GiB)": 21.48, "step": 12859, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.417763051034662, "grad_norm": 0.3155285120010376, "learning_rate": 6.736461268404391e-06, "loss": 0.025645744055509567, "memory(GiB)": 21.48, "step": 12860, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.4177955364974174, "grad_norm": 0.9167109131813049, "learning_rate": 6.735957539489073e-06, "loss": 0.03722041845321655, "memory(GiB)": 21.48, "step": 12861, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.4178280219601728, "grad_norm": 0.3821057081222534, "learning_rate": 6.735453790538494e-06, "loss": 0.026137972250580788, "memory(GiB)": 21.48, "step": 12862, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.41786050742292824, "grad_norm": 0.617354154586792, "learning_rate": 6.734950021558468e-06, "loss": 0.03130931034684181, "memory(GiB)": 21.48, "step": 12863, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.41789299288568366, "grad_norm": 0.3751447796821594, "learning_rate": 6.734446232554812e-06, "loss": 0.02679925598204136, "memory(GiB)": 21.48, "step": 12864, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.4179254783484391, "grad_norm": 0.31302276253700256, "learning_rate": 6.733942423533335e-06, "loss": 0.019403930753469467, "memory(GiB)": 21.48, "step": 12865, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.4179579638111945, "grad_norm": 0.2899269163608551, "learning_rate": 6.733438594499857e-06, "loss": 0.02085680142045021, "memory(GiB)": 21.48, "step": 12866, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.4179904492739499, "grad_norm": 0.3179551959037781, "learning_rate": 6.732934745460189e-06, "loss": 0.031672216951847076, "memory(GiB)": 21.48, "step": 12867, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.4180229347367053, "grad_norm": 1.1366808414459229, "learning_rate": 6.732430876420148e-06, "loss": 0.022843429818749428, "memory(GiB)": 21.48, "step": 12868, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.41805542019946074, "grad_norm": 0.4429433345794678, "learning_rate": 6.731926987385549e-06, "loss": 0.032195430248975754, "memory(GiB)": 21.48, "step": 12869, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.41808790566221615, "grad_norm": 0.3215700685977936, "learning_rate": 6.731423078362206e-06, "loss": 0.018858077004551888, "memory(GiB)": 21.48, "step": 12870, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.956885 }, { "epoch": 0.41812039112497157, "grad_norm": 0.3527599275112152, "learning_rate": 6.730919149355937e-06, "loss": 0.029813341796398163, "memory(GiB)": 21.48, "step": 12871, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.418152876587727, "grad_norm": 0.423525333404541, "learning_rate": 6.7304152003725565e-06, "loss": 0.027345459908246994, "memory(GiB)": 21.48, "step": 12872, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956914 }, { "epoch": 0.4181853620504824, "grad_norm": 0.30830419063568115, "learning_rate": 6.7299112314178815e-06, "loss": 0.021175026893615723, "memory(GiB)": 21.48, "step": 12873, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956929 }, { "epoch": 0.4182178475132378, "grad_norm": 0.334641695022583, "learning_rate": 6.729407242497729e-06, "loss": 0.018118154257535934, "memory(GiB)": 21.48, "step": 12874, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.41825033297599323, "grad_norm": 0.4946625530719757, "learning_rate": 6.728903233617916e-06, "loss": 0.0329335480928421, "memory(GiB)": 21.48, "step": 12875, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.41828281843874865, "grad_norm": 0.41848260164260864, "learning_rate": 6.7283992047842575e-06, "loss": 0.02378959208726883, "memory(GiB)": 21.48, "step": 12876, "token_acc": 1.0, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.41831530390150407, "grad_norm": 0.49813729524612427, "learning_rate": 6.727895156002572e-06, "loss": 0.0295280609279871, "memory(GiB)": 21.48, "step": 12877, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.4183477893642595, "grad_norm": 0.4334852993488312, "learning_rate": 6.727391087278676e-06, "loss": 0.02218068763613701, "memory(GiB)": 21.48, "step": 12878, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957006 }, { "epoch": 0.4183802748270149, "grad_norm": 0.37390321493148804, "learning_rate": 6.726886998618388e-06, "loss": 0.028502661734819412, "memory(GiB)": 21.48, "step": 12879, "token_acc": 0.996, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.4184127602897703, "grad_norm": 0.5179631114006042, "learning_rate": 6.726382890027526e-06, "loss": 0.03537597507238388, "memory(GiB)": 21.48, "step": 12880, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957032 }, { "epoch": 0.41844524575252573, "grad_norm": 0.4549235701560974, "learning_rate": 6.725878761511908e-06, "loss": 0.026855627074837685, "memory(GiB)": 21.48, "step": 12881, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.41847773121528115, "grad_norm": 0.46965456008911133, "learning_rate": 6.72537461307735e-06, "loss": 0.025108087807893753, "memory(GiB)": 21.48, "step": 12882, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.41851021667803656, "grad_norm": 0.36839714646339417, "learning_rate": 6.7248704447296745e-06, "loss": 0.024796759709715843, "memory(GiB)": 21.48, "step": 12883, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957072 }, { "epoch": 0.418542702140792, "grad_norm": 0.5049626231193542, "learning_rate": 6.724366256474697e-06, "loss": 0.026993587613105774, "memory(GiB)": 21.48, "step": 12884, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.4185751876035474, "grad_norm": 0.41212591528892517, "learning_rate": 6.723862048318239e-06, "loss": 0.0259382463991642, "memory(GiB)": 21.48, "step": 12885, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.4186076730663028, "grad_norm": 0.5285915732383728, "learning_rate": 6.723357820266116e-06, "loss": 0.027803640812635422, "memory(GiB)": 21.48, "step": 12886, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.957112 }, { "epoch": 0.4186401585290582, "grad_norm": 0.33369576930999756, "learning_rate": 6.722853572324153e-06, "loss": 0.02390379086136818, "memory(GiB)": 21.48, "step": 12887, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.41867264399181364, "grad_norm": 0.3206581473350525, "learning_rate": 6.722349304498166e-06, "loss": 0.02273578941822052, "memory(GiB)": 21.48, "step": 12888, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.41870512945456906, "grad_norm": 0.4228266775608063, "learning_rate": 6.721845016793973e-06, "loss": 0.02584260329604149, "memory(GiB)": 21.48, "step": 12889, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.957148 }, { "epoch": 0.4187376149173245, "grad_norm": 0.35890135169029236, "learning_rate": 6.7213407092174e-06, "loss": 0.019564613699913025, "memory(GiB)": 21.48, "step": 12890, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.4187701003800799, "grad_norm": 0.44011014699935913, "learning_rate": 6.7208363817742625e-06, "loss": 0.02926110476255417, "memory(GiB)": 21.48, "step": 12891, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.4188025858428353, "grad_norm": 0.4206666946411133, "learning_rate": 6.720332034470383e-06, "loss": 0.02998812310397625, "memory(GiB)": 21.48, "step": 12892, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.4188350713055907, "grad_norm": 0.4189753532409668, "learning_rate": 6.719827667311581e-06, "loss": 0.02459154836833477, "memory(GiB)": 21.48, "step": 12893, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.41886755676834614, "grad_norm": 0.5351101756095886, "learning_rate": 6.719323280303682e-06, "loss": 0.024549908936023712, "memory(GiB)": 21.48, "step": 12894, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.41890004223110155, "grad_norm": 0.5176683664321899, "learning_rate": 6.718818873452502e-06, "loss": 0.029932823032140732, "memory(GiB)": 21.48, "step": 12895, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.957222 }, { "epoch": 0.418932527693857, "grad_norm": 0.7059126496315002, "learning_rate": 6.7183144467638626e-06, "loss": 0.030414579436182976, "memory(GiB)": 21.48, "step": 12896, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957234 }, { "epoch": 0.41896501315661244, "grad_norm": 0.3805308938026428, "learning_rate": 6.71781000024359e-06, "loss": 0.029192347079515457, "memory(GiB)": 21.48, "step": 12897, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.41899749861936786, "grad_norm": 0.4010600745677948, "learning_rate": 6.717305533897502e-06, "loss": 0.025968797504901886, "memory(GiB)": 21.48, "step": 12898, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957258 }, { "epoch": 0.4190299840821233, "grad_norm": 0.42481330037117004, "learning_rate": 6.716801047731423e-06, "loss": 0.03043869510293007, "memory(GiB)": 21.48, "step": 12899, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.4190624695448787, "grad_norm": 0.2923136055469513, "learning_rate": 6.716296541751175e-06, "loss": 0.01769348420202732, "memory(GiB)": 21.48, "step": 12900, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957285 }, { "epoch": 0.4190949550076341, "grad_norm": 0.33610475063323975, "learning_rate": 6.71579201596258e-06, "loss": 0.02339027263224125, "memory(GiB)": 21.48, "step": 12901, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.9573 }, { "epoch": 0.4191274404703895, "grad_norm": 0.4033679664134979, "learning_rate": 6.715287470371461e-06, "loss": 0.030774792656302452, "memory(GiB)": 21.48, "step": 12902, "token_acc": 0.996, "train_speed(iter/s)": 0.957316 }, { "epoch": 0.41915992593314494, "grad_norm": 0.40445515513420105, "learning_rate": 6.714782904983642e-06, "loss": 0.024571353569626808, "memory(GiB)": 21.48, "step": 12903, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.41919241139590036, "grad_norm": 0.2910330891609192, "learning_rate": 6.714278319804947e-06, "loss": 0.021114226430654526, "memory(GiB)": 21.48, "step": 12904, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.41922489685865577, "grad_norm": 0.49352729320526123, "learning_rate": 6.713773714841197e-06, "loss": 0.03065529838204384, "memory(GiB)": 21.48, "step": 12905, "token_acc": 1.0, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.4192573823214112, "grad_norm": 0.30668845772743225, "learning_rate": 6.713269090098219e-06, "loss": 0.015343500301241875, "memory(GiB)": 21.48, "step": 12906, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957383 }, { "epoch": 0.4192898677841666, "grad_norm": 0.33777669072151184, "learning_rate": 6.7127644455818346e-06, "loss": 0.01724088005721569, "memory(GiB)": 21.48, "step": 12907, "token_acc": 1.0, "train_speed(iter/s)": 0.957399 }, { "epoch": 0.419322353246922, "grad_norm": 0.36653587222099304, "learning_rate": 6.712259781297868e-06, "loss": 0.021335866302251816, "memory(GiB)": 21.48, "step": 12908, "token_acc": 1.0, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.41935483870967744, "grad_norm": 0.4146974980831146, "learning_rate": 6.711755097252146e-06, "loss": 0.02198193222284317, "memory(GiB)": 21.48, "step": 12909, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.41938732417243285, "grad_norm": 0.5343672037124634, "learning_rate": 6.711250393450491e-06, "loss": 0.024935737252235413, "memory(GiB)": 21.48, "step": 12910, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957448 }, { "epoch": 0.41941980963518827, "grad_norm": 0.6727445125579834, "learning_rate": 6.7107456698987286e-06, "loss": 0.030354686081409454, "memory(GiB)": 21.48, "step": 12911, "token_acc": 1.0, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.4194522950979437, "grad_norm": 0.4352191090583801, "learning_rate": 6.710240926602684e-06, "loss": 0.036470793187618256, "memory(GiB)": 21.48, "step": 12912, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.4194847805606991, "grad_norm": 0.40751364827156067, "learning_rate": 6.709736163568183e-06, "loss": 0.023716527968645096, "memory(GiB)": 21.48, "step": 12913, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.4195172660234545, "grad_norm": 0.30386561155319214, "learning_rate": 6.709231380801051e-06, "loss": 0.017767202109098434, "memory(GiB)": 21.48, "step": 12914, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.9575 }, { "epoch": 0.41954975148620993, "grad_norm": 0.5697981715202332, "learning_rate": 6.708726578307114e-06, "loss": 0.029217351227998734, "memory(GiB)": 21.48, "step": 12915, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.41958223694896535, "grad_norm": 0.4567398428916931, "learning_rate": 6.708221756092197e-06, "loss": 0.027408933266997337, "memory(GiB)": 21.48, "step": 12916, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.41961472241172076, "grad_norm": 0.41316676139831543, "learning_rate": 6.707716914162129e-06, "loss": 0.019857782870531082, "memory(GiB)": 21.48, "step": 12917, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.4196472078744762, "grad_norm": 0.39808928966522217, "learning_rate": 6.707212052522734e-06, "loss": 0.019390618428587914, "memory(GiB)": 21.48, "step": 12918, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.95755 }, { "epoch": 0.4196796933372316, "grad_norm": 0.5147300362586975, "learning_rate": 6.706707171179839e-06, "loss": 0.021733000874519348, "memory(GiB)": 21.48, "step": 12919, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957561 }, { "epoch": 0.419712178799987, "grad_norm": 0.5130528211593628, "learning_rate": 6.706202270139273e-06, "loss": 0.023991726338863373, "memory(GiB)": 21.48, "step": 12920, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957573 }, { "epoch": 0.41974466426274243, "grad_norm": 0.2889983654022217, "learning_rate": 6.705697349406861e-06, "loss": 0.01838405802845955, "memory(GiB)": 21.48, "step": 12921, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.41977714972549784, "grad_norm": 0.3915903866291046, "learning_rate": 6.705192408988431e-06, "loss": 0.021326806396245956, "memory(GiB)": 21.48, "step": 12922, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957596 }, { "epoch": 0.41980963518825326, "grad_norm": 0.33294856548309326, "learning_rate": 6.704687448889813e-06, "loss": 0.023764844983816147, "memory(GiB)": 21.48, "step": 12923, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.957608 }, { "epoch": 0.4198421206510087, "grad_norm": 0.3418360650539398, "learning_rate": 6.70418246911683e-06, "loss": 0.021969454362988472, "memory(GiB)": 21.48, "step": 12924, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.4198746061137641, "grad_norm": 0.5300268530845642, "learning_rate": 6.703677469675315e-06, "loss": 0.01639631763100624, "memory(GiB)": 21.48, "step": 12925, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.957634 }, { "epoch": 0.4199070915765195, "grad_norm": 0.6292446255683899, "learning_rate": 6.703172450571094e-06, "loss": 0.018830234184861183, "memory(GiB)": 21.48, "step": 12926, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957644 }, { "epoch": 0.4199395770392749, "grad_norm": 0.4378589689731598, "learning_rate": 6.7026674118099954e-06, "loss": 0.025611290708184242, "memory(GiB)": 21.48, "step": 12927, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.41997206250203034, "grad_norm": 0.3803156614303589, "learning_rate": 6.70216235339785e-06, "loss": 0.026374194771051407, "memory(GiB)": 21.48, "step": 12928, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.42000454796478576, "grad_norm": 0.3734776973724365, "learning_rate": 6.701657275340484e-06, "loss": 0.023013487458229065, "memory(GiB)": 21.48, "step": 12929, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.4200370334275412, "grad_norm": 0.446176677942276, "learning_rate": 6.701152177643728e-06, "loss": 0.02741527371108532, "memory(GiB)": 21.48, "step": 12930, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.4200695188902966, "grad_norm": 0.5075907707214355, "learning_rate": 6.700647060313413e-06, "loss": 0.0263381265103817, "memory(GiB)": 21.48, "step": 12931, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957708 }, { "epoch": 0.420102004353052, "grad_norm": 0.4072325825691223, "learning_rate": 6.700141923355368e-06, "loss": 0.02465999685227871, "memory(GiB)": 21.48, "step": 12932, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.4201344898158074, "grad_norm": 0.3922758400440216, "learning_rate": 6.699636766775419e-06, "loss": 0.02813579887151718, "memory(GiB)": 21.48, "step": 12933, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.42016697527856284, "grad_norm": 0.517319917678833, "learning_rate": 6.699131590579402e-06, "loss": 0.02390475757420063, "memory(GiB)": 21.48, "step": 12934, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957757 }, { "epoch": 0.42019946074131825, "grad_norm": 0.3051376938819885, "learning_rate": 6.698626394773143e-06, "loss": 0.016571182757616043, "memory(GiB)": 21.48, "step": 12935, "token_acc": 1.0, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.42023194620407367, "grad_norm": 0.3788362741470337, "learning_rate": 6.698121179362477e-06, "loss": 0.018392862752079964, "memory(GiB)": 21.48, "step": 12936, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957787 }, { "epoch": 0.4202644316668291, "grad_norm": 0.29614129662513733, "learning_rate": 6.697615944353232e-06, "loss": 0.02093464881181717, "memory(GiB)": 21.48, "step": 12937, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.957804 }, { "epoch": 0.4202969171295845, "grad_norm": 0.38875633478164673, "learning_rate": 6.6971106897512385e-06, "loss": 0.022838125005364418, "memory(GiB)": 21.48, "step": 12938, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.4203294025923399, "grad_norm": 0.35930582880973816, "learning_rate": 6.696605415562329e-06, "loss": 0.03365050256252289, "memory(GiB)": 21.48, "step": 12939, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.957836 }, { "epoch": 0.42036188805509533, "grad_norm": 0.8297509551048279, "learning_rate": 6.696100121792335e-06, "loss": 0.03231823071837425, "memory(GiB)": 21.48, "step": 12940, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.42039437351785075, "grad_norm": 0.41150927543640137, "learning_rate": 6.695594808447088e-06, "loss": 0.024013295769691467, "memory(GiB)": 21.48, "step": 12941, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.42042685898060617, "grad_norm": 0.3201085925102234, "learning_rate": 6.69508947553242e-06, "loss": 0.023611940443515778, "memory(GiB)": 21.48, "step": 12942, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.4204593444433616, "grad_norm": 0.39834845066070557, "learning_rate": 6.694584123054164e-06, "loss": 0.02837362140417099, "memory(GiB)": 21.48, "step": 12943, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.95789 }, { "epoch": 0.420491829906117, "grad_norm": 0.4184960126876831, "learning_rate": 6.694078751018151e-06, "loss": 0.021567855030298233, "memory(GiB)": 21.48, "step": 12944, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.4205243153688724, "grad_norm": 0.27333420515060425, "learning_rate": 6.693573359430214e-06, "loss": 0.014434587210416794, "memory(GiB)": 21.48, "step": 12945, "token_acc": 1.0, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.42055680083162783, "grad_norm": 0.37260377407073975, "learning_rate": 6.693067948296188e-06, "loss": 0.029276739805936813, "memory(GiB)": 21.48, "step": 12946, "token_acc": 0.9750889679715302, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.42058928629438325, "grad_norm": 0.2755574882030487, "learning_rate": 6.692562517621902e-06, "loss": 0.021108416840434074, "memory(GiB)": 21.48, "step": 12947, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95794 }, { "epoch": 0.42062177175713866, "grad_norm": 0.27877530455589294, "learning_rate": 6.692057067413194e-06, "loss": 0.017171170562505722, "memory(GiB)": 21.48, "step": 12948, "token_acc": 1.0, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.4206542572198941, "grad_norm": 0.32794609665870667, "learning_rate": 6.6915515976758926e-06, "loss": 0.020963001996278763, "memory(GiB)": 21.48, "step": 12949, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957965 }, { "epoch": 0.4206867426826495, "grad_norm": 0.42875102162361145, "learning_rate": 6.691046108415837e-06, "loss": 0.022683780640363693, "memory(GiB)": 21.48, "step": 12950, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.4207192281454049, "grad_norm": 0.33750295639038086, "learning_rate": 6.690540599638857e-06, "loss": 0.022106152027845383, "memory(GiB)": 21.48, "step": 12951, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.4207517136081603, "grad_norm": 0.32018569111824036, "learning_rate": 6.690035071350789e-06, "loss": 0.02297857776284218, "memory(GiB)": 21.48, "step": 12952, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958001 }, { "epoch": 0.42078419907091574, "grad_norm": 0.31774699687957764, "learning_rate": 6.689529523557468e-06, "loss": 0.020686935633420944, "memory(GiB)": 21.48, "step": 12953, "token_acc": 1.0, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.42081668453367116, "grad_norm": 0.41473588347435, "learning_rate": 6.689023956264725e-06, "loss": 0.03171917796134949, "memory(GiB)": 21.48, "step": 12954, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.4208491699964266, "grad_norm": 0.4219600260257721, "learning_rate": 6.6885183694784e-06, "loss": 0.025175122544169426, "memory(GiB)": 21.48, "step": 12955, "token_acc": 0.9875, "train_speed(iter/s)": 0.958039 }, { "epoch": 0.420881655459182, "grad_norm": 0.42762506008148193, "learning_rate": 6.688012763204324e-06, "loss": 0.028939418494701385, "memory(GiB)": 21.48, "step": 12956, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.4209141409219374, "grad_norm": 0.43022826313972473, "learning_rate": 6.687507137448334e-06, "loss": 0.026317594572901726, "memory(GiB)": 21.48, "step": 12957, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.958064 }, { "epoch": 0.4209466263846928, "grad_norm": 0.4295399785041809, "learning_rate": 6.687001492216265e-06, "loss": 0.026556501165032387, "memory(GiB)": 21.48, "step": 12958, "token_acc": 1.0, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.42097911184744824, "grad_norm": 0.3066140413284302, "learning_rate": 6.6864958275139555e-06, "loss": 0.02722882106900215, "memory(GiB)": 21.48, "step": 12959, "token_acc": 0.9778761061946902, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.4210115973102037, "grad_norm": 0.3666439354419708, "learning_rate": 6.685990143347238e-06, "loss": 0.024602457880973816, "memory(GiB)": 21.48, "step": 12960, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.4210440827729591, "grad_norm": 0.3301485776901245, "learning_rate": 6.685484439721951e-06, "loss": 0.023981351405382156, "memory(GiB)": 21.48, "step": 12961, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958113 }, { "epoch": 0.42107656823571454, "grad_norm": 0.2495630830526352, "learning_rate": 6.6849787166439295e-06, "loss": 0.017639966681599617, "memory(GiB)": 21.48, "step": 12962, "token_acc": 0.98828125, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.42110905369846996, "grad_norm": 0.35696858167648315, "learning_rate": 6.68447297411901e-06, "loss": 0.02689148485660553, "memory(GiB)": 21.48, "step": 12963, "token_acc": 1.0, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.4211415391612254, "grad_norm": 0.4168316423892975, "learning_rate": 6.683967212153032e-06, "loss": 0.029766174033284187, "memory(GiB)": 21.48, "step": 12964, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.958152 }, { "epoch": 0.4211740246239808, "grad_norm": 0.49830296635627747, "learning_rate": 6.6834614307518295e-06, "loss": 0.026287885382771492, "memory(GiB)": 21.48, "step": 12965, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.958166 }, { "epoch": 0.4212065100867362, "grad_norm": 0.262184739112854, "learning_rate": 6.6829556299212424e-06, "loss": 0.016332756727933884, "memory(GiB)": 21.48, "step": 12966, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.4212389955494916, "grad_norm": 0.5349858999252319, "learning_rate": 6.682449809667108e-06, "loss": 0.03733036294579506, "memory(GiB)": 21.48, "step": 12967, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.958198 }, { "epoch": 0.42127148101224704, "grad_norm": 0.363826185464859, "learning_rate": 6.681943969995263e-06, "loss": 0.02911144495010376, "memory(GiB)": 21.48, "step": 12968, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.42130396647500246, "grad_norm": 0.2747526168823242, "learning_rate": 6.681438110911546e-06, "loss": 0.021539833396673203, "memory(GiB)": 21.48, "step": 12969, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.958231 }, { "epoch": 0.42133645193775787, "grad_norm": 0.2684755027294159, "learning_rate": 6.680932232421794e-06, "loss": 0.019379090517759323, "memory(GiB)": 21.48, "step": 12970, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.4213689374005133, "grad_norm": 0.30120038986206055, "learning_rate": 6.680426334531847e-06, "loss": 0.01978256180882454, "memory(GiB)": 21.48, "step": 12971, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95826 }, { "epoch": 0.4214014228632687, "grad_norm": 0.4270375370979309, "learning_rate": 6.679920417247544e-06, "loss": 0.025637034326791763, "memory(GiB)": 21.48, "step": 12972, "token_acc": 1.0, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.4214339083260241, "grad_norm": 0.27686747908592224, "learning_rate": 6.679414480574724e-06, "loss": 0.017681661993265152, "memory(GiB)": 21.48, "step": 12973, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958285 }, { "epoch": 0.42146639378877954, "grad_norm": 0.3739583194255829, "learning_rate": 6.678908524519224e-06, "loss": 0.028778713196516037, "memory(GiB)": 21.48, "step": 12974, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.42149887925153495, "grad_norm": 0.3821655511856079, "learning_rate": 6.678402549086886e-06, "loss": 0.024918414652347565, "memory(GiB)": 21.48, "step": 12975, "token_acc": 1.0, "train_speed(iter/s)": 0.958309 }, { "epoch": 0.42153136471429037, "grad_norm": 0.49288806319236755, "learning_rate": 6.67789655428355e-06, "loss": 0.026432454586029053, "memory(GiB)": 21.48, "step": 12976, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.958321 }, { "epoch": 0.4215638501770458, "grad_norm": 0.5369252562522888, "learning_rate": 6.677390540115053e-06, "loss": 0.026597287505865097, "memory(GiB)": 21.48, "step": 12977, "token_acc": 1.0, "train_speed(iter/s)": 0.958334 }, { "epoch": 0.4215963356398012, "grad_norm": 0.651890218257904, "learning_rate": 6.676884506587237e-06, "loss": 0.03500626981258392, "memory(GiB)": 21.48, "step": 12978, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958346 }, { "epoch": 0.4216288211025566, "grad_norm": 0.299030065536499, "learning_rate": 6.676378453705941e-06, "loss": 0.017912717536091805, "memory(GiB)": 21.48, "step": 12979, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.42166130656531203, "grad_norm": 0.28394362330436707, "learning_rate": 6.675872381477006e-06, "loss": 0.01831754855811596, "memory(GiB)": 21.48, "step": 12980, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958367 }, { "epoch": 0.42169379202806745, "grad_norm": 0.3656233847141266, "learning_rate": 6.675366289906276e-06, "loss": 0.02520804852247238, "memory(GiB)": 21.48, "step": 12981, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.958379 }, { "epoch": 0.42172627749082287, "grad_norm": 0.26173970103263855, "learning_rate": 6.674860178999586e-06, "loss": 0.022723793983459473, "memory(GiB)": 21.48, "step": 12982, "token_acc": 1.0, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.4217587629535783, "grad_norm": 0.6863563060760498, "learning_rate": 6.674354048762782e-06, "loss": 0.03383629769086838, "memory(GiB)": 21.48, "step": 12983, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958402 }, { "epoch": 0.4217912484163337, "grad_norm": 0.4118901491165161, "learning_rate": 6.673847899201702e-06, "loss": 0.01728261075913906, "memory(GiB)": 21.48, "step": 12984, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958412 }, { "epoch": 0.4218237338790891, "grad_norm": 0.44345328211784363, "learning_rate": 6.6733417303221915e-06, "loss": 0.02431074157357216, "memory(GiB)": 21.48, "step": 12985, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958424 }, { "epoch": 0.42185621934184453, "grad_norm": 0.36556026339530945, "learning_rate": 6.672835542130089e-06, "loss": 0.028259411454200745, "memory(GiB)": 21.48, "step": 12986, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.958436 }, { "epoch": 0.42188870480459995, "grad_norm": 0.30135151743888855, "learning_rate": 6.672329334631238e-06, "loss": 0.022099129855632782, "memory(GiB)": 21.48, "step": 12987, "token_acc": 1.0, "train_speed(iter/s)": 0.958449 }, { "epoch": 0.42192119026735536, "grad_norm": 0.46995559334754944, "learning_rate": 6.671823107831481e-06, "loss": 0.025392498821020126, "memory(GiB)": 21.48, "step": 12988, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95846 }, { "epoch": 0.4219536757301108, "grad_norm": 0.4527071416378021, "learning_rate": 6.6713168617366595e-06, "loss": 0.0300650205463171, "memory(GiB)": 21.48, "step": 12989, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.4219861611928662, "grad_norm": 0.45127081871032715, "learning_rate": 6.670810596352619e-06, "loss": 0.024062087759375572, "memory(GiB)": 21.48, "step": 12990, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.958484 }, { "epoch": 0.4220186466556216, "grad_norm": 0.44975388050079346, "learning_rate": 6.6703043116851975e-06, "loss": 0.029330182820558548, "memory(GiB)": 21.48, "step": 12991, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.958498 }, { "epoch": 0.422051132118377, "grad_norm": 0.4649293124675751, "learning_rate": 6.6697980077402426e-06, "loss": 0.022911634296178818, "memory(GiB)": 21.48, "step": 12992, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.958514 }, { "epoch": 0.42208361758113244, "grad_norm": 0.36568403244018555, "learning_rate": 6.669291684523596e-06, "loss": 0.024491799995303154, "memory(GiB)": 21.48, "step": 12993, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95853 }, { "epoch": 0.42211610304388786, "grad_norm": 0.47389596700668335, "learning_rate": 6.6687853420411006e-06, "loss": 0.024899525567889214, "memory(GiB)": 21.48, "step": 12994, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.958546 }, { "epoch": 0.4221485885066433, "grad_norm": 0.6039224863052368, "learning_rate": 6.668278980298602e-06, "loss": 0.0327586755156517, "memory(GiB)": 21.48, "step": 12995, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958563 }, { "epoch": 0.4221810739693987, "grad_norm": 0.6139506697654724, "learning_rate": 6.667772599301943e-06, "loss": 0.03374478965997696, "memory(GiB)": 21.48, "step": 12996, "token_acc": 0.9668508287292817, "train_speed(iter/s)": 0.95858 }, { "epoch": 0.4222135594321541, "grad_norm": 0.5749441981315613, "learning_rate": 6.66726619905697e-06, "loss": 0.025370098650455475, "memory(GiB)": 21.48, "step": 12997, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958595 }, { "epoch": 0.4222460448949095, "grad_norm": 0.5609633922576904, "learning_rate": 6.666759779569524e-06, "loss": 0.030474543571472168, "memory(GiB)": 21.48, "step": 12998, "token_acc": 0.9762845849802372, "train_speed(iter/s)": 0.958611 }, { "epoch": 0.42227853035766494, "grad_norm": 0.35938870906829834, "learning_rate": 6.666253340845452e-06, "loss": 0.026584986597299576, "memory(GiB)": 21.48, "step": 12999, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958627 }, { "epoch": 0.42231101582042035, "grad_norm": 1.147422194480896, "learning_rate": 6.665746882890599e-06, "loss": 0.026840338483452797, "memory(GiB)": 21.48, "step": 13000, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.958643 }, { "epoch": 0.42231101582042035, "eval_loss": 0.025667807087302208, "eval_runtime": 80.8945, "eval_samples_per_second": 123.0, "eval_steps_per_second": 3.845, "eval_token_acc": 0.9896416270676847, "step": 13000 }, { "epoch": 0.42234350128317577, "grad_norm": 0.4050731658935547, "learning_rate": 6.6652404057108115e-06, "loss": 0.02287723682820797, "memory(GiB)": 21.48, "step": 13001, "token_acc": 0.9891514279843846, "train_speed(iter/s)": 0.952229 }, { "epoch": 0.4223759867459312, "grad_norm": 0.4954547882080078, "learning_rate": 6.66473390931193e-06, "loss": 0.031725555658340454, "memory(GiB)": 21.48, "step": 13002, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.952239 }, { "epoch": 0.4224084722086866, "grad_norm": 0.4556221663951874, "learning_rate": 6.664227393699806e-06, "loss": 0.035120926797389984, "memory(GiB)": 21.48, "step": 13003, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95225 }, { "epoch": 0.422440957671442, "grad_norm": 0.4253007769584656, "learning_rate": 6.663720858880283e-06, "loss": 0.02453121915459633, "memory(GiB)": 21.48, "step": 13004, "token_acc": 0.9833887043189369, "train_speed(iter/s)": 0.952261 }, { "epoch": 0.42247344313419744, "grad_norm": 0.3631524443626404, "learning_rate": 6.663214304859205e-06, "loss": 0.030515681952238083, "memory(GiB)": 21.48, "step": 13005, "token_acc": 0.9863481228668942, "train_speed(iter/s)": 0.952271 }, { "epoch": 0.42250592859695285, "grad_norm": 0.5132870078086853, "learning_rate": 6.662707731642422e-06, "loss": 0.029748007655143738, "memory(GiB)": 21.48, "step": 13006, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.952282 }, { "epoch": 0.42253841405970827, "grad_norm": 0.32717493176460266, "learning_rate": 6.662201139235776e-06, "loss": 0.027110232040286064, "memory(GiB)": 21.48, "step": 13007, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.952297 }, { "epoch": 0.4225708995224637, "grad_norm": 0.3034195303916931, "learning_rate": 6.661694527645119e-06, "loss": 0.028149496763944626, "memory(GiB)": 21.48, "step": 13008, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.952313 }, { "epoch": 0.4226033849852191, "grad_norm": 0.48522627353668213, "learning_rate": 6.661187896876295e-06, "loss": 0.03209691122174263, "memory(GiB)": 21.48, "step": 13009, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.952329 }, { "epoch": 0.4226358704479745, "grad_norm": 0.5259846448898315, "learning_rate": 6.660681246935152e-06, "loss": 0.031443458050489426, "memory(GiB)": 21.48, "step": 13010, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.952345 }, { "epoch": 0.42266835591072993, "grad_norm": 0.41094255447387695, "learning_rate": 6.6601745778275366e-06, "loss": 0.02739926427602768, "memory(GiB)": 21.48, "step": 13011, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.952361 }, { "epoch": 0.42270084137348535, "grad_norm": 0.3310237228870392, "learning_rate": 6.659667889559295e-06, "loss": 0.01787382736802101, "memory(GiB)": 21.48, "step": 13012, "token_acc": 1.0, "train_speed(iter/s)": 0.952376 }, { "epoch": 0.42273332683624076, "grad_norm": 0.4906264543533325, "learning_rate": 6.659161182136279e-06, "loss": 0.026557384058833122, "memory(GiB)": 21.48, "step": 13013, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.952392 }, { "epoch": 0.4227658122989962, "grad_norm": 0.34076789021492004, "learning_rate": 6.658654455564334e-06, "loss": 0.025777393952012062, "memory(GiB)": 21.48, "step": 13014, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.952406 }, { "epoch": 0.4227982977617516, "grad_norm": 0.3827105760574341, "learning_rate": 6.65814770984931e-06, "loss": 0.023731987923383713, "memory(GiB)": 21.48, "step": 13015, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.952421 }, { "epoch": 0.422830783224507, "grad_norm": 0.3648832440376282, "learning_rate": 6.6576409449970525e-06, "loss": 0.0317440927028656, "memory(GiB)": 21.48, "step": 13016, "token_acc": 0.993127147766323, "train_speed(iter/s)": 0.952437 }, { "epoch": 0.42286326868726243, "grad_norm": 0.7892619967460632, "learning_rate": 6.657134161013414e-06, "loss": 0.023782238364219666, "memory(GiB)": 21.48, "step": 13017, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.952453 }, { "epoch": 0.42289575415001784, "grad_norm": 0.353166401386261, "learning_rate": 6.6566273579042415e-06, "loss": 0.024060554802417755, "memory(GiB)": 21.48, "step": 13018, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.952469 }, { "epoch": 0.42292823961277326, "grad_norm": 0.5014739036560059, "learning_rate": 6.656120535675384e-06, "loss": 0.020830940455198288, "memory(GiB)": 21.48, "step": 13019, "token_acc": 1.0, "train_speed(iter/s)": 0.952485 }, { "epoch": 0.4229607250755287, "grad_norm": 0.4784961938858032, "learning_rate": 6.655613694332691e-06, "loss": 0.02800096943974495, "memory(GiB)": 21.48, "step": 13020, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.952502 }, { "epoch": 0.4229932105382841, "grad_norm": 0.3544900715351105, "learning_rate": 6.655106833882011e-06, "loss": 0.021192612126469612, "memory(GiB)": 21.48, "step": 13021, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.952516 }, { "epoch": 0.4230256960010395, "grad_norm": 0.4438513517379761, "learning_rate": 6.654599954329197e-06, "loss": 0.034604355692863464, "memory(GiB)": 21.48, "step": 13022, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.952533 }, { "epoch": 0.4230581814637949, "grad_norm": 0.4551321864128113, "learning_rate": 6.654093055680097e-06, "loss": 0.029383130371570587, "memory(GiB)": 21.48, "step": 13023, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.952549 }, { "epoch": 0.4230906669265504, "grad_norm": 0.3212684690952301, "learning_rate": 6.653586137940562e-06, "loss": 0.0244804248213768, "memory(GiB)": 21.48, "step": 13024, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.952566 }, { "epoch": 0.4231231523893058, "grad_norm": 0.35586610436439514, "learning_rate": 6.65307920111644e-06, "loss": 0.020878339186310768, "memory(GiB)": 21.48, "step": 13025, "token_acc": 0.985, "train_speed(iter/s)": 0.952563 }, { "epoch": 0.42315563785206123, "grad_norm": 0.39488235116004944, "learning_rate": 6.652572245213586e-06, "loss": 0.020868739113211632, "memory(GiB)": 21.48, "step": 13026, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95258 }, { "epoch": 0.42318812331481664, "grad_norm": 0.2757139801979065, "learning_rate": 6.6520652702378475e-06, "loss": 0.018774867057800293, "memory(GiB)": 21.48, "step": 13027, "token_acc": 0.996, "train_speed(iter/s)": 0.952595 }, { "epoch": 0.42322060877757206, "grad_norm": 0.39009320735931396, "learning_rate": 6.651558276195077e-06, "loss": 0.022977273911237717, "memory(GiB)": 21.48, "step": 13028, "token_acc": 0.988, "train_speed(iter/s)": 0.952611 }, { "epoch": 0.4232530942403275, "grad_norm": 0.2981061041355133, "learning_rate": 6.651051263091126e-06, "loss": 0.019801009446382523, "memory(GiB)": 21.48, "step": 13029, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952627 }, { "epoch": 0.4232855797030829, "grad_norm": 0.41097038984298706, "learning_rate": 6.650544230931845e-06, "loss": 0.024595841765403748, "memory(GiB)": 21.48, "step": 13030, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.952643 }, { "epoch": 0.4233180651658383, "grad_norm": 0.3714140057563782, "learning_rate": 6.650037179723089e-06, "loss": 0.023085959255695343, "memory(GiB)": 21.48, "step": 13031, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.952659 }, { "epoch": 0.4233505506285937, "grad_norm": 0.46116435527801514, "learning_rate": 6.649530109470707e-06, "loss": 0.020274564623832703, "memory(GiB)": 21.48, "step": 13032, "token_acc": 1.0, "train_speed(iter/s)": 0.952675 }, { "epoch": 0.42338303609134914, "grad_norm": 0.4224643409252167, "learning_rate": 6.6490230201805504e-06, "loss": 0.023886386305093765, "memory(GiB)": 21.48, "step": 13033, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.95269 }, { "epoch": 0.42341552155410456, "grad_norm": 0.3149145841598511, "learning_rate": 6.648515911858475e-06, "loss": 0.02321033552289009, "memory(GiB)": 21.48, "step": 13034, "token_acc": 0.976, "train_speed(iter/s)": 0.952707 }, { "epoch": 0.42344800701686, "grad_norm": 0.3704341948032379, "learning_rate": 6.64800878451033e-06, "loss": 0.02465507760643959, "memory(GiB)": 21.48, "step": 13035, "token_acc": 1.0, "train_speed(iter/s)": 0.952722 }, { "epoch": 0.4234804924796154, "grad_norm": 0.27655288577079773, "learning_rate": 6.647501638141972e-06, "loss": 0.026913335546851158, "memory(GiB)": 21.48, "step": 13036, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.952739 }, { "epoch": 0.4235129779423708, "grad_norm": 0.37918150424957275, "learning_rate": 6.646994472759252e-06, "loss": 0.023352351039648056, "memory(GiB)": 21.48, "step": 13037, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.952754 }, { "epoch": 0.4235454634051262, "grad_norm": 0.4448554813861847, "learning_rate": 6.646487288368023e-06, "loss": 0.02075888402760029, "memory(GiB)": 21.48, "step": 13038, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.95277 }, { "epoch": 0.42357794886788164, "grad_norm": 0.4277845323085785, "learning_rate": 6.6459800849741384e-06, "loss": 0.03532809019088745, "memory(GiB)": 21.48, "step": 13039, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.952786 }, { "epoch": 0.42361043433063705, "grad_norm": 0.7459567785263062, "learning_rate": 6.645472862583454e-06, "loss": 0.02504020743072033, "memory(GiB)": 21.48, "step": 13040, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.952802 }, { "epoch": 0.42364291979339247, "grad_norm": 0.4112514853477478, "learning_rate": 6.644965621201823e-06, "loss": 0.027107637375593185, "memory(GiB)": 21.48, "step": 13041, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.952817 }, { "epoch": 0.4236754052561479, "grad_norm": 0.31452879309654236, "learning_rate": 6.644458360835098e-06, "loss": 0.023843906819820404, "memory(GiB)": 21.48, "step": 13042, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.952834 }, { "epoch": 0.4237078907189033, "grad_norm": 0.30644652247428894, "learning_rate": 6.643951081489135e-06, "loss": 0.022525951266288757, "memory(GiB)": 21.48, "step": 13043, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.952849 }, { "epoch": 0.4237403761816587, "grad_norm": 0.44603782892227173, "learning_rate": 6.643443783169789e-06, "loss": 0.02662733569741249, "memory(GiB)": 21.48, "step": 13044, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.952865 }, { "epoch": 0.42377286164441413, "grad_norm": 0.4227393567562103, "learning_rate": 6.642936465882914e-06, "loss": 0.02242317795753479, "memory(GiB)": 21.48, "step": 13045, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.952881 }, { "epoch": 0.42380534710716955, "grad_norm": 0.33339083194732666, "learning_rate": 6.642429129634367e-06, "loss": 0.023100925609469414, "memory(GiB)": 21.48, "step": 13046, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.952897 }, { "epoch": 0.42383783256992497, "grad_norm": 0.7224939465522766, "learning_rate": 6.64192177443e-06, "loss": 0.03202046826481819, "memory(GiB)": 21.48, "step": 13047, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.952909 }, { "epoch": 0.4238703180326804, "grad_norm": 0.5072411894798279, "learning_rate": 6.6414144002756705e-06, "loss": 0.028389867395162582, "memory(GiB)": 21.48, "step": 13048, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.952919 }, { "epoch": 0.4239028034954358, "grad_norm": 0.3582504093647003, "learning_rate": 6.640907007177234e-06, "loss": 0.014538910239934921, "memory(GiB)": 21.48, "step": 13049, "token_acc": 1.0, "train_speed(iter/s)": 0.952931 }, { "epoch": 0.4239352889581912, "grad_norm": 0.5118230581283569, "learning_rate": 6.640399595140548e-06, "loss": 0.018422085791826248, "memory(GiB)": 21.48, "step": 13050, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.952941 }, { "epoch": 0.42396777442094663, "grad_norm": 0.29642874002456665, "learning_rate": 6.639892164171465e-06, "loss": 0.021249920129776, "memory(GiB)": 21.48, "step": 13051, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.952951 }, { "epoch": 0.42400025988370205, "grad_norm": 0.3309210538864136, "learning_rate": 6.639384714275846e-06, "loss": 0.02298511192202568, "memory(GiB)": 21.48, "step": 13052, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.952962 }, { "epoch": 0.42403274534645746, "grad_norm": 0.45449307560920715, "learning_rate": 6.638877245459544e-06, "loss": 0.026390699669718742, "memory(GiB)": 21.48, "step": 13053, "token_acc": 0.974169741697417, "train_speed(iter/s)": 0.952973 }, { "epoch": 0.4240652308092129, "grad_norm": 0.4737618863582611, "learning_rate": 6.638369757728418e-06, "loss": 0.025891857221722603, "memory(GiB)": 21.48, "step": 13054, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.952983 }, { "epoch": 0.4240977162719683, "grad_norm": 0.3966209292411804, "learning_rate": 6.637862251088325e-06, "loss": 0.03259853273630142, "memory(GiB)": 21.48, "step": 13055, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.952993 }, { "epoch": 0.4241302017347237, "grad_norm": 0.3517668843269348, "learning_rate": 6.63735472554512e-06, "loss": 0.021643640473484993, "memory(GiB)": 21.48, "step": 13056, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.953003 }, { "epoch": 0.4241626871974791, "grad_norm": 0.3550403416156769, "learning_rate": 6.636847181104662e-06, "loss": 0.0186464823782444, "memory(GiB)": 21.48, "step": 13057, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.953014 }, { "epoch": 0.42419517266023454, "grad_norm": 0.39330312609672546, "learning_rate": 6.636339617772809e-06, "loss": 0.025983113795518875, "memory(GiB)": 21.48, "step": 13058, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.953024 }, { "epoch": 0.42422765812298996, "grad_norm": 0.37264466285705566, "learning_rate": 6.635832035555421e-06, "loss": 0.023977059870958328, "memory(GiB)": 21.48, "step": 13059, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.953036 }, { "epoch": 0.4242601435857454, "grad_norm": 0.394288569688797, "learning_rate": 6.6353244344583525e-06, "loss": 0.02615925297141075, "memory(GiB)": 21.48, "step": 13060, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953046 }, { "epoch": 0.4242926290485008, "grad_norm": 0.28562843799591064, "learning_rate": 6.634816814487462e-06, "loss": 0.01813610829412937, "memory(GiB)": 21.48, "step": 13061, "token_acc": 0.9967105263157895, "train_speed(iter/s)": 0.953057 }, { "epoch": 0.4243251145112562, "grad_norm": 0.3639778792858124, "learning_rate": 6.634309175648612e-06, "loss": 0.020772235468029976, "memory(GiB)": 21.48, "step": 13062, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953067 }, { "epoch": 0.4243575999740116, "grad_norm": 0.43483996391296387, "learning_rate": 6.6338015179476565e-06, "loss": 0.032423920929431915, "memory(GiB)": 21.48, "step": 13063, "token_acc": 0.996, "train_speed(iter/s)": 0.953077 }, { "epoch": 0.42439008543676704, "grad_norm": 0.31431227922439575, "learning_rate": 6.633293841390459e-06, "loss": 0.024023503065109253, "memory(GiB)": 21.48, "step": 13064, "token_acc": 0.9800664451827242, "train_speed(iter/s)": 0.953087 }, { "epoch": 0.42442257089952246, "grad_norm": 0.42392611503601074, "learning_rate": 6.632786145982876e-06, "loss": 0.02592877671122551, "memory(GiB)": 21.48, "step": 13065, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.953099 }, { "epoch": 0.42445505636227787, "grad_norm": 0.5117339491844177, "learning_rate": 6.632278431730768e-06, "loss": 0.024552050977945328, "memory(GiB)": 21.48, "step": 13066, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.953107 }, { "epoch": 0.4244875418250333, "grad_norm": 0.291239857673645, "learning_rate": 6.6317706986399935e-06, "loss": 0.01977677270770073, "memory(GiB)": 21.48, "step": 13067, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.953116 }, { "epoch": 0.4245200272877887, "grad_norm": 0.28621676564216614, "learning_rate": 6.631262946716413e-06, "loss": 0.022524315863847733, "memory(GiB)": 21.48, "step": 13068, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.953127 }, { "epoch": 0.4245525127505441, "grad_norm": 0.5083929896354675, "learning_rate": 6.6307551759658884e-06, "loss": 0.030011553317308426, "memory(GiB)": 21.48, "step": 13069, "token_acc": 0.975103734439834, "train_speed(iter/s)": 0.953139 }, { "epoch": 0.42458499821329954, "grad_norm": 0.27869895100593567, "learning_rate": 6.630247386394277e-06, "loss": 0.018873270601034164, "memory(GiB)": 21.48, "step": 13070, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.953152 }, { "epoch": 0.42461748367605495, "grad_norm": 0.36536797881126404, "learning_rate": 6.6297395780074425e-06, "loss": 0.02537820115685463, "memory(GiB)": 21.48, "step": 13071, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.953166 }, { "epoch": 0.42464996913881037, "grad_norm": 0.3946645259857178, "learning_rate": 6.629231750811242e-06, "loss": 0.026917049661278725, "memory(GiB)": 21.48, "step": 13072, "token_acc": 0.981042654028436, "train_speed(iter/s)": 0.953182 }, { "epoch": 0.4246824546015658, "grad_norm": 0.5651800036430359, "learning_rate": 6.628723904811539e-06, "loss": 0.02987632155418396, "memory(GiB)": 21.48, "step": 13073, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.953197 }, { "epoch": 0.4247149400643212, "grad_norm": 0.4297477900981903, "learning_rate": 6.628216040014196e-06, "loss": 0.027218671515583992, "memory(GiB)": 21.48, "step": 13074, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.953213 }, { "epoch": 0.4247474255270766, "grad_norm": 0.2542573809623718, "learning_rate": 6.627708156425071e-06, "loss": 0.01770368218421936, "memory(GiB)": 21.48, "step": 13075, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953229 }, { "epoch": 0.42477991098983203, "grad_norm": 0.2959039509296417, "learning_rate": 6.627200254050029e-06, "loss": 0.01310911774635315, "memory(GiB)": 21.48, "step": 13076, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.953245 }, { "epoch": 0.42481239645258745, "grad_norm": 0.3635832965373993, "learning_rate": 6.6266923328949286e-06, "loss": 0.02481343224644661, "memory(GiB)": 21.48, "step": 13077, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.953261 }, { "epoch": 0.42484488191534286, "grad_norm": 0.8103556036949158, "learning_rate": 6.626184392965636e-06, "loss": 0.032291751354932785, "memory(GiB)": 21.48, "step": 13078, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953276 }, { "epoch": 0.4248773673780983, "grad_norm": 0.3820737302303314, "learning_rate": 6.625676434268008e-06, "loss": 0.031056219711899757, "memory(GiB)": 21.48, "step": 13079, "token_acc": 1.0, "train_speed(iter/s)": 0.953291 }, { "epoch": 0.4249098528408537, "grad_norm": 0.30615004897117615, "learning_rate": 6.625168456807912e-06, "loss": 0.023111393675208092, "memory(GiB)": 21.48, "step": 13080, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.953307 }, { "epoch": 0.4249423383036091, "grad_norm": 0.4640271067619324, "learning_rate": 6.624660460591208e-06, "loss": 0.031527016311883926, "memory(GiB)": 21.48, "step": 13081, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953324 }, { "epoch": 0.42497482376636453, "grad_norm": 0.4195670187473297, "learning_rate": 6.624152445623761e-06, "loss": 0.0272764191031456, "memory(GiB)": 21.48, "step": 13082, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.95334 }, { "epoch": 0.42500730922911995, "grad_norm": 0.37437912821769714, "learning_rate": 6.623644411911433e-06, "loss": 0.0285311471670866, "memory(GiB)": 21.48, "step": 13083, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953355 }, { "epoch": 0.42503979469187536, "grad_norm": 0.3156569004058838, "learning_rate": 6.6231363594600876e-06, "loss": 0.02408391609787941, "memory(GiB)": 21.48, "step": 13084, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.953371 }, { "epoch": 0.4250722801546308, "grad_norm": 1.2367147207260132, "learning_rate": 6.622628288275587e-06, "loss": 0.027383167296648026, "memory(GiB)": 21.48, "step": 13085, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953386 }, { "epoch": 0.4251047656173862, "grad_norm": 0.2806466221809387, "learning_rate": 6.622120198363796e-06, "loss": 0.018855668604373932, "memory(GiB)": 21.48, "step": 13086, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.953401 }, { "epoch": 0.4251372510801416, "grad_norm": 0.42565983533859253, "learning_rate": 6.62161208973058e-06, "loss": 0.024105403572320938, "memory(GiB)": 21.48, "step": 13087, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953417 }, { "epoch": 0.4251697365428971, "grad_norm": 0.4258853793144226, "learning_rate": 6.6211039623818016e-06, "loss": 0.017644962295889854, "memory(GiB)": 21.48, "step": 13088, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953433 }, { "epoch": 0.4252022220056525, "grad_norm": 0.6014308929443359, "learning_rate": 6.620595816323325e-06, "loss": 0.021433210000395775, "memory(GiB)": 21.48, "step": 13089, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953449 }, { "epoch": 0.4252347074684079, "grad_norm": 0.8780843019485474, "learning_rate": 6.620087651561018e-06, "loss": 0.028772663325071335, "memory(GiB)": 21.48, "step": 13090, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.953464 }, { "epoch": 0.42526719293116333, "grad_norm": 0.5955225825309753, "learning_rate": 6.619579468100741e-06, "loss": 0.03363162651658058, "memory(GiB)": 21.48, "step": 13091, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.95348 }, { "epoch": 0.42529967839391875, "grad_norm": 0.5202310681343079, "learning_rate": 6.619071265948363e-06, "loss": 0.03036942332983017, "memory(GiB)": 21.48, "step": 13092, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.953496 }, { "epoch": 0.42533216385667416, "grad_norm": 0.42389973998069763, "learning_rate": 6.618563045109745e-06, "loss": 0.029597744345664978, "memory(GiB)": 21.48, "step": 13093, "token_acc": 0.992, "train_speed(iter/s)": 0.953512 }, { "epoch": 0.4253646493194296, "grad_norm": 0.3704468905925751, "learning_rate": 6.618054805590758e-06, "loss": 0.030637472867965698, "memory(GiB)": 21.48, "step": 13094, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.953528 }, { "epoch": 0.425397134782185, "grad_norm": 0.3662671148777008, "learning_rate": 6.617546547397264e-06, "loss": 0.03251533955335617, "memory(GiB)": 21.48, "step": 13095, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953544 }, { "epoch": 0.4254296202449404, "grad_norm": 0.6925240755081177, "learning_rate": 6.617038270535129e-06, "loss": 0.024979498237371445, "memory(GiB)": 21.48, "step": 13096, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95356 }, { "epoch": 0.4254621057076958, "grad_norm": 0.2987484633922577, "learning_rate": 6.616529975010222e-06, "loss": 0.023895855993032455, "memory(GiB)": 21.48, "step": 13097, "token_acc": 1.0, "train_speed(iter/s)": 0.953576 }, { "epoch": 0.42549459117045124, "grad_norm": 0.5625459551811218, "learning_rate": 6.616021660828404e-06, "loss": 0.022313019260764122, "memory(GiB)": 21.48, "step": 13098, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.953591 }, { "epoch": 0.42552707663320666, "grad_norm": 0.46872347593307495, "learning_rate": 6.6155133279955485e-06, "loss": 0.02885887771844864, "memory(GiB)": 21.48, "step": 13099, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.953607 }, { "epoch": 0.4255595620959621, "grad_norm": 0.2955735921859741, "learning_rate": 6.615004976517517e-06, "loss": 0.020087411627173424, "memory(GiB)": 21.48, "step": 13100, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953623 }, { "epoch": 0.4255920475587175, "grad_norm": 0.37042924761772156, "learning_rate": 6.61449660640018e-06, "loss": 0.025071827694773674, "memory(GiB)": 21.48, "step": 13101, "token_acc": 0.976878612716763, "train_speed(iter/s)": 0.953638 }, { "epoch": 0.4256245330214729, "grad_norm": 0.540256142616272, "learning_rate": 6.613988217649402e-06, "loss": 0.030205920338630676, "memory(GiB)": 21.48, "step": 13102, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.953654 }, { "epoch": 0.4256570184842283, "grad_norm": 0.4412440359592438, "learning_rate": 6.613479810271052e-06, "loss": 0.026215646415948868, "memory(GiB)": 21.48, "step": 13103, "token_acc": 1.0, "train_speed(iter/s)": 0.953668 }, { "epoch": 0.42568950394698374, "grad_norm": 0.566244900226593, "learning_rate": 6.6129713842709985e-06, "loss": 0.02806813269853592, "memory(GiB)": 21.48, "step": 13104, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.953684 }, { "epoch": 0.42572198940973915, "grad_norm": 0.5710088610649109, "learning_rate": 6.6124629396551064e-06, "loss": 0.02475706860423088, "memory(GiB)": 21.48, "step": 13105, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.9537 }, { "epoch": 0.42575447487249457, "grad_norm": 0.419429749250412, "learning_rate": 6.6119544764292475e-06, "loss": 0.023695167154073715, "memory(GiB)": 21.48, "step": 13106, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.953714 }, { "epoch": 0.42578696033525, "grad_norm": 0.38955962657928467, "learning_rate": 6.611445994599288e-06, "loss": 0.022087685763835907, "memory(GiB)": 21.48, "step": 13107, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953728 }, { "epoch": 0.4258194457980054, "grad_norm": 0.43921950459480286, "learning_rate": 6.610937494171097e-06, "loss": 0.03132016584277153, "memory(GiB)": 21.48, "step": 13108, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.953737 }, { "epoch": 0.4258519312607608, "grad_norm": 0.5002980828285217, "learning_rate": 6.610428975150542e-06, "loss": 0.022993136197328568, "memory(GiB)": 21.48, "step": 13109, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953749 }, { "epoch": 0.42588441672351623, "grad_norm": 0.32657337188720703, "learning_rate": 6.609920437543496e-06, "loss": 0.02201230078935623, "memory(GiB)": 21.48, "step": 13110, "token_acc": 1.0, "train_speed(iter/s)": 0.953761 }, { "epoch": 0.42591690218627165, "grad_norm": 0.3882383704185486, "learning_rate": 6.6094118813558224e-06, "loss": 0.023434069007635117, "memory(GiB)": 21.48, "step": 13111, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.953774 }, { "epoch": 0.42594938764902707, "grad_norm": 0.30997970700263977, "learning_rate": 6.608903306593395e-06, "loss": 0.020043667405843735, "memory(GiB)": 21.48, "step": 13112, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953785 }, { "epoch": 0.4259818731117825, "grad_norm": 0.37833455204963684, "learning_rate": 6.608394713262083e-06, "loss": 0.02685190550982952, "memory(GiB)": 21.48, "step": 13113, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953797 }, { "epoch": 0.4260143585745379, "grad_norm": 0.38854679465293884, "learning_rate": 6.607886101367753e-06, "loss": 0.022896472364664078, "memory(GiB)": 21.48, "step": 13114, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.953807 }, { "epoch": 0.4260468440372933, "grad_norm": 0.3889167308807373, "learning_rate": 6.607377470916278e-06, "loss": 0.03429830074310303, "memory(GiB)": 21.48, "step": 13115, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.953817 }, { "epoch": 0.42607932950004873, "grad_norm": 0.30987897515296936, "learning_rate": 6.606868821913528e-06, "loss": 0.01794116571545601, "memory(GiB)": 21.48, "step": 13116, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953828 }, { "epoch": 0.42611181496280415, "grad_norm": 0.6554765701293945, "learning_rate": 6.606360154365374e-06, "loss": 0.020728960633277893, "memory(GiB)": 21.48, "step": 13117, "token_acc": 1.0, "train_speed(iter/s)": 0.953839 }, { "epoch": 0.42614430042555956, "grad_norm": 0.31726568937301636, "learning_rate": 6.605851468277683e-06, "loss": 0.02006649225950241, "memory(GiB)": 21.48, "step": 13118, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.953849 }, { "epoch": 0.426176785888315, "grad_norm": 0.35069501399993896, "learning_rate": 6.6053427636563306e-06, "loss": 0.021757064387202263, "memory(GiB)": 21.48, "step": 13119, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.4262092713510704, "grad_norm": 0.448597252368927, "learning_rate": 6.604834040507185e-06, "loss": 0.02897416055202484, "memory(GiB)": 21.48, "step": 13120, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953871 }, { "epoch": 0.4262417568138258, "grad_norm": 0.5061982274055481, "learning_rate": 6.6043252988361195e-06, "loss": 0.024162575602531433, "memory(GiB)": 21.48, "step": 13121, "token_acc": 1.0, "train_speed(iter/s)": 0.95388 }, { "epoch": 0.42627424227658123, "grad_norm": 0.4789552092552185, "learning_rate": 6.603816538649005e-06, "loss": 0.036951616406440735, "memory(GiB)": 21.48, "step": 13122, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.953891 }, { "epoch": 0.42630672773933664, "grad_norm": 0.39325618743896484, "learning_rate": 6.603307759951712e-06, "loss": 0.027873490005731583, "memory(GiB)": 21.48, "step": 13123, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953902 }, { "epoch": 0.42633921320209206, "grad_norm": 0.3686586022377014, "learning_rate": 6.6027989627501145e-06, "loss": 0.021625462919473648, "memory(GiB)": 21.48, "step": 13124, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.4263716986648475, "grad_norm": 0.3267146944999695, "learning_rate": 6.602290147050084e-06, "loss": 0.026795774698257446, "memory(GiB)": 21.48, "step": 13125, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.4264041841276029, "grad_norm": 0.337956041097641, "learning_rate": 6.60178131285749e-06, "loss": 0.022837795317173004, "memory(GiB)": 21.48, "step": 13126, "token_acc": 1.0, "train_speed(iter/s)": 0.95393 }, { "epoch": 0.4264366695903583, "grad_norm": 6.387720108032227, "learning_rate": 6.60127246017821e-06, "loss": 0.03189687058329582, "memory(GiB)": 21.48, "step": 13127, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.953943 }, { "epoch": 0.4264691550531137, "grad_norm": 0.6020675301551819, "learning_rate": 6.600763589018113e-06, "loss": 0.02052384987473488, "memory(GiB)": 21.48, "step": 13128, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953956 }, { "epoch": 0.42650164051586914, "grad_norm": 0.3488600254058838, "learning_rate": 6.600254699383075e-06, "loss": 0.025158632546663284, "memory(GiB)": 21.48, "step": 13129, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.953969 }, { "epoch": 0.42653412597862456, "grad_norm": 0.3692373037338257, "learning_rate": 6.599745791278966e-06, "loss": 0.02457144297659397, "memory(GiB)": 21.48, "step": 13130, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.95398 }, { "epoch": 0.42656661144138, "grad_norm": 0.29444417357444763, "learning_rate": 6.599236864711664e-06, "loss": 0.021367140114307404, "memory(GiB)": 21.48, "step": 13131, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953992 }, { "epoch": 0.4265990969041354, "grad_norm": 0.37905213236808777, "learning_rate": 6.598727919687037e-06, "loss": 0.02787970006465912, "memory(GiB)": 21.48, "step": 13132, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954004 }, { "epoch": 0.4266315823668908, "grad_norm": 0.4166765511035919, "learning_rate": 6.598218956210964e-06, "loss": 0.03430186212062836, "memory(GiB)": 21.48, "step": 13133, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954015 }, { "epoch": 0.4266640678296462, "grad_norm": 0.5422077775001526, "learning_rate": 6.5977099742893155e-06, "loss": 0.025165695697069168, "memory(GiB)": 21.48, "step": 13134, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954029 }, { "epoch": 0.42669655329240164, "grad_norm": 0.3932333290576935, "learning_rate": 6.597200973927969e-06, "loss": 0.02520880103111267, "memory(GiB)": 21.48, "step": 13135, "token_acc": 0.9943502824858758, "train_speed(iter/s)": 0.954043 }, { "epoch": 0.42672903875515705, "grad_norm": 0.3820123076438904, "learning_rate": 6.596691955132797e-06, "loss": 0.026364818215370178, "memory(GiB)": 21.48, "step": 13136, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.42676152421791247, "grad_norm": 0.4645267128944397, "learning_rate": 6.596182917909673e-06, "loss": 0.023985471576452255, "memory(GiB)": 21.48, "step": 13137, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.4267940096806679, "grad_norm": 0.3971051871776581, "learning_rate": 6.5956738622644755e-06, "loss": 0.028857357800006866, "memory(GiB)": 21.48, "step": 13138, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.954088 }, { "epoch": 0.4268264951434233, "grad_norm": 0.3596678674221039, "learning_rate": 6.5951647882030776e-06, "loss": 0.0235573910176754, "memory(GiB)": 21.48, "step": 13139, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954103 }, { "epoch": 0.4268589806061787, "grad_norm": 0.20965614914894104, "learning_rate": 6.594655695731353e-06, "loss": 0.018613409250974655, "memory(GiB)": 21.48, "step": 13140, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954119 }, { "epoch": 0.42689146606893413, "grad_norm": 0.31591007113456726, "learning_rate": 6.594146584855182e-06, "loss": 0.024229664355516434, "memory(GiB)": 21.48, "step": 13141, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.42692395153168955, "grad_norm": 0.3909352421760559, "learning_rate": 6.593637455580436e-06, "loss": 0.017264515161514282, "memory(GiB)": 21.48, "step": 13142, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95415 }, { "epoch": 0.42695643699444497, "grad_norm": 0.3263686001300812, "learning_rate": 6.593128307912993e-06, "loss": 0.018293116241693497, "memory(GiB)": 21.48, "step": 13143, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.954166 }, { "epoch": 0.4269889224572004, "grad_norm": 0.4897024929523468, "learning_rate": 6.592619141858728e-06, "loss": 0.02887553721666336, "memory(GiB)": 21.48, "step": 13144, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954181 }, { "epoch": 0.4270214079199558, "grad_norm": 0.3832980990409851, "learning_rate": 6.59210995742352e-06, "loss": 0.0266434196382761, "memory(GiB)": 21.48, "step": 13145, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954196 }, { "epoch": 0.4270538933827112, "grad_norm": 0.25904184579849243, "learning_rate": 6.591600754613243e-06, "loss": 0.018111873418092728, "memory(GiB)": 21.48, "step": 13146, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.954212 }, { "epoch": 0.42708637884546663, "grad_norm": 0.2713269293308258, "learning_rate": 6.591091533433776e-06, "loss": 0.016905123367905617, "memory(GiB)": 21.48, "step": 13147, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954227 }, { "epoch": 0.42711886430822205, "grad_norm": 0.42549917101860046, "learning_rate": 6.590582293890993e-06, "loss": 0.030702415853738785, "memory(GiB)": 21.48, "step": 13148, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954242 }, { "epoch": 0.42715134977097746, "grad_norm": 0.2870936095714569, "learning_rate": 6.590073035990774e-06, "loss": 0.022730693221092224, "memory(GiB)": 21.48, "step": 13149, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954259 }, { "epoch": 0.4271838352337329, "grad_norm": 0.4015931785106659, "learning_rate": 6.589563759738996e-06, "loss": 0.02917923964560032, "memory(GiB)": 21.48, "step": 13150, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.954274 }, { "epoch": 0.4272163206964883, "grad_norm": 0.5570049285888672, "learning_rate": 6.5890544651415354e-06, "loss": 0.039463918656110764, "memory(GiB)": 21.48, "step": 13151, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.954291 }, { "epoch": 0.42724880615924377, "grad_norm": 0.3322058618068695, "learning_rate": 6.588545152204273e-06, "loss": 0.026034250855445862, "memory(GiB)": 21.48, "step": 13152, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954306 }, { "epoch": 0.4272812916219992, "grad_norm": 0.3738582730293274, "learning_rate": 6.588035820933085e-06, "loss": 0.024308383464813232, "memory(GiB)": 21.48, "step": 13153, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.954322 }, { "epoch": 0.4273137770847546, "grad_norm": 0.3188973367214203, "learning_rate": 6.58752647133385e-06, "loss": 0.023429974913597107, "memory(GiB)": 21.48, "step": 13154, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.954338 }, { "epoch": 0.42734626254751, "grad_norm": 0.29538455605506897, "learning_rate": 6.5870171034124454e-06, "loss": 0.023573577404022217, "memory(GiB)": 21.48, "step": 13155, "token_acc": 0.995, "train_speed(iter/s)": 0.954354 }, { "epoch": 0.42737874801026543, "grad_norm": 0.34152206778526306, "learning_rate": 6.586507717174752e-06, "loss": 0.021361513063311577, "memory(GiB)": 21.48, "step": 13156, "token_acc": 1.0, "train_speed(iter/s)": 0.954368 }, { "epoch": 0.42741123347302085, "grad_norm": 0.2951725423336029, "learning_rate": 6.585998312626648e-06, "loss": 0.019482579082250595, "memory(GiB)": 21.48, "step": 13157, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954384 }, { "epoch": 0.42744371893577626, "grad_norm": 0.2891232371330261, "learning_rate": 6.585488889774011e-06, "loss": 0.026122471317648888, "memory(GiB)": 21.48, "step": 13158, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.954399 }, { "epoch": 0.4274762043985317, "grad_norm": 0.3303494453430176, "learning_rate": 6.584979448622724e-06, "loss": 0.01690504513680935, "memory(GiB)": 21.48, "step": 13159, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.954415 }, { "epoch": 0.4275086898612871, "grad_norm": 0.8405306339263916, "learning_rate": 6.5844699891786635e-06, "loss": 0.037816666066646576, "memory(GiB)": 21.48, "step": 13160, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954429 }, { "epoch": 0.4275411753240425, "grad_norm": 1.0798910856246948, "learning_rate": 6.5839605114477115e-06, "loss": 0.02679922990500927, "memory(GiB)": 21.48, "step": 13161, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.954445 }, { "epoch": 0.4275736607867979, "grad_norm": 0.3557901382446289, "learning_rate": 6.5834510154357455e-06, "loss": 0.020968971773982048, "memory(GiB)": 21.48, "step": 13162, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95446 }, { "epoch": 0.42760614624955334, "grad_norm": 0.4794628322124481, "learning_rate": 6.582941501148647e-06, "loss": 0.02902406081557274, "memory(GiB)": 21.48, "step": 13163, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.954475 }, { "epoch": 0.42763863171230876, "grad_norm": 0.3306155204772949, "learning_rate": 6.582431968592298e-06, "loss": 0.018013320863246918, "memory(GiB)": 21.48, "step": 13164, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95449 }, { "epoch": 0.4276711171750642, "grad_norm": 0.3974855840206146, "learning_rate": 6.581922417772576e-06, "loss": 0.026970136910676956, "memory(GiB)": 21.48, "step": 13165, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954506 }, { "epoch": 0.4277036026378196, "grad_norm": 0.37984129786491394, "learning_rate": 6.581412848695365e-06, "loss": 0.01740792579948902, "memory(GiB)": 21.48, "step": 13166, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954522 }, { "epoch": 0.427736088100575, "grad_norm": 0.2594148814678192, "learning_rate": 6.580903261366546e-06, "loss": 0.020796937867999077, "memory(GiB)": 21.48, "step": 13167, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954536 }, { "epoch": 0.4277685735633304, "grad_norm": 0.2928861081600189, "learning_rate": 6.580393655791998e-06, "loss": 0.015165526419878006, "memory(GiB)": 21.48, "step": 13168, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954546 }, { "epoch": 0.42780105902608584, "grad_norm": 0.42864444851875305, "learning_rate": 6.579884031977604e-06, "loss": 0.022386465221643448, "memory(GiB)": 21.48, "step": 13169, "token_acc": 1.0, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.42783354448884126, "grad_norm": 0.3139040172100067, "learning_rate": 6.579374389929245e-06, "loss": 0.017958058044314384, "memory(GiB)": 21.48, "step": 13170, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95457 }, { "epoch": 0.42786602995159667, "grad_norm": 0.3520325720310211, "learning_rate": 6.578864729652803e-06, "loss": 0.026911213994026184, "memory(GiB)": 21.48, "step": 13171, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954583 }, { "epoch": 0.4278985154143521, "grad_norm": 0.40644949674606323, "learning_rate": 6.578355051154162e-06, "loss": 0.02779814787209034, "memory(GiB)": 21.48, "step": 13172, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.4279310008771075, "grad_norm": 0.41180431842803955, "learning_rate": 6.577845354439202e-06, "loss": 0.03008611872792244, "memory(GiB)": 21.48, "step": 13173, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.954608 }, { "epoch": 0.4279634863398629, "grad_norm": 0.579454779624939, "learning_rate": 6.577335639513807e-06, "loss": 0.02822956256568432, "memory(GiB)": 21.48, "step": 13174, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.954618 }, { "epoch": 0.42799597180261834, "grad_norm": 0.3486911952495575, "learning_rate": 6.576825906383859e-06, "loss": 0.025960225611925125, "memory(GiB)": 21.48, "step": 13175, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.954628 }, { "epoch": 0.42802845726537375, "grad_norm": 0.42126694321632385, "learning_rate": 6.576316155055241e-06, "loss": 0.025652162730693817, "memory(GiB)": 21.48, "step": 13176, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.954638 }, { "epoch": 0.42806094272812917, "grad_norm": 0.38661760091781616, "learning_rate": 6.575806385533836e-06, "loss": 0.024115504696965218, "memory(GiB)": 21.48, "step": 13177, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.4280934281908846, "grad_norm": 0.4569051265716553, "learning_rate": 6.575296597825529e-06, "loss": 0.02780214697122574, "memory(GiB)": 21.48, "step": 13178, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.954658 }, { "epoch": 0.42812591365364, "grad_norm": 0.47211262583732605, "learning_rate": 6.574786791936202e-06, "loss": 0.028246749192476273, "memory(GiB)": 21.48, "step": 13179, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.954669 }, { "epoch": 0.4281583991163954, "grad_norm": 0.366094172000885, "learning_rate": 6.57427696787174e-06, "loss": 0.027440626174211502, "memory(GiB)": 21.48, "step": 13180, "token_acc": 1.0, "train_speed(iter/s)": 0.954679 }, { "epoch": 0.42819088457915083, "grad_norm": 0.2905857264995575, "learning_rate": 6.573767125638024e-06, "loss": 0.022480105981230736, "memory(GiB)": 21.48, "step": 13181, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95469 }, { "epoch": 0.42822337004190625, "grad_norm": 0.4148296117782593, "learning_rate": 6.573257265240944e-06, "loss": 0.026726391166448593, "memory(GiB)": 21.48, "step": 13182, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954701 }, { "epoch": 0.42825585550466166, "grad_norm": 0.4028339087963104, "learning_rate": 6.57274738668638e-06, "loss": 0.02176687866449356, "memory(GiB)": 21.48, "step": 13183, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954711 }, { "epoch": 0.4282883409674171, "grad_norm": 0.2997998297214508, "learning_rate": 6.572237489980217e-06, "loss": 0.01983707956969738, "memory(GiB)": 21.48, "step": 13184, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.954721 }, { "epoch": 0.4283208264301725, "grad_norm": 0.37829914689064026, "learning_rate": 6.571727575128342e-06, "loss": 0.02141560986638069, "memory(GiB)": 21.48, "step": 13185, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.4283533118929279, "grad_norm": 0.2735448479652405, "learning_rate": 6.571217642136638e-06, "loss": 0.022484108805656433, "memory(GiB)": 21.48, "step": 13186, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954742 }, { "epoch": 0.42838579735568333, "grad_norm": 0.2565761208534241, "learning_rate": 6.570707691010992e-06, "loss": 0.019878793507814407, "memory(GiB)": 21.48, "step": 13187, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.954754 }, { "epoch": 0.42841828281843874, "grad_norm": 0.2727213203907013, "learning_rate": 6.570197721757288e-06, "loss": 0.02630722150206566, "memory(GiB)": 21.48, "step": 13188, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954767 }, { "epoch": 0.42845076828119416, "grad_norm": 0.47435495257377625, "learning_rate": 6.569687734381413e-06, "loss": 0.01766112446784973, "memory(GiB)": 21.48, "step": 13189, "token_acc": 1.0, "train_speed(iter/s)": 0.95478 }, { "epoch": 0.4284832537439496, "grad_norm": 0.3855539858341217, "learning_rate": 6.56917772888925e-06, "loss": 0.02381177619099617, "memory(GiB)": 21.48, "step": 13190, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954792 }, { "epoch": 0.428515739206705, "grad_norm": 0.33130595088005066, "learning_rate": 6.56866770528669e-06, "loss": 0.02407408505678177, "memory(GiB)": 21.48, "step": 13191, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.954803 }, { "epoch": 0.4285482246694604, "grad_norm": 0.35508689284324646, "learning_rate": 6.5681576635796165e-06, "loss": 0.017264360561966896, "memory(GiB)": 21.48, "step": 13192, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.954814 }, { "epoch": 0.4285807101322158, "grad_norm": 0.39334574341773987, "learning_rate": 6.567647603773914e-06, "loss": 0.03166920691728592, "memory(GiB)": 21.48, "step": 13193, "token_acc": 0.995, "train_speed(iter/s)": 0.954824 }, { "epoch": 0.42861319559497124, "grad_norm": 0.3790408670902252, "learning_rate": 6.567137525875475e-06, "loss": 0.02085103839635849, "memory(GiB)": 21.48, "step": 13194, "token_acc": 1.0, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.42864568105772666, "grad_norm": 0.31556811928749084, "learning_rate": 6.566627429890181e-06, "loss": 0.019290000200271606, "memory(GiB)": 21.48, "step": 13195, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954847 }, { "epoch": 0.4286781665204821, "grad_norm": 0.5602611303329468, "learning_rate": 6.566117315823921e-06, "loss": 0.024946361780166626, "memory(GiB)": 21.48, "step": 13196, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95486 }, { "epoch": 0.4287106519832375, "grad_norm": 0.30332547426223755, "learning_rate": 6.565607183682584e-06, "loss": 0.023197975009679794, "memory(GiB)": 21.48, "step": 13197, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.954872 }, { "epoch": 0.4287431374459929, "grad_norm": 0.46304985880851746, "learning_rate": 6.5650970334720555e-06, "loss": 0.022739112377166748, "memory(GiB)": 21.48, "step": 13198, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.4287756229087483, "grad_norm": 0.4799381196498871, "learning_rate": 6.564586865198223e-06, "loss": 0.0352591909468174, "memory(GiB)": 21.48, "step": 13199, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.954903 }, { "epoch": 0.42880810837150374, "grad_norm": 0.31437525153160095, "learning_rate": 6.564076678866977e-06, "loss": 0.018339287489652634, "memory(GiB)": 21.48, "step": 13200, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954918 }, { "epoch": 0.42884059383425915, "grad_norm": 0.2813704311847687, "learning_rate": 6.563566474484205e-06, "loss": 0.02029632031917572, "memory(GiB)": 21.48, "step": 13201, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.42887307929701457, "grad_norm": 0.312641441822052, "learning_rate": 6.563056252055794e-06, "loss": 0.019518744200468063, "memory(GiB)": 21.48, "step": 13202, "token_acc": 1.0, "train_speed(iter/s)": 0.954948 }, { "epoch": 0.42890556475977, "grad_norm": 0.3679584264755249, "learning_rate": 6.562546011587634e-06, "loss": 0.029489774256944656, "memory(GiB)": 21.48, "step": 13203, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.4289380502225254, "grad_norm": 0.3696790337562561, "learning_rate": 6.562035753085612e-06, "loss": 0.02279374748468399, "memory(GiB)": 21.48, "step": 13204, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954978 }, { "epoch": 0.4289705356852808, "grad_norm": 0.24760280549526215, "learning_rate": 6.561525476555619e-06, "loss": 0.016823220998048782, "memory(GiB)": 21.48, "step": 13205, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954993 }, { "epoch": 0.42900302114803623, "grad_norm": 0.43444108963012695, "learning_rate": 6.5610151820035454e-06, "loss": 0.027920950204133987, "memory(GiB)": 21.48, "step": 13206, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955008 }, { "epoch": 0.42903550661079165, "grad_norm": 0.31062746047973633, "learning_rate": 6.560504869435276e-06, "loss": 0.024856088683009148, "memory(GiB)": 21.48, "step": 13207, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.955023 }, { "epoch": 0.42906799207354707, "grad_norm": 0.4915595054626465, "learning_rate": 6.559994538856705e-06, "loss": 0.0249347984790802, "memory(GiB)": 21.48, "step": 13208, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955037 }, { "epoch": 0.4291004775363025, "grad_norm": 2.2419726848602295, "learning_rate": 6.55948419027372e-06, "loss": 0.025808952748775482, "memory(GiB)": 21.48, "step": 13209, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95505 }, { "epoch": 0.4291329629990579, "grad_norm": 0.4491397738456726, "learning_rate": 6.558973823692212e-06, "loss": 0.029253371059894562, "memory(GiB)": 21.48, "step": 13210, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955066 }, { "epoch": 0.4291654484618133, "grad_norm": 0.31107887625694275, "learning_rate": 6.558463439118071e-06, "loss": 0.016731631010770798, "memory(GiB)": 21.48, "step": 13211, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955082 }, { "epoch": 0.42919793392456873, "grad_norm": 0.3427588641643524, "learning_rate": 6.557953036557188e-06, "loss": 0.023577241227030754, "memory(GiB)": 21.48, "step": 13212, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955098 }, { "epoch": 0.42923041938732415, "grad_norm": 0.2616254687309265, "learning_rate": 6.557442616015452e-06, "loss": 0.02127610333263874, "memory(GiB)": 21.48, "step": 13213, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955112 }, { "epoch": 0.42926290485007956, "grad_norm": 0.6748915314674377, "learning_rate": 6.556932177498756e-06, "loss": 0.039730384945869446, "memory(GiB)": 21.48, "step": 13214, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.429295390312835, "grad_norm": 0.4788500666618347, "learning_rate": 6.556421721012991e-06, "loss": 0.029109681025147438, "memory(GiB)": 21.48, "step": 13215, "token_acc": 1.0, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.42932787577559045, "grad_norm": 0.33751538395881653, "learning_rate": 6.555911246564047e-06, "loss": 0.02324288710951805, "memory(GiB)": 21.48, "step": 13216, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955159 }, { "epoch": 0.42936036123834587, "grad_norm": 0.3390659987926483, "learning_rate": 6.555400754157818e-06, "loss": 0.022018898278474808, "memory(GiB)": 21.48, "step": 13217, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.955175 }, { "epoch": 0.4293928467011013, "grad_norm": 0.3324240446090698, "learning_rate": 6.554890243800191e-06, "loss": 0.016366004943847656, "memory(GiB)": 21.48, "step": 13218, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.4294253321638567, "grad_norm": 0.32402241230010986, "learning_rate": 6.554379715497065e-06, "loss": 0.020770100876688957, "memory(GiB)": 21.48, "step": 13219, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.4294578176266121, "grad_norm": 0.3257766664028168, "learning_rate": 6.553869169254325e-06, "loss": 0.021947426721453667, "memory(GiB)": 21.48, "step": 13220, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955219 }, { "epoch": 0.42949030308936753, "grad_norm": 0.49175283312797546, "learning_rate": 6.553358605077867e-06, "loss": 0.030492426827549934, "memory(GiB)": 21.48, "step": 13221, "token_acc": 1.0, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.42952278855212295, "grad_norm": 0.340361624956131, "learning_rate": 6.552848022973586e-06, "loss": 0.0219285786151886, "memory(GiB)": 21.48, "step": 13222, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.95525 }, { "epoch": 0.42955527401487836, "grad_norm": 0.3259185552597046, "learning_rate": 6.552337422947368e-06, "loss": 0.014371901750564575, "memory(GiB)": 21.48, "step": 13223, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955265 }, { "epoch": 0.4295877594776338, "grad_norm": 0.3716428875923157, "learning_rate": 6.551826805005113e-06, "loss": 0.023362349718809128, "memory(GiB)": 21.48, "step": 13224, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.4296202449403892, "grad_norm": 0.7213587760925293, "learning_rate": 6.551316169152709e-06, "loss": 0.028078608214855194, "memory(GiB)": 21.48, "step": 13225, "token_acc": 0.99, "train_speed(iter/s)": 0.955297 }, { "epoch": 0.4296527304031446, "grad_norm": 0.40351995825767517, "learning_rate": 6.550805515396053e-06, "loss": 0.030317623168230057, "memory(GiB)": 21.48, "step": 13226, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955312 }, { "epoch": 0.4296852158659, "grad_norm": 0.42230886220932007, "learning_rate": 6.550294843741036e-06, "loss": 0.02690049260854721, "memory(GiB)": 21.48, "step": 13227, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.42971770132865544, "grad_norm": 0.42368945479393005, "learning_rate": 6.549784154193553e-06, "loss": 0.028951678425073624, "memory(GiB)": 21.48, "step": 13228, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955336 }, { "epoch": 0.42975018679141086, "grad_norm": 0.4011102318763733, "learning_rate": 6.549273446759501e-06, "loss": 0.03006787970662117, "memory(GiB)": 21.48, "step": 13229, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95535 }, { "epoch": 0.4297826722541663, "grad_norm": 0.4310900866985321, "learning_rate": 6.548762721444768e-06, "loss": 0.026414617896080017, "memory(GiB)": 21.48, "step": 13230, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.4298151577169217, "grad_norm": 0.6585534811019897, "learning_rate": 6.548251978255253e-06, "loss": 0.02546626329421997, "memory(GiB)": 21.48, "step": 13231, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955374 }, { "epoch": 0.4298476431796771, "grad_norm": 0.33055058121681213, "learning_rate": 6.54774121719685e-06, "loss": 0.02368847280740738, "memory(GiB)": 21.48, "step": 13232, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955386 }, { "epoch": 0.4298801286424325, "grad_norm": 0.29609623551368713, "learning_rate": 6.547230438275454e-06, "loss": 0.019705872982740402, "memory(GiB)": 21.48, "step": 13233, "token_acc": 1.0, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.42991261410518794, "grad_norm": 0.24718177318572998, "learning_rate": 6.546719641496958e-06, "loss": 0.017588283866643906, "memory(GiB)": 21.48, "step": 13234, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.42994509956794336, "grad_norm": 0.30974751710891724, "learning_rate": 6.546208826867259e-06, "loss": 0.016721302643418312, "memory(GiB)": 21.48, "step": 13235, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.4299775850306988, "grad_norm": 0.3977084755897522, "learning_rate": 6.545697994392252e-06, "loss": 0.031867414712905884, "memory(GiB)": 21.48, "step": 13236, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.4300100704934542, "grad_norm": 0.448896586894989, "learning_rate": 6.5451871440778326e-06, "loss": 0.028622880578041077, "memory(GiB)": 21.48, "step": 13237, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955449 }, { "epoch": 0.4300425559562096, "grad_norm": 0.4457450211048126, "learning_rate": 6.544676275929898e-06, "loss": 0.02494385652244091, "memory(GiB)": 21.48, "step": 13238, "token_acc": 1.0, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.430075041418965, "grad_norm": 0.3259454071521759, "learning_rate": 6.5441653899543415e-06, "loss": 0.018239203840494156, "memory(GiB)": 21.48, "step": 13239, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.43010752688172044, "grad_norm": 0.3116110563278198, "learning_rate": 6.543654486157063e-06, "loss": 0.027201686054468155, "memory(GiB)": 21.48, "step": 13240, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955481 }, { "epoch": 0.43014001234447585, "grad_norm": 0.30459079146385193, "learning_rate": 6.5431435645439545e-06, "loss": 0.020400654524564743, "memory(GiB)": 21.48, "step": 13241, "token_acc": 1.0, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.43017249780723127, "grad_norm": 0.23158080875873566, "learning_rate": 6.542632625120917e-06, "loss": 0.014698858372867107, "memory(GiB)": 21.48, "step": 13242, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955501 }, { "epoch": 0.4302049832699867, "grad_norm": 0.3527534306049347, "learning_rate": 6.542121667893846e-06, "loss": 0.019144371151924133, "memory(GiB)": 21.48, "step": 13243, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.4302374687327421, "grad_norm": 0.33065348863601685, "learning_rate": 6.541610692868639e-06, "loss": 0.022259794175624847, "memory(GiB)": 21.48, "step": 13244, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.4302699541954975, "grad_norm": 0.23751437664031982, "learning_rate": 6.541099700051191e-06, "loss": 0.017834480851888657, "memory(GiB)": 21.48, "step": 13245, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.43030243965825293, "grad_norm": 0.37493693828582764, "learning_rate": 6.5405886894474024e-06, "loss": 0.0208276454359293, "memory(GiB)": 21.48, "step": 13246, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.43033492512100835, "grad_norm": 0.3192111551761627, "learning_rate": 6.54007766106317e-06, "loss": 0.02108190208673477, "memory(GiB)": 21.48, "step": 13247, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955559 }, { "epoch": 0.43036741058376377, "grad_norm": 0.40011414885520935, "learning_rate": 6.539566614904391e-06, "loss": 0.029485899955034256, "memory(GiB)": 21.48, "step": 13248, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.4303998960465192, "grad_norm": 0.20541159808635712, "learning_rate": 6.539055550976964e-06, "loss": 0.01427574921399355, "memory(GiB)": 21.48, "step": 13249, "token_acc": 1.0, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.4304323815092746, "grad_norm": 0.49201226234436035, "learning_rate": 6.538544469286788e-06, "loss": 0.03208088129758835, "memory(GiB)": 21.48, "step": 13250, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.43046486697203, "grad_norm": 0.3223872482776642, "learning_rate": 6.5380333698397605e-06, "loss": 0.020832421258091927, "memory(GiB)": 21.48, "step": 13251, "token_acc": 1.0, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.43049735243478543, "grad_norm": 0.3491860628128052, "learning_rate": 6.537522252641781e-06, "loss": 0.0197284035384655, "memory(GiB)": 21.48, "step": 13252, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.43052983789754085, "grad_norm": 0.33326926827430725, "learning_rate": 6.537011117698749e-06, "loss": 0.021145006641745567, "memory(GiB)": 21.48, "step": 13253, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.43056232336029626, "grad_norm": 0.4530177414417267, "learning_rate": 6.536499965016562e-06, "loss": 0.02850562334060669, "memory(GiB)": 21.48, "step": 13254, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.4305948088230517, "grad_norm": 0.3953760862350464, "learning_rate": 6.53598879460112e-06, "loss": 0.025194015353918076, "memory(GiB)": 21.48, "step": 13255, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95566 }, { "epoch": 0.4306272942858071, "grad_norm": 0.3481040894985199, "learning_rate": 6.535477606458323e-06, "loss": 0.01777886226773262, "memory(GiB)": 21.48, "step": 13256, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.4306597797485625, "grad_norm": 0.39088043570518494, "learning_rate": 6.534966400594073e-06, "loss": 0.025780368596315384, "memory(GiB)": 21.48, "step": 13257, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.4306922652113179, "grad_norm": 0.6148903965950012, "learning_rate": 6.534455177014264e-06, "loss": 0.032900240272283554, "memory(GiB)": 21.48, "step": 13258, "token_acc": 0.992, "train_speed(iter/s)": 0.955697 }, { "epoch": 0.43072475067407334, "grad_norm": 0.3607047200202942, "learning_rate": 6.533943935724803e-06, "loss": 0.029919365420937538, "memory(GiB)": 21.48, "step": 13259, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.43075723613682876, "grad_norm": 0.33891844749450684, "learning_rate": 6.533432676731584e-06, "loss": 0.02091970294713974, "memory(GiB)": 21.48, "step": 13260, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.4307897215995842, "grad_norm": 0.3804610073566437, "learning_rate": 6.532921400040514e-06, "loss": 0.023702409118413925, "memory(GiB)": 21.48, "step": 13261, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.4308222070623396, "grad_norm": 0.3986613154411316, "learning_rate": 6.532410105657491e-06, "loss": 0.031429074704647064, "memory(GiB)": 21.48, "step": 13262, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.430854692525095, "grad_norm": 0.4254777431488037, "learning_rate": 6.531898793588414e-06, "loss": 0.02835913561284542, "memory(GiB)": 21.48, "step": 13263, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.4308871779878504, "grad_norm": 0.48283636569976807, "learning_rate": 6.531387463839186e-06, "loss": 0.03057492896914482, "memory(GiB)": 21.48, "step": 13264, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.955778 }, { "epoch": 0.43091966345060584, "grad_norm": 0.3494753837585449, "learning_rate": 6.530876116415708e-06, "loss": 0.02578536793589592, "memory(GiB)": 21.48, "step": 13265, "token_acc": 1.0, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.43095214891336125, "grad_norm": 0.3786960244178772, "learning_rate": 6.530364751323883e-06, "loss": 0.025374513119459152, "memory(GiB)": 21.48, "step": 13266, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.955808 }, { "epoch": 0.43098463437611667, "grad_norm": 0.2997688055038452, "learning_rate": 6.529853368569612e-06, "loss": 0.02092209830880165, "memory(GiB)": 21.48, "step": 13267, "token_acc": 0.996, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.4310171198388721, "grad_norm": 0.8619086146354675, "learning_rate": 6.529341968158798e-06, "loss": 0.04590278118848801, "memory(GiB)": 21.48, "step": 13268, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.955838 }, { "epoch": 0.4310496053016275, "grad_norm": 0.42673152685165405, "learning_rate": 6.5288305500973395e-06, "loss": 0.027305515483021736, "memory(GiB)": 21.48, "step": 13269, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.4310820907643829, "grad_norm": 0.5040025115013123, "learning_rate": 6.528319114391144e-06, "loss": 0.026801932603120804, "memory(GiB)": 21.48, "step": 13270, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955869 }, { "epoch": 0.43111457622713834, "grad_norm": 0.3590799570083618, "learning_rate": 6.527807661046111e-06, "loss": 0.02500893734395504, "memory(GiB)": 21.48, "step": 13271, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.43114706168989375, "grad_norm": 0.31208476424217224, "learning_rate": 6.527296190068142e-06, "loss": 0.02432161569595337, "memory(GiB)": 21.48, "step": 13272, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.9559 }, { "epoch": 0.43117954715264917, "grad_norm": 0.6446510553359985, "learning_rate": 6.5267847014631434e-06, "loss": 0.02167677879333496, "memory(GiB)": 21.48, "step": 13273, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955915 }, { "epoch": 0.4312120326154046, "grad_norm": 0.47418782114982605, "learning_rate": 6.5262731952370176e-06, "loss": 0.023750202730298042, "memory(GiB)": 21.48, "step": 13274, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.43124451807816, "grad_norm": 0.3505288064479828, "learning_rate": 6.525761671395668e-06, "loss": 0.027715805917978287, "memory(GiB)": 21.48, "step": 13275, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.955945 }, { "epoch": 0.4312770035409154, "grad_norm": 0.4153233766555786, "learning_rate": 6.525250129944998e-06, "loss": 0.02725161984562874, "memory(GiB)": 21.48, "step": 13276, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.95596 }, { "epoch": 0.43130948900367083, "grad_norm": 0.4114663302898407, "learning_rate": 6.5247385708909104e-06, "loss": 0.023519521579146385, "memory(GiB)": 21.48, "step": 13277, "token_acc": 0.9896193771626297, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.43134197446642625, "grad_norm": 0.34223830699920654, "learning_rate": 6.524226994239311e-06, "loss": 0.017774758860468864, "memory(GiB)": 21.48, "step": 13278, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.955991 }, { "epoch": 0.43137445992918166, "grad_norm": 0.25831344723701477, "learning_rate": 6.5237153999961035e-06, "loss": 0.02201053872704506, "memory(GiB)": 21.48, "step": 13279, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.43140694539193714, "grad_norm": 0.3047724664211273, "learning_rate": 6.523203788167192e-06, "loss": 0.021452998742461205, "memory(GiB)": 21.48, "step": 13280, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956018 }, { "epoch": 0.43143943085469255, "grad_norm": 0.41373011469841003, "learning_rate": 6.5226921587584815e-06, "loss": 0.03169407695531845, "memory(GiB)": 21.48, "step": 13281, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956032 }, { "epoch": 0.43147191631744797, "grad_norm": 0.24473153054714203, "learning_rate": 6.522180511775876e-06, "loss": 0.02153584733605385, "memory(GiB)": 21.48, "step": 13282, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.4315044017802034, "grad_norm": 0.667830765247345, "learning_rate": 6.521668847225282e-06, "loss": 0.023889608681201935, "memory(GiB)": 21.48, "step": 13283, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.956062 }, { "epoch": 0.4315368872429588, "grad_norm": 0.41165077686309814, "learning_rate": 6.521157165112605e-06, "loss": 0.01700468175113201, "memory(GiB)": 21.48, "step": 13284, "token_acc": 1.0, "train_speed(iter/s)": 0.956077 }, { "epoch": 0.4315693727057142, "grad_norm": 0.48390454053878784, "learning_rate": 6.520645465443749e-06, "loss": 0.03001830354332924, "memory(GiB)": 21.48, "step": 13285, "token_acc": 1.0, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.43160185816846963, "grad_norm": 0.3846232295036316, "learning_rate": 6.520133748224619e-06, "loss": 0.024399269372224808, "memory(GiB)": 21.48, "step": 13286, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.43163434363122505, "grad_norm": 0.4328964650630951, "learning_rate": 6.519622013461124e-06, "loss": 0.021849041804671288, "memory(GiB)": 21.48, "step": 13287, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95612 }, { "epoch": 0.43166682909398046, "grad_norm": 0.39185792207717896, "learning_rate": 6.5191102611591675e-06, "loss": 0.02158074826002121, "memory(GiB)": 21.48, "step": 13288, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.4316993145567359, "grad_norm": 0.4514453411102295, "learning_rate": 6.518598491324657e-06, "loss": 0.02589547447860241, "memory(GiB)": 21.48, "step": 13289, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.4317318000194913, "grad_norm": 0.3212798833847046, "learning_rate": 6.518086703963498e-06, "loss": 0.02304130792617798, "memory(GiB)": 21.48, "step": 13290, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.4317642854822467, "grad_norm": 0.3304641842842102, "learning_rate": 6.517574899081599e-06, "loss": 0.02361125871539116, "memory(GiB)": 21.48, "step": 13291, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.43179677094500213, "grad_norm": 0.5571074485778809, "learning_rate": 6.517063076684865e-06, "loss": 0.027413882315158844, "memory(GiB)": 21.48, "step": 13292, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.43182925640775754, "grad_norm": 0.34695199131965637, "learning_rate": 6.5165512367792035e-06, "loss": 0.02353493496775627, "memory(GiB)": 21.48, "step": 13293, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.43186174187051296, "grad_norm": 0.5091639757156372, "learning_rate": 6.516039379370524e-06, "loss": 0.03871850669384003, "memory(GiB)": 21.48, "step": 13294, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.4318942273332684, "grad_norm": 0.4168848395347595, "learning_rate": 6.51552750446473e-06, "loss": 0.02294597402215004, "memory(GiB)": 21.48, "step": 13295, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.4319267127960238, "grad_norm": 0.2762397825717926, "learning_rate": 6.515015612067734e-06, "loss": 0.01964525878429413, "memory(GiB)": 21.48, "step": 13296, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.4319591982587792, "grad_norm": 0.2657284140586853, "learning_rate": 6.5145037021854385e-06, "loss": 0.01927415281534195, "memory(GiB)": 21.48, "step": 13297, "token_acc": 1.0, "train_speed(iter/s)": 0.956235 }, { "epoch": 0.4319916837215346, "grad_norm": 0.33854758739471436, "learning_rate": 6.513991774823757e-06, "loss": 0.01515989564359188, "memory(GiB)": 21.48, "step": 13298, "token_acc": 1.0, "train_speed(iter/s)": 0.956246 }, { "epoch": 0.43202416918429004, "grad_norm": 0.39536798000335693, "learning_rate": 6.513479829988594e-06, "loss": 0.025970445945858955, "memory(GiB)": 21.48, "step": 13299, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956259 }, { "epoch": 0.43205665464704546, "grad_norm": 0.3631942868232727, "learning_rate": 6.512967867685858e-06, "loss": 0.024578973650932312, "memory(GiB)": 21.48, "step": 13300, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95627 }, { "epoch": 0.4320891401098009, "grad_norm": 0.3514662981033325, "learning_rate": 6.512455887921463e-06, "loss": 0.03053237497806549, "memory(GiB)": 21.48, "step": 13301, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.4321216255725563, "grad_norm": 0.4291568398475647, "learning_rate": 6.51194389070131e-06, "loss": 0.022548843175172806, "memory(GiB)": 21.48, "step": 13302, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.4321541110353117, "grad_norm": 0.396056592464447, "learning_rate": 6.511431876031315e-06, "loss": 0.026678159832954407, "memory(GiB)": 21.48, "step": 13303, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.4321865964980671, "grad_norm": 0.47206923365592957, "learning_rate": 6.5109198439173814e-06, "loss": 0.022018905729055405, "memory(GiB)": 21.48, "step": 13304, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.43221908196082254, "grad_norm": 0.5859171152114868, "learning_rate": 6.510407794365423e-06, "loss": 0.02016483247280121, "memory(GiB)": 21.48, "step": 13305, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.43225156742357795, "grad_norm": 0.3505276143550873, "learning_rate": 6.509895727381349e-06, "loss": 0.025842057541012764, "memory(GiB)": 21.48, "step": 13306, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956338 }, { "epoch": 0.43228405288633337, "grad_norm": 0.3309127986431122, "learning_rate": 6.509383642971068e-06, "loss": 0.01774301379919052, "memory(GiB)": 21.48, "step": 13307, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.4323165383490888, "grad_norm": 0.410843163728714, "learning_rate": 6.508871541140491e-06, "loss": 0.023004090413451195, "memory(GiB)": 21.48, "step": 13308, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.4323490238118442, "grad_norm": 0.3989962041378021, "learning_rate": 6.5083594218955284e-06, "loss": 0.02369411662220955, "memory(GiB)": 21.48, "step": 13309, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.4323815092745996, "grad_norm": 0.45966455340385437, "learning_rate": 6.5078472852420906e-06, "loss": 0.03643794357776642, "memory(GiB)": 21.48, "step": 13310, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.43241399473735503, "grad_norm": 0.4302089214324951, "learning_rate": 6.507335131186088e-06, "loss": 0.02340039610862732, "memory(GiB)": 21.48, "step": 13311, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.43244648020011045, "grad_norm": 0.4126881957054138, "learning_rate": 6.506822959733432e-06, "loss": 0.02412431128323078, "memory(GiB)": 21.48, "step": 13312, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956412 }, { "epoch": 0.43247896566286587, "grad_norm": 2.0911905765533447, "learning_rate": 6.506310770890033e-06, "loss": 0.01854041963815689, "memory(GiB)": 21.48, "step": 13313, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956423 }, { "epoch": 0.4325114511256213, "grad_norm": 0.6250865459442139, "learning_rate": 6.505798564661803e-06, "loss": 0.03555409237742424, "memory(GiB)": 21.48, "step": 13314, "token_acc": 0.9877300613496932, "train_speed(iter/s)": 0.956434 }, { "epoch": 0.4325439365883767, "grad_norm": 0.32145950198173523, "learning_rate": 6.505286341054653e-06, "loss": 0.02276444248855114, "memory(GiB)": 21.48, "step": 13315, "token_acc": 0.9967948717948718, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.4325764220511321, "grad_norm": 0.29489168524742126, "learning_rate": 6.5047741000744945e-06, "loss": 0.013154014013707638, "memory(GiB)": 21.48, "step": 13316, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956459 }, { "epoch": 0.43260890751388753, "grad_norm": 0.29572391510009766, "learning_rate": 6.504261841727242e-06, "loss": 0.021685952320694923, "memory(GiB)": 21.48, "step": 13317, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.43264139297664295, "grad_norm": 0.3162511885166168, "learning_rate": 6.503749566018805e-06, "loss": 0.021095212548971176, "memory(GiB)": 21.48, "step": 13318, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.43267387843939836, "grad_norm": 0.2665068507194519, "learning_rate": 6.503237272955095e-06, "loss": 0.026196401566267014, "memory(GiB)": 21.48, "step": 13319, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.4327063639021538, "grad_norm": 0.3155885636806488, "learning_rate": 6.502724962542027e-06, "loss": 0.026305150240659714, "memory(GiB)": 21.48, "step": 13320, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.4327388493649092, "grad_norm": 0.38946324586868286, "learning_rate": 6.502212634785512e-06, "loss": 0.028148651123046875, "memory(GiB)": 21.48, "step": 13321, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.4327713348276646, "grad_norm": 0.3434077203273773, "learning_rate": 6.501700289691466e-06, "loss": 0.026902567595243454, "memory(GiB)": 21.48, "step": 13322, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.43280382029042, "grad_norm": 0.3643668591976166, "learning_rate": 6.501187927265798e-06, "loss": 0.026219746097922325, "memory(GiB)": 21.48, "step": 13323, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.43283630575317544, "grad_norm": 0.4099191725254059, "learning_rate": 6.500675547514423e-06, "loss": 0.018091894686222076, "memory(GiB)": 21.48, "step": 13324, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.43286879121593086, "grad_norm": 0.34552058577537537, "learning_rate": 6.500163150443256e-06, "loss": 0.021128136664628983, "memory(GiB)": 21.48, "step": 13325, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956565 }, { "epoch": 0.4329012766786863, "grad_norm": 0.4468095898628235, "learning_rate": 6.49965073605821e-06, "loss": 0.028858307749032974, "memory(GiB)": 21.48, "step": 13326, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.4329337621414417, "grad_norm": 0.36851927638053894, "learning_rate": 6.499138304365198e-06, "loss": 0.01935717649757862, "memory(GiB)": 21.48, "step": 13327, "token_acc": 1.0, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.4329662476041971, "grad_norm": 0.47076746821403503, "learning_rate": 6.498625855370133e-06, "loss": 0.033339183777570724, "memory(GiB)": 21.48, "step": 13328, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.4329987330669525, "grad_norm": 0.3370122015476227, "learning_rate": 6.498113389078933e-06, "loss": 0.02240561693906784, "memory(GiB)": 21.48, "step": 13329, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.43303121852970794, "grad_norm": 0.6856294870376587, "learning_rate": 6.4976009054975095e-06, "loss": 0.02481679990887642, "memory(GiB)": 21.48, "step": 13330, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956632 }, { "epoch": 0.43306370399246336, "grad_norm": 0.4101126194000244, "learning_rate": 6.497088404631778e-06, "loss": 0.023257214576005936, "memory(GiB)": 21.48, "step": 13331, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.43309618945521877, "grad_norm": 0.39697974920272827, "learning_rate": 6.4965758864876536e-06, "loss": 0.027722831815481186, "memory(GiB)": 21.48, "step": 13332, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.4331286749179742, "grad_norm": 0.5077852606773376, "learning_rate": 6.4960633510710534e-06, "loss": 0.02484714612364769, "memory(GiB)": 21.48, "step": 13333, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.4331611603807296, "grad_norm": 0.41527700424194336, "learning_rate": 6.495550798387888e-06, "loss": 0.0222913920879364, "memory(GiB)": 21.48, "step": 13334, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.433193645843485, "grad_norm": 0.4165785610675812, "learning_rate": 6.495038228444077e-06, "loss": 0.027342356741428375, "memory(GiB)": 21.48, "step": 13335, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956703 }, { "epoch": 0.43322613130624044, "grad_norm": 0.3327333331108093, "learning_rate": 6.494525641245536e-06, "loss": 0.021741269156336784, "memory(GiB)": 21.48, "step": 13336, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.43325861676899585, "grad_norm": 0.3792784512042999, "learning_rate": 6.494013036798179e-06, "loss": 0.027201900258660316, "memory(GiB)": 21.48, "step": 13337, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956731 }, { "epoch": 0.43329110223175127, "grad_norm": 0.3200475871562958, "learning_rate": 6.493500415107924e-06, "loss": 0.019472196698188782, "memory(GiB)": 21.48, "step": 13338, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956747 }, { "epoch": 0.4333235876945067, "grad_norm": 0.29591473937034607, "learning_rate": 6.492987776180685e-06, "loss": 0.019742025062441826, "memory(GiB)": 21.48, "step": 13339, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.95676 }, { "epoch": 0.4333560731572621, "grad_norm": 0.38528159260749817, "learning_rate": 6.492475120022382e-06, "loss": 0.0249228086322546, "memory(GiB)": 21.48, "step": 13340, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.4333885586200175, "grad_norm": 0.37231290340423584, "learning_rate": 6.491962446638927e-06, "loss": 0.023630604147911072, "memory(GiB)": 21.48, "step": 13341, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.43342104408277293, "grad_norm": 0.2900526523590088, "learning_rate": 6.491449756036241e-06, "loss": 0.02392568066716194, "memory(GiB)": 21.48, "step": 13342, "token_acc": 1.0, "train_speed(iter/s)": 0.956805 }, { "epoch": 0.43345352954552835, "grad_norm": 0.27996569871902466, "learning_rate": 6.490937048220239e-06, "loss": 0.0259910486638546, "memory(GiB)": 21.48, "step": 13343, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.4334860150082838, "grad_norm": 0.38372012972831726, "learning_rate": 6.49042432319684e-06, "loss": 0.02435750886797905, "memory(GiB)": 21.48, "step": 13344, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956832 }, { "epoch": 0.43351850047103924, "grad_norm": 0.43179941177368164, "learning_rate": 6.48991158097196e-06, "loss": 0.024618176743388176, "memory(GiB)": 21.48, "step": 13345, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.43355098593379465, "grad_norm": 0.2908749282360077, "learning_rate": 6.489398821551517e-06, "loss": 0.02248189225792885, "memory(GiB)": 21.48, "step": 13346, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.43358347139655007, "grad_norm": 0.49764057993888855, "learning_rate": 6.488886044941429e-06, "loss": 0.03846006095409393, "memory(GiB)": 21.48, "step": 13347, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.4336159568593055, "grad_norm": 0.22418615221977234, "learning_rate": 6.488373251147615e-06, "loss": 0.02002393826842308, "memory(GiB)": 21.48, "step": 13348, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.4336484423220609, "grad_norm": 0.2983998656272888, "learning_rate": 6.487860440175993e-06, "loss": 0.018830539658665657, "memory(GiB)": 21.48, "step": 13349, "token_acc": 1.0, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.4336809277848163, "grad_norm": 0.34710943698883057, "learning_rate": 6.4873476120324795e-06, "loss": 0.01790769025683403, "memory(GiB)": 21.48, "step": 13350, "token_acc": 0.9747899159663865, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.43371341324757173, "grad_norm": 0.2993029057979584, "learning_rate": 6.4868347667229956e-06, "loss": 0.015598025172948837, "memory(GiB)": 21.48, "step": 13351, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956915 }, { "epoch": 0.43374589871032715, "grad_norm": 0.35698026418685913, "learning_rate": 6.486321904253461e-06, "loss": 0.021736161783337593, "memory(GiB)": 21.48, "step": 13352, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.43377838417308257, "grad_norm": 0.5303827524185181, "learning_rate": 6.485809024629792e-06, "loss": 0.021093076094985008, "memory(GiB)": 21.48, "step": 13353, "token_acc": 1.0, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.433810869635838, "grad_norm": 0.39374712109565735, "learning_rate": 6.485296127857909e-06, "loss": 0.02317228354513645, "memory(GiB)": 21.48, "step": 13354, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.4338433550985934, "grad_norm": 0.41967326402664185, "learning_rate": 6.484783213943734e-06, "loss": 0.024919910356402397, "memory(GiB)": 21.48, "step": 13355, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.4338758405613488, "grad_norm": 0.33346372842788696, "learning_rate": 6.484270282893181e-06, "loss": 0.01572471112012863, "memory(GiB)": 21.48, "step": 13356, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.43390832602410423, "grad_norm": 0.3635420799255371, "learning_rate": 6.483757334712176e-06, "loss": 0.021678917109966278, "memory(GiB)": 21.48, "step": 13357, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.43394081148685965, "grad_norm": 0.298721045255661, "learning_rate": 6.483244369406636e-06, "loss": 0.021409137174487114, "memory(GiB)": 21.48, "step": 13358, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.43397329694961506, "grad_norm": 0.30779069662094116, "learning_rate": 6.482731386982482e-06, "loss": 0.018680276349186897, "memory(GiB)": 21.48, "step": 13359, "token_acc": 1.0, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.4340057824123705, "grad_norm": 0.5074437856674194, "learning_rate": 6.482218387445634e-06, "loss": 0.026683807373046875, "memory(GiB)": 21.48, "step": 13360, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.4340382678751259, "grad_norm": 0.26782986521720886, "learning_rate": 6.481705370802011e-06, "loss": 0.01336134783923626, "memory(GiB)": 21.48, "step": 13361, "token_acc": 1.0, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.4340707533378813, "grad_norm": 0.44524312019348145, "learning_rate": 6.481192337057538e-06, "loss": 0.029415922239422798, "memory(GiB)": 21.48, "step": 13362, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.4341032388006367, "grad_norm": 0.5036192536354065, "learning_rate": 6.480679286218135e-06, "loss": 0.02679542638361454, "memory(GiB)": 21.48, "step": 13363, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.43413572426339214, "grad_norm": 0.45777183771133423, "learning_rate": 6.48016621828972e-06, "loss": 0.02409185841679573, "memory(GiB)": 21.48, "step": 13364, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957052 }, { "epoch": 0.43416820972614756, "grad_norm": 0.5236433744430542, "learning_rate": 6.479653133278217e-06, "loss": 0.03343408554792404, "memory(GiB)": 21.48, "step": 13365, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.434200695188903, "grad_norm": 0.5455021262168884, "learning_rate": 6.479140031189548e-06, "loss": 0.0328841507434845, "memory(GiB)": 21.48, "step": 13366, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.957077 }, { "epoch": 0.4342331806516584, "grad_norm": 0.3527494966983795, "learning_rate": 6.478626912029633e-06, "loss": 0.032579224556684494, "memory(GiB)": 21.48, "step": 13367, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.957089 }, { "epoch": 0.4342656661144138, "grad_norm": 0.24374911189079285, "learning_rate": 6.478113775804398e-06, "loss": 0.018474247306585312, "memory(GiB)": 21.48, "step": 13368, "token_acc": 1.0, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.4342981515771692, "grad_norm": 0.38006678223609924, "learning_rate": 6.477600622519762e-06, "loss": 0.025971781462430954, "memory(GiB)": 21.48, "step": 13369, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.43433063703992464, "grad_norm": 0.40651994943618774, "learning_rate": 6.477087452181646e-06, "loss": 0.022104837000370026, "memory(GiB)": 21.48, "step": 13370, "token_acc": 1.0, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.43436312250268005, "grad_norm": 0.4455573260784149, "learning_rate": 6.476574264795977e-06, "loss": 0.02384256199002266, "memory(GiB)": 21.48, "step": 13371, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.43439560796543547, "grad_norm": 0.8311968445777893, "learning_rate": 6.476061060368675e-06, "loss": 0.028845787048339844, "memory(GiB)": 21.48, "step": 13372, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.4344280934281909, "grad_norm": 0.2920600473880768, "learning_rate": 6.475547838905664e-06, "loss": 0.020749332383275032, "memory(GiB)": 21.48, "step": 13373, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.957164 }, { "epoch": 0.4344605788909463, "grad_norm": 0.6072977781295776, "learning_rate": 6.475034600412866e-06, "loss": 0.030351579189300537, "memory(GiB)": 21.48, "step": 13374, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.4344930643537017, "grad_norm": 0.443149596452713, "learning_rate": 6.4745213448962065e-06, "loss": 0.02282826416194439, "memory(GiB)": 21.48, "step": 13375, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.43452554981645714, "grad_norm": 0.44347521662712097, "learning_rate": 6.4740080723616075e-06, "loss": 0.022674573585391045, "memory(GiB)": 21.48, "step": 13376, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.43455803527921255, "grad_norm": 0.4527961015701294, "learning_rate": 6.473494782814994e-06, "loss": 0.024151429533958435, "memory(GiB)": 21.48, "step": 13377, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.957212 }, { "epoch": 0.43459052074196797, "grad_norm": 0.3543245196342468, "learning_rate": 6.47298147626229e-06, "loss": 0.0251753069460392, "memory(GiB)": 21.48, "step": 13378, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.4346230062047234, "grad_norm": 0.2473621815443039, "learning_rate": 6.472468152709417e-06, "loss": 0.018280260264873505, "memory(GiB)": 21.48, "step": 13379, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957236 }, { "epoch": 0.4346554916674788, "grad_norm": 0.32443031668663025, "learning_rate": 6.471954812162304e-06, "loss": 0.018932703882455826, "memory(GiB)": 21.48, "step": 13380, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.4346879771302342, "grad_norm": 0.4125169813632965, "learning_rate": 6.4714414546268715e-06, "loss": 0.027611512690782547, "memory(GiB)": 21.48, "step": 13381, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.43472046259298963, "grad_norm": 0.3769819140434265, "learning_rate": 6.4709280801090466e-06, "loss": 0.02633950486779213, "memory(GiB)": 21.48, "step": 13382, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.43475294805574505, "grad_norm": 0.44868502020835876, "learning_rate": 6.4704146886147535e-06, "loss": 0.02748136967420578, "memory(GiB)": 21.48, "step": 13383, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.43478543351850046, "grad_norm": 0.43111491203308105, "learning_rate": 6.46990128014992e-06, "loss": 0.027833368629217148, "memory(GiB)": 21.48, "step": 13384, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957299 }, { "epoch": 0.4348179189812559, "grad_norm": 0.5174177289009094, "learning_rate": 6.469387854720468e-06, "loss": 0.020684916526079178, "memory(GiB)": 21.48, "step": 13385, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.4348504044440113, "grad_norm": 0.3445066213607788, "learning_rate": 6.468874412332323e-06, "loss": 0.025583187118172646, "memory(GiB)": 21.48, "step": 13386, "token_acc": 0.984375, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.4348828899067667, "grad_norm": 0.4432086646556854, "learning_rate": 6.4683609529914135e-06, "loss": 0.024087117984890938, "memory(GiB)": 21.48, "step": 13387, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.43491537536952213, "grad_norm": 0.36569276452064514, "learning_rate": 6.467847476703663e-06, "loss": 0.022087767720222473, "memory(GiB)": 21.48, "step": 13388, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.43494786083227754, "grad_norm": 0.4414560794830322, "learning_rate": 6.467333983475002e-06, "loss": 0.03297474980354309, "memory(GiB)": 21.48, "step": 13389, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.43498034629503296, "grad_norm": 0.2984265685081482, "learning_rate": 6.46682047331135e-06, "loss": 0.01882682368159294, "memory(GiB)": 21.48, "step": 13390, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.4350128317577884, "grad_norm": 0.4257396161556244, "learning_rate": 6.466306946218641e-06, "loss": 0.01922563649713993, "memory(GiB)": 21.48, "step": 13391, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.4350453172205438, "grad_norm": 0.49299511313438416, "learning_rate": 6.465793402202795e-06, "loss": 0.03261704742908478, "memory(GiB)": 21.48, "step": 13392, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957403 }, { "epoch": 0.4350778026832992, "grad_norm": 0.22889387607574463, "learning_rate": 6.465279841269745e-06, "loss": 0.01337471790611744, "memory(GiB)": 21.48, "step": 13393, "token_acc": 1.0, "train_speed(iter/s)": 0.957418 }, { "epoch": 0.4351102881460546, "grad_norm": 0.4486238658428192, "learning_rate": 6.464766263425415e-06, "loss": 0.027174420654773712, "memory(GiB)": 21.48, "step": 13394, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957433 }, { "epoch": 0.43514277360881004, "grad_norm": 0.4142357110977173, "learning_rate": 6.46425266867573e-06, "loss": 0.030752576887607574, "memory(GiB)": 21.48, "step": 13395, "token_acc": 1.0, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.43517525907156546, "grad_norm": 0.28734809160232544, "learning_rate": 6.463739057026623e-06, "loss": 0.01940116658806801, "memory(GiB)": 21.48, "step": 13396, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.4352077445343209, "grad_norm": 0.36926475167274475, "learning_rate": 6.463225428484017e-06, "loss": 0.026543961837887764, "memory(GiB)": 21.48, "step": 13397, "token_acc": 0.9737991266375546, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.4352402299970763, "grad_norm": 0.7412105202674866, "learning_rate": 6.462711783053843e-06, "loss": 0.02272675931453705, "memory(GiB)": 21.48, "step": 13398, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.4352727154598317, "grad_norm": 0.36505237221717834, "learning_rate": 6.462198120742029e-06, "loss": 0.02488415688276291, "memory(GiB)": 21.48, "step": 13399, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957509 }, { "epoch": 0.4353052009225871, "grad_norm": 0.2935979664325714, "learning_rate": 6.461684441554501e-06, "loss": 0.02019531838595867, "memory(GiB)": 21.48, "step": 13400, "token_acc": 0.9831460674157303, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.43533768638534254, "grad_norm": 0.4586026668548584, "learning_rate": 6.46117074549719e-06, "loss": 0.02351096272468567, "memory(GiB)": 21.48, "step": 13401, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957538 }, { "epoch": 0.43537017184809795, "grad_norm": 0.5815514326095581, "learning_rate": 6.460657032576023e-06, "loss": 0.025341086089611053, "memory(GiB)": 21.48, "step": 13402, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957553 }, { "epoch": 0.43540265731085337, "grad_norm": 0.27312204241752625, "learning_rate": 6.460143302796931e-06, "loss": 0.020934857428073883, "memory(GiB)": 21.48, "step": 13403, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957568 }, { "epoch": 0.4354351427736088, "grad_norm": 0.3709864318370819, "learning_rate": 6.459629556165841e-06, "loss": 0.025835495442152023, "memory(GiB)": 21.48, "step": 13404, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.4354676282363642, "grad_norm": 0.5079489350318909, "learning_rate": 6.459115792688684e-06, "loss": 0.023252911865711212, "memory(GiB)": 21.48, "step": 13405, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.4355001136991196, "grad_norm": 0.2972571551799774, "learning_rate": 6.458602012371386e-06, "loss": 0.026377003639936447, "memory(GiB)": 21.48, "step": 13406, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957611 }, { "epoch": 0.43553259916187503, "grad_norm": 0.3619445860385895, "learning_rate": 6.458088215219882e-06, "loss": 0.023537011817097664, "memory(GiB)": 21.48, "step": 13407, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.4355650846246305, "grad_norm": 0.8217774629592896, "learning_rate": 6.4575744012400995e-06, "loss": 0.022590091452002525, "memory(GiB)": 21.48, "step": 13408, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.4355975700873859, "grad_norm": 0.3238258957862854, "learning_rate": 6.457060570437967e-06, "loss": 0.020267030224204063, "memory(GiB)": 21.48, "step": 13409, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.957644 }, { "epoch": 0.43563005555014134, "grad_norm": 0.47420772910118103, "learning_rate": 6.456546722819417e-06, "loss": 0.024279501289129257, "memory(GiB)": 21.48, "step": 13410, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.43566254101289675, "grad_norm": 0.43621930480003357, "learning_rate": 6.456032858390379e-06, "loss": 0.028321396559476852, "memory(GiB)": 21.48, "step": 13411, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957666 }, { "epoch": 0.43569502647565217, "grad_norm": 0.3326825499534607, "learning_rate": 6.455518977156782e-06, "loss": 0.020434517413377762, "memory(GiB)": 21.48, "step": 13412, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.4357275119384076, "grad_norm": 0.2907421886920929, "learning_rate": 6.455005079124561e-06, "loss": 0.01732739433646202, "memory(GiB)": 21.48, "step": 13413, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957688 }, { "epoch": 0.435759997401163, "grad_norm": 0.9656285643577576, "learning_rate": 6.454491164299643e-06, "loss": 0.028957515954971313, "memory(GiB)": 21.48, "step": 13414, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.4357924828639184, "grad_norm": 0.5060511231422424, "learning_rate": 6.4539772326879635e-06, "loss": 0.03060714341700077, "memory(GiB)": 21.48, "step": 13415, "token_acc": 1.0, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.43582496832667383, "grad_norm": 0.6254203915596008, "learning_rate": 6.453463284295449e-06, "loss": 0.021956615149974823, "memory(GiB)": 21.48, "step": 13416, "token_acc": 1.0, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.43585745378942925, "grad_norm": 0.40893638134002686, "learning_rate": 6.452949319128035e-06, "loss": 0.025952985510230064, "memory(GiB)": 21.48, "step": 13417, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.43588993925218467, "grad_norm": 0.3295283019542694, "learning_rate": 6.452435337191652e-06, "loss": 0.029944442212581635, "memory(GiB)": 21.48, "step": 13418, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.4359224247149401, "grad_norm": 0.4033161997795105, "learning_rate": 6.451921338492233e-06, "loss": 0.020893219858407974, "memory(GiB)": 21.48, "step": 13419, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.4359549101776955, "grad_norm": 0.37602129578590393, "learning_rate": 6.451407323035708e-06, "loss": 0.023705150932073593, "memory(GiB)": 21.48, "step": 13420, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.957763 }, { "epoch": 0.4359873956404509, "grad_norm": 0.4425629675388336, "learning_rate": 6.450893290828011e-06, "loss": 0.022525180131196976, "memory(GiB)": 21.48, "step": 13421, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957776 }, { "epoch": 0.43601988110320633, "grad_norm": 0.3808327913284302, "learning_rate": 6.450379241875074e-06, "loss": 0.01827479898929596, "memory(GiB)": 21.48, "step": 13422, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957787 }, { "epoch": 0.43605236656596175, "grad_norm": 0.2771296203136444, "learning_rate": 6.44986517618283e-06, "loss": 0.017431406304240227, "memory(GiB)": 21.48, "step": 13423, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.43608485202871716, "grad_norm": 0.30898919701576233, "learning_rate": 6.449351093757213e-06, "loss": 0.020557589828968048, "memory(GiB)": 21.48, "step": 13424, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95781 }, { "epoch": 0.4361173374914726, "grad_norm": 0.23219437897205353, "learning_rate": 6.448836994604153e-06, "loss": 0.020781103521585464, "memory(GiB)": 21.48, "step": 13425, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.436149822954228, "grad_norm": 0.38352760672569275, "learning_rate": 6.448322878729588e-06, "loss": 0.02636752463877201, "memory(GiB)": 21.48, "step": 13426, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957834 }, { "epoch": 0.4361823084169834, "grad_norm": 0.29819992184638977, "learning_rate": 6.4478087461394475e-06, "loss": 0.02367595210671425, "memory(GiB)": 21.48, "step": 13427, "token_acc": 1.0, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.4362147938797388, "grad_norm": 0.39030444622039795, "learning_rate": 6.447294596839668e-06, "loss": 0.026966841891407967, "memory(GiB)": 21.48, "step": 13428, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.43624727934249424, "grad_norm": 0.39789244532585144, "learning_rate": 6.446780430836183e-06, "loss": 0.02861766517162323, "memory(GiB)": 21.48, "step": 13429, "token_acc": 0.975, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.43627976480524966, "grad_norm": 0.3431341350078583, "learning_rate": 6.446266248134925e-06, "loss": 0.02066049352288246, "memory(GiB)": 21.48, "step": 13430, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.4363122502680051, "grad_norm": 0.5462596416473389, "learning_rate": 6.445752048741831e-06, "loss": 0.02872263640165329, "memory(GiB)": 21.48, "step": 13431, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.95789 }, { "epoch": 0.4363447357307605, "grad_norm": 0.471162348985672, "learning_rate": 6.445237832662833e-06, "loss": 0.031051214784383774, "memory(GiB)": 21.48, "step": 13432, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.4363772211935159, "grad_norm": 0.37816137075424194, "learning_rate": 6.444723599903867e-06, "loss": 0.03478533774614334, "memory(GiB)": 21.48, "step": 13433, "token_acc": 0.996, "train_speed(iter/s)": 0.957911 }, { "epoch": 0.4364097066562713, "grad_norm": 0.4530268907546997, "learning_rate": 6.444209350470867e-06, "loss": 0.02531180903315544, "memory(GiB)": 21.48, "step": 13434, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.43644219211902674, "grad_norm": 0.4502354562282562, "learning_rate": 6.44369508436977e-06, "loss": 0.027505306527018547, "memory(GiB)": 21.48, "step": 13435, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.43647467758178216, "grad_norm": 0.4961177408695221, "learning_rate": 6.443180801606511e-06, "loss": 0.02616673707962036, "memory(GiB)": 21.48, "step": 13436, "token_acc": 1.0, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.43650716304453757, "grad_norm": 0.3727385997772217, "learning_rate": 6.442666502187022e-06, "loss": 0.017913158982992172, "memory(GiB)": 21.48, "step": 13437, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957949 }, { "epoch": 0.436539648507293, "grad_norm": 0.4071997106075287, "learning_rate": 6.4421521861172445e-06, "loss": 0.02437213994562626, "memory(GiB)": 21.48, "step": 13438, "token_acc": 1.0, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.4365721339700484, "grad_norm": 0.3986450731754303, "learning_rate": 6.441637853403108e-06, "loss": 0.024749796837568283, "memory(GiB)": 21.48, "step": 13439, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.4366046194328038, "grad_norm": 0.38206222653388977, "learning_rate": 6.441123504050555e-06, "loss": 0.021632593125104904, "memory(GiB)": 21.48, "step": 13440, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.43663710489555924, "grad_norm": 0.46634432673454285, "learning_rate": 6.440609138065517e-06, "loss": 0.02781464345753193, "memory(GiB)": 21.48, "step": 13441, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.43666959035831465, "grad_norm": 0.24412724375724792, "learning_rate": 6.440094755453932e-06, "loss": 0.019380604848265648, "memory(GiB)": 21.48, "step": 13442, "token_acc": 1.0, "train_speed(iter/s)": 0.958 }, { "epoch": 0.43670207582107007, "grad_norm": 0.5121877789497375, "learning_rate": 6.439580356221738e-06, "loss": 0.028621505945920944, "memory(GiB)": 21.48, "step": 13443, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.4367345612838255, "grad_norm": 0.6657356023788452, "learning_rate": 6.4390659403748676e-06, "loss": 0.04268627613782883, "memory(GiB)": 21.48, "step": 13444, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.95802 }, { "epoch": 0.4367670467465809, "grad_norm": 0.23484964668750763, "learning_rate": 6.438551507919264e-06, "loss": 0.016500499099493027, "memory(GiB)": 21.48, "step": 13445, "token_acc": 1.0, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.4367995322093363, "grad_norm": 0.3656153380870819, "learning_rate": 6.4380370588608595e-06, "loss": 0.022893842309713364, "memory(GiB)": 21.48, "step": 13446, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.43683201767209173, "grad_norm": 0.30090221762657166, "learning_rate": 6.437522593205594e-06, "loss": 0.021525029093027115, "memory(GiB)": 21.48, "step": 13447, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958054 }, { "epoch": 0.43686450313484715, "grad_norm": 0.4087558686733246, "learning_rate": 6.437008110959404e-06, "loss": 0.025562379509210587, "memory(GiB)": 21.48, "step": 13448, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958066 }, { "epoch": 0.43689698859760256, "grad_norm": 0.45333629846572876, "learning_rate": 6.436493612128229e-06, "loss": 0.0386752113699913, "memory(GiB)": 21.48, "step": 13449, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.958078 }, { "epoch": 0.436929474060358, "grad_norm": 0.3420241177082062, "learning_rate": 6.4359790967180055e-06, "loss": 0.020748887211084366, "memory(GiB)": 21.48, "step": 13450, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.4369619595231134, "grad_norm": 0.35888224840164185, "learning_rate": 6.43546456473467e-06, "loss": 0.024423211812973022, "memory(GiB)": 21.48, "step": 13451, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.958099 }, { "epoch": 0.4369944449858688, "grad_norm": 0.3600355088710785, "learning_rate": 6.434950016184165e-06, "loss": 0.01757875084877014, "memory(GiB)": 21.48, "step": 13452, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.43702693044862423, "grad_norm": 0.3220450282096863, "learning_rate": 6.434435451072426e-06, "loss": 0.018506579101085663, "memory(GiB)": 21.48, "step": 13453, "token_acc": 1.0, "train_speed(iter/s)": 0.958124 }, { "epoch": 0.43705941591137965, "grad_norm": 0.39576971530914307, "learning_rate": 6.433920869405393e-06, "loss": 0.02646128460764885, "memory(GiB)": 21.48, "step": 13454, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958137 }, { "epoch": 0.43709190137413506, "grad_norm": 0.33508074283599854, "learning_rate": 6.433406271189004e-06, "loss": 0.025031989440321922, "memory(GiB)": 21.48, "step": 13455, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958151 }, { "epoch": 0.4371243868368905, "grad_norm": 0.27567362785339355, "learning_rate": 6.4328916564292e-06, "loss": 0.022555239498615265, "memory(GiB)": 21.48, "step": 13456, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.958166 }, { "epoch": 0.4371568722996459, "grad_norm": 0.46236810088157654, "learning_rate": 6.432377025131917e-06, "loss": 0.025544211268424988, "memory(GiB)": 21.48, "step": 13457, "token_acc": 0.9710743801652892, "train_speed(iter/s)": 0.95818 }, { "epoch": 0.4371893577624013, "grad_norm": 0.4087742567062378, "learning_rate": 6.4318623773030984e-06, "loss": 0.02204063907265663, "memory(GiB)": 21.48, "step": 13458, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.4372218432251567, "grad_norm": 0.3886373043060303, "learning_rate": 6.431347712948681e-06, "loss": 0.020265404134988785, "memory(GiB)": 21.48, "step": 13459, "token_acc": 1.0, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.43725432868791214, "grad_norm": 0.5613539218902588, "learning_rate": 6.430833032074607e-06, "loss": 0.03009963408112526, "memory(GiB)": 21.48, "step": 13460, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.43728681415066756, "grad_norm": 0.34349745512008667, "learning_rate": 6.430318334686815e-06, "loss": 0.0219639353454113, "memory(GiB)": 21.48, "step": 13461, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.958239 }, { "epoch": 0.437319299613423, "grad_norm": 0.2747117578983307, "learning_rate": 6.4298036207912455e-06, "loss": 0.02046031504869461, "memory(GiB)": 21.48, "step": 13462, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958253 }, { "epoch": 0.4373517850761784, "grad_norm": 0.3991551995277405, "learning_rate": 6.429288890393839e-06, "loss": 0.018648575991392136, "memory(GiB)": 21.48, "step": 13463, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.4373842705389338, "grad_norm": 0.40846681594848633, "learning_rate": 6.428774143500538e-06, "loss": 0.023296918720006943, "memory(GiB)": 21.48, "step": 13464, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.4374167560016892, "grad_norm": 0.3600820004940033, "learning_rate": 6.428259380117279e-06, "loss": 0.025706615298986435, "memory(GiB)": 21.48, "step": 13465, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.958299 }, { "epoch": 0.43744924146444464, "grad_norm": 0.32075023651123047, "learning_rate": 6.427744600250008e-06, "loss": 0.02112240344285965, "memory(GiB)": 21.48, "step": 13466, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958314 }, { "epoch": 0.43748172692720005, "grad_norm": 0.48647791147232056, "learning_rate": 6.427229803904662e-06, "loss": 0.015199251472949982, "memory(GiB)": 21.48, "step": 13467, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.958329 }, { "epoch": 0.43751421238995547, "grad_norm": 0.302661269903183, "learning_rate": 6.4267149910871865e-06, "loss": 0.023713700473308563, "memory(GiB)": 21.48, "step": 13468, "token_acc": 1.0, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.4375466978527109, "grad_norm": 0.5097548365592957, "learning_rate": 6.42620016180352e-06, "loss": 0.019144445657730103, "memory(GiB)": 21.48, "step": 13469, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.958361 }, { "epoch": 0.4375791833154663, "grad_norm": 0.4948287308216095, "learning_rate": 6.425685316059605e-06, "loss": 0.028358712792396545, "memory(GiB)": 21.48, "step": 13470, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.958375 }, { "epoch": 0.4376116687782217, "grad_norm": 0.5514031648635864, "learning_rate": 6.4251704538613836e-06, "loss": 0.025833729654550552, "memory(GiB)": 21.48, "step": 13471, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.4376441542409772, "grad_norm": 0.3395901024341583, "learning_rate": 6.4246555752148e-06, "loss": 0.02032138593494892, "memory(GiB)": 21.48, "step": 13472, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.958404 }, { "epoch": 0.4376766397037326, "grad_norm": 0.30604374408721924, "learning_rate": 6.424140680125793e-06, "loss": 0.025439821183681488, "memory(GiB)": 21.48, "step": 13473, "token_acc": 1.0, "train_speed(iter/s)": 0.958419 }, { "epoch": 0.437709125166488, "grad_norm": 0.7141172885894775, "learning_rate": 6.423625768600307e-06, "loss": 0.029360707849264145, "memory(GiB)": 21.48, "step": 13474, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.958434 }, { "epoch": 0.43774161062924344, "grad_norm": 0.39799055457115173, "learning_rate": 6.423110840644285e-06, "loss": 0.02353743091225624, "memory(GiB)": 21.48, "step": 13475, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958448 }, { "epoch": 0.43777409609199885, "grad_norm": 0.4030814468860626, "learning_rate": 6.42259589626367e-06, "loss": 0.02521868795156479, "memory(GiB)": 21.48, "step": 13476, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958463 }, { "epoch": 0.43780658155475427, "grad_norm": 0.36754342913627625, "learning_rate": 6.422080935464407e-06, "loss": 0.022447578608989716, "memory(GiB)": 21.48, "step": 13477, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.958478 }, { "epoch": 0.4378390670175097, "grad_norm": 0.4691927134990692, "learning_rate": 6.4215659582524345e-06, "loss": 0.027620935812592506, "memory(GiB)": 21.48, "step": 13478, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958493 }, { "epoch": 0.4378715524802651, "grad_norm": 0.3591959476470947, "learning_rate": 6.421050964633699e-06, "loss": 0.023797962814569473, "memory(GiB)": 21.48, "step": 13479, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.958508 }, { "epoch": 0.4379040379430205, "grad_norm": 0.3095736801624298, "learning_rate": 6.420535954614146e-06, "loss": 0.022335559129714966, "memory(GiB)": 21.48, "step": 13480, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958522 }, { "epoch": 0.43793652340577593, "grad_norm": 0.39620840549468994, "learning_rate": 6.420020928199715e-06, "loss": 0.022837180644273758, "memory(GiB)": 21.48, "step": 13481, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.958536 }, { "epoch": 0.43796900886853135, "grad_norm": 0.33478546142578125, "learning_rate": 6.419505885396353e-06, "loss": 0.025786958634853363, "memory(GiB)": 21.48, "step": 13482, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.958551 }, { "epoch": 0.43800149433128677, "grad_norm": 0.3408800959587097, "learning_rate": 6.418990826210004e-06, "loss": 0.018707426264882088, "memory(GiB)": 21.48, "step": 13483, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.958566 }, { "epoch": 0.4380339797940422, "grad_norm": 0.34682729840278625, "learning_rate": 6.418475750646613e-06, "loss": 0.022228755056858063, "memory(GiB)": 21.48, "step": 13484, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958581 }, { "epoch": 0.4380664652567976, "grad_norm": 0.4826347827911377, "learning_rate": 6.417960658712122e-06, "loss": 0.028989020735025406, "memory(GiB)": 21.48, "step": 13485, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.958597 }, { "epoch": 0.438098950719553, "grad_norm": 0.37281137704849243, "learning_rate": 6.41744555041248e-06, "loss": 0.024843236431479454, "memory(GiB)": 21.48, "step": 13486, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.958611 }, { "epoch": 0.43813143618230843, "grad_norm": 0.47914162278175354, "learning_rate": 6.416930425753629e-06, "loss": 0.031038625165820122, "memory(GiB)": 21.48, "step": 13487, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.958622 }, { "epoch": 0.43816392164506385, "grad_norm": 0.3132876455783844, "learning_rate": 6.416415284741514e-06, "loss": 0.016229741275310516, "memory(GiB)": 21.48, "step": 13488, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.958632 }, { "epoch": 0.43819640710781926, "grad_norm": 0.19011612236499786, "learning_rate": 6.415900127382082e-06, "loss": 0.010723601095378399, "memory(GiB)": 21.48, "step": 13489, "token_acc": 1.0, "train_speed(iter/s)": 0.958642 }, { "epoch": 0.4382288925705747, "grad_norm": 0.6206397414207458, "learning_rate": 6.415384953681277e-06, "loss": 0.0270112045109272, "memory(GiB)": 21.48, "step": 13490, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958653 }, { "epoch": 0.4382613780333301, "grad_norm": 0.39379677176475525, "learning_rate": 6.414869763645046e-06, "loss": 0.028140679001808167, "memory(GiB)": 21.48, "step": 13491, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.958664 }, { "epoch": 0.4382938634960855, "grad_norm": 0.5186380743980408, "learning_rate": 6.414354557279336e-06, "loss": 0.03217015415430069, "memory(GiB)": 21.48, "step": 13492, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.958675 }, { "epoch": 0.43832634895884093, "grad_norm": 0.40520626306533813, "learning_rate": 6.4138393345900915e-06, "loss": 0.031954776495695114, "memory(GiB)": 21.48, "step": 13493, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958686 }, { "epoch": 0.43835883442159634, "grad_norm": 0.33213478326797485, "learning_rate": 6.41332409558326e-06, "loss": 0.02090728096663952, "memory(GiB)": 21.48, "step": 13494, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958696 }, { "epoch": 0.43839131988435176, "grad_norm": 0.25774112343788147, "learning_rate": 6.412808840264786e-06, "loss": 0.015371862798929214, "memory(GiB)": 21.48, "step": 13495, "token_acc": 1.0, "train_speed(iter/s)": 0.958706 }, { "epoch": 0.4384238053471072, "grad_norm": 0.4679638743400574, "learning_rate": 6.412293568640618e-06, "loss": 0.030570026487112045, "memory(GiB)": 21.48, "step": 13496, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.958716 }, { "epoch": 0.4384562908098626, "grad_norm": 0.27171409130096436, "learning_rate": 6.411778280716702e-06, "loss": 0.02099098637700081, "memory(GiB)": 21.48, "step": 13497, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.958726 }, { "epoch": 0.438488776272618, "grad_norm": 0.37203842401504517, "learning_rate": 6.4112629764989866e-06, "loss": 0.026092439889907837, "memory(GiB)": 21.48, "step": 13498, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958736 }, { "epoch": 0.4385212617353734, "grad_norm": 0.3416745662689209, "learning_rate": 6.410747655993416e-06, "loss": 0.024346347898244858, "memory(GiB)": 21.48, "step": 13499, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.958745 }, { "epoch": 0.43855374719812884, "grad_norm": 0.35466310381889343, "learning_rate": 6.410232319205943e-06, "loss": 0.02849484235048294, "memory(GiB)": 21.48, "step": 13500, "token_acc": 0.992, "train_speed(iter/s)": 0.958754 }, { "epoch": 0.43855374719812884, "eval_loss": 0.023951232433319092, "eval_runtime": 80.3781, "eval_samples_per_second": 123.79, "eval_steps_per_second": 3.869, "eval_token_acc": 0.9905648882733814, "step": 13500 }, { "epoch": 0.43858623266088426, "grad_norm": 0.29423612356185913, "learning_rate": 6.40971696614251e-06, "loss": 0.02519233524799347, "memory(GiB)": 21.48, "step": 13501, "token_acc": 0.9903126798388644, "train_speed(iter/s)": 0.952494 }, { "epoch": 0.4386187181236397, "grad_norm": 0.3839685022830963, "learning_rate": 6.409201596809066e-06, "loss": 0.020538335666060448, "memory(GiB)": 21.48, "step": 13502, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.952503 }, { "epoch": 0.4386512035863951, "grad_norm": 0.3407670557498932, "learning_rate": 6.408686211211561e-06, "loss": 0.023566750809550285, "memory(GiB)": 21.48, "step": 13503, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.952514 }, { "epoch": 0.4386836890491505, "grad_norm": 0.350326269865036, "learning_rate": 6.408170809355942e-06, "loss": 0.02502310648560524, "memory(GiB)": 21.48, "step": 13504, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.952524 }, { "epoch": 0.4387161745119059, "grad_norm": 0.4206404387950897, "learning_rate": 6.407655391248158e-06, "loss": 0.019322428852319717, "memory(GiB)": 21.48, "step": 13505, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.952534 }, { "epoch": 0.43874865997466134, "grad_norm": 0.30935898423194885, "learning_rate": 6.407139956894157e-06, "loss": 0.016874166205525398, "memory(GiB)": 21.48, "step": 13506, "token_acc": 1.0, "train_speed(iter/s)": 0.952545 }, { "epoch": 0.43878114543741675, "grad_norm": 0.3115125596523285, "learning_rate": 6.4066245062998865e-06, "loss": 0.02566574141383171, "memory(GiB)": 21.48, "step": 13507, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.952556 }, { "epoch": 0.43881363090017217, "grad_norm": 0.4118051826953888, "learning_rate": 6.4061090394713e-06, "loss": 0.027422506362199783, "memory(GiB)": 21.48, "step": 13508, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.952568 }, { "epoch": 0.4388461163629276, "grad_norm": 0.45138928294181824, "learning_rate": 6.4055935564143415e-06, "loss": 0.024363286793231964, "memory(GiB)": 21.48, "step": 13509, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.952579 }, { "epoch": 0.438878601825683, "grad_norm": 0.3323698341846466, "learning_rate": 6.405078057134964e-06, "loss": 0.018994368612766266, "memory(GiB)": 21.48, "step": 13510, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.95259 }, { "epoch": 0.4389110872884384, "grad_norm": 0.35962533950805664, "learning_rate": 6.4045625416391135e-06, "loss": 0.025573108345270157, "memory(GiB)": 21.48, "step": 13511, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.952601 }, { "epoch": 0.43894357275119383, "grad_norm": 0.2999809980392456, "learning_rate": 6.404047009932743e-06, "loss": 0.022280912846326828, "memory(GiB)": 21.48, "step": 13512, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.952612 }, { "epoch": 0.43897605821394925, "grad_norm": 0.45316746830940247, "learning_rate": 6.403531462021801e-06, "loss": 0.026195958256721497, "memory(GiB)": 21.48, "step": 13513, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.952623 }, { "epoch": 0.43900854367670467, "grad_norm": 0.3501029312610626, "learning_rate": 6.403015897912238e-06, "loss": 0.019061844795942307, "memory(GiB)": 21.48, "step": 13514, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.952633 }, { "epoch": 0.4390410291394601, "grad_norm": 0.3191632628440857, "learning_rate": 6.4025003176100055e-06, "loss": 0.020442213863134384, "memory(GiB)": 21.48, "step": 13515, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.952645 }, { "epoch": 0.4390735146022155, "grad_norm": 0.3491249978542328, "learning_rate": 6.401984721121051e-06, "loss": 0.022106219083070755, "memory(GiB)": 21.48, "step": 13516, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.952657 }, { "epoch": 0.4391060000649709, "grad_norm": 0.38438770174980164, "learning_rate": 6.401469108451326e-06, "loss": 0.022953543812036514, "memory(GiB)": 21.48, "step": 13517, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.952673 }, { "epoch": 0.43913848552772633, "grad_norm": 0.2909771203994751, "learning_rate": 6.400953479606783e-06, "loss": 0.016071096062660217, "memory(GiB)": 21.48, "step": 13518, "token_acc": 1.0, "train_speed(iter/s)": 0.952688 }, { "epoch": 0.43917097099048175, "grad_norm": 0.45512086153030396, "learning_rate": 6.400437834593372e-06, "loss": 0.017740104347467422, "memory(GiB)": 21.48, "step": 13519, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.952702 }, { "epoch": 0.43920345645323716, "grad_norm": 0.4095018804073334, "learning_rate": 6.3999221734170445e-06, "loss": 0.024667685851454735, "memory(GiB)": 21.48, "step": 13520, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.952715 }, { "epoch": 0.4392359419159926, "grad_norm": 0.2940453886985779, "learning_rate": 6.399406496083752e-06, "loss": 0.014615142717957497, "memory(GiB)": 21.48, "step": 13521, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.952731 }, { "epoch": 0.439268427378748, "grad_norm": 0.32332703471183777, "learning_rate": 6.398890802599447e-06, "loss": 0.01611400954425335, "memory(GiB)": 21.48, "step": 13522, "token_acc": 1.0, "train_speed(iter/s)": 0.952745 }, { "epoch": 0.4393009128415034, "grad_norm": 0.45222073793411255, "learning_rate": 6.398375092970078e-06, "loss": 0.021344834938645363, "memory(GiB)": 21.48, "step": 13523, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.95276 }, { "epoch": 0.4393333983042588, "grad_norm": 0.4213676154613495, "learning_rate": 6.397859367201601e-06, "loss": 0.026901040226221085, "memory(GiB)": 21.48, "step": 13524, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.952776 }, { "epoch": 0.43936588376701424, "grad_norm": 0.4350224435329437, "learning_rate": 6.397343625299966e-06, "loss": 0.019291147589683533, "memory(GiB)": 21.48, "step": 13525, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.952791 }, { "epoch": 0.43939836922976966, "grad_norm": 0.5377951860427856, "learning_rate": 6.396827867271125e-06, "loss": 0.024132732301950455, "memory(GiB)": 21.48, "step": 13526, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.952807 }, { "epoch": 0.4394308546925251, "grad_norm": 0.4261499047279358, "learning_rate": 6.396312093121031e-06, "loss": 0.02101278491318226, "memory(GiB)": 21.48, "step": 13527, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.952822 }, { "epoch": 0.4394633401552805, "grad_norm": 0.4051751494407654, "learning_rate": 6.395796302855639e-06, "loss": 0.02453288435935974, "memory(GiB)": 21.48, "step": 13528, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.952833 }, { "epoch": 0.4394958256180359, "grad_norm": 0.38081663846969604, "learning_rate": 6.395280496480899e-06, "loss": 0.019819241017103195, "memory(GiB)": 21.48, "step": 13529, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.952848 }, { "epoch": 0.4395283110807913, "grad_norm": 0.4164372384548187, "learning_rate": 6.394764674002764e-06, "loss": 0.02546970173716545, "memory(GiB)": 21.48, "step": 13530, "token_acc": 0.984375, "train_speed(iter/s)": 0.952863 }, { "epoch": 0.43956079654354674, "grad_norm": 0.5646742582321167, "learning_rate": 6.394248835427189e-06, "loss": 0.02409910038113594, "memory(GiB)": 21.48, "step": 13531, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.952878 }, { "epoch": 0.43959328200630216, "grad_norm": 0.3991362452507019, "learning_rate": 6.393732980760126e-06, "loss": 0.027867600321769714, "memory(GiB)": 21.48, "step": 13532, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.952892 }, { "epoch": 0.43962576746905757, "grad_norm": 0.459987074136734, "learning_rate": 6.393217110007531e-06, "loss": 0.02318432554602623, "memory(GiB)": 21.48, "step": 13533, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.952907 }, { "epoch": 0.439658252931813, "grad_norm": 0.3615202307701111, "learning_rate": 6.392701223175354e-06, "loss": 0.0217878594994545, "memory(GiB)": 21.48, "step": 13534, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.952922 }, { "epoch": 0.4396907383945684, "grad_norm": 0.30148953199386597, "learning_rate": 6.392185320269553e-06, "loss": 0.01938660442829132, "memory(GiB)": 21.48, "step": 13535, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.952937 }, { "epoch": 0.4397232238573239, "grad_norm": 0.34828221797943115, "learning_rate": 6.391669401296077e-06, "loss": 0.021854951977729797, "memory(GiB)": 21.48, "step": 13536, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.952952 }, { "epoch": 0.4397557093200793, "grad_norm": 0.3307349681854248, "learning_rate": 6.3911534662608864e-06, "loss": 0.021919146180152893, "memory(GiB)": 21.48, "step": 13537, "token_acc": 1.0, "train_speed(iter/s)": 0.952966 }, { "epoch": 0.4397881947828347, "grad_norm": 0.34727218747138977, "learning_rate": 6.390637515169934e-06, "loss": 0.022205576300621033, "memory(GiB)": 21.48, "step": 13538, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.952977 }, { "epoch": 0.4398206802455901, "grad_norm": 0.3079896867275238, "learning_rate": 6.3901215480291725e-06, "loss": 0.027641162276268005, "memory(GiB)": 21.48, "step": 13539, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.952989 }, { "epoch": 0.43985316570834554, "grad_norm": 0.39629918336868286, "learning_rate": 6.389605564844559e-06, "loss": 0.02901947684586048, "memory(GiB)": 21.48, "step": 13540, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.953001 }, { "epoch": 0.43988565117110096, "grad_norm": 0.4148111343383789, "learning_rate": 6.389089565622046e-06, "loss": 0.019697539508342743, "memory(GiB)": 21.48, "step": 13541, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.953013 }, { "epoch": 0.43991813663385637, "grad_norm": 0.34161680936813354, "learning_rate": 6.388573550367593e-06, "loss": 0.02440236136317253, "memory(GiB)": 21.48, "step": 13542, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953026 }, { "epoch": 0.4399506220966118, "grad_norm": 0.28288963437080383, "learning_rate": 6.388057519087152e-06, "loss": 0.02039838209748268, "memory(GiB)": 21.48, "step": 13543, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953037 }, { "epoch": 0.4399831075593672, "grad_norm": 0.32352176308631897, "learning_rate": 6.387541471786679e-06, "loss": 0.027999067679047585, "memory(GiB)": 21.48, "step": 13544, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953048 }, { "epoch": 0.4400155930221226, "grad_norm": 0.3116534352302551, "learning_rate": 6.387025408472132e-06, "loss": 0.020007159560918808, "memory(GiB)": 21.48, "step": 13545, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953049 }, { "epoch": 0.44004807848487804, "grad_norm": 0.35609903931617737, "learning_rate": 6.386509329149464e-06, "loss": 0.023232080042362213, "memory(GiB)": 21.48, "step": 13546, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953061 }, { "epoch": 0.44008056394763345, "grad_norm": 0.43065449595451355, "learning_rate": 6.385993233824635e-06, "loss": 0.024270717054605484, "memory(GiB)": 21.48, "step": 13547, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953072 }, { "epoch": 0.44011304941038887, "grad_norm": 0.42189380526542664, "learning_rate": 6.385477122503596e-06, "loss": 0.03031269460916519, "memory(GiB)": 21.48, "step": 13548, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953084 }, { "epoch": 0.4401455348731443, "grad_norm": 0.5078397989273071, "learning_rate": 6.3849609951923094e-06, "loss": 0.024500226601958275, "memory(GiB)": 21.48, "step": 13549, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.953095 }, { "epoch": 0.4401780203358997, "grad_norm": 0.8745116591453552, "learning_rate": 6.38444485189673e-06, "loss": 0.03214838728308678, "memory(GiB)": 21.48, "step": 13550, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.953106 }, { "epoch": 0.4402105057986551, "grad_norm": 0.4388848543167114, "learning_rate": 6.383928692622812e-06, "loss": 0.0368836373090744, "memory(GiB)": 21.48, "step": 13551, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.953118 }, { "epoch": 0.44024299126141053, "grad_norm": 0.3096361458301544, "learning_rate": 6.383412517376517e-06, "loss": 0.02617114782333374, "memory(GiB)": 21.48, "step": 13552, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.95313 }, { "epoch": 0.44027547672416595, "grad_norm": 0.47549930214881897, "learning_rate": 6.382896326163799e-06, "loss": 0.024059653282165527, "memory(GiB)": 21.48, "step": 13553, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.953141 }, { "epoch": 0.44030796218692136, "grad_norm": 0.32379668951034546, "learning_rate": 6.382380118990618e-06, "loss": 0.032001957297325134, "memory(GiB)": 21.48, "step": 13554, "token_acc": 0.984375, "train_speed(iter/s)": 0.953153 }, { "epoch": 0.4403404476496768, "grad_norm": 0.32742205262184143, "learning_rate": 6.381863895862929e-06, "loss": 0.02524305321276188, "memory(GiB)": 21.48, "step": 13555, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953164 }, { "epoch": 0.4403729331124322, "grad_norm": 0.2700381278991699, "learning_rate": 6.3813476567866915e-06, "loss": 0.020803656429052353, "memory(GiB)": 21.48, "step": 13556, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953177 }, { "epoch": 0.4404054185751876, "grad_norm": 0.3342507779598236, "learning_rate": 6.380831401767865e-06, "loss": 0.027329150587320328, "memory(GiB)": 21.48, "step": 13557, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.953187 }, { "epoch": 0.44043790403794303, "grad_norm": 0.6576143503189087, "learning_rate": 6.3803151308124056e-06, "loss": 0.0362454429268837, "memory(GiB)": 21.48, "step": 13558, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.953195 }, { "epoch": 0.44047038950069844, "grad_norm": 0.5636878609657288, "learning_rate": 6.3797988439262725e-06, "loss": 0.03296118974685669, "memory(GiB)": 21.48, "step": 13559, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.953205 }, { "epoch": 0.44050287496345386, "grad_norm": 0.4637271761894226, "learning_rate": 6.379282541115424e-06, "loss": 0.029031001031398773, "memory(GiB)": 21.48, "step": 13560, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.953215 }, { "epoch": 0.4405353604262093, "grad_norm": 0.8275464177131653, "learning_rate": 6.378766222385818e-06, "loss": 0.02912018448114395, "memory(GiB)": 21.48, "step": 13561, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.953223 }, { "epoch": 0.4405678458889647, "grad_norm": 0.4974484145641327, "learning_rate": 6.378249887743415e-06, "loss": 0.030373606830835342, "memory(GiB)": 21.48, "step": 13562, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.953234 }, { "epoch": 0.4406003313517201, "grad_norm": 0.39991602301597595, "learning_rate": 6.377733537194176e-06, "loss": 0.02619684860110283, "memory(GiB)": 21.48, "step": 13563, "token_acc": 1.0, "train_speed(iter/s)": 0.953244 }, { "epoch": 0.4406328168144755, "grad_norm": 0.27929815649986267, "learning_rate": 6.3772171707440565e-06, "loss": 0.024071943014860153, "memory(GiB)": 21.48, "step": 13564, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.953254 }, { "epoch": 0.44066530227723094, "grad_norm": 0.4411616623401642, "learning_rate": 6.376700788399019e-06, "loss": 0.02540920116007328, "memory(GiB)": 21.48, "step": 13565, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.953264 }, { "epoch": 0.44069778773998636, "grad_norm": 0.4505937099456787, "learning_rate": 6.376184390165021e-06, "loss": 0.02759138122200966, "memory(GiB)": 21.48, "step": 13566, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.953273 }, { "epoch": 0.4407302732027418, "grad_norm": 0.49362996220588684, "learning_rate": 6.375667976048021e-06, "loss": 0.025878705084323883, "memory(GiB)": 21.48, "step": 13567, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.953283 }, { "epoch": 0.4407627586654972, "grad_norm": 0.48780539631843567, "learning_rate": 6.3751515460539845e-06, "loss": 0.029624171555042267, "memory(GiB)": 21.48, "step": 13568, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.953293 }, { "epoch": 0.4407952441282526, "grad_norm": 0.3326933979988098, "learning_rate": 6.374635100188869e-06, "loss": 0.02534192055463791, "memory(GiB)": 21.48, "step": 13569, "token_acc": 0.9834983498349835, "train_speed(iter/s)": 0.953304 }, { "epoch": 0.440827729591008, "grad_norm": 0.27277830243110657, "learning_rate": 6.374118638458634e-06, "loss": 0.02601241134107113, "memory(GiB)": 21.48, "step": 13570, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.953315 }, { "epoch": 0.44086021505376344, "grad_norm": 0.30932947993278503, "learning_rate": 6.373602160869241e-06, "loss": 0.028576241806149483, "memory(GiB)": 21.48, "step": 13571, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.953301 }, { "epoch": 0.44089270051651885, "grad_norm": 0.3929576575756073, "learning_rate": 6.37308566742665e-06, "loss": 0.03460502624511719, "memory(GiB)": 21.48, "step": 13572, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.953313 }, { "epoch": 0.44092518597927427, "grad_norm": 0.32971712946891785, "learning_rate": 6.372569158136825e-06, "loss": 0.01892939954996109, "memory(GiB)": 21.48, "step": 13573, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953323 }, { "epoch": 0.4409576714420297, "grad_norm": 0.4148705303668976, "learning_rate": 6.372052633005723e-06, "loss": 0.030977051705121994, "memory(GiB)": 21.48, "step": 13574, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.953334 }, { "epoch": 0.4409901569047851, "grad_norm": 0.33203545212745667, "learning_rate": 6.37153609203931e-06, "loss": 0.03245927393436432, "memory(GiB)": 21.48, "step": 13575, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953347 }, { "epoch": 0.4410226423675405, "grad_norm": 0.30509495735168457, "learning_rate": 6.371019535243542e-06, "loss": 0.03025009110569954, "memory(GiB)": 21.48, "step": 13576, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.95336 }, { "epoch": 0.44105512783029593, "grad_norm": 0.2854350209236145, "learning_rate": 6.370502962624385e-06, "loss": 0.022780831903219223, "memory(GiB)": 21.48, "step": 13577, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.953376 }, { "epoch": 0.44108761329305135, "grad_norm": 0.36527034640312195, "learning_rate": 6.3699863741878e-06, "loss": 0.021189222112298012, "memory(GiB)": 21.48, "step": 13578, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.953391 }, { "epoch": 0.44112009875580677, "grad_norm": 0.34618401527404785, "learning_rate": 6.369469769939749e-06, "loss": 0.02605324611067772, "memory(GiB)": 21.48, "step": 13579, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953407 }, { "epoch": 0.4411525842185622, "grad_norm": 0.42012786865234375, "learning_rate": 6.3689531498861935e-06, "loss": 0.020402174443006516, "memory(GiB)": 21.48, "step": 13580, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.953422 }, { "epoch": 0.4411850696813176, "grad_norm": 0.6116495728492737, "learning_rate": 6.368436514033096e-06, "loss": 0.023355286568403244, "memory(GiB)": 21.48, "step": 13581, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953438 }, { "epoch": 0.441217555144073, "grad_norm": 0.3929165303707123, "learning_rate": 6.3679198623864206e-06, "loss": 0.030444318428635597, "memory(GiB)": 21.48, "step": 13582, "token_acc": 1.0, "train_speed(iter/s)": 0.953452 }, { "epoch": 0.44125004060682843, "grad_norm": 0.39126428961753845, "learning_rate": 6.3674031949521275e-06, "loss": 0.032845351845026016, "memory(GiB)": 21.48, "step": 13583, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.953466 }, { "epoch": 0.44128252606958385, "grad_norm": 0.32047006487846375, "learning_rate": 6.366886511736183e-06, "loss": 0.021843578666448593, "memory(GiB)": 21.48, "step": 13584, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.953482 }, { "epoch": 0.44131501153233926, "grad_norm": 0.291929692029953, "learning_rate": 6.3663698127445495e-06, "loss": 0.027912896126508713, "memory(GiB)": 21.48, "step": 13585, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.953498 }, { "epoch": 0.4413474969950947, "grad_norm": 0.3711908757686615, "learning_rate": 6.365853097983189e-06, "loss": 0.025516994297504425, "memory(GiB)": 21.48, "step": 13586, "token_acc": 0.9781659388646288, "train_speed(iter/s)": 0.953513 }, { "epoch": 0.4413799824578501, "grad_norm": 0.40607669949531555, "learning_rate": 6.365336367458067e-06, "loss": 0.021796897053718567, "memory(GiB)": 21.48, "step": 13587, "token_acc": 0.9759450171821306, "train_speed(iter/s)": 0.953527 }, { "epoch": 0.4414124679206055, "grad_norm": 0.32179152965545654, "learning_rate": 6.364819621175145e-06, "loss": 0.02001246064901352, "memory(GiB)": 21.48, "step": 13588, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.953542 }, { "epoch": 0.4414449533833609, "grad_norm": 0.2831164300441742, "learning_rate": 6.364302859140388e-06, "loss": 0.02663853019475937, "memory(GiB)": 21.48, "step": 13589, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.953556 }, { "epoch": 0.44147743884611634, "grad_norm": 0.32535654306411743, "learning_rate": 6.363786081359759e-06, "loss": 0.022228289395570755, "memory(GiB)": 21.48, "step": 13590, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95357 }, { "epoch": 0.44150992430887176, "grad_norm": 0.485464870929718, "learning_rate": 6.363269287839226e-06, "loss": 0.030365601181983948, "memory(GiB)": 21.48, "step": 13591, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.953585 }, { "epoch": 0.4415424097716272, "grad_norm": 0.29117873311042786, "learning_rate": 6.3627524785847484e-06, "loss": 0.016733501106500626, "memory(GiB)": 21.48, "step": 13592, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.9536 }, { "epoch": 0.4415748952343826, "grad_norm": 0.44646599888801575, "learning_rate": 6.362235653602294e-06, "loss": 0.03217260167002678, "memory(GiB)": 21.48, "step": 13593, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.953615 }, { "epoch": 0.441607380697138, "grad_norm": 0.3184635043144226, "learning_rate": 6.361718812897828e-06, "loss": 0.024948284029960632, "memory(GiB)": 21.48, "step": 13594, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.4416398661598934, "grad_norm": 0.34484726190567017, "learning_rate": 6.361201956477314e-06, "loss": 0.03186536580324173, "memory(GiB)": 21.48, "step": 13595, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.953645 }, { "epoch": 0.44167235162264884, "grad_norm": 0.24131788313388824, "learning_rate": 6.360685084346718e-06, "loss": 0.01969539374113083, "memory(GiB)": 21.48, "step": 13596, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95366 }, { "epoch": 0.44170483708540426, "grad_norm": 0.31771889328956604, "learning_rate": 6.360168196512004e-06, "loss": 0.019512850791215897, "memory(GiB)": 21.48, "step": 13597, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953676 }, { "epoch": 0.44173732254815967, "grad_norm": 0.38318219780921936, "learning_rate": 6.359651292979139e-06, "loss": 0.027336714789271355, "memory(GiB)": 21.48, "step": 13598, "token_acc": 1.0, "train_speed(iter/s)": 0.95369 }, { "epoch": 0.4417698080109151, "grad_norm": 0.4894682466983795, "learning_rate": 6.359134373754088e-06, "loss": 0.03033429943025112, "memory(GiB)": 21.48, "step": 13599, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953704 }, { "epoch": 0.44180229347367056, "grad_norm": 0.5618480443954468, "learning_rate": 6.358617438842818e-06, "loss": 0.022965271025896072, "memory(GiB)": 21.48, "step": 13600, "token_acc": 1.0, "train_speed(iter/s)": 0.953719 }, { "epoch": 0.441834778936426, "grad_norm": 0.29724252223968506, "learning_rate": 6.3581004882512954e-06, "loss": 0.01822584867477417, "memory(GiB)": 21.48, "step": 13601, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953733 }, { "epoch": 0.4418672643991814, "grad_norm": 0.4023216962814331, "learning_rate": 6.3575835219854845e-06, "loss": 0.024962715804576874, "memory(GiB)": 21.48, "step": 13602, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.953748 }, { "epoch": 0.4418997498619368, "grad_norm": 0.28480854630470276, "learning_rate": 6.357066540051353e-06, "loss": 0.019488349556922913, "memory(GiB)": 21.48, "step": 13603, "token_acc": 1.0, "train_speed(iter/s)": 0.953763 }, { "epoch": 0.4419322353246922, "grad_norm": 0.3688678741455078, "learning_rate": 6.356549542454866e-06, "loss": 0.028925083577632904, "memory(GiB)": 21.48, "step": 13604, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953776 }, { "epoch": 0.44196472078744764, "grad_norm": 0.3742721378803253, "learning_rate": 6.356032529201994e-06, "loss": 0.03244036063551903, "memory(GiB)": 21.48, "step": 13605, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.953788 }, { "epoch": 0.44199720625020306, "grad_norm": 0.36076653003692627, "learning_rate": 6.3555155002987005e-06, "loss": 0.020669393241405487, "memory(GiB)": 21.48, "step": 13606, "token_acc": 0.996, "train_speed(iter/s)": 0.9538 }, { "epoch": 0.4420296917129585, "grad_norm": 0.33800366520881653, "learning_rate": 6.354998455750953e-06, "loss": 0.018522042781114578, "memory(GiB)": 21.48, "step": 13607, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.953813 }, { "epoch": 0.4420621771757139, "grad_norm": 0.28667134046554565, "learning_rate": 6.354481395564721e-06, "loss": 0.018486661836504936, "memory(GiB)": 21.48, "step": 13608, "token_acc": 0.9966442953020134, "train_speed(iter/s)": 0.953826 }, { "epoch": 0.4420946626384693, "grad_norm": 0.3352738320827484, "learning_rate": 6.353964319745972e-06, "loss": 0.031076956540346146, "memory(GiB)": 21.48, "step": 13609, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.953838 }, { "epoch": 0.4421271481012247, "grad_norm": 0.34324154257774353, "learning_rate": 6.353447228300672e-06, "loss": 0.024741526693105698, "memory(GiB)": 21.48, "step": 13610, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.953851 }, { "epoch": 0.44215963356398014, "grad_norm": 0.49065378308296204, "learning_rate": 6.352930121234788e-06, "loss": 0.03252806514501572, "memory(GiB)": 21.48, "step": 13611, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953863 }, { "epoch": 0.44219211902673555, "grad_norm": 0.5248473286628723, "learning_rate": 6.352412998554292e-06, "loss": 0.019837457686662674, "memory(GiB)": 21.48, "step": 13612, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953875 }, { "epoch": 0.44222460448949097, "grad_norm": 0.48911425471305847, "learning_rate": 6.351895860265147e-06, "loss": 0.026805192232131958, "memory(GiB)": 21.48, "step": 13613, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953888 }, { "epoch": 0.4422570899522464, "grad_norm": 0.3940601050853729, "learning_rate": 6.351378706373326e-06, "loss": 0.03545372933149338, "memory(GiB)": 21.48, "step": 13614, "token_acc": 0.98, "train_speed(iter/s)": 0.953899 }, { "epoch": 0.4422895754150018, "grad_norm": 0.4178265929222107, "learning_rate": 6.350861536884796e-06, "loss": 0.026606222614645958, "memory(GiB)": 21.48, "step": 13615, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.4423220608777572, "grad_norm": 0.30703413486480713, "learning_rate": 6.350344351805526e-06, "loss": 0.01829393580555916, "memory(GiB)": 21.48, "step": 13616, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.44235454634051263, "grad_norm": 0.41910842061042786, "learning_rate": 6.3498271511414855e-06, "loss": 0.02082574926316738, "memory(GiB)": 21.48, "step": 13617, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.953932 }, { "epoch": 0.44238703180326805, "grad_norm": 0.34279680252075195, "learning_rate": 6.349309934898642e-06, "loss": 0.021569818258285522, "memory(GiB)": 21.48, "step": 13618, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953942 }, { "epoch": 0.44241951726602347, "grad_norm": 0.4906371831893921, "learning_rate": 6.3487927030829664e-06, "loss": 0.03405900299549103, "memory(GiB)": 21.48, "step": 13619, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953951 }, { "epoch": 0.4424520027287789, "grad_norm": 1.0635008811950684, "learning_rate": 6.348275455700427e-06, "loss": 0.02583301067352295, "memory(GiB)": 21.48, "step": 13620, "token_acc": 1.0, "train_speed(iter/s)": 0.953959 }, { "epoch": 0.4424844881915343, "grad_norm": 0.4929479658603668, "learning_rate": 6.3477581927569954e-06, "loss": 0.02799525298178196, "memory(GiB)": 21.48, "step": 13621, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.953969 }, { "epoch": 0.4425169736542897, "grad_norm": 0.34450197219848633, "learning_rate": 6.347240914258639e-06, "loss": 0.021747101098299026, "memory(GiB)": 21.48, "step": 13622, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953979 }, { "epoch": 0.44254945911704513, "grad_norm": 0.34844455122947693, "learning_rate": 6.346723620211332e-06, "loss": 0.024727357551455498, "memory(GiB)": 21.48, "step": 13623, "token_acc": 1.0, "train_speed(iter/s)": 0.953988 }, { "epoch": 0.44258194457980055, "grad_norm": 0.3576897978782654, "learning_rate": 6.346206310621039e-06, "loss": 0.01839711330831051, "memory(GiB)": 21.48, "step": 13624, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.953997 }, { "epoch": 0.44261443004255596, "grad_norm": 0.43248873949050903, "learning_rate": 6.345688985493734e-06, "loss": 0.025410253554582596, "memory(GiB)": 21.48, "step": 13625, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954006 }, { "epoch": 0.4426469155053114, "grad_norm": 0.47686612606048584, "learning_rate": 6.345171644835389e-06, "loss": 0.02054157480597496, "memory(GiB)": 21.48, "step": 13626, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.954016 }, { "epoch": 0.4426794009680668, "grad_norm": 0.4082483947277069, "learning_rate": 6.34465428865197e-06, "loss": 0.01775859296321869, "memory(GiB)": 21.48, "step": 13627, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.954026 }, { "epoch": 0.4427118864308222, "grad_norm": 0.6368843913078308, "learning_rate": 6.344136916949452e-06, "loss": 0.03329205513000488, "memory(GiB)": 21.48, "step": 13628, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.954033 }, { "epoch": 0.4427443718935776, "grad_norm": 0.596173882484436, "learning_rate": 6.3436195297338055e-06, "loss": 0.02954680286347866, "memory(GiB)": 21.48, "step": 13629, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.954045 }, { "epoch": 0.44277685735633304, "grad_norm": 0.712682843208313, "learning_rate": 6.3431021270110005e-06, "loss": 0.027204232290387154, "memory(GiB)": 21.48, "step": 13630, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954055 }, { "epoch": 0.44280934281908846, "grad_norm": 0.5045388340950012, "learning_rate": 6.34258470878701e-06, "loss": 0.027987919747829437, "memory(GiB)": 21.48, "step": 13631, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.954068 }, { "epoch": 0.4428418282818439, "grad_norm": 0.34176886081695557, "learning_rate": 6.342067275067805e-06, "loss": 0.023384232074022293, "memory(GiB)": 21.48, "step": 13632, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95408 }, { "epoch": 0.4428743137445993, "grad_norm": 0.36039236187934875, "learning_rate": 6.341549825859356e-06, "loss": 0.021596945822238922, "memory(GiB)": 21.48, "step": 13633, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.954092 }, { "epoch": 0.4429067992073547, "grad_norm": 0.33927902579307556, "learning_rate": 6.341032361167637e-06, "loss": 0.027775567024946213, "memory(GiB)": 21.48, "step": 13634, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954103 }, { "epoch": 0.4429392846701101, "grad_norm": 0.4160305857658386, "learning_rate": 6.340514880998621e-06, "loss": 0.020596720278263092, "memory(GiB)": 21.48, "step": 13635, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.954118 }, { "epoch": 0.44297177013286554, "grad_norm": 0.36365824937820435, "learning_rate": 6.339997385358278e-06, "loss": 0.021395854651927948, "memory(GiB)": 21.48, "step": 13636, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954133 }, { "epoch": 0.44300425559562095, "grad_norm": 0.5431442260742188, "learning_rate": 6.339479874252582e-06, "loss": 0.02343040145933628, "memory(GiB)": 21.48, "step": 13637, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.954149 }, { "epoch": 0.44303674105837637, "grad_norm": 0.5130015015602112, "learning_rate": 6.338962347687505e-06, "loss": 0.032841697335243225, "memory(GiB)": 21.48, "step": 13638, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.954164 }, { "epoch": 0.4430692265211318, "grad_norm": 0.4771840274333954, "learning_rate": 6.33844480566902e-06, "loss": 0.024329660460352898, "memory(GiB)": 21.48, "step": 13639, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95418 }, { "epoch": 0.4431017119838872, "grad_norm": 0.4018351435661316, "learning_rate": 6.337927248203102e-06, "loss": 0.030946919694542885, "memory(GiB)": 21.48, "step": 13640, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954195 }, { "epoch": 0.4431341974466426, "grad_norm": 0.2671767473220825, "learning_rate": 6.3374096752957205e-06, "loss": 0.013517621904611588, "memory(GiB)": 21.48, "step": 13641, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954209 }, { "epoch": 0.44316668290939804, "grad_norm": 0.6864758133888245, "learning_rate": 6.3368920869528525e-06, "loss": 0.022896915674209595, "memory(GiB)": 21.48, "step": 13642, "token_acc": 1.0, "train_speed(iter/s)": 0.954224 }, { "epoch": 0.44319916837215345, "grad_norm": 0.4846312403678894, "learning_rate": 6.33637448318047e-06, "loss": 0.02278098836541176, "memory(GiB)": 21.48, "step": 13643, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.954239 }, { "epoch": 0.44323165383490887, "grad_norm": 0.689052939414978, "learning_rate": 6.3358568639845476e-06, "loss": 0.031055711209774017, "memory(GiB)": 21.48, "step": 13644, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954253 }, { "epoch": 0.4432641392976643, "grad_norm": 0.4264217019081116, "learning_rate": 6.335339229371061e-06, "loss": 0.02226363494992256, "memory(GiB)": 21.48, "step": 13645, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954268 }, { "epoch": 0.4432966247604197, "grad_norm": 0.42371276021003723, "learning_rate": 6.334821579345978e-06, "loss": 0.024561237543821335, "memory(GiB)": 21.48, "step": 13646, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.954283 }, { "epoch": 0.4433291102231751, "grad_norm": 0.4107881784439087, "learning_rate": 6.334303913915281e-06, "loss": 0.025440586730837822, "memory(GiB)": 21.48, "step": 13647, "token_acc": 1.0, "train_speed(iter/s)": 0.954298 }, { "epoch": 0.44336159568593053, "grad_norm": 0.32793059945106506, "learning_rate": 6.3337862330849385e-06, "loss": 0.017923399806022644, "memory(GiB)": 21.48, "step": 13648, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954312 }, { "epoch": 0.44339408114868595, "grad_norm": 0.43851882219314575, "learning_rate": 6.33326853686093e-06, "loss": 0.03126879781484604, "memory(GiB)": 21.48, "step": 13649, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.954327 }, { "epoch": 0.44342656661144136, "grad_norm": 0.33508405089378357, "learning_rate": 6.3327508252492254e-06, "loss": 0.021526243537664413, "memory(GiB)": 21.48, "step": 13650, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.4434590520741968, "grad_norm": 0.3569345474243164, "learning_rate": 6.3322330982558045e-06, "loss": 0.026708979159593582, "memory(GiB)": 21.48, "step": 13651, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954357 }, { "epoch": 0.4434915375369522, "grad_norm": 0.3687722682952881, "learning_rate": 6.331715355886639e-06, "loss": 0.021418500691652298, "memory(GiB)": 21.48, "step": 13652, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.954372 }, { "epoch": 0.4435240229997076, "grad_norm": 0.2407197654247284, "learning_rate": 6.331197598147707e-06, "loss": 0.016662463545799255, "memory(GiB)": 21.48, "step": 13653, "token_acc": 1.0, "train_speed(iter/s)": 0.954386 }, { "epoch": 0.44355650846246303, "grad_norm": 0.5516663789749146, "learning_rate": 6.330679825044983e-06, "loss": 0.03089415468275547, "memory(GiB)": 21.48, "step": 13654, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.9544 }, { "epoch": 0.44358899392521844, "grad_norm": 0.565911591053009, "learning_rate": 6.330162036584441e-06, "loss": 0.021943816915154457, "memory(GiB)": 21.48, "step": 13655, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954414 }, { "epoch": 0.44362147938797386, "grad_norm": 0.3543374538421631, "learning_rate": 6.329644232772061e-06, "loss": 0.025223873555660248, "memory(GiB)": 21.48, "step": 13656, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.954427 }, { "epoch": 0.4436539648507293, "grad_norm": 0.31547388434410095, "learning_rate": 6.329126413613814e-06, "loss": 0.017923761159181595, "memory(GiB)": 21.48, "step": 13657, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.954442 }, { "epoch": 0.4436864503134847, "grad_norm": 0.3067644238471985, "learning_rate": 6.3286085791156816e-06, "loss": 0.024138255044817924, "memory(GiB)": 21.48, "step": 13658, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954456 }, { "epoch": 0.4437189357762401, "grad_norm": 0.5225142240524292, "learning_rate": 6.328090729283636e-06, "loss": 0.025819681584835052, "memory(GiB)": 21.48, "step": 13659, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95447 }, { "epoch": 0.4437514212389955, "grad_norm": 0.4681052565574646, "learning_rate": 6.327572864123656e-06, "loss": 0.02898586541414261, "memory(GiB)": 21.48, "step": 13660, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954483 }, { "epoch": 0.44378390670175094, "grad_norm": 0.31155407428741455, "learning_rate": 6.327054983641719e-06, "loss": 0.02153761498630047, "memory(GiB)": 21.48, "step": 13661, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954498 }, { "epoch": 0.44381639216450636, "grad_norm": 0.23018322885036469, "learning_rate": 6.3265370878438005e-06, "loss": 0.017038557678461075, "memory(GiB)": 21.48, "step": 13662, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.954513 }, { "epoch": 0.4438488776272618, "grad_norm": 0.357471227645874, "learning_rate": 6.3260191767358795e-06, "loss": 0.027063651010394096, "memory(GiB)": 21.48, "step": 13663, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954528 }, { "epoch": 0.44388136309001724, "grad_norm": 0.31158435344696045, "learning_rate": 6.325501250323931e-06, "loss": 0.018559910356998444, "memory(GiB)": 21.48, "step": 13664, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954543 }, { "epoch": 0.44391384855277266, "grad_norm": 0.46939176321029663, "learning_rate": 6.324983308613934e-06, "loss": 0.018596550449728966, "memory(GiB)": 21.48, "step": 13665, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.4439463340155281, "grad_norm": 0.4082270860671997, "learning_rate": 6.324465351611867e-06, "loss": 0.02637728862464428, "memory(GiB)": 21.48, "step": 13666, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95457 }, { "epoch": 0.4439788194782835, "grad_norm": 0.3757840096950531, "learning_rate": 6.323947379323706e-06, "loss": 0.021835479885339737, "memory(GiB)": 21.48, "step": 13667, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.954585 }, { "epoch": 0.4440113049410389, "grad_norm": 0.4137439727783203, "learning_rate": 6.323429391755431e-06, "loss": 0.0291665680706501, "memory(GiB)": 21.48, "step": 13668, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.9546 }, { "epoch": 0.4440437904037943, "grad_norm": 0.3450455963611603, "learning_rate": 6.322911388913018e-06, "loss": 0.028279302641749382, "memory(GiB)": 21.48, "step": 13669, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954612 }, { "epoch": 0.44407627586654974, "grad_norm": 0.496017724275589, "learning_rate": 6.322393370802449e-06, "loss": 0.02854187600314617, "memory(GiB)": 21.48, "step": 13670, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954623 }, { "epoch": 0.44410876132930516, "grad_norm": 0.2991814613342285, "learning_rate": 6.321875337429698e-06, "loss": 0.02615295723080635, "memory(GiB)": 21.48, "step": 13671, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954634 }, { "epoch": 0.4441412467920606, "grad_norm": 0.6500625014305115, "learning_rate": 6.321357288800747e-06, "loss": 0.024851195514202118, "memory(GiB)": 21.48, "step": 13672, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954645 }, { "epoch": 0.444173732254816, "grad_norm": 0.5567157864570618, "learning_rate": 6.320839224921574e-06, "loss": 0.02947409078478813, "memory(GiB)": 21.48, "step": 13673, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.954657 }, { "epoch": 0.4442062177175714, "grad_norm": 0.35006120800971985, "learning_rate": 6.320321145798159e-06, "loss": 0.025670640170574188, "memory(GiB)": 21.48, "step": 13674, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954669 }, { "epoch": 0.4442387031803268, "grad_norm": 0.35253840684890747, "learning_rate": 6.3198030514364805e-06, "loss": 0.02353120967745781, "memory(GiB)": 21.48, "step": 13675, "token_acc": 1.0, "train_speed(iter/s)": 0.954681 }, { "epoch": 0.44427118864308224, "grad_norm": 0.45861274003982544, "learning_rate": 6.3192849418425175e-06, "loss": 0.0212697833776474, "memory(GiB)": 21.48, "step": 13676, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.954691 }, { "epoch": 0.44430367410583765, "grad_norm": 0.4671303331851959, "learning_rate": 6.3187668170222505e-06, "loss": 0.024042878299951553, "memory(GiB)": 21.48, "step": 13677, "token_acc": 0.98046875, "train_speed(iter/s)": 0.954699 }, { "epoch": 0.44433615956859307, "grad_norm": 0.8660868406295776, "learning_rate": 6.3182486769816586e-06, "loss": 0.030653871595859528, "memory(GiB)": 21.48, "step": 13678, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.954709 }, { "epoch": 0.4443686450313485, "grad_norm": 0.49130913615226746, "learning_rate": 6.3177305217267235e-06, "loss": 0.03869618847966194, "memory(GiB)": 21.48, "step": 13679, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.954719 }, { "epoch": 0.4444011304941039, "grad_norm": 0.3021448254585266, "learning_rate": 6.317212351263425e-06, "loss": 0.014907535165548325, "memory(GiB)": 21.48, "step": 13680, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.954729 }, { "epoch": 0.4444336159568593, "grad_norm": 0.3384381830692291, "learning_rate": 6.316694165597741e-06, "loss": 0.020400574430823326, "memory(GiB)": 21.48, "step": 13681, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954739 }, { "epoch": 0.44446610141961473, "grad_norm": 0.26146599650382996, "learning_rate": 6.316175964735656e-06, "loss": 0.016107413917779922, "memory(GiB)": 21.48, "step": 13682, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.44449858688237015, "grad_norm": 0.30248314142227173, "learning_rate": 6.3156577486831465e-06, "loss": 0.016596533358097076, "memory(GiB)": 21.48, "step": 13683, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.44453107234512557, "grad_norm": 0.3526412546634674, "learning_rate": 6.315139517446196e-06, "loss": 0.021308310329914093, "memory(GiB)": 21.48, "step": 13684, "token_acc": 1.0, "train_speed(iter/s)": 0.954765 }, { "epoch": 0.444563557807881, "grad_norm": 0.40257546305656433, "learning_rate": 6.314621271030784e-06, "loss": 0.021604958921670914, "memory(GiB)": 21.48, "step": 13685, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.4445960432706364, "grad_norm": 0.4303743839263916, "learning_rate": 6.3141030094428945e-06, "loss": 0.028715211898088455, "memory(GiB)": 21.48, "step": 13686, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.954786 }, { "epoch": 0.4446285287333918, "grad_norm": 0.4118092954158783, "learning_rate": 6.313584732688506e-06, "loss": 0.020046507939696312, "memory(GiB)": 21.48, "step": 13687, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.44466101419614723, "grad_norm": 0.2629191279411316, "learning_rate": 6.313066440773602e-06, "loss": 0.023029278963804245, "memory(GiB)": 21.48, "step": 13688, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954805 }, { "epoch": 0.44469349965890265, "grad_norm": 0.2639247477054596, "learning_rate": 6.312548133704164e-06, "loss": 0.016717122867703438, "memory(GiB)": 21.48, "step": 13689, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.954816 }, { "epoch": 0.44472598512165806, "grad_norm": 0.3121309280395508, "learning_rate": 6.312029811486171e-06, "loss": 0.022514836862683296, "memory(GiB)": 21.48, "step": 13690, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954827 }, { "epoch": 0.4447584705844135, "grad_norm": 0.4379447102546692, "learning_rate": 6.311511474125611e-06, "loss": 0.026697004213929176, "memory(GiB)": 21.48, "step": 13691, "token_acc": 1.0, "train_speed(iter/s)": 0.954837 }, { "epoch": 0.4447909560471689, "grad_norm": 0.4137101173400879, "learning_rate": 6.3109931216284605e-06, "loss": 0.023091468960046768, "memory(GiB)": 21.48, "step": 13692, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.954848 }, { "epoch": 0.4448234415099243, "grad_norm": 0.3436192572116852, "learning_rate": 6.310474754000706e-06, "loss": 0.021062713116407394, "memory(GiB)": 21.48, "step": 13693, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.95486 }, { "epoch": 0.4448559269726797, "grad_norm": 0.3685557544231415, "learning_rate": 6.309956371248329e-06, "loss": 0.02729714661836624, "memory(GiB)": 21.48, "step": 13694, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954871 }, { "epoch": 0.44488841243543514, "grad_norm": 0.3820147216320038, "learning_rate": 6.3094379733773105e-06, "loss": 0.021897725760936737, "memory(GiB)": 21.48, "step": 13695, "token_acc": 0.996, "train_speed(iter/s)": 0.954885 }, { "epoch": 0.44492089789819056, "grad_norm": 0.36323094367980957, "learning_rate": 6.308919560393636e-06, "loss": 0.02285757288336754, "memory(GiB)": 21.48, "step": 13696, "token_acc": 1.0, "train_speed(iter/s)": 0.954899 }, { "epoch": 0.444953383360946, "grad_norm": 0.22681795060634613, "learning_rate": 6.308401132303287e-06, "loss": 0.011789871379733086, "memory(GiB)": 21.48, "step": 13697, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.4449858688237014, "grad_norm": 0.4800310432910919, "learning_rate": 6.307882689112248e-06, "loss": 0.034611258655786514, "memory(GiB)": 21.48, "step": 13698, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95493 }, { "epoch": 0.4450183542864568, "grad_norm": 0.4497319161891937, "learning_rate": 6.3073642308265e-06, "loss": 0.02761082351207733, "memory(GiB)": 21.48, "step": 13699, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.4450508397492122, "grad_norm": 0.42445722222328186, "learning_rate": 6.306845757452031e-06, "loss": 0.024789195507764816, "memory(GiB)": 21.48, "step": 13700, "token_acc": 0.984375, "train_speed(iter/s)": 0.954959 }, { "epoch": 0.44508332521196764, "grad_norm": 0.39094483852386475, "learning_rate": 6.306327268994821e-06, "loss": 0.022559549659490585, "memory(GiB)": 21.48, "step": 13701, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.44511581067472306, "grad_norm": 0.34865570068359375, "learning_rate": 6.305808765460857e-06, "loss": 0.027597881853580475, "memory(GiB)": 21.48, "step": 13702, "token_acc": 1.0, "train_speed(iter/s)": 0.95499 }, { "epoch": 0.44514829613747847, "grad_norm": 0.4110128581523895, "learning_rate": 6.305290246856122e-06, "loss": 0.033192235976457596, "memory(GiB)": 21.48, "step": 13703, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.4451807816002339, "grad_norm": 0.2875705361366272, "learning_rate": 6.304771713186599e-06, "loss": 0.016539830714464188, "memory(GiB)": 21.48, "step": 13704, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.4452132670629893, "grad_norm": 0.3318815529346466, "learning_rate": 6.3042531644582735e-06, "loss": 0.02739972248673439, "memory(GiB)": 21.48, "step": 13705, "token_acc": 1.0, "train_speed(iter/s)": 0.955035 }, { "epoch": 0.4452457525257447, "grad_norm": 0.5182695984840393, "learning_rate": 6.3037346006771305e-06, "loss": 0.028705595061182976, "memory(GiB)": 21.48, "step": 13706, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.44527823798850014, "grad_norm": 0.3328804671764374, "learning_rate": 6.303216021849157e-06, "loss": 0.020349953323602676, "memory(GiB)": 21.48, "step": 13707, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955064 }, { "epoch": 0.44531072345125555, "grad_norm": 0.45508503913879395, "learning_rate": 6.302697427980335e-06, "loss": 0.030527563765645027, "memory(GiB)": 21.48, "step": 13708, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.44534320891401097, "grad_norm": 0.8629416227340698, "learning_rate": 6.30217881907665e-06, "loss": 0.020112480968236923, "memory(GiB)": 21.48, "step": 13709, "token_acc": 1.0, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.4453756943767664, "grad_norm": 0.2980451285839081, "learning_rate": 6.301660195144088e-06, "loss": 0.021299995481967926, "memory(GiB)": 21.48, "step": 13710, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955108 }, { "epoch": 0.4454081798395218, "grad_norm": 0.33457934856414795, "learning_rate": 6.301141556188634e-06, "loss": 0.025191541761159897, "memory(GiB)": 21.48, "step": 13711, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955122 }, { "epoch": 0.4454406653022772, "grad_norm": 0.3984859883785248, "learning_rate": 6.300622902216278e-06, "loss": 0.022093061357736588, "memory(GiB)": 21.48, "step": 13712, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955136 }, { "epoch": 0.44547315076503263, "grad_norm": 0.3410140872001648, "learning_rate": 6.300104233233e-06, "loss": 0.020288217812776566, "memory(GiB)": 21.48, "step": 13713, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955151 }, { "epoch": 0.44550563622778805, "grad_norm": 0.4453136622905731, "learning_rate": 6.29958554924479e-06, "loss": 0.02493685856461525, "memory(GiB)": 21.48, "step": 13714, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955166 }, { "epoch": 0.44553812169054346, "grad_norm": 0.3396895229816437, "learning_rate": 6.299066850257632e-06, "loss": 0.02065386436879635, "memory(GiB)": 21.48, "step": 13715, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.4455706071532989, "grad_norm": 0.3100925087928772, "learning_rate": 6.298548136277514e-06, "loss": 0.017071407288312912, "memory(GiB)": 21.48, "step": 13716, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.4456030926160543, "grad_norm": 0.38272079825401306, "learning_rate": 6.298029407310423e-06, "loss": 0.02951257862150669, "memory(GiB)": 21.48, "step": 13717, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.955209 }, { "epoch": 0.4456355780788097, "grad_norm": 0.4252939224243164, "learning_rate": 6.297510663362344e-06, "loss": 0.027294892817735672, "memory(GiB)": 21.48, "step": 13718, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955223 }, { "epoch": 0.44566806354156513, "grad_norm": 0.3220409154891968, "learning_rate": 6.296991904439265e-06, "loss": 0.020068475976586342, "memory(GiB)": 21.48, "step": 13719, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.44570054900432055, "grad_norm": 0.8643450736999512, "learning_rate": 6.296473130547173e-06, "loss": 0.03087741881608963, "memory(GiB)": 21.48, "step": 13720, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955253 }, { "epoch": 0.44573303446707596, "grad_norm": 0.39945393800735474, "learning_rate": 6.2959543416920545e-06, "loss": 0.018397726118564606, "memory(GiB)": 21.48, "step": 13721, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955269 }, { "epoch": 0.4457655199298314, "grad_norm": 0.4186706244945526, "learning_rate": 6.295435537879901e-06, "loss": 0.031010672450065613, "memory(GiB)": 21.48, "step": 13722, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955283 }, { "epoch": 0.4457980053925868, "grad_norm": 0.26940521597862244, "learning_rate": 6.294916719116694e-06, "loss": 0.020563621073961258, "memory(GiB)": 21.48, "step": 13723, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955297 }, { "epoch": 0.4458304908553422, "grad_norm": 0.36092936992645264, "learning_rate": 6.294397885408427e-06, "loss": 0.019849330186843872, "memory(GiB)": 21.48, "step": 13724, "token_acc": 1.0, "train_speed(iter/s)": 0.95531 }, { "epoch": 0.4458629763180976, "grad_norm": 0.2576819956302643, "learning_rate": 6.2938790367610834e-06, "loss": 0.019237734377384186, "memory(GiB)": 21.48, "step": 13725, "token_acc": 1.0, "train_speed(iter/s)": 0.955325 }, { "epoch": 0.44589546178085304, "grad_norm": 0.3913688659667969, "learning_rate": 6.293360173180655e-06, "loss": 0.022080160677433014, "memory(GiB)": 21.48, "step": 13726, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.44592794724360846, "grad_norm": 0.3894845247268677, "learning_rate": 6.292841294673128e-06, "loss": 0.03188334405422211, "memory(GiB)": 21.48, "step": 13727, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.44596043270636393, "grad_norm": 0.41309115290641785, "learning_rate": 6.292322401244492e-06, "loss": 0.018501274287700653, "memory(GiB)": 21.48, "step": 13728, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.955369 }, { "epoch": 0.44599291816911935, "grad_norm": 0.3971541225910187, "learning_rate": 6.291803492900734e-06, "loss": 0.025616487488150597, "memory(GiB)": 21.48, "step": 13729, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955383 }, { "epoch": 0.44602540363187476, "grad_norm": 0.33289971947669983, "learning_rate": 6.291284569647846e-06, "loss": 0.02317425049841404, "memory(GiB)": 21.48, "step": 13730, "token_acc": 1.0, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.4460578890946302, "grad_norm": 0.7959278225898743, "learning_rate": 6.2907656314918165e-06, "loss": 0.02730267494916916, "memory(GiB)": 21.48, "step": 13731, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.4460903745573856, "grad_norm": 0.480508953332901, "learning_rate": 6.290246678438631e-06, "loss": 0.024471566081047058, "memory(GiB)": 21.48, "step": 13732, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.446122860020141, "grad_norm": 0.32497915625572205, "learning_rate": 6.289727710494284e-06, "loss": 0.020441323518753052, "memory(GiB)": 21.48, "step": 13733, "token_acc": 1.0, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.4461553454828964, "grad_norm": 0.3499165177345276, "learning_rate": 6.289208727664761e-06, "loss": 0.018080849200487137, "memory(GiB)": 21.48, "step": 13734, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.44618783094565184, "grad_norm": 0.38604602217674255, "learning_rate": 6.288689729956054e-06, "loss": 0.0211244635283947, "memory(GiB)": 21.48, "step": 13735, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.44622031640840726, "grad_norm": 0.38839471340179443, "learning_rate": 6.288170717374151e-06, "loss": 0.028840964660048485, "memory(GiB)": 21.48, "step": 13736, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955475 }, { "epoch": 0.4462528018711627, "grad_norm": 0.6421195268630981, "learning_rate": 6.287651689925044e-06, "loss": 0.03489479422569275, "memory(GiB)": 21.48, "step": 13737, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955485 }, { "epoch": 0.4462852873339181, "grad_norm": 0.32786059379577637, "learning_rate": 6.287132647614723e-06, "loss": 0.016560085117816925, "memory(GiB)": 21.48, "step": 13738, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.4463177727966735, "grad_norm": 0.5396456122398376, "learning_rate": 6.286613590449178e-06, "loss": 0.03316202387213707, "memory(GiB)": 21.48, "step": 13739, "token_acc": 0.9723502304147466, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.4463502582594289, "grad_norm": 0.33720073103904724, "learning_rate": 6.2860945184344005e-06, "loss": 0.016916979104280472, "memory(GiB)": 21.48, "step": 13740, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.44638274372218434, "grad_norm": 0.416873961687088, "learning_rate": 6.285575431576379e-06, "loss": 0.027744639664888382, "memory(GiB)": 21.48, "step": 13741, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.44641522918493975, "grad_norm": 0.2777414321899414, "learning_rate": 6.285056329881107e-06, "loss": 0.019153419882059097, "memory(GiB)": 21.48, "step": 13742, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.44644771464769517, "grad_norm": 0.42505478858947754, "learning_rate": 6.284537213354574e-06, "loss": 0.024578627198934555, "memory(GiB)": 21.48, "step": 13743, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.4464802001104506, "grad_norm": 0.5112025141716003, "learning_rate": 6.284018082002772e-06, "loss": 0.023023489862680435, "memory(GiB)": 21.48, "step": 13744, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.446512685573206, "grad_norm": 0.4838724136352539, "learning_rate": 6.2834989358316925e-06, "loss": 0.020477894693613052, "memory(GiB)": 21.48, "step": 13745, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.4465451710359614, "grad_norm": 0.9562405943870544, "learning_rate": 6.282979774847326e-06, "loss": 0.027408840134739876, "memory(GiB)": 21.48, "step": 13746, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.44657765649871684, "grad_norm": 0.2880081534385681, "learning_rate": 6.2824605990556665e-06, "loss": 0.02234736829996109, "memory(GiB)": 21.48, "step": 13747, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955583 }, { "epoch": 0.44661014196147225, "grad_norm": 0.40599432587623596, "learning_rate": 6.281941408462704e-06, "loss": 0.02326333150267601, "memory(GiB)": 21.48, "step": 13748, "token_acc": 1.0, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.44664262742422767, "grad_norm": 0.3303179442882538, "learning_rate": 6.281422203074432e-06, "loss": 0.02021525800228119, "memory(GiB)": 21.48, "step": 13749, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955602 }, { "epoch": 0.4466751128869831, "grad_norm": 0.5552971959114075, "learning_rate": 6.280902982896841e-06, "loss": 0.03435855358839035, "memory(GiB)": 21.48, "step": 13750, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.955612 }, { "epoch": 0.4467075983497385, "grad_norm": 0.46293312311172485, "learning_rate": 6.280383747935926e-06, "loss": 0.024659357964992523, "memory(GiB)": 21.48, "step": 13751, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.4467400838124939, "grad_norm": 0.5388293266296387, "learning_rate": 6.2798644981976776e-06, "loss": 0.03012252226471901, "memory(GiB)": 21.48, "step": 13752, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.44677256927524933, "grad_norm": 0.3238990604877472, "learning_rate": 6.279345233688089e-06, "loss": 0.020088322460651398, "memory(GiB)": 21.48, "step": 13753, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.44680505473800475, "grad_norm": 0.38942569494247437, "learning_rate": 6.2788259544131546e-06, "loss": 0.026390664279460907, "memory(GiB)": 21.48, "step": 13754, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.44683754020076016, "grad_norm": 0.31178712844848633, "learning_rate": 6.278306660378866e-06, "loss": 0.021488212049007416, "memory(GiB)": 21.48, "step": 13755, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955663 }, { "epoch": 0.4468700256635156, "grad_norm": 0.549382209777832, "learning_rate": 6.277787351591218e-06, "loss": 0.029563022777438164, "memory(GiB)": 21.48, "step": 13756, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.446902511126271, "grad_norm": 0.42814135551452637, "learning_rate": 6.277268028056203e-06, "loss": 0.024101432412862778, "memory(GiB)": 21.48, "step": 13757, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.4469349965890264, "grad_norm": 3.338172435760498, "learning_rate": 6.276748689779814e-06, "loss": 0.03128962218761444, "memory(GiB)": 21.48, "step": 13758, "token_acc": 1.0, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.44696748205178183, "grad_norm": 0.3431583046913147, "learning_rate": 6.276229336768047e-06, "loss": 0.020042629912495613, "memory(GiB)": 21.48, "step": 13759, "token_acc": 1.0, "train_speed(iter/s)": 0.955717 }, { "epoch": 0.44699996751453724, "grad_norm": 0.3691064417362213, "learning_rate": 6.275709969026893e-06, "loss": 0.024209866300225258, "memory(GiB)": 21.48, "step": 13760, "token_acc": 1.0, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.44703245297729266, "grad_norm": 0.2583961486816406, "learning_rate": 6.2751905865623494e-06, "loss": 0.018845055252313614, "memory(GiB)": 21.48, "step": 13761, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.4470649384400481, "grad_norm": 0.4417187571525574, "learning_rate": 6.274671189380409e-06, "loss": 0.024317730218172073, "memory(GiB)": 21.48, "step": 13762, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.4470974239028035, "grad_norm": 0.4413679242134094, "learning_rate": 6.274151777487067e-06, "loss": 0.024563049897551537, "memory(GiB)": 21.48, "step": 13763, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955777 }, { "epoch": 0.4471299093655589, "grad_norm": 0.3616630434989929, "learning_rate": 6.273632350888316e-06, "loss": 0.020074155181646347, "memory(GiB)": 21.48, "step": 13764, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955792 }, { "epoch": 0.4471623948283143, "grad_norm": 0.42252087593078613, "learning_rate": 6.2731129095901535e-06, "loss": 0.02475106529891491, "memory(GiB)": 21.48, "step": 13765, "token_acc": 1.0, "train_speed(iter/s)": 0.955806 }, { "epoch": 0.44719488029106974, "grad_norm": 0.46974900364875793, "learning_rate": 6.272593453598573e-06, "loss": 0.022478196769952774, "memory(GiB)": 21.48, "step": 13766, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.44722736575382516, "grad_norm": 0.3824991285800934, "learning_rate": 6.272073982919571e-06, "loss": 0.025114579126238823, "memory(GiB)": 21.48, "step": 13767, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955836 }, { "epoch": 0.4472598512165806, "grad_norm": 0.6301902532577515, "learning_rate": 6.2715544975591415e-06, "loss": 0.02074631303548813, "memory(GiB)": 21.48, "step": 13768, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.447292336679336, "grad_norm": 0.2974455952644348, "learning_rate": 6.2710349975232796e-06, "loss": 0.01663241535425186, "memory(GiB)": 21.48, "step": 13769, "token_acc": 1.0, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.4473248221420914, "grad_norm": 0.37137851119041443, "learning_rate": 6.270515482817985e-06, "loss": 0.026028577238321304, "memory(GiB)": 21.48, "step": 13770, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.955881 }, { "epoch": 0.4473573076048468, "grad_norm": 0.41257497668266296, "learning_rate": 6.269995953449249e-06, "loss": 0.02614378184080124, "memory(GiB)": 21.48, "step": 13771, "token_acc": 1.0, "train_speed(iter/s)": 0.955896 }, { "epoch": 0.44738979306760224, "grad_norm": 0.42599374055862427, "learning_rate": 6.269476409423069e-06, "loss": 0.020487841218709946, "memory(GiB)": 21.48, "step": 13772, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.44742227853035765, "grad_norm": 0.3429594337940216, "learning_rate": 6.268956850745443e-06, "loss": 0.02056577056646347, "memory(GiB)": 21.48, "step": 13773, "token_acc": 1.0, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.44745476399311307, "grad_norm": 1.3283363580703735, "learning_rate": 6.268437277422363e-06, "loss": 0.02023611217737198, "memory(GiB)": 21.48, "step": 13774, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.4474872494558685, "grad_norm": 0.3270328938961029, "learning_rate": 6.267917689459833e-06, "loss": 0.0185529962182045, "memory(GiB)": 21.48, "step": 13775, "token_acc": 1.0, "train_speed(iter/s)": 0.955956 }, { "epoch": 0.4475197349186239, "grad_norm": 0.5089519023895264, "learning_rate": 6.267398086863841e-06, "loss": 0.03077176958322525, "memory(GiB)": 21.48, "step": 13776, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.4475522203813793, "grad_norm": 0.4080715477466583, "learning_rate": 6.266878469640391e-06, "loss": 0.022559698671102524, "memory(GiB)": 21.48, "step": 13777, "token_acc": 0.9828571428571429, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.44758470584413473, "grad_norm": 0.42871564626693726, "learning_rate": 6.266358837795475e-06, "loss": 0.02424684166908264, "memory(GiB)": 21.48, "step": 13778, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.955998 }, { "epoch": 0.44761719130689015, "grad_norm": 0.4703102111816406, "learning_rate": 6.2658391913350945e-06, "loss": 0.026955656707286835, "memory(GiB)": 21.48, "step": 13779, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.44764967676964557, "grad_norm": 0.3251309096813202, "learning_rate": 6.265319530265244e-06, "loss": 0.01866195909678936, "memory(GiB)": 21.48, "step": 13780, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.447682162232401, "grad_norm": 0.3908456861972809, "learning_rate": 6.264799854591922e-06, "loss": 0.026908326894044876, "memory(GiB)": 21.48, "step": 13781, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.4477146476951564, "grad_norm": 0.4171726107597351, "learning_rate": 6.264280164321126e-06, "loss": 0.025995712727308273, "memory(GiB)": 21.48, "step": 13782, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956058 }, { "epoch": 0.4477471331579118, "grad_norm": 0.49500417709350586, "learning_rate": 6.263760459458853e-06, "loss": 0.028628218919038773, "memory(GiB)": 21.48, "step": 13783, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.44777961862066723, "grad_norm": 0.2980254888534546, "learning_rate": 6.263240740011105e-06, "loss": 0.01923009566962719, "memory(GiB)": 21.48, "step": 13784, "token_acc": 1.0, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.44781210408342265, "grad_norm": 0.7544729709625244, "learning_rate": 6.262721005983875e-06, "loss": 0.027582546696066856, "memory(GiB)": 21.48, "step": 13785, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956101 }, { "epoch": 0.44784458954617806, "grad_norm": 0.3773353397846222, "learning_rate": 6.262201257383166e-06, "loss": 0.024664729833602905, "memory(GiB)": 21.48, "step": 13786, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.4478770750089335, "grad_norm": 0.3456937074661255, "learning_rate": 6.261681494214974e-06, "loss": 0.024961933493614197, "memory(GiB)": 21.48, "step": 13787, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.4479095604716889, "grad_norm": 0.4043998122215271, "learning_rate": 6.2611617164852964e-06, "loss": 0.01656157523393631, "memory(GiB)": 21.48, "step": 13788, "token_acc": 1.0, "train_speed(iter/s)": 0.956145 }, { "epoch": 0.4479420459344443, "grad_norm": 0.39675483107566833, "learning_rate": 6.260641924200137e-06, "loss": 0.022036589682102203, "memory(GiB)": 21.48, "step": 13789, "token_acc": 1.0, "train_speed(iter/s)": 0.956159 }, { "epoch": 0.4479745313971997, "grad_norm": 0.35609719157218933, "learning_rate": 6.260122117365489e-06, "loss": 0.02717779576778412, "memory(GiB)": 21.48, "step": 13790, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956174 }, { "epoch": 0.44800701685995514, "grad_norm": 0.24924223124980927, "learning_rate": 6.259602295987357e-06, "loss": 0.01903923600912094, "memory(GiB)": 21.48, "step": 13791, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.956188 }, { "epoch": 0.4480395023227106, "grad_norm": 0.4620293378829956, "learning_rate": 6.259082460071736e-06, "loss": 0.04035171866416931, "memory(GiB)": 21.48, "step": 13792, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.44807198778546603, "grad_norm": 0.3943978548049927, "learning_rate": 6.258562609624629e-06, "loss": 0.020029712468385696, "memory(GiB)": 21.48, "step": 13793, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.44810447324822145, "grad_norm": 0.4738720953464508, "learning_rate": 6.258042744652033e-06, "loss": 0.01898983307182789, "memory(GiB)": 21.48, "step": 13794, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.44813695871097686, "grad_norm": 0.47424787282943726, "learning_rate": 6.25752286515995e-06, "loss": 0.019279176369309425, "memory(GiB)": 21.48, "step": 13795, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.4481694441737323, "grad_norm": 0.37163877487182617, "learning_rate": 6.25700297115438e-06, "loss": 0.027650468051433563, "memory(GiB)": 21.48, "step": 13796, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.4482019296364877, "grad_norm": 0.31317201256752014, "learning_rate": 6.256483062641322e-06, "loss": 0.025092698633670807, "memory(GiB)": 21.48, "step": 13797, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.95627 }, { "epoch": 0.4482344150992431, "grad_norm": 0.30351096391677856, "learning_rate": 6.2559631396267766e-06, "loss": 0.018748758360743523, "memory(GiB)": 21.48, "step": 13798, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.4482669005619985, "grad_norm": 0.29578179121017456, "learning_rate": 6.255443202116744e-06, "loss": 0.02612873911857605, "memory(GiB)": 21.48, "step": 13799, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956292 }, { "epoch": 0.44829938602475394, "grad_norm": 0.429901123046875, "learning_rate": 6.254923250117229e-06, "loss": 0.023739252239465714, "memory(GiB)": 21.48, "step": 13800, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.44833187148750936, "grad_norm": 0.36623549461364746, "learning_rate": 6.254403283634227e-06, "loss": 0.023060046136379242, "memory(GiB)": 21.48, "step": 13801, "token_acc": 1.0, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.4483643569502648, "grad_norm": 0.5212721824645996, "learning_rate": 6.253883302673742e-06, "loss": 0.027714746072888374, "memory(GiB)": 21.48, "step": 13802, "token_acc": 1.0, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.4483968424130202, "grad_norm": 0.36340388655662537, "learning_rate": 6.2533633072417755e-06, "loss": 0.020703986287117004, "memory(GiB)": 21.48, "step": 13803, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95633 }, { "epoch": 0.4484293278757756, "grad_norm": 0.3917731046676636, "learning_rate": 6.252843297344328e-06, "loss": 0.02032635547220707, "memory(GiB)": 21.48, "step": 13804, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.448461813338531, "grad_norm": 0.3547374904155731, "learning_rate": 6.252323272987401e-06, "loss": 0.022412944585084915, "memory(GiB)": 21.48, "step": 13805, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.44849429880128644, "grad_norm": 0.3222697377204895, "learning_rate": 6.2518032341769965e-06, "loss": 0.01855909265577793, "memory(GiB)": 21.48, "step": 13806, "token_acc": 1.0, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.44852678426404186, "grad_norm": 0.3214000165462494, "learning_rate": 6.2512831809191165e-06, "loss": 0.021453259512782097, "memory(GiB)": 21.48, "step": 13807, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.44855926972679727, "grad_norm": 0.4643152952194214, "learning_rate": 6.2507631132197626e-06, "loss": 0.023938115686178207, "memory(GiB)": 21.48, "step": 13808, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.4485917551895527, "grad_norm": 0.3302275836467743, "learning_rate": 6.2502430310849394e-06, "loss": 0.021833600476384163, "memory(GiB)": 21.48, "step": 13809, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.4486242406523081, "grad_norm": 0.3190026879310608, "learning_rate": 6.249722934520646e-06, "loss": 0.022669371217489243, "memory(GiB)": 21.48, "step": 13810, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.4486567261150635, "grad_norm": 0.33708295226097107, "learning_rate": 6.249202823532886e-06, "loss": 0.030149824917316437, "memory(GiB)": 21.48, "step": 13811, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.44868921157781894, "grad_norm": 0.3028393089771271, "learning_rate": 6.248682698127665e-06, "loss": 0.01761622540652752, "memory(GiB)": 21.48, "step": 13812, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.44872169704057435, "grad_norm": 0.6578344106674194, "learning_rate": 6.248162558310982e-06, "loss": 0.019581541419029236, "memory(GiB)": 21.48, "step": 13813, "token_acc": 1.0, "train_speed(iter/s)": 0.956427 }, { "epoch": 0.44875418250332977, "grad_norm": 0.38431257009506226, "learning_rate": 6.24764240408884e-06, "loss": 0.02671233005821705, "memory(GiB)": 21.48, "step": 13814, "token_acc": 0.9684684684684685, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.4487866679660852, "grad_norm": 0.3231048285961151, "learning_rate": 6.247122235467246e-06, "loss": 0.016603752970695496, "memory(GiB)": 21.48, "step": 13815, "token_acc": 1.0, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.4488191534288406, "grad_norm": 0.3405001759529114, "learning_rate": 6.246602052452203e-06, "loss": 0.017835982143878937, "memory(GiB)": 21.48, "step": 13816, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.448851638891596, "grad_norm": 0.6830373406410217, "learning_rate": 6.2460818550497125e-06, "loss": 0.028270617127418518, "memory(GiB)": 21.48, "step": 13817, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.44888412435435143, "grad_norm": 0.3627762198448181, "learning_rate": 6.245561643265778e-06, "loss": 0.02019769325852394, "memory(GiB)": 21.48, "step": 13818, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.44891660981710685, "grad_norm": 0.46890413761138916, "learning_rate": 6.245041417106404e-06, "loss": 0.02346859872341156, "memory(GiB)": 21.48, "step": 13819, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.44894909527986226, "grad_norm": 0.4753659963607788, "learning_rate": 6.244521176577595e-06, "loss": 0.026417246088385582, "memory(GiB)": 21.48, "step": 13820, "token_acc": 0.9823529411764705, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.4489815807426177, "grad_norm": 0.2965187430381775, "learning_rate": 6.244000921685356e-06, "loss": 0.018155153840780258, "memory(GiB)": 21.48, "step": 13821, "token_acc": 1.0, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.4490140662053731, "grad_norm": 0.4655214250087738, "learning_rate": 6.243480652435689e-06, "loss": 0.019824815914034843, "memory(GiB)": 21.48, "step": 13822, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.4490465516681285, "grad_norm": 0.5540842413902283, "learning_rate": 6.2429603688346015e-06, "loss": 0.017641741782426834, "memory(GiB)": 21.48, "step": 13823, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956539 }, { "epoch": 0.44907903713088393, "grad_norm": 0.5935860276222229, "learning_rate": 6.2424400708880976e-06, "loss": 0.025695033371448517, "memory(GiB)": 21.48, "step": 13824, "token_acc": 1.0, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.44911152259363935, "grad_norm": 0.3526626527309418, "learning_rate": 6.2419197586021805e-06, "loss": 0.019621988758444786, "memory(GiB)": 21.48, "step": 13825, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.44914400805639476, "grad_norm": 0.4888840615749359, "learning_rate": 6.2413994319828575e-06, "loss": 0.02674628049135208, "memory(GiB)": 21.48, "step": 13826, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.4491764935191502, "grad_norm": 0.35748860239982605, "learning_rate": 6.2408790910361305e-06, "loss": 0.022283706814050674, "memory(GiB)": 21.48, "step": 13827, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.4492089789819056, "grad_norm": 0.3826289474964142, "learning_rate": 6.240358735768009e-06, "loss": 0.02424212172627449, "memory(GiB)": 21.48, "step": 13828, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.449241464444661, "grad_norm": 0.4892798662185669, "learning_rate": 6.239838366184495e-06, "loss": 0.025283046066761017, "memory(GiB)": 21.48, "step": 13829, "token_acc": 0.985, "train_speed(iter/s)": 0.956626 }, { "epoch": 0.4492739499074164, "grad_norm": 0.21569843590259552, "learning_rate": 6.239317982291599e-06, "loss": 0.012353518977761269, "memory(GiB)": 21.48, "step": 13830, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.95664 }, { "epoch": 0.44930643537017184, "grad_norm": 0.39530104398727417, "learning_rate": 6.238797584095322e-06, "loss": 0.02334333211183548, "memory(GiB)": 21.48, "step": 13831, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.44933892083292726, "grad_norm": 0.3002670109272003, "learning_rate": 6.2382771716016724e-06, "loss": 0.013812527991831303, "memory(GiB)": 21.48, "step": 13832, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.4493714062956827, "grad_norm": 0.5705774426460266, "learning_rate": 6.237756744816656e-06, "loss": 0.030136549845337868, "memory(GiB)": 21.48, "step": 13833, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.4494038917584381, "grad_norm": 0.44588619470596313, "learning_rate": 6.2372363037462795e-06, "loss": 0.022795798256993294, "memory(GiB)": 21.48, "step": 13834, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956697 }, { "epoch": 0.4494363772211935, "grad_norm": 0.2578323781490326, "learning_rate": 6.236715848396549e-06, "loss": 0.017654582858085632, "memory(GiB)": 21.48, "step": 13835, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956712 }, { "epoch": 0.4494688626839489, "grad_norm": 0.446778804063797, "learning_rate": 6.236195378773471e-06, "loss": 0.02927207387983799, "memory(GiB)": 21.48, "step": 13836, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.44950134814670434, "grad_norm": 0.42111241817474365, "learning_rate": 6.235674894883055e-06, "loss": 0.025264445692300797, "memory(GiB)": 21.48, "step": 13837, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.44953383360945975, "grad_norm": 0.5864883661270142, "learning_rate": 6.235154396731303e-06, "loss": 0.025552693754434586, "memory(GiB)": 21.48, "step": 13838, "token_acc": 0.9674418604651163, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.44956631907221517, "grad_norm": 0.3442606031894684, "learning_rate": 6.234633884324228e-06, "loss": 0.01682269014418125, "memory(GiB)": 21.48, "step": 13839, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.4495988045349706, "grad_norm": 0.586961567401886, "learning_rate": 6.234113357667835e-06, "loss": 0.028321970254182816, "memory(GiB)": 21.48, "step": 13840, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956786 }, { "epoch": 0.449631289997726, "grad_norm": 0.8257972598075867, "learning_rate": 6.233592816768128e-06, "loss": 0.0293632373213768, "memory(GiB)": 21.48, "step": 13841, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.9568 }, { "epoch": 0.4496637754604814, "grad_norm": 0.40571829676628113, "learning_rate": 6.2330722616311214e-06, "loss": 0.01841726526618004, "memory(GiB)": 21.48, "step": 13842, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.44969626092323683, "grad_norm": 0.3236003518104553, "learning_rate": 6.232551692262817e-06, "loss": 0.020055264234542847, "memory(GiB)": 21.48, "step": 13843, "token_acc": 1.0, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.44972874638599225, "grad_norm": 0.3380473852157593, "learning_rate": 6.232031108669228e-06, "loss": 0.02314670756459236, "memory(GiB)": 21.48, "step": 13844, "token_acc": 1.0, "train_speed(iter/s)": 0.956844 }, { "epoch": 0.44976123184874767, "grad_norm": 0.335043340921402, "learning_rate": 6.231510510856358e-06, "loss": 0.020014971494674683, "memory(GiB)": 21.48, "step": 13845, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.4497937173115031, "grad_norm": 0.5332570672035217, "learning_rate": 6.230989898830217e-06, "loss": 0.038655657321214676, "memory(GiB)": 21.48, "step": 13846, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.4498262027742585, "grad_norm": 0.6295015215873718, "learning_rate": 6.230469272596817e-06, "loss": 0.03089103102684021, "memory(GiB)": 21.48, "step": 13847, "token_acc": 1.0, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.4498586882370139, "grad_norm": 0.3783120810985565, "learning_rate": 6.2299486321621616e-06, "loss": 0.03394439071416855, "memory(GiB)": 21.48, "step": 13848, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.44989117369976933, "grad_norm": 0.5759564638137817, "learning_rate": 6.229427977532264e-06, "loss": 0.026746878400444984, "memory(GiB)": 21.48, "step": 13849, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.44992365916252475, "grad_norm": 0.3400798738002777, "learning_rate": 6.228907308713129e-06, "loss": 0.02214870974421501, "memory(GiB)": 21.48, "step": 13850, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.44995614462528016, "grad_norm": 0.26553288102149963, "learning_rate": 6.2283866257107695e-06, "loss": 0.016340937465429306, "memory(GiB)": 21.48, "step": 13851, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.4499886300880356, "grad_norm": 0.3547411262989044, "learning_rate": 6.227865928531192e-06, "loss": 0.017821092158555984, "memory(GiB)": 21.48, "step": 13852, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.450021115550791, "grad_norm": 0.34914568066596985, "learning_rate": 6.227345217180409e-06, "loss": 0.016877764835953712, "memory(GiB)": 21.48, "step": 13853, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956973 }, { "epoch": 0.4500536010135464, "grad_norm": 0.2996273636817932, "learning_rate": 6.226824491664428e-06, "loss": 0.017440272495150566, "memory(GiB)": 21.48, "step": 13854, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.4500860864763018, "grad_norm": 0.3537217378616333, "learning_rate": 6.226303751989259e-06, "loss": 0.02435498870909214, "memory(GiB)": 21.48, "step": 13855, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.4501185719390573, "grad_norm": 0.3063126802444458, "learning_rate": 6.225782998160913e-06, "loss": 0.01900681108236313, "memory(GiB)": 21.48, "step": 13856, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.4501510574018127, "grad_norm": 0.3804517984390259, "learning_rate": 6.225262230185399e-06, "loss": 0.022359222173690796, "memory(GiB)": 21.48, "step": 13857, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.45018354286456813, "grad_norm": 16.23484992980957, "learning_rate": 6.224741448068729e-06, "loss": 0.03202028200030327, "memory(GiB)": 21.48, "step": 13858, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.45021602832732355, "grad_norm": 0.47325393557548523, "learning_rate": 6.2242206518169126e-06, "loss": 0.03497835993766785, "memory(GiB)": 21.48, "step": 13859, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957046 }, { "epoch": 0.45024851379007896, "grad_norm": 0.3622826933860779, "learning_rate": 6.2236998414359585e-06, "loss": 0.019237354397773743, "memory(GiB)": 21.48, "step": 13860, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.4502809992528344, "grad_norm": 0.4766923785209656, "learning_rate": 6.223179016931882e-06, "loss": 0.024147193878889084, "memory(GiB)": 21.48, "step": 13861, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.4503134847155898, "grad_norm": 0.3242838978767395, "learning_rate": 6.22265817831069e-06, "loss": 0.02813042886555195, "memory(GiB)": 21.48, "step": 13862, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.4503459701783452, "grad_norm": 0.3053562343120575, "learning_rate": 6.2221373255783955e-06, "loss": 0.020741339772939682, "memory(GiB)": 21.48, "step": 13863, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.45037845564110063, "grad_norm": 0.5536080598831177, "learning_rate": 6.22161645874101e-06, "loss": 0.03872379660606384, "memory(GiB)": 21.48, "step": 13864, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.9571 }, { "epoch": 0.45041094110385604, "grad_norm": 0.4959840476512909, "learning_rate": 6.221095577804545e-06, "loss": 0.024496309459209442, "memory(GiB)": 21.48, "step": 13865, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.45044342656661146, "grad_norm": 0.56966632604599, "learning_rate": 6.22057468277501e-06, "loss": 0.03165898472070694, "memory(GiB)": 21.48, "step": 13866, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957121 }, { "epoch": 0.4504759120293669, "grad_norm": 0.2687459886074066, "learning_rate": 6.220053773658421e-06, "loss": 0.016596080735325813, "memory(GiB)": 21.48, "step": 13867, "token_acc": 0.984375, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.4505083974921223, "grad_norm": 0.31963685154914856, "learning_rate": 6.219532850460787e-06, "loss": 0.02800469845533371, "memory(GiB)": 21.48, "step": 13868, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.4505408829548777, "grad_norm": 0.5370992422103882, "learning_rate": 6.219011913188119e-06, "loss": 0.028340622782707214, "memory(GiB)": 21.48, "step": 13869, "token_acc": 0.9734513274336283, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.4505733684176331, "grad_norm": 0.3078690469264984, "learning_rate": 6.218490961846433e-06, "loss": 0.026143096387386322, "memory(GiB)": 21.48, "step": 13870, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.45060585388038854, "grad_norm": 0.4199919104576111, "learning_rate": 6.2179699964417374e-06, "loss": 0.02248588390648365, "memory(GiB)": 21.48, "step": 13871, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957169 }, { "epoch": 0.45063833934314396, "grad_norm": 0.3783092796802521, "learning_rate": 6.217449016980048e-06, "loss": 0.019186686724424362, "memory(GiB)": 21.48, "step": 13872, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.4506708248058994, "grad_norm": 0.38015809655189514, "learning_rate": 6.216928023467377e-06, "loss": 0.023062527179718018, "memory(GiB)": 21.48, "step": 13873, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.4507033102686548, "grad_norm": 0.3454990088939667, "learning_rate": 6.216407015909737e-06, "loss": 0.022079087793827057, "memory(GiB)": 21.48, "step": 13874, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.957198 }, { "epoch": 0.4507357957314102, "grad_norm": 0.26075053215026855, "learning_rate": 6.21588599431314e-06, "loss": 0.019858721643686295, "memory(GiB)": 21.48, "step": 13875, "token_acc": 0.992, "train_speed(iter/s)": 0.957208 }, { "epoch": 0.4507682811941656, "grad_norm": 0.36981093883514404, "learning_rate": 6.2153649586836e-06, "loss": 0.02348918840289116, "memory(GiB)": 21.48, "step": 13876, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.45080076665692104, "grad_norm": 0.40946879982948303, "learning_rate": 6.2148439090271314e-06, "loss": 0.024357695132493973, "memory(GiB)": 21.48, "step": 13877, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957231 }, { "epoch": 0.45083325211967645, "grad_norm": 0.35985323786735535, "learning_rate": 6.214322845349747e-06, "loss": 0.02071327343583107, "memory(GiB)": 21.48, "step": 13878, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.45086573758243187, "grad_norm": 0.369698166847229, "learning_rate": 6.213801767657462e-06, "loss": 0.022772006690502167, "memory(GiB)": 21.48, "step": 13879, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957256 }, { "epoch": 0.4508982230451873, "grad_norm": 0.3118188977241516, "learning_rate": 6.213280675956287e-06, "loss": 0.01912248507142067, "memory(GiB)": 21.48, "step": 13880, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.957267 }, { "epoch": 0.4509307085079427, "grad_norm": 0.44455647468566895, "learning_rate": 6.21275957025224e-06, "loss": 0.026532525196671486, "memory(GiB)": 21.48, "step": 13881, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957277 }, { "epoch": 0.4509631939706981, "grad_norm": 0.3839067220687866, "learning_rate": 6.212238450551334e-06, "loss": 0.0169049184769392, "memory(GiB)": 21.48, "step": 13882, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.45099567943345353, "grad_norm": 0.3389317989349365, "learning_rate": 6.21171731685958e-06, "loss": 0.027249593287706375, "memory(GiB)": 21.48, "step": 13883, "token_acc": 0.984, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.45102816489620895, "grad_norm": 0.4089883863925934, "learning_rate": 6.211196169182997e-06, "loss": 0.033076025545597076, "memory(GiB)": 21.48, "step": 13884, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.45106065035896437, "grad_norm": 0.8313717246055603, "learning_rate": 6.210675007527597e-06, "loss": 0.04175553470849991, "memory(GiB)": 21.48, "step": 13885, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.957321 }, { "epoch": 0.4510931358217198, "grad_norm": 0.38765180110931396, "learning_rate": 6.210153831899397e-06, "loss": 0.02129058912396431, "memory(GiB)": 21.48, "step": 13886, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.957332 }, { "epoch": 0.4511256212844752, "grad_norm": 0.31929993629455566, "learning_rate": 6.20963264230441e-06, "loss": 0.018330641090869904, "memory(GiB)": 21.48, "step": 13887, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.4511581067472306, "grad_norm": 0.6181867122650146, "learning_rate": 6.209111438748654e-06, "loss": 0.020625073462724686, "memory(GiB)": 21.48, "step": 13888, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.45119059220998603, "grad_norm": 0.32113778591156006, "learning_rate": 6.208590221238142e-06, "loss": 0.024163823574781418, "memory(GiB)": 21.48, "step": 13889, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.45122307767274145, "grad_norm": 0.3705878257751465, "learning_rate": 6.208068989778891e-06, "loss": 0.022888321429491043, "memory(GiB)": 21.48, "step": 13890, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.45125556313549686, "grad_norm": 1.0049656629562378, "learning_rate": 6.207547744376914e-06, "loss": 0.017398186028003693, "memory(GiB)": 21.48, "step": 13891, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.957403 }, { "epoch": 0.4512880485982523, "grad_norm": 0.3587663173675537, "learning_rate": 6.207026485038229e-06, "loss": 0.025250963866710663, "memory(GiB)": 21.48, "step": 13892, "token_acc": 1.0, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.4513205340610077, "grad_norm": 0.40760353207588196, "learning_rate": 6.206505211768853e-06, "loss": 0.026606714352965355, "memory(GiB)": 21.48, "step": 13893, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.4513530195237631, "grad_norm": 0.3778724670410156, "learning_rate": 6.2059839245748e-06, "loss": 0.029521998018026352, "memory(GiB)": 21.48, "step": 13894, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.4513855049865185, "grad_norm": 0.6552366018295288, "learning_rate": 6.205462623462088e-06, "loss": 0.024568770080804825, "memory(GiB)": 21.48, "step": 13895, "token_acc": 1.0, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.45141799044927394, "grad_norm": 0.47854387760162354, "learning_rate": 6.204941308436734e-06, "loss": 0.028776099905371666, "memory(GiB)": 21.48, "step": 13896, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.45145047591202936, "grad_norm": 0.34212619066238403, "learning_rate": 6.204419979504752e-06, "loss": 0.022421304136514664, "memory(GiB)": 21.48, "step": 13897, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957486 }, { "epoch": 0.4514829613747848, "grad_norm": 0.34751611948013306, "learning_rate": 6.203898636672161e-06, "loss": 0.023009896278381348, "memory(GiB)": 21.48, "step": 13898, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.4515154468375402, "grad_norm": 0.37227416038513184, "learning_rate": 6.203377279944976e-06, "loss": 0.022443391382694244, "memory(GiB)": 21.48, "step": 13899, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957514 }, { "epoch": 0.4515479323002956, "grad_norm": 0.4316680431365967, "learning_rate": 6.2028559093292175e-06, "loss": 0.023560652509331703, "memory(GiB)": 21.48, "step": 13900, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.451580417763051, "grad_norm": 0.3790815770626068, "learning_rate": 6.202334524830899e-06, "loss": 0.024505047127604485, "memory(GiB)": 21.48, "step": 13901, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957543 }, { "epoch": 0.45161290322580644, "grad_norm": 0.35847246646881104, "learning_rate": 6.201813126456041e-06, "loss": 0.02309216372668743, "memory(GiB)": 21.48, "step": 13902, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.45164538868856186, "grad_norm": 0.2898443043231964, "learning_rate": 6.201291714210658e-06, "loss": 0.016416732221841812, "memory(GiB)": 21.48, "step": 13903, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.45167787415131727, "grad_norm": 0.28426623344421387, "learning_rate": 6.200770288100772e-06, "loss": 0.02056971564888954, "memory(GiB)": 21.48, "step": 13904, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.957586 }, { "epoch": 0.4517103596140727, "grad_norm": 0.32226788997650146, "learning_rate": 6.200248848132399e-06, "loss": 0.020448386669158936, "memory(GiB)": 21.48, "step": 13905, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957599 }, { "epoch": 0.4517428450768281, "grad_norm": 0.36390143632888794, "learning_rate": 6.199727394311556e-06, "loss": 0.030219081789255142, "memory(GiB)": 21.48, "step": 13906, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957614 }, { "epoch": 0.4517753305395835, "grad_norm": 0.4820915162563324, "learning_rate": 6.1992059266442604e-06, "loss": 0.02175440639257431, "memory(GiB)": 21.48, "step": 13907, "token_acc": 0.996, "train_speed(iter/s)": 0.957627 }, { "epoch": 0.45180781600233894, "grad_norm": 0.3742983937263489, "learning_rate": 6.198684445136534e-06, "loss": 0.023406166583299637, "memory(GiB)": 21.48, "step": 13908, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957642 }, { "epoch": 0.45184030146509435, "grad_norm": 0.4568297863006592, "learning_rate": 6.198162949794393e-06, "loss": 0.02431230992078781, "memory(GiB)": 21.48, "step": 13909, "token_acc": 0.996, "train_speed(iter/s)": 0.957655 }, { "epoch": 0.45187278692784977, "grad_norm": 0.9551137089729309, "learning_rate": 6.197641440623857e-06, "loss": 0.030378421768546104, "memory(GiB)": 21.48, "step": 13910, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.4519052723906052, "grad_norm": 0.3218713104724884, "learning_rate": 6.197119917630945e-06, "loss": 0.022742945700883865, "memory(GiB)": 21.48, "step": 13911, "token_acc": 0.995, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.4519377578533606, "grad_norm": 0.3998182713985443, "learning_rate": 6.196598380821676e-06, "loss": 0.025310836732387543, "memory(GiB)": 21.48, "step": 13912, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.451970243316116, "grad_norm": 0.3311546742916107, "learning_rate": 6.196076830202067e-06, "loss": 0.03193499892950058, "memory(GiB)": 21.48, "step": 13913, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.957713 }, { "epoch": 0.45200272877887143, "grad_norm": 0.3825303912162781, "learning_rate": 6.1955552657781415e-06, "loss": 0.01288275420665741, "memory(GiB)": 21.48, "step": 13914, "token_acc": 1.0, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.45203521424162685, "grad_norm": 0.7248145341873169, "learning_rate": 6.195033687555916e-06, "loss": 0.024309799075126648, "memory(GiB)": 21.48, "step": 13915, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.45206769970438226, "grad_norm": 0.45193684101104736, "learning_rate": 6.194512095541412e-06, "loss": 0.02397097647190094, "memory(GiB)": 21.48, "step": 13916, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.4521001851671377, "grad_norm": 0.40182745456695557, "learning_rate": 6.193990489740648e-06, "loss": 0.029168453067541122, "memory(GiB)": 21.48, "step": 13917, "token_acc": 0.9875, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.4521326706298931, "grad_norm": 0.4096534252166748, "learning_rate": 6.1934688701596446e-06, "loss": 0.023209132254123688, "memory(GiB)": 21.48, "step": 13918, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.4521651560926485, "grad_norm": 0.26029306650161743, "learning_rate": 6.192947236804423e-06, "loss": 0.02021721564233303, "memory(GiB)": 21.48, "step": 13919, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.452197641555404, "grad_norm": 0.28618714213371277, "learning_rate": 6.192425589681001e-06, "loss": 0.02002764865756035, "memory(GiB)": 21.48, "step": 13920, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.4522301270181594, "grad_norm": 0.46057674288749695, "learning_rate": 6.191903928795401e-06, "loss": 0.025612467899918556, "memory(GiB)": 21.48, "step": 13921, "token_acc": 0.984375, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.4522626124809148, "grad_norm": 0.29284751415252686, "learning_rate": 6.1913822541536415e-06, "loss": 0.01769503578543663, "memory(GiB)": 21.48, "step": 13922, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.45229509794367023, "grad_norm": 0.2890430986881256, "learning_rate": 6.190860565761747e-06, "loss": 0.019233163446187973, "memory(GiB)": 21.48, "step": 13923, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.45232758340642565, "grad_norm": 0.4285416603088379, "learning_rate": 6.190338863625738e-06, "loss": 0.022073745727539062, "memory(GiB)": 21.48, "step": 13924, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957841 }, { "epoch": 0.45236006886918106, "grad_norm": 0.37391141057014465, "learning_rate": 6.189817147751631e-06, "loss": 0.03019952028989792, "memory(GiB)": 21.48, "step": 13925, "token_acc": 1.0, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.4523925543319365, "grad_norm": 0.28725555539131165, "learning_rate": 6.189295418145454e-06, "loss": 0.017161712050437927, "memory(GiB)": 21.48, "step": 13926, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.4524250397946919, "grad_norm": 0.37376868724823, "learning_rate": 6.188773674813221e-06, "loss": 0.019096530973911285, "memory(GiB)": 21.48, "step": 13927, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957873 }, { "epoch": 0.4524575252574473, "grad_norm": 3.2180335521698, "learning_rate": 6.1882519177609605e-06, "loss": 0.02841934561729431, "memory(GiB)": 21.48, "step": 13928, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.45249001072020273, "grad_norm": 0.3799017071723938, "learning_rate": 6.187730146994689e-06, "loss": 0.02336583286523819, "memory(GiB)": 21.48, "step": 13929, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.45252249618295814, "grad_norm": 0.38068026304244995, "learning_rate": 6.187208362520433e-06, "loss": 0.024588260799646378, "memory(GiB)": 21.48, "step": 13930, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.45255498164571356, "grad_norm": 0.2637135982513428, "learning_rate": 6.186686564344211e-06, "loss": 0.021106889471411705, "memory(GiB)": 21.48, "step": 13931, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957912 }, { "epoch": 0.452587467108469, "grad_norm": 0.5177953839302063, "learning_rate": 6.186164752472047e-06, "loss": 0.02232513763010502, "memory(GiB)": 21.48, "step": 13932, "token_acc": 0.9732142857142857, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.4526199525712244, "grad_norm": 0.2923681437969208, "learning_rate": 6.185642926909964e-06, "loss": 0.023298010230064392, "memory(GiB)": 21.48, "step": 13933, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957932 }, { "epoch": 0.4526524380339798, "grad_norm": 0.34508636593818665, "learning_rate": 6.185121087663981e-06, "loss": 0.017003456130623817, "memory(GiB)": 21.48, "step": 13934, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957943 }, { "epoch": 0.4526849234967352, "grad_norm": 0.5651402473449707, "learning_rate": 6.184599234740125e-06, "loss": 0.03198738396167755, "memory(GiB)": 21.48, "step": 13935, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.45271740895949064, "grad_norm": 0.47236210107803345, "learning_rate": 6.1840773681444164e-06, "loss": 0.02269156277179718, "memory(GiB)": 21.48, "step": 13936, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957965 }, { "epoch": 0.45274989442224606, "grad_norm": 2.403099536895752, "learning_rate": 6.18355548788288e-06, "loss": 0.0388849675655365, "memory(GiB)": 21.48, "step": 13937, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.4527823798850015, "grad_norm": 0.34689992666244507, "learning_rate": 6.1830335939615375e-06, "loss": 0.019977763295173645, "memory(GiB)": 21.48, "step": 13938, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.4528148653477569, "grad_norm": 0.31451183557510376, "learning_rate": 6.182511686386412e-06, "loss": 0.026828911155462265, "memory(GiB)": 21.48, "step": 13939, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.4528473508105123, "grad_norm": 0.32958337664604187, "learning_rate": 6.1819897651635285e-06, "loss": 0.024055499583482742, "memory(GiB)": 21.48, "step": 13940, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.4528798362732677, "grad_norm": 0.3575854003429413, "learning_rate": 6.18146783029891e-06, "loss": 0.030175330117344856, "memory(GiB)": 21.48, "step": 13941, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.958018 }, { "epoch": 0.45291232173602314, "grad_norm": 0.38616064190864563, "learning_rate": 6.180945881798579e-06, "loss": 0.03133780136704445, "memory(GiB)": 21.48, "step": 13942, "token_acc": 0.988, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.45294480719877855, "grad_norm": 0.4410213530063629, "learning_rate": 6.180423919668561e-06, "loss": 0.02769426256418228, "memory(GiB)": 21.48, "step": 13943, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.45297729266153397, "grad_norm": 0.27973610162734985, "learning_rate": 6.179901943914881e-06, "loss": 0.026852034032344818, "memory(GiB)": 21.48, "step": 13944, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.4530097781242894, "grad_norm": 0.2941772937774658, "learning_rate": 6.179379954543561e-06, "loss": 0.016887977719306946, "memory(GiB)": 21.48, "step": 13945, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958065 }, { "epoch": 0.4530422635870448, "grad_norm": 0.44603681564331055, "learning_rate": 6.178857951560628e-06, "loss": 0.03266746550798416, "memory(GiB)": 21.48, "step": 13946, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.4530747490498002, "grad_norm": 0.28851059079170227, "learning_rate": 6.178335934972105e-06, "loss": 0.02189118042588234, "memory(GiB)": 21.48, "step": 13947, "token_acc": 1.0, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.45310723451255563, "grad_norm": 0.42186063528060913, "learning_rate": 6.177813904784015e-06, "loss": 0.025883987545967102, "memory(GiB)": 21.48, "step": 13948, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.45313971997531105, "grad_norm": 0.3759237229824066, "learning_rate": 6.177291861002387e-06, "loss": 0.027831505984067917, "memory(GiB)": 21.48, "step": 13949, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958108 }, { "epoch": 0.45317220543806647, "grad_norm": 0.35460585355758667, "learning_rate": 6.176769803633242e-06, "loss": 0.01603797823190689, "memory(GiB)": 21.48, "step": 13950, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.4532046909008219, "grad_norm": 0.4620637595653534, "learning_rate": 6.176247732682609e-06, "loss": 0.031114326789975166, "memory(GiB)": 21.48, "step": 13951, "token_acc": 0.984375, "train_speed(iter/s)": 0.958132 }, { "epoch": 0.4532371763635773, "grad_norm": 0.29545357823371887, "learning_rate": 6.1757256481565095e-06, "loss": 0.017371509224176407, "memory(GiB)": 21.48, "step": 13952, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958147 }, { "epoch": 0.4532696618263327, "grad_norm": 0.369601309299469, "learning_rate": 6.175203550060973e-06, "loss": 0.022481117397546768, "memory(GiB)": 21.48, "step": 13953, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.45330214728908813, "grad_norm": 0.4503362774848938, "learning_rate": 6.174681438402022e-06, "loss": 0.023507846519351006, "memory(GiB)": 21.48, "step": 13954, "token_acc": 0.992, "train_speed(iter/s)": 0.958176 }, { "epoch": 0.45333463275184355, "grad_norm": 0.3905690312385559, "learning_rate": 6.174159313185685e-06, "loss": 0.026647008955478668, "memory(GiB)": 21.48, "step": 13955, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.45336711821459896, "grad_norm": 0.31126320362091064, "learning_rate": 6.173637174417987e-06, "loss": 0.02492179349064827, "memory(GiB)": 21.48, "step": 13956, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958205 }, { "epoch": 0.4533996036773544, "grad_norm": 0.48798200488090515, "learning_rate": 6.1731150221049515e-06, "loss": 0.016762632876634598, "memory(GiB)": 21.48, "step": 13957, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958219 }, { "epoch": 0.4534320891401098, "grad_norm": 0.2829146385192871, "learning_rate": 6.17259285625261e-06, "loss": 0.02362741157412529, "memory(GiB)": 21.48, "step": 13958, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.4534645746028652, "grad_norm": 0.2797078490257263, "learning_rate": 6.1720706768669845e-06, "loss": 0.017906537279486656, "memory(GiB)": 21.48, "step": 13959, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958248 }, { "epoch": 0.4534970600656206, "grad_norm": 0.42153748869895935, "learning_rate": 6.171548483954104e-06, "loss": 0.029990291222929955, "memory(GiB)": 21.48, "step": 13960, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958262 }, { "epoch": 0.45352954552837604, "grad_norm": 0.3456249535083771, "learning_rate": 6.1710262775199954e-06, "loss": 0.014080915600061417, "memory(GiB)": 21.48, "step": 13961, "token_acc": 1.0, "train_speed(iter/s)": 0.958277 }, { "epoch": 0.45356203099113146, "grad_norm": 0.2988099753856659, "learning_rate": 6.170504057570683e-06, "loss": 0.023932918906211853, "memory(GiB)": 21.48, "step": 13962, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958292 }, { "epoch": 0.4535945164538869, "grad_norm": 0.2927950322628021, "learning_rate": 6.1699818241121975e-06, "loss": 0.019899316132068634, "memory(GiB)": 21.48, "step": 13963, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958307 }, { "epoch": 0.4536270019166423, "grad_norm": 0.42885875701904297, "learning_rate": 6.169459577150564e-06, "loss": 0.020465828478336334, "memory(GiB)": 21.48, "step": 13964, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958321 }, { "epoch": 0.4536594873793977, "grad_norm": 0.41645026206970215, "learning_rate": 6.1689373166918105e-06, "loss": 0.018985453993082047, "memory(GiB)": 21.48, "step": 13965, "token_acc": 1.0, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.4536919728421531, "grad_norm": 0.23136284947395325, "learning_rate": 6.168415042741964e-06, "loss": 0.018779011443257332, "memory(GiB)": 21.48, "step": 13966, "token_acc": 1.0, "train_speed(iter/s)": 0.95835 }, { "epoch": 0.45372445830490854, "grad_norm": 0.3495463728904724, "learning_rate": 6.1678927553070545e-06, "loss": 0.02087465673685074, "memory(GiB)": 21.48, "step": 13967, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.958365 }, { "epoch": 0.45375694376766396, "grad_norm": 0.4006040096282959, "learning_rate": 6.167370454393106e-06, "loss": 0.020693548023700714, "memory(GiB)": 21.48, "step": 13968, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95838 }, { "epoch": 0.45378942923041937, "grad_norm": 0.3159036636352539, "learning_rate": 6.166848140006148e-06, "loss": 0.016361616551876068, "memory(GiB)": 21.48, "step": 13969, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.958394 }, { "epoch": 0.4538219146931748, "grad_norm": 0.4437863528728485, "learning_rate": 6.166325812152212e-06, "loss": 0.022066418081521988, "memory(GiB)": 21.48, "step": 13970, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.958408 }, { "epoch": 0.4538544001559302, "grad_norm": 0.344791442155838, "learning_rate": 6.165803470837321e-06, "loss": 0.020645201206207275, "memory(GiB)": 21.48, "step": 13971, "token_acc": 1.0, "train_speed(iter/s)": 0.958423 }, { "epoch": 0.4538868856186856, "grad_norm": 0.5458125472068787, "learning_rate": 6.16528111606751e-06, "loss": 0.034376248717308044, "memory(GiB)": 21.48, "step": 13972, "token_acc": 0.988, "train_speed(iter/s)": 0.958437 }, { "epoch": 0.45391937108144104, "grad_norm": 0.3181172311306, "learning_rate": 6.1647587478488e-06, "loss": 0.02158442512154579, "memory(GiB)": 21.48, "step": 13973, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.45395185654419645, "grad_norm": 0.376783549785614, "learning_rate": 6.1642363661872275e-06, "loss": 0.0254768505692482, "memory(GiB)": 21.48, "step": 13974, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.958466 }, { "epoch": 0.45398434200695187, "grad_norm": 0.4118039309978485, "learning_rate": 6.163713971088816e-06, "loss": 0.02271907962858677, "memory(GiB)": 21.48, "step": 13975, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95848 }, { "epoch": 0.4540168274697073, "grad_norm": 0.29784905910491943, "learning_rate": 6.163191562559595e-06, "loss": 0.01618814282119274, "memory(GiB)": 21.48, "step": 13976, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.95849 }, { "epoch": 0.4540493129324627, "grad_norm": 0.3873404562473297, "learning_rate": 6.162669140605597e-06, "loss": 0.026408705860376358, "memory(GiB)": 21.48, "step": 13977, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958502 }, { "epoch": 0.4540817983952181, "grad_norm": 0.4497644305229187, "learning_rate": 6.162146705232849e-06, "loss": 0.026976585388183594, "memory(GiB)": 21.48, "step": 13978, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958514 }, { "epoch": 0.45411428385797353, "grad_norm": 0.42921215295791626, "learning_rate": 6.161624256447382e-06, "loss": 0.027451204136013985, "memory(GiB)": 21.48, "step": 13979, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958525 }, { "epoch": 0.45414676932072895, "grad_norm": 0.3724652826786041, "learning_rate": 6.1611017942552234e-06, "loss": 0.022419709712266922, "memory(GiB)": 21.48, "step": 13980, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958537 }, { "epoch": 0.45417925478348437, "grad_norm": 0.4027289152145386, "learning_rate": 6.160579318662405e-06, "loss": 0.023577630519866943, "memory(GiB)": 21.48, "step": 13981, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.958548 }, { "epoch": 0.4542117402462398, "grad_norm": 0.45265185832977295, "learning_rate": 6.160056829674956e-06, "loss": 0.01906592771410942, "memory(GiB)": 21.48, "step": 13982, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.958559 }, { "epoch": 0.4542442257089952, "grad_norm": 0.3480205237865448, "learning_rate": 6.159534327298909e-06, "loss": 0.02156238444149494, "memory(GiB)": 21.48, "step": 13983, "token_acc": 1.0, "train_speed(iter/s)": 0.95857 }, { "epoch": 0.45427671117175067, "grad_norm": 0.3486780524253845, "learning_rate": 6.159011811540292e-06, "loss": 0.02445870451629162, "memory(GiB)": 21.48, "step": 13984, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95858 }, { "epoch": 0.4543091966345061, "grad_norm": 0.5322192907333374, "learning_rate": 6.1584892824051345e-06, "loss": 0.0237107090651989, "memory(GiB)": 21.48, "step": 13985, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.958592 }, { "epoch": 0.4543416820972615, "grad_norm": 0.8288241028785706, "learning_rate": 6.15796673989947e-06, "loss": 0.0320100411772728, "memory(GiB)": 21.48, "step": 13986, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958602 }, { "epoch": 0.4543741675600169, "grad_norm": 0.3719649016857147, "learning_rate": 6.157444184029328e-06, "loss": 0.02586677297949791, "memory(GiB)": 21.48, "step": 13987, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.958614 }, { "epoch": 0.45440665302277233, "grad_norm": 0.39810264110565186, "learning_rate": 6.15692161480074e-06, "loss": 0.0345890186727047, "memory(GiB)": 21.48, "step": 13988, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958623 }, { "epoch": 0.45443913848552775, "grad_norm": 0.4861201047897339, "learning_rate": 6.156399032219736e-06, "loss": 0.028136003762483597, "memory(GiB)": 21.48, "step": 13989, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958634 }, { "epoch": 0.45447162394828317, "grad_norm": 0.4451114535331726, "learning_rate": 6.155876436292349e-06, "loss": 0.029871227219700813, "memory(GiB)": 21.48, "step": 13990, "token_acc": 0.9883040935672515, "train_speed(iter/s)": 0.958646 }, { "epoch": 0.4545041094110386, "grad_norm": 0.5395084023475647, "learning_rate": 6.155353827024608e-06, "loss": 0.02870858460664749, "memory(GiB)": 21.48, "step": 13991, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958657 }, { "epoch": 0.454536594873794, "grad_norm": 0.1743762344121933, "learning_rate": 6.154831204422547e-06, "loss": 0.014153065159916878, "memory(GiB)": 21.48, "step": 13992, "token_acc": 1.0, "train_speed(iter/s)": 0.958665 }, { "epoch": 0.4545690803365494, "grad_norm": 0.3327457308769226, "learning_rate": 6.1543085684921976e-06, "loss": 0.017573323100805283, "memory(GiB)": 21.48, "step": 13993, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958675 }, { "epoch": 0.45460156579930483, "grad_norm": 0.5748869180679321, "learning_rate": 6.153785919239589e-06, "loss": 0.022653728723526, "memory(GiB)": 21.48, "step": 13994, "token_acc": 0.9788732394366197, "train_speed(iter/s)": 0.958685 }, { "epoch": 0.45463405126206025, "grad_norm": 0.3715418875217438, "learning_rate": 6.153263256670756e-06, "loss": 0.029174013063311577, "memory(GiB)": 21.48, "step": 13995, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958695 }, { "epoch": 0.45466653672481566, "grad_norm": 0.3741742968559265, "learning_rate": 6.152740580791731e-06, "loss": 0.02442912757396698, "memory(GiB)": 21.48, "step": 13996, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958707 }, { "epoch": 0.4546990221875711, "grad_norm": 0.32346490025520325, "learning_rate": 6.152217891608544e-06, "loss": 0.020952556282281876, "memory(GiB)": 21.48, "step": 13997, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958718 }, { "epoch": 0.4547315076503265, "grad_norm": 0.35870638489723206, "learning_rate": 6.15169518912723e-06, "loss": 0.028070608153939247, "memory(GiB)": 21.48, "step": 13998, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95873 }, { "epoch": 0.4547639931130819, "grad_norm": 0.3962956368923187, "learning_rate": 6.151172473353819e-06, "loss": 0.023777619004249573, "memory(GiB)": 21.48, "step": 13999, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958741 }, { "epoch": 0.4547964785758373, "grad_norm": 0.31826597452163696, "learning_rate": 6.1506497442943465e-06, "loss": 0.016381429508328438, "memory(GiB)": 21.48, "step": 14000, "token_acc": 0.992, "train_speed(iter/s)": 0.958752 }, { "epoch": 0.4547964785758373, "eval_loss": 0.023086078464984894, "eval_runtime": 81.6745, "eval_samples_per_second": 121.825, "eval_steps_per_second": 3.808, "eval_token_acc": 0.9908617733822764, "step": 14000 }, { "epoch": 0.45482896403859274, "grad_norm": 0.35306841135025024, "learning_rate": 6.150127001954844e-06, "loss": 0.016405552625656128, "memory(GiB)": 21.48, "step": 14001, "token_acc": 0.9903692033700938, "train_speed(iter/s)": 0.95269 }, { "epoch": 0.45486144950134816, "grad_norm": 0.7037625908851624, "learning_rate": 6.1496042463413466e-06, "loss": 0.022738587111234665, "memory(GiB)": 21.48, "step": 14002, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.952701 }, { "epoch": 0.4548939349641036, "grad_norm": 0.39196544885635376, "learning_rate": 6.149081477459884e-06, "loss": 0.023443248122930527, "memory(GiB)": 21.48, "step": 14003, "token_acc": 0.9875, "train_speed(iter/s)": 0.952712 }, { "epoch": 0.454926420426859, "grad_norm": 0.3575163781642914, "learning_rate": 6.1485586953164935e-06, "loss": 0.023856136947870255, "memory(GiB)": 21.48, "step": 14004, "token_acc": 0.99609375, "train_speed(iter/s)": 0.952725 }, { "epoch": 0.4549589058896144, "grad_norm": 0.5634580850601196, "learning_rate": 6.148035899917207e-06, "loss": 0.026576772332191467, "memory(GiB)": 21.48, "step": 14005, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.952737 }, { "epoch": 0.4549913913523698, "grad_norm": 0.3653404712677002, "learning_rate": 6.147513091268057e-06, "loss": 0.031302738934755325, "memory(GiB)": 21.48, "step": 14006, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.952748 }, { "epoch": 0.45502387681512524, "grad_norm": 0.4472545385360718, "learning_rate": 6.14699026937508e-06, "loss": 0.01989501714706421, "memory(GiB)": 21.48, "step": 14007, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.95276 }, { "epoch": 0.45505636227788065, "grad_norm": 0.40094953775405884, "learning_rate": 6.146467434244306e-06, "loss": 0.021306056529283524, "memory(GiB)": 21.48, "step": 14008, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.952773 }, { "epoch": 0.45508884774063607, "grad_norm": 0.35660624504089355, "learning_rate": 6.145944585881774e-06, "loss": 0.02262435108423233, "memory(GiB)": 21.48, "step": 14009, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.952784 }, { "epoch": 0.4551213332033915, "grad_norm": 0.3000072240829468, "learning_rate": 6.145421724293516e-06, "loss": 0.018051641061902046, "memory(GiB)": 21.48, "step": 14010, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.952796 }, { "epoch": 0.4551538186661469, "grad_norm": 0.679831326007843, "learning_rate": 6.1448988494855665e-06, "loss": 0.02478002943098545, "memory(GiB)": 21.48, "step": 14011, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.952808 }, { "epoch": 0.4551863041289023, "grad_norm": 0.31357648968696594, "learning_rate": 6.14437596146396e-06, "loss": 0.02035338431596756, "memory(GiB)": 21.48, "step": 14012, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95282 }, { "epoch": 0.45521878959165774, "grad_norm": 0.2772251069545746, "learning_rate": 6.1438530602347325e-06, "loss": 0.019809510558843613, "memory(GiB)": 21.48, "step": 14013, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.952832 }, { "epoch": 0.45525127505441315, "grad_norm": 0.5621891021728516, "learning_rate": 6.143330145803919e-06, "loss": 0.032457925379276276, "memory(GiB)": 21.48, "step": 14014, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.952848 }, { "epoch": 0.45528376051716857, "grad_norm": 0.4112733006477356, "learning_rate": 6.142807218177551e-06, "loss": 0.027957765385508537, "memory(GiB)": 21.48, "step": 14015, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.952863 }, { "epoch": 0.455316245979924, "grad_norm": 0.4022168219089508, "learning_rate": 6.142284277361669e-06, "loss": 0.028226936236023903, "memory(GiB)": 21.48, "step": 14016, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.952878 }, { "epoch": 0.4553487314426794, "grad_norm": 0.2705889344215393, "learning_rate": 6.141761323362304e-06, "loss": 0.021267348900437355, "memory(GiB)": 21.48, "step": 14017, "token_acc": 1.0, "train_speed(iter/s)": 0.952893 }, { "epoch": 0.4553812169054348, "grad_norm": 0.30286648869514465, "learning_rate": 6.141238356185496e-06, "loss": 0.01782228983938694, "memory(GiB)": 21.48, "step": 14018, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.952908 }, { "epoch": 0.45541370236819023, "grad_norm": 0.4849124252796173, "learning_rate": 6.140715375837278e-06, "loss": 0.026719912886619568, "memory(GiB)": 21.48, "step": 14019, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.952923 }, { "epoch": 0.45544618783094565, "grad_norm": 0.26359042525291443, "learning_rate": 6.140192382323685e-06, "loss": 0.01875792071223259, "memory(GiB)": 21.48, "step": 14020, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.952938 }, { "epoch": 0.45547867329370106, "grad_norm": 0.5888102054595947, "learning_rate": 6.139669375650756e-06, "loss": 0.026195527985692024, "memory(GiB)": 21.48, "step": 14021, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.952952 }, { "epoch": 0.4555111587564565, "grad_norm": 0.4790857434272766, "learning_rate": 6.139146355824524e-06, "loss": 0.033790960907936096, "memory(GiB)": 21.48, "step": 14022, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.952966 }, { "epoch": 0.4555436442192119, "grad_norm": 0.4246425926685333, "learning_rate": 6.138623322851027e-06, "loss": 0.024628007784485817, "memory(GiB)": 21.48, "step": 14023, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.952981 }, { "epoch": 0.4555761296819673, "grad_norm": 0.3331936001777649, "learning_rate": 6.1381002767363015e-06, "loss": 0.016067026183009148, "memory(GiB)": 21.48, "step": 14024, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.952996 }, { "epoch": 0.45560861514472273, "grad_norm": 0.29514530301094055, "learning_rate": 6.137577217486386e-06, "loss": 0.02246219664812088, "memory(GiB)": 21.48, "step": 14025, "token_acc": 1.0, "train_speed(iter/s)": 0.95301 }, { "epoch": 0.45564110060747814, "grad_norm": 0.33188796043395996, "learning_rate": 6.1370541451073115e-06, "loss": 0.020773358643054962, "memory(GiB)": 21.48, "step": 14026, "token_acc": 1.0, "train_speed(iter/s)": 0.953024 }, { "epoch": 0.45567358607023356, "grad_norm": 0.4480857253074646, "learning_rate": 6.136531059605122e-06, "loss": 0.022758930921554565, "memory(GiB)": 21.48, "step": 14027, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953038 }, { "epoch": 0.455706071532989, "grad_norm": 0.5721725821495056, "learning_rate": 6.13600796098585e-06, "loss": 0.02984357438981533, "memory(GiB)": 21.48, "step": 14028, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.953053 }, { "epoch": 0.4557385569957444, "grad_norm": 0.6916571259498596, "learning_rate": 6.135484849255534e-06, "loss": 0.022643737494945526, "memory(GiB)": 21.48, "step": 14029, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.953068 }, { "epoch": 0.4557710424584998, "grad_norm": 0.35271739959716797, "learning_rate": 6.134961724420212e-06, "loss": 0.019272975623607635, "memory(GiB)": 21.48, "step": 14030, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.953083 }, { "epoch": 0.4558035279212552, "grad_norm": 0.32123732566833496, "learning_rate": 6.13443858648592e-06, "loss": 0.02571171708405018, "memory(GiB)": 21.48, "step": 14031, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.953098 }, { "epoch": 0.45583601338401064, "grad_norm": 0.3842400014400482, "learning_rate": 6.133915435458698e-06, "loss": 0.020262839272618294, "memory(GiB)": 21.48, "step": 14032, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.953112 }, { "epoch": 0.45586849884676606, "grad_norm": 0.3425173759460449, "learning_rate": 6.133392271344582e-06, "loss": 0.02429128997027874, "memory(GiB)": 21.48, "step": 14033, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.953127 }, { "epoch": 0.4559009843095215, "grad_norm": 0.2242904007434845, "learning_rate": 6.1328690941496115e-06, "loss": 0.018171563744544983, "memory(GiB)": 21.48, "step": 14034, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953125 }, { "epoch": 0.4559334697722769, "grad_norm": 0.28734228014945984, "learning_rate": 6.132345903879824e-06, "loss": 0.021744897589087486, "memory(GiB)": 21.48, "step": 14035, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.95314 }, { "epoch": 0.4559659552350323, "grad_norm": 0.3354763984680176, "learning_rate": 6.131822700541256e-06, "loss": 0.02151075378060341, "memory(GiB)": 21.48, "step": 14036, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.953152 }, { "epoch": 0.4559984406977877, "grad_norm": 0.2767033576965332, "learning_rate": 6.131299484139949e-06, "loss": 0.018500639125704765, "memory(GiB)": 21.48, "step": 14037, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.953164 }, { "epoch": 0.45603092616054314, "grad_norm": 0.2929747402667999, "learning_rate": 6.130776254681938e-06, "loss": 0.021089866757392883, "memory(GiB)": 21.48, "step": 14038, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.953176 }, { "epoch": 0.45606341162329855, "grad_norm": 0.5839053988456726, "learning_rate": 6.130253012173267e-06, "loss": 0.021641310304403305, "memory(GiB)": 21.48, "step": 14039, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953188 }, { "epoch": 0.45609589708605397, "grad_norm": 0.3670404851436615, "learning_rate": 6.1297297566199696e-06, "loss": 0.02145015075802803, "memory(GiB)": 21.48, "step": 14040, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953199 }, { "epoch": 0.4561283825488094, "grad_norm": 1.1243468523025513, "learning_rate": 6.129206488028089e-06, "loss": 0.037643034011125565, "memory(GiB)": 21.48, "step": 14041, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.953211 }, { "epoch": 0.4561608680115648, "grad_norm": 0.32432398200035095, "learning_rate": 6.128683206403662e-06, "loss": 0.01686502806842327, "memory(GiB)": 21.48, "step": 14042, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953221 }, { "epoch": 0.4561933534743202, "grad_norm": 0.39763733744621277, "learning_rate": 6.128159911752727e-06, "loss": 0.013998501002788544, "memory(GiB)": 21.48, "step": 14043, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953232 }, { "epoch": 0.45622583893707563, "grad_norm": 0.4168246388435364, "learning_rate": 6.127636604081328e-06, "loss": 0.0209436547011137, "memory(GiB)": 21.48, "step": 14044, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.953242 }, { "epoch": 0.45625832439983105, "grad_norm": 0.47895756363868713, "learning_rate": 6.1271132833954985e-06, "loss": 0.02258184179663658, "memory(GiB)": 21.48, "step": 14045, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953254 }, { "epoch": 0.45629080986258647, "grad_norm": 0.384385347366333, "learning_rate": 6.126589949701281e-06, "loss": 0.03014727309346199, "memory(GiB)": 21.48, "step": 14046, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953265 }, { "epoch": 0.4563232953253419, "grad_norm": 0.6847755908966064, "learning_rate": 6.126066603004719e-06, "loss": 0.02744951657950878, "memory(GiB)": 21.48, "step": 14047, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.953276 }, { "epoch": 0.45635578078809735, "grad_norm": 0.35281577706336975, "learning_rate": 6.125543243311848e-06, "loss": 0.022029424086213112, "memory(GiB)": 21.48, "step": 14048, "token_acc": 0.9897435897435898, "train_speed(iter/s)": 0.953287 }, { "epoch": 0.45638826625085277, "grad_norm": 0.38209962844848633, "learning_rate": 6.125019870628711e-06, "loss": 0.025792758911848068, "memory(GiB)": 21.48, "step": 14049, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.953299 }, { "epoch": 0.4564207517136082, "grad_norm": 0.6019861698150635, "learning_rate": 6.124496484961347e-06, "loss": 0.024257078766822815, "memory(GiB)": 21.48, "step": 14050, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.95331 }, { "epoch": 0.4564532371763636, "grad_norm": 0.37597745656967163, "learning_rate": 6.123973086315796e-06, "loss": 0.025064438581466675, "memory(GiB)": 21.48, "step": 14051, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.953322 }, { "epoch": 0.456485722639119, "grad_norm": 0.4912942945957184, "learning_rate": 6.1234496746981e-06, "loss": 0.029115866869688034, "memory(GiB)": 21.48, "step": 14052, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953335 }, { "epoch": 0.45651820810187443, "grad_norm": 0.4748189151287079, "learning_rate": 6.122926250114299e-06, "loss": 0.03138171136379242, "memory(GiB)": 21.48, "step": 14053, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.953345 }, { "epoch": 0.45655069356462985, "grad_norm": 0.4307825267314911, "learning_rate": 6.122402812570435e-06, "loss": 0.030777040868997574, "memory(GiB)": 21.48, "step": 14054, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.953354 }, { "epoch": 0.45658317902738527, "grad_norm": 0.3581077456474304, "learning_rate": 6.121879362072549e-06, "loss": 0.02046121470630169, "memory(GiB)": 21.48, "step": 14055, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953362 }, { "epoch": 0.4566156644901407, "grad_norm": 0.575346827507019, "learning_rate": 6.121355898626682e-06, "loss": 0.029413582757115364, "memory(GiB)": 21.48, "step": 14056, "token_acc": 0.9875, "train_speed(iter/s)": 0.953371 }, { "epoch": 0.4566481499528961, "grad_norm": 0.325273334980011, "learning_rate": 6.120832422238875e-06, "loss": 0.025760944932699203, "memory(GiB)": 21.48, "step": 14057, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.95338 }, { "epoch": 0.4566806354156515, "grad_norm": 0.44669419527053833, "learning_rate": 6.12030893291517e-06, "loss": 0.02592659741640091, "memory(GiB)": 21.48, "step": 14058, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.953389 }, { "epoch": 0.45671312087840693, "grad_norm": 0.36132481694221497, "learning_rate": 6.119785430661609e-06, "loss": 0.02575966715812683, "memory(GiB)": 21.48, "step": 14059, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.953399 }, { "epoch": 0.45674560634116235, "grad_norm": 0.3217758536338806, "learning_rate": 6.1192619154842335e-06, "loss": 0.02203647792339325, "memory(GiB)": 21.48, "step": 14060, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.953409 }, { "epoch": 0.45677809180391776, "grad_norm": 0.40743228793144226, "learning_rate": 6.1187383873890846e-06, "loss": 0.03200829401612282, "memory(GiB)": 21.48, "step": 14061, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.953418 }, { "epoch": 0.4568105772666732, "grad_norm": 0.30132076144218445, "learning_rate": 6.118214846382207e-06, "loss": 0.01482333429157734, "memory(GiB)": 21.48, "step": 14062, "token_acc": 1.0, "train_speed(iter/s)": 0.953427 }, { "epoch": 0.4568430627294286, "grad_norm": 0.25688886642456055, "learning_rate": 6.117691292469642e-06, "loss": 0.015780359506607056, "memory(GiB)": 21.48, "step": 14063, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953439 }, { "epoch": 0.456875548192184, "grad_norm": 0.23681224882602692, "learning_rate": 6.11716772565743e-06, "loss": 0.014539050869643688, "memory(GiB)": 21.48, "step": 14064, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953451 }, { "epoch": 0.4569080336549394, "grad_norm": 0.2507950961589813, "learning_rate": 6.116644145951618e-06, "loss": 0.01708848774433136, "memory(GiB)": 21.48, "step": 14065, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953463 }, { "epoch": 0.45694051911769484, "grad_norm": 0.3819302022457123, "learning_rate": 6.116120553358244e-06, "loss": 0.02275541052222252, "memory(GiB)": 21.48, "step": 14066, "token_acc": 1.0, "train_speed(iter/s)": 0.953474 }, { "epoch": 0.45697300458045026, "grad_norm": 0.3602221608161926, "learning_rate": 6.115596947883356e-06, "loss": 0.024641284719109535, "memory(GiB)": 21.48, "step": 14067, "token_acc": 1.0, "train_speed(iter/s)": 0.953485 }, { "epoch": 0.4570054900432057, "grad_norm": 0.4645487666130066, "learning_rate": 6.115073329532992e-06, "loss": 0.026893382892012596, "memory(GiB)": 21.48, "step": 14068, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.953497 }, { "epoch": 0.4570379755059611, "grad_norm": 0.4175227880477905, "learning_rate": 6.114549698313199e-06, "loss": 0.022221282124519348, "memory(GiB)": 21.48, "step": 14069, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.953509 }, { "epoch": 0.4570704609687165, "grad_norm": 0.5465309023857117, "learning_rate": 6.114026054230019e-06, "loss": 0.03551153838634491, "memory(GiB)": 21.48, "step": 14070, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953522 }, { "epoch": 0.4571029464314719, "grad_norm": 0.4978586733341217, "learning_rate": 6.113502397289494e-06, "loss": 0.02405383065342903, "memory(GiB)": 21.48, "step": 14071, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.953534 }, { "epoch": 0.45713543189422734, "grad_norm": 0.3390236794948578, "learning_rate": 6.11297872749767e-06, "loss": 0.027855098247528076, "memory(GiB)": 21.48, "step": 14072, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953544 }, { "epoch": 0.45716791735698276, "grad_norm": 0.2824857532978058, "learning_rate": 6.11245504486059e-06, "loss": 0.018417805433273315, "memory(GiB)": 21.48, "step": 14073, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.953559 }, { "epoch": 0.45720040281973817, "grad_norm": 0.2576603889465332, "learning_rate": 6.111931349384299e-06, "loss": 0.016487371176481247, "memory(GiB)": 21.48, "step": 14074, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.953574 }, { "epoch": 0.4572328882824936, "grad_norm": 0.2981092631816864, "learning_rate": 6.11140764107484e-06, "loss": 0.019384410232305527, "memory(GiB)": 21.48, "step": 14075, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953589 }, { "epoch": 0.457265373745249, "grad_norm": 0.30138930678367615, "learning_rate": 6.1108839199382576e-06, "loss": 0.01756996661424637, "memory(GiB)": 21.48, "step": 14076, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953604 }, { "epoch": 0.4572978592080044, "grad_norm": 0.357726514339447, "learning_rate": 6.110360185980595e-06, "loss": 0.017710065469145775, "memory(GiB)": 21.48, "step": 14077, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.953618 }, { "epoch": 0.45733034467075984, "grad_norm": 0.40189245343208313, "learning_rate": 6.109836439207899e-06, "loss": 0.02481040731072426, "memory(GiB)": 21.48, "step": 14078, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953633 }, { "epoch": 0.45736283013351525, "grad_norm": 0.28520825505256653, "learning_rate": 6.109312679626214e-06, "loss": 0.02231370098888874, "memory(GiB)": 21.48, "step": 14079, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.953647 }, { "epoch": 0.45739531559627067, "grad_norm": 0.42131051421165466, "learning_rate": 6.108788907241582e-06, "loss": 0.022089995443820953, "memory(GiB)": 21.48, "step": 14080, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.953662 }, { "epoch": 0.4574278010590261, "grad_norm": 0.2260274887084961, "learning_rate": 6.108265122060052e-06, "loss": 0.016536004841327667, "memory(GiB)": 21.48, "step": 14081, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953676 }, { "epoch": 0.4574602865217815, "grad_norm": 0.24116720259189606, "learning_rate": 6.107741324087666e-06, "loss": 0.01681479625403881, "memory(GiB)": 21.48, "step": 14082, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.953691 }, { "epoch": 0.4574927719845369, "grad_norm": 0.36797186732292175, "learning_rate": 6.1072175133304725e-06, "loss": 0.0140597615391016, "memory(GiB)": 21.48, "step": 14083, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953705 }, { "epoch": 0.45752525744729233, "grad_norm": 0.44042032957077026, "learning_rate": 6.106693689794515e-06, "loss": 0.027801621705293655, "memory(GiB)": 21.48, "step": 14084, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.95372 }, { "epoch": 0.45755774291004775, "grad_norm": 0.43367740511894226, "learning_rate": 6.1061698534858385e-06, "loss": 0.023708662018179893, "memory(GiB)": 21.48, "step": 14085, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953735 }, { "epoch": 0.45759022837280316, "grad_norm": 0.4158959686756134, "learning_rate": 6.10564600441049e-06, "loss": 0.020561696961522102, "memory(GiB)": 21.48, "step": 14086, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953749 }, { "epoch": 0.4576227138355586, "grad_norm": 0.3886276185512543, "learning_rate": 6.105122142574515e-06, "loss": 0.024593383073806763, "memory(GiB)": 21.48, "step": 14087, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.953763 }, { "epoch": 0.457655199298314, "grad_norm": 0.3398903012275696, "learning_rate": 6.104598267983959e-06, "loss": 0.024833427742123604, "memory(GiB)": 21.48, "step": 14088, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953778 }, { "epoch": 0.4576876847610694, "grad_norm": 0.2748976945877075, "learning_rate": 6.104074380644869e-06, "loss": 0.018952542915940285, "memory(GiB)": 21.48, "step": 14089, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953792 }, { "epoch": 0.45772017022382483, "grad_norm": 0.4340967535972595, "learning_rate": 6.1035504805632916e-06, "loss": 0.024504171684384346, "memory(GiB)": 21.48, "step": 14090, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.953807 }, { "epoch": 0.45775265568658025, "grad_norm": 0.39768967032432556, "learning_rate": 6.1030265677452714e-06, "loss": 0.02687913179397583, "memory(GiB)": 21.48, "step": 14091, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.953821 }, { "epoch": 0.45778514114933566, "grad_norm": 0.5412459969520569, "learning_rate": 6.102502642196857e-06, "loss": 0.02907743491232395, "memory(GiB)": 21.48, "step": 14092, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.953836 }, { "epoch": 0.4578176266120911, "grad_norm": 0.46813061833381653, "learning_rate": 6.101978703924095e-06, "loss": 0.02058328688144684, "memory(GiB)": 21.48, "step": 14093, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.95385 }, { "epoch": 0.4578501120748465, "grad_norm": 0.7456794381141663, "learning_rate": 6.101454752933032e-06, "loss": 0.030234092846512794, "memory(GiB)": 21.48, "step": 14094, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.953864 }, { "epoch": 0.4578825975376019, "grad_norm": 0.35990214347839355, "learning_rate": 6.100930789229716e-06, "loss": 0.01929548755288124, "memory(GiB)": 21.48, "step": 14095, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.953879 }, { "epoch": 0.4579150830003573, "grad_norm": 0.3195289373397827, "learning_rate": 6.100406812820192e-06, "loss": 0.015676673501729965, "memory(GiB)": 21.48, "step": 14096, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.953893 }, { "epoch": 0.45794756846311274, "grad_norm": 0.6022314429283142, "learning_rate": 6.09988282371051e-06, "loss": 0.03879459202289581, "memory(GiB)": 21.48, "step": 14097, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953906 }, { "epoch": 0.45798005392586816, "grad_norm": 0.6196302771568298, "learning_rate": 6.099358821906717e-06, "loss": 0.026939192786812782, "memory(GiB)": 21.48, "step": 14098, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.4580125393886236, "grad_norm": 0.4774348735809326, "learning_rate": 6.098834807414857e-06, "loss": 0.02255878411233425, "memory(GiB)": 21.48, "step": 14099, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953936 }, { "epoch": 0.458045024851379, "grad_norm": 0.34121108055114746, "learning_rate": 6.098310780240984e-06, "loss": 0.024568919092416763, "memory(GiB)": 21.48, "step": 14100, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953951 }, { "epoch": 0.4580775103141344, "grad_norm": 0.3328586220741272, "learning_rate": 6.09778674039114e-06, "loss": 0.017285913228988647, "memory(GiB)": 21.48, "step": 14101, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.953964 }, { "epoch": 0.4581099957768898, "grad_norm": 0.3171097934246063, "learning_rate": 6.097262687871378e-06, "loss": 0.0229334719479084, "memory(GiB)": 21.48, "step": 14102, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953976 }, { "epoch": 0.45814248123964524, "grad_norm": 0.3365561068058014, "learning_rate": 6.096738622687743e-06, "loss": 0.023055734112858772, "memory(GiB)": 21.48, "step": 14103, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953986 }, { "epoch": 0.45817496670240065, "grad_norm": 0.3399171233177185, "learning_rate": 6.0962145448462854e-06, "loss": 0.024464037269353867, "memory(GiB)": 21.48, "step": 14104, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.953998 }, { "epoch": 0.45820745216515607, "grad_norm": 0.2549121677875519, "learning_rate": 6.09569045435305e-06, "loss": 0.022314995527267456, "memory(GiB)": 21.48, "step": 14105, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95401 }, { "epoch": 0.4582399376279115, "grad_norm": 0.4482806921005249, "learning_rate": 6.095166351214093e-06, "loss": 0.02829321287572384, "memory(GiB)": 21.48, "step": 14106, "token_acc": 0.9787985865724381, "train_speed(iter/s)": 0.954019 }, { "epoch": 0.4582724230906669, "grad_norm": 0.452871173620224, "learning_rate": 6.0946422354354555e-06, "loss": 0.02297251671552658, "memory(GiB)": 21.48, "step": 14107, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954031 }, { "epoch": 0.4583049085534223, "grad_norm": 0.34047508239746094, "learning_rate": 6.09411810702319e-06, "loss": 0.025491006672382355, "memory(GiB)": 21.48, "step": 14108, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954043 }, { "epoch": 0.45833739401617773, "grad_norm": 0.39236029982566833, "learning_rate": 6.0935939659833455e-06, "loss": 0.02283160202205181, "memory(GiB)": 21.48, "step": 14109, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954055 }, { "epoch": 0.45836987947893315, "grad_norm": 0.3357200622558594, "learning_rate": 6.09306981232197e-06, "loss": 0.020075460895895958, "memory(GiB)": 21.48, "step": 14110, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.45840236494168857, "grad_norm": 0.40953919291496277, "learning_rate": 6.092545646045116e-06, "loss": 0.026791129261255264, "memory(GiB)": 21.48, "step": 14111, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.954076 }, { "epoch": 0.45843485040444404, "grad_norm": 0.3574337363243103, "learning_rate": 6.092021467158831e-06, "loss": 0.021587492898106575, "memory(GiB)": 21.48, "step": 14112, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954086 }, { "epoch": 0.45846733586719945, "grad_norm": 0.36168229579925537, "learning_rate": 6.091497275669164e-06, "loss": 0.020785804837942123, "memory(GiB)": 21.48, "step": 14113, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954096 }, { "epoch": 0.45849982132995487, "grad_norm": 0.3470449447631836, "learning_rate": 6.090973071582166e-06, "loss": 0.022977139800786972, "memory(GiB)": 21.48, "step": 14114, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.954106 }, { "epoch": 0.4585323067927103, "grad_norm": 0.4056602120399475, "learning_rate": 6.0904488549038855e-06, "loss": 0.02098465897142887, "memory(GiB)": 21.48, "step": 14115, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954116 }, { "epoch": 0.4585647922554657, "grad_norm": 0.5636491775512695, "learning_rate": 6.089924625640375e-06, "loss": 0.019308622926473618, "memory(GiB)": 21.48, "step": 14116, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954124 }, { "epoch": 0.4585972777182211, "grad_norm": 0.284486323595047, "learning_rate": 6.089400383797683e-06, "loss": 0.023497380316257477, "memory(GiB)": 21.48, "step": 14117, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.45862976318097654, "grad_norm": 0.3081872761249542, "learning_rate": 6.088876129381862e-06, "loss": 0.0215647853910923, "memory(GiB)": 21.48, "step": 14118, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954144 }, { "epoch": 0.45866224864373195, "grad_norm": 0.3999481797218323, "learning_rate": 6.088351862398959e-06, "loss": 0.02517935074865818, "memory(GiB)": 21.48, "step": 14119, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954154 }, { "epoch": 0.45869473410648737, "grad_norm": 0.45787572860717773, "learning_rate": 6.087827582855028e-06, "loss": 0.026356523856520653, "memory(GiB)": 21.48, "step": 14120, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954164 }, { "epoch": 0.4587272195692428, "grad_norm": 0.6740930676460266, "learning_rate": 6.08730329075612e-06, "loss": 0.024725230410695076, "memory(GiB)": 21.48, "step": 14121, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954173 }, { "epoch": 0.4587597050319982, "grad_norm": 0.32159659266471863, "learning_rate": 6.0867789861082835e-06, "loss": 0.013803926296532154, "memory(GiB)": 21.48, "step": 14122, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954183 }, { "epoch": 0.4587921904947536, "grad_norm": 0.39059120416641235, "learning_rate": 6.08625466891757e-06, "loss": 0.023837439715862274, "memory(GiB)": 21.48, "step": 14123, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.954193 }, { "epoch": 0.45882467595750903, "grad_norm": 0.4584566056728363, "learning_rate": 6.085730339190033e-06, "loss": 0.023720083758234978, "memory(GiB)": 21.48, "step": 14124, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954202 }, { "epoch": 0.45885716142026445, "grad_norm": 0.5466629266738892, "learning_rate": 6.0852059969317235e-06, "loss": 0.02036418579518795, "memory(GiB)": 21.48, "step": 14125, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954212 }, { "epoch": 0.45888964688301986, "grad_norm": 0.6575833559036255, "learning_rate": 6.084681642148692e-06, "loss": 0.032368432730436325, "memory(GiB)": 21.48, "step": 14126, "token_acc": 1.0, "train_speed(iter/s)": 0.954224 }, { "epoch": 0.4589221323457753, "grad_norm": 0.4588162302970886, "learning_rate": 6.084157274846989e-06, "loss": 0.03447175770998001, "memory(GiB)": 21.48, "step": 14127, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.4589546178085307, "grad_norm": 0.41192570328712463, "learning_rate": 6.083632895032671e-06, "loss": 0.018201518803834915, "memory(GiB)": 21.48, "step": 14128, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.4589871032712861, "grad_norm": 0.5248187184333801, "learning_rate": 6.083108502711785e-06, "loss": 0.029674820601940155, "memory(GiB)": 21.48, "step": 14129, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.954255 }, { "epoch": 0.45901958873404153, "grad_norm": 0.3387773334980011, "learning_rate": 6.082584097890385e-06, "loss": 0.02149171382188797, "memory(GiB)": 21.48, "step": 14130, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.954265 }, { "epoch": 0.45905207419679694, "grad_norm": 0.3604181706905365, "learning_rate": 6.082059680574525e-06, "loss": 0.0206045713275671, "memory(GiB)": 21.48, "step": 14131, "token_acc": 0.992, "train_speed(iter/s)": 0.954276 }, { "epoch": 0.45908455965955236, "grad_norm": 0.32709255814552307, "learning_rate": 6.081535250770255e-06, "loss": 0.028471341356635094, "memory(GiB)": 21.48, "step": 14132, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954289 }, { "epoch": 0.4591170451223078, "grad_norm": 0.46837329864501953, "learning_rate": 6.081010808483628e-06, "loss": 0.026802413165569305, "memory(GiB)": 21.48, "step": 14133, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954304 }, { "epoch": 0.4591495305850632, "grad_norm": 0.5647604465484619, "learning_rate": 6.080486353720699e-06, "loss": 0.02727476879954338, "memory(GiB)": 21.48, "step": 14134, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954318 }, { "epoch": 0.4591820160478186, "grad_norm": 0.33716973662376404, "learning_rate": 6.07996188648752e-06, "loss": 0.015145748853683472, "memory(GiB)": 21.48, "step": 14135, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954332 }, { "epoch": 0.459214501510574, "grad_norm": 0.3173065781593323, "learning_rate": 6.079437406790142e-06, "loss": 0.01952989026904106, "memory(GiB)": 21.48, "step": 14136, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954346 }, { "epoch": 0.45924698697332944, "grad_norm": 0.39119577407836914, "learning_rate": 6.07891291463462e-06, "loss": 0.024573925882577896, "memory(GiB)": 21.48, "step": 14137, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.95436 }, { "epoch": 0.45927947243608486, "grad_norm": 0.4015670418739319, "learning_rate": 6.078388410027005e-06, "loss": 0.023507703095674515, "memory(GiB)": 21.48, "step": 14138, "token_acc": 1.0, "train_speed(iter/s)": 0.954374 }, { "epoch": 0.4593119578988403, "grad_norm": 0.3453786373138428, "learning_rate": 6.077863892973355e-06, "loss": 0.021607907488942146, "memory(GiB)": 21.48, "step": 14139, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954388 }, { "epoch": 0.4593444433615957, "grad_norm": 0.6566636562347412, "learning_rate": 6.0773393634797205e-06, "loss": 0.028463110327720642, "memory(GiB)": 21.48, "step": 14140, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.954402 }, { "epoch": 0.4593769288243511, "grad_norm": 0.45420122146606445, "learning_rate": 6.076814821552155e-06, "loss": 0.023219667375087738, "memory(GiB)": 21.48, "step": 14141, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.4594094142871065, "grad_norm": 0.33788156509399414, "learning_rate": 6.076290267196714e-06, "loss": 0.022970352321863174, "memory(GiB)": 21.48, "step": 14142, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954431 }, { "epoch": 0.45944189974986194, "grad_norm": 0.3991869390010834, "learning_rate": 6.075765700419451e-06, "loss": 0.02078518643975258, "memory(GiB)": 21.48, "step": 14143, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954445 }, { "epoch": 0.45947438521261735, "grad_norm": 0.32388463616371155, "learning_rate": 6.07524112122642e-06, "loss": 0.018676672130823135, "memory(GiB)": 21.48, "step": 14144, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954459 }, { "epoch": 0.45950687067537277, "grad_norm": 0.40620702505111694, "learning_rate": 6.074716529623674e-06, "loss": 0.02817240171134472, "memory(GiB)": 21.48, "step": 14145, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954474 }, { "epoch": 0.4595393561381282, "grad_norm": 0.27852144837379456, "learning_rate": 6.07419192561727e-06, "loss": 0.017247704789042473, "memory(GiB)": 21.48, "step": 14146, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95449 }, { "epoch": 0.4595718416008836, "grad_norm": 0.3049381971359253, "learning_rate": 6.07366730921326e-06, "loss": 0.018978895619511604, "memory(GiB)": 21.48, "step": 14147, "token_acc": 0.9875, "train_speed(iter/s)": 0.954505 }, { "epoch": 0.459604327063639, "grad_norm": 0.307503342628479, "learning_rate": 6.073142680417702e-06, "loss": 0.0225665420293808, "memory(GiB)": 21.48, "step": 14148, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954519 }, { "epoch": 0.45963681252639443, "grad_norm": 0.38372141122817993, "learning_rate": 6.07261803923665e-06, "loss": 0.029276389628648758, "memory(GiB)": 21.48, "step": 14149, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.954534 }, { "epoch": 0.45966929798914985, "grad_norm": 0.3671022057533264, "learning_rate": 6.072093385676156e-06, "loss": 0.02007528580725193, "memory(GiB)": 21.48, "step": 14150, "token_acc": 1.0, "train_speed(iter/s)": 0.954548 }, { "epoch": 0.45970178345190527, "grad_norm": 0.33093297481536865, "learning_rate": 6.071568719742278e-06, "loss": 0.017044417560100555, "memory(GiB)": 21.48, "step": 14151, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954563 }, { "epoch": 0.4597342689146607, "grad_norm": 0.3547888398170471, "learning_rate": 6.0710440414410695e-06, "loss": 0.01664634980261326, "memory(GiB)": 21.48, "step": 14152, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954578 }, { "epoch": 0.4597667543774161, "grad_norm": 0.3073669672012329, "learning_rate": 6.07051935077859e-06, "loss": 0.029236389324069023, "memory(GiB)": 21.48, "step": 14153, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.954593 }, { "epoch": 0.4597992398401715, "grad_norm": 0.35614028573036194, "learning_rate": 6.06999464776089e-06, "loss": 0.022476252168416977, "memory(GiB)": 21.48, "step": 14154, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954607 }, { "epoch": 0.45983172530292693, "grad_norm": 0.34893178939819336, "learning_rate": 6.069469932394028e-06, "loss": 0.01797301322221756, "memory(GiB)": 21.48, "step": 14155, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.954621 }, { "epoch": 0.45986421076568235, "grad_norm": 0.3451945185661316, "learning_rate": 6.068945204684059e-06, "loss": 0.022965779528021812, "memory(GiB)": 21.48, "step": 14156, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.954635 }, { "epoch": 0.45989669622843776, "grad_norm": 0.3526555299758911, "learning_rate": 6.068420464637041e-06, "loss": 0.02122647315263748, "memory(GiB)": 21.48, "step": 14157, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.4599291816911932, "grad_norm": 0.36512571573257446, "learning_rate": 6.067895712259027e-06, "loss": 0.01907844841480255, "memory(GiB)": 21.48, "step": 14158, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954664 }, { "epoch": 0.4599616671539486, "grad_norm": 0.34782710671424866, "learning_rate": 6.0673709475560774e-06, "loss": 0.01863589510321617, "memory(GiB)": 21.48, "step": 14159, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954678 }, { "epoch": 0.459994152616704, "grad_norm": 0.36435383558273315, "learning_rate": 6.066846170534245e-06, "loss": 0.023563813418149948, "memory(GiB)": 21.48, "step": 14160, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954692 }, { "epoch": 0.4600266380794594, "grad_norm": 0.9804219007492065, "learning_rate": 6.066321381199588e-06, "loss": 0.027701664716005325, "memory(GiB)": 21.48, "step": 14161, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954706 }, { "epoch": 0.46005912354221484, "grad_norm": 0.4069008529186249, "learning_rate": 6.065796579558163e-06, "loss": 0.020594051107764244, "memory(GiB)": 21.48, "step": 14162, "token_acc": 0.988, "train_speed(iter/s)": 0.954722 }, { "epoch": 0.46009160900497026, "grad_norm": 0.3702165484428406, "learning_rate": 6.065271765616028e-06, "loss": 0.026035793125629425, "memory(GiB)": 21.48, "step": 14163, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.954734 }, { "epoch": 0.4601240944677257, "grad_norm": 0.37915974855422974, "learning_rate": 6.064746939379237e-06, "loss": 0.019277231767773628, "memory(GiB)": 21.48, "step": 14164, "token_acc": 0.9817073170731707, "train_speed(iter/s)": 0.954746 }, { "epoch": 0.4601565799304811, "grad_norm": 0.5215632319450378, "learning_rate": 6.06422210085385e-06, "loss": 0.02730589359998703, "memory(GiB)": 21.48, "step": 14165, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.4601890653932365, "grad_norm": 0.32976263761520386, "learning_rate": 6.063697250045924e-06, "loss": 0.024578124284744263, "memory(GiB)": 21.48, "step": 14166, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954767 }, { "epoch": 0.4602215508559919, "grad_norm": 0.36312779784202576, "learning_rate": 6.063172386961516e-06, "loss": 0.023470263928174973, "memory(GiB)": 21.48, "step": 14167, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954778 }, { "epoch": 0.46025403631874734, "grad_norm": 0.27599579095840454, "learning_rate": 6.062647511606683e-06, "loss": 0.016907796263694763, "memory(GiB)": 21.48, "step": 14168, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.954791 }, { "epoch": 0.46028652178150276, "grad_norm": 0.7212283611297607, "learning_rate": 6.0621226239874816e-06, "loss": 0.03617286682128906, "memory(GiB)": 21.48, "step": 14169, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.954803 }, { "epoch": 0.46031900724425817, "grad_norm": 0.361758828163147, "learning_rate": 6.061597724109974e-06, "loss": 0.024295197799801826, "memory(GiB)": 21.48, "step": 14170, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954815 }, { "epoch": 0.4603514927070136, "grad_norm": 0.540370762348175, "learning_rate": 6.061072811980215e-06, "loss": 0.023412059992551804, "memory(GiB)": 21.48, "step": 14171, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.954826 }, { "epoch": 0.460383978169769, "grad_norm": 0.28761082887649536, "learning_rate": 6.060547887604264e-06, "loss": 0.021959947422146797, "memory(GiB)": 21.48, "step": 14172, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.954837 }, { "epoch": 0.4604164636325244, "grad_norm": 0.3060084581375122, "learning_rate": 6.060022950988178e-06, "loss": 0.022338151931762695, "memory(GiB)": 21.48, "step": 14173, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954846 }, { "epoch": 0.46044894909527984, "grad_norm": 0.3890329599380493, "learning_rate": 6.059498002138017e-06, "loss": 0.01874520629644394, "memory(GiB)": 21.48, "step": 14174, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954856 }, { "epoch": 0.46048143455803525, "grad_norm": 0.446600079536438, "learning_rate": 6.058973041059837e-06, "loss": 0.024858206510543823, "memory(GiB)": 21.48, "step": 14175, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.4605139200207907, "grad_norm": 0.35071927309036255, "learning_rate": 6.058448067759702e-06, "loss": 0.028505578637123108, "memory(GiB)": 21.48, "step": 14176, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954874 }, { "epoch": 0.46054640548354614, "grad_norm": 0.2874546945095062, "learning_rate": 6.057923082243665e-06, "loss": 0.016714081168174744, "memory(GiB)": 21.48, "step": 14177, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.954883 }, { "epoch": 0.46057889094630156, "grad_norm": 0.28984519839286804, "learning_rate": 6.057398084517787e-06, "loss": 0.023580890148878098, "memory(GiB)": 21.48, "step": 14178, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954892 }, { "epoch": 0.46061137640905697, "grad_norm": 0.29473647475242615, "learning_rate": 6.05687307458813e-06, "loss": 0.019222818315029144, "memory(GiB)": 21.48, "step": 14179, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954903 }, { "epoch": 0.4606438618718124, "grad_norm": 0.3362594246864319, "learning_rate": 6.056348052460749e-06, "loss": 0.021397531032562256, "memory(GiB)": 21.48, "step": 14180, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954912 }, { "epoch": 0.4606763473345678, "grad_norm": 0.33263394236564636, "learning_rate": 6.055823018141707e-06, "loss": 0.019126465544104576, "memory(GiB)": 21.48, "step": 14181, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954921 }, { "epoch": 0.4607088327973232, "grad_norm": 0.3833146095275879, "learning_rate": 6.055297971637061e-06, "loss": 0.023956406861543655, "memory(GiB)": 21.48, "step": 14182, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954928 }, { "epoch": 0.46074131826007864, "grad_norm": 0.38603490591049194, "learning_rate": 6.054772912952872e-06, "loss": 0.024189267307519913, "memory(GiB)": 21.48, "step": 14183, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954937 }, { "epoch": 0.46077380372283405, "grad_norm": 0.6296491622924805, "learning_rate": 6.054247842095199e-06, "loss": 0.025314079597592354, "memory(GiB)": 21.48, "step": 14184, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954946 }, { "epoch": 0.46080628918558947, "grad_norm": 0.6980798840522766, "learning_rate": 6.053722759070103e-06, "loss": 0.02725018933415413, "memory(GiB)": 21.48, "step": 14185, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954955 }, { "epoch": 0.4608387746483449, "grad_norm": 0.38505032658576965, "learning_rate": 6.0531976638836455e-06, "loss": 0.022817108780145645, "memory(GiB)": 21.48, "step": 14186, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.954965 }, { "epoch": 0.4608712601111003, "grad_norm": 0.43303894996643066, "learning_rate": 6.052672556541883e-06, "loss": 0.026827817782759666, "memory(GiB)": 21.48, "step": 14187, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954976 }, { "epoch": 0.4609037455738557, "grad_norm": 0.40981459617614746, "learning_rate": 6.05214743705088e-06, "loss": 0.026187699288129807, "memory(GiB)": 21.48, "step": 14188, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954987 }, { "epoch": 0.46093623103661113, "grad_norm": 0.43749353289604187, "learning_rate": 6.051622305416693e-06, "loss": 0.02615264430642128, "memory(GiB)": 21.48, "step": 14189, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954998 }, { "epoch": 0.46096871649936655, "grad_norm": 0.7075613737106323, "learning_rate": 6.051097161645387e-06, "loss": 0.022857602685689926, "memory(GiB)": 21.48, "step": 14190, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.46100120196212196, "grad_norm": 0.26802346110343933, "learning_rate": 6.050572005743021e-06, "loss": 0.01527445949614048, "memory(GiB)": 21.48, "step": 14191, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.4610336874248774, "grad_norm": 0.22731727361679077, "learning_rate": 6.050046837715653e-06, "loss": 0.016800031065940857, "memory(GiB)": 21.48, "step": 14192, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955033 }, { "epoch": 0.4610661728876328, "grad_norm": 0.2632320821285248, "learning_rate": 6.04952165756935e-06, "loss": 0.01726023107767105, "memory(GiB)": 21.48, "step": 14193, "token_acc": 1.0, "train_speed(iter/s)": 0.955047 }, { "epoch": 0.4610986583503882, "grad_norm": 0.40479251742362976, "learning_rate": 6.048996465310168e-06, "loss": 0.026345880702137947, "memory(GiB)": 21.48, "step": 14194, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955061 }, { "epoch": 0.46113114381314363, "grad_norm": 0.43367820978164673, "learning_rate": 6.048471260944171e-06, "loss": 0.017793327569961548, "memory(GiB)": 21.48, "step": 14195, "token_acc": 1.0, "train_speed(iter/s)": 0.955075 }, { "epoch": 0.46116362927589905, "grad_norm": 0.6235884428024292, "learning_rate": 6.047946044477421e-06, "loss": 0.03306080773472786, "memory(GiB)": 21.48, "step": 14196, "token_acc": 1.0, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.46119611473865446, "grad_norm": 0.433101624250412, "learning_rate": 6.047420815915979e-06, "loss": 0.028216756880283356, "memory(GiB)": 21.48, "step": 14197, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955103 }, { "epoch": 0.4612286002014099, "grad_norm": 0.26818037033081055, "learning_rate": 6.046895575265904e-06, "loss": 0.017731981351971626, "memory(GiB)": 21.48, "step": 14198, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.955116 }, { "epoch": 0.4612610856641653, "grad_norm": 0.9122522473335266, "learning_rate": 6.046370322533264e-06, "loss": 0.022906053811311722, "memory(GiB)": 21.48, "step": 14199, "token_acc": 0.996, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.4612935711269207, "grad_norm": 0.377745658159256, "learning_rate": 6.045845057724116e-06, "loss": 0.02751358598470688, "memory(GiB)": 21.48, "step": 14200, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.4613260565896761, "grad_norm": 0.42885714769363403, "learning_rate": 6.045319780844523e-06, "loss": 0.025609338656067848, "memory(GiB)": 21.48, "step": 14201, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955157 }, { "epoch": 0.46135854205243154, "grad_norm": 0.2955900728702545, "learning_rate": 6.044794491900551e-06, "loss": 0.02171928994357586, "memory(GiB)": 21.48, "step": 14202, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.955171 }, { "epoch": 0.46139102751518696, "grad_norm": 0.3107181787490845, "learning_rate": 6.044269190898257e-06, "loss": 0.0205975528806448, "memory(GiB)": 21.48, "step": 14203, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.4614235129779424, "grad_norm": 0.3567691743373871, "learning_rate": 6.043743877843709e-06, "loss": 0.019097302109003067, "memory(GiB)": 21.48, "step": 14204, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.4614559984406978, "grad_norm": 0.5342021584510803, "learning_rate": 6.043218552742967e-06, "loss": 0.028060855343937874, "memory(GiB)": 21.48, "step": 14205, "token_acc": 1.0, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.4614884839034532, "grad_norm": 0.5193575620651245, "learning_rate": 6.042693215602094e-06, "loss": 0.021378837525844574, "memory(GiB)": 21.48, "step": 14206, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.4615209693662086, "grad_norm": 0.22874639928340912, "learning_rate": 6.042167866427154e-06, "loss": 0.013779433444142342, "memory(GiB)": 21.48, "step": 14207, "token_acc": 1.0, "train_speed(iter/s)": 0.955242 }, { "epoch": 0.46155345482896404, "grad_norm": 0.3538561165332794, "learning_rate": 6.041642505224209e-06, "loss": 0.02864476665854454, "memory(GiB)": 21.48, "step": 14208, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.46158594029171945, "grad_norm": 0.3075720965862274, "learning_rate": 6.0411171319993244e-06, "loss": 0.020736947655677795, "memory(GiB)": 21.48, "step": 14209, "token_acc": 1.0, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.46161842575447487, "grad_norm": 0.41122421622276306, "learning_rate": 6.040591746758561e-06, "loss": 0.026603084057569504, "memory(GiB)": 21.48, "step": 14210, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955285 }, { "epoch": 0.4616509112172303, "grad_norm": 0.42377790808677673, "learning_rate": 6.040066349507983e-06, "loss": 0.02474203333258629, "memory(GiB)": 21.48, "step": 14211, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.955299 }, { "epoch": 0.4616833966799857, "grad_norm": 0.3106538951396942, "learning_rate": 6.039540940253656e-06, "loss": 0.02238592691719532, "memory(GiB)": 21.48, "step": 14212, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.4617158821427411, "grad_norm": 0.47116103768348694, "learning_rate": 6.039015519001644e-06, "loss": 0.018890751525759697, "memory(GiB)": 21.48, "step": 14213, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.46174836760549653, "grad_norm": 0.46071964502334595, "learning_rate": 6.038490085758008e-06, "loss": 0.026384960860013962, "memory(GiB)": 21.48, "step": 14214, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.955342 }, { "epoch": 0.46178085306825195, "grad_norm": 0.3792777955532074, "learning_rate": 6.037964640528816e-06, "loss": 0.021373730152845383, "memory(GiB)": 21.48, "step": 14215, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.46181333853100737, "grad_norm": 0.49239200353622437, "learning_rate": 6.03743918332013e-06, "loss": 0.023118048906326294, "memory(GiB)": 21.48, "step": 14216, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.4618458239937628, "grad_norm": 0.3124614655971527, "learning_rate": 6.036913714138014e-06, "loss": 0.022232338786125183, "memory(GiB)": 21.48, "step": 14217, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.4618783094565182, "grad_norm": 0.3476889431476593, "learning_rate": 6.036388232988535e-06, "loss": 0.0203777477145195, "memory(GiB)": 21.48, "step": 14218, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955399 }, { "epoch": 0.4619107949192736, "grad_norm": 0.39652112126350403, "learning_rate": 6.0358627398777545e-06, "loss": 0.024449383839964867, "memory(GiB)": 21.48, "step": 14219, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.46194328038202903, "grad_norm": 0.33124613761901855, "learning_rate": 6.035337234811741e-06, "loss": 0.025421112775802612, "memory(GiB)": 21.48, "step": 14220, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.46197576584478445, "grad_norm": 0.320490300655365, "learning_rate": 6.0348117177965585e-06, "loss": 0.020364079624414444, "memory(GiB)": 21.48, "step": 14221, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.46200825130753986, "grad_norm": 0.20431971549987793, "learning_rate": 6.03428618883827e-06, "loss": 0.012189609929919243, "memory(GiB)": 21.48, "step": 14222, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955455 }, { "epoch": 0.4620407367702953, "grad_norm": 0.3013032078742981, "learning_rate": 6.033760647942942e-06, "loss": 0.017455557361245155, "memory(GiB)": 21.48, "step": 14223, "token_acc": 1.0, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.4620732222330507, "grad_norm": 0.2836512625217438, "learning_rate": 6.033235095116641e-06, "loss": 0.018036844208836555, "memory(GiB)": 21.48, "step": 14224, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.4621057076958061, "grad_norm": 0.2806480824947357, "learning_rate": 6.0327095303654315e-06, "loss": 0.022897768765687943, "memory(GiB)": 21.48, "step": 14225, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.4621381931585615, "grad_norm": 0.5797943472862244, "learning_rate": 6.032183953695379e-06, "loss": 0.0313858836889267, "memory(GiB)": 21.48, "step": 14226, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.46217067862131694, "grad_norm": 0.4150247871875763, "learning_rate": 6.03165836511255e-06, "loss": 0.023685410618782043, "memory(GiB)": 21.48, "step": 14227, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.46220316408407236, "grad_norm": 0.5068880915641785, "learning_rate": 6.031132764623011e-06, "loss": 0.031605031341314316, "memory(GiB)": 21.48, "step": 14228, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955538 }, { "epoch": 0.4622356495468278, "grad_norm": 0.5890386700630188, "learning_rate": 6.0306071522328265e-06, "loss": 0.019009200856089592, "memory(GiB)": 21.48, "step": 14229, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.4622681350095832, "grad_norm": 0.5610077977180481, "learning_rate": 6.030081527948065e-06, "loss": 0.022907419130206108, "memory(GiB)": 21.48, "step": 14230, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95556 }, { "epoch": 0.4623006204723386, "grad_norm": 0.3784579932689667, "learning_rate": 6.0295558917747895e-06, "loss": 0.023874860256910324, "memory(GiB)": 21.48, "step": 14231, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.462333105935094, "grad_norm": 0.2609046399593353, "learning_rate": 6.029030243719071e-06, "loss": 0.016511905938386917, "memory(GiB)": 21.48, "step": 14232, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.46236559139784944, "grad_norm": 0.44204580783843994, "learning_rate": 6.0285045837869726e-06, "loss": 0.032845333218574524, "memory(GiB)": 21.48, "step": 14233, "token_acc": 0.976, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.46239807686060486, "grad_norm": 0.3477489650249481, "learning_rate": 6.027978911984563e-06, "loss": 0.02650291472673416, "memory(GiB)": 21.48, "step": 14234, "token_acc": 1.0, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.4624305623233603, "grad_norm": 0.3914322257041931, "learning_rate": 6.027453228317909e-06, "loss": 0.022799735888838768, "memory(GiB)": 21.48, "step": 14235, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.4624630477861157, "grad_norm": 0.544212281703949, "learning_rate": 6.0269275327930754e-06, "loss": 0.017639007419347763, "memory(GiB)": 21.48, "step": 14236, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.4624955332488711, "grad_norm": 0.7642272710800171, "learning_rate": 6.026401825416133e-06, "loss": 0.026028243824839592, "memory(GiB)": 21.48, "step": 14237, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.4625280187116265, "grad_norm": 0.5379183292388916, "learning_rate": 6.025876106193146e-06, "loss": 0.025416109710931778, "memory(GiB)": 21.48, "step": 14238, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.46256050417438194, "grad_norm": 0.3887676000595093, "learning_rate": 6.025350375130185e-06, "loss": 0.022168882191181183, "memory(GiB)": 21.48, "step": 14239, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955647 }, { "epoch": 0.4625929896371374, "grad_norm": 0.40229320526123047, "learning_rate": 6.024824632233313e-06, "loss": 0.023399800062179565, "memory(GiB)": 21.48, "step": 14240, "token_acc": 1.0, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.4626254750998928, "grad_norm": 0.3928418457508087, "learning_rate": 6.0242988775086044e-06, "loss": 0.025790700688958168, "memory(GiB)": 21.48, "step": 14241, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.955666 }, { "epoch": 0.46265796056264824, "grad_norm": 0.4324667751789093, "learning_rate": 6.023773110962121e-06, "loss": 0.021664049476385117, "memory(GiB)": 21.48, "step": 14242, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.46269044602540366, "grad_norm": 0.43348363041877747, "learning_rate": 6.023247332599934e-06, "loss": 0.021990347653627396, "memory(GiB)": 21.48, "step": 14243, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.4627229314881591, "grad_norm": 0.4697158932685852, "learning_rate": 6.022721542428111e-06, "loss": 0.025343913584947586, "memory(GiB)": 21.48, "step": 14244, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.4627554169509145, "grad_norm": 1.7914379835128784, "learning_rate": 6.0221957404527195e-06, "loss": 0.0344795361161232, "memory(GiB)": 21.48, "step": 14245, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.4627879024136699, "grad_norm": 0.44129979610443115, "learning_rate": 6.0216699266798285e-06, "loss": 0.027561137452721596, "memory(GiB)": 21.48, "step": 14246, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955713 }, { "epoch": 0.4628203878764253, "grad_norm": 0.5045895576477051, "learning_rate": 6.021144101115507e-06, "loss": 0.020298048853874207, "memory(GiB)": 21.48, "step": 14247, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.46285287333918074, "grad_norm": 0.3028017282485962, "learning_rate": 6.020618263765824e-06, "loss": 0.02496200054883957, "memory(GiB)": 21.48, "step": 14248, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955731 }, { "epoch": 0.46288535880193615, "grad_norm": 0.5373325943946838, "learning_rate": 6.020092414636848e-06, "loss": 0.0246268343180418, "memory(GiB)": 21.48, "step": 14249, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.46291784426469157, "grad_norm": 0.3684130907058716, "learning_rate": 6.019566553734646e-06, "loss": 0.026218418031930923, "memory(GiB)": 21.48, "step": 14250, "token_acc": 0.99, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.462950329727447, "grad_norm": 0.3235117197036743, "learning_rate": 6.019040681065291e-06, "loss": 0.024346131831407547, "memory(GiB)": 21.48, "step": 14251, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.4629828151902024, "grad_norm": 0.305049329996109, "learning_rate": 6.018514796634849e-06, "loss": 0.020159799605607986, "memory(GiB)": 21.48, "step": 14252, "token_acc": 1.0, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.4630153006529578, "grad_norm": 0.3406842052936554, "learning_rate": 6.0179889004493906e-06, "loss": 0.020898064598441124, "memory(GiB)": 21.48, "step": 14253, "token_acc": 1.0, "train_speed(iter/s)": 0.955786 }, { "epoch": 0.46304778611571323, "grad_norm": 0.41496512293815613, "learning_rate": 6.017462992514987e-06, "loss": 0.03036775439977646, "memory(GiB)": 21.48, "step": 14254, "token_acc": 0.988, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.46308027157846865, "grad_norm": 0.32314544916152954, "learning_rate": 6.016937072837704e-06, "loss": 0.02130456641316414, "memory(GiB)": 21.48, "step": 14255, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.46311275704122407, "grad_norm": 0.3574931025505066, "learning_rate": 6.016411141423615e-06, "loss": 0.02041732333600521, "memory(GiB)": 21.48, "step": 14256, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955828 }, { "epoch": 0.4631452425039795, "grad_norm": 0.4134511649608612, "learning_rate": 6.015885198278788e-06, "loss": 0.0237397700548172, "memory(GiB)": 21.48, "step": 14257, "token_acc": 1.0, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.4631777279667349, "grad_norm": 0.38169416785240173, "learning_rate": 6.0153592434092945e-06, "loss": 0.020635634660720825, "memory(GiB)": 21.48, "step": 14258, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.4632102134294903, "grad_norm": 0.3672221302986145, "learning_rate": 6.014833276821202e-06, "loss": 0.022038767114281654, "memory(GiB)": 21.48, "step": 14259, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.46324269889224573, "grad_norm": 0.2873810827732086, "learning_rate": 6.014307298520585e-06, "loss": 0.01960797980427742, "memory(GiB)": 21.48, "step": 14260, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.46327518435500115, "grad_norm": 0.5699331760406494, "learning_rate": 6.01378130851351e-06, "loss": 0.03417259454727173, "memory(GiB)": 21.48, "step": 14261, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.46330766981775656, "grad_norm": 0.4066120386123657, "learning_rate": 6.013255306806049e-06, "loss": 0.022946996614336967, "memory(GiB)": 21.48, "step": 14262, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955912 }, { "epoch": 0.463340155280512, "grad_norm": 0.3430071473121643, "learning_rate": 6.012729293404276e-06, "loss": 0.02471223846077919, "memory(GiB)": 21.48, "step": 14263, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.4633726407432674, "grad_norm": 0.4051896333694458, "learning_rate": 6.012203268314256e-06, "loss": 0.021358203142881393, "memory(GiB)": 21.48, "step": 14264, "token_acc": 0.996, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.4634051262060228, "grad_norm": 0.3944156765937805, "learning_rate": 6.011677231542064e-06, "loss": 0.018281713128089905, "memory(GiB)": 21.48, "step": 14265, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955954 }, { "epoch": 0.4634376116687782, "grad_norm": 0.3523004651069641, "learning_rate": 6.0111511830937706e-06, "loss": 0.01998407579958439, "memory(GiB)": 21.48, "step": 14266, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955968 }, { "epoch": 0.46347009713153364, "grad_norm": 0.4439365267753601, "learning_rate": 6.0106251229754465e-06, "loss": 0.03185540810227394, "memory(GiB)": 21.48, "step": 14267, "token_acc": 0.9768339768339769, "train_speed(iter/s)": 0.955983 }, { "epoch": 0.46350258259428906, "grad_norm": 0.3153812289237976, "learning_rate": 6.010099051193164e-06, "loss": 0.02312997728586197, "memory(GiB)": 21.48, "step": 14268, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.4635350680570445, "grad_norm": 0.247165709733963, "learning_rate": 6.0095729677529936e-06, "loss": 0.01472958642989397, "memory(GiB)": 21.48, "step": 14269, "token_acc": 1.0, "train_speed(iter/s)": 0.956011 }, { "epoch": 0.4635675535197999, "grad_norm": 0.47209039330482483, "learning_rate": 6.009046872661006e-06, "loss": 0.028985440731048584, "memory(GiB)": 21.48, "step": 14270, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.4636000389825553, "grad_norm": 0.39670389890670776, "learning_rate": 6.008520765923277e-06, "loss": 0.02228744700551033, "memory(GiB)": 21.48, "step": 14271, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.4636325244453107, "grad_norm": 0.27243128418922424, "learning_rate": 6.007994647545875e-06, "loss": 0.023454494774341583, "memory(GiB)": 21.48, "step": 14272, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.46366500990806614, "grad_norm": 0.2984466254711151, "learning_rate": 6.007468517534872e-06, "loss": 0.019702959805727005, "memory(GiB)": 21.48, "step": 14273, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956068 }, { "epoch": 0.46369749537082156, "grad_norm": 0.31258392333984375, "learning_rate": 6.006942375896343e-06, "loss": 0.019045397639274597, "memory(GiB)": 21.48, "step": 14274, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.46372998083357697, "grad_norm": 0.46909236907958984, "learning_rate": 6.006416222636358e-06, "loss": 0.0213625505566597, "memory(GiB)": 21.48, "step": 14275, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956096 }, { "epoch": 0.4637624662963324, "grad_norm": 0.28703245520591736, "learning_rate": 6.00589005776099e-06, "loss": 0.020353347063064575, "memory(GiB)": 21.48, "step": 14276, "token_acc": 1.0, "train_speed(iter/s)": 0.956111 }, { "epoch": 0.4637949517590878, "grad_norm": 0.3347167670726776, "learning_rate": 6.005363881276311e-06, "loss": 0.024609947577118874, "memory(GiB)": 21.48, "step": 14277, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956125 }, { "epoch": 0.4638274372218432, "grad_norm": 0.39631223678588867, "learning_rate": 6.004837693188395e-06, "loss": 0.024258580058813095, "memory(GiB)": 21.48, "step": 14278, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.46385992268459864, "grad_norm": 0.4090276062488556, "learning_rate": 6.004311493503317e-06, "loss": 0.023177873343229294, "memory(GiB)": 21.48, "step": 14279, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956152 }, { "epoch": 0.46389240814735405, "grad_norm": 0.3534427881240845, "learning_rate": 6.003785282227144e-06, "loss": 0.024145089089870453, "memory(GiB)": 21.48, "step": 14280, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.46392489361010947, "grad_norm": 0.3291895389556885, "learning_rate": 6.003259059365956e-06, "loss": 0.02473606914281845, "memory(GiB)": 21.48, "step": 14281, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956179 }, { "epoch": 0.4639573790728649, "grad_norm": 0.39135950803756714, "learning_rate": 6.002732824925821e-06, "loss": 0.01822511851787567, "memory(GiB)": 21.48, "step": 14282, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956194 }, { "epoch": 0.4639898645356203, "grad_norm": 0.3679543137550354, "learning_rate": 6.002206578912817e-06, "loss": 0.027365589514374733, "memory(GiB)": 21.48, "step": 14283, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956208 }, { "epoch": 0.4640223499983757, "grad_norm": 0.4633820056915283, "learning_rate": 6.0016803213330124e-06, "loss": 0.024412572383880615, "memory(GiB)": 21.48, "step": 14284, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956221 }, { "epoch": 0.46405483546113113, "grad_norm": 0.3027660846710205, "learning_rate": 6.001154052192486e-06, "loss": 0.023505836725234985, "memory(GiB)": 21.48, "step": 14285, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956235 }, { "epoch": 0.46408732092388655, "grad_norm": 0.25050413608551025, "learning_rate": 6.000627771497308e-06, "loss": 0.016081154346466064, "memory(GiB)": 21.48, "step": 14286, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.46411980638664196, "grad_norm": 0.39671990275382996, "learning_rate": 6.000101479253554e-06, "loss": 0.014325518161058426, "memory(GiB)": 21.48, "step": 14287, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956265 }, { "epoch": 0.4641522918493974, "grad_norm": 0.32365623116493225, "learning_rate": 5.999575175467298e-06, "loss": 0.018814992159605026, "memory(GiB)": 21.48, "step": 14288, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.4641847773121528, "grad_norm": 0.5346531867980957, "learning_rate": 5.9990488601446125e-06, "loss": 0.022464463487267494, "memory(GiB)": 21.48, "step": 14289, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956291 }, { "epoch": 0.4642172627749082, "grad_norm": 0.31454843282699585, "learning_rate": 5.9985225332915756e-06, "loss": 0.02301155775785446, "memory(GiB)": 21.48, "step": 14290, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.46424974823766363, "grad_norm": 0.3124275803565979, "learning_rate": 5.997996194914258e-06, "loss": 0.023647379130125046, "memory(GiB)": 21.48, "step": 14291, "token_acc": 1.0, "train_speed(iter/s)": 0.956318 }, { "epoch": 0.46428223370041904, "grad_norm": 0.34645992517471313, "learning_rate": 5.997469845018738e-06, "loss": 0.021979715675115585, "memory(GiB)": 21.48, "step": 14292, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.46431471916317446, "grad_norm": 0.37051934003829956, "learning_rate": 5.996943483611085e-06, "loss": 0.02825574204325676, "memory(GiB)": 21.48, "step": 14293, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.4643472046259299, "grad_norm": 0.3167692720890045, "learning_rate": 5.996417110697379e-06, "loss": 0.021015392616391182, "memory(GiB)": 21.48, "step": 14294, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.4643796900886853, "grad_norm": 0.34100717306137085, "learning_rate": 5.995890726283694e-06, "loss": 0.0231289304792881, "memory(GiB)": 21.48, "step": 14295, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.4644121755514407, "grad_norm": 0.40309256315231323, "learning_rate": 5.995364330376103e-06, "loss": 0.01845957711338997, "memory(GiB)": 21.48, "step": 14296, "token_acc": 1.0, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.4644446610141961, "grad_norm": 0.45328617095947266, "learning_rate": 5.994837922980685e-06, "loss": 0.02499696984887123, "memory(GiB)": 21.48, "step": 14297, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.46447714647695154, "grad_norm": 0.32439491152763367, "learning_rate": 5.994311504103511e-06, "loss": 0.015721965581178665, "memory(GiB)": 21.48, "step": 14298, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.956388 }, { "epoch": 0.46450963193970696, "grad_norm": 0.420559287071228, "learning_rate": 5.9937850737506606e-06, "loss": 0.020323004573583603, "memory(GiB)": 21.48, "step": 14299, "token_acc": 1.0, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.4645421174024624, "grad_norm": 0.44077807664871216, "learning_rate": 5.993258631928209e-06, "loss": 0.02475707419216633, "memory(GiB)": 21.48, "step": 14300, "token_acc": 1.0, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.4645746028652178, "grad_norm": 0.3945479989051819, "learning_rate": 5.992732178642228e-06, "loss": 0.022024717181921005, "memory(GiB)": 21.48, "step": 14301, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.4646070883279732, "grad_norm": 0.46388164162635803, "learning_rate": 5.9922057138987975e-06, "loss": 0.03195931389927864, "memory(GiB)": 21.48, "step": 14302, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.4646395737907286, "grad_norm": 0.22823281586170197, "learning_rate": 5.991679237703991e-06, "loss": 0.01551706064492464, "memory(GiB)": 21.48, "step": 14303, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.4646720592534841, "grad_norm": 0.3774203062057495, "learning_rate": 5.991152750063888e-06, "loss": 0.024469934403896332, "memory(GiB)": 21.48, "step": 14304, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956443 }, { "epoch": 0.4647045447162395, "grad_norm": 0.43570542335510254, "learning_rate": 5.9906262509845615e-06, "loss": 0.02987697720527649, "memory(GiB)": 21.48, "step": 14305, "token_acc": 0.9732620320855615, "train_speed(iter/s)": 0.956453 }, { "epoch": 0.4647370301789949, "grad_norm": 0.3248700797557831, "learning_rate": 5.99009974047209e-06, "loss": 0.021608730778098106, "memory(GiB)": 21.48, "step": 14306, "token_acc": 1.0, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.46476951564175034, "grad_norm": 0.26034295558929443, "learning_rate": 5.98957321853255e-06, "loss": 0.01407889649271965, "memory(GiB)": 21.48, "step": 14307, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.95647 }, { "epoch": 0.46480200110450576, "grad_norm": 0.5154333114624023, "learning_rate": 5.9890466851720184e-06, "loss": 0.027020230889320374, "memory(GiB)": 21.48, "step": 14308, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.4648344865672612, "grad_norm": 0.24285653233528137, "learning_rate": 5.988520140396572e-06, "loss": 0.015270376577973366, "memory(GiB)": 21.48, "step": 14309, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.4648669720300166, "grad_norm": 0.9083147644996643, "learning_rate": 5.987993584212286e-06, "loss": 0.029442699626088142, "memory(GiB)": 21.48, "step": 14310, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.464899457492772, "grad_norm": 0.5340197682380676, "learning_rate": 5.98746701662524e-06, "loss": 0.020335979759693146, "memory(GiB)": 21.48, "step": 14311, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956507 }, { "epoch": 0.4649319429555274, "grad_norm": 0.3440479338169098, "learning_rate": 5.98694043764151e-06, "loss": 0.0173785500228405, "memory(GiB)": 21.48, "step": 14312, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956516 }, { "epoch": 0.46496442841828284, "grad_norm": 0.3708110451698303, "learning_rate": 5.986413847267174e-06, "loss": 0.023984383791685104, "memory(GiB)": 21.48, "step": 14313, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956527 }, { "epoch": 0.46499691388103825, "grad_norm": 0.4247626066207886, "learning_rate": 5.9858872455083095e-06, "loss": 0.0276290662586689, "memory(GiB)": 21.48, "step": 14314, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956538 }, { "epoch": 0.46502939934379367, "grad_norm": 0.31594836711883545, "learning_rate": 5.985360632370993e-06, "loss": 0.02345379814505577, "memory(GiB)": 21.48, "step": 14315, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.4650618848065491, "grad_norm": 0.4417826533317566, "learning_rate": 5.984834007861305e-06, "loss": 0.022821053862571716, "memory(GiB)": 21.48, "step": 14316, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.4650943702693045, "grad_norm": 0.29867076873779297, "learning_rate": 5.984307371985318e-06, "loss": 0.022426506504416466, "memory(GiB)": 21.48, "step": 14317, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.4651268557320599, "grad_norm": 0.22780656814575195, "learning_rate": 5.983780724749118e-06, "loss": 0.012393608689308167, "memory(GiB)": 21.48, "step": 14318, "token_acc": 1.0, "train_speed(iter/s)": 0.956589 }, { "epoch": 0.46515934119481533, "grad_norm": 0.3638901114463806, "learning_rate": 5.983254066158777e-06, "loss": 0.025247778743505478, "memory(GiB)": 21.48, "step": 14319, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.46519182665757075, "grad_norm": 0.29318779706954956, "learning_rate": 5.982727396220376e-06, "loss": 0.014643773436546326, "memory(GiB)": 21.48, "step": 14320, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.46522431212032617, "grad_norm": 0.3046409785747528, "learning_rate": 5.982200714939992e-06, "loss": 0.021343562752008438, "memory(GiB)": 21.48, "step": 14321, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.4652567975830816, "grad_norm": 0.31933358311653137, "learning_rate": 5.981674022323705e-06, "loss": 0.02103404514491558, "memory(GiB)": 21.48, "step": 14322, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956643 }, { "epoch": 0.465289283045837, "grad_norm": 0.4626609981060028, "learning_rate": 5.981147318377594e-06, "loss": 0.028958357870578766, "memory(GiB)": 21.48, "step": 14323, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.4653217685085924, "grad_norm": 0.49743255972862244, "learning_rate": 5.980620603107734e-06, "loss": 0.024954885244369507, "memory(GiB)": 21.48, "step": 14324, "token_acc": 1.0, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.46535425397134783, "grad_norm": 0.3286474347114563, "learning_rate": 5.98009387652021e-06, "loss": 0.016454078257083893, "memory(GiB)": 21.48, "step": 14325, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.46538673943410325, "grad_norm": 0.45755165815353394, "learning_rate": 5.979567138621097e-06, "loss": 0.03143921121954918, "memory(GiB)": 21.48, "step": 14326, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.46541922489685866, "grad_norm": 0.43106555938720703, "learning_rate": 5.979040389416477e-06, "loss": 0.028905026614665985, "memory(GiB)": 21.48, "step": 14327, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.956714 }, { "epoch": 0.4654517103596141, "grad_norm": 0.3191594183444977, "learning_rate": 5.978513628912427e-06, "loss": 0.021091841161251068, "memory(GiB)": 21.48, "step": 14328, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.4654841958223695, "grad_norm": 0.3440884053707123, "learning_rate": 5.9779868571150265e-06, "loss": 0.024044353514909744, "memory(GiB)": 21.48, "step": 14329, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.4655166812851249, "grad_norm": 0.3241496682167053, "learning_rate": 5.9774600740303566e-06, "loss": 0.024918604642152786, "memory(GiB)": 21.48, "step": 14330, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.4655491667478803, "grad_norm": 0.3109250068664551, "learning_rate": 5.976933279664496e-06, "loss": 0.02506786584854126, "memory(GiB)": 21.48, "step": 14331, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.46558165221063574, "grad_norm": 0.4838930368423462, "learning_rate": 5.976406474023526e-06, "loss": 0.021712208166718483, "memory(GiB)": 21.48, "step": 14332, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.46561413767339116, "grad_norm": 0.3022112250328064, "learning_rate": 5.9758796571135245e-06, "loss": 0.024304799735546112, "memory(GiB)": 21.48, "step": 14333, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.9568 }, { "epoch": 0.4656466231361466, "grad_norm": 0.30393698811531067, "learning_rate": 5.975352828940574e-06, "loss": 0.019274063408374786, "memory(GiB)": 21.48, "step": 14334, "token_acc": 0.981549815498155, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.465679108598902, "grad_norm": 0.5322707891464233, "learning_rate": 5.974825989510753e-06, "loss": 0.02278793975710869, "memory(GiB)": 21.48, "step": 14335, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.4657115940616574, "grad_norm": 0.3935730755329132, "learning_rate": 5.974299138830143e-06, "loss": 0.026612674817442894, "memory(GiB)": 21.48, "step": 14336, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956841 }, { "epoch": 0.4657440795244128, "grad_norm": 0.31999170780181885, "learning_rate": 5.9737722769048236e-06, "loss": 0.021852806210517883, "memory(GiB)": 21.48, "step": 14337, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956856 }, { "epoch": 0.46577656498716824, "grad_norm": 0.2683040201663971, "learning_rate": 5.973245403740876e-06, "loss": 0.019970804452896118, "memory(GiB)": 21.48, "step": 14338, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95687 }, { "epoch": 0.46580905044992366, "grad_norm": 0.37267208099365234, "learning_rate": 5.97271851934438e-06, "loss": 0.0249936506152153, "memory(GiB)": 21.48, "step": 14339, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.46584153591267907, "grad_norm": 0.4410713315010071, "learning_rate": 5.97219162372142e-06, "loss": 0.02597755938768387, "memory(GiB)": 21.48, "step": 14340, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.4658740213754345, "grad_norm": 0.27984538674354553, "learning_rate": 5.971664716878074e-06, "loss": 0.01826193556189537, "memory(GiB)": 21.48, "step": 14341, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.4659065068381899, "grad_norm": 0.2583610415458679, "learning_rate": 5.971137798820423e-06, "loss": 0.021099548786878586, "memory(GiB)": 21.48, "step": 14342, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.4659389923009453, "grad_norm": 0.2458878457546234, "learning_rate": 5.9706108695545495e-06, "loss": 0.01583624817430973, "memory(GiB)": 21.48, "step": 14343, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.46597147776370074, "grad_norm": 0.38317692279815674, "learning_rate": 5.970083929086536e-06, "loss": 0.029536932706832886, "memory(GiB)": 21.48, "step": 14344, "token_acc": 0.9754385964912281, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.46600396322645615, "grad_norm": 0.2399255931377411, "learning_rate": 5.969556977422461e-06, "loss": 0.019122770056128502, "memory(GiB)": 21.48, "step": 14345, "token_acc": 0.988, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.46603644868921157, "grad_norm": 0.30137115716934204, "learning_rate": 5.969030014568409e-06, "loss": 0.02684975415468216, "memory(GiB)": 21.48, "step": 14346, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956982 }, { "epoch": 0.466068934151967, "grad_norm": 0.2582833468914032, "learning_rate": 5.96850304053046e-06, "loss": 0.024700822308659554, "memory(GiB)": 21.48, "step": 14347, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.956996 }, { "epoch": 0.4661014196147224, "grad_norm": 0.37054985761642456, "learning_rate": 5.967976055314697e-06, "loss": 0.02817521244287491, "memory(GiB)": 21.48, "step": 14348, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.4661339050774778, "grad_norm": 0.4215303063392639, "learning_rate": 5.967449058927203e-06, "loss": 0.028741497546434402, "memory(GiB)": 21.48, "step": 14349, "token_acc": 1.0, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.46616639054023323, "grad_norm": 0.35890766978263855, "learning_rate": 5.966922051374057e-06, "loss": 0.019033024087548256, "memory(GiB)": 21.48, "step": 14350, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.46619887600298865, "grad_norm": 0.31047242879867554, "learning_rate": 5.966395032661347e-06, "loss": 0.015294782817363739, "memory(GiB)": 21.48, "step": 14351, "token_acc": 1.0, "train_speed(iter/s)": 0.957049 }, { "epoch": 0.46623136146574407, "grad_norm": 0.3782680630683899, "learning_rate": 5.96586800279515e-06, "loss": 0.0250170286744833, "memory(GiB)": 21.48, "step": 14352, "token_acc": 1.0, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.4662638469284995, "grad_norm": 0.5968327522277832, "learning_rate": 5.965340961781551e-06, "loss": 0.029192674905061722, "memory(GiB)": 21.48, "step": 14353, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.957075 }, { "epoch": 0.4662963323912549, "grad_norm": 0.3812180757522583, "learning_rate": 5.964813909626631e-06, "loss": 0.03040795773267746, "memory(GiB)": 21.48, "step": 14354, "token_acc": 0.976, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.4663288178540103, "grad_norm": 0.4086277484893799, "learning_rate": 5.964286846336476e-06, "loss": 0.019630007445812225, "memory(GiB)": 21.48, "step": 14355, "token_acc": 0.9884393063583815, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.46636130331676573, "grad_norm": 0.30164116621017456, "learning_rate": 5.963759771917167e-06, "loss": 0.02080633118748665, "memory(GiB)": 21.48, "step": 14356, "token_acc": 0.995, "train_speed(iter/s)": 0.957108 }, { "epoch": 0.46639378877952115, "grad_norm": 0.43943485617637634, "learning_rate": 5.963232686374787e-06, "loss": 0.023355374112725258, "memory(GiB)": 21.48, "step": 14357, "token_acc": 1.0, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.46642627424227656, "grad_norm": 0.3775605857372284, "learning_rate": 5.9627055897154205e-06, "loss": 0.020244408398866653, "memory(GiB)": 21.48, "step": 14358, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.466458759705032, "grad_norm": 0.35424289107322693, "learning_rate": 5.962178481945149e-06, "loss": 0.020412879064679146, "memory(GiB)": 21.48, "step": 14359, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.4664912451677874, "grad_norm": 0.3602660894393921, "learning_rate": 5.961651363070058e-06, "loss": 0.01973206177353859, "memory(GiB)": 21.48, "step": 14360, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.4665237306305428, "grad_norm": 0.3396194577217102, "learning_rate": 5.96112423309623e-06, "loss": 0.013632465153932571, "memory(GiB)": 21.48, "step": 14361, "token_acc": 1.0, "train_speed(iter/s)": 0.957155 }, { "epoch": 0.4665562160932982, "grad_norm": 0.41081559658050537, "learning_rate": 5.96059709202975e-06, "loss": 0.02947969362139702, "memory(GiB)": 21.48, "step": 14362, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.46658870155605364, "grad_norm": 1.496258020401001, "learning_rate": 5.9600699398767e-06, "loss": 0.02386484295129776, "memory(GiB)": 21.48, "step": 14363, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957175 }, { "epoch": 0.46662118701880906, "grad_norm": 0.47026678919792175, "learning_rate": 5.959542776643166e-06, "loss": 0.022042162716388702, "memory(GiB)": 21.48, "step": 14364, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.957184 }, { "epoch": 0.4666536724815645, "grad_norm": 0.3558814525604248, "learning_rate": 5.959015602335231e-06, "loss": 0.025715596973896027, "memory(GiB)": 21.48, "step": 14365, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957194 }, { "epoch": 0.4666861579443199, "grad_norm": 0.7983261942863464, "learning_rate": 5.958488416958979e-06, "loss": 0.022787518799304962, "memory(GiB)": 21.48, "step": 14366, "token_acc": 1.0, "train_speed(iter/s)": 0.957204 }, { "epoch": 0.4667186434070753, "grad_norm": 0.4245937168598175, "learning_rate": 5.957961220520495e-06, "loss": 0.03157314285635948, "memory(GiB)": 21.48, "step": 14367, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.4667511288698308, "grad_norm": 0.5834776759147644, "learning_rate": 5.957434013025862e-06, "loss": 0.029267439618706703, "memory(GiB)": 21.48, "step": 14368, "token_acc": 1.0, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.4667836143325862, "grad_norm": 0.38048046827316284, "learning_rate": 5.9569067944811685e-06, "loss": 0.01974838227033615, "memory(GiB)": 21.48, "step": 14369, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.4668160997953416, "grad_norm": 0.34534987807273865, "learning_rate": 5.956379564892496e-06, "loss": 0.019497182220220566, "memory(GiB)": 21.48, "step": 14370, "token_acc": 1.0, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.466848585258097, "grad_norm": 0.6602159738540649, "learning_rate": 5.95585232426593e-06, "loss": 0.026126233860850334, "memory(GiB)": 21.48, "step": 14371, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.46688107072085244, "grad_norm": 0.43103334307670593, "learning_rate": 5.955325072607556e-06, "loss": 0.01647740602493286, "memory(GiB)": 21.48, "step": 14372, "token_acc": 1.0, "train_speed(iter/s)": 0.95726 }, { "epoch": 0.46691355618360786, "grad_norm": 0.3384639024734497, "learning_rate": 5.954797809923459e-06, "loss": 0.023488055914640427, "memory(GiB)": 21.48, "step": 14373, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.4669460416463633, "grad_norm": 0.44602450728416443, "learning_rate": 5.954270536219726e-06, "loss": 0.025853384286165237, "memory(GiB)": 21.48, "step": 14374, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957282 }, { "epoch": 0.4669785271091187, "grad_norm": 5.8255743980407715, "learning_rate": 5.953743251502439e-06, "loss": 0.023847119882702827, "memory(GiB)": 21.48, "step": 14375, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.4670110125718741, "grad_norm": 0.39481252431869507, "learning_rate": 5.953215955777687e-06, "loss": 0.022268205881118774, "memory(GiB)": 21.48, "step": 14376, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.4670434980346295, "grad_norm": 0.44246554374694824, "learning_rate": 5.9526886490515525e-06, "loss": 0.0222773440182209, "memory(GiB)": 21.48, "step": 14377, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957311 }, { "epoch": 0.46707598349738494, "grad_norm": 0.4069903790950775, "learning_rate": 5.952161331330124e-06, "loss": 0.037072330713272095, "memory(GiB)": 21.48, "step": 14378, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.46710846896014036, "grad_norm": 0.48245859146118164, "learning_rate": 5.951634002619487e-06, "loss": 0.024977125227451324, "memory(GiB)": 21.48, "step": 14379, "token_acc": 0.9707317073170731, "train_speed(iter/s)": 0.957334 }, { "epoch": 0.46714095442289577, "grad_norm": 0.33413317799568176, "learning_rate": 5.951106662925726e-06, "loss": 0.019182223826646805, "memory(GiB)": 21.48, "step": 14380, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957345 }, { "epoch": 0.4671734398856512, "grad_norm": 0.3883671462535858, "learning_rate": 5.9505793122549296e-06, "loss": 0.026188597083091736, "memory(GiB)": 21.48, "step": 14381, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.4672059253484066, "grad_norm": 0.3644713759422302, "learning_rate": 5.95005195061318e-06, "loss": 0.02019568160176277, "memory(GiB)": 21.48, "step": 14382, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.95737 }, { "epoch": 0.467238410811162, "grad_norm": 0.6070786714553833, "learning_rate": 5.949524578006569e-06, "loss": 0.026774760335683823, "memory(GiB)": 21.48, "step": 14383, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.957384 }, { "epoch": 0.46727089627391744, "grad_norm": 0.690662682056427, "learning_rate": 5.9489971944411796e-06, "loss": 0.03186243027448654, "memory(GiB)": 21.48, "step": 14384, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957398 }, { "epoch": 0.46730338173667285, "grad_norm": 0.23990440368652344, "learning_rate": 5.9484697999230975e-06, "loss": 0.017333440482616425, "memory(GiB)": 21.48, "step": 14385, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.957412 }, { "epoch": 0.46733586719942827, "grad_norm": 0.33034205436706543, "learning_rate": 5.947942394458415e-06, "loss": 0.0138808349147439, "memory(GiB)": 21.48, "step": 14386, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957426 }, { "epoch": 0.4673683526621837, "grad_norm": 0.8238590955734253, "learning_rate": 5.947414978053213e-06, "loss": 0.029199475422501564, "memory(GiB)": 21.48, "step": 14387, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.4674008381249391, "grad_norm": 0.43195635080337524, "learning_rate": 5.946887550713583e-06, "loss": 0.02605549246072769, "memory(GiB)": 21.48, "step": 14388, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.4674333235876945, "grad_norm": 0.7694075703620911, "learning_rate": 5.9463601124456084e-06, "loss": 0.025021176785230637, "memory(GiB)": 21.48, "step": 14389, "token_acc": 1.0, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.46746580905044993, "grad_norm": 0.4347793459892273, "learning_rate": 5.94583266325538e-06, "loss": 0.02440708875656128, "memory(GiB)": 21.48, "step": 14390, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.46749829451320535, "grad_norm": 0.9356876611709595, "learning_rate": 5.945305203148983e-06, "loss": 0.031635772436857224, "memory(GiB)": 21.48, "step": 14391, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957493 }, { "epoch": 0.46753077997596076, "grad_norm": 0.35372456908226013, "learning_rate": 5.944777732132508e-06, "loss": 0.030095087364315987, "memory(GiB)": 21.48, "step": 14392, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.4675632654387162, "grad_norm": 0.22839249670505524, "learning_rate": 5.944250250212038e-06, "loss": 0.013100044801831245, "memory(GiB)": 21.48, "step": 14393, "token_acc": 1.0, "train_speed(iter/s)": 0.957519 }, { "epoch": 0.4675957509014716, "grad_norm": 0.35902562737464905, "learning_rate": 5.943722757393663e-06, "loss": 0.023747384548187256, "memory(GiB)": 21.48, "step": 14394, "token_acc": 1.0, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.467628236364227, "grad_norm": 0.47473374009132385, "learning_rate": 5.943195253683472e-06, "loss": 0.027754072099924088, "memory(GiB)": 21.48, "step": 14395, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.957546 }, { "epoch": 0.46766072182698243, "grad_norm": 0.5792620778083801, "learning_rate": 5.942667739087553e-06, "loss": 0.02185780555009842, "memory(GiB)": 21.48, "step": 14396, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.46769320728973784, "grad_norm": 0.35426846146583557, "learning_rate": 5.9421402136119935e-06, "loss": 0.0217914842069149, "memory(GiB)": 21.48, "step": 14397, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957573 }, { "epoch": 0.46772569275249326, "grad_norm": 0.3894506096839905, "learning_rate": 5.941612677262881e-06, "loss": 0.01924104616045952, "memory(GiB)": 21.48, "step": 14398, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.4677581782152487, "grad_norm": 0.2897859215736389, "learning_rate": 5.941085130046305e-06, "loss": 0.018896091729402542, "memory(GiB)": 21.48, "step": 14399, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.4677906636780041, "grad_norm": 0.30858683586120605, "learning_rate": 5.940557571968355e-06, "loss": 0.016243424266576767, "memory(GiB)": 21.48, "step": 14400, "token_acc": 1.0, "train_speed(iter/s)": 0.957614 }, { "epoch": 0.4678231491407595, "grad_norm": 0.30044975876808167, "learning_rate": 5.9400300030351175e-06, "loss": 0.017894860357046127, "memory(GiB)": 21.48, "step": 14401, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.957627 }, { "epoch": 0.4678556346035149, "grad_norm": 0.32103803753852844, "learning_rate": 5.939502423252684e-06, "loss": 0.020129960030317307, "memory(GiB)": 21.48, "step": 14402, "token_acc": 1.0, "train_speed(iter/s)": 0.957641 }, { "epoch": 0.46788812006627034, "grad_norm": 0.5599616765975952, "learning_rate": 5.938974832627141e-06, "loss": 0.027261052280664444, "memory(GiB)": 21.48, "step": 14403, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.46792060552902576, "grad_norm": 0.46510446071624756, "learning_rate": 5.938447231164579e-06, "loss": 0.025209590792655945, "memory(GiB)": 21.48, "step": 14404, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.957668 }, { "epoch": 0.4679530909917812, "grad_norm": 0.4380509555339813, "learning_rate": 5.937919618871086e-06, "loss": 0.02079332247376442, "memory(GiB)": 21.48, "step": 14405, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.4679855764545366, "grad_norm": 0.3832741677761078, "learning_rate": 5.937391995752755e-06, "loss": 0.018068470060825348, "memory(GiB)": 21.48, "step": 14406, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957695 }, { "epoch": 0.468018061917292, "grad_norm": 0.5670774579048157, "learning_rate": 5.93686436181567e-06, "loss": 0.027674755081534386, "memory(GiB)": 21.48, "step": 14407, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957708 }, { "epoch": 0.4680505473800474, "grad_norm": 0.29658374190330505, "learning_rate": 5.936336717065925e-06, "loss": 0.020419830456376076, "memory(GiB)": 21.48, "step": 14408, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.46808303284280284, "grad_norm": 0.42445850372314453, "learning_rate": 5.935809061509608e-06, "loss": 0.01904282346367836, "memory(GiB)": 21.48, "step": 14409, "token_acc": 1.0, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.46811551830555825, "grad_norm": 0.45650598406791687, "learning_rate": 5.9352813951528066e-06, "loss": 0.026941783726215363, "memory(GiB)": 21.48, "step": 14410, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.957749 }, { "epoch": 0.46814800376831367, "grad_norm": 0.5907306671142578, "learning_rate": 5.934753718001616e-06, "loss": 0.029927577823400497, "memory(GiB)": 21.48, "step": 14411, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.4681804892310691, "grad_norm": 0.6147339940071106, "learning_rate": 5.93422603006212e-06, "loss": 0.02712557464838028, "memory(GiB)": 21.48, "step": 14412, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.4682129746938245, "grad_norm": 0.3500216603279114, "learning_rate": 5.933698331340415e-06, "loss": 0.018969561904668808, "memory(GiB)": 21.48, "step": 14413, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.4682454601565799, "grad_norm": 0.462588369846344, "learning_rate": 5.933170621842588e-06, "loss": 0.03867492079734802, "memory(GiB)": 21.48, "step": 14414, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.46827794561933533, "grad_norm": 0.364888072013855, "learning_rate": 5.932642901574729e-06, "loss": 0.021286318078637123, "memory(GiB)": 21.48, "step": 14415, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957806 }, { "epoch": 0.46831043108209075, "grad_norm": 0.4673082232475281, "learning_rate": 5.93211517054293e-06, "loss": 0.022494474425911903, "memory(GiB)": 21.48, "step": 14416, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957817 }, { "epoch": 0.46834291654484617, "grad_norm": 0.42019927501678467, "learning_rate": 5.931587428753281e-06, "loss": 0.030507536605000496, "memory(GiB)": 21.48, "step": 14417, "token_acc": 0.985663082437276, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.4683754020076016, "grad_norm": 0.37130817770957947, "learning_rate": 5.931059676211873e-06, "loss": 0.02389511838555336, "memory(GiB)": 21.48, "step": 14418, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.468407887470357, "grad_norm": 0.4642466604709625, "learning_rate": 5.930531912924797e-06, "loss": 0.028033088892698288, "memory(GiB)": 21.48, "step": 14419, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.4684403729331124, "grad_norm": 0.43405643105506897, "learning_rate": 5.930004138898146e-06, "loss": 0.024451635777950287, "memory(GiB)": 21.48, "step": 14420, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.46847285839586783, "grad_norm": 0.441789448261261, "learning_rate": 5.929476354138007e-06, "loss": 0.02393532358109951, "memory(GiB)": 21.48, "step": 14421, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.46850534385862325, "grad_norm": 0.3158142864704132, "learning_rate": 5.928948558650476e-06, "loss": 0.019671982154250145, "memory(GiB)": 21.48, "step": 14422, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.46853782932137866, "grad_norm": 0.4091879427433014, "learning_rate": 5.928420752441642e-06, "loss": 0.02228371798992157, "memory(GiB)": 21.48, "step": 14423, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957887 }, { "epoch": 0.4685703147841341, "grad_norm": 0.3953200876712799, "learning_rate": 5.9278929355175955e-06, "loss": 0.026517700403928757, "memory(GiB)": 21.48, "step": 14424, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957896 }, { "epoch": 0.4686028002468895, "grad_norm": 0.4102414846420288, "learning_rate": 5.927365107884429e-06, "loss": 0.030439801514148712, "memory(GiB)": 21.48, "step": 14425, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.4686352857096449, "grad_norm": 0.3093000650405884, "learning_rate": 5.926837269548237e-06, "loss": 0.020731110125780106, "memory(GiB)": 21.48, "step": 14426, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.4686677711724003, "grad_norm": 0.32646796107292175, "learning_rate": 5.926309420515108e-06, "loss": 0.02236158773303032, "memory(GiB)": 21.48, "step": 14427, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957926 }, { "epoch": 0.46870025663515574, "grad_norm": 0.29353752732276917, "learning_rate": 5.925781560791135e-06, "loss": 0.01893579214811325, "memory(GiB)": 21.48, "step": 14428, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.46873274209791116, "grad_norm": 0.3866526782512665, "learning_rate": 5.925253690382411e-06, "loss": 0.021893825381994247, "memory(GiB)": 21.48, "step": 14429, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.4687652275606666, "grad_norm": 0.3963415026664734, "learning_rate": 5.924725809295028e-06, "loss": 0.025292178615927696, "memory(GiB)": 21.48, "step": 14430, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.468797713023422, "grad_norm": 0.42387405037879944, "learning_rate": 5.924197917535078e-06, "loss": 0.030285241082310677, "memory(GiB)": 21.48, "step": 14431, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.957965 }, { "epoch": 0.46883019848617746, "grad_norm": 0.38533341884613037, "learning_rate": 5.923670015108655e-06, "loss": 0.019890189170837402, "memory(GiB)": 21.48, "step": 14432, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.4688626839489329, "grad_norm": 0.47537896037101746, "learning_rate": 5.923142102021848e-06, "loss": 0.03186823055148125, "memory(GiB)": 21.48, "step": 14433, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.4688951694116883, "grad_norm": 0.28435665369033813, "learning_rate": 5.922614178280755e-06, "loss": 0.024821864441037178, "memory(GiB)": 21.48, "step": 14434, "token_acc": 1.0, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.4689276548744437, "grad_norm": 0.26555678248405457, "learning_rate": 5.922086243891465e-06, "loss": 0.022982604801654816, "memory(GiB)": 21.48, "step": 14435, "token_acc": 1.0, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.4689601403371991, "grad_norm": 0.2825700640678406, "learning_rate": 5.921558298860074e-06, "loss": 0.020435065031051636, "memory(GiB)": 21.48, "step": 14436, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.958019 }, { "epoch": 0.46899262579995454, "grad_norm": 0.35477563738822937, "learning_rate": 5.921030343192673e-06, "loss": 0.019546106457710266, "memory(GiB)": 21.48, "step": 14437, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.46902511126270996, "grad_norm": 0.33575868606567383, "learning_rate": 5.920502376895355e-06, "loss": 0.02306044101715088, "memory(GiB)": 21.48, "step": 14438, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.4690575967254654, "grad_norm": 0.3625498414039612, "learning_rate": 5.919974399974215e-06, "loss": 0.02359781786799431, "memory(GiB)": 21.48, "step": 14439, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.4690900821882208, "grad_norm": 0.35480237007141113, "learning_rate": 5.919446412435347e-06, "loss": 0.014434708282351494, "memory(GiB)": 21.48, "step": 14440, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.4691225676509762, "grad_norm": 0.3020634949207306, "learning_rate": 5.918918414284842e-06, "loss": 0.021751806139945984, "memory(GiB)": 21.48, "step": 14441, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.4691550531137316, "grad_norm": 0.30168548226356506, "learning_rate": 5.918390405528796e-06, "loss": 0.018214277923107147, "memory(GiB)": 21.48, "step": 14442, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.46918753857648704, "grad_norm": 0.3321005702018738, "learning_rate": 5.917862386173303e-06, "loss": 0.017513729631900787, "memory(GiB)": 21.48, "step": 14443, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958096 }, { "epoch": 0.46922002403924246, "grad_norm": 0.3239911198616028, "learning_rate": 5.917334356224456e-06, "loss": 0.023474102839827538, "memory(GiB)": 21.48, "step": 14444, "token_acc": 1.0, "train_speed(iter/s)": 0.958106 }, { "epoch": 0.46925250950199787, "grad_norm": 0.4011934995651245, "learning_rate": 5.916806315688349e-06, "loss": 0.02786882594227791, "memory(GiB)": 21.48, "step": 14445, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958118 }, { "epoch": 0.4692849949647533, "grad_norm": 0.506232500076294, "learning_rate": 5.9162782645710784e-06, "loss": 0.028501668944954872, "memory(GiB)": 21.48, "step": 14446, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.4693174804275087, "grad_norm": 0.4316245913505554, "learning_rate": 5.915750202878734e-06, "loss": 0.02155309170484543, "memory(GiB)": 21.48, "step": 14447, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.4693499658902641, "grad_norm": 0.4033385217189789, "learning_rate": 5.915222130617415e-06, "loss": 0.03400930017232895, "memory(GiB)": 21.48, "step": 14448, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958159 }, { "epoch": 0.46938245135301954, "grad_norm": 0.29435810446739197, "learning_rate": 5.914694047793215e-06, "loss": 0.025383982807397842, "memory(GiB)": 21.48, "step": 14449, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.46941493681577495, "grad_norm": 0.29028236865997314, "learning_rate": 5.914165954412227e-06, "loss": 0.019248519092798233, "memory(GiB)": 21.48, "step": 14450, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.46944742227853037, "grad_norm": 0.3446744680404663, "learning_rate": 5.913637850480549e-06, "loss": 0.020538482815027237, "memory(GiB)": 21.48, "step": 14451, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.4694799077412858, "grad_norm": 0.37939614057540894, "learning_rate": 5.913109736004273e-06, "loss": 0.02997213788330555, "memory(GiB)": 21.48, "step": 14452, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.4695123932040412, "grad_norm": 0.28672632575035095, "learning_rate": 5.912581610989496e-06, "loss": 0.01876574382185936, "memory(GiB)": 21.48, "step": 14453, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.4695448786667966, "grad_norm": 0.40788036584854126, "learning_rate": 5.912053475442312e-06, "loss": 0.026912391185760498, "memory(GiB)": 21.48, "step": 14454, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.958242 }, { "epoch": 0.46957736412955203, "grad_norm": 0.33422401547431946, "learning_rate": 5.9115253293688176e-06, "loss": 0.02466527558863163, "memory(GiB)": 21.48, "step": 14455, "token_acc": 1.0, "train_speed(iter/s)": 0.958256 }, { "epoch": 0.46960984959230745, "grad_norm": 0.4963091313838959, "learning_rate": 5.910997172775107e-06, "loss": 0.027417441830039024, "memory(GiB)": 21.48, "step": 14456, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.46964233505506287, "grad_norm": 0.3991451859474182, "learning_rate": 5.910469005667276e-06, "loss": 0.023132963106036186, "memory(GiB)": 21.48, "step": 14457, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958284 }, { "epoch": 0.4696748205178183, "grad_norm": 0.29386013746261597, "learning_rate": 5.909940828051421e-06, "loss": 0.018191242590546608, "memory(GiB)": 21.48, "step": 14458, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.4697073059805737, "grad_norm": 0.4089727997779846, "learning_rate": 5.909412639933639e-06, "loss": 0.02547295205295086, "memory(GiB)": 21.48, "step": 14459, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958311 }, { "epoch": 0.4697397914433291, "grad_norm": 0.28945374488830566, "learning_rate": 5.908884441320025e-06, "loss": 0.0180043987929821, "memory(GiB)": 21.48, "step": 14460, "token_acc": 1.0, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.46977227690608453, "grad_norm": 0.8158515691757202, "learning_rate": 5.908356232216673e-06, "loss": 0.023474879562854767, "memory(GiB)": 21.48, "step": 14461, "token_acc": 1.0, "train_speed(iter/s)": 0.958339 }, { "epoch": 0.46980476236883995, "grad_norm": 0.29409247636795044, "learning_rate": 5.907828012629682e-06, "loss": 0.02171049267053604, "memory(GiB)": 21.48, "step": 14462, "token_acc": 1.0, "train_speed(iter/s)": 0.958351 }, { "epoch": 0.46983724783159536, "grad_norm": 0.26814305782318115, "learning_rate": 5.907299782565146e-06, "loss": 0.01762549579143524, "memory(GiB)": 21.48, "step": 14463, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.958365 }, { "epoch": 0.4698697332943508, "grad_norm": 0.2547670304775238, "learning_rate": 5.906771542029164e-06, "loss": 0.020157579332590103, "memory(GiB)": 21.48, "step": 14464, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.958379 }, { "epoch": 0.4699022187571062, "grad_norm": 0.38127705454826355, "learning_rate": 5.9062432910278326e-06, "loss": 0.018824351951479912, "memory(GiB)": 21.48, "step": 14465, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.958392 }, { "epoch": 0.4699347042198616, "grad_norm": 0.3977900743484497, "learning_rate": 5.905715029567246e-06, "loss": 0.020542308688163757, "memory(GiB)": 21.48, "step": 14466, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.469967189682617, "grad_norm": 0.29675811529159546, "learning_rate": 5.905186757653503e-06, "loss": 0.019131340086460114, "memory(GiB)": 21.48, "step": 14467, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.95842 }, { "epoch": 0.46999967514537244, "grad_norm": 0.5497995615005493, "learning_rate": 5.9046584752927e-06, "loss": 0.026028599590063095, "memory(GiB)": 21.48, "step": 14468, "token_acc": 1.0, "train_speed(iter/s)": 0.958433 }, { "epoch": 0.47003216060812786, "grad_norm": 0.43495437502861023, "learning_rate": 5.904130182490934e-06, "loss": 0.027013741433620453, "memory(GiB)": 21.48, "step": 14469, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.958447 }, { "epoch": 0.4700646460708833, "grad_norm": 0.3540625274181366, "learning_rate": 5.9036018792543016e-06, "loss": 0.024239536374807358, "memory(GiB)": 21.48, "step": 14470, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958461 }, { "epoch": 0.4700971315336387, "grad_norm": 0.506515383720398, "learning_rate": 5.903073565588901e-06, "loss": 0.023280993103981018, "memory(GiB)": 21.48, "step": 14471, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.958475 }, { "epoch": 0.4701296169963941, "grad_norm": 0.46691179275512695, "learning_rate": 5.90254524150083e-06, "loss": 0.021832959726452827, "memory(GiB)": 21.48, "step": 14472, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958487 }, { "epoch": 0.4701621024591495, "grad_norm": 0.44437357783317566, "learning_rate": 5.902016906996187e-06, "loss": 0.025095470249652863, "memory(GiB)": 21.48, "step": 14473, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.47019458792190494, "grad_norm": 0.3981384038925171, "learning_rate": 5.901488562081067e-06, "loss": 0.022536348551511765, "memory(GiB)": 21.48, "step": 14474, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.47022707338466035, "grad_norm": 0.6353511810302734, "learning_rate": 5.900960206761568e-06, "loss": 0.028102543205022812, "memory(GiB)": 21.48, "step": 14475, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.95852 }, { "epoch": 0.47025955884741577, "grad_norm": 0.3793911933898926, "learning_rate": 5.900431841043791e-06, "loss": 0.02954983338713646, "memory(GiB)": 21.48, "step": 14476, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.95853 }, { "epoch": 0.4702920443101712, "grad_norm": 0.5349997282028198, "learning_rate": 5.899903464933831e-06, "loss": 0.030003242194652557, "memory(GiB)": 21.48, "step": 14477, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95854 }, { "epoch": 0.4703245297729266, "grad_norm": 0.2932334244251251, "learning_rate": 5.899375078437787e-06, "loss": 0.018893297761678696, "memory(GiB)": 21.48, "step": 14478, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.95855 }, { "epoch": 0.470357015235682, "grad_norm": 0.291227787733078, "learning_rate": 5.898846681561757e-06, "loss": 0.023489784449338913, "memory(GiB)": 21.48, "step": 14479, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958561 }, { "epoch": 0.47038950069843743, "grad_norm": 0.8178634643554688, "learning_rate": 5.89831827431184e-06, "loss": 0.026241369545459747, "memory(GiB)": 21.48, "step": 14480, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.958572 }, { "epoch": 0.47042198616119285, "grad_norm": 0.491032212972641, "learning_rate": 5.897789856694137e-06, "loss": 0.020635807886719704, "memory(GiB)": 21.48, "step": 14481, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958583 }, { "epoch": 0.47045447162394827, "grad_norm": 0.29184356331825256, "learning_rate": 5.897261428714742e-06, "loss": 0.014994610100984573, "memory(GiB)": 21.48, "step": 14482, "token_acc": 0.988, "train_speed(iter/s)": 0.958594 }, { "epoch": 0.4704869570867037, "grad_norm": 0.3412896394729614, "learning_rate": 5.8967329903797565e-06, "loss": 0.02080712839961052, "memory(GiB)": 21.48, "step": 14483, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958604 }, { "epoch": 0.4705194425494591, "grad_norm": 0.24076254665851593, "learning_rate": 5.896204541695278e-06, "loss": 0.013380667194724083, "memory(GiB)": 21.48, "step": 14484, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958616 }, { "epoch": 0.4705519280122145, "grad_norm": 0.3755670487880707, "learning_rate": 5.895676082667407e-06, "loss": 0.018147721886634827, "memory(GiB)": 21.48, "step": 14485, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958626 }, { "epoch": 0.47058441347496993, "grad_norm": 0.3255733549594879, "learning_rate": 5.895147613302241e-06, "loss": 0.023937180638313293, "memory(GiB)": 21.48, "step": 14486, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.958636 }, { "epoch": 0.47061689893772535, "grad_norm": 0.26532307267189026, "learning_rate": 5.894619133605881e-06, "loss": 0.014964532107114792, "memory(GiB)": 21.48, "step": 14487, "token_acc": 1.0, "train_speed(iter/s)": 0.958646 }, { "epoch": 0.47064938440048076, "grad_norm": 0.29876384139060974, "learning_rate": 5.894090643584426e-06, "loss": 0.019229084253311157, "memory(GiB)": 21.48, "step": 14488, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958656 }, { "epoch": 0.4706818698632362, "grad_norm": 0.35697439312934875, "learning_rate": 5.893562143243974e-06, "loss": 0.023010816425085068, "memory(GiB)": 21.48, "step": 14489, "token_acc": 1.0, "train_speed(iter/s)": 0.958666 }, { "epoch": 0.4707143553259916, "grad_norm": 0.4648192524909973, "learning_rate": 5.893033632590625e-06, "loss": 0.020927583798766136, "memory(GiB)": 21.48, "step": 14490, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958676 }, { "epoch": 0.470746840788747, "grad_norm": 0.48163270950317383, "learning_rate": 5.89250511163048e-06, "loss": 0.026786932721734047, "memory(GiB)": 21.48, "step": 14491, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958687 }, { "epoch": 0.47077932625150243, "grad_norm": 0.7448728680610657, "learning_rate": 5.891976580369638e-06, "loss": 0.0280442014336586, "memory(GiB)": 21.48, "step": 14492, "token_acc": 1.0, "train_speed(iter/s)": 0.958698 }, { "epoch": 0.47081181171425784, "grad_norm": 0.3274635374546051, "learning_rate": 5.8914480388141994e-06, "loss": 0.02132907509803772, "memory(GiB)": 21.48, "step": 14493, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958708 }, { "epoch": 0.47084429717701326, "grad_norm": 0.2900806665420532, "learning_rate": 5.890919486970262e-06, "loss": 0.016741737723350525, "memory(GiB)": 21.48, "step": 14494, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.958718 }, { "epoch": 0.4708767826397687, "grad_norm": 0.43709084391593933, "learning_rate": 5.89039092484393e-06, "loss": 0.024962594732642174, "memory(GiB)": 21.48, "step": 14495, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.95873 }, { "epoch": 0.47090926810252415, "grad_norm": 0.28856995701789856, "learning_rate": 5.8898623524413016e-06, "loss": 0.018025286495685577, "memory(GiB)": 21.48, "step": 14496, "token_acc": 1.0, "train_speed(iter/s)": 0.958741 }, { "epoch": 0.47094175356527956, "grad_norm": 0.41361361742019653, "learning_rate": 5.889333769768477e-06, "loss": 0.022741619497537613, "memory(GiB)": 21.48, "step": 14497, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.958752 }, { "epoch": 0.470974239028035, "grad_norm": 0.369208425283432, "learning_rate": 5.8888051768315555e-06, "loss": 0.024222614243626595, "memory(GiB)": 21.48, "step": 14498, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958764 }, { "epoch": 0.4710067244907904, "grad_norm": 0.33148086071014404, "learning_rate": 5.888276573636642e-06, "loss": 0.022779105231165886, "memory(GiB)": 21.48, "step": 14499, "token_acc": 0.9965753424657534, "train_speed(iter/s)": 0.958775 }, { "epoch": 0.4710392099535458, "grad_norm": 0.43107596039772034, "learning_rate": 5.887747960189833e-06, "loss": 0.01803969219326973, "memory(GiB)": 21.48, "step": 14500, "token_acc": 0.996, "train_speed(iter/s)": 0.958786 }, { "epoch": 0.4710392099535458, "eval_loss": 0.023586373776197433, "eval_runtime": 80.6061, "eval_samples_per_second": 123.44, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9905734687678581, "step": 14500 }, { "epoch": 0.47107169541630123, "grad_norm": 0.444740891456604, "learning_rate": 5.88721933649723e-06, "loss": 0.028127174824476242, "memory(GiB)": 21.48, "step": 14501, "token_acc": 0.9900695814157353, "train_speed(iter/s)": 0.953003 }, { "epoch": 0.47110418087905664, "grad_norm": 0.35584279894828796, "learning_rate": 5.8866907025649365e-06, "loss": 0.022963520139455795, "memory(GiB)": 21.48, "step": 14502, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.953013 }, { "epoch": 0.47113666634181206, "grad_norm": 0.8799456357955933, "learning_rate": 5.886162058399052e-06, "loss": 0.018545638769865036, "memory(GiB)": 21.48, "step": 14503, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.953022 }, { "epoch": 0.4711691518045675, "grad_norm": 0.4822484850883484, "learning_rate": 5.885633404005677e-06, "loss": 0.023128829896450043, "memory(GiB)": 21.48, "step": 14504, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.953032 }, { "epoch": 0.4712016372673229, "grad_norm": 0.4076635539531708, "learning_rate": 5.885104739390914e-06, "loss": 0.019568681716918945, "memory(GiB)": 21.48, "step": 14505, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.953043 }, { "epoch": 0.4712341227300783, "grad_norm": 0.34296971559524536, "learning_rate": 5.884576064560865e-06, "loss": 0.01363514456897974, "memory(GiB)": 21.48, "step": 14506, "token_acc": 1.0, "train_speed(iter/s)": 0.953057 }, { "epoch": 0.4712666081928337, "grad_norm": 0.4316357672214508, "learning_rate": 5.88404737952163e-06, "loss": 0.024334551766514778, "memory(GiB)": 21.48, "step": 14507, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953067 }, { "epoch": 0.47129909365558914, "grad_norm": 0.37547335028648376, "learning_rate": 5.883518684279313e-06, "loss": 0.01871652528643608, "memory(GiB)": 21.48, "step": 14508, "token_acc": 0.9921875, "train_speed(iter/s)": 0.953077 }, { "epoch": 0.47133157911834456, "grad_norm": 0.3016720712184906, "learning_rate": 5.882989978840013e-06, "loss": 0.022485999390482903, "memory(GiB)": 21.48, "step": 14509, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953087 }, { "epoch": 0.4713640645811, "grad_norm": 1.035976529121399, "learning_rate": 5.882461263209832e-06, "loss": 0.01773962751030922, "memory(GiB)": 21.48, "step": 14510, "token_acc": 1.0, "train_speed(iter/s)": 0.953098 }, { "epoch": 0.4713965500438554, "grad_norm": 0.49994054436683655, "learning_rate": 5.881932537394875e-06, "loss": 0.026469625532627106, "memory(GiB)": 21.48, "step": 14511, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.953109 }, { "epoch": 0.4714290355066108, "grad_norm": 0.6687461137771606, "learning_rate": 5.881403801401243e-06, "loss": 0.02965668961405754, "memory(GiB)": 21.48, "step": 14512, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.95312 }, { "epoch": 0.4714615209693662, "grad_norm": 0.4933986961841583, "learning_rate": 5.880875055235038e-06, "loss": 0.026889123022556305, "memory(GiB)": 21.48, "step": 14513, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.953131 }, { "epoch": 0.47149400643212164, "grad_norm": 0.448636531829834, "learning_rate": 5.880346298902362e-06, "loss": 0.023991765454411507, "memory(GiB)": 21.48, "step": 14514, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953141 }, { "epoch": 0.47152649189487705, "grad_norm": 0.3667389154434204, "learning_rate": 5.879817532409319e-06, "loss": 0.02010899782180786, "memory(GiB)": 21.48, "step": 14515, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.953151 }, { "epoch": 0.47155897735763247, "grad_norm": 1.243943452835083, "learning_rate": 5.879288755762009e-06, "loss": 0.016452673822641373, "memory(GiB)": 21.48, "step": 14516, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.953162 }, { "epoch": 0.4715914628203879, "grad_norm": 0.3021979033946991, "learning_rate": 5.8787599689665374e-06, "loss": 0.02006630413234234, "memory(GiB)": 21.48, "step": 14517, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953172 }, { "epoch": 0.4716239482831433, "grad_norm": 0.3656744360923767, "learning_rate": 5.8782311720290065e-06, "loss": 0.0246792770922184, "memory(GiB)": 21.48, "step": 14518, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953184 }, { "epoch": 0.4716564337458987, "grad_norm": 0.6363943815231323, "learning_rate": 5.877702364955518e-06, "loss": 0.02745528146624565, "memory(GiB)": 21.48, "step": 14519, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.953192 }, { "epoch": 0.47168891920865413, "grad_norm": 0.38512173295021057, "learning_rate": 5.877173547752176e-06, "loss": 0.021726718172430992, "memory(GiB)": 21.48, "step": 14520, "token_acc": 0.992, "train_speed(iter/s)": 0.953203 }, { "epoch": 0.47172140467140955, "grad_norm": 0.3488348722457886, "learning_rate": 5.8766447204250845e-06, "loss": 0.02388951927423477, "memory(GiB)": 21.48, "step": 14521, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953213 }, { "epoch": 0.47175389013416497, "grad_norm": 0.6047050356864929, "learning_rate": 5.876115882980347e-06, "loss": 0.023971140384674072, "memory(GiB)": 21.48, "step": 14522, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.953224 }, { "epoch": 0.4717863755969204, "grad_norm": 0.5189557075500488, "learning_rate": 5.875587035424064e-06, "loss": 0.0315595343708992, "memory(GiB)": 21.48, "step": 14523, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953235 }, { "epoch": 0.4718188610596758, "grad_norm": 0.4455888569355011, "learning_rate": 5.875058177762342e-06, "loss": 0.02622680924832821, "memory(GiB)": 21.48, "step": 14524, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953245 }, { "epoch": 0.4718513465224312, "grad_norm": 0.3561202585697174, "learning_rate": 5.874529310001283e-06, "loss": 0.016896655783057213, "memory(GiB)": 21.48, "step": 14525, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.953255 }, { "epoch": 0.47188383198518663, "grad_norm": 0.377626895904541, "learning_rate": 5.874000432146992e-06, "loss": 0.02257855050265789, "memory(GiB)": 21.48, "step": 14526, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953262 }, { "epoch": 0.47191631744794205, "grad_norm": 0.2389451116323471, "learning_rate": 5.873471544205575e-06, "loss": 0.014882401563227177, "memory(GiB)": 21.48, "step": 14527, "token_acc": 0.9875, "train_speed(iter/s)": 0.953274 }, { "epoch": 0.47194880291069746, "grad_norm": 0.23870830237865448, "learning_rate": 5.872942646183132e-06, "loss": 0.015887510031461716, "memory(GiB)": 21.48, "step": 14528, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.953285 }, { "epoch": 0.4719812883734529, "grad_norm": 0.3828051686286926, "learning_rate": 5.872413738085769e-06, "loss": 0.030064266175031662, "memory(GiB)": 21.48, "step": 14529, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.953296 }, { "epoch": 0.4720137738362083, "grad_norm": 0.3186661899089813, "learning_rate": 5.871884819919591e-06, "loss": 0.02494865283370018, "memory(GiB)": 21.48, "step": 14530, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.953307 }, { "epoch": 0.4720462592989637, "grad_norm": 0.32243868708610535, "learning_rate": 5.871355891690701e-06, "loss": 0.021245576441287994, "memory(GiB)": 21.48, "step": 14531, "token_acc": 1.0, "train_speed(iter/s)": 0.953319 }, { "epoch": 0.4720787447617191, "grad_norm": 0.3661114573478699, "learning_rate": 5.870826953405206e-06, "loss": 0.02385871112346649, "memory(GiB)": 21.48, "step": 14532, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95333 }, { "epoch": 0.47211123022447454, "grad_norm": 0.29300692677497864, "learning_rate": 5.870298005069206e-06, "loss": 0.01399860717356205, "memory(GiB)": 21.48, "step": 14533, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953342 }, { "epoch": 0.47214371568722996, "grad_norm": 0.36273157596588135, "learning_rate": 5.8697690466888104e-06, "loss": 0.02338523045182228, "memory(GiB)": 21.48, "step": 14534, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.953354 }, { "epoch": 0.4721762011499854, "grad_norm": 0.3453763425350189, "learning_rate": 5.869240078270121e-06, "loss": 0.019357386976480484, "memory(GiB)": 21.48, "step": 14535, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953364 }, { "epoch": 0.4722086866127408, "grad_norm": 0.3180868923664093, "learning_rate": 5.868711099819247e-06, "loss": 0.02376370131969452, "memory(GiB)": 21.48, "step": 14536, "token_acc": 1.0, "train_speed(iter/s)": 0.953376 }, { "epoch": 0.4722411720754962, "grad_norm": 0.35669779777526855, "learning_rate": 5.868182111342287e-06, "loss": 0.01773788034915924, "memory(GiB)": 21.48, "step": 14537, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.953387 }, { "epoch": 0.4722736575382516, "grad_norm": 0.33409637212753296, "learning_rate": 5.867653112845353e-06, "loss": 0.023070471361279488, "memory(GiB)": 21.48, "step": 14538, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.953398 }, { "epoch": 0.47230614300100704, "grad_norm": 1.6347861289978027, "learning_rate": 5.867124104334546e-06, "loss": 0.04996404051780701, "memory(GiB)": 21.48, "step": 14539, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953408 }, { "epoch": 0.47233862846376246, "grad_norm": 0.3883666396141052, "learning_rate": 5.86659508581597e-06, "loss": 0.02425011247396469, "memory(GiB)": 21.48, "step": 14540, "token_acc": 0.988, "train_speed(iter/s)": 0.95342 }, { "epoch": 0.47237111392651787, "grad_norm": 0.41345855593681335, "learning_rate": 5.866066057295737e-06, "loss": 0.021580200642347336, "memory(GiB)": 21.48, "step": 14541, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.95343 }, { "epoch": 0.4724035993892733, "grad_norm": 0.2865098714828491, "learning_rate": 5.865537018779947e-06, "loss": 0.020506486296653748, "memory(GiB)": 21.48, "step": 14542, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.953442 }, { "epoch": 0.4724360848520287, "grad_norm": 0.3852402865886688, "learning_rate": 5.86500797027471e-06, "loss": 0.028121672570705414, "memory(GiB)": 21.48, "step": 14543, "token_acc": 1.0, "train_speed(iter/s)": 0.953452 }, { "epoch": 0.4724685703147841, "grad_norm": 0.3251776099205017, "learning_rate": 5.864478911786128e-06, "loss": 0.02334645576775074, "memory(GiB)": 21.48, "step": 14544, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.953464 }, { "epoch": 0.47250105577753954, "grad_norm": 0.26859644055366516, "learning_rate": 5.863949843320309e-06, "loss": 0.019599124789237976, "memory(GiB)": 21.48, "step": 14545, "token_acc": 1.0, "train_speed(iter/s)": 0.953475 }, { "epoch": 0.47253354124029495, "grad_norm": 0.42765748500823975, "learning_rate": 5.86342076488336e-06, "loss": 0.022670181468129158, "memory(GiB)": 21.48, "step": 14546, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.953484 }, { "epoch": 0.47256602670305037, "grad_norm": 0.2692068815231323, "learning_rate": 5.862891676481383e-06, "loss": 0.01747865602374077, "memory(GiB)": 21.48, "step": 14547, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.953494 }, { "epoch": 0.4725985121658058, "grad_norm": 0.39789196848869324, "learning_rate": 5.862362578120491e-06, "loss": 0.03000504896044731, "memory(GiB)": 21.48, "step": 14548, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.953505 }, { "epoch": 0.4726309976285612, "grad_norm": 0.2577266991138458, "learning_rate": 5.861833469806785e-06, "loss": 0.020200911909341812, "memory(GiB)": 21.48, "step": 14549, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.953515 }, { "epoch": 0.4726634830913166, "grad_norm": 0.4447347819805145, "learning_rate": 5.8613043515463755e-06, "loss": 0.025276388972997665, "memory(GiB)": 21.48, "step": 14550, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953527 }, { "epoch": 0.47269596855407203, "grad_norm": 2.570452928543091, "learning_rate": 5.860775223345365e-06, "loss": 0.019623801112174988, "memory(GiB)": 21.48, "step": 14551, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953538 }, { "epoch": 0.47272845401682745, "grad_norm": 0.3774833083152771, "learning_rate": 5.860246085209864e-06, "loss": 0.02340119145810604, "memory(GiB)": 21.48, "step": 14552, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953549 }, { "epoch": 0.47276093947958286, "grad_norm": 0.3719913363456726, "learning_rate": 5.859716937145979e-06, "loss": 0.016759632155299187, "memory(GiB)": 21.48, "step": 14553, "token_acc": 1.0, "train_speed(iter/s)": 0.95356 }, { "epoch": 0.4727934249423383, "grad_norm": 0.44788119196891785, "learning_rate": 5.859187779159815e-06, "loss": 0.023698817938566208, "memory(GiB)": 21.48, "step": 14554, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.953574 }, { "epoch": 0.4728259104050937, "grad_norm": 0.3318772315979004, "learning_rate": 5.8586586112574805e-06, "loss": 0.023712720721960068, "memory(GiB)": 21.48, "step": 14555, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.953589 }, { "epoch": 0.4728583958678491, "grad_norm": 0.3099953532218933, "learning_rate": 5.858129433445082e-06, "loss": 0.016911551356315613, "memory(GiB)": 21.48, "step": 14556, "token_acc": 1.0, "train_speed(iter/s)": 0.953602 }, { "epoch": 0.47289088133060453, "grad_norm": 0.27606314420700073, "learning_rate": 5.85760024572873e-06, "loss": 0.01831231079995632, "memory(GiB)": 21.48, "step": 14557, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.953617 }, { "epoch": 0.47292336679335994, "grad_norm": 0.40156692266464233, "learning_rate": 5.857071048114528e-06, "loss": 0.02143876999616623, "memory(GiB)": 21.48, "step": 14558, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.47295585225611536, "grad_norm": 0.4954855144023895, "learning_rate": 5.856541840608586e-06, "loss": 0.01861361227929592, "memory(GiB)": 21.48, "step": 14559, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953643 }, { "epoch": 0.47298833771887083, "grad_norm": 0.3463155925273895, "learning_rate": 5.856012623217011e-06, "loss": 0.020634938031435013, "memory(GiB)": 21.48, "step": 14560, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.953657 }, { "epoch": 0.47302082318162625, "grad_norm": 0.4237931966781616, "learning_rate": 5.8554833959459114e-06, "loss": 0.029975878074765205, "memory(GiB)": 21.48, "step": 14561, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.953669 }, { "epoch": 0.47305330864438166, "grad_norm": 0.3527212142944336, "learning_rate": 5.854954158801395e-06, "loss": 0.02395833469927311, "memory(GiB)": 21.48, "step": 14562, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.953683 }, { "epoch": 0.4730857941071371, "grad_norm": 0.4523308575153351, "learning_rate": 5.854424911789568e-06, "loss": 0.02800031751394272, "memory(GiB)": 21.48, "step": 14563, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.953697 }, { "epoch": 0.4731182795698925, "grad_norm": 0.3152778148651123, "learning_rate": 5.853895654916542e-06, "loss": 0.01846534013748169, "memory(GiB)": 21.48, "step": 14564, "token_acc": 1.0, "train_speed(iter/s)": 0.95371 }, { "epoch": 0.4731507650326479, "grad_norm": 0.3624753952026367, "learning_rate": 5.853366388188422e-06, "loss": 0.021513354033231735, "memory(GiB)": 21.48, "step": 14565, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.953723 }, { "epoch": 0.47318325049540333, "grad_norm": 0.43957486748695374, "learning_rate": 5.8528371116113195e-06, "loss": 0.019488634541630745, "memory(GiB)": 21.48, "step": 14566, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953737 }, { "epoch": 0.47321573595815875, "grad_norm": 0.39353352785110474, "learning_rate": 5.852307825191341e-06, "loss": 0.022876134142279625, "memory(GiB)": 21.48, "step": 14567, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.953746 }, { "epoch": 0.47324822142091416, "grad_norm": 0.36761096119880676, "learning_rate": 5.8517785289345955e-06, "loss": 0.023849140852689743, "memory(GiB)": 21.48, "step": 14568, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.953757 }, { "epoch": 0.4732807068836696, "grad_norm": 0.28360816836357117, "learning_rate": 5.851249222847193e-06, "loss": 0.0167032852768898, "memory(GiB)": 21.48, "step": 14569, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953768 }, { "epoch": 0.473313192346425, "grad_norm": 0.43896645307540894, "learning_rate": 5.85071990693524e-06, "loss": 0.024940164759755135, "memory(GiB)": 21.48, "step": 14570, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.953779 }, { "epoch": 0.4733456778091804, "grad_norm": 1.0886305570602417, "learning_rate": 5.850190581204847e-06, "loss": 0.02621772699058056, "memory(GiB)": 21.48, "step": 14571, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.95379 }, { "epoch": 0.4733781632719358, "grad_norm": 0.392204612493515, "learning_rate": 5.849661245662125e-06, "loss": 0.02518216148018837, "memory(GiB)": 21.48, "step": 14572, "token_acc": 0.99609375, "train_speed(iter/s)": 0.9538 }, { "epoch": 0.47341064873469124, "grad_norm": 0.7684295177459717, "learning_rate": 5.84913190031318e-06, "loss": 0.024465033784508705, "memory(GiB)": 21.48, "step": 14573, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.953809 }, { "epoch": 0.47344313419744666, "grad_norm": 0.8537788987159729, "learning_rate": 5.848602545164122e-06, "loss": 0.024256262928247452, "memory(GiB)": 21.48, "step": 14574, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953819 }, { "epoch": 0.4734756196602021, "grad_norm": 0.33380216360092163, "learning_rate": 5.8480731802210626e-06, "loss": 0.015154935419559479, "memory(GiB)": 21.48, "step": 14575, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953829 }, { "epoch": 0.4735081051229575, "grad_norm": 0.2899419069290161, "learning_rate": 5.84754380549011e-06, "loss": 0.020034991204738617, "memory(GiB)": 21.48, "step": 14576, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953839 }, { "epoch": 0.4735405905857129, "grad_norm": 0.27276456356048584, "learning_rate": 5.847014420977373e-06, "loss": 0.011178646236658096, "memory(GiB)": 21.48, "step": 14577, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953848 }, { "epoch": 0.4735730760484683, "grad_norm": 0.37781310081481934, "learning_rate": 5.846485026688962e-06, "loss": 0.02701636403799057, "memory(GiB)": 21.48, "step": 14578, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953859 }, { "epoch": 0.47360556151122374, "grad_norm": 0.5655412077903748, "learning_rate": 5.845955622630988e-06, "loss": 0.03800422325730324, "memory(GiB)": 21.48, "step": 14579, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.953869 }, { "epoch": 0.47363804697397915, "grad_norm": 0.5900435447692871, "learning_rate": 5.8454262088095595e-06, "loss": 0.032124049961566925, "memory(GiB)": 21.48, "step": 14580, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.953858 }, { "epoch": 0.47367053243673457, "grad_norm": 0.43410366773605347, "learning_rate": 5.844896785230788e-06, "loss": 0.02324514277279377, "memory(GiB)": 21.48, "step": 14581, "token_acc": 1.0, "train_speed(iter/s)": 0.953868 }, { "epoch": 0.47370301789949, "grad_norm": 0.5484323501586914, "learning_rate": 5.8443673519007816e-06, "loss": 0.024383720010519028, "memory(GiB)": 21.48, "step": 14582, "token_acc": 1.0, "train_speed(iter/s)": 0.953879 }, { "epoch": 0.4737355033622454, "grad_norm": 0.4298153221607208, "learning_rate": 5.8438379088256536e-06, "loss": 0.022920917719602585, "memory(GiB)": 21.48, "step": 14583, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953889 }, { "epoch": 0.4737679888250008, "grad_norm": 0.48990482091903687, "learning_rate": 5.843308456011512e-06, "loss": 0.02583327889442444, "memory(GiB)": 21.48, "step": 14584, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.9539 }, { "epoch": 0.47380047428775623, "grad_norm": 0.4528105854988098, "learning_rate": 5.8427789934644685e-06, "loss": 0.017002280801534653, "memory(GiB)": 21.48, "step": 14585, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.47383295975051165, "grad_norm": 0.44564831256866455, "learning_rate": 5.842249521190634e-06, "loss": 0.016809560358524323, "memory(GiB)": 21.48, "step": 14586, "token_acc": 1.0, "train_speed(iter/s)": 0.953926 }, { "epoch": 0.47386544521326707, "grad_norm": 0.3353008031845093, "learning_rate": 5.8417200391961185e-06, "loss": 0.02388489991426468, "memory(GiB)": 21.48, "step": 14587, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.95394 }, { "epoch": 0.4738979306760225, "grad_norm": 0.4224707782268524, "learning_rate": 5.841190547487034e-06, "loss": 0.024858614429831505, "memory(GiB)": 21.48, "step": 14588, "token_acc": 1.0, "train_speed(iter/s)": 0.953953 }, { "epoch": 0.4739304161387779, "grad_norm": 0.45126423239707947, "learning_rate": 5.84066104606949e-06, "loss": 0.02709341049194336, "memory(GiB)": 21.48, "step": 14589, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953966 }, { "epoch": 0.4739629016015333, "grad_norm": 0.334747314453125, "learning_rate": 5.8401315349496e-06, "loss": 0.019826896488666534, "memory(GiB)": 21.48, "step": 14590, "token_acc": 1.0, "train_speed(iter/s)": 0.953979 }, { "epoch": 0.47399538706428873, "grad_norm": 0.29860565066337585, "learning_rate": 5.839602014133473e-06, "loss": 0.016895603388547897, "memory(GiB)": 21.48, "step": 14591, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953993 }, { "epoch": 0.47402787252704415, "grad_norm": 0.5176042318344116, "learning_rate": 5.839072483627222e-06, "loss": 0.018762117251753807, "memory(GiB)": 21.48, "step": 14592, "token_acc": 1.0, "train_speed(iter/s)": 0.954005 }, { "epoch": 0.47406035798979956, "grad_norm": 0.4080410599708557, "learning_rate": 5.838542943436957e-06, "loss": 0.021906021982431412, "memory(GiB)": 21.48, "step": 14593, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.954016 }, { "epoch": 0.474092843452555, "grad_norm": 0.3446502387523651, "learning_rate": 5.838013393568791e-06, "loss": 0.024751612916588783, "memory(GiB)": 21.48, "step": 14594, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954026 }, { "epoch": 0.4741253289153104, "grad_norm": 0.3448447585105896, "learning_rate": 5.8374838340288356e-06, "loss": 0.022957587614655495, "memory(GiB)": 21.48, "step": 14595, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954036 }, { "epoch": 0.4741578143780658, "grad_norm": 0.3238371014595032, "learning_rate": 5.8369542648232e-06, "loss": 0.020066101104021072, "memory(GiB)": 21.48, "step": 14596, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954046 }, { "epoch": 0.4741902998408212, "grad_norm": 0.2826353907585144, "learning_rate": 5.8364246859579996e-06, "loss": 0.015163393691182137, "memory(GiB)": 21.48, "step": 14597, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.954056 }, { "epoch": 0.47422278530357664, "grad_norm": 0.2547032833099365, "learning_rate": 5.835895097439345e-06, "loss": 0.017944317311048508, "memory(GiB)": 21.48, "step": 14598, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.47425527076633206, "grad_norm": 0.2607942521572113, "learning_rate": 5.835365499273348e-06, "loss": 0.022104352712631226, "memory(GiB)": 21.48, "step": 14599, "token_acc": 0.9862068965517241, "train_speed(iter/s)": 0.954077 }, { "epoch": 0.4742877562290875, "grad_norm": 0.5815009474754333, "learning_rate": 5.834835891466121e-06, "loss": 0.022160537540912628, "memory(GiB)": 21.48, "step": 14600, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954087 }, { "epoch": 0.4743202416918429, "grad_norm": 0.46307238936424255, "learning_rate": 5.834306274023778e-06, "loss": 0.02116050198674202, "memory(GiB)": 21.48, "step": 14601, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954098 }, { "epoch": 0.4743527271545983, "grad_norm": 0.3413407504558563, "learning_rate": 5.8337766469524286e-06, "loss": 0.01931624859571457, "memory(GiB)": 21.48, "step": 14602, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954108 }, { "epoch": 0.4743852126173537, "grad_norm": 0.29400935769081116, "learning_rate": 5.833247010258186e-06, "loss": 0.018902156502008438, "memory(GiB)": 21.48, "step": 14603, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954118 }, { "epoch": 0.47441769808010914, "grad_norm": 0.4214405119419098, "learning_rate": 5.832717363947167e-06, "loss": 0.02414080500602722, "memory(GiB)": 21.48, "step": 14604, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954129 }, { "epoch": 0.47445018354286456, "grad_norm": 0.596611499786377, "learning_rate": 5.8321877080254785e-06, "loss": 0.02724149450659752, "memory(GiB)": 21.48, "step": 14605, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954139 }, { "epoch": 0.47448266900562, "grad_norm": 0.40350279211997986, "learning_rate": 5.8316580424992375e-06, "loss": 0.022727318108081818, "memory(GiB)": 21.48, "step": 14606, "token_acc": 0.9875, "train_speed(iter/s)": 0.95415 }, { "epoch": 0.4745151544683754, "grad_norm": 0.30214518308639526, "learning_rate": 5.8311283673745555e-06, "loss": 0.013635522685945034, "memory(GiB)": 21.48, "step": 14607, "token_acc": 1.0, "train_speed(iter/s)": 0.954161 }, { "epoch": 0.4745476399311308, "grad_norm": 0.4559883177280426, "learning_rate": 5.830598682657546e-06, "loss": 0.02316780760884285, "memory(GiB)": 21.48, "step": 14608, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.954172 }, { "epoch": 0.4745801253938862, "grad_norm": 0.4313136041164398, "learning_rate": 5.830068988354324e-06, "loss": 0.01750810816884041, "memory(GiB)": 21.48, "step": 14609, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954182 }, { "epoch": 0.47461261085664164, "grad_norm": 0.47360098361968994, "learning_rate": 5.829539284470998e-06, "loss": 0.030179157853126526, "memory(GiB)": 21.48, "step": 14610, "token_acc": 0.9933993399339934, "train_speed(iter/s)": 0.954192 }, { "epoch": 0.47464509631939705, "grad_norm": 0.356137216091156, "learning_rate": 5.829009571013686e-06, "loss": 0.020660147070884705, "memory(GiB)": 21.48, "step": 14611, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.954202 }, { "epoch": 0.47467758178215247, "grad_norm": 0.460742324590683, "learning_rate": 5.8284798479885e-06, "loss": 0.018975242972373962, "memory(GiB)": 21.48, "step": 14612, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954211 }, { "epoch": 0.4747100672449079, "grad_norm": 0.3625032305717468, "learning_rate": 5.827950115401554e-06, "loss": 0.02782587707042694, "memory(GiB)": 21.48, "step": 14613, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954222 }, { "epoch": 0.4747425527076633, "grad_norm": 0.3394728899002075, "learning_rate": 5.82742037325896e-06, "loss": 0.024533474817872047, "memory(GiB)": 21.48, "step": 14614, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.4747750381704187, "grad_norm": 0.37964847683906555, "learning_rate": 5.826890621566836e-06, "loss": 0.025581158697605133, "memory(GiB)": 21.48, "step": 14615, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.954244 }, { "epoch": 0.47480752363317413, "grad_norm": 0.3736102283000946, "learning_rate": 5.826360860331293e-06, "loss": 0.019540322944521904, "memory(GiB)": 21.48, "step": 14616, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.954255 }, { "epoch": 0.47484000909592955, "grad_norm": 0.31556227803230286, "learning_rate": 5.825831089558444e-06, "loss": 0.02004959248006344, "memory(GiB)": 21.48, "step": 14617, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954268 }, { "epoch": 0.47487249455868497, "grad_norm": 0.2824592888355255, "learning_rate": 5.8253013092544075e-06, "loss": 0.015506215393543243, "memory(GiB)": 21.48, "step": 14618, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954282 }, { "epoch": 0.4749049800214404, "grad_norm": 0.3318420350551605, "learning_rate": 5.824771519425294e-06, "loss": 0.01947631686925888, "memory(GiB)": 21.48, "step": 14619, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.954295 }, { "epoch": 0.4749374654841958, "grad_norm": 0.46807408332824707, "learning_rate": 5.82424172007722e-06, "loss": 0.028069550171494484, "memory(GiB)": 21.48, "step": 14620, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.954309 }, { "epoch": 0.4749699509469512, "grad_norm": 0.37835341691970825, "learning_rate": 5.8237119112162995e-06, "loss": 0.02451651729643345, "memory(GiB)": 21.48, "step": 14621, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.954323 }, { "epoch": 0.47500243640970663, "grad_norm": 0.20294572412967682, "learning_rate": 5.823182092848647e-06, "loss": 0.012805741280317307, "memory(GiB)": 21.48, "step": 14622, "token_acc": 1.0, "train_speed(iter/s)": 0.954336 }, { "epoch": 0.47503492187246205, "grad_norm": 0.4656606614589691, "learning_rate": 5.822652264980378e-06, "loss": 0.030844667926430702, "memory(GiB)": 21.48, "step": 14623, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.95435 }, { "epoch": 0.4750674073352175, "grad_norm": 0.46074074506759644, "learning_rate": 5.822122427617607e-06, "loss": 0.02868969365954399, "memory(GiB)": 21.48, "step": 14624, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954364 }, { "epoch": 0.47509989279797293, "grad_norm": 0.3861773908138275, "learning_rate": 5.821592580766449e-06, "loss": 0.027676112949848175, "memory(GiB)": 21.48, "step": 14625, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.954376 }, { "epoch": 0.47513237826072835, "grad_norm": 0.2774016857147217, "learning_rate": 5.821062724433018e-06, "loss": 0.013374974019825459, "memory(GiB)": 21.48, "step": 14626, "token_acc": 1.0, "train_speed(iter/s)": 0.954386 }, { "epoch": 0.47516486372348377, "grad_norm": 0.401720255613327, "learning_rate": 5.8205328586234315e-06, "loss": 0.029361985623836517, "memory(GiB)": 21.48, "step": 14627, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954397 }, { "epoch": 0.4751973491862392, "grad_norm": 0.4594588577747345, "learning_rate": 5.820002983343802e-06, "loss": 0.031041674315929413, "memory(GiB)": 21.48, "step": 14628, "token_acc": 0.98, "train_speed(iter/s)": 0.954408 }, { "epoch": 0.4752298346489946, "grad_norm": 0.41673988103866577, "learning_rate": 5.819473098600248e-06, "loss": 0.029513180255889893, "memory(GiB)": 21.48, "step": 14629, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.954418 }, { "epoch": 0.47526232011175, "grad_norm": 0.2800211012363434, "learning_rate": 5.818943204398882e-06, "loss": 0.02055952697992325, "memory(GiB)": 21.48, "step": 14630, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95443 }, { "epoch": 0.47529480557450543, "grad_norm": 0.47773608565330505, "learning_rate": 5.818413300745824e-06, "loss": 0.02991187386214733, "memory(GiB)": 21.48, "step": 14631, "token_acc": 0.996, "train_speed(iter/s)": 0.954441 }, { "epoch": 0.47532729103726085, "grad_norm": 0.3481687307357788, "learning_rate": 5.8178833876471855e-06, "loss": 0.02178352326154709, "memory(GiB)": 21.48, "step": 14632, "token_acc": 1.0, "train_speed(iter/s)": 0.954451 }, { "epoch": 0.47535977650001626, "grad_norm": 0.2933685779571533, "learning_rate": 5.817353465109083e-06, "loss": 0.017508793622255325, "memory(GiB)": 21.48, "step": 14633, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954461 }, { "epoch": 0.4753922619627717, "grad_norm": 0.5272020697593689, "learning_rate": 5.8168235331376366e-06, "loss": 0.026359975337982178, "memory(GiB)": 21.48, "step": 14634, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.954471 }, { "epoch": 0.4754247474255271, "grad_norm": 0.35401618480682373, "learning_rate": 5.816293591738957e-06, "loss": 0.021594351157546043, "memory(GiB)": 21.48, "step": 14635, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.954481 }, { "epoch": 0.4754572328882825, "grad_norm": 0.2803660035133362, "learning_rate": 5.815763640919165e-06, "loss": 0.018854549154639244, "memory(GiB)": 21.48, "step": 14636, "token_acc": 1.0, "train_speed(iter/s)": 0.954492 }, { "epoch": 0.4754897183510379, "grad_norm": 0.41873490810394287, "learning_rate": 5.815233680684374e-06, "loss": 0.03206409886479378, "memory(GiB)": 21.48, "step": 14637, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.954503 }, { "epoch": 0.47552220381379334, "grad_norm": 0.346745103597641, "learning_rate": 5.814703711040701e-06, "loss": 0.01862933300435543, "memory(GiB)": 21.48, "step": 14638, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954514 }, { "epoch": 0.47555468927654876, "grad_norm": 0.3427447974681854, "learning_rate": 5.814173731994262e-06, "loss": 0.02248198166489601, "memory(GiB)": 21.48, "step": 14639, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954525 }, { "epoch": 0.4755871747393042, "grad_norm": 0.3679288625717163, "learning_rate": 5.813643743551176e-06, "loss": 0.02241404913365841, "memory(GiB)": 21.48, "step": 14640, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954536 }, { "epoch": 0.4756196602020596, "grad_norm": 0.3263351023197174, "learning_rate": 5.813113745717558e-06, "loss": 0.017372027039527893, "memory(GiB)": 21.48, "step": 14641, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954546 }, { "epoch": 0.475652145664815, "grad_norm": 0.35891419649124146, "learning_rate": 5.812583738499524e-06, "loss": 0.024584032595157623, "memory(GiB)": 21.48, "step": 14642, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954557 }, { "epoch": 0.4756846311275704, "grad_norm": 0.28212770819664, "learning_rate": 5.812053721903193e-06, "loss": 0.017692584544420242, "memory(GiB)": 21.48, "step": 14643, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954569 }, { "epoch": 0.47571711659032584, "grad_norm": 0.28593429923057556, "learning_rate": 5.81152369593468e-06, "loss": 0.020294558256864548, "memory(GiB)": 21.48, "step": 14644, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954581 }, { "epoch": 0.47574960205308126, "grad_norm": 0.35666385293006897, "learning_rate": 5.810993660600105e-06, "loss": 0.030875766649842262, "memory(GiB)": 21.48, "step": 14645, "token_acc": 1.0, "train_speed(iter/s)": 0.954594 }, { "epoch": 0.47578208751583667, "grad_norm": 0.4654875099658966, "learning_rate": 5.810463615905583e-06, "loss": 0.028963275253772736, "memory(GiB)": 21.48, "step": 14646, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.954607 }, { "epoch": 0.4758145729785921, "grad_norm": 0.43172767758369446, "learning_rate": 5.809933561857233e-06, "loss": 0.026598187163472176, "memory(GiB)": 21.48, "step": 14647, "token_acc": 1.0, "train_speed(iter/s)": 0.954621 }, { "epoch": 0.4758470584413475, "grad_norm": 0.3235112130641937, "learning_rate": 5.80940349846117e-06, "loss": 0.01894564926624298, "memory(GiB)": 21.48, "step": 14648, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954635 }, { "epoch": 0.4758795439041029, "grad_norm": 0.33500218391418457, "learning_rate": 5.8088734257235136e-06, "loss": 0.021854029968380928, "memory(GiB)": 21.48, "step": 14649, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.47591202936685834, "grad_norm": 0.393775999546051, "learning_rate": 5.8083433436503825e-06, "loss": 0.022171584889292717, "memory(GiB)": 21.48, "step": 14650, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954663 }, { "epoch": 0.47594451482961375, "grad_norm": 0.30614131689071655, "learning_rate": 5.807813252247892e-06, "loss": 0.01599125750362873, "memory(GiB)": 21.48, "step": 14651, "token_acc": 1.0, "train_speed(iter/s)": 0.954676 }, { "epoch": 0.47597700029236917, "grad_norm": 0.47119152545928955, "learning_rate": 5.807283151522163e-06, "loss": 0.02100718952715397, "memory(GiB)": 21.48, "step": 14652, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.954689 }, { "epoch": 0.4760094857551246, "grad_norm": 0.569082498550415, "learning_rate": 5.806753041479312e-06, "loss": 0.021509718149900436, "memory(GiB)": 21.48, "step": 14653, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954702 }, { "epoch": 0.47604197121788, "grad_norm": 0.292916864156723, "learning_rate": 5.806222922125455e-06, "loss": 0.019579531624913216, "memory(GiB)": 21.48, "step": 14654, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954716 }, { "epoch": 0.4760744566806354, "grad_norm": 0.3019794225692749, "learning_rate": 5.805692793466714e-06, "loss": 0.01609465479850769, "memory(GiB)": 21.48, "step": 14655, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.47610694214339083, "grad_norm": 0.34844234585762024, "learning_rate": 5.8051626555092045e-06, "loss": 0.024121003225445747, "memory(GiB)": 21.48, "step": 14656, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954742 }, { "epoch": 0.47613942760614625, "grad_norm": 0.38369491696357727, "learning_rate": 5.804632508259047e-06, "loss": 0.026291899383068085, "memory(GiB)": 21.48, "step": 14657, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.954753 }, { "epoch": 0.47617191306890166, "grad_norm": 0.7744274139404297, "learning_rate": 5.80410235172236e-06, "loss": 0.03379296511411667, "memory(GiB)": 21.48, "step": 14658, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.4762043985316571, "grad_norm": 0.5520646572113037, "learning_rate": 5.8035721859052605e-06, "loss": 0.020168952643871307, "memory(GiB)": 21.48, "step": 14659, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.4762368839944125, "grad_norm": 0.2667924165725708, "learning_rate": 5.803042010813869e-06, "loss": 0.017305832356214523, "memory(GiB)": 21.48, "step": 14660, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.954786 }, { "epoch": 0.4762693694571679, "grad_norm": 0.3463656008243561, "learning_rate": 5.802511826454305e-06, "loss": 0.0188347976654768, "memory(GiB)": 21.48, "step": 14661, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.954796 }, { "epoch": 0.47630185491992333, "grad_norm": 0.4294890761375427, "learning_rate": 5.801981632832685e-06, "loss": 0.024596352130174637, "memory(GiB)": 21.48, "step": 14662, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.954806 }, { "epoch": 0.47633434038267874, "grad_norm": 0.36563998460769653, "learning_rate": 5.80145142995513e-06, "loss": 0.021075019612908363, "memory(GiB)": 21.48, "step": 14663, "token_acc": 1.0, "train_speed(iter/s)": 0.954816 }, { "epoch": 0.47636682584543416, "grad_norm": 0.35351791977882385, "learning_rate": 5.800921217827757e-06, "loss": 0.023968247696757317, "memory(GiB)": 21.48, "step": 14664, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954826 }, { "epoch": 0.4763993113081896, "grad_norm": 0.3577877879142761, "learning_rate": 5.800390996456688e-06, "loss": 0.021164266392588615, "memory(GiB)": 21.48, "step": 14665, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954837 }, { "epoch": 0.476431796770945, "grad_norm": 0.4285350739955902, "learning_rate": 5.7998607658480435e-06, "loss": 0.021957164630293846, "memory(GiB)": 21.48, "step": 14666, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.954847 }, { "epoch": 0.4764642822337004, "grad_norm": 0.48094281554222107, "learning_rate": 5.799330526007941e-06, "loss": 0.019903715699911118, "memory(GiB)": 21.48, "step": 14667, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.4764967676964558, "grad_norm": 0.37146881222724915, "learning_rate": 5.798800276942498e-06, "loss": 0.019755441695451736, "memory(GiB)": 21.48, "step": 14668, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.954868 }, { "epoch": 0.47652925315921124, "grad_norm": 0.4234958291053772, "learning_rate": 5.7982700186578375e-06, "loss": 0.0277269184589386, "memory(GiB)": 21.48, "step": 14669, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.954875 }, { "epoch": 0.47656173862196666, "grad_norm": 0.4008808434009552, "learning_rate": 5.7977397511600785e-06, "loss": 0.02533940225839615, "memory(GiB)": 21.48, "step": 14670, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.4765942240847221, "grad_norm": 0.3438219726085663, "learning_rate": 5.797209474455342e-06, "loss": 0.025946279987692833, "memory(GiB)": 21.48, "step": 14671, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.4766267095474775, "grad_norm": 0.3345026969909668, "learning_rate": 5.796679188549745e-06, "loss": 0.021128296852111816, "memory(GiB)": 21.48, "step": 14672, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.954908 }, { "epoch": 0.4766591950102329, "grad_norm": 0.3813493847846985, "learning_rate": 5.7961488934494125e-06, "loss": 0.02099740318953991, "memory(GiB)": 21.48, "step": 14673, "token_acc": 1.0, "train_speed(iter/s)": 0.954918 }, { "epoch": 0.4766916804729883, "grad_norm": 0.5598138570785522, "learning_rate": 5.79561858916046e-06, "loss": 0.020935887470841408, "memory(GiB)": 21.48, "step": 14674, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.954927 }, { "epoch": 0.47672416593574374, "grad_norm": 0.5674008727073669, "learning_rate": 5.795088275689011e-06, "loss": 0.021901581436395645, "memory(GiB)": 21.48, "step": 14675, "token_acc": 1.0, "train_speed(iter/s)": 0.954936 }, { "epoch": 0.47675665139849915, "grad_norm": 0.3710916340351105, "learning_rate": 5.794557953041186e-06, "loss": 0.026199471205472946, "memory(GiB)": 21.48, "step": 14676, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.954948 }, { "epoch": 0.47678913686125457, "grad_norm": 0.4178377687931061, "learning_rate": 5.794027621223103e-06, "loss": 0.014217663556337357, "memory(GiB)": 21.48, "step": 14677, "token_acc": 1.0, "train_speed(iter/s)": 0.954959 }, { "epoch": 0.47682162232401, "grad_norm": 0.2574221193790436, "learning_rate": 5.793497280240885e-06, "loss": 0.019625622779130936, "memory(GiB)": 21.48, "step": 14678, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.4768541077867654, "grad_norm": 0.41962283849716187, "learning_rate": 5.792966930100651e-06, "loss": 0.020626701414585114, "memory(GiB)": 21.48, "step": 14679, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954982 }, { "epoch": 0.4768865932495208, "grad_norm": 0.6012365818023682, "learning_rate": 5.7924365708085236e-06, "loss": 0.02234516106545925, "memory(GiB)": 21.48, "step": 14680, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954995 }, { "epoch": 0.47691907871227623, "grad_norm": 0.4041999578475952, "learning_rate": 5.791906202370625e-06, "loss": 0.019890563562512398, "memory(GiB)": 21.48, "step": 14681, "token_acc": 1.0, "train_speed(iter/s)": 0.95501 }, { "epoch": 0.47695156417503165, "grad_norm": 0.5664361715316772, "learning_rate": 5.791375824793074e-06, "loss": 0.02722964808344841, "memory(GiB)": 21.48, "step": 14682, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.955023 }, { "epoch": 0.47698404963778707, "grad_norm": 0.3336043655872345, "learning_rate": 5.790845438081993e-06, "loss": 0.024133093655109406, "memory(GiB)": 21.48, "step": 14683, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955036 }, { "epoch": 0.4770165351005425, "grad_norm": 0.30302873253822327, "learning_rate": 5.790315042243502e-06, "loss": 0.020735876634716988, "memory(GiB)": 21.48, "step": 14684, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.4770490205632979, "grad_norm": 0.4231453835964203, "learning_rate": 5.789784637283724e-06, "loss": 0.016823478043079376, "memory(GiB)": 21.48, "step": 14685, "token_acc": 1.0, "train_speed(iter/s)": 0.955058 }, { "epoch": 0.4770815060260533, "grad_norm": 0.451593816280365, "learning_rate": 5.78925422320878e-06, "loss": 0.023206040263175964, "memory(GiB)": 21.48, "step": 14686, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.47711399148880873, "grad_norm": 0.40739014744758606, "learning_rate": 5.788723800024793e-06, "loss": 0.02633366733789444, "memory(GiB)": 21.48, "step": 14687, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.4771464769515642, "grad_norm": 0.29587602615356445, "learning_rate": 5.788193367737882e-06, "loss": 0.014905366115272045, "memory(GiB)": 21.48, "step": 14688, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.4771789624143196, "grad_norm": 0.3981286883354187, "learning_rate": 5.787662926354172e-06, "loss": 0.02442728728055954, "memory(GiB)": 21.48, "step": 14689, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.9551 }, { "epoch": 0.47721144787707503, "grad_norm": 0.3535495102405548, "learning_rate": 5.787132475879782e-06, "loss": 0.01945873722434044, "memory(GiB)": 21.48, "step": 14690, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.47724393333983045, "grad_norm": 0.35246121883392334, "learning_rate": 5.786602016320836e-06, "loss": 0.021023474633693695, "memory(GiB)": 21.48, "step": 14691, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955123 }, { "epoch": 0.47727641880258587, "grad_norm": 0.3484974205493927, "learning_rate": 5.786071547683456e-06, "loss": 0.026328474283218384, "memory(GiB)": 21.48, "step": 14692, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955134 }, { "epoch": 0.4773089042653413, "grad_norm": 0.31536251306533813, "learning_rate": 5.785541069973763e-06, "loss": 0.01967432349920273, "memory(GiB)": 21.48, "step": 14693, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.4773413897280967, "grad_norm": 0.3105942904949188, "learning_rate": 5.785010583197881e-06, "loss": 0.018655210733413696, "memory(GiB)": 21.48, "step": 14694, "token_acc": 1.0, "train_speed(iter/s)": 0.955155 }, { "epoch": 0.4773738751908521, "grad_norm": 0.3520239591598511, "learning_rate": 5.7844800873619335e-06, "loss": 0.023751623928546906, "memory(GiB)": 21.48, "step": 14695, "token_acc": 0.996, "train_speed(iter/s)": 0.955165 }, { "epoch": 0.47740636065360753, "grad_norm": 0.33947065472602844, "learning_rate": 5.783949582472039e-06, "loss": 0.0207684263586998, "memory(GiB)": 21.48, "step": 14696, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955176 }, { "epoch": 0.47743884611636295, "grad_norm": 0.6454399824142456, "learning_rate": 5.783419068534326e-06, "loss": 0.029630213975906372, "memory(GiB)": 21.48, "step": 14697, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.955187 }, { "epoch": 0.47747133157911836, "grad_norm": 0.3342589735984802, "learning_rate": 5.782888545554912e-06, "loss": 0.022318586707115173, "memory(GiB)": 21.48, "step": 14698, "token_acc": 1.0, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.4775038170418738, "grad_norm": 0.8634442687034607, "learning_rate": 5.7823580135399225e-06, "loss": 0.023774072527885437, "memory(GiB)": 21.48, "step": 14699, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955208 }, { "epoch": 0.4775363025046292, "grad_norm": 0.4472624659538269, "learning_rate": 5.78182747249548e-06, "loss": 0.026753904297947884, "memory(GiB)": 21.48, "step": 14700, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.4775687879673846, "grad_norm": 0.37028077244758606, "learning_rate": 5.781296922427709e-06, "loss": 0.014696558937430382, "memory(GiB)": 21.48, "step": 14701, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95523 }, { "epoch": 0.47760127343014, "grad_norm": 0.2867756187915802, "learning_rate": 5.780766363342731e-06, "loss": 0.01774751953780651, "memory(GiB)": 21.48, "step": 14702, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955241 }, { "epoch": 0.47763375889289544, "grad_norm": 0.29046714305877686, "learning_rate": 5.78023579524667e-06, "loss": 0.01612619124352932, "memory(GiB)": 21.48, "step": 14703, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955252 }, { "epoch": 0.47766624435565086, "grad_norm": 0.5786895751953125, "learning_rate": 5.779705218145651e-06, "loss": 0.03282879665493965, "memory(GiB)": 21.48, "step": 14704, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.4776987298184063, "grad_norm": 0.37508708238601685, "learning_rate": 5.779174632045793e-06, "loss": 0.01712646335363388, "memory(GiB)": 21.48, "step": 14705, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.4777312152811617, "grad_norm": 0.34103551506996155, "learning_rate": 5.778644036953225e-06, "loss": 0.01677665114402771, "memory(GiB)": 21.48, "step": 14706, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955291 }, { "epoch": 0.4777637007439171, "grad_norm": 0.3586171567440033, "learning_rate": 5.778113432874066e-06, "loss": 0.027543414384126663, "memory(GiB)": 21.48, "step": 14707, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955305 }, { "epoch": 0.4777961862066725, "grad_norm": 0.3377724289894104, "learning_rate": 5.777582819814444e-06, "loss": 0.021889016032218933, "memory(GiB)": 21.48, "step": 14708, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.47782867166942794, "grad_norm": 0.36789295077323914, "learning_rate": 5.777052197780482e-06, "loss": 0.023691704496741295, "memory(GiB)": 21.48, "step": 14709, "token_acc": 0.996, "train_speed(iter/s)": 0.955334 }, { "epoch": 0.47786115713218336, "grad_norm": 0.45511817932128906, "learning_rate": 5.7765215667783015e-06, "loss": 0.028767958283424377, "memory(GiB)": 21.48, "step": 14710, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.4778936425949388, "grad_norm": 0.3433626890182495, "learning_rate": 5.775990926814031e-06, "loss": 0.016891364008188248, "memory(GiB)": 21.48, "step": 14711, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.4779261280576942, "grad_norm": 0.40982145071029663, "learning_rate": 5.77546027789379e-06, "loss": 0.024945097044110298, "memory(GiB)": 21.48, "step": 14712, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.4779586135204496, "grad_norm": 0.3580475449562073, "learning_rate": 5.774929620023706e-06, "loss": 0.017613880336284637, "memory(GiB)": 21.48, "step": 14713, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.95539 }, { "epoch": 0.477991098983205, "grad_norm": 0.419455349445343, "learning_rate": 5.7743989532099024e-06, "loss": 0.0208573117852211, "memory(GiB)": 21.48, "step": 14714, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955405 }, { "epoch": 0.47802358444596044, "grad_norm": 0.4769526422023773, "learning_rate": 5.773868277458506e-06, "loss": 0.026467785239219666, "memory(GiB)": 21.48, "step": 14715, "token_acc": 0.9776951672862454, "train_speed(iter/s)": 0.955419 }, { "epoch": 0.47805606990871585, "grad_norm": 0.34093475341796875, "learning_rate": 5.773337592775638e-06, "loss": 0.018610522150993347, "memory(GiB)": 21.48, "step": 14716, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.955432 }, { "epoch": 0.47808855537147127, "grad_norm": 0.4085603356361389, "learning_rate": 5.772806899167424e-06, "loss": 0.018073182553052902, "memory(GiB)": 21.48, "step": 14717, "token_acc": 1.0, "train_speed(iter/s)": 0.955446 }, { "epoch": 0.4781210408342267, "grad_norm": 0.35315975546836853, "learning_rate": 5.772276196639992e-06, "loss": 0.0177448857575655, "memory(GiB)": 21.48, "step": 14718, "token_acc": 1.0, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.4781535262969821, "grad_norm": 0.37393638491630554, "learning_rate": 5.7717454851994634e-06, "loss": 0.026625121012330055, "memory(GiB)": 21.48, "step": 14719, "token_acc": 0.988, "train_speed(iter/s)": 0.955468 }, { "epoch": 0.4781860117597375, "grad_norm": 0.33267509937286377, "learning_rate": 5.771214764851966e-06, "loss": 0.016742831096053123, "memory(GiB)": 21.48, "step": 14720, "token_acc": 1.0, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.47821849722249293, "grad_norm": 0.3377365171909332, "learning_rate": 5.770684035603622e-06, "loss": 0.026496265083551407, "memory(GiB)": 21.48, "step": 14721, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.47825098268524835, "grad_norm": 0.5578241944313049, "learning_rate": 5.77015329746056e-06, "loss": 0.036496713757514954, "memory(GiB)": 21.48, "step": 14722, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955499 }, { "epoch": 0.47828346814800377, "grad_norm": 0.27715978026390076, "learning_rate": 5.769622550428902e-06, "loss": 0.017727196216583252, "memory(GiB)": 21.48, "step": 14723, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.4783159536107592, "grad_norm": 0.49639570713043213, "learning_rate": 5.7690917945147775e-06, "loss": 0.023818254470825195, "memory(GiB)": 21.48, "step": 14724, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.4783484390735146, "grad_norm": 0.41018015146255493, "learning_rate": 5.768561029724309e-06, "loss": 0.02132062427699566, "memory(GiB)": 21.48, "step": 14725, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955531 }, { "epoch": 0.47838092453627, "grad_norm": 0.5349354147911072, "learning_rate": 5.7680302560636225e-06, "loss": 0.02088284119963646, "memory(GiB)": 21.48, "step": 14726, "token_acc": 1.0, "train_speed(iter/s)": 0.955543 }, { "epoch": 0.47841340999902543, "grad_norm": 0.362487256526947, "learning_rate": 5.7674994735388455e-06, "loss": 0.02472345530986786, "memory(GiB)": 21.48, "step": 14727, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.47844589546178085, "grad_norm": 0.39668548107147217, "learning_rate": 5.766968682156102e-06, "loss": 0.020696323364973068, "memory(GiB)": 21.48, "step": 14728, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.47847838092453626, "grad_norm": 0.3467937409877777, "learning_rate": 5.766437881921522e-06, "loss": 0.019561445340514183, "memory(GiB)": 21.48, "step": 14729, "token_acc": 1.0, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.4785108663872917, "grad_norm": 0.45376110076904297, "learning_rate": 5.765907072841226e-06, "loss": 0.019192595034837723, "memory(GiB)": 21.48, "step": 14730, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.4785433518500471, "grad_norm": 0.36681824922561646, "learning_rate": 5.765376254921344e-06, "loss": 0.02048199437558651, "memory(GiB)": 21.48, "step": 14731, "token_acc": 0.9795081967213115, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.4785758373128025, "grad_norm": 0.34118568897247314, "learning_rate": 5.764845428168001e-06, "loss": 0.01966799795627594, "memory(GiB)": 21.48, "step": 14732, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.4786083227755579, "grad_norm": 0.24613094329833984, "learning_rate": 5.764314592587324e-06, "loss": 0.01383080892264843, "memory(GiB)": 21.48, "step": 14733, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955619 }, { "epoch": 0.47864080823831334, "grad_norm": 0.2759120464324951, "learning_rate": 5.76378374818544e-06, "loss": 0.01918919011950493, "memory(GiB)": 21.48, "step": 14734, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.47867329370106876, "grad_norm": 0.32390040159225464, "learning_rate": 5.763252894968473e-06, "loss": 0.019243841990828514, "memory(GiB)": 21.48, "step": 14735, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.4787057791638242, "grad_norm": 0.4361433684825897, "learning_rate": 5.762722032942554e-06, "loss": 0.029678836464881897, "memory(GiB)": 21.48, "step": 14736, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955652 }, { "epoch": 0.4787382646265796, "grad_norm": 0.34517359733581543, "learning_rate": 5.762191162113805e-06, "loss": 0.019939279183745384, "memory(GiB)": 21.48, "step": 14737, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955662 }, { "epoch": 0.478770750089335, "grad_norm": 0.37394973635673523, "learning_rate": 5.761660282488357e-06, "loss": 0.016870595514774323, "memory(GiB)": 21.48, "step": 14738, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.4788032355520904, "grad_norm": 0.21852052211761475, "learning_rate": 5.761129394072335e-06, "loss": 0.01141081377863884, "memory(GiB)": 21.48, "step": 14739, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.47883572101484584, "grad_norm": 0.37887269258499146, "learning_rate": 5.760598496871868e-06, "loss": 0.017553161829710007, "memory(GiB)": 21.48, "step": 14740, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.47886820647760125, "grad_norm": 0.36821693181991577, "learning_rate": 5.760067590893081e-06, "loss": 0.02550225891172886, "memory(GiB)": 21.48, "step": 14741, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955706 }, { "epoch": 0.47890069194035667, "grad_norm": 0.3859981894493103, "learning_rate": 5.759536676142101e-06, "loss": 0.02375064603984356, "memory(GiB)": 21.48, "step": 14742, "token_acc": 0.9713114754098361, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.4789331774031121, "grad_norm": 0.2772402763366699, "learning_rate": 5.759005752625059e-06, "loss": 0.020937439054250717, "memory(GiB)": 21.48, "step": 14743, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955733 }, { "epoch": 0.4789656628658675, "grad_norm": 0.30276545882225037, "learning_rate": 5.758474820348079e-06, "loss": 0.014917595311999321, "memory(GiB)": 21.48, "step": 14744, "token_acc": 0.99, "train_speed(iter/s)": 0.955743 }, { "epoch": 0.4789981483286229, "grad_norm": 0.28078511357307434, "learning_rate": 5.757943879317292e-06, "loss": 0.01806924119591713, "memory(GiB)": 21.48, "step": 14745, "token_acc": 1.0, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.47903063379137834, "grad_norm": 0.396714448928833, "learning_rate": 5.757412929538823e-06, "loss": 0.026344822719693184, "memory(GiB)": 21.48, "step": 14746, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.47906311925413375, "grad_norm": 0.4224678874015808, "learning_rate": 5.7568819710188e-06, "loss": 0.02490316703915596, "memory(GiB)": 21.48, "step": 14747, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.47909560471688917, "grad_norm": 0.30552417039871216, "learning_rate": 5.756351003763352e-06, "loss": 0.01891303062438965, "memory(GiB)": 21.48, "step": 14748, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955785 }, { "epoch": 0.4791280901796446, "grad_norm": 0.354238897562027, "learning_rate": 5.755820027778606e-06, "loss": 0.02130657434463501, "memory(GiB)": 21.48, "step": 14749, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955796 }, { "epoch": 0.4791605756424, "grad_norm": 0.24133069813251495, "learning_rate": 5.755289043070692e-06, "loss": 0.012336725369095802, "memory(GiB)": 21.48, "step": 14750, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955806 }, { "epoch": 0.4791930611051554, "grad_norm": 0.35504400730133057, "learning_rate": 5.754758049645736e-06, "loss": 0.01817132532596588, "memory(GiB)": 21.48, "step": 14751, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.4792255465679109, "grad_norm": 0.5451922416687012, "learning_rate": 5.754227047509868e-06, "loss": 0.027362903580069542, "memory(GiB)": 21.48, "step": 14752, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.4792580320306663, "grad_norm": 0.40560248494148254, "learning_rate": 5.753696036669216e-06, "loss": 0.021857362240552902, "memory(GiB)": 21.48, "step": 14753, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955837 }, { "epoch": 0.4792905174934217, "grad_norm": 0.5591026544570923, "learning_rate": 5.753165017129909e-06, "loss": 0.02206173539161682, "memory(GiB)": 21.48, "step": 14754, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.955847 }, { "epoch": 0.47932300295617714, "grad_norm": 0.6537788510322571, "learning_rate": 5.752633988898075e-06, "loss": 0.028913216665387154, "memory(GiB)": 21.48, "step": 14755, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.47935548841893255, "grad_norm": 0.5293614268302917, "learning_rate": 5.752102951979842e-06, "loss": 0.024282269179821014, "memory(GiB)": 21.48, "step": 14756, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.47938797388168797, "grad_norm": 0.4012197256088257, "learning_rate": 5.7515719063813415e-06, "loss": 0.021456412971019745, "memory(GiB)": 21.48, "step": 14757, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.4794204593444434, "grad_norm": 0.3260604441165924, "learning_rate": 5.751040852108699e-06, "loss": 0.017890335991978645, "memory(GiB)": 21.48, "step": 14758, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.4794529448071988, "grad_norm": 0.6851117014884949, "learning_rate": 5.750509789168048e-06, "loss": 0.02075718343257904, "memory(GiB)": 21.48, "step": 14759, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.4794854302699542, "grad_norm": 0.33794939517974854, "learning_rate": 5.7499787175655145e-06, "loss": 0.026599757373332977, "memory(GiB)": 21.48, "step": 14760, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.47951791573270963, "grad_norm": 0.24986211955547333, "learning_rate": 5.749447637307226e-06, "loss": 0.01702427864074707, "memory(GiB)": 21.48, "step": 14761, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.47955040119546505, "grad_norm": 0.3325710892677307, "learning_rate": 5.748916548399317e-06, "loss": 0.022933565080165863, "memory(GiB)": 21.48, "step": 14762, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.47958288665822046, "grad_norm": 0.38797280192375183, "learning_rate": 5.748385450847913e-06, "loss": 0.025158056989312172, "memory(GiB)": 21.48, "step": 14763, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.4796153721209759, "grad_norm": 0.32148855924606323, "learning_rate": 5.747854344659145e-06, "loss": 0.021331872791051865, "memory(GiB)": 21.48, "step": 14764, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.4796478575837313, "grad_norm": 0.3310922682285309, "learning_rate": 5.7473232298391415e-06, "loss": 0.028717076405882835, "memory(GiB)": 21.48, "step": 14765, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.4796803430464867, "grad_norm": 0.34787246584892273, "learning_rate": 5.746792106394034e-06, "loss": 0.023219775408506393, "memory(GiB)": 21.48, "step": 14766, "token_acc": 0.988, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.47971282850924213, "grad_norm": 0.45115089416503906, "learning_rate": 5.746260974329953e-06, "loss": 0.024374760687351227, "memory(GiB)": 21.48, "step": 14767, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.47974531397199754, "grad_norm": 0.3502654731273651, "learning_rate": 5.745729833653025e-06, "loss": 0.02046138048171997, "memory(GiB)": 21.48, "step": 14768, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.47977779943475296, "grad_norm": 0.48163971304893494, "learning_rate": 5.745198684369384e-06, "loss": 0.025624826550483704, "memory(GiB)": 21.48, "step": 14769, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.4798102848975084, "grad_norm": 0.3727543354034424, "learning_rate": 5.744667526485157e-06, "loss": 0.019538123160600662, "memory(GiB)": 21.48, "step": 14770, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956034 }, { "epoch": 0.4798427703602638, "grad_norm": 0.31308525800704956, "learning_rate": 5.744136360006476e-06, "loss": 0.019213836640119553, "memory(GiB)": 21.48, "step": 14771, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.4798752558230192, "grad_norm": 0.286032110452652, "learning_rate": 5.743605184939469e-06, "loss": 0.013899841345846653, "memory(GiB)": 21.48, "step": 14772, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.956059 }, { "epoch": 0.4799077412857746, "grad_norm": 0.3349856436252594, "learning_rate": 5.7430740012902716e-06, "loss": 0.0198446623980999, "memory(GiB)": 21.48, "step": 14773, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.47994022674853004, "grad_norm": 0.39521852135658264, "learning_rate": 5.74254280906501e-06, "loss": 0.027089640498161316, "memory(GiB)": 21.48, "step": 14774, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.47997271221128546, "grad_norm": 0.3245882987976074, "learning_rate": 5.7420116082698156e-06, "loss": 0.019883835688233376, "memory(GiB)": 21.48, "step": 14775, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956101 }, { "epoch": 0.4800051976740409, "grad_norm": 0.33356261253356934, "learning_rate": 5.741480398910821e-06, "loss": 0.0227917842566967, "memory(GiB)": 21.48, "step": 14776, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.4800376831367963, "grad_norm": 0.29891520738601685, "learning_rate": 5.740949180994153e-06, "loss": 0.021122079342603683, "memory(GiB)": 21.48, "step": 14777, "token_acc": 0.9828178694158075, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.4800701685995517, "grad_norm": 0.3024611175060272, "learning_rate": 5.740417954525947e-06, "loss": 0.015902966260910034, "memory(GiB)": 21.48, "step": 14778, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.4801026540623071, "grad_norm": 0.43548962473869324, "learning_rate": 5.7398867195123335e-06, "loss": 0.02223184145987034, "memory(GiB)": 21.48, "step": 14779, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.48013513952506254, "grad_norm": 0.31057775020599365, "learning_rate": 5.73935547595944e-06, "loss": 0.020745521411299706, "memory(GiB)": 21.48, "step": 14780, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.48016762498781795, "grad_norm": 0.2895885407924652, "learning_rate": 5.738824223873401e-06, "loss": 0.019084295257925987, "memory(GiB)": 21.48, "step": 14781, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.48020011045057337, "grad_norm": 0.3420744240283966, "learning_rate": 5.738292963260349e-06, "loss": 0.027479294687509537, "memory(GiB)": 21.48, "step": 14782, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.4802325959133288, "grad_norm": 0.3233586847782135, "learning_rate": 5.7377616941264126e-06, "loss": 0.020545894280076027, "memory(GiB)": 21.48, "step": 14783, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956198 }, { "epoch": 0.4802650813760842, "grad_norm": 0.3475804328918457, "learning_rate": 5.7372304164777225e-06, "loss": 0.01954137533903122, "memory(GiB)": 21.48, "step": 14784, "token_acc": 0.995, "train_speed(iter/s)": 0.956209 }, { "epoch": 0.4802975668388396, "grad_norm": 0.27990567684173584, "learning_rate": 5.7366991303204144e-06, "loss": 0.020030977204442024, "memory(GiB)": 21.48, "step": 14785, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95622 }, { "epoch": 0.48033005230159503, "grad_norm": 0.43647170066833496, "learning_rate": 5.736167835660617e-06, "loss": 0.024588370695710182, "memory(GiB)": 21.48, "step": 14786, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.48036253776435045, "grad_norm": 0.3525144159793854, "learning_rate": 5.735636532504461e-06, "loss": 0.017754841595888138, "memory(GiB)": 21.48, "step": 14787, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.48039502322710587, "grad_norm": 0.41857045888900757, "learning_rate": 5.735105220858083e-06, "loss": 0.028895247727632523, "memory(GiB)": 21.48, "step": 14788, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.4804275086898613, "grad_norm": 0.3582611083984375, "learning_rate": 5.7345739007276105e-06, "loss": 0.022922717034816742, "memory(GiB)": 21.48, "step": 14789, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.4804599941526167, "grad_norm": 0.38044193387031555, "learning_rate": 5.734042572119179e-06, "loss": 0.022920362651348114, "memory(GiB)": 21.48, "step": 14790, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.4804924796153721, "grad_norm": 0.4017818570137024, "learning_rate": 5.733511235038917e-06, "loss": 0.020955925807356834, "memory(GiB)": 21.48, "step": 14791, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.48052496507812753, "grad_norm": 0.39145466685295105, "learning_rate": 5.732979889492962e-06, "loss": 0.022649303078651428, "memory(GiB)": 21.48, "step": 14792, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.48055745054088295, "grad_norm": 0.551873505115509, "learning_rate": 5.732448535487441e-06, "loss": 0.033134251832962036, "memory(GiB)": 21.48, "step": 14793, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.48058993600363836, "grad_norm": 0.4264485239982605, "learning_rate": 5.7319171730284904e-06, "loss": 0.024009443819522858, "memory(GiB)": 21.48, "step": 14794, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.4806224214663938, "grad_norm": 0.3832109868526459, "learning_rate": 5.731385802122241e-06, "loss": 0.02467474900186062, "memory(GiB)": 21.48, "step": 14795, "token_acc": 0.992, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.4806549069291492, "grad_norm": 0.3563781976699829, "learning_rate": 5.7308544227748265e-06, "loss": 0.018098298460245132, "memory(GiB)": 21.48, "step": 14796, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956338 }, { "epoch": 0.4806873923919046, "grad_norm": 0.3580641746520996, "learning_rate": 5.73032303499238e-06, "loss": 0.016709327697753906, "memory(GiB)": 21.48, "step": 14797, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.48071987785466, "grad_norm": 0.3922775685787201, "learning_rate": 5.729791638781031e-06, "loss": 0.02234276570379734, "memory(GiB)": 21.48, "step": 14798, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.48075236331741544, "grad_norm": 0.4196307957172394, "learning_rate": 5.729260234146917e-06, "loss": 0.026150036603212357, "memory(GiB)": 21.48, "step": 14799, "token_acc": 0.9842931937172775, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.48078484878017086, "grad_norm": 0.3716910183429718, "learning_rate": 5.728728821096169e-06, "loss": 0.026676520705223083, "memory(GiB)": 21.48, "step": 14800, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.4808173342429263, "grad_norm": 0.24865898489952087, "learning_rate": 5.728197399634921e-06, "loss": 0.011068878695368767, "memory(GiB)": 21.48, "step": 14801, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.4808498197056817, "grad_norm": 0.40592145919799805, "learning_rate": 5.727665969769304e-06, "loss": 0.03258499130606651, "memory(GiB)": 21.48, "step": 14802, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.4808823051684371, "grad_norm": 0.345493882894516, "learning_rate": 5.727134531505454e-06, "loss": 0.026065047830343246, "memory(GiB)": 21.48, "step": 14803, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.956411 }, { "epoch": 0.4809147906311925, "grad_norm": 0.3506922721862793, "learning_rate": 5.726603084849504e-06, "loss": 0.02112390100955963, "memory(GiB)": 21.48, "step": 14804, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956419 }, { "epoch": 0.48094727609394794, "grad_norm": 0.3080919682979584, "learning_rate": 5.726071629807587e-06, "loss": 0.0223548486828804, "memory(GiB)": 21.48, "step": 14805, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.48097976155670336, "grad_norm": 0.35586392879486084, "learning_rate": 5.725540166385837e-06, "loss": 0.023462530225515366, "memory(GiB)": 21.48, "step": 14806, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.48101224701945877, "grad_norm": 0.543110191822052, "learning_rate": 5.7250086945903885e-06, "loss": 0.015243935398757458, "memory(GiB)": 21.48, "step": 14807, "token_acc": 1.0, "train_speed(iter/s)": 0.956453 }, { "epoch": 0.4810447324822142, "grad_norm": 0.4295102655887604, "learning_rate": 5.724477214427374e-06, "loss": 0.02148590236902237, "memory(GiB)": 21.48, "step": 14808, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.4810772179449696, "grad_norm": 1.3636934757232666, "learning_rate": 5.723945725902927e-06, "loss": 0.03379150852560997, "memory(GiB)": 21.48, "step": 14809, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.481109703407725, "grad_norm": 0.35679376125335693, "learning_rate": 5.723414229023185e-06, "loss": 0.019514242187142372, "memory(GiB)": 21.48, "step": 14810, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.48114218887048044, "grad_norm": 1.5595275163650513, "learning_rate": 5.72288272379428e-06, "loss": 0.027589669451117516, "memory(GiB)": 21.48, "step": 14811, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.48117467433323585, "grad_norm": 0.3425280451774597, "learning_rate": 5.722351210222345e-06, "loss": 0.027072256430983543, "memory(GiB)": 21.48, "step": 14812, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.48120715979599127, "grad_norm": 0.3596045970916748, "learning_rate": 5.721819688313516e-06, "loss": 0.022095121443271637, "memory(GiB)": 21.48, "step": 14813, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956512 }, { "epoch": 0.4812396452587467, "grad_norm": 0.5351800918579102, "learning_rate": 5.721288158073926e-06, "loss": 0.028181258589029312, "memory(GiB)": 21.48, "step": 14814, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.4812721307215021, "grad_norm": 0.2987355887889862, "learning_rate": 5.720756619509712e-06, "loss": 0.021427469328045845, "memory(GiB)": 21.48, "step": 14815, "token_acc": 0.9834983498349835, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.48130461618425757, "grad_norm": 1.4155808687210083, "learning_rate": 5.720225072627006e-06, "loss": 0.029102269560098648, "memory(GiB)": 21.48, "step": 14816, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.481337101647013, "grad_norm": 0.2721021771430969, "learning_rate": 5.719693517431946e-06, "loss": 0.018524903804063797, "memory(GiB)": 21.48, "step": 14817, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.4813695871097684, "grad_norm": 1.0196475982666016, "learning_rate": 5.719161953930663e-06, "loss": 0.03082052804529667, "memory(GiB)": 21.48, "step": 14818, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.4814020725725238, "grad_norm": 0.31648901104927063, "learning_rate": 5.718630382129293e-06, "loss": 0.019961047917604446, "memory(GiB)": 21.48, "step": 14819, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956573 }, { "epoch": 0.48143455803527924, "grad_norm": 0.4026300013065338, "learning_rate": 5.7180988020339745e-06, "loss": 0.025994181632995605, "memory(GiB)": 21.48, "step": 14820, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.48146704349803465, "grad_norm": 0.30388343334198, "learning_rate": 5.717567213650839e-06, "loss": 0.016767438501119614, "memory(GiB)": 21.48, "step": 14821, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.48149952896079007, "grad_norm": 0.4050935208797455, "learning_rate": 5.717035616986023e-06, "loss": 0.02651630900800228, "memory(GiB)": 21.48, "step": 14822, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.4815320144235455, "grad_norm": 0.4464966058731079, "learning_rate": 5.71650401204566e-06, "loss": 0.019495125859975815, "memory(GiB)": 21.48, "step": 14823, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.4815644998863009, "grad_norm": 0.28843843936920166, "learning_rate": 5.715972398835889e-06, "loss": 0.023457270115613937, "memory(GiB)": 21.48, "step": 14824, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956631 }, { "epoch": 0.4815969853490563, "grad_norm": 0.37916573882102966, "learning_rate": 5.715440777362843e-06, "loss": 0.01942538470029831, "memory(GiB)": 21.48, "step": 14825, "token_acc": 1.0, "train_speed(iter/s)": 0.956644 }, { "epoch": 0.48162947081181173, "grad_norm": 0.2574699819087982, "learning_rate": 5.7149091476326594e-06, "loss": 0.0120539590716362, "memory(GiB)": 21.48, "step": 14826, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.48166195627456715, "grad_norm": 0.33874306082725525, "learning_rate": 5.714377509651472e-06, "loss": 0.019765842705965042, "memory(GiB)": 21.48, "step": 14827, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.48169444173732257, "grad_norm": 0.32541030645370483, "learning_rate": 5.713845863425416e-06, "loss": 0.021345611661672592, "memory(GiB)": 21.48, "step": 14828, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.481726927200078, "grad_norm": 0.36100253462791443, "learning_rate": 5.71331420896063e-06, "loss": 0.02363997884094715, "memory(GiB)": 21.48, "step": 14829, "token_acc": 1.0, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.4817594126628334, "grad_norm": 0.29533714056015015, "learning_rate": 5.712782546263247e-06, "loss": 0.017059069126844406, "memory(GiB)": 21.48, "step": 14830, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956713 }, { "epoch": 0.4817918981255888, "grad_norm": 0.4561593234539032, "learning_rate": 5.712250875339405e-06, "loss": 0.025233151391148567, "memory(GiB)": 21.48, "step": 14831, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956725 }, { "epoch": 0.48182438358834423, "grad_norm": 0.5077844262123108, "learning_rate": 5.7117191961952416e-06, "loss": 0.026875775307416916, "memory(GiB)": 21.48, "step": 14832, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95674 }, { "epoch": 0.48185686905109965, "grad_norm": 0.49446776509284973, "learning_rate": 5.711187508836889e-06, "loss": 0.023850807920098305, "memory(GiB)": 21.48, "step": 14833, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956754 }, { "epoch": 0.48188935451385506, "grad_norm": 0.47309330105781555, "learning_rate": 5.710655813270488e-06, "loss": 0.028020396828651428, "memory(GiB)": 21.48, "step": 14834, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.4819218399766105, "grad_norm": 0.4458467662334442, "learning_rate": 5.710124109502172e-06, "loss": 0.021574687212705612, "memory(GiB)": 21.48, "step": 14835, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.4819543254393659, "grad_norm": 0.420456200838089, "learning_rate": 5.70959239753808e-06, "loss": 0.021410949528217316, "memory(GiB)": 21.48, "step": 14836, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.4819868109021213, "grad_norm": 0.4084100127220154, "learning_rate": 5.709060677384345e-06, "loss": 0.023007461801171303, "memory(GiB)": 21.48, "step": 14837, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956806 }, { "epoch": 0.4820192963648767, "grad_norm": 0.39489883184432983, "learning_rate": 5.708528949047107e-06, "loss": 0.023318424820899963, "memory(GiB)": 21.48, "step": 14838, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.48205178182763214, "grad_norm": 0.31789588928222656, "learning_rate": 5.707997212532502e-06, "loss": 0.021572597324848175, "memory(GiB)": 21.48, "step": 14839, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.48208426729038756, "grad_norm": 0.4584744870662689, "learning_rate": 5.707465467846668e-06, "loss": 0.025837019085884094, "memory(GiB)": 21.48, "step": 14840, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.482116752753143, "grad_norm": 0.3295906186103821, "learning_rate": 5.70693371499574e-06, "loss": 0.0168217271566391, "memory(GiB)": 21.48, "step": 14841, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.4821492382158984, "grad_norm": 0.2939798831939697, "learning_rate": 5.7064019539858565e-06, "loss": 0.019114628434181213, "memory(GiB)": 21.48, "step": 14842, "token_acc": 1.0, "train_speed(iter/s)": 0.95687 }, { "epoch": 0.4821817236786538, "grad_norm": 0.48282572627067566, "learning_rate": 5.7058701848231535e-06, "loss": 0.02366073988378048, "memory(GiB)": 21.48, "step": 14843, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.4822142091414092, "grad_norm": 0.3926853835582733, "learning_rate": 5.705338407513768e-06, "loss": 0.022681226953864098, "memory(GiB)": 21.48, "step": 14844, "token_acc": 1.0, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.48224669460416464, "grad_norm": 1.2901890277862549, "learning_rate": 5.70480662206384e-06, "loss": 0.027191605418920517, "memory(GiB)": 21.48, "step": 14845, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.48227918006692005, "grad_norm": 0.491696834564209, "learning_rate": 5.704274828479505e-06, "loss": 0.010664164088666439, "memory(GiB)": 21.48, "step": 14846, "token_acc": 1.0, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.48231166552967547, "grad_norm": 0.33170071244239807, "learning_rate": 5.703743026766901e-06, "loss": 0.02306513302028179, "memory(GiB)": 21.48, "step": 14847, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.4823441509924309, "grad_norm": 0.42034125328063965, "learning_rate": 5.7032112169321664e-06, "loss": 0.022431567311286926, "memory(GiB)": 21.48, "step": 14848, "token_acc": 1.0, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.4823766364551863, "grad_norm": 0.5891942977905273, "learning_rate": 5.702679398981436e-06, "loss": 0.026720814406871796, "memory(GiB)": 21.48, "step": 14849, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.4824091219179417, "grad_norm": 0.4764465391635895, "learning_rate": 5.702147572920853e-06, "loss": 0.023479990661144257, "memory(GiB)": 21.48, "step": 14850, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.48244160738069714, "grad_norm": 0.349832683801651, "learning_rate": 5.701615738756552e-06, "loss": 0.021660326048731804, "memory(GiB)": 21.48, "step": 14851, "token_acc": 1.0, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.48247409284345255, "grad_norm": 0.442363440990448, "learning_rate": 5.70108389649467e-06, "loss": 0.025874579325318336, "memory(GiB)": 21.48, "step": 14852, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.48250657830620797, "grad_norm": 0.3627690076828003, "learning_rate": 5.700552046141348e-06, "loss": 0.020184427499771118, "memory(GiB)": 21.48, "step": 14853, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.4825390637689634, "grad_norm": 0.3817081153392792, "learning_rate": 5.700020187702723e-06, "loss": 0.02222420461475849, "memory(GiB)": 21.48, "step": 14854, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.4825715492317188, "grad_norm": 0.4736174941062927, "learning_rate": 5.699488321184934e-06, "loss": 0.025959912687540054, "memory(GiB)": 21.48, "step": 14855, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.4826040346944742, "grad_norm": 0.6249123811721802, "learning_rate": 5.698956446594117e-06, "loss": 0.022928420454263687, "memory(GiB)": 21.48, "step": 14856, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.48263652015722963, "grad_norm": 0.39227211475372314, "learning_rate": 5.698424563936413e-06, "loss": 0.022089172154664993, "memory(GiB)": 21.48, "step": 14857, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.48266900561998505, "grad_norm": 0.27438080310821533, "learning_rate": 5.6978926732179595e-06, "loss": 0.019249260425567627, "memory(GiB)": 21.48, "step": 14858, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.48270149108274046, "grad_norm": 0.3804989457130432, "learning_rate": 5.6973607744448965e-06, "loss": 0.024564363062381744, "memory(GiB)": 21.48, "step": 14859, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957053 }, { "epoch": 0.4827339765454959, "grad_norm": 0.514972984790802, "learning_rate": 5.69682886762336e-06, "loss": 0.023271149024367332, "memory(GiB)": 21.48, "step": 14860, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.4827664620082513, "grad_norm": 0.3218810260295868, "learning_rate": 5.696296952759495e-06, "loss": 0.021463921293616295, "memory(GiB)": 21.48, "step": 14861, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957072 }, { "epoch": 0.4827989474710067, "grad_norm": 0.43932458758354187, "learning_rate": 5.6957650298594325e-06, "loss": 0.023054473102092743, "memory(GiB)": 21.48, "step": 14862, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957082 }, { "epoch": 0.48283143293376213, "grad_norm": 0.31169936060905457, "learning_rate": 5.695233098929317e-06, "loss": 0.017483673989772797, "memory(GiB)": 21.48, "step": 14863, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.48286391839651754, "grad_norm": 0.4625190496444702, "learning_rate": 5.694701159975285e-06, "loss": 0.02971602976322174, "memory(GiB)": 21.48, "step": 14864, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.957099 }, { "epoch": 0.48289640385927296, "grad_norm": 0.3196776807308197, "learning_rate": 5.694169213003478e-06, "loss": 0.02480969950556755, "memory(GiB)": 21.48, "step": 14865, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.4829288893220284, "grad_norm": 0.31455010175704956, "learning_rate": 5.693637258020035e-06, "loss": 0.01868610270321369, "memory(GiB)": 21.48, "step": 14866, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.957119 }, { "epoch": 0.4829613747847838, "grad_norm": 0.5425282716751099, "learning_rate": 5.693105295031094e-06, "loss": 0.024211138486862183, "memory(GiB)": 21.48, "step": 14867, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.4829938602475392, "grad_norm": 0.42137718200683594, "learning_rate": 5.692573324042796e-06, "loss": 0.025366919115185738, "memory(GiB)": 21.48, "step": 14868, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.4830263457102946, "grad_norm": 0.4093799293041229, "learning_rate": 5.69204134506128e-06, "loss": 0.021737592294812202, "memory(GiB)": 21.48, "step": 14869, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.48305883117305004, "grad_norm": 0.31502053141593933, "learning_rate": 5.691509358092685e-06, "loss": 0.02216527797281742, "memory(GiB)": 21.48, "step": 14870, "token_acc": 0.9898305084745763, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.48309131663580546, "grad_norm": 0.34154993295669556, "learning_rate": 5.690977363143152e-06, "loss": 0.022653747349977493, "memory(GiB)": 21.48, "step": 14871, "token_acc": 0.985, "train_speed(iter/s)": 0.957169 }, { "epoch": 0.4831238020985609, "grad_norm": 0.2752682566642761, "learning_rate": 5.69044536021882e-06, "loss": 0.01893308386206627, "memory(GiB)": 21.48, "step": 14872, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957179 }, { "epoch": 0.4831562875613163, "grad_norm": 0.3752550482749939, "learning_rate": 5.689913349325831e-06, "loss": 0.02448069490492344, "memory(GiB)": 21.48, "step": 14873, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.4831887730240717, "grad_norm": 0.251942902803421, "learning_rate": 5.689381330470322e-06, "loss": 0.015075527131557465, "memory(GiB)": 21.48, "step": 14874, "token_acc": 1.0, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.4832212584868271, "grad_norm": 0.36170193552970886, "learning_rate": 5.6888493036584354e-06, "loss": 0.016317129135131836, "memory(GiB)": 21.48, "step": 14875, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.48325374394958254, "grad_norm": 0.4019443690776825, "learning_rate": 5.6883172688963106e-06, "loss": 0.02638019248843193, "memory(GiB)": 21.48, "step": 14876, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.48328622941233795, "grad_norm": 0.45239806175231934, "learning_rate": 5.68778522619009e-06, "loss": 0.024536896497011185, "memory(GiB)": 21.48, "step": 14877, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.48331871487509337, "grad_norm": 0.3472808003425598, "learning_rate": 5.687253175545912e-06, "loss": 0.01891040802001953, "memory(GiB)": 21.48, "step": 14878, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957239 }, { "epoch": 0.4833512003378488, "grad_norm": 0.34447965025901794, "learning_rate": 5.686721116969916e-06, "loss": 0.0198859591037035, "memory(GiB)": 21.48, "step": 14879, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.48338368580060426, "grad_norm": 0.35784509778022766, "learning_rate": 5.686189050468245e-06, "loss": 0.020210016518831253, "memory(GiB)": 21.48, "step": 14880, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.4834161712633597, "grad_norm": 0.22918497025966644, "learning_rate": 5.685656976047039e-06, "loss": 0.015792910009622574, "memory(GiB)": 21.48, "step": 14881, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.4834486567261151, "grad_norm": 0.47404199838638306, "learning_rate": 5.685124893712439e-06, "loss": 0.025590265169739723, "memory(GiB)": 21.48, "step": 14882, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.4834811421888705, "grad_norm": 0.4047176241874695, "learning_rate": 5.684592803470588e-06, "loss": 0.016159681603312492, "memory(GiB)": 21.48, "step": 14883, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.4835136276516259, "grad_norm": 0.4662766754627228, "learning_rate": 5.684060705327622e-06, "loss": 0.022697918117046356, "memory(GiB)": 21.48, "step": 14884, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.48354611311438134, "grad_norm": 0.4484863579273224, "learning_rate": 5.683528599289686e-06, "loss": 0.023531505838036537, "memory(GiB)": 21.48, "step": 14885, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957321 }, { "epoch": 0.48357859857713675, "grad_norm": 0.3473398983478546, "learning_rate": 5.682996485362919e-06, "loss": 0.01997816003859043, "memory(GiB)": 21.48, "step": 14886, "token_acc": 0.9892857142857143, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.48361108403989217, "grad_norm": 0.426251083612442, "learning_rate": 5.682464363553466e-06, "loss": 0.02225763350725174, "memory(GiB)": 21.48, "step": 14887, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.4836435695026476, "grad_norm": 0.27088022232055664, "learning_rate": 5.6819322338674635e-06, "loss": 0.014389961957931519, "memory(GiB)": 21.48, "step": 14888, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.483676054965403, "grad_norm": 0.3004744052886963, "learning_rate": 5.6814000963110575e-06, "loss": 0.01613960973918438, "memory(GiB)": 21.48, "step": 14889, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.4837085404281584, "grad_norm": 0.26163315773010254, "learning_rate": 5.680867950890385e-06, "loss": 0.012980002909898758, "memory(GiB)": 21.48, "step": 14890, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.48374102589091383, "grad_norm": 0.27934718132019043, "learning_rate": 5.680335797611592e-06, "loss": 0.01502710860222578, "memory(GiB)": 21.48, "step": 14891, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.48377351135366925, "grad_norm": 0.43617475032806396, "learning_rate": 5.6798036364808185e-06, "loss": 0.024951409548521042, "memory(GiB)": 21.48, "step": 14892, "token_acc": 0.992, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.48380599681642467, "grad_norm": 0.3613136410713196, "learning_rate": 5.679271467504205e-06, "loss": 0.023560944944620132, "memory(GiB)": 21.48, "step": 14893, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.4838384822791801, "grad_norm": 0.578755795955658, "learning_rate": 5.678739290687896e-06, "loss": 0.032308563590049744, "memory(GiB)": 21.48, "step": 14894, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.4838709677419355, "grad_norm": 0.25697481632232666, "learning_rate": 5.6782071060380295e-06, "loss": 0.017455965280532837, "memory(GiB)": 21.48, "step": 14895, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.4839034532046909, "grad_norm": 0.34132516384124756, "learning_rate": 5.6776749135607526e-06, "loss": 0.02239389345049858, "memory(GiB)": 21.48, "step": 14896, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957469 }, { "epoch": 0.48393593866744633, "grad_norm": 0.4022541642189026, "learning_rate": 5.677142713262205e-06, "loss": 0.021644962951540947, "memory(GiB)": 21.48, "step": 14897, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.48396842413020175, "grad_norm": 0.38852939009666443, "learning_rate": 5.676610505148528e-06, "loss": 0.014667303301393986, "memory(GiB)": 21.48, "step": 14898, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.48400090959295716, "grad_norm": 2.289057970046997, "learning_rate": 5.676078289225867e-06, "loss": 0.02543490007519722, "memory(GiB)": 21.48, "step": 14899, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957509 }, { "epoch": 0.4840333950557126, "grad_norm": 1.413717269897461, "learning_rate": 5.675546065500362e-06, "loss": 0.03170202299952507, "memory(GiB)": 21.48, "step": 14900, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.484065880518468, "grad_norm": 0.30919596552848816, "learning_rate": 5.675013833978156e-06, "loss": 0.01920437067747116, "memory(GiB)": 21.48, "step": 14901, "token_acc": 1.0, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.4840983659812234, "grad_norm": 0.940255880355835, "learning_rate": 5.674481594665392e-06, "loss": 0.03330884873867035, "memory(GiB)": 21.48, "step": 14902, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.4841308514439788, "grad_norm": 0.4877484142780304, "learning_rate": 5.673949347568213e-06, "loss": 0.028032243251800537, "memory(GiB)": 21.48, "step": 14903, "token_acc": 0.988, "train_speed(iter/s)": 0.957561 }, { "epoch": 0.48416333690673424, "grad_norm": 0.2838321328163147, "learning_rate": 5.67341709269276e-06, "loss": 0.020028550177812576, "memory(GiB)": 21.48, "step": 14904, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.48419582236948966, "grad_norm": 0.3786459267139435, "learning_rate": 5.67288483004518e-06, "loss": 0.023308025673031807, "memory(GiB)": 21.48, "step": 14905, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.4842283078322451, "grad_norm": 0.3190920650959015, "learning_rate": 5.672352559631612e-06, "loss": 0.015887292101979256, "memory(GiB)": 21.48, "step": 14906, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957592 }, { "epoch": 0.4842607932950005, "grad_norm": 0.5310505628585815, "learning_rate": 5.6718202814582e-06, "loss": 0.021301480010151863, "memory(GiB)": 21.48, "step": 14907, "token_acc": 1.0, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.4842932787577559, "grad_norm": 0.4331549108028412, "learning_rate": 5.67128799553109e-06, "loss": 0.02238244190812111, "memory(GiB)": 21.48, "step": 14908, "token_acc": 1.0, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.4843257642205113, "grad_norm": 0.3875202536582947, "learning_rate": 5.67075570185642e-06, "loss": 0.01566510833799839, "memory(GiB)": 21.48, "step": 14909, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957624 }, { "epoch": 0.48435824968326674, "grad_norm": 0.4064323306083679, "learning_rate": 5.670223400440339e-06, "loss": 0.03376751020550728, "memory(GiB)": 21.48, "step": 14910, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.48439073514602216, "grad_norm": 0.31270405650138855, "learning_rate": 5.669691091288986e-06, "loss": 0.014530295506119728, "memory(GiB)": 21.48, "step": 14911, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.957612 }, { "epoch": 0.48442322060877757, "grad_norm": 0.3485606014728546, "learning_rate": 5.669158774408507e-06, "loss": 0.023379599675536156, "memory(GiB)": 21.48, "step": 14912, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.484455706071533, "grad_norm": 0.33840903639793396, "learning_rate": 5.6686264498050455e-06, "loss": 0.018541835248470306, "memory(GiB)": 21.48, "step": 14913, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.4844881915342884, "grad_norm": 0.821576714515686, "learning_rate": 5.668094117484744e-06, "loss": 0.023585667833685875, "memory(GiB)": 21.48, "step": 14914, "token_acc": 1.0, "train_speed(iter/s)": 0.957642 }, { "epoch": 0.4845206769970438, "grad_norm": 0.34127336740493774, "learning_rate": 5.667561777453748e-06, "loss": 0.01776857301592827, "memory(GiB)": 21.48, "step": 14915, "token_acc": 1.0, "train_speed(iter/s)": 0.957652 }, { "epoch": 0.48455316245979924, "grad_norm": 0.4553844630718231, "learning_rate": 5.6670294297182e-06, "loss": 0.02794821374118328, "memory(GiB)": 21.48, "step": 14916, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.48458564792255465, "grad_norm": 0.3897249400615692, "learning_rate": 5.666497074284245e-06, "loss": 0.024650873616337776, "memory(GiB)": 21.48, "step": 14917, "token_acc": 0.9965635738831615, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.48461813338531007, "grad_norm": 0.34506431221961975, "learning_rate": 5.665964711158026e-06, "loss": 0.027786606922745705, "memory(GiB)": 21.48, "step": 14918, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.4846506188480655, "grad_norm": 0.2853822410106659, "learning_rate": 5.665432340345689e-06, "loss": 0.01839296706020832, "memory(GiB)": 21.48, "step": 14919, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957696 }, { "epoch": 0.4846831043108209, "grad_norm": 0.34914109110832214, "learning_rate": 5.664899961853377e-06, "loss": 0.022544486448168755, "memory(GiB)": 21.48, "step": 14920, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957707 }, { "epoch": 0.4847155897735763, "grad_norm": 0.3174334168434143, "learning_rate": 5.664367575687234e-06, "loss": 0.018589472398161888, "memory(GiB)": 21.48, "step": 14921, "token_acc": 1.0, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.48474807523633173, "grad_norm": 0.37118756771087646, "learning_rate": 5.6638351818534055e-06, "loss": 0.017627432942390442, "memory(GiB)": 21.48, "step": 14922, "token_acc": 1.0, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.48478056069908715, "grad_norm": 0.42561081051826477, "learning_rate": 5.663302780358034e-06, "loss": 0.0239943265914917, "memory(GiB)": 21.48, "step": 14923, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.48481304616184256, "grad_norm": 0.31755825877189636, "learning_rate": 5.662770371207267e-06, "loss": 0.020307086408138275, "memory(GiB)": 21.48, "step": 14924, "token_acc": 1.0, "train_speed(iter/s)": 0.95774 }, { "epoch": 0.484845531624598, "grad_norm": 0.2973635792732239, "learning_rate": 5.662237954407247e-06, "loss": 0.01530690211802721, "memory(GiB)": 21.48, "step": 14925, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957749 }, { "epoch": 0.4848780170873534, "grad_norm": 0.2887667119503021, "learning_rate": 5.661705529964119e-06, "loss": 0.015145612880587578, "memory(GiB)": 21.48, "step": 14926, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957758 }, { "epoch": 0.4849105025501088, "grad_norm": 0.961347758769989, "learning_rate": 5.66117309788403e-06, "loss": 0.01892460137605667, "memory(GiB)": 21.48, "step": 14927, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.48494298801286423, "grad_norm": 0.40838295221328735, "learning_rate": 5.660640658173122e-06, "loss": 0.023089395835995674, "memory(GiB)": 21.48, "step": 14928, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.48497547347561965, "grad_norm": 0.3811721205711365, "learning_rate": 5.660108210837543e-06, "loss": 0.019305484369397163, "memory(GiB)": 21.48, "step": 14929, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957785 }, { "epoch": 0.48500795893837506, "grad_norm": 0.6329752802848816, "learning_rate": 5.6595757558834344e-06, "loss": 0.028859352692961693, "memory(GiB)": 21.48, "step": 14930, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957794 }, { "epoch": 0.4850404444011305, "grad_norm": 0.5248342156410217, "learning_rate": 5.659043293316946e-06, "loss": 0.02347562089562416, "memory(GiB)": 21.48, "step": 14931, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.4850729298638859, "grad_norm": 0.2971206307411194, "learning_rate": 5.6585108231442185e-06, "loss": 0.02068149857223034, "memory(GiB)": 21.48, "step": 14932, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957813 }, { "epoch": 0.4851054153266413, "grad_norm": 0.2783203125, "learning_rate": 5.657978345371402e-06, "loss": 0.013994582928717136, "memory(GiB)": 21.48, "step": 14933, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.4851379007893967, "grad_norm": 0.4137612581253052, "learning_rate": 5.6574458600046385e-06, "loss": 0.029598237946629524, "memory(GiB)": 21.48, "step": 14934, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.957828 }, { "epoch": 0.48517038625215214, "grad_norm": 0.5166440010070801, "learning_rate": 5.656913367050074e-06, "loss": 0.021012283861637115, "memory(GiB)": 21.48, "step": 14935, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.48520287171490756, "grad_norm": 0.41121748089790344, "learning_rate": 5.656380866513857e-06, "loss": 0.023514417931437492, "memory(GiB)": 21.48, "step": 14936, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.485235357177663, "grad_norm": 0.5086445808410645, "learning_rate": 5.655848358402129e-06, "loss": 0.03068876825273037, "memory(GiB)": 21.48, "step": 14937, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.4852678426404184, "grad_norm": 0.343697190284729, "learning_rate": 5.65531584272104e-06, "loss": 0.02463109977543354, "memory(GiB)": 21.48, "step": 14938, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.957865 }, { "epoch": 0.4853003281031738, "grad_norm": 0.3550584614276886, "learning_rate": 5.654783319476732e-06, "loss": 0.018979091197252274, "memory(GiB)": 21.48, "step": 14939, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.4853328135659292, "grad_norm": 0.2627396881580353, "learning_rate": 5.654250788675355e-06, "loss": 0.014961840584874153, "memory(GiB)": 21.48, "step": 14940, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957885 }, { "epoch": 0.48536529902868464, "grad_norm": 0.4649517834186554, "learning_rate": 5.65371825032305e-06, "loss": 0.03344859182834625, "memory(GiB)": 21.48, "step": 14941, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.48539778449144005, "grad_norm": 0.4117577373981476, "learning_rate": 5.653185704425969e-06, "loss": 0.01797408238053322, "memory(GiB)": 21.48, "step": 14942, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.48543026995419547, "grad_norm": 0.2086879014968872, "learning_rate": 5.652653150990255e-06, "loss": 0.012967567890882492, "memory(GiB)": 21.48, "step": 14943, "token_acc": 1.0, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.48546275541695094, "grad_norm": 0.41204020380973816, "learning_rate": 5.652120590022054e-06, "loss": 0.019356071949005127, "memory(GiB)": 21.48, "step": 14944, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.48549524087970636, "grad_norm": 0.35179704427719116, "learning_rate": 5.651588021527514e-06, "loss": 0.026567544788122177, "memory(GiB)": 21.48, "step": 14945, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957949 }, { "epoch": 0.4855277263424618, "grad_norm": 0.37112322449684143, "learning_rate": 5.651055445512781e-06, "loss": 0.027205441147089005, "memory(GiB)": 21.48, "step": 14946, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.4855602118052172, "grad_norm": 0.21730901300907135, "learning_rate": 5.650522861984003e-06, "loss": 0.012351700104773045, "memory(GiB)": 21.48, "step": 14947, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957974 }, { "epoch": 0.4855926972679726, "grad_norm": 0.3799557387828827, "learning_rate": 5.649990270947324e-06, "loss": 0.023637376725673676, "memory(GiB)": 21.48, "step": 14948, "token_acc": 0.9875, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.485625182730728, "grad_norm": 0.3783937692642212, "learning_rate": 5.649457672408891e-06, "loss": 0.023719076067209244, "memory(GiB)": 21.48, "step": 14949, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.958 }, { "epoch": 0.48565766819348344, "grad_norm": 0.4936557114124298, "learning_rate": 5.648925066374854e-06, "loss": 0.027257420122623444, "memory(GiB)": 21.48, "step": 14950, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.48569015365623885, "grad_norm": 0.32698729634284973, "learning_rate": 5.648392452851356e-06, "loss": 0.020238365978002548, "memory(GiB)": 21.48, "step": 14951, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.48572263911899427, "grad_norm": 0.2606421113014221, "learning_rate": 5.647859831844549e-06, "loss": 0.016986481845378876, "memory(GiB)": 21.48, "step": 14952, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.958038 }, { "epoch": 0.4857551245817497, "grad_norm": 0.2998652458190918, "learning_rate": 5.647327203360575e-06, "loss": 0.020231278613209724, "memory(GiB)": 21.48, "step": 14953, "token_acc": 1.0, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.4857876100445051, "grad_norm": 0.3219328820705414, "learning_rate": 5.646794567405585e-06, "loss": 0.02271367609500885, "memory(GiB)": 21.48, "step": 14954, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.4858200955072605, "grad_norm": 0.43645283579826355, "learning_rate": 5.646261923985723e-06, "loss": 0.024249793961644173, "memory(GiB)": 21.48, "step": 14955, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958076 }, { "epoch": 0.48585258097001593, "grad_norm": 0.6017448306083679, "learning_rate": 5.64572927310714e-06, "loss": 0.025845283642411232, "memory(GiB)": 21.48, "step": 14956, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958089 }, { "epoch": 0.48588506643277135, "grad_norm": 0.4666193425655365, "learning_rate": 5.64519661477598e-06, "loss": 0.019473832100629807, "memory(GiB)": 21.48, "step": 14957, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.48591755189552677, "grad_norm": 0.6135491132736206, "learning_rate": 5.644663948998392e-06, "loss": 0.022004976868629456, "memory(GiB)": 21.48, "step": 14958, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.4859500373582822, "grad_norm": 0.37662047147750854, "learning_rate": 5.644131275780526e-06, "loss": 0.023973427712917328, "memory(GiB)": 21.48, "step": 14959, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.4859825228210376, "grad_norm": 0.3724295496940613, "learning_rate": 5.643598595128527e-06, "loss": 0.023423155769705772, "memory(GiB)": 21.48, "step": 14960, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.486015008283793, "grad_norm": 0.3605092167854309, "learning_rate": 5.643065907048544e-06, "loss": 0.028042634949088097, "memory(GiB)": 21.48, "step": 14961, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958152 }, { "epoch": 0.48604749374654843, "grad_norm": 0.9271541237831116, "learning_rate": 5.642533211546723e-06, "loss": 0.02462022751569748, "memory(GiB)": 21.48, "step": 14962, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.48607997920930385, "grad_norm": 0.3815937638282776, "learning_rate": 5.642000508629215e-06, "loss": 0.026954062283039093, "memory(GiB)": 21.48, "step": 14963, "token_acc": 1.0, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.48611246467205926, "grad_norm": 0.32169702649116516, "learning_rate": 5.641467798302167e-06, "loss": 0.024604232981801033, "memory(GiB)": 21.48, "step": 14964, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.4861449501348147, "grad_norm": 0.36859050393104553, "learning_rate": 5.640935080571726e-06, "loss": 0.02759261056780815, "memory(GiB)": 21.48, "step": 14965, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.4861774355975701, "grad_norm": 0.20544905960559845, "learning_rate": 5.6404023554440405e-06, "loss": 0.011816052719950676, "memory(GiB)": 21.48, "step": 14966, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958206 }, { "epoch": 0.4862099210603255, "grad_norm": 0.32313528656959534, "learning_rate": 5.639869622925259e-06, "loss": 0.023842306807637215, "memory(GiB)": 21.48, "step": 14967, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.48624240652308093, "grad_norm": 0.3912615478038788, "learning_rate": 5.639336883021532e-06, "loss": 0.014885976910591125, "memory(GiB)": 21.48, "step": 14968, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.48627489198583634, "grad_norm": 0.2830488681793213, "learning_rate": 5.638804135739005e-06, "loss": 0.020922265946865082, "memory(GiB)": 21.48, "step": 14969, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958235 }, { "epoch": 0.48630737744859176, "grad_norm": 0.4507772624492645, "learning_rate": 5.638271381083829e-06, "loss": 0.02318601682782173, "memory(GiB)": 21.48, "step": 14970, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.4863398629113472, "grad_norm": 0.32897818088531494, "learning_rate": 5.637738619062153e-06, "loss": 0.022315263748168945, "memory(GiB)": 21.48, "step": 14971, "token_acc": 0.9938271604938271, "train_speed(iter/s)": 0.958255 }, { "epoch": 0.4863723483741026, "grad_norm": 0.33431577682495117, "learning_rate": 5.637205849680123e-06, "loss": 0.020314138382673264, "memory(GiB)": 21.48, "step": 14972, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958265 }, { "epoch": 0.486404833836858, "grad_norm": 1.892041802406311, "learning_rate": 5.636673072943888e-06, "loss": 0.024612657725811005, "memory(GiB)": 21.48, "step": 14973, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.958276 }, { "epoch": 0.4864373192996134, "grad_norm": 0.2906374931335449, "learning_rate": 5.636140288859599e-06, "loss": 0.019321586936712265, "memory(GiB)": 21.48, "step": 14974, "token_acc": 1.0, "train_speed(iter/s)": 0.958284 }, { "epoch": 0.48646980476236884, "grad_norm": 0.4033031463623047, "learning_rate": 5.635607497433406e-06, "loss": 0.02645406313240528, "memory(GiB)": 21.48, "step": 14975, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.958294 }, { "epoch": 0.48650229022512426, "grad_norm": 0.323444128036499, "learning_rate": 5.6350746986714565e-06, "loss": 0.027741560712456703, "memory(GiB)": 21.48, "step": 14976, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.4865347756878797, "grad_norm": 0.290195107460022, "learning_rate": 5.634541892579898e-06, "loss": 0.0164145790040493, "memory(GiB)": 21.48, "step": 14977, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.4865672611506351, "grad_norm": 0.34643444418907166, "learning_rate": 5.634009079164883e-06, "loss": 0.01829102262854576, "memory(GiB)": 21.48, "step": 14978, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958326 }, { "epoch": 0.4865997466133905, "grad_norm": 0.3333662748336792, "learning_rate": 5.633476258432557e-06, "loss": 0.01886865496635437, "memory(GiB)": 21.48, "step": 14979, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.958337 }, { "epoch": 0.4866322320761459, "grad_norm": 0.5068550109863281, "learning_rate": 5.632943430389075e-06, "loss": 0.021645614877343178, "memory(GiB)": 21.48, "step": 14980, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.48666471753890134, "grad_norm": 0.40205734968185425, "learning_rate": 5.6324105950405806e-06, "loss": 0.01723112165927887, "memory(GiB)": 21.48, "step": 14981, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958354 }, { "epoch": 0.48669720300165675, "grad_norm": 0.40324723720550537, "learning_rate": 5.631877752393229e-06, "loss": 0.024434279650449753, "memory(GiB)": 21.48, "step": 14982, "token_acc": 1.0, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.48672968846441217, "grad_norm": 0.27818331122398376, "learning_rate": 5.631344902453164e-06, "loss": 0.019197069108486176, "memory(GiB)": 21.48, "step": 14983, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958369 }, { "epoch": 0.4867621739271676, "grad_norm": 0.36486831307411194, "learning_rate": 5.630812045226541e-06, "loss": 0.019161637872457504, "memory(GiB)": 21.48, "step": 14984, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958379 }, { "epoch": 0.486794659389923, "grad_norm": 0.8468413352966309, "learning_rate": 5.630279180719507e-06, "loss": 0.027000507339835167, "memory(GiB)": 21.48, "step": 14985, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.958387 }, { "epoch": 0.4868271448526784, "grad_norm": 0.2556363344192505, "learning_rate": 5.629746308938211e-06, "loss": 0.01526419073343277, "memory(GiB)": 21.48, "step": 14986, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.958396 }, { "epoch": 0.48685963031543383, "grad_norm": 0.3313472867012024, "learning_rate": 5.629213429888806e-06, "loss": 0.0168889369815588, "memory(GiB)": 21.48, "step": 14987, "token_acc": 1.0, "train_speed(iter/s)": 0.958404 }, { "epoch": 0.48689211577818925, "grad_norm": 0.27192750573158264, "learning_rate": 5.628680543577439e-06, "loss": 0.013807477429509163, "memory(GiB)": 21.48, "step": 14988, "token_acc": 1.0, "train_speed(iter/s)": 0.958412 }, { "epoch": 0.48692460124094467, "grad_norm": 0.3042827546596527, "learning_rate": 5.628147650010262e-06, "loss": 0.025187205523252487, "memory(GiB)": 21.48, "step": 14989, "token_acc": 1.0, "train_speed(iter/s)": 0.958421 }, { "epoch": 0.4869570867037001, "grad_norm": 0.2965985834598541, "learning_rate": 5.627614749193426e-06, "loss": 0.016239885240793228, "memory(GiB)": 21.48, "step": 14990, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958431 }, { "epoch": 0.4869895721664555, "grad_norm": 0.35176223516464233, "learning_rate": 5.6270818411330794e-06, "loss": 0.0206126905977726, "memory(GiB)": 21.48, "step": 14991, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95844 }, { "epoch": 0.4870220576292109, "grad_norm": 0.21286436915397644, "learning_rate": 5.626548925835374e-06, "loss": 0.016442419961094856, "memory(GiB)": 21.48, "step": 14992, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.958449 }, { "epoch": 0.48705454309196633, "grad_norm": 0.3167708218097687, "learning_rate": 5.6260160033064605e-06, "loss": 0.02240927517414093, "memory(GiB)": 21.48, "step": 14993, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958458 }, { "epoch": 0.48708702855472175, "grad_norm": 0.37424764037132263, "learning_rate": 5.625483073552489e-06, "loss": 0.021830108016729355, "memory(GiB)": 21.48, "step": 14994, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958467 }, { "epoch": 0.48711951401747716, "grad_norm": 0.3294147849082947, "learning_rate": 5.62495013657961e-06, "loss": 0.020311620086431503, "memory(GiB)": 21.48, "step": 14995, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958478 }, { "epoch": 0.4871519994802326, "grad_norm": 0.3565068542957306, "learning_rate": 5.624417192393976e-06, "loss": 0.022905707359313965, "memory(GiB)": 21.48, "step": 14996, "token_acc": 1.0, "train_speed(iter/s)": 0.958487 }, { "epoch": 0.487184484942988, "grad_norm": 0.40742501616477966, "learning_rate": 5.623884241001735e-06, "loss": 0.021948333829641342, "memory(GiB)": 21.48, "step": 14997, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958496 }, { "epoch": 0.4872169704057434, "grad_norm": 0.39607182145118713, "learning_rate": 5.623351282409039e-06, "loss": 0.023342227563261986, "memory(GiB)": 21.48, "step": 14998, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.958506 }, { "epoch": 0.4872494558684988, "grad_norm": 0.5115576386451721, "learning_rate": 5.622818316622042e-06, "loss": 0.029535621404647827, "memory(GiB)": 21.48, "step": 14999, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.48728194133125424, "grad_norm": 0.327521950006485, "learning_rate": 5.62228534364689e-06, "loss": 0.017246615141630173, "memory(GiB)": 21.48, "step": 15000, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95853 }, { "epoch": 0.48728194133125424, "eval_loss": 0.021805090829730034, "eval_runtime": 80.1773, "eval_samples_per_second": 124.1, "eval_steps_per_second": 3.879, "eval_token_acc": 0.9912787854138458, "step": 15000 }, { "epoch": 0.48731442679400966, "grad_norm": 0.22719168663024902, "learning_rate": 5.6217523634897386e-06, "loss": 0.017649520188570023, "memory(GiB)": 21.48, "step": 15001, "token_acc": 0.9907844828766654, "train_speed(iter/s)": 0.952961 }, { "epoch": 0.4873469122567651, "grad_norm": 0.42109155654907227, "learning_rate": 5.6212193761567355e-06, "loss": 0.029878132045269012, "memory(GiB)": 21.48, "step": 15002, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.95297 }, { "epoch": 0.4873793977195205, "grad_norm": 0.3127906322479248, "learning_rate": 5.620686381654035e-06, "loss": 0.019252341240644455, "memory(GiB)": 21.48, "step": 15003, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.952979 }, { "epoch": 0.4874118831822759, "grad_norm": 0.3163250684738159, "learning_rate": 5.620153379987788e-06, "loss": 0.016906699165701866, "memory(GiB)": 21.48, "step": 15004, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.952989 }, { "epoch": 0.4874443686450313, "grad_norm": 0.357185959815979, "learning_rate": 5.619620371164145e-06, "loss": 0.01845122128725052, "memory(GiB)": 21.48, "step": 15005, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.952999 }, { "epoch": 0.48747685410778674, "grad_norm": 0.4608286917209625, "learning_rate": 5.619087355189259e-06, "loss": 0.024112608283758163, "memory(GiB)": 21.48, "step": 15006, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.953008 }, { "epoch": 0.48750933957054216, "grad_norm": 0.6518738269805908, "learning_rate": 5.61855433206928e-06, "loss": 0.02223648689687252, "memory(GiB)": 21.48, "step": 15007, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.953017 }, { "epoch": 0.4875418250332976, "grad_norm": 0.5883215069770813, "learning_rate": 5.618021301810361e-06, "loss": 0.01785307377576828, "memory(GiB)": 21.48, "step": 15008, "token_acc": 1.0, "train_speed(iter/s)": 0.953027 }, { "epoch": 0.48757431049605304, "grad_norm": 0.33174192905426025, "learning_rate": 5.6174882644186534e-06, "loss": 0.018873222172260284, "memory(GiB)": 21.48, "step": 15009, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.953037 }, { "epoch": 0.48760679595880846, "grad_norm": 0.37018105387687683, "learning_rate": 5.6169552199003105e-06, "loss": 0.018585486337542534, "memory(GiB)": 21.48, "step": 15010, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.953046 }, { "epoch": 0.4876392814215639, "grad_norm": 0.39510178565979004, "learning_rate": 5.616422168261482e-06, "loss": 0.0232875794172287, "memory(GiB)": 21.48, "step": 15011, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.953054 }, { "epoch": 0.4876717668843193, "grad_norm": 0.3603411912918091, "learning_rate": 5.6158891095083215e-06, "loss": 0.017334209755063057, "memory(GiB)": 21.48, "step": 15012, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953063 }, { "epoch": 0.4877042523470747, "grad_norm": 0.2508661150932312, "learning_rate": 5.615356043646981e-06, "loss": 0.015860900282859802, "memory(GiB)": 21.48, "step": 15013, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.953072 }, { "epoch": 0.4877367378098301, "grad_norm": 0.5018419623374939, "learning_rate": 5.614822970683613e-06, "loss": 0.017580052837729454, "memory(GiB)": 21.48, "step": 15014, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.953083 }, { "epoch": 0.48776922327258554, "grad_norm": 0.2989625632762909, "learning_rate": 5.614289890624369e-06, "loss": 0.018838442862033844, "memory(GiB)": 21.48, "step": 15015, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.953095 }, { "epoch": 0.48780170873534096, "grad_norm": 0.2780369818210602, "learning_rate": 5.613756803475402e-06, "loss": 0.014826350845396519, "memory(GiB)": 21.48, "step": 15016, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.953106 }, { "epoch": 0.48783419419809637, "grad_norm": 1.5967133045196533, "learning_rate": 5.613223709242866e-06, "loss": 0.02479207143187523, "memory(GiB)": 21.48, "step": 15017, "token_acc": 1.0, "train_speed(iter/s)": 0.953117 }, { "epoch": 0.4878666796608518, "grad_norm": 0.2557687759399414, "learning_rate": 5.61269060793291e-06, "loss": 0.01746342144906521, "memory(GiB)": 21.48, "step": 15018, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.953128 }, { "epoch": 0.4878991651236072, "grad_norm": 0.43231135606765747, "learning_rate": 5.612157499551689e-06, "loss": 0.02684997022151947, "memory(GiB)": 21.48, "step": 15019, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953136 }, { "epoch": 0.4879316505863626, "grad_norm": 0.5817883014678955, "learning_rate": 5.6116243841053575e-06, "loss": 0.01800069399178028, "memory(GiB)": 21.48, "step": 15020, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.953147 }, { "epoch": 0.48796413604911804, "grad_norm": 0.3645212650299072, "learning_rate": 5.611091261600066e-06, "loss": 0.024511044844985008, "memory(GiB)": 21.48, "step": 15021, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.953158 }, { "epoch": 0.48799662151187345, "grad_norm": 0.4760620594024658, "learning_rate": 5.610558132041967e-06, "loss": 0.02000664733350277, "memory(GiB)": 21.48, "step": 15022, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953169 }, { "epoch": 0.48802910697462887, "grad_norm": 0.3480527102947235, "learning_rate": 5.610024995437215e-06, "loss": 0.024076633155345917, "memory(GiB)": 21.48, "step": 15023, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.953181 }, { "epoch": 0.4880615924373843, "grad_norm": 0.3796110451221466, "learning_rate": 5.6094918517919635e-06, "loss": 0.022870130836963654, "memory(GiB)": 21.48, "step": 15024, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.953192 }, { "epoch": 0.4880940779001397, "grad_norm": 0.3441150486469269, "learning_rate": 5.608958701112364e-06, "loss": 0.017894215881824493, "memory(GiB)": 21.48, "step": 15025, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.953203 }, { "epoch": 0.4881265633628951, "grad_norm": 0.6979503631591797, "learning_rate": 5.608425543404571e-06, "loss": 0.023236233741044998, "memory(GiB)": 21.48, "step": 15026, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953215 }, { "epoch": 0.48815904882565053, "grad_norm": 0.333942711353302, "learning_rate": 5.607892378674737e-06, "loss": 0.021086517721414566, "memory(GiB)": 21.48, "step": 15027, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953226 }, { "epoch": 0.48819153428840595, "grad_norm": 0.46397385001182556, "learning_rate": 5.607359206929016e-06, "loss": 0.02170056290924549, "memory(GiB)": 21.48, "step": 15028, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.953239 }, { "epoch": 0.48822401975116136, "grad_norm": 0.3190777599811554, "learning_rate": 5.606826028173562e-06, "loss": 0.014394668862223625, "memory(GiB)": 21.48, "step": 15029, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953252 }, { "epoch": 0.4882565052139168, "grad_norm": 0.33712947368621826, "learning_rate": 5.606292842414527e-06, "loss": 0.016379836946725845, "memory(GiB)": 21.48, "step": 15030, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953266 }, { "epoch": 0.4882889906766722, "grad_norm": 0.45276331901550293, "learning_rate": 5.605759649658066e-06, "loss": 0.019890345633029938, "memory(GiB)": 21.48, "step": 15031, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.953279 }, { "epoch": 0.4883214761394276, "grad_norm": 0.486960768699646, "learning_rate": 5.6052264499103316e-06, "loss": 0.019725343212485313, "memory(GiB)": 21.48, "step": 15032, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953293 }, { "epoch": 0.48835396160218303, "grad_norm": 0.40078648924827576, "learning_rate": 5.604693243177479e-06, "loss": 0.021323896944522858, "memory(GiB)": 21.48, "step": 15033, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.953305 }, { "epoch": 0.48838644706493844, "grad_norm": 0.32768410444259644, "learning_rate": 5.604160029465662e-06, "loss": 0.016478203237056732, "memory(GiB)": 21.48, "step": 15034, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.953319 }, { "epoch": 0.48841893252769386, "grad_norm": 0.3261471688747406, "learning_rate": 5.603626808781033e-06, "loss": 0.015127581544220448, "memory(GiB)": 21.48, "step": 15035, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.953333 }, { "epoch": 0.4884514179904493, "grad_norm": 0.5259016156196594, "learning_rate": 5.603093581129749e-06, "loss": 0.037006158381700516, "memory(GiB)": 21.48, "step": 15036, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.953346 }, { "epoch": 0.4884839034532047, "grad_norm": 0.8150378465652466, "learning_rate": 5.60256034651796e-06, "loss": 0.029695965349674225, "memory(GiB)": 21.48, "step": 15037, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.953361 }, { "epoch": 0.4885163889159601, "grad_norm": 0.42486631870269775, "learning_rate": 5.602027104951824e-06, "loss": 0.028685199096798897, "memory(GiB)": 21.48, "step": 15038, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.953374 }, { "epoch": 0.4885488743787155, "grad_norm": 0.3719501495361328, "learning_rate": 5.601493856437494e-06, "loss": 0.017504094168543816, "memory(GiB)": 21.48, "step": 15039, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953388 }, { "epoch": 0.48858135984147094, "grad_norm": 0.3273191452026367, "learning_rate": 5.600960600981124e-06, "loss": 0.017603376880288124, "memory(GiB)": 21.48, "step": 15040, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.953402 }, { "epoch": 0.48861384530422636, "grad_norm": 0.31586748361587524, "learning_rate": 5.600427338588869e-06, "loss": 0.017737288028001785, "memory(GiB)": 21.48, "step": 15041, "token_acc": 1.0, "train_speed(iter/s)": 0.953415 }, { "epoch": 0.4886463307669818, "grad_norm": 0.5397804975509644, "learning_rate": 5.599894069266884e-06, "loss": 0.031429942697286606, "memory(GiB)": 21.48, "step": 15042, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.953411 }, { "epoch": 0.4886788162297372, "grad_norm": 0.37693166732788086, "learning_rate": 5.599360793021322e-06, "loss": 0.02397829107940197, "memory(GiB)": 21.48, "step": 15043, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.953425 }, { "epoch": 0.4887113016924926, "grad_norm": 0.35094496607780457, "learning_rate": 5.598827509858338e-06, "loss": 0.019883448258042336, "memory(GiB)": 21.48, "step": 15044, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953439 }, { "epoch": 0.488743787155248, "grad_norm": 0.3969474732875824, "learning_rate": 5.59829421978409e-06, "loss": 0.030009090900421143, "memory(GiB)": 21.48, "step": 15045, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953453 }, { "epoch": 0.48877627261800344, "grad_norm": 0.6184219717979431, "learning_rate": 5.5977609228047265e-06, "loss": 0.018015433102846146, "memory(GiB)": 21.48, "step": 15046, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.953467 }, { "epoch": 0.48880875808075885, "grad_norm": 0.34237003326416016, "learning_rate": 5.59722761892641e-06, "loss": 0.019562402740120888, "memory(GiB)": 21.48, "step": 15047, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.953481 }, { "epoch": 0.48884124354351427, "grad_norm": 0.33693069219589233, "learning_rate": 5.596694308155289e-06, "loss": 0.017735842615365982, "memory(GiB)": 21.48, "step": 15048, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.953494 }, { "epoch": 0.4888737290062697, "grad_norm": 0.39412686228752136, "learning_rate": 5.596160990497524e-06, "loss": 0.027020655572414398, "memory(GiB)": 21.48, "step": 15049, "token_acc": 1.0, "train_speed(iter/s)": 0.953508 }, { "epoch": 0.4889062144690251, "grad_norm": 0.40099528431892395, "learning_rate": 5.595627665959267e-06, "loss": 0.018178753554821014, "memory(GiB)": 21.48, "step": 15050, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.953522 }, { "epoch": 0.4889386999317805, "grad_norm": 0.5424286723136902, "learning_rate": 5.5950943345466725e-06, "loss": 0.023523282259702682, "memory(GiB)": 21.48, "step": 15051, "token_acc": 0.9934640522875817, "train_speed(iter/s)": 0.953536 }, { "epoch": 0.48897118539453593, "grad_norm": 0.3028998076915741, "learning_rate": 5.594560996265899e-06, "loss": 0.0211662407964468, "memory(GiB)": 21.48, "step": 15052, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.953547 }, { "epoch": 0.48900367085729135, "grad_norm": 0.2843001186847687, "learning_rate": 5.594027651123098e-06, "loss": 0.019404422491788864, "memory(GiB)": 21.48, "step": 15053, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953557 }, { "epoch": 0.48903615632004677, "grad_norm": 0.3467327654361725, "learning_rate": 5.5934942991244305e-06, "loss": 0.01655624434351921, "memory(GiB)": 21.48, "step": 15054, "token_acc": 1.0, "train_speed(iter/s)": 0.953568 }, { "epoch": 0.4890686417828022, "grad_norm": 0.33098745346069336, "learning_rate": 5.592960940276046e-06, "loss": 0.023174475878477097, "memory(GiB)": 21.48, "step": 15055, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.953579 }, { "epoch": 0.4891011272455576, "grad_norm": 0.21611596643924713, "learning_rate": 5.592427574584105e-06, "loss": 0.01463160291314125, "memory(GiB)": 21.48, "step": 15056, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.95359 }, { "epoch": 0.489133612708313, "grad_norm": 0.4525495171546936, "learning_rate": 5.591894202054762e-06, "loss": 0.027771534398198128, "memory(GiB)": 21.48, "step": 15057, "token_acc": 0.992, "train_speed(iter/s)": 0.9536 }, { "epoch": 0.48916609817106843, "grad_norm": 0.5453715324401855, "learning_rate": 5.59136082269417e-06, "loss": 0.022681746631860733, "memory(GiB)": 21.48, "step": 15058, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953612 }, { "epoch": 0.48919858363382385, "grad_norm": 0.270788311958313, "learning_rate": 5.590827436508488e-06, "loss": 0.017338857054710388, "memory(GiB)": 21.48, "step": 15059, "token_acc": 1.0, "train_speed(iter/s)": 0.953621 }, { "epoch": 0.48923106909657926, "grad_norm": 0.3534867465496063, "learning_rate": 5.59029404350387e-06, "loss": 0.021810080856084824, "memory(GiB)": 21.48, "step": 15060, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.95363 }, { "epoch": 0.4892635545593347, "grad_norm": 0.2740188539028168, "learning_rate": 5.5897606436864735e-06, "loss": 0.02172047272324562, "memory(GiB)": 21.48, "step": 15061, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.95364 }, { "epoch": 0.4892960400220901, "grad_norm": 0.49067291617393494, "learning_rate": 5.5892272370624545e-06, "loss": 0.02894182875752449, "memory(GiB)": 21.48, "step": 15062, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95365 }, { "epoch": 0.4893285254848455, "grad_norm": 0.4475950300693512, "learning_rate": 5.58869382363797e-06, "loss": 0.027518413960933685, "memory(GiB)": 21.48, "step": 15063, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95366 }, { "epoch": 0.4893610109476009, "grad_norm": 0.4197726547718048, "learning_rate": 5.588160403419173e-06, "loss": 0.02207884006202221, "memory(GiB)": 21.48, "step": 15064, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.953669 }, { "epoch": 0.48939349641035634, "grad_norm": 0.3230343163013458, "learning_rate": 5.587626976412222e-06, "loss": 0.018593907356262207, "memory(GiB)": 21.48, "step": 15065, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.953679 }, { "epoch": 0.48942598187311176, "grad_norm": 0.32102325558662415, "learning_rate": 5.5870935426232756e-06, "loss": 0.018584955483675003, "memory(GiB)": 21.48, "step": 15066, "token_acc": 1.0, "train_speed(iter/s)": 0.953688 }, { "epoch": 0.4894584673358672, "grad_norm": 0.35834577679634094, "learning_rate": 5.586560102058487e-06, "loss": 0.02145988494157791, "memory(GiB)": 21.48, "step": 15067, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953698 }, { "epoch": 0.4894909527986226, "grad_norm": 0.3498130440711975, "learning_rate": 5.586026654724015e-06, "loss": 0.022500628605484962, "memory(GiB)": 21.48, "step": 15068, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953707 }, { "epoch": 0.489523438261378, "grad_norm": 0.38663676381111145, "learning_rate": 5.585493200626014e-06, "loss": 0.023933853954076767, "memory(GiB)": 21.48, "step": 15069, "token_acc": 1.0, "train_speed(iter/s)": 0.953717 }, { "epoch": 0.4895559237241334, "grad_norm": 0.3510795533657074, "learning_rate": 5.584959739770645e-06, "loss": 0.012785800732672215, "memory(GiB)": 21.48, "step": 15070, "token_acc": 1.0, "train_speed(iter/s)": 0.953727 }, { "epoch": 0.48958840918688884, "grad_norm": 0.43240925669670105, "learning_rate": 5.58442627216406e-06, "loss": 0.020435264334082603, "memory(GiB)": 21.48, "step": 15071, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.953731 }, { "epoch": 0.4896208946496443, "grad_norm": 0.21125850081443787, "learning_rate": 5.583892797812417e-06, "loss": 0.014914835803210735, "memory(GiB)": 21.48, "step": 15072, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.953741 }, { "epoch": 0.4896533801123997, "grad_norm": 0.462210088968277, "learning_rate": 5.583359316721876e-06, "loss": 0.020823320373892784, "memory(GiB)": 21.48, "step": 15073, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.953752 }, { "epoch": 0.48968586557515514, "grad_norm": 0.47222423553466797, "learning_rate": 5.58282582889859e-06, "loss": 0.027760248631238937, "memory(GiB)": 21.48, "step": 15074, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953763 }, { "epoch": 0.48971835103791056, "grad_norm": 0.3078916072845459, "learning_rate": 5.58229233434872e-06, "loss": 0.020737987011671066, "memory(GiB)": 21.48, "step": 15075, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953774 }, { "epoch": 0.489750836500666, "grad_norm": 0.48675647377967834, "learning_rate": 5.58175883307842e-06, "loss": 0.03216198459267616, "memory(GiB)": 21.48, "step": 15076, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.953785 }, { "epoch": 0.4897833219634214, "grad_norm": 0.3173507750034332, "learning_rate": 5.581225325093851e-06, "loss": 0.02244230918586254, "memory(GiB)": 21.48, "step": 15077, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953795 }, { "epoch": 0.4898158074261768, "grad_norm": 0.43405258655548096, "learning_rate": 5.580691810401165e-06, "loss": 0.021610353142023087, "memory(GiB)": 21.48, "step": 15078, "token_acc": 0.9831460674157303, "train_speed(iter/s)": 0.953805 }, { "epoch": 0.4898482928889322, "grad_norm": 0.35497742891311646, "learning_rate": 5.580158289006526e-06, "loss": 0.01890200562775135, "memory(GiB)": 21.48, "step": 15079, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.48988077835168764, "grad_norm": 0.4460330307483673, "learning_rate": 5.5796247609160856e-06, "loss": 0.024509239941835403, "memory(GiB)": 21.48, "step": 15080, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.953827 }, { "epoch": 0.48991326381444306, "grad_norm": 0.43159642815589905, "learning_rate": 5.579091226136003e-06, "loss": 0.02467365749180317, "memory(GiB)": 21.48, "step": 15081, "token_acc": 1.0, "train_speed(iter/s)": 0.953838 }, { "epoch": 0.4899457492771985, "grad_norm": 0.3453650176525116, "learning_rate": 5.57855768467244e-06, "loss": 0.019856374710798264, "memory(GiB)": 21.48, "step": 15082, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953849 }, { "epoch": 0.4899782347399539, "grad_norm": 0.32990914583206177, "learning_rate": 5.578024136531549e-06, "loss": 0.022188644856214523, "memory(GiB)": 21.48, "step": 15083, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.4900107202027093, "grad_norm": 0.9623172879219055, "learning_rate": 5.577490581719491e-06, "loss": 0.02969839610159397, "memory(GiB)": 21.48, "step": 15084, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.95387 }, { "epoch": 0.4900432056654647, "grad_norm": 0.41498538851737976, "learning_rate": 5.576957020242424e-06, "loss": 0.02403545007109642, "memory(GiB)": 21.48, "step": 15085, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95388 }, { "epoch": 0.49007569112822014, "grad_norm": 0.32621920108795166, "learning_rate": 5.576423452106503e-06, "loss": 0.017808755859732628, "memory(GiB)": 21.48, "step": 15086, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.953891 }, { "epoch": 0.49010817659097555, "grad_norm": 0.32177433371543884, "learning_rate": 5.5758898773178905e-06, "loss": 0.020706437528133392, "memory(GiB)": 21.48, "step": 15087, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.953903 }, { "epoch": 0.49014066205373097, "grad_norm": 0.49189504981040955, "learning_rate": 5.57535629588274e-06, "loss": 0.02295943722128868, "memory(GiB)": 21.48, "step": 15088, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.953914 }, { "epoch": 0.4901731475164864, "grad_norm": 0.5140026807785034, "learning_rate": 5.574822707807214e-06, "loss": 0.03221257030963898, "memory(GiB)": 21.48, "step": 15089, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.953926 }, { "epoch": 0.4902056329792418, "grad_norm": 0.3780464231967926, "learning_rate": 5.574289113097467e-06, "loss": 0.019741468131542206, "memory(GiB)": 21.48, "step": 15090, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.953938 }, { "epoch": 0.4902381184419972, "grad_norm": 0.40264979004859924, "learning_rate": 5.573755511759661e-06, "loss": 0.01978253945708275, "memory(GiB)": 21.48, "step": 15091, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.953952 }, { "epoch": 0.49027060390475263, "grad_norm": 0.5724150538444519, "learning_rate": 5.573221903799951e-06, "loss": 0.03297834098339081, "memory(GiB)": 21.48, "step": 15092, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.953965 }, { "epoch": 0.49030308936750805, "grad_norm": 0.3394615054130554, "learning_rate": 5.5726882892244995e-06, "loss": 0.023929834365844727, "memory(GiB)": 21.48, "step": 15093, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953978 }, { "epoch": 0.49033557483026347, "grad_norm": 0.37201038002967834, "learning_rate": 5.5721546680394625e-06, "loss": 0.020818542689085007, "memory(GiB)": 21.48, "step": 15094, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.953991 }, { "epoch": 0.4903680602930189, "grad_norm": 0.35416385531425476, "learning_rate": 5.571621040250997e-06, "loss": 0.021138988435268402, "memory(GiB)": 21.48, "step": 15095, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.954004 }, { "epoch": 0.4904005457557743, "grad_norm": 0.3629145920276642, "learning_rate": 5.571087405865264e-06, "loss": 0.02023174613714218, "memory(GiB)": 21.48, "step": 15096, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.954018 }, { "epoch": 0.4904330312185297, "grad_norm": 0.27182015776634216, "learning_rate": 5.570553764888423e-06, "loss": 0.0200355164706707, "memory(GiB)": 21.48, "step": 15097, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.954031 }, { "epoch": 0.49046551668128513, "grad_norm": 0.2822687327861786, "learning_rate": 5.570020117326633e-06, "loss": 0.016928309574723244, "memory(GiB)": 21.48, "step": 15098, "token_acc": 0.995, "train_speed(iter/s)": 0.954044 }, { "epoch": 0.49049800214404055, "grad_norm": 0.3982015550136566, "learning_rate": 5.569486463186052e-06, "loss": 0.018710598349571228, "memory(GiB)": 21.48, "step": 15099, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.49053048760679596, "grad_norm": 0.39054223895072937, "learning_rate": 5.568952802472839e-06, "loss": 0.020059064030647278, "memory(GiB)": 21.48, "step": 15100, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954071 }, { "epoch": 0.4905629730695514, "grad_norm": 0.4239155054092407, "learning_rate": 5.5684191351931536e-06, "loss": 0.02489408478140831, "memory(GiB)": 21.48, "step": 15101, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954085 }, { "epoch": 0.4905954585323068, "grad_norm": 0.3243158161640167, "learning_rate": 5.5678854613531545e-06, "loss": 0.023703522980213165, "memory(GiB)": 21.48, "step": 15102, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.954098 }, { "epoch": 0.4906279439950622, "grad_norm": 0.36079367995262146, "learning_rate": 5.567351780959002e-06, "loss": 0.02696685492992401, "memory(GiB)": 21.48, "step": 15103, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954112 }, { "epoch": 0.4906604294578176, "grad_norm": 0.34720271825790405, "learning_rate": 5.566818094016853e-06, "loss": 0.01852487400174141, "memory(GiB)": 21.48, "step": 15104, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954125 }, { "epoch": 0.49069291492057304, "grad_norm": 0.32369565963745117, "learning_rate": 5.56628440053287e-06, "loss": 0.02034473419189453, "memory(GiB)": 21.48, "step": 15105, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.954139 }, { "epoch": 0.49072540038332846, "grad_norm": 0.36922016739845276, "learning_rate": 5.565750700513211e-06, "loss": 0.02662140503525734, "memory(GiB)": 21.48, "step": 15106, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.954152 }, { "epoch": 0.4907578858460839, "grad_norm": 0.38289445638656616, "learning_rate": 5.565216993964037e-06, "loss": 0.026990246027708054, "memory(GiB)": 21.48, "step": 15107, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954165 }, { "epoch": 0.4907903713088393, "grad_norm": 0.28417468070983887, "learning_rate": 5.564683280891505e-06, "loss": 0.016251657158136368, "memory(GiB)": 21.48, "step": 15108, "token_acc": 1.0, "train_speed(iter/s)": 0.954179 }, { "epoch": 0.4908228567715947, "grad_norm": 0.2572460174560547, "learning_rate": 5.564149561301777e-06, "loss": 0.0195756908506155, "memory(GiB)": 21.48, "step": 15109, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.954192 }, { "epoch": 0.4908553422343501, "grad_norm": 0.3229568898677826, "learning_rate": 5.563615835201013e-06, "loss": 0.022082442417740822, "memory(GiB)": 21.48, "step": 15110, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.954206 }, { "epoch": 0.49088782769710554, "grad_norm": 0.31733110547065735, "learning_rate": 5.56308210259537e-06, "loss": 0.013941662386059761, "memory(GiB)": 21.48, "step": 15111, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954218 }, { "epoch": 0.49092031315986095, "grad_norm": 0.4250389635562897, "learning_rate": 5.5625483634910105e-06, "loss": 0.02568740025162697, "memory(GiB)": 21.48, "step": 15112, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.954227 }, { "epoch": 0.49095279862261637, "grad_norm": 0.3922876715660095, "learning_rate": 5.562014617894096e-06, "loss": 0.022170860320329666, "memory(GiB)": 21.48, "step": 15113, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954238 }, { "epoch": 0.4909852840853718, "grad_norm": 0.31185656785964966, "learning_rate": 5.561480865810783e-06, "loss": 0.023666929453611374, "memory(GiB)": 21.48, "step": 15114, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.954249 }, { "epoch": 0.4910177695481272, "grad_norm": 0.2646479308605194, "learning_rate": 5.560947107247234e-06, "loss": 0.015232624486088753, "memory(GiB)": 21.48, "step": 15115, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95426 }, { "epoch": 0.4910502550108826, "grad_norm": 0.3825088441371918, "learning_rate": 5.56041334220961e-06, "loss": 0.02585430257022381, "memory(GiB)": 21.48, "step": 15116, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.954268 }, { "epoch": 0.49108274047363804, "grad_norm": 0.3673606216907501, "learning_rate": 5.559879570704068e-06, "loss": 0.024445712566375732, "memory(GiB)": 21.48, "step": 15117, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.954278 }, { "epoch": 0.49111522593639345, "grad_norm": 0.3201458156108856, "learning_rate": 5.5593457927367725e-06, "loss": 0.019431093707680702, "memory(GiB)": 21.48, "step": 15118, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954286 }, { "epoch": 0.49114771139914887, "grad_norm": 0.42472827434539795, "learning_rate": 5.558812008313881e-06, "loss": 0.01983567327260971, "memory(GiB)": 21.48, "step": 15119, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.954295 }, { "epoch": 0.4911801968619043, "grad_norm": 0.2731395959854126, "learning_rate": 5.558278217441555e-06, "loss": 0.019215308129787445, "memory(GiB)": 21.48, "step": 15120, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.954305 }, { "epoch": 0.4912126823246597, "grad_norm": 0.4064192473888397, "learning_rate": 5.5577444201259554e-06, "loss": 0.032576654106378555, "memory(GiB)": 21.48, "step": 15121, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954315 }, { "epoch": 0.4912451677874151, "grad_norm": 0.3756966292858124, "learning_rate": 5.5572106163732445e-06, "loss": 0.02136288583278656, "memory(GiB)": 21.48, "step": 15122, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954326 }, { "epoch": 0.49127765325017053, "grad_norm": 0.3274291455745697, "learning_rate": 5.556676806189579e-06, "loss": 0.025574591010808945, "memory(GiB)": 21.48, "step": 15123, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954335 }, { "epoch": 0.49131013871292595, "grad_norm": 0.25404903292655945, "learning_rate": 5.556142989581124e-06, "loss": 0.02064828760921955, "memory(GiB)": 21.48, "step": 15124, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954344 }, { "epoch": 0.49134262417568136, "grad_norm": 0.354144811630249, "learning_rate": 5.555609166554037e-06, "loss": 0.0303490050137043, "memory(GiB)": 21.48, "step": 15125, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.954354 }, { "epoch": 0.4913751096384368, "grad_norm": 0.4322483539581299, "learning_rate": 5.555075337114483e-06, "loss": 0.012796923518180847, "memory(GiB)": 21.48, "step": 15126, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954364 }, { "epoch": 0.4914075951011922, "grad_norm": 0.26038309931755066, "learning_rate": 5.5545415012686175e-06, "loss": 0.02168494090437889, "memory(GiB)": 21.48, "step": 15127, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954374 }, { "epoch": 0.4914400805639476, "grad_norm": 0.298231840133667, "learning_rate": 5.554007659022607e-06, "loss": 0.018692150712013245, "memory(GiB)": 21.48, "step": 15128, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.954384 }, { "epoch": 0.49147256602670303, "grad_norm": 0.3358255922794342, "learning_rate": 5.5534738103826115e-06, "loss": 0.027598362416028976, "memory(GiB)": 21.48, "step": 15129, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954394 }, { "epoch": 0.49150505148945844, "grad_norm": 0.8437190055847168, "learning_rate": 5.55293995535479e-06, "loss": 0.025524985045194626, "memory(GiB)": 21.48, "step": 15130, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.954404 }, { "epoch": 0.49153753695221386, "grad_norm": 0.4032745957374573, "learning_rate": 5.552406093945306e-06, "loss": 0.025553835555911064, "memory(GiB)": 21.48, "step": 15131, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.954414 }, { "epoch": 0.4915700224149693, "grad_norm": 0.29623934626579285, "learning_rate": 5.5518722261603205e-06, "loss": 0.016661925241351128, "memory(GiB)": 21.48, "step": 15132, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.4916025078777247, "grad_norm": 0.3438258171081543, "learning_rate": 5.551338352005996e-06, "loss": 0.022472290322184563, "memory(GiB)": 21.48, "step": 15133, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.954436 }, { "epoch": 0.4916349933404801, "grad_norm": 0.4360395073890686, "learning_rate": 5.550804471488492e-06, "loss": 0.0203096941113472, "memory(GiB)": 21.48, "step": 15134, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954447 }, { "epoch": 0.4916674788032355, "grad_norm": 0.4364252984523773, "learning_rate": 5.550270584613972e-06, "loss": 0.022989308461546898, "memory(GiB)": 21.48, "step": 15135, "token_acc": 1.0, "train_speed(iter/s)": 0.954457 }, { "epoch": 0.491699964265991, "grad_norm": 0.376998633146286, "learning_rate": 5.5497366913885965e-06, "loss": 0.016715634614229202, "memory(GiB)": 21.48, "step": 15136, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954468 }, { "epoch": 0.4917324497287464, "grad_norm": 0.4674493372440338, "learning_rate": 5.549202791818527e-06, "loss": 0.03258368745446205, "memory(GiB)": 21.48, "step": 15137, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954479 }, { "epoch": 0.49176493519150183, "grad_norm": 0.4112469255924225, "learning_rate": 5.548668885909928e-06, "loss": 0.028747905045747757, "memory(GiB)": 21.48, "step": 15138, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.95449 }, { "epoch": 0.49179742065425724, "grad_norm": 0.30832216143608093, "learning_rate": 5.548134973668959e-06, "loss": 0.01914774253964424, "memory(GiB)": 21.48, "step": 15139, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.954502 }, { "epoch": 0.49182990611701266, "grad_norm": 0.4050219655036926, "learning_rate": 5.547601055101782e-06, "loss": 0.021342622116208076, "memory(GiB)": 21.48, "step": 15140, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954513 }, { "epoch": 0.4918623915797681, "grad_norm": 0.23144233226776123, "learning_rate": 5.547067130214561e-06, "loss": 0.014453760348260403, "memory(GiB)": 21.48, "step": 15141, "token_acc": 1.0, "train_speed(iter/s)": 0.954523 }, { "epoch": 0.4918948770425235, "grad_norm": 0.3713054358959198, "learning_rate": 5.546533199013456e-06, "loss": 0.022532034665346146, "memory(GiB)": 21.48, "step": 15142, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.954535 }, { "epoch": 0.4919273625052789, "grad_norm": 0.34743431210517883, "learning_rate": 5.545999261504632e-06, "loss": 0.02711383067071438, "memory(GiB)": 21.48, "step": 15143, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954546 }, { "epoch": 0.4919598479680343, "grad_norm": 0.35150444507598877, "learning_rate": 5.5454653176942485e-06, "loss": 0.023521512746810913, "memory(GiB)": 21.48, "step": 15144, "token_acc": 1.0, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.49199233343078974, "grad_norm": 0.31759029626846313, "learning_rate": 5.544931367588471e-06, "loss": 0.023968715220689774, "memory(GiB)": 21.48, "step": 15145, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.954569 }, { "epoch": 0.49202481889354516, "grad_norm": 0.3104504346847534, "learning_rate": 5.544397411193457e-06, "loss": 0.01591741107404232, "memory(GiB)": 21.48, "step": 15146, "token_acc": 1.0, "train_speed(iter/s)": 0.954581 }, { "epoch": 0.4920573043563006, "grad_norm": 0.40900251269340515, "learning_rate": 5.5438634485153754e-06, "loss": 0.023499745875597, "memory(GiB)": 21.48, "step": 15147, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954591 }, { "epoch": 0.492089789819056, "grad_norm": 0.33557313680648804, "learning_rate": 5.543329479560384e-06, "loss": 0.01771562546491623, "memory(GiB)": 21.48, "step": 15148, "token_acc": 1.0, "train_speed(iter/s)": 0.954601 }, { "epoch": 0.4921222752818114, "grad_norm": 0.3661726117134094, "learning_rate": 5.5427955043346495e-06, "loss": 0.02017110027372837, "memory(GiB)": 21.48, "step": 15149, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954611 }, { "epoch": 0.4921547607445668, "grad_norm": 0.383263498544693, "learning_rate": 5.542261522844331e-06, "loss": 0.022734936326742172, "memory(GiB)": 21.48, "step": 15150, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954621 }, { "epoch": 0.49218724620732224, "grad_norm": 0.25482240319252014, "learning_rate": 5.541727535095592e-06, "loss": 0.016551384702324867, "memory(GiB)": 21.48, "step": 15151, "token_acc": 1.0, "train_speed(iter/s)": 0.954632 }, { "epoch": 0.49221973167007765, "grad_norm": 0.49417635798454285, "learning_rate": 5.541193541094597e-06, "loss": 0.01540694385766983, "memory(GiB)": 21.48, "step": 15152, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954643 }, { "epoch": 0.49225221713283307, "grad_norm": 0.4870269298553467, "learning_rate": 5.5406595408475085e-06, "loss": 0.020952757447957993, "memory(GiB)": 21.48, "step": 15153, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.954654 }, { "epoch": 0.4922847025955885, "grad_norm": 0.40276724100112915, "learning_rate": 5.540125534360489e-06, "loss": 0.017200540751218796, "memory(GiB)": 21.48, "step": 15154, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954666 }, { "epoch": 0.4923171880583439, "grad_norm": 0.24625957012176514, "learning_rate": 5.539591521639701e-06, "loss": 0.015874527394771576, "memory(GiB)": 21.48, "step": 15155, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.954679 }, { "epoch": 0.4923496735210993, "grad_norm": 0.37987032532691956, "learning_rate": 5.53905750269131e-06, "loss": 0.025856448337435722, "memory(GiB)": 21.48, "step": 15156, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954692 }, { "epoch": 0.49238215898385473, "grad_norm": 0.3771153688430786, "learning_rate": 5.538523477521477e-06, "loss": 0.019810084253549576, "memory(GiB)": 21.48, "step": 15157, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954706 }, { "epoch": 0.49241464444661015, "grad_norm": 0.2894471287727356, "learning_rate": 5.537989446136366e-06, "loss": 0.020045824348926544, "memory(GiB)": 21.48, "step": 15158, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954717 }, { "epoch": 0.49244712990936557, "grad_norm": 0.29927805066108704, "learning_rate": 5.537455408542142e-06, "loss": 0.02156095951795578, "memory(GiB)": 21.48, "step": 15159, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.492479615372121, "grad_norm": 0.3884301483631134, "learning_rate": 5.536921364744966e-06, "loss": 0.027568984776735306, "memory(GiB)": 21.48, "step": 15160, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954743 }, { "epoch": 0.4925121008348764, "grad_norm": 0.3132793605327606, "learning_rate": 5.5363873147510035e-06, "loss": 0.0239938423037529, "memory(GiB)": 21.48, "step": 15161, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.4925445862976318, "grad_norm": 0.27447351813316345, "learning_rate": 5.535853258566417e-06, "loss": 0.017324715852737427, "memory(GiB)": 21.48, "step": 15162, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95477 }, { "epoch": 0.49257707176038723, "grad_norm": 0.42666196823120117, "learning_rate": 5.535319196197372e-06, "loss": 0.021544883027672768, "memory(GiB)": 21.48, "step": 15163, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954783 }, { "epoch": 0.49260955722314265, "grad_norm": 0.44859278202056885, "learning_rate": 5.53478512765003e-06, "loss": 0.030631856992840767, "memory(GiB)": 21.48, "step": 15164, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.954797 }, { "epoch": 0.49264204268589806, "grad_norm": 0.3605421781539917, "learning_rate": 5.534251052930555e-06, "loss": 0.02464887499809265, "memory(GiB)": 21.48, "step": 15165, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954809 }, { "epoch": 0.4926745281486535, "grad_norm": 0.4683056175708771, "learning_rate": 5.533716972045113e-06, "loss": 0.029978176578879356, "memory(GiB)": 21.48, "step": 15166, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.4927070136114089, "grad_norm": 0.28443777561187744, "learning_rate": 5.533182884999866e-06, "loss": 0.018636692315340042, "memory(GiB)": 21.48, "step": 15167, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954836 }, { "epoch": 0.4927394990741643, "grad_norm": 0.35466283559799194, "learning_rate": 5.53264879180098e-06, "loss": 0.022626571357250214, "memory(GiB)": 21.48, "step": 15168, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.4927719845369197, "grad_norm": 0.3477902114391327, "learning_rate": 5.5321146924546164e-06, "loss": 0.01950691267848015, "memory(GiB)": 21.48, "step": 15169, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.49280446999967514, "grad_norm": 0.37573927640914917, "learning_rate": 5.5315805869669425e-06, "loss": 0.025341087952256203, "memory(GiB)": 21.48, "step": 15170, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.954877 }, { "epoch": 0.49283695546243056, "grad_norm": 0.4832330644130707, "learning_rate": 5.531046475344119e-06, "loss": 0.02141837775707245, "memory(GiB)": 21.48, "step": 15171, "token_acc": 1.0, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.492869440925186, "grad_norm": 0.3373444676399231, "learning_rate": 5.5305123575923145e-06, "loss": 0.024602318182587624, "memory(GiB)": 21.48, "step": 15172, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954895 }, { "epoch": 0.4929019263879414, "grad_norm": 0.3735027313232422, "learning_rate": 5.529978233717691e-06, "loss": 0.02427741140127182, "memory(GiB)": 21.48, "step": 15173, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.4929344118506968, "grad_norm": 0.38785871863365173, "learning_rate": 5.529444103726412e-06, "loss": 0.020107364282011986, "memory(GiB)": 21.48, "step": 15174, "token_acc": 1.0, "train_speed(iter/s)": 0.954915 }, { "epoch": 0.4929668973134522, "grad_norm": 0.37772950530052185, "learning_rate": 5.528909967624645e-06, "loss": 0.021152539178729057, "memory(GiB)": 21.48, "step": 15175, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954925 }, { "epoch": 0.49299938277620764, "grad_norm": 0.35763460397720337, "learning_rate": 5.528375825418551e-06, "loss": 0.019687319174408913, "memory(GiB)": 21.48, "step": 15176, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.954935 }, { "epoch": 0.49303186823896306, "grad_norm": 0.42177727818489075, "learning_rate": 5.527841677114299e-06, "loss": 0.024647224694490433, "memory(GiB)": 21.48, "step": 15177, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954947 }, { "epoch": 0.49306435370171847, "grad_norm": 0.39086267352104187, "learning_rate": 5.5273075227180506e-06, "loss": 0.026702292263507843, "memory(GiB)": 21.48, "step": 15178, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.4930968391644739, "grad_norm": 0.413303405046463, "learning_rate": 5.5267733622359706e-06, "loss": 0.021113714203238487, "memory(GiB)": 21.48, "step": 15179, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.954966 }, { "epoch": 0.4931293246272293, "grad_norm": 0.4530164897441864, "learning_rate": 5.526239195674225e-06, "loss": 0.020784441381692886, "memory(GiB)": 21.48, "step": 15180, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.4931618100899847, "grad_norm": 0.43359726667404175, "learning_rate": 5.52570502303898e-06, "loss": 0.02368713542819023, "memory(GiB)": 21.48, "step": 15181, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.954988 }, { "epoch": 0.49319429555274014, "grad_norm": 0.31941473484039307, "learning_rate": 5.5251708443364e-06, "loss": 0.022394362837076187, "memory(GiB)": 21.48, "step": 15182, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954998 }, { "epoch": 0.49322678101549555, "grad_norm": 0.3081487715244293, "learning_rate": 5.5246366595726465e-06, "loss": 0.02005346305668354, "memory(GiB)": 21.48, "step": 15183, "token_acc": 0.9770992366412213, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.49325926647825097, "grad_norm": 0.43449732661247253, "learning_rate": 5.524102468753889e-06, "loss": 0.027954034507274628, "memory(GiB)": 21.48, "step": 15184, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.4932917519410064, "grad_norm": 0.3654243052005768, "learning_rate": 5.523568271886292e-06, "loss": 0.019592322409152985, "memory(GiB)": 21.48, "step": 15185, "token_acc": 0.9875, "train_speed(iter/s)": 0.955028 }, { "epoch": 0.4933242374037618, "grad_norm": 0.4066922068595886, "learning_rate": 5.52303406897602e-06, "loss": 0.027789704501628876, "memory(GiB)": 21.48, "step": 15186, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.955038 }, { "epoch": 0.4933567228665172, "grad_norm": 0.9036153554916382, "learning_rate": 5.522499860029239e-06, "loss": 0.018980637192726135, "memory(GiB)": 21.48, "step": 15187, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.49338920832927263, "grad_norm": 0.4598066508769989, "learning_rate": 5.521965645052113e-06, "loss": 0.0253098476678133, "memory(GiB)": 21.48, "step": 15188, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95506 }, { "epoch": 0.49342169379202805, "grad_norm": 0.3776674270629883, "learning_rate": 5.521431424050809e-06, "loss": 0.021095938980579376, "memory(GiB)": 21.48, "step": 15189, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95507 }, { "epoch": 0.49345417925478346, "grad_norm": 0.4498048424720764, "learning_rate": 5.5208971970314925e-06, "loss": 0.024130601435899734, "memory(GiB)": 21.48, "step": 15190, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955081 }, { "epoch": 0.4934866647175389, "grad_norm": 0.43234962224960327, "learning_rate": 5.52036296400033e-06, "loss": 0.021655678749084473, "memory(GiB)": 21.48, "step": 15191, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.4935191501802943, "grad_norm": 0.2962404191493988, "learning_rate": 5.519828724963486e-06, "loss": 0.015797391533851624, "memory(GiB)": 21.48, "step": 15192, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.4935516356430497, "grad_norm": 0.6263920664787292, "learning_rate": 5.519294479927126e-06, "loss": 0.017820095643401146, "memory(GiB)": 21.48, "step": 15193, "token_acc": 1.0, "train_speed(iter/s)": 0.955114 }, { "epoch": 0.49358412110580513, "grad_norm": 0.30580517649650574, "learning_rate": 5.518760228897418e-06, "loss": 0.016774747520685196, "memory(GiB)": 21.48, "step": 15194, "token_acc": 1.0, "train_speed(iter/s)": 0.955122 }, { "epoch": 0.49361660656856055, "grad_norm": 0.39015406370162964, "learning_rate": 5.518225971880524e-06, "loss": 0.02105611190199852, "memory(GiB)": 21.48, "step": 15195, "token_acc": 1.0, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.49364909203131596, "grad_norm": 0.375323086977005, "learning_rate": 5.517691708882614e-06, "loss": 0.02075047418475151, "memory(GiB)": 21.48, "step": 15196, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.4936815774940714, "grad_norm": 0.30183637142181396, "learning_rate": 5.517157439909851e-06, "loss": 0.021096795797348022, "memory(GiB)": 21.48, "step": 15197, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955154 }, { "epoch": 0.4937140629568268, "grad_norm": 0.3498413860797882, "learning_rate": 5.5166231649684056e-06, "loss": 0.017277084290981293, "memory(GiB)": 21.48, "step": 15198, "token_acc": 1.0, "train_speed(iter/s)": 0.955166 }, { "epoch": 0.4937465484195822, "grad_norm": 0.33923545479774475, "learning_rate": 5.516088884064439e-06, "loss": 0.019781529903411865, "memory(GiB)": 21.48, "step": 15199, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.4937790338823377, "grad_norm": 0.416262149810791, "learning_rate": 5.515554597204121e-06, "loss": 0.01879451423883438, "memory(GiB)": 21.48, "step": 15200, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.4938115193450931, "grad_norm": 0.5262521505355835, "learning_rate": 5.515020304393618e-06, "loss": 0.030267108231782913, "memory(GiB)": 21.48, "step": 15201, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955196 }, { "epoch": 0.4938440048078485, "grad_norm": 0.38427770137786865, "learning_rate": 5.5144860056390925e-06, "loss": 0.025291550904512405, "memory(GiB)": 21.48, "step": 15202, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.49387649027060393, "grad_norm": 0.3871684968471527, "learning_rate": 5.513951700946716e-06, "loss": 0.022346530109643936, "memory(GiB)": 21.48, "step": 15203, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.49390897573335935, "grad_norm": 0.49981021881103516, "learning_rate": 5.5134173903226495e-06, "loss": 0.025106381624937057, "memory(GiB)": 21.48, "step": 15204, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955226 }, { "epoch": 0.49394146119611476, "grad_norm": 0.4055338203907013, "learning_rate": 5.512883073773065e-06, "loss": 0.020164061337709427, "memory(GiB)": 21.48, "step": 15205, "token_acc": 1.0, "train_speed(iter/s)": 0.955237 }, { "epoch": 0.4939739466588702, "grad_norm": 0.3468916118144989, "learning_rate": 5.512348751304128e-06, "loss": 0.023102179169654846, "memory(GiB)": 21.48, "step": 15206, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.4940064321216256, "grad_norm": 1.0089242458343506, "learning_rate": 5.511814422922004e-06, "loss": 0.02916778437793255, "memory(GiB)": 21.48, "step": 15207, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955258 }, { "epoch": 0.494038917584381, "grad_norm": 0.41830974817276, "learning_rate": 5.51128008863286e-06, "loss": 0.02429497241973877, "memory(GiB)": 21.48, "step": 15208, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955269 }, { "epoch": 0.4940714030471364, "grad_norm": 0.45130330324172974, "learning_rate": 5.510745748442863e-06, "loss": 0.035799480974674225, "memory(GiB)": 21.48, "step": 15209, "token_acc": 0.996, "train_speed(iter/s)": 0.955281 }, { "epoch": 0.49410388850989184, "grad_norm": 0.4235750734806061, "learning_rate": 5.510211402358181e-06, "loss": 0.025144588202238083, "memory(GiB)": 21.48, "step": 15210, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.49413637397264726, "grad_norm": 0.35435834527015686, "learning_rate": 5.509677050384979e-06, "loss": 0.026055846363306046, "memory(GiB)": 21.48, "step": 15211, "token_acc": 0.9817073170731707, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.4941688594354027, "grad_norm": 0.4277518689632416, "learning_rate": 5.509142692529427e-06, "loss": 0.02677353098988533, "memory(GiB)": 21.48, "step": 15212, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955311 }, { "epoch": 0.4942013448981581, "grad_norm": 0.342575341463089, "learning_rate": 5.50860832879769e-06, "loss": 0.02210160717368126, "memory(GiB)": 21.48, "step": 15213, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.955322 }, { "epoch": 0.4942338303609135, "grad_norm": 0.41234445571899414, "learning_rate": 5.5080739591959364e-06, "loss": 0.02021031454205513, "memory(GiB)": 21.48, "step": 15214, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.4942663158236689, "grad_norm": 0.36193710565567017, "learning_rate": 5.5075395837303325e-06, "loss": 0.022400666028261185, "memory(GiB)": 21.48, "step": 15215, "token_acc": 1.0, "train_speed(iter/s)": 0.955344 }, { "epoch": 0.49429880128642434, "grad_norm": 0.4283778667449951, "learning_rate": 5.507005202407047e-06, "loss": 0.020067306235432625, "memory(GiB)": 21.48, "step": 15216, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.49433128674917975, "grad_norm": 0.3236580789089203, "learning_rate": 5.506470815232246e-06, "loss": 0.01889592781662941, "memory(GiB)": 21.48, "step": 15217, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.49436377221193517, "grad_norm": 0.4295044541358948, "learning_rate": 5.505936422212097e-06, "loss": 0.024291764944791794, "memory(GiB)": 21.48, "step": 15218, "token_acc": 0.9966442953020134, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.4943962576746906, "grad_norm": 0.2729284167289734, "learning_rate": 5.50540202335277e-06, "loss": 0.023726822808384895, "memory(GiB)": 21.48, "step": 15219, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.494428743137446, "grad_norm": 0.31829676032066345, "learning_rate": 5.504867618660429e-06, "loss": 0.017951110377907753, "memory(GiB)": 21.48, "step": 15220, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.4944612286002014, "grad_norm": 0.3001404106616974, "learning_rate": 5.504333208141244e-06, "loss": 0.01809687539935112, "memory(GiB)": 21.48, "step": 15221, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.49449371406295684, "grad_norm": 0.3572012186050415, "learning_rate": 5.503798791801384e-06, "loss": 0.018507283180952072, "memory(GiB)": 21.48, "step": 15222, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.49452619952571225, "grad_norm": 0.3869667053222656, "learning_rate": 5.503264369647013e-06, "loss": 0.02113288640975952, "memory(GiB)": 21.48, "step": 15223, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.49455868498846767, "grad_norm": 0.641620934009552, "learning_rate": 5.5027299416843035e-06, "loss": 0.023475676774978638, "memory(GiB)": 21.48, "step": 15224, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.4945911704512231, "grad_norm": 0.29638850688934326, "learning_rate": 5.502195507919421e-06, "loss": 0.016982775181531906, "memory(GiB)": 21.48, "step": 15225, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955474 }, { "epoch": 0.4946236559139785, "grad_norm": 0.27871596813201904, "learning_rate": 5.501661068358534e-06, "loss": 0.013320709578692913, "memory(GiB)": 21.48, "step": 15226, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.4946561413767339, "grad_norm": 0.3077317178249359, "learning_rate": 5.50112662300781e-06, "loss": 0.015357249416410923, "memory(GiB)": 21.48, "step": 15227, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955499 }, { "epoch": 0.49468862683948933, "grad_norm": 0.663872241973877, "learning_rate": 5.500592171873418e-06, "loss": 0.03290893882513046, "memory(GiB)": 21.48, "step": 15228, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.49472111230224475, "grad_norm": 0.36703401803970337, "learning_rate": 5.500057714961526e-06, "loss": 0.0174606591463089, "memory(GiB)": 21.48, "step": 15229, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.49475359776500016, "grad_norm": 0.366677463054657, "learning_rate": 5.4995232522783025e-06, "loss": 0.01695382408797741, "memory(GiB)": 21.48, "step": 15230, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.4947860832277556, "grad_norm": 0.3144901990890503, "learning_rate": 5.498988783829917e-06, "loss": 0.01635335013270378, "memory(GiB)": 21.48, "step": 15231, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.494818568690511, "grad_norm": 0.37142932415008545, "learning_rate": 5.498454309622534e-06, "loss": 0.019244477152824402, "memory(GiB)": 21.48, "step": 15232, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.955555 }, { "epoch": 0.4948510541532664, "grad_norm": 0.34721139073371887, "learning_rate": 5.497919829662327e-06, "loss": 0.017745910212397575, "memory(GiB)": 21.48, "step": 15233, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.49488353961602183, "grad_norm": 0.33106204867362976, "learning_rate": 5.497385343955462e-06, "loss": 0.020464999601244926, "memory(GiB)": 21.48, "step": 15234, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.49491602507877724, "grad_norm": 0.4190961420536041, "learning_rate": 5.496850852508107e-06, "loss": 0.01615155301988125, "memory(GiB)": 21.48, "step": 15235, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.49494851054153266, "grad_norm": 0.2514367699623108, "learning_rate": 5.496316355326433e-06, "loss": 0.01423042081296444, "memory(GiB)": 21.48, "step": 15236, "token_acc": 1.0, "train_speed(iter/s)": 0.955596 }, { "epoch": 0.4949809960042881, "grad_norm": 0.472265362739563, "learning_rate": 5.495781852416607e-06, "loss": 0.021542536094784737, "memory(GiB)": 21.48, "step": 15237, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.4950134814670435, "grad_norm": 0.5449764132499695, "learning_rate": 5.4952473437847996e-06, "loss": 0.024656038731336594, "memory(GiB)": 21.48, "step": 15238, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.4950459669297989, "grad_norm": 0.5217713713645935, "learning_rate": 5.494712829437177e-06, "loss": 0.0312737338244915, "memory(GiB)": 21.48, "step": 15239, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.4950784523925543, "grad_norm": 0.44259434938430786, "learning_rate": 5.494178309379912e-06, "loss": 0.028362536802887917, "memory(GiB)": 21.48, "step": 15240, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.49511093785530974, "grad_norm": 0.3924392759799957, "learning_rate": 5.49364378361917e-06, "loss": 0.0239680428057909, "memory(GiB)": 21.48, "step": 15241, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955645 }, { "epoch": 0.49514342331806516, "grad_norm": 0.8410825729370117, "learning_rate": 5.493109252161123e-06, "loss": 0.026810240000486374, "memory(GiB)": 21.48, "step": 15242, "token_acc": 1.0, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.4951759087808206, "grad_norm": 0.44431939721107483, "learning_rate": 5.492574715011937e-06, "loss": 0.02308090403676033, "memory(GiB)": 21.48, "step": 15243, "token_acc": 1.0, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.495208394243576, "grad_norm": 0.31430479884147644, "learning_rate": 5.492040172177785e-06, "loss": 0.01665556989610195, "memory(GiB)": 21.48, "step": 15244, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955675 }, { "epoch": 0.4952408797063314, "grad_norm": 0.458048015832901, "learning_rate": 5.491505623664835e-06, "loss": 0.020408278331160545, "memory(GiB)": 21.48, "step": 15245, "token_acc": 0.9904153354632588, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.4952733651690868, "grad_norm": 0.32919609546661377, "learning_rate": 5.490971069479254e-06, "loss": 0.01975695788860321, "memory(GiB)": 21.48, "step": 15246, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.49530585063184224, "grad_norm": 0.699309229850769, "learning_rate": 5.490436509627214e-06, "loss": 0.03002629056572914, "memory(GiB)": 21.48, "step": 15247, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.49533833609459765, "grad_norm": 0.354406476020813, "learning_rate": 5.489901944114884e-06, "loss": 0.020035449415445328, "memory(GiB)": 21.48, "step": 15248, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955715 }, { "epoch": 0.49537082155735307, "grad_norm": 0.3467535972595215, "learning_rate": 5.489367372948433e-06, "loss": 0.01798446662724018, "memory(GiB)": 21.48, "step": 15249, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955726 }, { "epoch": 0.4954033070201085, "grad_norm": 0.3074810802936554, "learning_rate": 5.488832796134032e-06, "loss": 0.02065384015440941, "memory(GiB)": 21.48, "step": 15250, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.4954357924828639, "grad_norm": 0.4229792654514313, "learning_rate": 5.488298213677848e-06, "loss": 0.026830049231648445, "memory(GiB)": 21.48, "step": 15251, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955754 }, { "epoch": 0.4954682779456193, "grad_norm": 0.35173115134239197, "learning_rate": 5.487763625586055e-06, "loss": 0.017010275274515152, "memory(GiB)": 21.48, "step": 15252, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.49550076340837473, "grad_norm": 0.247157022356987, "learning_rate": 5.48722903186482e-06, "loss": 0.013327404856681824, "memory(GiB)": 21.48, "step": 15253, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955779 }, { "epoch": 0.49553324887113015, "grad_norm": 0.3151703178882599, "learning_rate": 5.486694432520313e-06, "loss": 0.01567528024315834, "memory(GiB)": 21.48, "step": 15254, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955788 }, { "epoch": 0.49556573433388557, "grad_norm": 1.0748802423477173, "learning_rate": 5.486159827558705e-06, "loss": 0.017581135034561157, "memory(GiB)": 21.48, "step": 15255, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.495598219796641, "grad_norm": 0.3775542080402374, "learning_rate": 5.485625216986165e-06, "loss": 0.025507250800728798, "memory(GiB)": 21.48, "step": 15256, "token_acc": 0.9832402234636871, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.4956307052593964, "grad_norm": 0.3093922734260559, "learning_rate": 5.485090600808863e-06, "loss": 0.014935738407075405, "memory(GiB)": 21.48, "step": 15257, "token_acc": 1.0, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.4956631907221518, "grad_norm": 0.4330742359161377, "learning_rate": 5.484555979032971e-06, "loss": 0.014457209035754204, "memory(GiB)": 21.48, "step": 15258, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.49569567618490723, "grad_norm": 0.3047219216823578, "learning_rate": 5.484021351664659e-06, "loss": 0.020479222759604454, "memory(GiB)": 21.48, "step": 15259, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.49572816164766265, "grad_norm": 0.2519819140434265, "learning_rate": 5.483486718710095e-06, "loss": 0.01137307845056057, "memory(GiB)": 21.48, "step": 15260, "token_acc": 1.0, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.49576064711041806, "grad_norm": 0.34935125708580017, "learning_rate": 5.482952080175451e-06, "loss": 0.021672319620847702, "memory(GiB)": 21.48, "step": 15261, "token_acc": 1.0, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.4957931325731735, "grad_norm": 0.28073617815971375, "learning_rate": 5.4824174360668965e-06, "loss": 0.014567974954843521, "memory(GiB)": 21.48, "step": 15262, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.4958256180359289, "grad_norm": 0.3477644622325897, "learning_rate": 5.481882786390604e-06, "loss": 0.019955366849899292, "memory(GiB)": 21.48, "step": 15263, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.49585810349868437, "grad_norm": 0.24216945469379425, "learning_rate": 5.481348131152742e-06, "loss": 0.01399206928908825, "memory(GiB)": 21.48, "step": 15264, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955891 }, { "epoch": 0.4958905889614398, "grad_norm": 0.3356362581253052, "learning_rate": 5.480813470359482e-06, "loss": 0.024401405826210976, "memory(GiB)": 21.48, "step": 15265, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955902 }, { "epoch": 0.4959230744241952, "grad_norm": 0.5854240655899048, "learning_rate": 5.4802788040169945e-06, "loss": 0.02592357248067856, "memory(GiB)": 21.48, "step": 15266, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955912 }, { "epoch": 0.4959555598869506, "grad_norm": 0.38972896337509155, "learning_rate": 5.479744132131449e-06, "loss": 0.028406519442796707, "memory(GiB)": 21.48, "step": 15267, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955923 }, { "epoch": 0.49598804534970603, "grad_norm": 0.4311874806880951, "learning_rate": 5.479209454709019e-06, "loss": 0.02298717200756073, "memory(GiB)": 21.48, "step": 15268, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955934 }, { "epoch": 0.49602053081246145, "grad_norm": 0.3463499844074249, "learning_rate": 5.478674771755874e-06, "loss": 0.01968958228826523, "memory(GiB)": 21.48, "step": 15269, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.49605301627521686, "grad_norm": 0.4410755932331085, "learning_rate": 5.478140083278185e-06, "loss": 0.016530703753232956, "memory(GiB)": 21.48, "step": 15270, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955954 }, { "epoch": 0.4960855017379723, "grad_norm": 0.5995295643806458, "learning_rate": 5.477605389282124e-06, "loss": 0.025758400559425354, "memory(GiB)": 21.48, "step": 15271, "token_acc": 0.9832402234636871, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.4961179872007277, "grad_norm": 1.2342514991760254, "learning_rate": 5.47707068977386e-06, "loss": 0.030045032501220703, "memory(GiB)": 21.48, "step": 15272, "token_acc": 1.0, "train_speed(iter/s)": 0.955972 }, { "epoch": 0.4961504726634831, "grad_norm": 0.4636164605617523, "learning_rate": 5.476535984759566e-06, "loss": 0.02115577831864357, "memory(GiB)": 21.48, "step": 15273, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.4961829581262385, "grad_norm": 0.3135819435119629, "learning_rate": 5.476001274245412e-06, "loss": 0.01596943661570549, "memory(GiB)": 21.48, "step": 15274, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955991 }, { "epoch": 0.49621544358899394, "grad_norm": 0.35628044605255127, "learning_rate": 5.47546655823757e-06, "loss": 0.019726809114217758, "memory(GiB)": 21.48, "step": 15275, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.49624792905174936, "grad_norm": 0.27783724665641785, "learning_rate": 5.47493183674221e-06, "loss": 0.01473909243941307, "memory(GiB)": 21.48, "step": 15276, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.956012 }, { "epoch": 0.4962804145145048, "grad_norm": 0.31197819113731384, "learning_rate": 5.474397109765506e-06, "loss": 0.0237466711550951, "memory(GiB)": 21.48, "step": 15277, "token_acc": 1.0, "train_speed(iter/s)": 0.956024 }, { "epoch": 0.4963128999772602, "grad_norm": 0.38843879103660583, "learning_rate": 5.473862377313627e-06, "loss": 0.025785021483898163, "memory(GiB)": 21.48, "step": 15278, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956037 }, { "epoch": 0.4963453854400156, "grad_norm": 0.34338006377220154, "learning_rate": 5.473327639392746e-06, "loss": 0.019352542236447334, "memory(GiB)": 21.48, "step": 15279, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.496377870902771, "grad_norm": 0.5515908598899841, "learning_rate": 5.472792896009033e-06, "loss": 0.02769082598388195, "memory(GiB)": 21.48, "step": 15280, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.49641035636552644, "grad_norm": 0.4837811589241028, "learning_rate": 5.472258147168661e-06, "loss": 0.02566901408135891, "memory(GiB)": 21.48, "step": 15281, "token_acc": 1.0, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.49644284182828186, "grad_norm": 0.2899889051914215, "learning_rate": 5.471723392877801e-06, "loss": 0.01641249656677246, "memory(GiB)": 21.48, "step": 15282, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.49647532729103727, "grad_norm": 0.34119588136672974, "learning_rate": 5.471188633142626e-06, "loss": 0.021053314208984375, "memory(GiB)": 21.48, "step": 15283, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.956101 }, { "epoch": 0.4965078127537927, "grad_norm": 0.41720542311668396, "learning_rate": 5.470653867969308e-06, "loss": 0.01762831211090088, "memory(GiB)": 21.48, "step": 15284, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.4965402982165481, "grad_norm": 0.25369465351104736, "learning_rate": 5.470119097364016e-06, "loss": 0.015722448006272316, "memory(GiB)": 21.48, "step": 15285, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.4965727836793035, "grad_norm": 0.36703041195869446, "learning_rate": 5.469584321332926e-06, "loss": 0.02028418891131878, "memory(GiB)": 21.48, "step": 15286, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.49660526914205894, "grad_norm": 0.3200515806674957, "learning_rate": 5.469049539882207e-06, "loss": 0.019515225663781166, "memory(GiB)": 21.48, "step": 15287, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.49663775460481435, "grad_norm": 0.48653969168663025, "learning_rate": 5.468514753018031e-06, "loss": 0.028490981087088585, "memory(GiB)": 21.48, "step": 15288, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956167 }, { "epoch": 0.49667024006756977, "grad_norm": 0.4568036198616028, "learning_rate": 5.4679799607465724e-06, "loss": 0.025199757888913155, "memory(GiB)": 21.48, "step": 15289, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.4967027255303252, "grad_norm": 0.3064236044883728, "learning_rate": 5.467445163074001e-06, "loss": 0.021101519465446472, "memory(GiB)": 21.48, "step": 15290, "token_acc": 1.0, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.4967352109930806, "grad_norm": 0.3141282796859741, "learning_rate": 5.466910360006491e-06, "loss": 0.018757997080683708, "memory(GiB)": 21.48, "step": 15291, "token_acc": 1.0, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.496767696455836, "grad_norm": 0.3653087913990021, "learning_rate": 5.466375551550214e-06, "loss": 0.021116454154253006, "memory(GiB)": 21.48, "step": 15292, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.49680018191859143, "grad_norm": 1.063354253768921, "learning_rate": 5.4658407377113435e-06, "loss": 0.022109318524599075, "memory(GiB)": 21.48, "step": 15293, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956215 }, { "epoch": 0.49683266738134685, "grad_norm": 0.3109085261821747, "learning_rate": 5.4653059184960476e-06, "loss": 0.019634196534752846, "memory(GiB)": 21.48, "step": 15294, "token_acc": 1.0, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.49686515284410226, "grad_norm": 0.3656459450721741, "learning_rate": 5.464771093910504e-06, "loss": 0.026434803381562233, "memory(GiB)": 21.48, "step": 15295, "token_acc": 1.0, "train_speed(iter/s)": 0.956235 }, { "epoch": 0.4968976383068577, "grad_norm": 0.40794986486434937, "learning_rate": 5.464236263960884e-06, "loss": 0.019576899707317352, "memory(GiB)": 21.48, "step": 15296, "token_acc": 1.0, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.4969301237696131, "grad_norm": 0.5150452852249146, "learning_rate": 5.463701428653358e-06, "loss": 0.03290867805480957, "memory(GiB)": 21.48, "step": 15297, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956254 }, { "epoch": 0.4969626092323685, "grad_norm": 0.3859834671020508, "learning_rate": 5.463166587994103e-06, "loss": 0.020110435783863068, "memory(GiB)": 21.48, "step": 15298, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.956264 }, { "epoch": 0.49699509469512393, "grad_norm": 0.27342894673347473, "learning_rate": 5.462631741989287e-06, "loss": 0.015382146462798119, "memory(GiB)": 21.48, "step": 15299, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.49702758015787935, "grad_norm": 0.2607838809490204, "learning_rate": 5.462096890645085e-06, "loss": 0.018491923809051514, "memory(GiB)": 21.48, "step": 15300, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.49706006562063476, "grad_norm": 0.35222211480140686, "learning_rate": 5.461562033967671e-06, "loss": 0.01853567734360695, "memory(GiB)": 21.48, "step": 15301, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956292 }, { "epoch": 0.4970925510833902, "grad_norm": 0.3285856544971466, "learning_rate": 5.461027171963216e-06, "loss": 0.02310536429286003, "memory(GiB)": 21.48, "step": 15302, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.4971250365461456, "grad_norm": 0.3653823733329773, "learning_rate": 5.460492304637894e-06, "loss": 0.020442595705389977, "memory(GiB)": 21.48, "step": 15303, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.497157522008901, "grad_norm": 0.48296305537223816, "learning_rate": 5.459957431997877e-06, "loss": 0.025461144745349884, "memory(GiB)": 21.48, "step": 15304, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956322 }, { "epoch": 0.4971900074716564, "grad_norm": 0.5202679634094238, "learning_rate": 5.4594225540493406e-06, "loss": 0.023997299373149872, "memory(GiB)": 21.48, "step": 15305, "token_acc": 1.0, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.49722249293441184, "grad_norm": 0.3959389626979828, "learning_rate": 5.458887670798456e-06, "loss": 0.02513236179947853, "memory(GiB)": 21.48, "step": 15306, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956343 }, { "epoch": 0.49725497839716726, "grad_norm": 0.3443087637424469, "learning_rate": 5.458352782251397e-06, "loss": 0.025488877668976784, "memory(GiB)": 21.48, "step": 15307, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956352 }, { "epoch": 0.4972874638599227, "grad_norm": 0.31910067796707153, "learning_rate": 5.457817888414335e-06, "loss": 0.018645431846380234, "memory(GiB)": 21.48, "step": 15308, "token_acc": 1.0, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.4973199493226781, "grad_norm": 0.5105417370796204, "learning_rate": 5.457282989293447e-06, "loss": 0.0246572345495224, "memory(GiB)": 21.48, "step": 15309, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.4973524347854335, "grad_norm": 0.4100956916809082, "learning_rate": 5.456748084894905e-06, "loss": 0.026846792548894882, "memory(GiB)": 21.48, "step": 15310, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.4973849202481889, "grad_norm": 0.38082045316696167, "learning_rate": 5.456213175224882e-06, "loss": 0.02255905605852604, "memory(GiB)": 21.48, "step": 15311, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.9564 }, { "epoch": 0.49741740571094434, "grad_norm": 0.26154446601867676, "learning_rate": 5.455678260289549e-06, "loss": 0.01615203730762005, "memory(GiB)": 21.48, "step": 15312, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.49744989117369975, "grad_norm": 0.42818066477775574, "learning_rate": 5.455143340095085e-06, "loss": 0.02109093964099884, "memory(GiB)": 21.48, "step": 15313, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956428 }, { "epoch": 0.49748237663645517, "grad_norm": 0.4631621539592743, "learning_rate": 5.4546084146476605e-06, "loss": 0.03139336407184601, "memory(GiB)": 21.48, "step": 15314, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.4975148620992106, "grad_norm": 0.3838987946510315, "learning_rate": 5.454073483953451e-06, "loss": 0.02522754669189453, "memory(GiB)": 21.48, "step": 15315, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956453 }, { "epoch": 0.497547347561966, "grad_norm": 0.5396453738212585, "learning_rate": 5.453538548018628e-06, "loss": 0.02959972806274891, "memory(GiB)": 21.48, "step": 15316, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.4975798330247214, "grad_norm": 0.3345315754413605, "learning_rate": 5.453003606849366e-06, "loss": 0.015065254643559456, "memory(GiB)": 21.48, "step": 15317, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.49761231848747683, "grad_norm": 0.5310523509979248, "learning_rate": 5.4524686604518405e-06, "loss": 0.03669397905468941, "memory(GiB)": 21.48, "step": 15318, "token_acc": 0.984, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.49764480395023225, "grad_norm": 0.4046303629875183, "learning_rate": 5.451933708832224e-06, "loss": 0.024079766124486923, "memory(GiB)": 21.48, "step": 15319, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.49767728941298767, "grad_norm": 1.0483357906341553, "learning_rate": 5.45139875199669e-06, "loss": 0.022132646292448044, "memory(GiB)": 21.48, "step": 15320, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.4977097748757431, "grad_norm": 0.278073251247406, "learning_rate": 5.450863789951415e-06, "loss": 0.01925433799624443, "memory(GiB)": 21.48, "step": 15321, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.4977422603384985, "grad_norm": 0.3272864818572998, "learning_rate": 5.4503288227025705e-06, "loss": 0.01732008345425129, "memory(GiB)": 21.48, "step": 15322, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.4977747458012539, "grad_norm": 0.3176876902580261, "learning_rate": 5.4497938502563344e-06, "loss": 0.012400221079587936, "memory(GiB)": 21.48, "step": 15323, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956538 }, { "epoch": 0.49780723126400933, "grad_norm": 0.37609007954597473, "learning_rate": 5.4492588726188765e-06, "loss": 0.018094507977366447, "memory(GiB)": 21.48, "step": 15324, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.49783971672676475, "grad_norm": 0.24058254063129425, "learning_rate": 5.448723889796373e-06, "loss": 0.013824120163917542, "memory(GiB)": 21.48, "step": 15325, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956558 }, { "epoch": 0.49787220218952016, "grad_norm": 0.29722723364830017, "learning_rate": 5.448188901795001e-06, "loss": 0.016163906082510948, "memory(GiB)": 21.48, "step": 15326, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.4979046876522756, "grad_norm": 0.3405958414077759, "learning_rate": 5.447653908620929e-06, "loss": 0.02009773999452591, "memory(GiB)": 21.48, "step": 15327, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.49793717311503105, "grad_norm": 0.32683268189430237, "learning_rate": 5.447118910280337e-06, "loss": 0.021099139004945755, "memory(GiB)": 21.48, "step": 15328, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956587 }, { "epoch": 0.49796965857778647, "grad_norm": 0.2886667549610138, "learning_rate": 5.446583906779398e-06, "loss": 0.020131772384047508, "memory(GiB)": 21.48, "step": 15329, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.4980021440405419, "grad_norm": 0.47051724791526794, "learning_rate": 5.446048898124285e-06, "loss": 0.021924467757344246, "memory(GiB)": 21.48, "step": 15330, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.4980346295032973, "grad_norm": 0.31338125467300415, "learning_rate": 5.445513884321175e-06, "loss": 0.018621914088726044, "memory(GiB)": 21.48, "step": 15331, "token_acc": 1.0, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.4980671149660527, "grad_norm": 0.3092935383319855, "learning_rate": 5.444978865376241e-06, "loss": 0.021408721804618835, "memory(GiB)": 21.48, "step": 15332, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.49809960042880813, "grad_norm": 0.3169275224208832, "learning_rate": 5.44444384129566e-06, "loss": 0.026698531582951546, "memory(GiB)": 21.48, "step": 15333, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956639 }, { "epoch": 0.49813208589156355, "grad_norm": 0.3742784261703491, "learning_rate": 5.443908812085604e-06, "loss": 0.025251086801290512, "memory(GiB)": 21.48, "step": 15334, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.956649 }, { "epoch": 0.49816457135431896, "grad_norm": 0.5420575141906738, "learning_rate": 5.443373777752251e-06, "loss": 0.02509693056344986, "memory(GiB)": 21.48, "step": 15335, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.4981970568170744, "grad_norm": 0.8002694249153137, "learning_rate": 5.442838738301774e-06, "loss": 0.018323898315429688, "memory(GiB)": 21.48, "step": 15336, "token_acc": 1.0, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.4982295422798298, "grad_norm": 0.44438549876213074, "learning_rate": 5.4423036937403484e-06, "loss": 0.02193520963191986, "memory(GiB)": 21.48, "step": 15337, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956681 }, { "epoch": 0.4982620277425852, "grad_norm": 0.3709922432899475, "learning_rate": 5.4417686440741504e-06, "loss": 0.018506182357668877, "memory(GiB)": 21.48, "step": 15338, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.49829451320534063, "grad_norm": 0.4523831903934479, "learning_rate": 5.441233589309351e-06, "loss": 0.025476280599832535, "memory(GiB)": 21.48, "step": 15339, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.49832699866809604, "grad_norm": 0.4880850911140442, "learning_rate": 5.440698529452133e-06, "loss": 0.030701063573360443, "memory(GiB)": 21.48, "step": 15340, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.49835948413085146, "grad_norm": 0.4129771888256073, "learning_rate": 5.440163464508664e-06, "loss": 0.03376636654138565, "memory(GiB)": 21.48, "step": 15341, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.956729 }, { "epoch": 0.4983919695936069, "grad_norm": 0.30701303482055664, "learning_rate": 5.439628394485124e-06, "loss": 0.018754184246063232, "memory(GiB)": 21.48, "step": 15342, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.4984244550563623, "grad_norm": 0.46314531564712524, "learning_rate": 5.439093319387687e-06, "loss": 0.02601170912384987, "memory(GiB)": 21.48, "step": 15343, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.4984569405191177, "grad_norm": 0.4635438919067383, "learning_rate": 5.438558239222529e-06, "loss": 0.02388949505984783, "memory(GiB)": 21.48, "step": 15344, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956768 }, { "epoch": 0.4984894259818731, "grad_norm": 0.3138609230518341, "learning_rate": 5.438023153995826e-06, "loss": 0.01784757897257805, "memory(GiB)": 21.48, "step": 15345, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.49852191144462854, "grad_norm": 0.4107179343700409, "learning_rate": 5.43748806371375e-06, "loss": 0.02355985715985298, "memory(GiB)": 21.48, "step": 15346, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.49855439690738396, "grad_norm": 0.3939019739627838, "learning_rate": 5.436952968382483e-06, "loss": 0.02403092198073864, "memory(GiB)": 21.48, "step": 15347, "token_acc": 1.0, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.4985868823701394, "grad_norm": 0.48484936356544495, "learning_rate": 5.436417868008194e-06, "loss": 0.02092122845351696, "memory(GiB)": 21.48, "step": 15348, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.4986193678328948, "grad_norm": 0.3974664509296417, "learning_rate": 5.435882762597065e-06, "loss": 0.03051314875483513, "memory(GiB)": 21.48, "step": 15349, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.4986518532956502, "grad_norm": 0.3076312243938446, "learning_rate": 5.435347652155266e-06, "loss": 0.014184395782649517, "memory(GiB)": 21.48, "step": 15350, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956841 }, { "epoch": 0.4986843387584056, "grad_norm": 0.35502761602401733, "learning_rate": 5.434812536688978e-06, "loss": 0.02482694387435913, "memory(GiB)": 21.48, "step": 15351, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.95685 }, { "epoch": 0.49871682422116104, "grad_norm": 0.31369659304618835, "learning_rate": 5.434277416204375e-06, "loss": 0.018521375954151154, "memory(GiB)": 21.48, "step": 15352, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.95686 }, { "epoch": 0.49874930968391645, "grad_norm": 0.3491804897785187, "learning_rate": 5.433742290707628e-06, "loss": 0.016947206109762192, "memory(GiB)": 21.48, "step": 15353, "token_acc": 1.0, "train_speed(iter/s)": 0.95687 }, { "epoch": 0.49878179514667187, "grad_norm": 0.4152708351612091, "learning_rate": 5.433207160204923e-06, "loss": 0.020119335502386093, "memory(GiB)": 21.48, "step": 15354, "token_acc": 1.0, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.4988142806094273, "grad_norm": 0.26186901330947876, "learning_rate": 5.432672024702428e-06, "loss": 0.01586904376745224, "memory(GiB)": 21.48, "step": 15355, "token_acc": 0.995, "train_speed(iter/s)": 0.956891 }, { "epoch": 0.4988467660721827, "grad_norm": 0.3464905917644501, "learning_rate": 5.432136884206322e-06, "loss": 0.021684017032384872, "memory(GiB)": 21.48, "step": 15356, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956902 }, { "epoch": 0.4988792515349381, "grad_norm": 0.44462138414382935, "learning_rate": 5.431601738722782e-06, "loss": 0.021718565374612808, "memory(GiB)": 21.48, "step": 15357, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956909 }, { "epoch": 0.49891173699769353, "grad_norm": 0.37769392132759094, "learning_rate": 5.431066588257982e-06, "loss": 0.023652568459510803, "memory(GiB)": 21.48, "step": 15358, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.49894422246044895, "grad_norm": 0.8461969494819641, "learning_rate": 5.430531432818101e-06, "loss": 0.027509164065122604, "memory(GiB)": 21.48, "step": 15359, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.49897670792320437, "grad_norm": 0.2910330593585968, "learning_rate": 5.429996272409314e-06, "loss": 0.017807774245738983, "memory(GiB)": 21.48, "step": 15360, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956937 }, { "epoch": 0.4990091933859598, "grad_norm": 0.5463353991508484, "learning_rate": 5.429461107037798e-06, "loss": 0.03138268366456032, "memory(GiB)": 21.48, "step": 15361, "token_acc": 1.0, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.4990416788487152, "grad_norm": 0.2832213342189789, "learning_rate": 5.428925936709729e-06, "loss": 0.01837892457842827, "memory(GiB)": 21.48, "step": 15362, "token_acc": 1.0, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.4990741643114706, "grad_norm": 0.3978520333766937, "learning_rate": 5.428390761431284e-06, "loss": 0.020749952644109726, "memory(GiB)": 21.48, "step": 15363, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.49910664977422603, "grad_norm": 0.4660024344921112, "learning_rate": 5.4278555812086385e-06, "loss": 0.020482137799263, "memory(GiB)": 21.48, "step": 15364, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.49913913523698145, "grad_norm": 0.3577319383621216, "learning_rate": 5.427320396047971e-06, "loss": 0.021749403327703476, "memory(GiB)": 21.48, "step": 15365, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.49917162069973686, "grad_norm": 0.3495346009731293, "learning_rate": 5.426785205955458e-06, "loss": 0.017956513911485672, "memory(GiB)": 21.48, "step": 15366, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.956997 }, { "epoch": 0.4992041061624923, "grad_norm": 0.28816312551498413, "learning_rate": 5.426250010937275e-06, "loss": 0.01623058132827282, "memory(GiB)": 21.48, "step": 15367, "token_acc": 1.0, "train_speed(iter/s)": 0.957007 }, { "epoch": 0.4992365916252477, "grad_norm": 0.34993648529052734, "learning_rate": 5.425714810999599e-06, "loss": 0.018468016758561134, "memory(GiB)": 21.48, "step": 15368, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.4992690770880031, "grad_norm": 0.3639467656612396, "learning_rate": 5.425179606148609e-06, "loss": 0.024125877767801285, "memory(GiB)": 21.48, "step": 15369, "token_acc": 1.0, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.4993015625507585, "grad_norm": 0.22759562730789185, "learning_rate": 5.424644396390479e-06, "loss": 0.015961255878210068, "memory(GiB)": 21.48, "step": 15370, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.49933404801351394, "grad_norm": 0.3098064064979553, "learning_rate": 5.424109181731387e-06, "loss": 0.020307939499616623, "memory(GiB)": 21.48, "step": 15371, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.49936653347626936, "grad_norm": 0.502121090888977, "learning_rate": 5.423573962177512e-06, "loss": 0.02404201030731201, "memory(GiB)": 21.48, "step": 15372, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.4993990189390248, "grad_norm": 0.3309169113636017, "learning_rate": 5.423038737735029e-06, "loss": 0.019013136625289917, "memory(GiB)": 21.48, "step": 15373, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.4994315044017802, "grad_norm": 0.361459881067276, "learning_rate": 5.422503508410115e-06, "loss": 0.021921318024396896, "memory(GiB)": 21.48, "step": 15374, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.957095 }, { "epoch": 0.4994639898645356, "grad_norm": 0.5148546099662781, "learning_rate": 5.421968274208949e-06, "loss": 0.02932443656027317, "memory(GiB)": 21.48, "step": 15375, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.499496475327291, "grad_norm": 0.40897661447525024, "learning_rate": 5.421433035137707e-06, "loss": 0.02169126272201538, "memory(GiB)": 21.48, "step": 15376, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957115 }, { "epoch": 0.49952896079004644, "grad_norm": 0.4736376702785492, "learning_rate": 5.420897791202568e-06, "loss": 0.018277544528245926, "memory(GiB)": 21.48, "step": 15377, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.49956144625280186, "grad_norm": 0.31843799352645874, "learning_rate": 5.420362542409709e-06, "loss": 0.02070040814578533, "memory(GiB)": 21.48, "step": 15378, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.49959393171555727, "grad_norm": 0.44599997997283936, "learning_rate": 5.4198272887653045e-06, "loss": 0.026684116572141647, "memory(GiB)": 21.48, "step": 15379, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.4996264171783127, "grad_norm": 0.2875498831272125, "learning_rate": 5.419292030275536e-06, "loss": 0.02262120321393013, "memory(GiB)": 21.48, "step": 15380, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.4996589026410681, "grad_norm": 0.550178050994873, "learning_rate": 5.418756766946578e-06, "loss": 0.014579770155251026, "memory(GiB)": 21.48, "step": 15381, "token_acc": 1.0, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.4996913881038235, "grad_norm": 0.34860628843307495, "learning_rate": 5.418221498784612e-06, "loss": 0.02269282191991806, "memory(GiB)": 21.48, "step": 15382, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95717 }, { "epoch": 0.49972387356657894, "grad_norm": 0.3284352123737335, "learning_rate": 5.41768622579581e-06, "loss": 0.019542936235666275, "memory(GiB)": 21.48, "step": 15383, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957182 }, { "epoch": 0.49975635902933435, "grad_norm": 0.2516530752182007, "learning_rate": 5.417150947986355e-06, "loss": 0.014907150529325008, "memory(GiB)": 21.48, "step": 15384, "token_acc": 1.0, "train_speed(iter/s)": 0.957192 }, { "epoch": 0.49978884449208977, "grad_norm": 0.31004539132118225, "learning_rate": 5.416615665362421e-06, "loss": 0.022824080660939217, "memory(GiB)": 21.48, "step": 15385, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.4998213299548452, "grad_norm": 0.3659883141517639, "learning_rate": 5.41608037793019e-06, "loss": 0.022749505937099457, "memory(GiB)": 21.48, "step": 15386, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957212 }, { "epoch": 0.4998538154176006, "grad_norm": 0.2636886537075043, "learning_rate": 5.415545085695837e-06, "loss": 0.021402603015303612, "memory(GiB)": 21.48, "step": 15387, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.499886300880356, "grad_norm": 0.4302537143230438, "learning_rate": 5.41500978866554e-06, "loss": 0.020356180146336555, "memory(GiB)": 21.48, "step": 15388, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957232 }, { "epoch": 0.49991878634311143, "grad_norm": 0.4385589361190796, "learning_rate": 5.414474486845479e-06, "loss": 0.020843636244535446, "memory(GiB)": 21.48, "step": 15389, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.49995127180586685, "grad_norm": 0.4738750159740448, "learning_rate": 5.413939180241828e-06, "loss": 0.026552747935056686, "memory(GiB)": 21.48, "step": 15390, "token_acc": 1.0, "train_speed(iter/s)": 0.957253 }, { "epoch": 0.49998375726862226, "grad_norm": 0.44026654958724976, "learning_rate": 5.413403868860771e-06, "loss": 0.024822019040584564, "memory(GiB)": 21.48, "step": 15391, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.5000162427313777, "grad_norm": 0.27164119482040405, "learning_rate": 5.41286855270848e-06, "loss": 0.021136371418833733, "memory(GiB)": 21.48, "step": 15392, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.5000487281941332, "grad_norm": 0.49103787541389465, "learning_rate": 5.4123332317911396e-06, "loss": 0.025518300011754036, "memory(GiB)": 21.48, "step": 15393, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.5000812136568885, "grad_norm": 0.3414548337459564, "learning_rate": 5.411797906114923e-06, "loss": 0.0247734934091568, "memory(GiB)": 21.48, "step": 15394, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.500113699119644, "grad_norm": 0.3711470663547516, "learning_rate": 5.411262575686011e-06, "loss": 0.02468983829021454, "memory(GiB)": 21.48, "step": 15395, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.5001461845823993, "grad_norm": 0.5217627882957458, "learning_rate": 5.4107272405105816e-06, "loss": 0.01897306926548481, "memory(GiB)": 21.48, "step": 15396, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957316 }, { "epoch": 0.5001786700451548, "grad_norm": 0.3309459090232849, "learning_rate": 5.410191900594812e-06, "loss": 0.02154233120381832, "memory(GiB)": 21.48, "step": 15397, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.5002111555079102, "grad_norm": 0.3555467426776886, "learning_rate": 5.409656555944883e-06, "loss": 0.0172734335064888, "memory(GiB)": 21.48, "step": 15398, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.5002436409706656, "grad_norm": 0.8226844072341919, "learning_rate": 5.409121206566972e-06, "loss": 0.03198741003870964, "memory(GiB)": 21.48, "step": 15399, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.500276126433421, "grad_norm": 0.2902989387512207, "learning_rate": 5.408585852467258e-06, "loss": 0.022407162934541702, "memory(GiB)": 21.48, "step": 15400, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.5003086118961765, "grad_norm": 0.3076598644256592, "learning_rate": 5.408050493651919e-06, "loss": 0.01720273494720459, "memory(GiB)": 21.48, "step": 15401, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.5003410973589318, "grad_norm": 0.3365178406238556, "learning_rate": 5.4075151301271345e-06, "loss": 0.01738910749554634, "memory(GiB)": 21.48, "step": 15402, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957385 }, { "epoch": 0.5003735828216873, "grad_norm": 0.3899049460887909, "learning_rate": 5.406979761899083e-06, "loss": 0.02073197066783905, "memory(GiB)": 21.48, "step": 15403, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957397 }, { "epoch": 0.5004060682844427, "grad_norm": 0.26584434509277344, "learning_rate": 5.406444388973942e-06, "loss": 0.021844033151865005, "memory(GiB)": 21.48, "step": 15404, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.5004385537471981, "grad_norm": 0.3611633777618408, "learning_rate": 5.4059090113578926e-06, "loss": 0.019712166860699654, "memory(GiB)": 21.48, "step": 15405, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.5004710392099535, "grad_norm": 0.2981257140636444, "learning_rate": 5.405373629057114e-06, "loss": 0.02195877581834793, "memory(GiB)": 21.48, "step": 15406, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.500503524672709, "grad_norm": 0.295985609292984, "learning_rate": 5.404838242077784e-06, "loss": 0.013780293054878712, "memory(GiB)": 21.48, "step": 15407, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.5005360101354643, "grad_norm": 0.3622039556503296, "learning_rate": 5.4043028504260805e-06, "loss": 0.023152519017457962, "memory(GiB)": 21.48, "step": 15408, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957459 }, { "epoch": 0.5005684955982198, "grad_norm": 0.6537864208221436, "learning_rate": 5.4037674541081845e-06, "loss": 0.02660553902387619, "memory(GiB)": 21.48, "step": 15409, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.957469 }, { "epoch": 0.5006009810609752, "grad_norm": 0.680034339427948, "learning_rate": 5.403232053130275e-06, "loss": 0.028005599975585938, "memory(GiB)": 21.48, "step": 15410, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.5006334665237306, "grad_norm": 0.4789699912071228, "learning_rate": 5.402696647498531e-06, "loss": 0.020454753190279007, "memory(GiB)": 21.48, "step": 15411, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.500665951986486, "grad_norm": 0.43033137917518616, "learning_rate": 5.402161237219132e-06, "loss": 0.01994452439248562, "memory(GiB)": 21.48, "step": 15412, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957497 }, { "epoch": 0.5006984374492415, "grad_norm": 0.37586453557014465, "learning_rate": 5.4016258222982555e-06, "loss": 0.02141612581908703, "memory(GiB)": 21.48, "step": 15413, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.5007309229119968, "grad_norm": 0.3695167303085327, "learning_rate": 5.401090402742084e-06, "loss": 0.02263122797012329, "memory(GiB)": 21.48, "step": 15414, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.5007634083747523, "grad_norm": 0.35500380396842957, "learning_rate": 5.400554978556795e-06, "loss": 0.016527187079191208, "memory(GiB)": 21.48, "step": 15415, "token_acc": 1.0, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.5007958938375077, "grad_norm": 0.4624939560890198, "learning_rate": 5.400019549748568e-06, "loss": 0.020702512934803963, "memory(GiB)": 21.48, "step": 15416, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957534 }, { "epoch": 0.5008283793002631, "grad_norm": 0.3938560485839844, "learning_rate": 5.3994841163235845e-06, "loss": 0.017554527148604393, "memory(GiB)": 21.48, "step": 15417, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.5008608647630185, "grad_norm": 0.7914811372756958, "learning_rate": 5.398948678288021e-06, "loss": 0.013922963291406631, "memory(GiB)": 21.48, "step": 15418, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.500893350225774, "grad_norm": 0.5612779259681702, "learning_rate": 5.398413235648059e-06, "loss": 0.02358989417552948, "memory(GiB)": 21.48, "step": 15419, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.957563 }, { "epoch": 0.5009258356885293, "grad_norm": 0.3103935122489929, "learning_rate": 5.397877788409877e-06, "loss": 0.02077137865126133, "memory(GiB)": 21.48, "step": 15420, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.5009583211512848, "grad_norm": 0.4633605480194092, "learning_rate": 5.397342336579657e-06, "loss": 0.02555893361568451, "memory(GiB)": 21.48, "step": 15421, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.5009908066140402, "grad_norm": 0.3625987768173218, "learning_rate": 5.396806880163578e-06, "loss": 0.015925370156764984, "memory(GiB)": 21.48, "step": 15422, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.957593 }, { "epoch": 0.5010232920767956, "grad_norm": 0.4483063817024231, "learning_rate": 5.396271419167819e-06, "loss": 0.028915200382471085, "memory(GiB)": 21.48, "step": 15423, "token_acc": 1.0, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.5010557775395511, "grad_norm": 0.27743467688560486, "learning_rate": 5.395735953598561e-06, "loss": 0.01766376942396164, "memory(GiB)": 21.48, "step": 15424, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.5010882630023065, "grad_norm": 0.3199654817581177, "learning_rate": 5.395200483461982e-06, "loss": 0.01826184242963791, "memory(GiB)": 21.48, "step": 15425, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.957623 }, { "epoch": 0.5011207484650619, "grad_norm": 0.39892005920410156, "learning_rate": 5.394665008764264e-06, "loss": 0.018615461885929108, "memory(GiB)": 21.48, "step": 15426, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.5011532339278173, "grad_norm": 0.462846964597702, "learning_rate": 5.3941295295115855e-06, "loss": 0.018990475684404373, "memory(GiB)": 21.48, "step": 15427, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.5011857193905728, "grad_norm": 0.30084145069122314, "learning_rate": 5.39359404571013e-06, "loss": 0.01811603456735611, "memory(GiB)": 21.48, "step": 15428, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957653 }, { "epoch": 0.5012182048533281, "grad_norm": 0.3281514048576355, "learning_rate": 5.393058557366073e-06, "loss": 0.018063295632600784, "memory(GiB)": 21.48, "step": 15429, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.5012506903160836, "grad_norm": 0.5009407997131348, "learning_rate": 5.392523064485598e-06, "loss": 0.021030738949775696, "memory(GiB)": 21.48, "step": 15430, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.501283175778839, "grad_norm": 0.2964431047439575, "learning_rate": 5.391987567074886e-06, "loss": 0.02419937774538994, "memory(GiB)": 21.48, "step": 15431, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.5013156612415944, "grad_norm": 0.4693427085876465, "learning_rate": 5.391452065140114e-06, "loss": 0.022961486130952835, "memory(GiB)": 21.48, "step": 15432, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957703 }, { "epoch": 0.5013481467043498, "grad_norm": 0.3547430634498596, "learning_rate": 5.390916558687464e-06, "loss": 0.018330011516809464, "memory(GiB)": 21.48, "step": 15433, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957716 }, { "epoch": 0.5013806321671053, "grad_norm": 0.45910242199897766, "learning_rate": 5.390381047723117e-06, "loss": 0.027101650834083557, "memory(GiB)": 21.48, "step": 15434, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.5014131176298606, "grad_norm": 0.4200829267501831, "learning_rate": 5.389845532253254e-06, "loss": 0.024727892130613327, "memory(GiB)": 21.48, "step": 15435, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.5014456030926161, "grad_norm": 0.4020186960697174, "learning_rate": 5.389310012284052e-06, "loss": 0.025018658488988876, "memory(GiB)": 21.48, "step": 15436, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.5014780885553715, "grad_norm": 0.326863557100296, "learning_rate": 5.388774487821696e-06, "loss": 0.01921033300459385, "memory(GiB)": 21.48, "step": 15437, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.5015105740181269, "grad_norm": 0.41442248225212097, "learning_rate": 5.3882389588723664e-06, "loss": 0.019854985177516937, "memory(GiB)": 21.48, "step": 15438, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.5015430594808823, "grad_norm": 0.3829021155834198, "learning_rate": 5.38770342544224e-06, "loss": 0.01955786533653736, "memory(GiB)": 21.48, "step": 15439, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957792 }, { "epoch": 0.5015755449436378, "grad_norm": 0.37520548701286316, "learning_rate": 5.3871678875375025e-06, "loss": 0.025094322860240936, "memory(GiB)": 21.48, "step": 15440, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.5016080304063931, "grad_norm": 0.38851961493492126, "learning_rate": 5.38663234516433e-06, "loss": 0.01723618060350418, "memory(GiB)": 21.48, "step": 15441, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957813 }, { "epoch": 0.5016405158691486, "grad_norm": 0.3998652398586273, "learning_rate": 5.386096798328907e-06, "loss": 0.024893969297409058, "memory(GiB)": 21.48, "step": 15442, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957823 }, { "epoch": 0.501673001331904, "grad_norm": 0.3489818871021271, "learning_rate": 5.385561247037412e-06, "loss": 0.021505966782569885, "memory(GiB)": 21.48, "step": 15443, "token_acc": 0.98, "train_speed(iter/s)": 0.957834 }, { "epoch": 0.5017054867946594, "grad_norm": 0.4221495985984802, "learning_rate": 5.385025691296029e-06, "loss": 0.027353068813681602, "memory(GiB)": 21.48, "step": 15444, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.5017379722574148, "grad_norm": 0.4083709120750427, "learning_rate": 5.384490131110934e-06, "loss": 0.025546416640281677, "memory(GiB)": 21.48, "step": 15445, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.5017704577201703, "grad_norm": 0.49682778120040894, "learning_rate": 5.383954566488314e-06, "loss": 0.02938985824584961, "memory(GiB)": 21.48, "step": 15446, "token_acc": 1.0, "train_speed(iter/s)": 0.957865 }, { "epoch": 0.5018029431829256, "grad_norm": 0.3452301323413849, "learning_rate": 5.383418997434347e-06, "loss": 0.016535872593522072, "memory(GiB)": 21.48, "step": 15447, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.5018354286456811, "grad_norm": 0.4170120656490326, "learning_rate": 5.382883423955212e-06, "loss": 0.02808670699596405, "memory(GiB)": 21.48, "step": 15448, "token_acc": 1.0, "train_speed(iter/s)": 0.957885 }, { "epoch": 0.5018679141084365, "grad_norm": 0.3655157685279846, "learning_rate": 5.3823478460570945e-06, "loss": 0.021405160427093506, "memory(GiB)": 21.48, "step": 15449, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957894 }, { "epoch": 0.5019003995711919, "grad_norm": 0.3755228817462921, "learning_rate": 5.381812263746173e-06, "loss": 0.0286086592823267, "memory(GiB)": 21.48, "step": 15450, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957903 }, { "epoch": 0.5019328850339473, "grad_norm": 0.4664534330368042, "learning_rate": 5.381276677028631e-06, "loss": 0.02174193784594536, "memory(GiB)": 21.48, "step": 15451, "token_acc": 1.0, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.5019653704967028, "grad_norm": 0.4396848678588867, "learning_rate": 5.3807410859106475e-06, "loss": 0.021003225818276405, "memory(GiB)": 21.48, "step": 15452, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.5019978559594581, "grad_norm": 0.3488459289073944, "learning_rate": 5.380205490398406e-06, "loss": 0.02195611596107483, "memory(GiB)": 21.48, "step": 15453, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.5020303414222136, "grad_norm": 0.3084022104740143, "learning_rate": 5.379669890498087e-06, "loss": 0.028998786583542824, "memory(GiB)": 21.48, "step": 15454, "token_acc": 1.0, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.502062826884969, "grad_norm": 0.46870601177215576, "learning_rate": 5.379134286215871e-06, "loss": 0.023251648992300034, "memory(GiB)": 21.48, "step": 15455, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.5020953123477244, "grad_norm": 0.2476268708705902, "learning_rate": 5.378598677557942e-06, "loss": 0.01729736477136612, "memory(GiB)": 21.48, "step": 15456, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.957965 }, { "epoch": 0.5021277978104798, "grad_norm": 0.3596803843975067, "learning_rate": 5.37806306453048e-06, "loss": 0.021692167967557907, "memory(GiB)": 21.48, "step": 15457, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.5021602832732353, "grad_norm": 0.33989110589027405, "learning_rate": 5.377527447139668e-06, "loss": 0.01804114691913128, "memory(GiB)": 21.48, "step": 15458, "token_acc": 1.0, "train_speed(iter/s)": 0.957984 }, { "epoch": 0.5021927687359906, "grad_norm": 0.6441145539283752, "learning_rate": 5.376991825391684e-06, "loss": 0.027913358062505722, "memory(GiB)": 21.48, "step": 15459, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.5022252541987461, "grad_norm": 0.3232063055038452, "learning_rate": 5.376456199292715e-06, "loss": 0.022940028458833694, "memory(GiB)": 21.48, "step": 15460, "token_acc": 1.0, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.5022577396615014, "grad_norm": 0.4572533965110779, "learning_rate": 5.37592056884894e-06, "loss": 0.02210673689842224, "memory(GiB)": 21.48, "step": 15461, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.5022902251242569, "grad_norm": 0.3145766258239746, "learning_rate": 5.375384934066541e-06, "loss": 0.019186222925782204, "memory(GiB)": 21.48, "step": 15462, "token_acc": 1.0, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.5023227105870123, "grad_norm": 0.4075814187526703, "learning_rate": 5.374849294951701e-06, "loss": 0.02670474350452423, "memory(GiB)": 21.48, "step": 15463, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.958039 }, { "epoch": 0.5023551960497677, "grad_norm": 0.3133203089237213, "learning_rate": 5.3743136515106e-06, "loss": 0.020630888640880585, "memory(GiB)": 21.48, "step": 15464, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.5023876815125231, "grad_norm": 0.5063047409057617, "learning_rate": 5.373778003749422e-06, "loss": 0.01818203553557396, "memory(GiB)": 21.48, "step": 15465, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.958065 }, { "epoch": 0.5024201669752786, "grad_norm": 0.3663688004016876, "learning_rate": 5.373242351674347e-06, "loss": 0.016560306772589684, "memory(GiB)": 21.48, "step": 15466, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958078 }, { "epoch": 0.5024526524380339, "grad_norm": 0.3601090610027313, "learning_rate": 5.372706695291559e-06, "loss": 0.018434789031744003, "memory(GiB)": 21.48, "step": 15467, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.5024851379007894, "grad_norm": 0.38393545150756836, "learning_rate": 5.372171034607241e-06, "loss": 0.018610425293445587, "memory(GiB)": 21.48, "step": 15468, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.5025176233635448, "grad_norm": 0.2666803002357483, "learning_rate": 5.371635369627573e-06, "loss": 0.018041010946035385, "memory(GiB)": 21.48, "step": 15469, "token_acc": 0.992, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.5025501088263002, "grad_norm": 0.2554417848587036, "learning_rate": 5.371099700358738e-06, "loss": 0.014110658317804337, "memory(GiB)": 21.48, "step": 15470, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.5025825942890556, "grad_norm": 0.32496410608291626, "learning_rate": 5.370564026806918e-06, "loss": 0.018901709467172623, "memory(GiB)": 21.48, "step": 15471, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.5026150797518111, "grad_norm": 0.3186054825782776, "learning_rate": 5.370028348978298e-06, "loss": 0.021874263882637024, "memory(GiB)": 21.48, "step": 15472, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.5026475652145664, "grad_norm": 0.32757413387298584, "learning_rate": 5.369492666879056e-06, "loss": 0.016559848561882973, "memory(GiB)": 21.48, "step": 15473, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.958146 }, { "epoch": 0.5026800506773219, "grad_norm": 0.556881308555603, "learning_rate": 5.368956980515379e-06, "loss": 0.020483337342739105, "memory(GiB)": 21.48, "step": 15474, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958156 }, { "epoch": 0.5027125361400773, "grad_norm": 0.3285908102989197, "learning_rate": 5.368421289893446e-06, "loss": 0.018522508442401886, "memory(GiB)": 21.48, "step": 15475, "token_acc": 1.0, "train_speed(iter/s)": 0.958167 }, { "epoch": 0.5027450216028327, "grad_norm": 0.3424038887023926, "learning_rate": 5.3678855950194405e-06, "loss": 0.01607395149767399, "memory(GiB)": 21.48, "step": 15476, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.5027775070655881, "grad_norm": 0.5480061769485474, "learning_rate": 5.367349895899547e-06, "loss": 0.025037622079253197, "memory(GiB)": 21.48, "step": 15477, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.5028099925283436, "grad_norm": 0.5966224670410156, "learning_rate": 5.366814192539946e-06, "loss": 0.023424023762345314, "memory(GiB)": 21.48, "step": 15478, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.5028424779910989, "grad_norm": 0.3976726531982422, "learning_rate": 5.366278484946822e-06, "loss": 0.025744520127773285, "memory(GiB)": 21.48, "step": 15479, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.958205 }, { "epoch": 0.5028749634538544, "grad_norm": 0.5473056435585022, "learning_rate": 5.365742773126355e-06, "loss": 0.02493511140346527, "memory(GiB)": 21.48, "step": 15480, "token_acc": 1.0, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.5029074489166098, "grad_norm": 0.365935355424881, "learning_rate": 5.36520705708473e-06, "loss": 0.022986486554145813, "memory(GiB)": 21.48, "step": 15481, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.958226 }, { "epoch": 0.5029399343793652, "grad_norm": 0.3330400586128235, "learning_rate": 5.36467133682813e-06, "loss": 0.01790154166519642, "memory(GiB)": 21.48, "step": 15482, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.958236 }, { "epoch": 0.5029724198421206, "grad_norm": 0.30091553926467896, "learning_rate": 5.364135612362737e-06, "loss": 0.017736731097102165, "memory(GiB)": 21.48, "step": 15483, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.5030049053048761, "grad_norm": 0.3173673748970032, "learning_rate": 5.363599883694734e-06, "loss": 0.018464302644133568, "memory(GiB)": 21.48, "step": 15484, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.958255 }, { "epoch": 0.5030373907676314, "grad_norm": 0.47662344574928284, "learning_rate": 5.363064150830305e-06, "loss": 0.029464302584528923, "memory(GiB)": 21.48, "step": 15485, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.958265 }, { "epoch": 0.5030698762303869, "grad_norm": 0.37365052103996277, "learning_rate": 5.362528413775633e-06, "loss": 0.030243368819355965, "memory(GiB)": 21.48, "step": 15486, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.5031023616931423, "grad_norm": 0.529068648815155, "learning_rate": 5.3619926725369e-06, "loss": 0.026467856019735336, "memory(GiB)": 21.48, "step": 15487, "token_acc": 1.0, "train_speed(iter/s)": 0.958286 }, { "epoch": 0.5031348471558977, "grad_norm": 0.8191161751747131, "learning_rate": 5.36145692712029e-06, "loss": 0.021783262491226196, "memory(GiB)": 21.48, "step": 15488, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958299 }, { "epoch": 0.5031673326186532, "grad_norm": 0.6029771566390991, "learning_rate": 5.360921177531986e-06, "loss": 0.028066396713256836, "memory(GiB)": 21.48, "step": 15489, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.5031998180814086, "grad_norm": 0.41972866654396057, "learning_rate": 5.3603854237781706e-06, "loss": 0.021243680268526077, "memory(GiB)": 21.48, "step": 15490, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958325 }, { "epoch": 0.503232303544164, "grad_norm": 0.34143200516700745, "learning_rate": 5.359849665865029e-06, "loss": 0.017796121537685394, "memory(GiB)": 21.48, "step": 15491, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958338 }, { "epoch": 0.5032647890069194, "grad_norm": 0.40990519523620605, "learning_rate": 5.359313903798742e-06, "loss": 0.024044806137681007, "memory(GiB)": 21.48, "step": 15492, "token_acc": 1.0, "train_speed(iter/s)": 0.95835 }, { "epoch": 0.5032972744696749, "grad_norm": 0.29772230982780457, "learning_rate": 5.3587781375854955e-06, "loss": 0.014696610160171986, "memory(GiB)": 21.48, "step": 15493, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958363 }, { "epoch": 0.5033297599324302, "grad_norm": 0.28855645656585693, "learning_rate": 5.35824236723147e-06, "loss": 0.01964365318417549, "memory(GiB)": 21.48, "step": 15494, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.958376 }, { "epoch": 0.5033622453951857, "grad_norm": 0.3664027452468872, "learning_rate": 5.357706592742852e-06, "loss": 0.019067607820034027, "memory(GiB)": 21.48, "step": 15495, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.5033947308579411, "grad_norm": 0.39050978422164917, "learning_rate": 5.357170814125823e-06, "loss": 0.017213955521583557, "memory(GiB)": 21.48, "step": 15496, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958402 }, { "epoch": 0.5034272163206965, "grad_norm": 0.3167690932750702, "learning_rate": 5.356635031386569e-06, "loss": 0.020215975120663643, "memory(GiB)": 21.48, "step": 15497, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958415 }, { "epoch": 0.5034597017834519, "grad_norm": 0.34263741970062256, "learning_rate": 5.35609924453127e-06, "loss": 0.023821113631129265, "memory(GiB)": 21.48, "step": 15498, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.958427 }, { "epoch": 0.5034921872462074, "grad_norm": 0.40104028582572937, "learning_rate": 5.355563453566113e-06, "loss": 0.017719164490699768, "memory(GiB)": 21.48, "step": 15499, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.958438 }, { "epoch": 0.5035246727089627, "grad_norm": 0.46099144220352173, "learning_rate": 5.35502765849728e-06, "loss": 0.027068203315138817, "memory(GiB)": 21.48, "step": 15500, "token_acc": 0.9665271966527197, "train_speed(iter/s)": 0.958449 }, { "epoch": 0.5035246727089627, "eval_loss": 0.021635860204696655, "eval_runtime": 80.4718, "eval_samples_per_second": 123.646, "eval_steps_per_second": 3.865, "eval_token_acc": 0.9914212216221596, "step": 15500 }, { "epoch": 0.5035571581717182, "grad_norm": 0.17997506260871887, "learning_rate": 5.354491859330955e-06, "loss": 0.011118398979306221, "memory(GiB)": 21.48, "step": 15501, "token_acc": 0.9909447092991397, "train_speed(iter/s)": 0.953087 }, { "epoch": 0.5035896436344736, "grad_norm": 0.3353104293346405, "learning_rate": 5.353956056073324e-06, "loss": 0.022045068442821503, "memory(GiB)": 21.48, "step": 15502, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.953095 }, { "epoch": 0.503622129097229, "grad_norm": 0.3171809911727905, "learning_rate": 5.353420248730568e-06, "loss": 0.017862850800156593, "memory(GiB)": 21.48, "step": 15503, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953103 }, { "epoch": 0.5036546145599844, "grad_norm": 0.3624473214149475, "learning_rate": 5.352884437308871e-06, "loss": 0.019881434738636017, "memory(GiB)": 21.48, "step": 15504, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95311 }, { "epoch": 0.5036871000227399, "grad_norm": 0.3986060619354248, "learning_rate": 5.3523486218144184e-06, "loss": 0.014681613072752953, "memory(GiB)": 21.48, "step": 15505, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.953104 }, { "epoch": 0.5037195854854952, "grad_norm": 2.945650100708008, "learning_rate": 5.351812802253393e-06, "loss": 0.029714927077293396, "memory(GiB)": 21.48, "step": 15506, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.953115 }, { "epoch": 0.5037520709482507, "grad_norm": 0.40857362747192383, "learning_rate": 5.3512769786319815e-06, "loss": 0.029894748702645302, "memory(GiB)": 21.48, "step": 15507, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953127 }, { "epoch": 0.5037845564110061, "grad_norm": 0.32725152373313904, "learning_rate": 5.350741150956364e-06, "loss": 0.018843594938516617, "memory(GiB)": 21.48, "step": 15508, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.953139 }, { "epoch": 0.5038170418737615, "grad_norm": 0.3727181553840637, "learning_rate": 5.350205319232729e-06, "loss": 0.01839723065495491, "memory(GiB)": 21.48, "step": 15509, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.953151 }, { "epoch": 0.5038495273365169, "grad_norm": 0.3434174060821533, "learning_rate": 5.3496694834672566e-06, "loss": 0.016852691769599915, "memory(GiB)": 21.48, "step": 15510, "token_acc": 1.0, "train_speed(iter/s)": 0.953164 }, { "epoch": 0.5038820127992724, "grad_norm": 0.35862839221954346, "learning_rate": 5.349133643666134e-06, "loss": 0.02109362930059433, "memory(GiB)": 21.48, "step": 15511, "token_acc": 0.988, "train_speed(iter/s)": 0.953177 }, { "epoch": 0.5039144982620277, "grad_norm": 0.3902769386768341, "learning_rate": 5.3485977998355456e-06, "loss": 0.024980081245303154, "memory(GiB)": 21.48, "step": 15512, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95319 }, { "epoch": 0.5039469837247832, "grad_norm": 0.3299649655818939, "learning_rate": 5.348061951981672e-06, "loss": 0.017570417374372482, "memory(GiB)": 21.48, "step": 15513, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.953202 }, { "epoch": 0.5039794691875386, "grad_norm": 0.36711084842681885, "learning_rate": 5.347526100110701e-06, "loss": 0.02329859882593155, "memory(GiB)": 21.48, "step": 15514, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.953214 }, { "epoch": 0.504011954650294, "grad_norm": 0.3870706856250763, "learning_rate": 5.346990244228815e-06, "loss": 0.018453923985362053, "memory(GiB)": 21.48, "step": 15515, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.953226 }, { "epoch": 0.5040444401130494, "grad_norm": 1.432345986366272, "learning_rate": 5.346454384342202e-06, "loss": 0.025973137468099594, "memory(GiB)": 21.48, "step": 15516, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953239 }, { "epoch": 0.5040769255758049, "grad_norm": 0.31489649415016174, "learning_rate": 5.345918520457045e-06, "loss": 0.016833622008562088, "memory(GiB)": 21.48, "step": 15517, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.953252 }, { "epoch": 0.5041094110385602, "grad_norm": 0.28855428099632263, "learning_rate": 5.345382652579526e-06, "loss": 0.020806346088647842, "memory(GiB)": 21.48, "step": 15518, "token_acc": 1.0, "train_speed(iter/s)": 0.953265 }, { "epoch": 0.5041418965013157, "grad_norm": 0.38148024678230286, "learning_rate": 5.3448467807158325e-06, "loss": 0.019182121381163597, "memory(GiB)": 21.48, "step": 15519, "token_acc": 1.0, "train_speed(iter/s)": 0.953277 }, { "epoch": 0.504174381964071, "grad_norm": 0.3320631980895996, "learning_rate": 5.344310904872146e-06, "loss": 0.019081464037299156, "memory(GiB)": 21.48, "step": 15520, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95329 }, { "epoch": 0.5042068674268265, "grad_norm": 0.324449360370636, "learning_rate": 5.343775025054656e-06, "loss": 0.018587974831461906, "memory(GiB)": 21.48, "step": 15521, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953302 }, { "epoch": 0.5042393528895819, "grad_norm": 0.23929940164089203, "learning_rate": 5.343239141269544e-06, "loss": 0.01757744699716568, "memory(GiB)": 21.48, "step": 15522, "token_acc": 1.0, "train_speed(iter/s)": 0.953315 }, { "epoch": 0.5042718383523374, "grad_norm": 0.33063358068466187, "learning_rate": 5.342703253522995e-06, "loss": 0.021710481494665146, "memory(GiB)": 21.48, "step": 15523, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.953329 }, { "epoch": 0.5043043238150927, "grad_norm": 0.5553038120269775, "learning_rate": 5.342167361821194e-06, "loss": 0.02327590435743332, "memory(GiB)": 21.48, "step": 15524, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953342 }, { "epoch": 0.5043368092778482, "grad_norm": 0.34716805815696716, "learning_rate": 5.3416314661703276e-06, "loss": 0.022768858820199966, "memory(GiB)": 21.48, "step": 15525, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.953355 }, { "epoch": 0.5043692947406035, "grad_norm": 0.6385418772697449, "learning_rate": 5.3410955665765785e-06, "loss": 0.027241600677371025, "memory(GiB)": 21.48, "step": 15526, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.953368 }, { "epoch": 0.504401780203359, "grad_norm": 0.4733898937702179, "learning_rate": 5.340559663046132e-06, "loss": 0.02364184707403183, "memory(GiB)": 21.48, "step": 15527, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.95338 }, { "epoch": 0.5044342656661144, "grad_norm": 0.34743189811706543, "learning_rate": 5.340023755585176e-06, "loss": 0.023360485211014748, "memory(GiB)": 21.48, "step": 15528, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953393 }, { "epoch": 0.5044667511288699, "grad_norm": 0.4473996162414551, "learning_rate": 5.339487844199889e-06, "loss": 0.024535441771149635, "memory(GiB)": 21.48, "step": 15529, "token_acc": 1.0, "train_speed(iter/s)": 0.953407 }, { "epoch": 0.5044992365916252, "grad_norm": 0.40600764751434326, "learning_rate": 5.338951928896463e-06, "loss": 0.02066687121987343, "memory(GiB)": 21.48, "step": 15530, "token_acc": 1.0, "train_speed(iter/s)": 0.953419 }, { "epoch": 0.5045317220543807, "grad_norm": 0.444221168756485, "learning_rate": 5.338416009681081e-06, "loss": 0.02392886020243168, "memory(GiB)": 21.48, "step": 15531, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.953433 }, { "epoch": 0.504564207517136, "grad_norm": 0.4310755133628845, "learning_rate": 5.337880086559927e-06, "loss": 0.019011979922652245, "memory(GiB)": 21.48, "step": 15532, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.953444 }, { "epoch": 0.5045966929798915, "grad_norm": 0.36350134015083313, "learning_rate": 5.337344159539188e-06, "loss": 0.016649391502141953, "memory(GiB)": 21.48, "step": 15533, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.953457 }, { "epoch": 0.5046291784426469, "grad_norm": 0.4047710597515106, "learning_rate": 5.336808228625047e-06, "loss": 0.02217419445514679, "memory(GiB)": 21.48, "step": 15534, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95347 }, { "epoch": 0.5046616639054023, "grad_norm": 0.2883865535259247, "learning_rate": 5.336272293823692e-06, "loss": 0.017308812588453293, "memory(GiB)": 21.48, "step": 15535, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.953484 }, { "epoch": 0.5046941493681577, "grad_norm": 0.37255802750587463, "learning_rate": 5.3357363551413065e-06, "loss": 0.023117858916521072, "memory(GiB)": 21.48, "step": 15536, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.953498 }, { "epoch": 0.5047266348309132, "grad_norm": 0.4917413294315338, "learning_rate": 5.335200412584077e-06, "loss": 0.021043425425887108, "memory(GiB)": 21.48, "step": 15537, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95351 }, { "epoch": 0.5047591202936685, "grad_norm": 0.5318479537963867, "learning_rate": 5.334664466158188e-06, "loss": 0.02938210964202881, "memory(GiB)": 21.48, "step": 15538, "token_acc": 0.970873786407767, "train_speed(iter/s)": 0.953524 }, { "epoch": 0.504791605756424, "grad_norm": 0.32379910349845886, "learning_rate": 5.334128515869826e-06, "loss": 0.019103677943348885, "memory(GiB)": 21.48, "step": 15539, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953538 }, { "epoch": 0.5048240912191794, "grad_norm": 0.4908006489276886, "learning_rate": 5.333592561725177e-06, "loss": 0.02873564139008522, "memory(GiB)": 21.48, "step": 15540, "token_acc": 1.0, "train_speed(iter/s)": 0.953551 }, { "epoch": 0.5048565766819348, "grad_norm": 0.47618532180786133, "learning_rate": 5.3330566037304245e-06, "loss": 0.022616807371377945, "memory(GiB)": 21.48, "step": 15541, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.953564 }, { "epoch": 0.5048890621446902, "grad_norm": 0.41809749603271484, "learning_rate": 5.332520641891756e-06, "loss": 0.01736798882484436, "memory(GiB)": 21.48, "step": 15542, "token_acc": 1.0, "train_speed(iter/s)": 0.953578 }, { "epoch": 0.5049215476074457, "grad_norm": 0.3138713538646698, "learning_rate": 5.331984676215357e-06, "loss": 0.01788874715566635, "memory(GiB)": 21.48, "step": 15543, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.953591 }, { "epoch": 0.504954033070201, "grad_norm": 0.40842828154563904, "learning_rate": 5.331448706707412e-06, "loss": 0.016871510073542595, "memory(GiB)": 21.48, "step": 15544, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.953604 }, { "epoch": 0.5049865185329565, "grad_norm": 0.40494677424430847, "learning_rate": 5.330912733374109e-06, "loss": 0.016402211040258408, "memory(GiB)": 21.48, "step": 15545, "token_acc": 1.0, "train_speed(iter/s)": 0.953615 }, { "epoch": 0.5050190039957119, "grad_norm": 0.35182416439056396, "learning_rate": 5.330376756221631e-06, "loss": 0.018244026228785515, "memory(GiB)": 21.48, "step": 15546, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.953626 }, { "epoch": 0.5050514894584673, "grad_norm": 0.2845441699028015, "learning_rate": 5.329840775256168e-06, "loss": 0.01765609160065651, "memory(GiB)": 21.48, "step": 15547, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.953635 }, { "epoch": 0.5050839749212227, "grad_norm": 0.4357369840145111, "learning_rate": 5.329304790483901e-06, "loss": 0.02045321650803089, "memory(GiB)": 21.48, "step": 15548, "token_acc": 1.0, "train_speed(iter/s)": 0.953643 }, { "epoch": 0.5051164603839782, "grad_norm": 0.41017547249794006, "learning_rate": 5.32876880191102e-06, "loss": 0.01790778897702694, "memory(GiB)": 21.48, "step": 15549, "token_acc": 0.9921875, "train_speed(iter/s)": 0.953652 }, { "epoch": 0.5051489458467335, "grad_norm": 0.3473914563655853, "learning_rate": 5.328232809543709e-06, "loss": 0.019189754500985146, "memory(GiB)": 21.48, "step": 15550, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95366 }, { "epoch": 0.505181431309489, "grad_norm": 0.332857608795166, "learning_rate": 5.3276968133881546e-06, "loss": 0.027615362778306007, "memory(GiB)": 21.48, "step": 15551, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95367 }, { "epoch": 0.5052139167722445, "grad_norm": 0.2764514088630676, "learning_rate": 5.3271608134505415e-06, "loss": 0.019061677157878876, "memory(GiB)": 21.48, "step": 15552, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.953679 }, { "epoch": 0.5052464022349998, "grad_norm": 0.37260621786117554, "learning_rate": 5.326624809737059e-06, "loss": 0.01746608130633831, "memory(GiB)": 21.48, "step": 15553, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953688 }, { "epoch": 0.5052788876977553, "grad_norm": 0.32024210691452026, "learning_rate": 5.326088802253892e-06, "loss": 0.020525161176919937, "memory(GiB)": 21.48, "step": 15554, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.953697 }, { "epoch": 0.5053113731605107, "grad_norm": 0.8267192840576172, "learning_rate": 5.325552791007223e-06, "loss": 0.028123455122113228, "memory(GiB)": 21.48, "step": 15555, "token_acc": 1.0, "train_speed(iter/s)": 0.953705 }, { "epoch": 0.5053438586232661, "grad_norm": 0.29655346274375916, "learning_rate": 5.325016776003245e-06, "loss": 0.01965966634452343, "memory(GiB)": 21.48, "step": 15556, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.953714 }, { "epoch": 0.5053763440860215, "grad_norm": 0.2684435248374939, "learning_rate": 5.324480757248138e-06, "loss": 0.01839625835418701, "memory(GiB)": 21.48, "step": 15557, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.953723 }, { "epoch": 0.505408829548777, "grad_norm": 0.3542035222053528, "learning_rate": 5.323944734748093e-06, "loss": 0.02480732649564743, "memory(GiB)": 21.48, "step": 15558, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.953732 }, { "epoch": 0.5054413150115323, "grad_norm": 0.5239900350570679, "learning_rate": 5.323408708509293e-06, "loss": 0.02231547422707081, "memory(GiB)": 21.48, "step": 15559, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95374 }, { "epoch": 0.5054738004742878, "grad_norm": 0.352113276720047, "learning_rate": 5.322872678537927e-06, "loss": 0.023986775428056717, "memory(GiB)": 21.48, "step": 15560, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953748 }, { "epoch": 0.5055062859370432, "grad_norm": 0.3021664619445801, "learning_rate": 5.322336644840181e-06, "loss": 0.01693178340792656, "memory(GiB)": 21.48, "step": 15561, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953756 }, { "epoch": 0.5055387713997986, "grad_norm": 0.405894011259079, "learning_rate": 5.32180060742224e-06, "loss": 0.019687118008732796, "memory(GiB)": 21.48, "step": 15562, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.953765 }, { "epoch": 0.505571256862554, "grad_norm": 0.5437832474708557, "learning_rate": 5.321264566290293e-06, "loss": 0.030922237783670425, "memory(GiB)": 21.48, "step": 15563, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953773 }, { "epoch": 0.5056037423253095, "grad_norm": 0.3118865191936493, "learning_rate": 5.3207285214505234e-06, "loss": 0.013092066161334515, "memory(GiB)": 21.48, "step": 15564, "token_acc": 0.992, "train_speed(iter/s)": 0.953781 }, { "epoch": 0.5056362277880648, "grad_norm": 0.29589489102363586, "learning_rate": 5.320192472909122e-06, "loss": 0.01852046698331833, "memory(GiB)": 21.48, "step": 15565, "token_acc": 1.0, "train_speed(iter/s)": 0.95379 }, { "epoch": 0.5056687132508203, "grad_norm": 0.32160839438438416, "learning_rate": 5.319656420672271e-06, "loss": 0.01530797965824604, "memory(GiB)": 21.48, "step": 15566, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.9538 }, { "epoch": 0.5057011987135757, "grad_norm": 0.429946631193161, "learning_rate": 5.319120364746162e-06, "loss": 0.02905198000371456, "memory(GiB)": 21.48, "step": 15567, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953809 }, { "epoch": 0.5057336841763311, "grad_norm": 0.34440454840660095, "learning_rate": 5.318584305136979e-06, "loss": 0.02518535777926445, "memory(GiB)": 21.48, "step": 15568, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953821 }, { "epoch": 0.5057661696390865, "grad_norm": 0.8095340728759766, "learning_rate": 5.318048241850907e-06, "loss": 0.023406006395816803, "memory(GiB)": 21.48, "step": 15569, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953833 }, { "epoch": 0.505798655101842, "grad_norm": 0.27658259868621826, "learning_rate": 5.317512174894136e-06, "loss": 0.019761182367801666, "memory(GiB)": 21.48, "step": 15570, "token_acc": 1.0, "train_speed(iter/s)": 0.953846 }, { "epoch": 0.5058311405645973, "grad_norm": 0.31414854526519775, "learning_rate": 5.316976104272852e-06, "loss": 0.014435925520956516, "memory(GiB)": 21.48, "step": 15571, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.953858 }, { "epoch": 0.5058636260273528, "grad_norm": 0.35137662291526794, "learning_rate": 5.316440029993242e-06, "loss": 0.01917203515768051, "memory(GiB)": 21.48, "step": 15572, "token_acc": 1.0, "train_speed(iter/s)": 0.953871 }, { "epoch": 0.5058961114901082, "grad_norm": 0.3307780623435974, "learning_rate": 5.315903952061492e-06, "loss": 0.02247151918709278, "memory(GiB)": 21.48, "step": 15573, "token_acc": 1.0, "train_speed(iter/s)": 0.953883 }, { "epoch": 0.5059285969528636, "grad_norm": 0.384051114320755, "learning_rate": 5.315367870483792e-06, "loss": 0.01670799031853676, "memory(GiB)": 21.48, "step": 15574, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.953896 }, { "epoch": 0.505961082415619, "grad_norm": 0.4328612983226776, "learning_rate": 5.314831785266325e-06, "loss": 0.0222143717110157, "memory(GiB)": 21.48, "step": 15575, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.953909 }, { "epoch": 0.5059935678783745, "grad_norm": 0.3648166060447693, "learning_rate": 5.314295696415279e-06, "loss": 0.018205169588327408, "memory(GiB)": 21.48, "step": 15576, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.953922 }, { "epoch": 0.5060260533411298, "grad_norm": 0.3024473488330841, "learning_rate": 5.3137596039368455e-06, "loss": 0.01589011587202549, "memory(GiB)": 21.48, "step": 15577, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.953935 }, { "epoch": 0.5060585388038853, "grad_norm": 0.2620236873626709, "learning_rate": 5.3132235078372066e-06, "loss": 0.018495963886380196, "memory(GiB)": 21.48, "step": 15578, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.5060910242666407, "grad_norm": 0.30883079767227173, "learning_rate": 5.312687408122552e-06, "loss": 0.014135613106191158, "memory(GiB)": 21.48, "step": 15579, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.953961 }, { "epoch": 0.5061235097293961, "grad_norm": 0.37191030383110046, "learning_rate": 5.312151304799068e-06, "loss": 0.024007130414247513, "memory(GiB)": 21.48, "step": 15580, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.953974 }, { "epoch": 0.5061559951921515, "grad_norm": 0.38402029871940613, "learning_rate": 5.311615197872944e-06, "loss": 0.01984604261815548, "memory(GiB)": 21.48, "step": 15581, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.953987 }, { "epoch": 0.506188480654907, "grad_norm": 0.49247556924819946, "learning_rate": 5.311079087350366e-06, "loss": 0.0224197655916214, "memory(GiB)": 21.48, "step": 15582, "token_acc": 1.0, "train_speed(iter/s)": 0.954 }, { "epoch": 0.5062209661176623, "grad_norm": 0.3773771822452545, "learning_rate": 5.310542973237519e-06, "loss": 0.020747389644384384, "memory(GiB)": 21.48, "step": 15583, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954012 }, { "epoch": 0.5062534515804178, "grad_norm": 0.3513532876968384, "learning_rate": 5.310006855540595e-06, "loss": 0.023487819358706474, "memory(GiB)": 21.48, "step": 15584, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.954026 }, { "epoch": 0.5062859370431732, "grad_norm": 0.3283049464225769, "learning_rate": 5.309470734265777e-06, "loss": 0.022750981152057648, "memory(GiB)": 21.48, "step": 15585, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.954038 }, { "epoch": 0.5063184225059286, "grad_norm": 0.3950612246990204, "learning_rate": 5.308934609419257e-06, "loss": 0.016772428527474403, "memory(GiB)": 21.48, "step": 15586, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.95405 }, { "epoch": 0.506350907968684, "grad_norm": 0.283926397562027, "learning_rate": 5.308398481007219e-06, "loss": 0.02190750278532505, "memory(GiB)": 21.48, "step": 15587, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.954064 }, { "epoch": 0.5063833934314395, "grad_norm": 0.33831480145454407, "learning_rate": 5.3078623490358525e-06, "loss": 0.021475834771990776, "memory(GiB)": 21.48, "step": 15588, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954076 }, { "epoch": 0.5064158788941948, "grad_norm": 0.7964510321617126, "learning_rate": 5.307326213511345e-06, "loss": 0.01898997277021408, "memory(GiB)": 21.48, "step": 15589, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954088 }, { "epoch": 0.5064483643569503, "grad_norm": 0.340313583612442, "learning_rate": 5.306790074439883e-06, "loss": 0.019697314128279686, "memory(GiB)": 21.48, "step": 15590, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954101 }, { "epoch": 0.5064808498197056, "grad_norm": 0.37008994817733765, "learning_rate": 5.306253931827656e-06, "loss": 0.02140049636363983, "memory(GiB)": 21.48, "step": 15591, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954114 }, { "epoch": 0.5065133352824611, "grad_norm": 0.4960850179195404, "learning_rate": 5.3057177856808505e-06, "loss": 0.026078686118125916, "memory(GiB)": 21.48, "step": 15592, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.954127 }, { "epoch": 0.5065458207452165, "grad_norm": 0.28365325927734375, "learning_rate": 5.305181636005656e-06, "loss": 0.018196988850831985, "memory(GiB)": 21.48, "step": 15593, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95414 }, { "epoch": 0.506578306207972, "grad_norm": 0.4139713644981384, "learning_rate": 5.304645482808257e-06, "loss": 0.021994195878505707, "memory(GiB)": 21.48, "step": 15594, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954152 }, { "epoch": 0.5066107916707273, "grad_norm": 0.6141822934150696, "learning_rate": 5.3041093260948464e-06, "loss": 0.023923851549625397, "memory(GiB)": 21.48, "step": 15595, "token_acc": 1.0, "train_speed(iter/s)": 0.954165 }, { "epoch": 0.5066432771334828, "grad_norm": 0.41520437598228455, "learning_rate": 5.303573165871608e-06, "loss": 0.022592846304178238, "memory(GiB)": 21.48, "step": 15596, "token_acc": 1.0, "train_speed(iter/s)": 0.954178 }, { "epoch": 0.5066757625962381, "grad_norm": 0.5256704688072205, "learning_rate": 5.303037002144731e-06, "loss": 0.023200156167149544, "memory(GiB)": 21.48, "step": 15597, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954189 }, { "epoch": 0.5067082480589936, "grad_norm": 0.379462867975235, "learning_rate": 5.302500834920405e-06, "loss": 0.019098840653896332, "memory(GiB)": 21.48, "step": 15598, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954202 }, { "epoch": 0.506740733521749, "grad_norm": 0.3943493068218231, "learning_rate": 5.301964664204815e-06, "loss": 0.022522777318954468, "memory(GiB)": 21.48, "step": 15599, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.5067732189845044, "grad_norm": 0.4873085021972656, "learning_rate": 5.3014284900041526e-06, "loss": 0.02507024258375168, "memory(GiB)": 21.48, "step": 15600, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.954228 }, { "epoch": 0.5068057044472598, "grad_norm": 0.31562861800193787, "learning_rate": 5.300892312324603e-06, "loss": 0.01735278032720089, "memory(GiB)": 21.48, "step": 15601, "token_acc": 0.9897435897435898, "train_speed(iter/s)": 0.954241 }, { "epoch": 0.5068381899100153, "grad_norm": 0.5067010521888733, "learning_rate": 5.300356131172356e-06, "loss": 0.027839334681630135, "memory(GiB)": 21.48, "step": 15602, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954254 }, { "epoch": 0.5068706753727706, "grad_norm": 0.3104734718799591, "learning_rate": 5.2998199465535985e-06, "loss": 0.02123490534722805, "memory(GiB)": 21.48, "step": 15603, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954266 }, { "epoch": 0.5069031608355261, "grad_norm": 0.3980279862880707, "learning_rate": 5.2992837584745205e-06, "loss": 0.023785142228007317, "memory(GiB)": 21.48, "step": 15604, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954279 }, { "epoch": 0.5069356462982815, "grad_norm": 0.3840377926826477, "learning_rate": 5.29874756694131e-06, "loss": 0.023075103759765625, "memory(GiB)": 21.48, "step": 15605, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954292 }, { "epoch": 0.5069681317610369, "grad_norm": 0.3254335820674896, "learning_rate": 5.2982113719601536e-06, "loss": 0.020301906391978264, "memory(GiB)": 21.48, "step": 15606, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.954305 }, { "epoch": 0.5070006172237923, "grad_norm": 0.35145387053489685, "learning_rate": 5.297675173537242e-06, "loss": 0.0203696358948946, "memory(GiB)": 21.48, "step": 15607, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.954317 }, { "epoch": 0.5070331026865478, "grad_norm": 0.3890244960784912, "learning_rate": 5.297138971678763e-06, "loss": 0.021102480590343475, "memory(GiB)": 21.48, "step": 15608, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.954327 }, { "epoch": 0.5070655881493031, "grad_norm": 0.4243115782737732, "learning_rate": 5.296602766390904e-06, "loss": 0.020562997087836266, "memory(GiB)": 21.48, "step": 15609, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.954337 }, { "epoch": 0.5070980736120586, "grad_norm": 0.31412529945373535, "learning_rate": 5.296066557679856e-06, "loss": 0.01635550893843174, "memory(GiB)": 21.48, "step": 15610, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954346 }, { "epoch": 0.507130559074814, "grad_norm": 0.6346181035041809, "learning_rate": 5.2955303455518045e-06, "loss": 0.02288024127483368, "memory(GiB)": 21.48, "step": 15611, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.954355 }, { "epoch": 0.5071630445375694, "grad_norm": 0.32641568779945374, "learning_rate": 5.294994130012941e-06, "loss": 0.019533520564436913, "memory(GiB)": 21.48, "step": 15612, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954363 }, { "epoch": 0.5071955300003248, "grad_norm": 0.3730071783065796, "learning_rate": 5.294457911069449e-06, "loss": 0.020442601293325424, "memory(GiB)": 21.48, "step": 15613, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.954371 }, { "epoch": 0.5072280154630803, "grad_norm": 0.3252478837966919, "learning_rate": 5.293921688727524e-06, "loss": 0.014143155887722969, "memory(GiB)": 21.48, "step": 15614, "token_acc": 1.0, "train_speed(iter/s)": 0.95438 }, { "epoch": 0.5072605009258356, "grad_norm": 0.40495312213897705, "learning_rate": 5.29338546299335e-06, "loss": 0.018420692533254623, "memory(GiB)": 21.48, "step": 15615, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954389 }, { "epoch": 0.5072929863885911, "grad_norm": 0.3454551100730896, "learning_rate": 5.292849233873119e-06, "loss": 0.016538012772798538, "memory(GiB)": 21.48, "step": 15616, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954398 }, { "epoch": 0.5073254718513466, "grad_norm": 0.3483426868915558, "learning_rate": 5.2923130013730165e-06, "loss": 0.019944123923778534, "memory(GiB)": 21.48, "step": 15617, "token_acc": 1.0, "train_speed(iter/s)": 0.954408 }, { "epoch": 0.5073579573141019, "grad_norm": 0.3759543001651764, "learning_rate": 5.291776765499234e-06, "loss": 0.02235039696097374, "memory(GiB)": 21.48, "step": 15618, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.5073904427768574, "grad_norm": 0.30187031626701355, "learning_rate": 5.291240526257958e-06, "loss": 0.013252762146294117, "memory(GiB)": 21.48, "step": 15619, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.5074229282396128, "grad_norm": 0.36303451657295227, "learning_rate": 5.2907042836553794e-06, "loss": 0.02497362531721592, "memory(GiB)": 21.48, "step": 15620, "token_acc": 1.0, "train_speed(iter/s)": 0.954433 }, { "epoch": 0.5074554137023682, "grad_norm": 0.37882307171821594, "learning_rate": 5.290168037697685e-06, "loss": 0.023605886846780777, "memory(GiB)": 21.48, "step": 15621, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.954442 }, { "epoch": 0.5074878991651236, "grad_norm": 0.3116704523563385, "learning_rate": 5.2896317883910665e-06, "loss": 0.018865123391151428, "memory(GiB)": 21.48, "step": 15622, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95445 }, { "epoch": 0.5075203846278791, "grad_norm": 0.513123095035553, "learning_rate": 5.2890955357417115e-06, "loss": 0.024397945031523705, "memory(GiB)": 21.48, "step": 15623, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.954457 }, { "epoch": 0.5075528700906344, "grad_norm": 0.4863262474536896, "learning_rate": 5.288559279755811e-06, "loss": 0.023043587803840637, "memory(GiB)": 21.48, "step": 15624, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.5075853555533899, "grad_norm": 0.398975133895874, "learning_rate": 5.288023020439549e-06, "loss": 0.019573209807276726, "memory(GiB)": 21.48, "step": 15625, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.954472 }, { "epoch": 0.5076178410161453, "grad_norm": 0.29013770818710327, "learning_rate": 5.2874867577991204e-06, "loss": 0.020854884758591652, "memory(GiB)": 21.48, "step": 15626, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954481 }, { "epoch": 0.5076503264789007, "grad_norm": 0.2907518744468689, "learning_rate": 5.286950491840711e-06, "loss": 0.016683422029018402, "memory(GiB)": 21.48, "step": 15627, "token_acc": 1.0, "train_speed(iter/s)": 0.954491 }, { "epoch": 0.5076828119416561, "grad_norm": 0.2486400008201599, "learning_rate": 5.2864142225705115e-06, "loss": 0.017319466918706894, "memory(GiB)": 21.48, "step": 15628, "token_acc": 1.0, "train_speed(iter/s)": 0.954499 }, { "epoch": 0.5077152974044116, "grad_norm": 0.4890594780445099, "learning_rate": 5.28587794999471e-06, "loss": 0.02285309135913849, "memory(GiB)": 21.48, "step": 15629, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954509 }, { "epoch": 0.5077477828671669, "grad_norm": 0.5428239703178406, "learning_rate": 5.285341674119497e-06, "loss": 0.021494567394256592, "memory(GiB)": 21.48, "step": 15630, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.954519 }, { "epoch": 0.5077802683299224, "grad_norm": 0.4223412275314331, "learning_rate": 5.284805394951061e-06, "loss": 0.02120041660964489, "memory(GiB)": 21.48, "step": 15631, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954529 }, { "epoch": 0.5078127537926778, "grad_norm": 0.47352519631385803, "learning_rate": 5.284269112495592e-06, "loss": 0.026619458571076393, "memory(GiB)": 21.48, "step": 15632, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954539 }, { "epoch": 0.5078452392554332, "grad_norm": 0.28287002444267273, "learning_rate": 5.2837328267592805e-06, "loss": 0.017872938886284828, "memory(GiB)": 21.48, "step": 15633, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954551 }, { "epoch": 0.5078777247181886, "grad_norm": 0.4047946631908417, "learning_rate": 5.283196537748312e-06, "loss": 0.02290964312851429, "memory(GiB)": 21.48, "step": 15634, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.954564 }, { "epoch": 0.5079102101809441, "grad_norm": 0.42224517464637756, "learning_rate": 5.28266024546888e-06, "loss": 0.024927105754613876, "memory(GiB)": 21.48, "step": 15635, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.954577 }, { "epoch": 0.5079426956436994, "grad_norm": 0.7331331968307495, "learning_rate": 5.282123949927172e-06, "loss": 0.031207630410790443, "memory(GiB)": 21.48, "step": 15636, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.954589 }, { "epoch": 0.5079751811064549, "grad_norm": 0.36198657751083374, "learning_rate": 5.281587651129377e-06, "loss": 0.01867726631462574, "memory(GiB)": 21.48, "step": 15637, "token_acc": 1.0, "train_speed(iter/s)": 0.954602 }, { "epoch": 0.5080076665692103, "grad_norm": 0.3511042892932892, "learning_rate": 5.281051349081688e-06, "loss": 0.01843916065990925, "memory(GiB)": 21.48, "step": 15638, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954614 }, { "epoch": 0.5080401520319657, "grad_norm": 0.4023081660270691, "learning_rate": 5.2805150437902895e-06, "loss": 0.024326195940375328, "memory(GiB)": 21.48, "step": 15639, "token_acc": 1.0, "train_speed(iter/s)": 0.954627 }, { "epoch": 0.5080726374947211, "grad_norm": 0.39345991611480713, "learning_rate": 5.2799787352613765e-06, "loss": 0.022109048441052437, "memory(GiB)": 21.48, "step": 15640, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95464 }, { "epoch": 0.5081051229574766, "grad_norm": 0.3432596027851105, "learning_rate": 5.279442423501135e-06, "loss": 0.023218875750899315, "memory(GiB)": 21.48, "step": 15641, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954653 }, { "epoch": 0.5081376084202319, "grad_norm": 0.2925454378128052, "learning_rate": 5.278906108515757e-06, "loss": 0.020443979650735855, "memory(GiB)": 21.48, "step": 15642, "token_acc": 0.995, "train_speed(iter/s)": 0.954665 }, { "epoch": 0.5081700938829874, "grad_norm": 0.5481588840484619, "learning_rate": 5.27836979031143e-06, "loss": 0.023808253929018974, "memory(GiB)": 21.48, "step": 15643, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954678 }, { "epoch": 0.5082025793457428, "grad_norm": 0.5524899363517761, "learning_rate": 5.277833468894347e-06, "loss": 0.02131737768650055, "memory(GiB)": 21.48, "step": 15644, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954691 }, { "epoch": 0.5082350648084982, "grad_norm": 0.3374369740486145, "learning_rate": 5.277297144270694e-06, "loss": 0.021764051169157028, "memory(GiB)": 21.48, "step": 15645, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.954704 }, { "epoch": 0.5082675502712536, "grad_norm": 0.2110702097415924, "learning_rate": 5.276760816446664e-06, "loss": 0.012086999602615833, "memory(GiB)": 21.48, "step": 15646, "token_acc": 1.0, "train_speed(iter/s)": 0.954717 }, { "epoch": 0.5083000357340091, "grad_norm": 0.27730023860931396, "learning_rate": 5.276224485428446e-06, "loss": 0.019299428910017014, "memory(GiB)": 21.48, "step": 15647, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.5083325211967644, "grad_norm": 0.40328043699264526, "learning_rate": 5.275688151222229e-06, "loss": 0.023345254361629486, "memory(GiB)": 21.48, "step": 15648, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954742 }, { "epoch": 0.5083650066595199, "grad_norm": 0.31152522563934326, "learning_rate": 5.275151813834205e-06, "loss": 0.019579757004976273, "memory(GiB)": 21.48, "step": 15649, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954755 }, { "epoch": 0.5083974921222753, "grad_norm": 0.40479782223701477, "learning_rate": 5.274615473270561e-06, "loss": 0.02412799745798111, "memory(GiB)": 21.48, "step": 15650, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.5084299775850307, "grad_norm": 0.3821665942668915, "learning_rate": 5.274079129537491e-06, "loss": 0.020323384553194046, "memory(GiB)": 21.48, "step": 15651, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954781 }, { "epoch": 0.5084624630477861, "grad_norm": 0.28779172897338867, "learning_rate": 5.27354278264118e-06, "loss": 0.014235740527510643, "memory(GiB)": 21.48, "step": 15652, "token_acc": 1.0, "train_speed(iter/s)": 0.954794 }, { "epoch": 0.5084949485105416, "grad_norm": 0.33065593242645264, "learning_rate": 5.273006432587824e-06, "loss": 0.023147542029619217, "memory(GiB)": 21.48, "step": 15653, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954808 }, { "epoch": 0.5085274339732969, "grad_norm": 0.32947173714637756, "learning_rate": 5.272470079383609e-06, "loss": 0.02306397259235382, "memory(GiB)": 21.48, "step": 15654, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95482 }, { "epoch": 0.5085599194360524, "grad_norm": 0.3621479868888855, "learning_rate": 5.2719337230347266e-06, "loss": 0.022111238911747932, "memory(GiB)": 21.48, "step": 15655, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.954832 }, { "epoch": 0.5085924048988077, "grad_norm": 0.34156087040901184, "learning_rate": 5.271397363547368e-06, "loss": 0.020231904461979866, "memory(GiB)": 21.48, "step": 15656, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954844 }, { "epoch": 0.5086248903615632, "grad_norm": 0.3766205608844757, "learning_rate": 5.270861000927722e-06, "loss": 0.014962034299969673, "memory(GiB)": 21.48, "step": 15657, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.5086573758243186, "grad_norm": 5.463540077209473, "learning_rate": 5.270324635181981e-06, "loss": 0.01385610830038786, "memory(GiB)": 21.48, "step": 15658, "token_acc": 1.0, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.508689861287074, "grad_norm": 0.27503690123558044, "learning_rate": 5.269788266316332e-06, "loss": 0.018409743905067444, "memory(GiB)": 21.48, "step": 15659, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954883 }, { "epoch": 0.5087223467498294, "grad_norm": 0.36077365279197693, "learning_rate": 5.269251894336968e-06, "loss": 0.02171185426414013, "memory(GiB)": 21.48, "step": 15660, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.954896 }, { "epoch": 0.5087548322125849, "grad_norm": 0.27108514308929443, "learning_rate": 5.268715519250078e-06, "loss": 0.015992481261491776, "memory(GiB)": 21.48, "step": 15661, "token_acc": 1.0, "train_speed(iter/s)": 0.954909 }, { "epoch": 0.5087873176753402, "grad_norm": 0.36505287885665894, "learning_rate": 5.268179141061854e-06, "loss": 0.026314949616789818, "memory(GiB)": 21.48, "step": 15662, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.954921 }, { "epoch": 0.5088198031380957, "grad_norm": 0.3512277603149414, "learning_rate": 5.267642759778488e-06, "loss": 0.0223254207521677, "memory(GiB)": 21.48, "step": 15663, "token_acc": 1.0, "train_speed(iter/s)": 0.954934 }, { "epoch": 0.5088522886008511, "grad_norm": 0.3597261607646942, "learning_rate": 5.267106375406165e-06, "loss": 0.02041918784379959, "memory(GiB)": 21.48, "step": 15664, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954948 }, { "epoch": 0.5088847740636065, "grad_norm": 0.47980013489723206, "learning_rate": 5.26656998795108e-06, "loss": 0.024426132440567017, "memory(GiB)": 21.48, "step": 15665, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95496 }, { "epoch": 0.5089172595263619, "grad_norm": 0.45746347308158875, "learning_rate": 5.266033597419422e-06, "loss": 0.03037315048277378, "memory(GiB)": 21.48, "step": 15666, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954972 }, { "epoch": 0.5089497449891174, "grad_norm": 0.4006723463535309, "learning_rate": 5.265497203817383e-06, "loss": 0.025265956297516823, "memory(GiB)": 21.48, "step": 15667, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954982 }, { "epoch": 0.5089822304518727, "grad_norm": 0.4628917872905731, "learning_rate": 5.264960807151152e-06, "loss": 0.029282012954354286, "memory(GiB)": 21.48, "step": 15668, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954991 }, { "epoch": 0.5090147159146282, "grad_norm": 0.36561569571495056, "learning_rate": 5.264424407426922e-06, "loss": 0.019338056445121765, "memory(GiB)": 21.48, "step": 15669, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955001 }, { "epoch": 0.5090472013773836, "grad_norm": 0.34005820751190186, "learning_rate": 5.263888004650881e-06, "loss": 0.024756532162427902, "memory(GiB)": 21.48, "step": 15670, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955011 }, { "epoch": 0.509079686840139, "grad_norm": 0.4085652530193329, "learning_rate": 5.263351598829222e-06, "loss": 0.023134391754865646, "memory(GiB)": 21.48, "step": 15671, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.5091121723028944, "grad_norm": 0.3098052144050598, "learning_rate": 5.262815189968135e-06, "loss": 0.01949426904320717, "memory(GiB)": 21.48, "step": 15672, "token_acc": 0.984, "train_speed(iter/s)": 0.955028 }, { "epoch": 0.5091446577656499, "grad_norm": 0.2933368682861328, "learning_rate": 5.262278778073811e-06, "loss": 0.01574202999472618, "memory(GiB)": 21.48, "step": 15673, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955036 }, { "epoch": 0.5091771432284052, "grad_norm": 0.42021024227142334, "learning_rate": 5.26174236315244e-06, "loss": 0.0217200368642807, "memory(GiB)": 21.48, "step": 15674, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.5092096286911607, "grad_norm": 0.2730364203453064, "learning_rate": 5.261205945210214e-06, "loss": 0.01810571551322937, "memory(GiB)": 21.48, "step": 15675, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.5092421141539161, "grad_norm": 0.28667494654655457, "learning_rate": 5.260669524253325e-06, "loss": 0.014579791575670242, "memory(GiB)": 21.48, "step": 15676, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.955061 }, { "epoch": 0.5092745996166715, "grad_norm": 0.4535976052284241, "learning_rate": 5.260133100287962e-06, "loss": 0.02101556584239006, "memory(GiB)": 21.48, "step": 15677, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95507 }, { "epoch": 0.5093070850794269, "grad_norm": 0.3137589991092682, "learning_rate": 5.259596673320316e-06, "loss": 0.015239602886140347, "memory(GiB)": 21.48, "step": 15678, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.5093395705421824, "grad_norm": 0.3328092098236084, "learning_rate": 5.25906024335658e-06, "loss": 0.015110727399587631, "memory(GiB)": 21.48, "step": 15679, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955087 }, { "epoch": 0.5093720560049378, "grad_norm": 0.2680187523365021, "learning_rate": 5.258523810402942e-06, "loss": 0.02182805724442005, "memory(GiB)": 21.48, "step": 15680, "token_acc": 1.0, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.5094045414676932, "grad_norm": 0.37868180871009827, "learning_rate": 5.257987374465598e-06, "loss": 0.029027707874774933, "memory(GiB)": 21.48, "step": 15681, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.5094370269304487, "grad_norm": 0.5023783445358276, "learning_rate": 5.2574509355507345e-06, "loss": 0.030335988849401474, "memory(GiB)": 21.48, "step": 15682, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.509469512393204, "grad_norm": 0.35532817244529724, "learning_rate": 5.256914493664544e-06, "loss": 0.014111438766121864, "memory(GiB)": 21.48, "step": 15683, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955121 }, { "epoch": 0.5095019978559595, "grad_norm": 0.573947548866272, "learning_rate": 5.2563780488132196e-06, "loss": 0.026482073590159416, "memory(GiB)": 21.48, "step": 15684, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.5095344833187149, "grad_norm": 0.32311272621154785, "learning_rate": 5.255841601002951e-06, "loss": 0.021180108189582825, "memory(GiB)": 21.48, "step": 15685, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955137 }, { "epoch": 0.5095669687814703, "grad_norm": 0.7974959015846252, "learning_rate": 5.25530515023993e-06, "loss": 0.02474675327539444, "memory(GiB)": 21.48, "step": 15686, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955146 }, { "epoch": 0.5095994542442257, "grad_norm": 0.311577707529068, "learning_rate": 5.254768696530347e-06, "loss": 0.015231925994157791, "memory(GiB)": 21.48, "step": 15687, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.955156 }, { "epoch": 0.5096319397069812, "grad_norm": 0.4238414466381073, "learning_rate": 5.254232239880394e-06, "loss": 0.02050921320915222, "memory(GiB)": 21.48, "step": 15688, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.5096644251697365, "grad_norm": 0.413932204246521, "learning_rate": 5.253695780296263e-06, "loss": 0.020422179251909256, "memory(GiB)": 21.48, "step": 15689, "token_acc": 1.0, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.509696910632492, "grad_norm": 0.28249457478523254, "learning_rate": 5.253159317784146e-06, "loss": 0.022218823432922363, "memory(GiB)": 21.48, "step": 15690, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955185 }, { "epoch": 0.5097293960952474, "grad_norm": 0.39966562390327454, "learning_rate": 5.2526228523502335e-06, "loss": 0.023477736860513687, "memory(GiB)": 21.48, "step": 15691, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955194 }, { "epoch": 0.5097618815580028, "grad_norm": 0.362074613571167, "learning_rate": 5.2520863840007156e-06, "loss": 0.027954861521720886, "memory(GiB)": 21.48, "step": 15692, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.5097943670207582, "grad_norm": 0.3111141622066498, "learning_rate": 5.251549912741787e-06, "loss": 0.015372619032859802, "memory(GiB)": 21.48, "step": 15693, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955215 }, { "epoch": 0.5098268524835137, "grad_norm": 0.33566340804100037, "learning_rate": 5.251013438579635e-06, "loss": 0.01884300634264946, "memory(GiB)": 21.48, "step": 15694, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955225 }, { "epoch": 0.509859337946269, "grad_norm": 0.32270392775535583, "learning_rate": 5.2504769615204566e-06, "loss": 0.014695094898343086, "memory(GiB)": 21.48, "step": 15695, "token_acc": 1.0, "train_speed(iter/s)": 0.955236 }, { "epoch": 0.5098918234090245, "grad_norm": 0.3542906641960144, "learning_rate": 5.2499404815704395e-06, "loss": 0.019511140882968903, "memory(GiB)": 21.48, "step": 15696, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.5099243088717799, "grad_norm": 0.4315028786659241, "learning_rate": 5.2494039987357756e-06, "loss": 0.0324297770857811, "memory(GiB)": 21.48, "step": 15697, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955262 }, { "epoch": 0.5099567943345353, "grad_norm": 0.45129334926605225, "learning_rate": 5.248867513022658e-06, "loss": 0.019130362197756767, "memory(GiB)": 21.48, "step": 15698, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.5099892797972907, "grad_norm": 0.3013083040714264, "learning_rate": 5.248331024437278e-06, "loss": 0.01718355342745781, "memory(GiB)": 21.48, "step": 15699, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.955288 }, { "epoch": 0.5100217652600462, "grad_norm": 0.5081890225410461, "learning_rate": 5.2477945329858295e-06, "loss": 0.02233206480741501, "memory(GiB)": 21.48, "step": 15700, "token_acc": 1.0, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.5100542507228015, "grad_norm": 0.40313786268234253, "learning_rate": 5.2472580386745e-06, "loss": 0.025828277692198753, "memory(GiB)": 21.48, "step": 15701, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.510086736185557, "grad_norm": 0.419904500246048, "learning_rate": 5.2467215415094854e-06, "loss": 0.022730758413672447, "memory(GiB)": 21.48, "step": 15702, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955326 }, { "epoch": 0.5101192216483124, "grad_norm": 0.38807904720306396, "learning_rate": 5.246185041496974e-06, "loss": 0.017340321093797684, "memory(GiB)": 21.48, "step": 15703, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.5101517071110678, "grad_norm": 0.38309261202812195, "learning_rate": 5.245648538643161e-06, "loss": 0.02398441731929779, "memory(GiB)": 21.48, "step": 15704, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955352 }, { "epoch": 0.5101841925738232, "grad_norm": 0.3700752258300781, "learning_rate": 5.245112032954236e-06, "loss": 0.016731735318899155, "memory(GiB)": 21.48, "step": 15705, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955365 }, { "epoch": 0.5102166780365787, "grad_norm": 0.3718193769454956, "learning_rate": 5.2445755244363925e-06, "loss": 0.0224200077354908, "memory(GiB)": 21.48, "step": 15706, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955377 }, { "epoch": 0.510249163499334, "grad_norm": 0.3201198875904083, "learning_rate": 5.2440390130958215e-06, "loss": 0.018581729382276535, "memory(GiB)": 21.48, "step": 15707, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.5102816489620895, "grad_norm": 0.26263320446014404, "learning_rate": 5.243502498938715e-06, "loss": 0.018053364008665085, "memory(GiB)": 21.48, "step": 15708, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.5103141344248449, "grad_norm": 0.6666492819786072, "learning_rate": 5.242965981971268e-06, "loss": 0.023466695100069046, "memory(GiB)": 21.48, "step": 15709, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.955417 }, { "epoch": 0.5103466198876003, "grad_norm": 0.5127735733985901, "learning_rate": 5.242429462199666e-06, "loss": 0.029880670830607414, "memory(GiB)": 21.48, "step": 15710, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955429 }, { "epoch": 0.5103791053503557, "grad_norm": 0.3219946622848511, "learning_rate": 5.241892939630109e-06, "loss": 0.02621876448392868, "memory(GiB)": 21.48, "step": 15711, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955443 }, { "epoch": 0.5104115908131112, "grad_norm": 0.3092971444129944, "learning_rate": 5.241356414268783e-06, "loss": 0.019492480903863907, "memory(GiB)": 21.48, "step": 15712, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.955456 }, { "epoch": 0.5104440762758665, "grad_norm": 0.4248868525028229, "learning_rate": 5.240819886121885e-06, "loss": 0.02220415323972702, "memory(GiB)": 21.48, "step": 15713, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955469 }, { "epoch": 0.510476561738622, "grad_norm": 0.29978272318840027, "learning_rate": 5.240283355195603e-06, "loss": 0.02444029413163662, "memory(GiB)": 21.48, "step": 15714, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.955482 }, { "epoch": 0.5105090472013774, "grad_norm": 0.34227606654167175, "learning_rate": 5.239746821496132e-06, "loss": 0.019230710342526436, "memory(GiB)": 21.48, "step": 15715, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.5105415326641328, "grad_norm": 0.3530951142311096, "learning_rate": 5.2392102850296645e-06, "loss": 0.02023683674633503, "memory(GiB)": 21.48, "step": 15716, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955508 }, { "epoch": 0.5105740181268882, "grad_norm": 0.33417990803718567, "learning_rate": 5.23867374580239e-06, "loss": 0.015643982216715813, "memory(GiB)": 21.48, "step": 15717, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.5106065035896437, "grad_norm": 0.3043023645877838, "learning_rate": 5.238137203820505e-06, "loss": 0.019461311399936676, "memory(GiB)": 21.48, "step": 15718, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955532 }, { "epoch": 0.510638989052399, "grad_norm": 0.319145530462265, "learning_rate": 5.237600659090199e-06, "loss": 0.013691076077520847, "memory(GiB)": 21.48, "step": 15719, "token_acc": 1.0, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.5106714745151545, "grad_norm": 0.30119994282722473, "learning_rate": 5.237064111617664e-06, "loss": 0.021241527050733566, "memory(GiB)": 21.48, "step": 15720, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955558 }, { "epoch": 0.5107039599779098, "grad_norm": 0.44866517186164856, "learning_rate": 5.236527561409096e-06, "loss": 0.02150370366871357, "memory(GiB)": 21.48, "step": 15721, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.5107364454406653, "grad_norm": 0.31248533725738525, "learning_rate": 5.235991008470683e-06, "loss": 0.02675669454038143, "memory(GiB)": 21.48, "step": 15722, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955584 }, { "epoch": 0.5107689309034207, "grad_norm": 0.37651297450065613, "learning_rate": 5.2354544528086205e-06, "loss": 0.024742189794778824, "memory(GiB)": 21.48, "step": 15723, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.5108014163661762, "grad_norm": 0.39782366156578064, "learning_rate": 5.234917894429099e-06, "loss": 0.023095745593309402, "memory(GiB)": 21.48, "step": 15724, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.5108339018289315, "grad_norm": 0.3798416554927826, "learning_rate": 5.2343813333383145e-06, "loss": 0.026216935366392136, "memory(GiB)": 21.48, "step": 15725, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.510866387291687, "grad_norm": 0.39207395911216736, "learning_rate": 5.233844769542456e-06, "loss": 0.021669313311576843, "memory(GiB)": 21.48, "step": 15726, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.5108988727544423, "grad_norm": 0.5559450387954712, "learning_rate": 5.233308203047718e-06, "loss": 0.018058080226182938, "memory(GiB)": 21.48, "step": 15727, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955645 }, { "epoch": 0.5109313582171978, "grad_norm": 0.46975794434547424, "learning_rate": 5.232771633860293e-06, "loss": 0.021833114326000214, "memory(GiB)": 21.48, "step": 15728, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.5109638436799532, "grad_norm": 0.2265448123216629, "learning_rate": 5.232235061986371e-06, "loss": 0.016098378226161003, "memory(GiB)": 21.48, "step": 15729, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.5109963291427086, "grad_norm": 0.41904160380363464, "learning_rate": 5.231698487432151e-06, "loss": 0.03073825128376484, "memory(GiB)": 21.48, "step": 15730, "token_acc": 0.9829351535836177, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.511028814605464, "grad_norm": 0.36045563220977783, "learning_rate": 5.231161910203819e-06, "loss": 0.021358488127589226, "memory(GiB)": 21.48, "step": 15731, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.5110613000682195, "grad_norm": 0.42458391189575195, "learning_rate": 5.230625330307573e-06, "loss": 0.024008996784687042, "memory(GiB)": 21.48, "step": 15732, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.5110937855309748, "grad_norm": 0.34631145000457764, "learning_rate": 5.230088747749604e-06, "loss": 0.018824728205800056, "memory(GiB)": 21.48, "step": 15733, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.5111262709937303, "grad_norm": 0.39271846413612366, "learning_rate": 5.229552162536103e-06, "loss": 0.01642542891204357, "memory(GiB)": 21.48, "step": 15734, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.5111587564564857, "grad_norm": 0.7896620631217957, "learning_rate": 5.229015574673266e-06, "loss": 0.023927180096507072, "memory(GiB)": 21.48, "step": 15735, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.5111912419192411, "grad_norm": 0.24740737676620483, "learning_rate": 5.228478984167283e-06, "loss": 0.02025223709642887, "memory(GiB)": 21.48, "step": 15736, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95573 }, { "epoch": 0.5112237273819965, "grad_norm": 0.36083176732063293, "learning_rate": 5.22794239102435e-06, "loss": 0.023080959916114807, "memory(GiB)": 21.48, "step": 15737, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955739 }, { "epoch": 0.511256212844752, "grad_norm": 0.3817262351512909, "learning_rate": 5.227405795250657e-06, "loss": 0.019547000527381897, "memory(GiB)": 21.48, "step": 15738, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955748 }, { "epoch": 0.5112886983075073, "grad_norm": 0.401451051235199, "learning_rate": 5.2268691968523984e-06, "loss": 0.021032724529504776, "memory(GiB)": 21.48, "step": 15739, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.5113211837702628, "grad_norm": 0.33479204773902893, "learning_rate": 5.226332595835768e-06, "loss": 0.0221319030970335, "memory(GiB)": 21.48, "step": 15740, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955766 }, { "epoch": 0.5113536692330182, "grad_norm": 0.3899191617965698, "learning_rate": 5.225795992206959e-06, "loss": 0.019654158502817154, "memory(GiB)": 21.48, "step": 15741, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.5113861546957736, "grad_norm": 0.28834548592567444, "learning_rate": 5.2252593859721636e-06, "loss": 0.018017562106251717, "memory(GiB)": 21.48, "step": 15742, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.511418640158529, "grad_norm": 0.4489292502403259, "learning_rate": 5.224722777137573e-06, "loss": 0.023704947903752327, "memory(GiB)": 21.48, "step": 15743, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.955792 }, { "epoch": 0.5114511256212845, "grad_norm": 0.4505940079689026, "learning_rate": 5.224186165709384e-06, "loss": 0.01674017496407032, "memory(GiB)": 21.48, "step": 15744, "token_acc": 1.0, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.51148361108404, "grad_norm": 0.3809649646282196, "learning_rate": 5.223649551693787e-06, "loss": 0.02202131226658821, "memory(GiB)": 21.48, "step": 15745, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.95581 }, { "epoch": 0.5115160965467953, "grad_norm": 0.28975191712379456, "learning_rate": 5.223112935096978e-06, "loss": 0.011953880079090595, "memory(GiB)": 21.48, "step": 15746, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.5115485820095508, "grad_norm": 0.4913301169872284, "learning_rate": 5.222576315925148e-06, "loss": 0.017159707844257355, "memory(GiB)": 21.48, "step": 15747, "token_acc": 1.0, "train_speed(iter/s)": 0.955829 }, { "epoch": 0.5115810674723061, "grad_norm": 0.31958043575286865, "learning_rate": 5.222039694184491e-06, "loss": 0.01631784997880459, "memory(GiB)": 21.48, "step": 15748, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.5116135529350616, "grad_norm": 0.26249682903289795, "learning_rate": 5.221503069881202e-06, "loss": 0.01723342388868332, "memory(GiB)": 21.48, "step": 15749, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955849 }, { "epoch": 0.511646038397817, "grad_norm": 0.3887576460838318, "learning_rate": 5.22096644302147e-06, "loss": 0.021222321316599846, "memory(GiB)": 21.48, "step": 15750, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.5116785238605724, "grad_norm": 0.29266029596328735, "learning_rate": 5.220429813611494e-06, "loss": 0.020554181188344955, "memory(GiB)": 21.48, "step": 15751, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.5117110093233278, "grad_norm": 0.391315758228302, "learning_rate": 5.219893181657461e-06, "loss": 0.01749880239367485, "memory(GiB)": 21.48, "step": 15752, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.5117434947860833, "grad_norm": 0.32052549719810486, "learning_rate": 5.219356547165572e-06, "loss": 0.014432773925364017, "memory(GiB)": 21.48, "step": 15753, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955892 }, { "epoch": 0.5117759802488386, "grad_norm": 0.4911515712738037, "learning_rate": 5.218819910142013e-06, "loss": 0.02274201065301895, "memory(GiB)": 21.48, "step": 15754, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.5118084657115941, "grad_norm": 0.4545692503452301, "learning_rate": 5.218283270592984e-06, "loss": 0.023130251094698906, "memory(GiB)": 21.48, "step": 15755, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95591 }, { "epoch": 0.5118409511743495, "grad_norm": 0.4337124824523926, "learning_rate": 5.2177466285246744e-06, "loss": 0.02638724260032177, "memory(GiB)": 21.48, "step": 15756, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.5118734366371049, "grad_norm": 0.3169197142124176, "learning_rate": 5.217209983943279e-06, "loss": 0.018371988087892532, "memory(GiB)": 21.48, "step": 15757, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.955932 }, { "epoch": 0.5119059220998603, "grad_norm": 0.40933018922805786, "learning_rate": 5.21667333685499e-06, "loss": 0.02490065060555935, "memory(GiB)": 21.48, "step": 15758, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.955943 }, { "epoch": 0.5119384075626158, "grad_norm": 0.3462646007537842, "learning_rate": 5.216136687266004e-06, "loss": 0.021076582372188568, "memory(GiB)": 21.48, "step": 15759, "token_acc": 0.9730941704035875, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.5119708930253711, "grad_norm": 0.40552037954330444, "learning_rate": 5.215600035182511e-06, "loss": 0.023384902626276016, "memory(GiB)": 21.48, "step": 15760, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.5120033784881266, "grad_norm": 0.38320815563201904, "learning_rate": 5.215063380610709e-06, "loss": 0.022921759635210037, "memory(GiB)": 21.48, "step": 15761, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.955979 }, { "epoch": 0.512035863950882, "grad_norm": 0.5495309233665466, "learning_rate": 5.214526723556786e-06, "loss": 0.02089579962193966, "memory(GiB)": 21.48, "step": 15762, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.5120683494136374, "grad_norm": 0.2929103374481201, "learning_rate": 5.213990064026942e-06, "loss": 0.015935784205794334, "memory(GiB)": 21.48, "step": 15763, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.5121008348763928, "grad_norm": 0.47436732053756714, "learning_rate": 5.213453402027367e-06, "loss": 0.031513892114162445, "memory(GiB)": 21.48, "step": 15764, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956017 }, { "epoch": 0.5121333203391483, "grad_norm": 0.44418585300445557, "learning_rate": 5.212916737564257e-06, "loss": 0.027336765080690384, "memory(GiB)": 21.48, "step": 15765, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.5121658058019036, "grad_norm": 0.3216679096221924, "learning_rate": 5.212380070643802e-06, "loss": 0.024755749851465225, "memory(GiB)": 21.48, "step": 15766, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.5121982912646591, "grad_norm": 0.3856346607208252, "learning_rate": 5.2118434012722e-06, "loss": 0.020063702017068863, "memory(GiB)": 21.48, "step": 15767, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.5122307767274145, "grad_norm": 0.24413354694843292, "learning_rate": 5.211306729455642e-06, "loss": 0.01204829104244709, "memory(GiB)": 21.48, "step": 15768, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.5122632621901699, "grad_norm": 0.33321869373321533, "learning_rate": 5.210770055200325e-06, "loss": 0.021065790206193924, "memory(GiB)": 21.48, "step": 15769, "token_acc": 1.0, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.5122957476529253, "grad_norm": 0.5308170318603516, "learning_rate": 5.210233378512439e-06, "loss": 0.018264755606651306, "memory(GiB)": 21.48, "step": 15770, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.5123282331156808, "grad_norm": 0.26920193433761597, "learning_rate": 5.209696699398181e-06, "loss": 0.015869908034801483, "memory(GiB)": 21.48, "step": 15771, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.5123607185784361, "grad_norm": 0.2537499964237213, "learning_rate": 5.209160017863743e-06, "loss": 0.012793175876140594, "memory(GiB)": 21.48, "step": 15772, "token_acc": 0.9875, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.5123932040411916, "grad_norm": 0.4090891182422638, "learning_rate": 5.208623333915321e-06, "loss": 0.023947324603796005, "memory(GiB)": 21.48, "step": 15773, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.512425689503947, "grad_norm": 0.34713757038116455, "learning_rate": 5.208086647559108e-06, "loss": 0.019406761974096298, "memory(GiB)": 21.48, "step": 15774, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956146 }, { "epoch": 0.5124581749667024, "grad_norm": 0.3092705011367798, "learning_rate": 5.207549958801298e-06, "loss": 0.01578223705291748, "memory(GiB)": 21.48, "step": 15775, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956159 }, { "epoch": 0.5124906604294578, "grad_norm": 0.5270521640777588, "learning_rate": 5.207013267648085e-06, "loss": 0.0280500166118145, "memory(GiB)": 21.48, "step": 15776, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.5125231458922133, "grad_norm": 0.41913390159606934, "learning_rate": 5.206476574105663e-06, "loss": 0.022487740963697433, "memory(GiB)": 21.48, "step": 15777, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.5125556313549686, "grad_norm": 0.4482034742832184, "learning_rate": 5.205939878180227e-06, "loss": 0.02177353948354721, "memory(GiB)": 21.48, "step": 15778, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956198 }, { "epoch": 0.5125881168177241, "grad_norm": 0.6852365732192993, "learning_rate": 5.205403179877972e-06, "loss": 0.01750887930393219, "memory(GiB)": 21.48, "step": 15779, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.5126206022804795, "grad_norm": 0.3602536618709564, "learning_rate": 5.204866479205089e-06, "loss": 0.018352925777435303, "memory(GiB)": 21.48, "step": 15780, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.5126530877432349, "grad_norm": 0.4033580720424652, "learning_rate": 5.204329776167775e-06, "loss": 0.01989062689244747, "memory(GiB)": 21.48, "step": 15781, "token_acc": 0.9818840579710145, "train_speed(iter/s)": 0.956237 }, { "epoch": 0.5126855732059903, "grad_norm": 0.4990328848361969, "learning_rate": 5.203793070772222e-06, "loss": 0.027044374495744705, "memory(GiB)": 21.48, "step": 15782, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.5127180586687458, "grad_norm": 0.3317500650882721, "learning_rate": 5.203256363024628e-06, "loss": 0.021667350083589554, "memory(GiB)": 21.48, "step": 15783, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.5127505441315011, "grad_norm": 0.32613715529441833, "learning_rate": 5.202719652931184e-06, "loss": 0.015715446323156357, "memory(GiB)": 21.48, "step": 15784, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.5127830295942566, "grad_norm": 0.3363941013813019, "learning_rate": 5.202182940498085e-06, "loss": 0.019942712038755417, "memory(GiB)": 21.48, "step": 15785, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.512815515057012, "grad_norm": 0.37880784273147583, "learning_rate": 5.201646225731527e-06, "loss": 0.015245284885168076, "memory(GiB)": 21.48, "step": 15786, "token_acc": 1.0, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.5128480005197674, "grad_norm": 0.402646005153656, "learning_rate": 5.201109508637702e-06, "loss": 0.02038365975022316, "memory(GiB)": 21.48, "step": 15787, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956313 }, { "epoch": 0.5128804859825228, "grad_norm": 0.4250907003879547, "learning_rate": 5.200572789222805e-06, "loss": 0.01868262141942978, "memory(GiB)": 21.48, "step": 15788, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956322 }, { "epoch": 0.5129129714452783, "grad_norm": 0.5107430815696716, "learning_rate": 5.2000360674930315e-06, "loss": 0.02952537313103676, "memory(GiB)": 21.48, "step": 15789, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956331 }, { "epoch": 0.5129454569080336, "grad_norm": 0.3276418447494507, "learning_rate": 5.199499343454577e-06, "loss": 0.01605577953159809, "memory(GiB)": 21.48, "step": 15790, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.5129779423707891, "grad_norm": 0.36944225430488586, "learning_rate": 5.198962617113633e-06, "loss": 0.017194103449583054, "memory(GiB)": 21.48, "step": 15791, "token_acc": 1.0, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.5130104278335444, "grad_norm": 0.3492443263530731, "learning_rate": 5.198425888476395e-06, "loss": 0.020019974559545517, "memory(GiB)": 21.48, "step": 15792, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.5130429132962999, "grad_norm": 0.3879531919956207, "learning_rate": 5.19788915754906e-06, "loss": 0.016462106257677078, "memory(GiB)": 21.48, "step": 15793, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.5130753987590553, "grad_norm": 0.2809964120388031, "learning_rate": 5.197352424337819e-06, "loss": 0.016756178811192513, "memory(GiB)": 21.48, "step": 15794, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.5131078842218108, "grad_norm": 0.3775944113731384, "learning_rate": 5.196815688848871e-06, "loss": 0.02219526469707489, "memory(GiB)": 21.48, "step": 15795, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.5131403696845661, "grad_norm": 0.29282110929489136, "learning_rate": 5.196278951088405e-06, "loss": 0.02101563848555088, "memory(GiB)": 21.48, "step": 15796, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.5131728551473216, "grad_norm": 0.38973158597946167, "learning_rate": 5.195742211062621e-06, "loss": 0.023974452167749405, "memory(GiB)": 21.48, "step": 15797, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.5132053406100769, "grad_norm": 0.5307965278625488, "learning_rate": 5.195205468777711e-06, "loss": 0.0247565396130085, "memory(GiB)": 21.48, "step": 15798, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.5132378260728324, "grad_norm": 0.4375476539134979, "learning_rate": 5.194668724239869e-06, "loss": 0.02034638449549675, "memory(GiB)": 21.48, "step": 15799, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956423 }, { "epoch": 0.5132703115355878, "grad_norm": 0.3805173933506012, "learning_rate": 5.1941319774552914e-06, "loss": 0.025033380836248398, "memory(GiB)": 21.48, "step": 15800, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.5133027969983432, "grad_norm": 0.4050992429256439, "learning_rate": 5.193595228430171e-06, "loss": 0.02009809948503971, "memory(GiB)": 21.48, "step": 15801, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.5133352824610986, "grad_norm": 0.38553890585899353, "learning_rate": 5.193058477170706e-06, "loss": 0.025104809552431107, "memory(GiB)": 21.48, "step": 15802, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.5133677679238541, "grad_norm": 0.3525184392929077, "learning_rate": 5.192521723683087e-06, "loss": 0.019870122894644737, "memory(GiB)": 21.48, "step": 15803, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.5134002533866094, "grad_norm": 0.4680013656616211, "learning_rate": 5.191984967973513e-06, "loss": 0.029077328741550446, "memory(GiB)": 21.48, "step": 15804, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956465 }, { "epoch": 0.5134327388493649, "grad_norm": 0.5031686425209045, "learning_rate": 5.191448210048177e-06, "loss": 0.028056912124156952, "memory(GiB)": 21.48, "step": 15805, "token_acc": 0.9754901960784313, "train_speed(iter/s)": 0.956475 }, { "epoch": 0.5134652243121203, "grad_norm": 0.39301586151123047, "learning_rate": 5.190911449913273e-06, "loss": 0.016737075522542, "memory(GiB)": 21.48, "step": 15806, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.5134977097748757, "grad_norm": 0.3911978602409363, "learning_rate": 5.190374687574996e-06, "loss": 0.028293317183852196, "memory(GiB)": 21.48, "step": 15807, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956494 }, { "epoch": 0.5135301952376312, "grad_norm": 0.45991891622543335, "learning_rate": 5.189837923039542e-06, "loss": 0.02371399477124214, "memory(GiB)": 21.48, "step": 15808, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956504 }, { "epoch": 0.5135626807003866, "grad_norm": 0.34909215569496155, "learning_rate": 5.189301156313108e-06, "loss": 0.018942540511488914, "memory(GiB)": 21.48, "step": 15809, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.513595166163142, "grad_norm": 0.325289785861969, "learning_rate": 5.188764387401885e-06, "loss": 0.023071758449077606, "memory(GiB)": 21.48, "step": 15810, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.5136276516258974, "grad_norm": 0.3395022749900818, "learning_rate": 5.188227616312069e-06, "loss": 0.020147990435361862, "memory(GiB)": 21.48, "step": 15811, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956533 }, { "epoch": 0.5136601370886529, "grad_norm": 0.28627005219459534, "learning_rate": 5.1876908430498575e-06, "loss": 0.01910238340497017, "memory(GiB)": 21.48, "step": 15812, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.5136926225514082, "grad_norm": 0.367786169052124, "learning_rate": 5.187154067621443e-06, "loss": 0.025395646691322327, "memory(GiB)": 21.48, "step": 15813, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.5137251080141637, "grad_norm": 0.24881455302238464, "learning_rate": 5.186617290033021e-06, "loss": 0.024615846574306488, "memory(GiB)": 21.48, "step": 15814, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.5137575934769191, "grad_norm": 0.37409257888793945, "learning_rate": 5.186080510290787e-06, "loss": 0.01721205562353134, "memory(GiB)": 21.48, "step": 15815, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956574 }, { "epoch": 0.5137900789396745, "grad_norm": 0.35625237226486206, "learning_rate": 5.185543728400938e-06, "loss": 0.017158588394522667, "memory(GiB)": 21.48, "step": 15816, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956584 }, { "epoch": 0.5138225644024299, "grad_norm": 0.3003169298171997, "learning_rate": 5.185006944369665e-06, "loss": 0.01859159581363201, "memory(GiB)": 21.48, "step": 15817, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.5138550498651854, "grad_norm": 0.3537934124469757, "learning_rate": 5.184470158203167e-06, "loss": 0.020889725536108017, "memory(GiB)": 21.48, "step": 15818, "token_acc": 1.0, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.5138875353279407, "grad_norm": 0.30950674414634705, "learning_rate": 5.183933369907638e-06, "loss": 0.021119993180036545, "memory(GiB)": 21.48, "step": 15819, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.5139200207906962, "grad_norm": 0.432875394821167, "learning_rate": 5.183396579489272e-06, "loss": 0.015868309885263443, "memory(GiB)": 21.48, "step": 15820, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956624 }, { "epoch": 0.5139525062534516, "grad_norm": 0.2652360796928406, "learning_rate": 5.182859786954268e-06, "loss": 0.01626836322247982, "memory(GiB)": 21.48, "step": 15821, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.513984991716207, "grad_norm": 0.39816227555274963, "learning_rate": 5.1823229923088145e-06, "loss": 0.0217435359954834, "memory(GiB)": 21.48, "step": 15822, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.5140174771789624, "grad_norm": 0.7726209163665771, "learning_rate": 5.181786195559114e-06, "loss": 0.021314771845936775, "memory(GiB)": 21.48, "step": 15823, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956658 }, { "epoch": 0.5140499626417179, "grad_norm": 0.2969062030315399, "learning_rate": 5.181249396711357e-06, "loss": 0.01504574529826641, "memory(GiB)": 21.48, "step": 15824, "token_acc": 1.0, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.5140824481044732, "grad_norm": 0.3313242793083191, "learning_rate": 5.180712595771743e-06, "loss": 0.018297452479600906, "memory(GiB)": 21.48, "step": 15825, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.5141149335672287, "grad_norm": 0.2895814776420593, "learning_rate": 5.180175792746465e-06, "loss": 0.01977374404668808, "memory(GiB)": 21.48, "step": 15826, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956696 }, { "epoch": 0.5141474190299841, "grad_norm": 0.3605382442474365, "learning_rate": 5.179638987641716e-06, "loss": 0.01930784434080124, "memory(GiB)": 21.48, "step": 15827, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.5141799044927395, "grad_norm": 0.35050269961357117, "learning_rate": 5.179102180463696e-06, "loss": 0.019851800054311752, "memory(GiB)": 21.48, "step": 15828, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956722 }, { "epoch": 0.5142123899554949, "grad_norm": 0.36132147908210754, "learning_rate": 5.178565371218598e-06, "loss": 0.022141898050904274, "memory(GiB)": 21.48, "step": 15829, "token_acc": 1.0, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.5142448754182504, "grad_norm": 0.38453802466392517, "learning_rate": 5.178028559912618e-06, "loss": 0.017809337005019188, "memory(GiB)": 21.48, "step": 15830, "token_acc": 1.0, "train_speed(iter/s)": 0.956747 }, { "epoch": 0.5142773608810057, "grad_norm": 0.3675253987312317, "learning_rate": 5.177491746551951e-06, "loss": 0.019241763278841972, "memory(GiB)": 21.48, "step": 15831, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.5143098463437612, "grad_norm": 0.42897751927375793, "learning_rate": 5.1769549311427935e-06, "loss": 0.02213684841990471, "memory(GiB)": 21.48, "step": 15832, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.5143423318065166, "grad_norm": 0.283473402261734, "learning_rate": 5.176418113691339e-06, "loss": 0.02301909402012825, "memory(GiB)": 21.48, "step": 15833, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.514374817269272, "grad_norm": 0.5919013023376465, "learning_rate": 5.175881294203788e-06, "loss": 0.02606256678700447, "memory(GiB)": 21.48, "step": 15834, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956798 }, { "epoch": 0.5144073027320274, "grad_norm": 0.541641116142273, "learning_rate": 5.175344472686331e-06, "loss": 0.03242204338312149, "memory(GiB)": 21.48, "step": 15835, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.5144397881947829, "grad_norm": 0.3372432589530945, "learning_rate": 5.174807649145164e-06, "loss": 0.017795905470848083, "memory(GiB)": 21.48, "step": 15836, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.5144722736575382, "grad_norm": 0.3528628945350647, "learning_rate": 5.174270823586486e-06, "loss": 0.01828364096581936, "memory(GiB)": 21.48, "step": 15837, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956836 }, { "epoch": 0.5145047591202937, "grad_norm": 0.40772420167922974, "learning_rate": 5.1737339960164885e-06, "loss": 0.03334423154592514, "memory(GiB)": 21.48, "step": 15838, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.95685 }, { "epoch": 0.5145372445830491, "grad_norm": 0.39563506841659546, "learning_rate": 5.173197166441372e-06, "loss": 0.02395961433649063, "memory(GiB)": 21.48, "step": 15839, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.5145697300458045, "grad_norm": 0.4468362331390381, "learning_rate": 5.172660334867329e-06, "loss": 0.02111811377108097, "memory(GiB)": 21.48, "step": 15840, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.5146022155085599, "grad_norm": 0.3696002662181854, "learning_rate": 5.172123501300554e-06, "loss": 0.01528458297252655, "memory(GiB)": 21.48, "step": 15841, "token_acc": 1.0, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.5146347009713154, "grad_norm": 0.26983940601348877, "learning_rate": 5.1715866657472465e-06, "loss": 0.015100443735718727, "memory(GiB)": 21.48, "step": 15842, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.5146671864340707, "grad_norm": 0.37708234786987305, "learning_rate": 5.1710498282136e-06, "loss": 0.01081580389291048, "memory(GiB)": 21.48, "step": 15843, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956914 }, { "epoch": 0.5146996718968262, "grad_norm": 0.2459513545036316, "learning_rate": 5.170512988705812e-06, "loss": 0.015325010754168034, "memory(GiB)": 21.48, "step": 15844, "token_acc": 1.0, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.5147321573595816, "grad_norm": 0.39169618487358093, "learning_rate": 5.169976147230076e-06, "loss": 0.019526511430740356, "memory(GiB)": 21.48, "step": 15845, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956941 }, { "epoch": 0.514764642822337, "grad_norm": 0.43337416648864746, "learning_rate": 5.169439303792589e-06, "loss": 0.023242678493261337, "memory(GiB)": 21.48, "step": 15846, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956952 }, { "epoch": 0.5147971282850924, "grad_norm": 0.40157821774482727, "learning_rate": 5.168902458399547e-06, "loss": 0.018749918788671494, "memory(GiB)": 21.48, "step": 15847, "token_acc": 0.984375, "train_speed(iter/s)": 0.956961 }, { "epoch": 0.5148296137478479, "grad_norm": 0.28406405448913574, "learning_rate": 5.168365611057147e-06, "loss": 0.01682814583182335, "memory(GiB)": 21.48, "step": 15848, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.5148620992106032, "grad_norm": 0.49397000670433044, "learning_rate": 5.167828761771583e-06, "loss": 0.022043438628315926, "memory(GiB)": 21.48, "step": 15849, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.5148945846733587, "grad_norm": 0.42871060967445374, "learning_rate": 5.16729191054905e-06, "loss": 0.03318290784955025, "memory(GiB)": 21.48, "step": 15850, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.95699 }, { "epoch": 0.514927070136114, "grad_norm": 0.37291762232780457, "learning_rate": 5.166755057395748e-06, "loss": 0.019896261394023895, "memory(GiB)": 21.48, "step": 15851, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957 }, { "epoch": 0.5149595555988695, "grad_norm": 1.0961413383483887, "learning_rate": 5.16621820231787e-06, "loss": 0.02248282916843891, "memory(GiB)": 21.48, "step": 15852, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95701 }, { "epoch": 0.5149920410616249, "grad_norm": 0.35457468032836914, "learning_rate": 5.1656813453216116e-06, "loss": 0.016899921000003815, "memory(GiB)": 21.48, "step": 15853, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.5150245265243804, "grad_norm": 0.43176183104515076, "learning_rate": 5.1651444864131705e-06, "loss": 0.029565218836069107, "memory(GiB)": 21.48, "step": 15854, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.957028 }, { "epoch": 0.5150570119871357, "grad_norm": 0.5140279531478882, "learning_rate": 5.1646076255987435e-06, "loss": 0.023801110684871674, "memory(GiB)": 21.48, "step": 15855, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.5150894974498912, "grad_norm": 0.28612056374549866, "learning_rate": 5.164070762884524e-06, "loss": 0.017827238887548447, "memory(GiB)": 21.48, "step": 15856, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.5151219829126465, "grad_norm": 0.3529509902000427, "learning_rate": 5.16353389827671e-06, "loss": 0.021264968439936638, "memory(GiB)": 21.48, "step": 15857, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.515154468375402, "grad_norm": 0.5449675917625427, "learning_rate": 5.162997031781498e-06, "loss": 0.02527059055864811, "memory(GiB)": 21.48, "step": 15858, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.5151869538381574, "grad_norm": 0.41122204065322876, "learning_rate": 5.162460163405082e-06, "loss": 0.028124937787652016, "memory(GiB)": 21.48, "step": 15859, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.5152194393009129, "grad_norm": 0.32391154766082764, "learning_rate": 5.1619232931536604e-06, "loss": 0.026095164939761162, "memory(GiB)": 21.48, "step": 15860, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957081 }, { "epoch": 0.5152519247636682, "grad_norm": 0.3461275100708008, "learning_rate": 5.161386421033427e-06, "loss": 0.01930532604455948, "memory(GiB)": 21.48, "step": 15861, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.5152844102264237, "grad_norm": 0.3632783591747284, "learning_rate": 5.1608495470505815e-06, "loss": 0.015536382794380188, "memory(GiB)": 21.48, "step": 15862, "token_acc": 0.995, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.515316895689179, "grad_norm": 0.31098225712776184, "learning_rate": 5.160312671211316e-06, "loss": 0.02471299096941948, "memory(GiB)": 21.48, "step": 15863, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.5153493811519345, "grad_norm": 0.40486565232276917, "learning_rate": 5.159775793521831e-06, "loss": 0.018631257116794586, "memory(GiB)": 21.48, "step": 15864, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.5153818666146899, "grad_norm": 0.318272203207016, "learning_rate": 5.15923891398832e-06, "loss": 0.019840683788061142, "memory(GiB)": 21.48, "step": 15865, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957126 }, { "epoch": 0.5154143520774453, "grad_norm": 0.5212675929069519, "learning_rate": 5.1587020326169805e-06, "loss": 0.017340337857604027, "memory(GiB)": 21.48, "step": 15866, "token_acc": 1.0, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.5154468375402007, "grad_norm": 0.456563800573349, "learning_rate": 5.158165149414007e-06, "loss": 0.014874645508825779, "memory(GiB)": 21.48, "step": 15867, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.5154793230029562, "grad_norm": 0.3381851315498352, "learning_rate": 5.157628264385597e-06, "loss": 0.026611488312482834, "memory(GiB)": 21.48, "step": 15868, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957155 }, { "epoch": 0.5155118084657115, "grad_norm": 0.256151407957077, "learning_rate": 5.157091377537947e-06, "loss": 0.01673974096775055, "memory(GiB)": 21.48, "step": 15869, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.515544293928467, "grad_norm": 0.2681848406791687, "learning_rate": 5.156554488877256e-06, "loss": 0.019549308344721794, "memory(GiB)": 21.48, "step": 15870, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.5155767793912224, "grad_norm": 0.4090709090232849, "learning_rate": 5.156017598409715e-06, "loss": 0.02837841771543026, "memory(GiB)": 21.48, "step": 15871, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957186 }, { "epoch": 0.5156092648539778, "grad_norm": 0.33767396211624146, "learning_rate": 5.155480706141525e-06, "loss": 0.02078206092119217, "memory(GiB)": 21.48, "step": 15872, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.5156417503167333, "grad_norm": 0.30009254813194275, "learning_rate": 5.154943812078879e-06, "loss": 0.019213825464248657, "memory(GiB)": 21.48, "step": 15873, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.5156742357794887, "grad_norm": 0.3231235146522522, "learning_rate": 5.154406916227977e-06, "loss": 0.01800258457660675, "memory(GiB)": 21.48, "step": 15874, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.5157067212422441, "grad_norm": 0.3322608768939972, "learning_rate": 5.153870018595012e-06, "loss": 0.01575411856174469, "memory(GiB)": 21.48, "step": 15875, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957225 }, { "epoch": 0.5157392067049995, "grad_norm": 0.4068484902381897, "learning_rate": 5.153333119186184e-06, "loss": 0.01900908723473549, "memory(GiB)": 21.48, "step": 15876, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957234 }, { "epoch": 0.515771692167755, "grad_norm": 0.2579573094844818, "learning_rate": 5.152796218007685e-06, "loss": 0.014267395250499249, "memory(GiB)": 21.48, "step": 15877, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.5158041776305103, "grad_norm": 0.30175232887268066, "learning_rate": 5.152259315065718e-06, "loss": 0.016387661918997765, "memory(GiB)": 21.48, "step": 15878, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957254 }, { "epoch": 0.5158366630932658, "grad_norm": 0.3839828073978424, "learning_rate": 5.151722410366473e-06, "loss": 0.019151896238327026, "memory(GiB)": 21.48, "step": 15879, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.5158691485560212, "grad_norm": 0.38796329498291016, "learning_rate": 5.1511855039161495e-06, "loss": 0.02027924358844757, "memory(GiB)": 21.48, "step": 15880, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.5159016340187766, "grad_norm": 0.29939377307891846, "learning_rate": 5.150648595720946e-06, "loss": 0.016371827572584152, "memory(GiB)": 21.48, "step": 15881, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.515934119481532, "grad_norm": 0.3699062168598175, "learning_rate": 5.1501116857870545e-06, "loss": 0.01998589187860489, "memory(GiB)": 21.48, "step": 15882, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.5159666049442875, "grad_norm": 0.5644625425338745, "learning_rate": 5.149574774120677e-06, "loss": 0.02117946557700634, "memory(GiB)": 21.48, "step": 15883, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.5159990904070428, "grad_norm": 0.43958422541618347, "learning_rate": 5.149037860728005e-06, "loss": 0.022447198629379272, "memory(GiB)": 21.48, "step": 15884, "token_acc": 0.99, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.5160315758697983, "grad_norm": 0.35813507437705994, "learning_rate": 5.148500945615239e-06, "loss": 0.021767262369394302, "memory(GiB)": 21.48, "step": 15885, "token_acc": 0.9867109634551495, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.5160640613325537, "grad_norm": 0.3959333300590515, "learning_rate": 5.147964028788576e-06, "loss": 0.021124795079231262, "memory(GiB)": 21.48, "step": 15886, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.5160965467953091, "grad_norm": 0.31977975368499756, "learning_rate": 5.1474271102542085e-06, "loss": 0.022107107564806938, "memory(GiB)": 21.48, "step": 15887, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.5161290322580645, "grad_norm": 0.2940022945404053, "learning_rate": 5.146890190018337e-06, "loss": 0.022476188838481903, "memory(GiB)": 21.48, "step": 15888, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.51616151772082, "grad_norm": 0.3153463900089264, "learning_rate": 5.146353268087158e-06, "loss": 0.018288791179656982, "memory(GiB)": 21.48, "step": 15889, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.5161940031835753, "grad_norm": 0.3695099353790283, "learning_rate": 5.1458163444668675e-06, "loss": 0.018839841708540916, "memory(GiB)": 21.48, "step": 15890, "token_acc": 1.0, "train_speed(iter/s)": 0.957385 }, { "epoch": 0.5162264886463308, "grad_norm": 0.3912169635295868, "learning_rate": 5.14527941916366e-06, "loss": 0.022691743448376656, "memory(GiB)": 21.48, "step": 15891, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957398 }, { "epoch": 0.5162589741090862, "grad_norm": 0.36407560110092163, "learning_rate": 5.144742492183737e-06, "loss": 0.016484979540109634, "memory(GiB)": 21.48, "step": 15892, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.5162914595718416, "grad_norm": 0.34398338198661804, "learning_rate": 5.144205563533293e-06, "loss": 0.018422342836856842, "memory(GiB)": 21.48, "step": 15893, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957423 }, { "epoch": 0.516323945034597, "grad_norm": 0.4496036469936371, "learning_rate": 5.143668633218524e-06, "loss": 0.03535209596157074, "memory(GiB)": 21.48, "step": 15894, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.5163564304973525, "grad_norm": 0.2962201237678528, "learning_rate": 5.143131701245628e-06, "loss": 0.012237879447638988, "memory(GiB)": 21.48, "step": 15895, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.5163889159601078, "grad_norm": 0.4670940339565277, "learning_rate": 5.142594767620801e-06, "loss": 0.028857609257102013, "memory(GiB)": 21.48, "step": 15896, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957462 }, { "epoch": 0.5164214014228633, "grad_norm": 0.31298381090164185, "learning_rate": 5.142057832350241e-06, "loss": 0.017651522532105446, "memory(GiB)": 21.48, "step": 15897, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.5164538868856187, "grad_norm": 0.3004955053329468, "learning_rate": 5.141520895440145e-06, "loss": 0.014291427098214626, "memory(GiB)": 21.48, "step": 15898, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957489 }, { "epoch": 0.5164863723483741, "grad_norm": 0.3508031666278839, "learning_rate": 5.140983956896708e-06, "loss": 0.015324763022363186, "memory(GiB)": 21.48, "step": 15899, "token_acc": 1.0, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.5165188578111295, "grad_norm": 0.45079320669174194, "learning_rate": 5.14044701672613e-06, "loss": 0.02223159186542034, "memory(GiB)": 21.48, "step": 15900, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.516551343273885, "grad_norm": 0.43006837368011475, "learning_rate": 5.1399100749346064e-06, "loss": 0.021240223199129105, "memory(GiB)": 21.48, "step": 15901, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957526 }, { "epoch": 0.5165838287366403, "grad_norm": 0.5664529204368591, "learning_rate": 5.139373131528335e-06, "loss": 0.019840182736516, "memory(GiB)": 21.48, "step": 15902, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.957539 }, { "epoch": 0.5166163141993958, "grad_norm": 0.3092401921749115, "learning_rate": 5.138836186513509e-06, "loss": 0.021460991352796555, "memory(GiB)": 21.48, "step": 15903, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.5166487996621512, "grad_norm": 0.32709208130836487, "learning_rate": 5.138299239896332e-06, "loss": 0.01915680430829525, "memory(GiB)": 21.48, "step": 15904, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.5166812851249066, "grad_norm": 0.5477913022041321, "learning_rate": 5.137762291682996e-06, "loss": 0.023911774158477783, "memory(GiB)": 21.48, "step": 15905, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957576 }, { "epoch": 0.516713770587662, "grad_norm": 0.31740549206733704, "learning_rate": 5.137225341879701e-06, "loss": 0.018333252519369125, "memory(GiB)": 21.48, "step": 15906, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.5167462560504175, "grad_norm": 0.546677827835083, "learning_rate": 5.136688390492643e-06, "loss": 0.023711886256933212, "memory(GiB)": 21.48, "step": 15907, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957595 }, { "epoch": 0.5167787415131728, "grad_norm": 0.4220576882362366, "learning_rate": 5.1361514375280174e-06, "loss": 0.02072216011583805, "memory(GiB)": 21.48, "step": 15908, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.5168112269759283, "grad_norm": 0.2935704290866852, "learning_rate": 5.135614482992024e-06, "loss": 0.014422537758946419, "memory(GiB)": 21.48, "step": 15909, "token_acc": 1.0, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.5168437124386837, "grad_norm": 0.52254718542099, "learning_rate": 5.135077526890858e-06, "loss": 0.020694313570857048, "memory(GiB)": 21.48, "step": 15910, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957624 }, { "epoch": 0.5168761979014391, "grad_norm": 0.40875259041786194, "learning_rate": 5.13454056923072e-06, "loss": 0.02481316588819027, "memory(GiB)": 21.48, "step": 15911, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957634 }, { "epoch": 0.5169086833641945, "grad_norm": 0.31957465410232544, "learning_rate": 5.134003610017802e-06, "loss": 0.01647716760635376, "memory(GiB)": 21.48, "step": 15912, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957644 }, { "epoch": 0.51694116882695, "grad_norm": 0.49320319294929504, "learning_rate": 5.133466649258305e-06, "loss": 0.01840738207101822, "memory(GiB)": 21.48, "step": 15913, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.5169736542897053, "grad_norm": 0.46244314312934875, "learning_rate": 5.132929686958425e-06, "loss": 0.02279217354953289, "memory(GiB)": 21.48, "step": 15914, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.5170061397524608, "grad_norm": 0.306375116109848, "learning_rate": 5.13239272312436e-06, "loss": 0.01919843815267086, "memory(GiB)": 21.48, "step": 15915, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.5170386252152162, "grad_norm": 0.31325796246528625, "learning_rate": 5.131855757762306e-06, "loss": 0.018914341926574707, "memory(GiB)": 21.48, "step": 15916, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.5170711106779716, "grad_norm": 0.3235618472099304, "learning_rate": 5.131318790878461e-06, "loss": 0.021986693143844604, "memory(GiB)": 21.48, "step": 15917, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957691 }, { "epoch": 0.517103596140727, "grad_norm": 0.44422197341918945, "learning_rate": 5.130781822479024e-06, "loss": 0.018080446869134903, "memory(GiB)": 21.48, "step": 15918, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957701 }, { "epoch": 0.5171360816034825, "grad_norm": 0.35585811734199524, "learning_rate": 5.13024485257019e-06, "loss": 0.01522862259298563, "memory(GiB)": 21.48, "step": 15919, "token_acc": 1.0, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.5171685670662378, "grad_norm": 0.379237562417984, "learning_rate": 5.129707881158157e-06, "loss": 0.02089395932853222, "memory(GiB)": 21.48, "step": 15920, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.5172010525289933, "grad_norm": 0.6379397511482239, "learning_rate": 5.1291709082491224e-06, "loss": 0.013013804331421852, "memory(GiB)": 21.48, "step": 15921, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.5172335379917486, "grad_norm": 0.4221273362636566, "learning_rate": 5.128633933849283e-06, "loss": 0.023918287828564644, "memory(GiB)": 21.48, "step": 15922, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.95774 }, { "epoch": 0.5172660234545041, "grad_norm": 0.4263851046562195, "learning_rate": 5.128096957964837e-06, "loss": 0.02668147161602974, "memory(GiB)": 21.48, "step": 15923, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.5172985089172595, "grad_norm": 0.4972039461135864, "learning_rate": 5.127559980601981e-06, "loss": 0.015985891222953796, "memory(GiB)": 21.48, "step": 15924, "token_acc": 1.0, "train_speed(iter/s)": 0.957758 }, { "epoch": 0.517330994380015, "grad_norm": 0.42623257637023926, "learning_rate": 5.127023001766916e-06, "loss": 0.02524677664041519, "memory(GiB)": 21.48, "step": 15925, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.5173634798427703, "grad_norm": 0.7220668196678162, "learning_rate": 5.126486021465833e-06, "loss": 0.03651776164770126, "memory(GiB)": 21.48, "step": 15926, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.5173959653055258, "grad_norm": 0.324219286441803, "learning_rate": 5.125949039704934e-06, "loss": 0.02269001305103302, "memory(GiB)": 21.48, "step": 15927, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.5174284507682811, "grad_norm": 0.40839582681655884, "learning_rate": 5.125412056490415e-06, "loss": 0.01790633052587509, "memory(GiB)": 21.48, "step": 15928, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.5174609362310366, "grad_norm": 0.4519350826740265, "learning_rate": 5.124875071828476e-06, "loss": 0.02363048493862152, "memory(GiB)": 21.48, "step": 15929, "token_acc": 1.0, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.517493421693792, "grad_norm": 0.40609368681907654, "learning_rate": 5.12433808572531e-06, "loss": 0.019636543467640877, "memory(GiB)": 21.48, "step": 15930, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957822 }, { "epoch": 0.5175259071565474, "grad_norm": 0.32566457986831665, "learning_rate": 5.1238010981871176e-06, "loss": 0.01521952822804451, "memory(GiB)": 21.48, "step": 15931, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.5175583926193028, "grad_norm": 0.32277029752731323, "learning_rate": 5.123264109220096e-06, "loss": 0.021481413394212723, "memory(GiB)": 21.48, "step": 15932, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.957843 }, { "epoch": 0.5175908780820583, "grad_norm": 0.29394251108169556, "learning_rate": 5.1227271188304425e-06, "loss": 0.015754178166389465, "memory(GiB)": 21.48, "step": 15933, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.5176233635448136, "grad_norm": 0.3462085425853729, "learning_rate": 5.122190127024356e-06, "loss": 0.017754144966602325, "memory(GiB)": 21.48, "step": 15934, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957864 }, { "epoch": 0.5176558490075691, "grad_norm": 0.37414318323135376, "learning_rate": 5.121653133808032e-06, "loss": 0.022848177701234818, "memory(GiB)": 21.48, "step": 15935, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957873 }, { "epoch": 0.5176883344703246, "grad_norm": 0.5767039060592651, "learning_rate": 5.121116139187669e-06, "loss": 0.019657321274280548, "memory(GiB)": 21.48, "step": 15936, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957884 }, { "epoch": 0.51772081993308, "grad_norm": 0.39650633931159973, "learning_rate": 5.120579143169464e-06, "loss": 0.02568645216524601, "memory(GiB)": 21.48, "step": 15937, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957894 }, { "epoch": 0.5177533053958354, "grad_norm": 0.36404895782470703, "learning_rate": 5.120042145759615e-06, "loss": 0.020292188972234726, "memory(GiB)": 21.48, "step": 15938, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.5177857908585908, "grad_norm": 0.2779011130332947, "learning_rate": 5.119505146964321e-06, "loss": 0.02079237625002861, "memory(GiB)": 21.48, "step": 15939, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957913 }, { "epoch": 0.5178182763213462, "grad_norm": 0.4793466031551361, "learning_rate": 5.118968146789777e-06, "loss": 0.024576028808951378, "memory(GiB)": 21.48, "step": 15940, "token_acc": 1.0, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.5178507617841016, "grad_norm": 0.35487228631973267, "learning_rate": 5.1184311452421834e-06, "loss": 0.017890917137265205, "memory(GiB)": 21.48, "step": 15941, "token_acc": 1.0, "train_speed(iter/s)": 0.957933 }, { "epoch": 0.5178832472468571, "grad_norm": 0.4624805450439453, "learning_rate": 5.117894142327736e-06, "loss": 0.02398241125047207, "memory(GiB)": 21.48, "step": 15942, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.957944 }, { "epoch": 0.5179157327096124, "grad_norm": 0.31253302097320557, "learning_rate": 5.117357138052634e-06, "loss": 0.01792083866894245, "memory(GiB)": 21.48, "step": 15943, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.5179482181723679, "grad_norm": 0.4624735713005066, "learning_rate": 5.116820132423075e-06, "loss": 0.021757833659648895, "memory(GiB)": 21.48, "step": 15944, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.5179807036351233, "grad_norm": 0.5739753842353821, "learning_rate": 5.116283125445255e-06, "loss": 0.02303025871515274, "memory(GiB)": 21.48, "step": 15945, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.5180131890978787, "grad_norm": 0.40646615624427795, "learning_rate": 5.115746117125372e-06, "loss": 0.02144894190132618, "memory(GiB)": 21.48, "step": 15946, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.5180456745606341, "grad_norm": 0.3031882643699646, "learning_rate": 5.115209107469625e-06, "loss": 0.019959747791290283, "memory(GiB)": 21.48, "step": 15947, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957993 }, { "epoch": 0.5180781600233896, "grad_norm": 0.3172054588794708, "learning_rate": 5.114672096484213e-06, "loss": 0.022590629756450653, "memory(GiB)": 21.48, "step": 15948, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.5181106454861449, "grad_norm": 0.3178151547908783, "learning_rate": 5.114135084175334e-06, "loss": 0.01924395002424717, "memory(GiB)": 21.48, "step": 15949, "token_acc": 1.0, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.5181431309489004, "grad_norm": 0.3291648030281067, "learning_rate": 5.11359807054918e-06, "loss": 0.017011189833283424, "memory(GiB)": 21.48, "step": 15950, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.5181756164116558, "grad_norm": 0.3874562382698059, "learning_rate": 5.113061055611957e-06, "loss": 0.022243695333600044, "memory(GiB)": 21.48, "step": 15951, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.5182081018744112, "grad_norm": 0.32567447423934937, "learning_rate": 5.112524039369856e-06, "loss": 0.02500857226550579, "memory(GiB)": 21.48, "step": 15952, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.5182405873371666, "grad_norm": 0.43171483278274536, "learning_rate": 5.11198702182908e-06, "loss": 0.024925144389271736, "memory(GiB)": 21.48, "step": 15953, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958059 }, { "epoch": 0.5182730727999221, "grad_norm": 0.31473812460899353, "learning_rate": 5.1114500029958225e-06, "loss": 0.015173118561506271, "memory(GiB)": 21.48, "step": 15954, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958072 }, { "epoch": 0.5183055582626774, "grad_norm": 0.3331563174724579, "learning_rate": 5.1109129828762864e-06, "loss": 0.01563846692442894, "memory(GiB)": 21.48, "step": 15955, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.5183380437254329, "grad_norm": 0.2754216194152832, "learning_rate": 5.110375961476663e-06, "loss": 0.01648247055709362, "memory(GiB)": 21.48, "step": 15956, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958098 }, { "epoch": 0.5183705291881883, "grad_norm": 0.4439460337162018, "learning_rate": 5.109838938803158e-06, "loss": 0.02393597736954689, "memory(GiB)": 21.48, "step": 15957, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95811 }, { "epoch": 0.5184030146509437, "grad_norm": 0.22642235457897186, "learning_rate": 5.109301914861964e-06, "loss": 0.018456533551216125, "memory(GiB)": 21.48, "step": 15958, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.958123 }, { "epoch": 0.5184355001136991, "grad_norm": 0.39527446031570435, "learning_rate": 5.10876488965928e-06, "loss": 0.015198755078017712, "memory(GiB)": 21.48, "step": 15959, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.5184679855764546, "grad_norm": 0.42526572942733765, "learning_rate": 5.108227863201305e-06, "loss": 0.02151498571038246, "memory(GiB)": 21.48, "step": 15960, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958148 }, { "epoch": 0.5185004710392099, "grad_norm": 0.39991816878318787, "learning_rate": 5.1076908354942345e-06, "loss": 0.024136202409863472, "memory(GiB)": 21.48, "step": 15961, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958162 }, { "epoch": 0.5185329565019654, "grad_norm": 0.3913334906101227, "learning_rate": 5.107153806544271e-06, "loss": 0.023035433143377304, "memory(GiB)": 21.48, "step": 15962, "token_acc": 0.985, "train_speed(iter/s)": 0.958175 }, { "epoch": 0.5185654419647208, "grad_norm": 0.42076003551483154, "learning_rate": 5.106616776357609e-06, "loss": 0.023604709655046463, "memory(GiB)": 21.48, "step": 15963, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.5185979274274762, "grad_norm": 0.392917662858963, "learning_rate": 5.106079744940447e-06, "loss": 0.024476662278175354, "memory(GiB)": 21.48, "step": 15964, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.9582 }, { "epoch": 0.5186304128902316, "grad_norm": 0.357916921377182, "learning_rate": 5.1055427122989844e-06, "loss": 0.018603019416332245, "memory(GiB)": 21.48, "step": 15965, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.5186628983529871, "grad_norm": 0.3438490629196167, "learning_rate": 5.105005678439418e-06, "loss": 0.018475601449608803, "memory(GiB)": 21.48, "step": 15966, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.958221 }, { "epoch": 0.5186953838157424, "grad_norm": 0.5172410011291504, "learning_rate": 5.104468643367946e-06, "loss": 0.02838500216603279, "memory(GiB)": 21.48, "step": 15967, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.5187278692784979, "grad_norm": 0.8088099956512451, "learning_rate": 5.103931607090766e-06, "loss": 0.024635307490825653, "memory(GiB)": 21.48, "step": 15968, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.958241 }, { "epoch": 0.5187603547412533, "grad_norm": 0.39442670345306396, "learning_rate": 5.103394569614079e-06, "loss": 0.026808323338627815, "memory(GiB)": 21.48, "step": 15969, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.958251 }, { "epoch": 0.5187928402040087, "grad_norm": 0.3131733536720276, "learning_rate": 5.10285753094408e-06, "loss": 0.014488336630165577, "memory(GiB)": 21.48, "step": 15970, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.5188253256667641, "grad_norm": 0.3335666060447693, "learning_rate": 5.102320491086968e-06, "loss": 0.017287105321884155, "memory(GiB)": 21.48, "step": 15971, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.5188578111295196, "grad_norm": 0.31492865085601807, "learning_rate": 5.101783450048942e-06, "loss": 0.020603299140930176, "memory(GiB)": 21.48, "step": 15972, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958279 }, { "epoch": 0.5188902965922749, "grad_norm": 0.41964033246040344, "learning_rate": 5.101246407836199e-06, "loss": 0.022347785532474518, "memory(GiB)": 21.48, "step": 15973, "token_acc": 1.0, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.5189227820550304, "grad_norm": 0.23479218780994415, "learning_rate": 5.100709364454939e-06, "loss": 0.01767832040786743, "memory(GiB)": 21.48, "step": 15974, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.5189552675177858, "grad_norm": 0.2814796566963196, "learning_rate": 5.100172319911357e-06, "loss": 0.015379073098301888, "memory(GiB)": 21.48, "step": 15975, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958306 }, { "epoch": 0.5189877529805412, "grad_norm": 0.30339476466178894, "learning_rate": 5.099635274211654e-06, "loss": 0.01593742147088051, "memory(GiB)": 21.48, "step": 15976, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.5190202384432966, "grad_norm": 0.40252718329429626, "learning_rate": 5.099098227362026e-06, "loss": 0.02624204196035862, "memory(GiB)": 21.48, "step": 15977, "token_acc": 0.9921875, "train_speed(iter/s)": 0.958326 }, { "epoch": 0.5190527239060521, "grad_norm": 0.26532572507858276, "learning_rate": 5.098561179368674e-06, "loss": 0.015627460554242134, "memory(GiB)": 21.48, "step": 15978, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.5190852093688074, "grad_norm": 0.45215609669685364, "learning_rate": 5.0980241302377946e-06, "loss": 0.02827509492635727, "memory(GiB)": 21.48, "step": 15979, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.5191176948315629, "grad_norm": 0.26083463430404663, "learning_rate": 5.097487079975586e-06, "loss": 0.011968499049544334, "memory(GiB)": 21.48, "step": 15980, "token_acc": 0.9900662251655629, "train_speed(iter/s)": 0.958355 }, { "epoch": 0.5191501802943183, "grad_norm": 0.343527615070343, "learning_rate": 5.096950028588247e-06, "loss": 0.02081991173326969, "memory(GiB)": 21.48, "step": 15981, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958365 }, { "epoch": 0.5191826657570737, "grad_norm": 0.569318413734436, "learning_rate": 5.096412976081975e-06, "loss": 0.030858371406793594, "memory(GiB)": 21.48, "step": 15982, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.5192151512198291, "grad_norm": 0.4143904447555542, "learning_rate": 5.09587592246297e-06, "loss": 0.022420883178710938, "memory(GiB)": 21.48, "step": 15983, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.958383 }, { "epoch": 0.5192476366825846, "grad_norm": 0.2798667252063751, "learning_rate": 5.095338867737428e-06, "loss": 0.015955515205860138, "memory(GiB)": 21.48, "step": 15984, "token_acc": 1.0, "train_speed(iter/s)": 0.958393 }, { "epoch": 0.5192801221453399, "grad_norm": 0.3798815608024597, "learning_rate": 5.0948018119115496e-06, "loss": 0.019893016666173935, "memory(GiB)": 21.48, "step": 15985, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958404 }, { "epoch": 0.5193126076080954, "grad_norm": 0.3982766270637512, "learning_rate": 5.094264754991533e-06, "loss": 0.022434983402490616, "memory(GiB)": 21.48, "step": 15986, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.958417 }, { "epoch": 0.5193450930708507, "grad_norm": 0.27181580662727356, "learning_rate": 5.0937276969835735e-06, "loss": 0.01204809732735157, "memory(GiB)": 21.48, "step": 15987, "token_acc": 1.0, "train_speed(iter/s)": 0.95843 }, { "epoch": 0.5193775785336062, "grad_norm": 0.33720847964286804, "learning_rate": 5.0931906378938735e-06, "loss": 0.02102656103670597, "memory(GiB)": 21.48, "step": 15988, "token_acc": 1.0, "train_speed(iter/s)": 0.958443 }, { "epoch": 0.5194100639963616, "grad_norm": 0.39133742451667786, "learning_rate": 5.092653577728628e-06, "loss": 0.02085862308740616, "memory(GiB)": 21.48, "step": 15989, "token_acc": 0.98, "train_speed(iter/s)": 0.958455 }, { "epoch": 0.519442549459117, "grad_norm": 0.939857542514801, "learning_rate": 5.092116516494038e-06, "loss": 0.03301308676600456, "memory(GiB)": 21.48, "step": 15990, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958468 }, { "epoch": 0.5194750349218724, "grad_norm": 0.4283129870891571, "learning_rate": 5.091579454196299e-06, "loss": 0.018587183207273483, "memory(GiB)": 21.48, "step": 15991, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.95848 }, { "epoch": 0.5195075203846279, "grad_norm": 0.5656141042709351, "learning_rate": 5.091042390841612e-06, "loss": 0.014226383529603481, "memory(GiB)": 21.48, "step": 15992, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95849 }, { "epoch": 0.5195400058473832, "grad_norm": 0.481633722782135, "learning_rate": 5.090505326436176e-06, "loss": 0.026802271604537964, "memory(GiB)": 21.48, "step": 15993, "token_acc": 1.0, "train_speed(iter/s)": 0.9585 }, { "epoch": 0.5195724913101387, "grad_norm": 0.36935195326805115, "learning_rate": 5.089968260986187e-06, "loss": 0.018007652834057808, "memory(GiB)": 21.48, "step": 15994, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95851 }, { "epoch": 0.5196049767728941, "grad_norm": 0.42645466327667236, "learning_rate": 5.089431194497845e-06, "loss": 0.017973465844988823, "memory(GiB)": 21.48, "step": 15995, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.95852 }, { "epoch": 0.5196374622356495, "grad_norm": 0.3277091979980469, "learning_rate": 5.088894126977346e-06, "loss": 0.01729031838476658, "memory(GiB)": 21.48, "step": 15996, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958531 }, { "epoch": 0.5196699476984049, "grad_norm": 0.21613270044326782, "learning_rate": 5.088357058430894e-06, "loss": 0.01324988529086113, "memory(GiB)": 21.48, "step": 15997, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958541 }, { "epoch": 0.5197024331611604, "grad_norm": 0.4708937406539917, "learning_rate": 5.087819988864681e-06, "loss": 0.01978622004389763, "memory(GiB)": 21.48, "step": 15998, "token_acc": 0.98046875, "train_speed(iter/s)": 0.95855 }, { "epoch": 0.5197349186239157, "grad_norm": 0.4399353563785553, "learning_rate": 5.08728291828491e-06, "loss": 0.016042398288846016, "memory(GiB)": 21.48, "step": 15999, "token_acc": 0.992, "train_speed(iter/s)": 0.958559 }, { "epoch": 0.5197674040866712, "grad_norm": 0.4149158298969269, "learning_rate": 5.0867458466977786e-06, "loss": 0.021598974242806435, "memory(GiB)": 21.48, "step": 16000, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958569 }, { "epoch": 0.5197674040866712, "eval_loss": 0.021263862028717995, "eval_runtime": 80.7776, "eval_samples_per_second": 123.178, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9915344841492526, "step": 16000 }, { "epoch": 0.5197998895494267, "grad_norm": 0.3038855195045471, "learning_rate": 5.086208774109482e-06, "loss": 0.022035878151655197, "memory(GiB)": 21.48, "step": 16001, "token_acc": 0.991449477924854, "train_speed(iter/s)": 0.953339 }, { "epoch": 0.519832375012182, "grad_norm": 0.36207473278045654, "learning_rate": 5.085671700526223e-06, "loss": 0.021135486662387848, "memory(GiB)": 21.48, "step": 16002, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.953348 }, { "epoch": 0.5198648604749375, "grad_norm": 0.5075126886367798, "learning_rate": 5.085134625954199e-06, "loss": 0.031596943736076355, "memory(GiB)": 21.48, "step": 16003, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.953357 }, { "epoch": 0.5198973459376929, "grad_norm": 0.3886084258556366, "learning_rate": 5.084597550399608e-06, "loss": 0.02885277569293976, "memory(GiB)": 21.48, "step": 16004, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.953365 }, { "epoch": 0.5199298314004484, "grad_norm": 0.3610430657863617, "learning_rate": 5.084060473868647e-06, "loss": 0.01786019466817379, "memory(GiB)": 21.48, "step": 16005, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953374 }, { "epoch": 0.5199623168632037, "grad_norm": 0.32673317193984985, "learning_rate": 5.083523396367519e-06, "loss": 0.016454551368951797, "memory(GiB)": 21.48, "step": 16006, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.953383 }, { "epoch": 0.5199948023259592, "grad_norm": 0.34302231669425964, "learning_rate": 5.0829863179024176e-06, "loss": 0.022658612579107285, "memory(GiB)": 21.48, "step": 16007, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.953391 }, { "epoch": 0.5200272877887145, "grad_norm": 0.2582206130027771, "learning_rate": 5.082449238479543e-06, "loss": 0.016489699482917786, "memory(GiB)": 21.48, "step": 16008, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.953399 }, { "epoch": 0.52005977325147, "grad_norm": 0.4118562936782837, "learning_rate": 5.081912158105097e-06, "loss": 0.026531536132097244, "memory(GiB)": 21.48, "step": 16009, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.953408 }, { "epoch": 0.5200922587142254, "grad_norm": 0.26020336151123047, "learning_rate": 5.081375076785274e-06, "loss": 0.01619134470820427, "memory(GiB)": 21.48, "step": 16010, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.953417 }, { "epoch": 0.5201247441769808, "grad_norm": 0.3841753602027893, "learning_rate": 5.080837994526275e-06, "loss": 0.01932845078408718, "memory(GiB)": 21.48, "step": 16011, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.953425 }, { "epoch": 0.5201572296397362, "grad_norm": 0.33291009068489075, "learning_rate": 5.0803009113342975e-06, "loss": 0.014536245726048946, "memory(GiB)": 21.48, "step": 16012, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.953433 }, { "epoch": 0.5201897151024917, "grad_norm": 0.3127515912055969, "learning_rate": 5.079763827215541e-06, "loss": 0.01812063902616501, "memory(GiB)": 21.48, "step": 16013, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953442 }, { "epoch": 0.520222200565247, "grad_norm": 0.3773494064807892, "learning_rate": 5.079226742176204e-06, "loss": 0.012366032227873802, "memory(GiB)": 21.48, "step": 16014, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.953448 }, { "epoch": 0.5202546860280025, "grad_norm": 0.2668991982936859, "learning_rate": 5.078689656222484e-06, "loss": 0.01818619854748249, "memory(GiB)": 21.48, "step": 16015, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.953456 }, { "epoch": 0.5202871714907579, "grad_norm": 0.4311716854572296, "learning_rate": 5.078152569360581e-06, "loss": 0.028515977784991264, "memory(GiB)": 21.48, "step": 16016, "token_acc": 0.996, "train_speed(iter/s)": 0.953465 }, { "epoch": 0.5203196569535133, "grad_norm": 0.3808355927467346, "learning_rate": 5.077615481596694e-06, "loss": 0.02074323035776615, "memory(GiB)": 21.48, "step": 16017, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.953473 }, { "epoch": 0.5203521424162687, "grad_norm": 0.35148143768310547, "learning_rate": 5.077078392937021e-06, "loss": 0.023083265870809555, "memory(GiB)": 21.48, "step": 16018, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.953481 }, { "epoch": 0.5203846278790242, "grad_norm": 0.3337448239326477, "learning_rate": 5.07654130338776e-06, "loss": 0.021526716649532318, "memory(GiB)": 21.48, "step": 16019, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95349 }, { "epoch": 0.5204171133417795, "grad_norm": 0.37252891063690186, "learning_rate": 5.076004212955112e-06, "loss": 0.02047017589211464, "memory(GiB)": 21.48, "step": 16020, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953498 }, { "epoch": 0.520449598804535, "grad_norm": 0.4892149865627289, "learning_rate": 5.075467121645271e-06, "loss": 0.03405667096376419, "memory(GiB)": 21.48, "step": 16021, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953507 }, { "epoch": 0.5204820842672904, "grad_norm": 0.2680093050003052, "learning_rate": 5.0749300294644424e-06, "loss": 0.0185845959931612, "memory(GiB)": 21.48, "step": 16022, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.953517 }, { "epoch": 0.5205145697300458, "grad_norm": 0.3103979229927063, "learning_rate": 5.0743929364188195e-06, "loss": 0.020797986537218094, "memory(GiB)": 21.48, "step": 16023, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953528 }, { "epoch": 0.5205470551928012, "grad_norm": 0.2943651080131531, "learning_rate": 5.073855842514603e-06, "loss": 0.01954852044582367, "memory(GiB)": 21.48, "step": 16024, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95354 }, { "epoch": 0.5205795406555567, "grad_norm": 0.384227991104126, "learning_rate": 5.0733187477579925e-06, "loss": 0.019808251410722733, "memory(GiB)": 21.48, "step": 16025, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.953553 }, { "epoch": 0.520612026118312, "grad_norm": 0.32581719756126404, "learning_rate": 5.072781652155186e-06, "loss": 0.020912379026412964, "memory(GiB)": 21.48, "step": 16026, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.953566 }, { "epoch": 0.5206445115810675, "grad_norm": 0.24501392245292664, "learning_rate": 5.072244555712384e-06, "loss": 0.015621503815054893, "memory(GiB)": 21.48, "step": 16027, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.953578 }, { "epoch": 0.5206769970438229, "grad_norm": 0.47009870409965515, "learning_rate": 5.071707458435781e-06, "loss": 0.023112762719392776, "memory(GiB)": 21.48, "step": 16028, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.953591 }, { "epoch": 0.5207094825065783, "grad_norm": 0.32167527079582214, "learning_rate": 5.07117036033158e-06, "loss": 0.02147854119539261, "memory(GiB)": 21.48, "step": 16029, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.953597 }, { "epoch": 0.5207419679693337, "grad_norm": 0.6219615340232849, "learning_rate": 5.070633261405978e-06, "loss": 0.025613538920879364, "memory(GiB)": 21.48, "step": 16030, "token_acc": 1.0, "train_speed(iter/s)": 0.953607 }, { "epoch": 0.5207744534320892, "grad_norm": 0.39347386360168457, "learning_rate": 5.070096161665173e-06, "loss": 0.016673799604177475, "memory(GiB)": 21.48, "step": 16031, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.95362 }, { "epoch": 0.5208069388948445, "grad_norm": 0.46717187762260437, "learning_rate": 5.069559061115367e-06, "loss": 0.020324278622865677, "memory(GiB)": 21.48, "step": 16032, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.953633 }, { "epoch": 0.5208394243576, "grad_norm": 0.37185433506965637, "learning_rate": 5.0690219597627555e-06, "loss": 0.021259916946291924, "memory(GiB)": 21.48, "step": 16033, "token_acc": 0.988, "train_speed(iter/s)": 0.953647 }, { "epoch": 0.5208719098203554, "grad_norm": 0.4460793435573578, "learning_rate": 5.068484857613539e-06, "loss": 0.024366937577724457, "memory(GiB)": 21.48, "step": 16034, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953659 }, { "epoch": 0.5209043952831108, "grad_norm": 0.3406793177127838, "learning_rate": 5.067947754673917e-06, "loss": 0.015914902091026306, "memory(GiB)": 21.48, "step": 16035, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953671 }, { "epoch": 0.5209368807458662, "grad_norm": 0.2539600133895874, "learning_rate": 5.067410650950087e-06, "loss": 0.017527583986520767, "memory(GiB)": 21.48, "step": 16036, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953684 }, { "epoch": 0.5209693662086217, "grad_norm": 0.21143685281276703, "learning_rate": 5.066873546448247e-06, "loss": 0.012925279326736927, "memory(GiB)": 21.48, "step": 16037, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.953696 }, { "epoch": 0.521001851671377, "grad_norm": 0.26315242052078247, "learning_rate": 5.066336441174598e-06, "loss": 0.01643846556544304, "memory(GiB)": 21.48, "step": 16038, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.953709 }, { "epoch": 0.5210343371341325, "grad_norm": 0.6236198544502258, "learning_rate": 5.065799335135339e-06, "loss": 0.025891372933983803, "memory(GiB)": 21.48, "step": 16039, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953721 }, { "epoch": 0.5210668225968879, "grad_norm": 0.6263956427574158, "learning_rate": 5.065262228336666e-06, "loss": 0.028720252215862274, "memory(GiB)": 21.48, "step": 16040, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.953734 }, { "epoch": 0.5210993080596433, "grad_norm": 0.3647666573524475, "learning_rate": 5.064725120784782e-06, "loss": 0.015707943588495255, "memory(GiB)": 21.48, "step": 16041, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.953746 }, { "epoch": 0.5211317935223987, "grad_norm": 0.37502649426460266, "learning_rate": 5.064188012485884e-06, "loss": 0.019582334905862808, "memory(GiB)": 21.48, "step": 16042, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953759 }, { "epoch": 0.5211642789851542, "grad_norm": 0.377469003200531, "learning_rate": 5.06365090344617e-06, "loss": 0.016897235065698624, "memory(GiB)": 21.48, "step": 16043, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.953772 }, { "epoch": 0.5211967644479095, "grad_norm": 0.37558072805404663, "learning_rate": 5.063113793671841e-06, "loss": 0.018677033483982086, "memory(GiB)": 21.48, "step": 16044, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.953784 }, { "epoch": 0.521229249910665, "grad_norm": 0.2617570161819458, "learning_rate": 5.062576683169094e-06, "loss": 0.015266621485352516, "memory(GiB)": 21.48, "step": 16045, "token_acc": 0.9939024390243902, "train_speed(iter/s)": 0.953797 }, { "epoch": 0.5212617353734204, "grad_norm": 0.549853503704071, "learning_rate": 5.06203957194413e-06, "loss": 0.03202133998274803, "memory(GiB)": 21.48, "step": 16046, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953809 }, { "epoch": 0.5212942208361758, "grad_norm": 0.6526930928230286, "learning_rate": 5.061502460003145e-06, "loss": 0.029051747173070908, "memory(GiB)": 21.48, "step": 16047, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.953821 }, { "epoch": 0.5213267062989312, "grad_norm": 0.4135952889919281, "learning_rate": 5.06096534735234e-06, "loss": 0.016680870205163956, "memory(GiB)": 21.48, "step": 16048, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.953834 }, { "epoch": 0.5213591917616867, "grad_norm": 1.7311819791793823, "learning_rate": 5.060428233997914e-06, "loss": 0.025189371779561043, "memory(GiB)": 21.48, "step": 16049, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953847 }, { "epoch": 0.521391677224442, "grad_norm": 0.3805360496044159, "learning_rate": 5.0598911199460665e-06, "loss": 0.019642163068056107, "memory(GiB)": 21.48, "step": 16050, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.95386 }, { "epoch": 0.5214241626871975, "grad_norm": 0.3317365348339081, "learning_rate": 5.059354005202996e-06, "loss": 0.018543614074587822, "memory(GiB)": 21.48, "step": 16051, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.953857 }, { "epoch": 0.5214566481499529, "grad_norm": 0.36283719539642334, "learning_rate": 5.058816889774899e-06, "loss": 0.02485949546098709, "memory(GiB)": 21.48, "step": 16052, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953869 }, { "epoch": 0.5214891336127083, "grad_norm": 0.32079648971557617, "learning_rate": 5.058279773667979e-06, "loss": 0.016834646463394165, "memory(GiB)": 21.48, "step": 16053, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.953882 }, { "epoch": 0.5215216190754637, "grad_norm": 0.3425792455673218, "learning_rate": 5.0577426568884304e-06, "loss": 0.01820199564099312, "memory(GiB)": 21.48, "step": 16054, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.953894 }, { "epoch": 0.5215541045382192, "grad_norm": 0.33765166997909546, "learning_rate": 5.0572055394424565e-06, "loss": 0.015000519342720509, "memory(GiB)": 21.48, "step": 16055, "token_acc": 1.0, "train_speed(iter/s)": 0.953905 }, { "epoch": 0.5215865900009745, "grad_norm": 0.38933467864990234, "learning_rate": 5.056668421336255e-06, "loss": 0.023540697991847992, "memory(GiB)": 21.48, "step": 16056, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.953918 }, { "epoch": 0.52161907546373, "grad_norm": 0.48702993988990784, "learning_rate": 5.056131302576024e-06, "loss": 0.023923350498080254, "memory(GiB)": 21.48, "step": 16057, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.953931 }, { "epoch": 0.5216515609264853, "grad_norm": 0.3105226457118988, "learning_rate": 5.055594183167963e-06, "loss": 0.01622629165649414, "memory(GiB)": 21.48, "step": 16058, "token_acc": 1.0, "train_speed(iter/s)": 0.953942 }, { "epoch": 0.5216840463892408, "grad_norm": 0.4165165424346924, "learning_rate": 5.05505706311827e-06, "loss": 0.017906254157423973, "memory(GiB)": 21.48, "step": 16059, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.953954 }, { "epoch": 0.5217165318519962, "grad_norm": 0.3859642744064331, "learning_rate": 5.054519942433148e-06, "loss": 0.021167002618312836, "memory(GiB)": 21.48, "step": 16060, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.953966 }, { "epoch": 0.5217490173147517, "grad_norm": 0.8035604357719421, "learning_rate": 5.053982821118791e-06, "loss": 0.03233034163713455, "memory(GiB)": 21.48, "step": 16061, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.953979 }, { "epoch": 0.521781502777507, "grad_norm": 0.3538617193698883, "learning_rate": 5.053445699181402e-06, "loss": 0.016057178378105164, "memory(GiB)": 21.48, "step": 16062, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.953991 }, { "epoch": 0.5218139882402625, "grad_norm": 0.5042276382446289, "learning_rate": 5.052908576627177e-06, "loss": 0.02130172774195671, "memory(GiB)": 21.48, "step": 16063, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.954001 }, { "epoch": 0.521846473703018, "grad_norm": 0.3188153803348541, "learning_rate": 5.052371453462317e-06, "loss": 0.020933877676725388, "memory(GiB)": 21.48, "step": 16064, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.954009 }, { "epoch": 0.5218789591657733, "grad_norm": 0.3058767020702362, "learning_rate": 5.05183432969302e-06, "loss": 0.016648918390274048, "memory(GiB)": 21.48, "step": 16065, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954017 }, { "epoch": 0.5219114446285288, "grad_norm": 0.34408658742904663, "learning_rate": 5.051297205325487e-06, "loss": 0.012760394252836704, "memory(GiB)": 21.48, "step": 16066, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954025 }, { "epoch": 0.5219439300912841, "grad_norm": 0.3257344365119934, "learning_rate": 5.0507600803659166e-06, "loss": 0.020155319944024086, "memory(GiB)": 21.48, "step": 16067, "token_acc": 0.9812206572769953, "train_speed(iter/s)": 0.954034 }, { "epoch": 0.5219764155540396, "grad_norm": 0.4582658112049103, "learning_rate": 5.050222954820506e-06, "loss": 0.02101857028901577, "memory(GiB)": 21.48, "step": 16068, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954042 }, { "epoch": 0.522008901016795, "grad_norm": 0.4141581952571869, "learning_rate": 5.0496858286954555e-06, "loss": 0.02005031704902649, "memory(GiB)": 21.48, "step": 16069, "token_acc": 1.0, "train_speed(iter/s)": 0.95405 }, { "epoch": 0.5220413864795505, "grad_norm": 0.2945137619972229, "learning_rate": 5.049148701996965e-06, "loss": 0.016630131751298904, "memory(GiB)": 21.48, "step": 16070, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.954059 }, { "epoch": 0.5220738719423058, "grad_norm": 0.3435404598712921, "learning_rate": 5.048611574731233e-06, "loss": 0.016203850507736206, "memory(GiB)": 21.48, "step": 16071, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954067 }, { "epoch": 0.5221063574050613, "grad_norm": 0.3232367932796478, "learning_rate": 5.048074446904458e-06, "loss": 0.017112622037529945, "memory(GiB)": 21.48, "step": 16072, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954075 }, { "epoch": 0.5221388428678166, "grad_norm": 0.37448757886886597, "learning_rate": 5.047537318522841e-06, "loss": 0.01979845017194748, "memory(GiB)": 21.48, "step": 16073, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954084 }, { "epoch": 0.5221713283305721, "grad_norm": 0.4180486798286438, "learning_rate": 5.04700018959258e-06, "loss": 0.023393958806991577, "memory(GiB)": 21.48, "step": 16074, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954092 }, { "epoch": 0.5222038137933275, "grad_norm": 1.7119749784469604, "learning_rate": 5.046463060119874e-06, "loss": 0.019200198352336884, "memory(GiB)": 21.48, "step": 16075, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954101 }, { "epoch": 0.522236299256083, "grad_norm": 0.7871873378753662, "learning_rate": 5.0459259301109224e-06, "loss": 0.01855229213833809, "memory(GiB)": 21.48, "step": 16076, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954108 }, { "epoch": 0.5222687847188383, "grad_norm": 0.5314779281616211, "learning_rate": 5.045388799571924e-06, "loss": 0.023414161056280136, "memory(GiB)": 21.48, "step": 16077, "token_acc": 0.9768518518518519, "train_speed(iter/s)": 0.954116 }, { "epoch": 0.5223012701815938, "grad_norm": 0.362377792596817, "learning_rate": 5.044851668509079e-06, "loss": 0.024278361350297928, "memory(GiB)": 21.48, "step": 16078, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.954124 }, { "epoch": 0.5223337556443491, "grad_norm": 0.34862712025642395, "learning_rate": 5.044314536928585e-06, "loss": 0.01745869405567646, "memory(GiB)": 21.48, "step": 16079, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.954132 }, { "epoch": 0.5223662411071046, "grad_norm": 0.3588005602359772, "learning_rate": 5.043777404836644e-06, "loss": 0.024777989834547043, "memory(GiB)": 21.48, "step": 16080, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95414 }, { "epoch": 0.52239872656986, "grad_norm": 0.37275227904319763, "learning_rate": 5.0432402722394535e-06, "loss": 0.019903022795915604, "memory(GiB)": 21.48, "step": 16081, "token_acc": 1.0, "train_speed(iter/s)": 0.954148 }, { "epoch": 0.5224312120326154, "grad_norm": 0.32395562529563904, "learning_rate": 5.042703139143211e-06, "loss": 0.013419397175312042, "memory(GiB)": 21.48, "step": 16082, "token_acc": 1.0, "train_speed(iter/s)": 0.954155 }, { "epoch": 0.5224636974953708, "grad_norm": 0.34521690011024475, "learning_rate": 5.042166005554119e-06, "loss": 0.017569370567798615, "memory(GiB)": 21.48, "step": 16083, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.954165 }, { "epoch": 0.5224961829581263, "grad_norm": 0.26889267563819885, "learning_rate": 5.041628871478373e-06, "loss": 0.013245921581983566, "memory(GiB)": 21.48, "step": 16084, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954176 }, { "epoch": 0.5225286684208816, "grad_norm": 0.4341003894805908, "learning_rate": 5.041091736922175e-06, "loss": 0.020662274211645126, "memory(GiB)": 21.48, "step": 16085, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954187 }, { "epoch": 0.5225611538836371, "grad_norm": 0.4312751591205597, "learning_rate": 5.040554601891726e-06, "loss": 0.016212671995162964, "memory(GiB)": 21.48, "step": 16086, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.954197 }, { "epoch": 0.5225936393463925, "grad_norm": 0.3631649613380432, "learning_rate": 5.040017466393221e-06, "loss": 0.027538370341062546, "memory(GiB)": 21.48, "step": 16087, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.954208 }, { "epoch": 0.5226261248091479, "grad_norm": 0.5074607133865356, "learning_rate": 5.039480330432862e-06, "loss": 0.025503724813461304, "memory(GiB)": 21.48, "step": 16088, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.95422 }, { "epoch": 0.5226586102719033, "grad_norm": 0.46052971482276917, "learning_rate": 5.0389431940168466e-06, "loss": 0.02573324367403984, "memory(GiB)": 21.48, "step": 16089, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954232 }, { "epoch": 0.5226910957346588, "grad_norm": 0.40012404322624207, "learning_rate": 5.038406057151376e-06, "loss": 0.019457653164863586, "memory(GiB)": 21.48, "step": 16090, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.5227235811974141, "grad_norm": 0.29530856013298035, "learning_rate": 5.037868919842648e-06, "loss": 0.015472247265279293, "memory(GiB)": 21.48, "step": 16091, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954255 }, { "epoch": 0.5227560666601696, "grad_norm": 0.398605614900589, "learning_rate": 5.037331782096861e-06, "loss": 0.02632579579949379, "memory(GiB)": 21.48, "step": 16092, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954268 }, { "epoch": 0.522788552122925, "grad_norm": 0.2886177599430084, "learning_rate": 5.0367946439202165e-06, "loss": 0.0124953119084239, "memory(GiB)": 21.48, "step": 16093, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95428 }, { "epoch": 0.5228210375856804, "grad_norm": 0.2738480865955353, "learning_rate": 5.036257505318914e-06, "loss": 0.02082224003970623, "memory(GiB)": 21.48, "step": 16094, "token_acc": 1.0, "train_speed(iter/s)": 0.954292 }, { "epoch": 0.5228535230484358, "grad_norm": 0.26139897108078003, "learning_rate": 5.03572036629915e-06, "loss": 0.02007032372057438, "memory(GiB)": 21.48, "step": 16095, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.954304 }, { "epoch": 0.5228860085111913, "grad_norm": 0.4722438156604767, "learning_rate": 5.035183226867127e-06, "loss": 0.024162907153367996, "memory(GiB)": 21.48, "step": 16096, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954317 }, { "epoch": 0.5229184939739466, "grad_norm": 0.45245814323425293, "learning_rate": 5.034646087029041e-06, "loss": 0.023915819823741913, "memory(GiB)": 21.48, "step": 16097, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.95433 }, { "epoch": 0.5229509794367021, "grad_norm": 0.29818102717399597, "learning_rate": 5.034108946791093e-06, "loss": 0.022211682051420212, "memory(GiB)": 21.48, "step": 16098, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.954343 }, { "epoch": 0.5229834648994575, "grad_norm": 0.26440101861953735, "learning_rate": 5.033571806159484e-06, "loss": 0.02310365065932274, "memory(GiB)": 21.48, "step": 16099, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954355 }, { "epoch": 0.5230159503622129, "grad_norm": 0.281818687915802, "learning_rate": 5.033034665140411e-06, "loss": 0.01903487741947174, "memory(GiB)": 21.48, "step": 16100, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954368 }, { "epoch": 0.5230484358249683, "grad_norm": 0.3261679708957672, "learning_rate": 5.032497523740073e-06, "loss": 0.018258169293403625, "memory(GiB)": 21.48, "step": 16101, "token_acc": 1.0, "train_speed(iter/s)": 0.95438 }, { "epoch": 0.5230809212877238, "grad_norm": 0.9065921306610107, "learning_rate": 5.031960381964671e-06, "loss": 0.03311305120587349, "memory(GiB)": 21.48, "step": 16102, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954392 }, { "epoch": 0.5231134067504791, "grad_norm": 0.21265201270580292, "learning_rate": 5.031423239820403e-06, "loss": 0.012350658886134624, "memory(GiB)": 21.48, "step": 16103, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954405 }, { "epoch": 0.5231458922132346, "grad_norm": 0.308775931596756, "learning_rate": 5.030886097313471e-06, "loss": 0.020230382680892944, "memory(GiB)": 21.48, "step": 16104, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.954418 }, { "epoch": 0.52317837767599, "grad_norm": 0.3602910339832306, "learning_rate": 5.03034895445007e-06, "loss": 0.018355634063482285, "memory(GiB)": 21.48, "step": 16105, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.954431 }, { "epoch": 0.5232108631387454, "grad_norm": 0.3204387128353119, "learning_rate": 5.029811811236404e-06, "loss": 0.020119881257414818, "memory(GiB)": 21.48, "step": 16106, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954443 }, { "epoch": 0.5232433486015008, "grad_norm": 0.26686689257621765, "learning_rate": 5.029274667678667e-06, "loss": 0.020485932007431984, "memory(GiB)": 21.48, "step": 16107, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.954455 }, { "epoch": 0.5232758340642563, "grad_norm": 0.4036104083061218, "learning_rate": 5.028737523783064e-06, "loss": 0.024219783022999763, "memory(GiB)": 21.48, "step": 16108, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954468 }, { "epoch": 0.5233083195270116, "grad_norm": 0.4338463842868805, "learning_rate": 5.028200379555791e-06, "loss": 0.02630842849612236, "memory(GiB)": 21.48, "step": 16109, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95448 }, { "epoch": 0.5233408049897671, "grad_norm": 0.370834082365036, "learning_rate": 5.0276632350030475e-06, "loss": 0.016628973186016083, "memory(GiB)": 21.48, "step": 16110, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954492 }, { "epoch": 0.5233732904525225, "grad_norm": 0.322563499212265, "learning_rate": 5.027126090131034e-06, "loss": 0.018901098519563675, "memory(GiB)": 21.48, "step": 16111, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954505 }, { "epoch": 0.5234057759152779, "grad_norm": 0.4265487790107727, "learning_rate": 5.0265889449459494e-06, "loss": 0.018184388056397438, "memory(GiB)": 21.48, "step": 16112, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954517 }, { "epoch": 0.5234382613780333, "grad_norm": 0.40085697174072266, "learning_rate": 5.026051799453994e-06, "loss": 0.021391062065958977, "memory(GiB)": 21.48, "step": 16113, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95453 }, { "epoch": 0.5234707468407888, "grad_norm": 0.42074576020240784, "learning_rate": 5.025514653661364e-06, "loss": 0.01861872337758541, "memory(GiB)": 21.48, "step": 16114, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.954542 }, { "epoch": 0.5235032323035441, "grad_norm": 0.36151766777038574, "learning_rate": 5.024977507574262e-06, "loss": 0.02063192054629326, "memory(GiB)": 21.48, "step": 16115, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954555 }, { "epoch": 0.5235357177662996, "grad_norm": 0.33430778980255127, "learning_rate": 5.024440361198886e-06, "loss": 0.017566921189427376, "memory(GiB)": 21.48, "step": 16116, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954568 }, { "epoch": 0.523568203229055, "grad_norm": 0.4249767065048218, "learning_rate": 5.023903214541435e-06, "loss": 0.0180208683013916, "memory(GiB)": 21.48, "step": 16117, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.95458 }, { "epoch": 0.5236006886918104, "grad_norm": 0.3246772289276123, "learning_rate": 5.023366067608111e-06, "loss": 0.017012719064950943, "memory(GiB)": 21.48, "step": 16118, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954593 }, { "epoch": 0.5236331741545658, "grad_norm": 0.29532063007354736, "learning_rate": 5.02282892040511e-06, "loss": 0.021652711555361748, "memory(GiB)": 21.48, "step": 16119, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954605 }, { "epoch": 0.5236656596173213, "grad_norm": 0.34748029708862305, "learning_rate": 5.022291772938633e-06, "loss": 0.016559071838855743, "memory(GiB)": 21.48, "step": 16120, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954618 }, { "epoch": 0.5236981450800766, "grad_norm": 0.2835300862789154, "learning_rate": 5.0217546252148795e-06, "loss": 0.0158031415194273, "memory(GiB)": 21.48, "step": 16121, "token_acc": 0.976, "train_speed(iter/s)": 0.95463 }, { "epoch": 0.5237306305428321, "grad_norm": 0.29440534114837646, "learning_rate": 5.021217477240049e-06, "loss": 0.016764389351010323, "memory(GiB)": 21.48, "step": 16122, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.954643 }, { "epoch": 0.5237631160055874, "grad_norm": 0.3216303884983063, "learning_rate": 5.020680329020341e-06, "loss": 0.01905585080385208, "memory(GiB)": 21.48, "step": 16123, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954654 }, { "epoch": 0.5237956014683429, "grad_norm": 0.34870779514312744, "learning_rate": 5.020143180561953e-06, "loss": 0.022989405319094658, "memory(GiB)": 21.48, "step": 16124, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.5238280869310983, "grad_norm": 0.36684390902519226, "learning_rate": 5.019606031871087e-06, "loss": 0.02210398018360138, "memory(GiB)": 21.48, "step": 16125, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95467 }, { "epoch": 0.5238605723938538, "grad_norm": 0.45163238048553467, "learning_rate": 5.01906888295394e-06, "loss": 0.03035012260079384, "memory(GiB)": 21.48, "step": 16126, "token_acc": 0.988, "train_speed(iter/s)": 0.95468 }, { "epoch": 0.5238930578566091, "grad_norm": 0.3624260425567627, "learning_rate": 5.018531733816715e-06, "loss": 0.020266905426979065, "memory(GiB)": 21.48, "step": 16127, "token_acc": 1.0, "train_speed(iter/s)": 0.95469 }, { "epoch": 0.5239255433193646, "grad_norm": 0.33983731269836426, "learning_rate": 5.017994584465607e-06, "loss": 0.024232134222984314, "memory(GiB)": 21.48, "step": 16128, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954698 }, { "epoch": 0.52395802878212, "grad_norm": 0.3589063286781311, "learning_rate": 5.017457434906819e-06, "loss": 0.022115737199783325, "memory(GiB)": 21.48, "step": 16129, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954707 }, { "epoch": 0.5239905142448754, "grad_norm": 0.3891540467739105, "learning_rate": 5.0169202851465485e-06, "loss": 0.022770319133996964, "memory(GiB)": 21.48, "step": 16130, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.5240229997076309, "grad_norm": 0.32042258977890015, "learning_rate": 5.016383135190994e-06, "loss": 0.01588229276239872, "memory(GiB)": 21.48, "step": 16131, "token_acc": 1.0, "train_speed(iter/s)": 0.954723 }, { "epoch": 0.5240554851703862, "grad_norm": 0.6444287300109863, "learning_rate": 5.015845985046358e-06, "loss": 0.03309256583452225, "memory(GiB)": 21.48, "step": 16132, "token_acc": 0.9943502824858758, "train_speed(iter/s)": 0.954731 }, { "epoch": 0.5240879706331417, "grad_norm": 0.390406996011734, "learning_rate": 5.015308834718838e-06, "loss": 0.01994878239929676, "memory(GiB)": 21.48, "step": 16133, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954739 }, { "epoch": 0.5241204560958971, "grad_norm": 0.3790639638900757, "learning_rate": 5.014771684214633e-06, "loss": 0.028331773355603218, "memory(GiB)": 21.48, "step": 16134, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954748 }, { "epoch": 0.5241529415586526, "grad_norm": 0.3229029178619385, "learning_rate": 5.014234533539944e-06, "loss": 0.022044338285923004, "memory(GiB)": 21.48, "step": 16135, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954756 }, { "epoch": 0.5241854270214079, "grad_norm": 0.38161197304725647, "learning_rate": 5.01369738270097e-06, "loss": 0.022428713738918304, "memory(GiB)": 21.48, "step": 16136, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.5242179124841634, "grad_norm": 0.44858360290527344, "learning_rate": 5.013160231703909e-06, "loss": 0.02641340345144272, "memory(GiB)": 21.48, "step": 16137, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954773 }, { "epoch": 0.5242503979469187, "grad_norm": 0.4068063497543335, "learning_rate": 5.012623080554961e-06, "loss": 0.01837391033768654, "memory(GiB)": 21.48, "step": 16138, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954782 }, { "epoch": 0.5242828834096742, "grad_norm": 0.31469592452049255, "learning_rate": 5.012085929260326e-06, "loss": 0.021922873333096504, "memory(GiB)": 21.48, "step": 16139, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.95479 }, { "epoch": 0.5243153688724296, "grad_norm": 0.29401978850364685, "learning_rate": 5.011548777826204e-06, "loss": 0.018331216648221016, "memory(GiB)": 21.48, "step": 16140, "token_acc": 1.0, "train_speed(iter/s)": 0.954798 }, { "epoch": 0.524347854335185, "grad_norm": 0.5297715067863464, "learning_rate": 5.0110116262587936e-06, "loss": 0.028514914214611053, "memory(GiB)": 21.48, "step": 16141, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.954805 }, { "epoch": 0.5243803397979404, "grad_norm": 0.349351167678833, "learning_rate": 5.0104744745642934e-06, "loss": 0.02437637187540531, "memory(GiB)": 21.48, "step": 16142, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954814 }, { "epoch": 0.5244128252606959, "grad_norm": 0.3295886218547821, "learning_rate": 5.009937322748905e-06, "loss": 0.014102566987276077, "memory(GiB)": 21.48, "step": 16143, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954825 }, { "epoch": 0.5244453107234512, "grad_norm": 0.3758181035518646, "learning_rate": 5.009400170818825e-06, "loss": 0.02009035460650921, "memory(GiB)": 21.48, "step": 16144, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.954836 }, { "epoch": 0.5244777961862067, "grad_norm": 0.3177046775817871, "learning_rate": 5.008863018780257e-06, "loss": 0.014844895340502262, "memory(GiB)": 21.48, "step": 16145, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.954847 }, { "epoch": 0.5245102816489621, "grad_norm": 0.3799510896205902, "learning_rate": 5.008325866639398e-06, "loss": 0.02023240737617016, "memory(GiB)": 21.48, "step": 16146, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954856 }, { "epoch": 0.5245427671117175, "grad_norm": 0.36937299370765686, "learning_rate": 5.007788714402444e-06, "loss": 0.024924714118242264, "memory(GiB)": 21.48, "step": 16147, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.5245752525744729, "grad_norm": 0.2738804817199707, "learning_rate": 5.0072515620756e-06, "loss": 0.017587989568710327, "memory(GiB)": 21.48, "step": 16148, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954875 }, { "epoch": 0.5246077380372284, "grad_norm": 0.35172945261001587, "learning_rate": 5.006714409665063e-06, "loss": 0.026137812063097954, "memory(GiB)": 21.48, "step": 16149, "token_acc": 0.9721115537848606, "train_speed(iter/s)": 0.954886 }, { "epoch": 0.5246402234999837, "grad_norm": 0.41676878929138184, "learning_rate": 5.006177257177034e-06, "loss": 0.02372569590806961, "memory(GiB)": 21.48, "step": 16150, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954897 }, { "epoch": 0.5246727089627392, "grad_norm": 1.201534628868103, "learning_rate": 5.00564010461771e-06, "loss": 0.019495567306876183, "memory(GiB)": 21.48, "step": 16151, "token_acc": 1.0, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.5247051944254946, "grad_norm": 0.5794497728347778, "learning_rate": 5.005102951993292e-06, "loss": 0.02460925281047821, "memory(GiB)": 21.48, "step": 16152, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.954919 }, { "epoch": 0.52473767988825, "grad_norm": 0.45563048124313354, "learning_rate": 5.00456579930998e-06, "loss": 0.031111765652894974, "memory(GiB)": 21.48, "step": 16153, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.5247701653510054, "grad_norm": 0.29854172468185425, "learning_rate": 5.0040286465739705e-06, "loss": 0.01921921782195568, "memory(GiB)": 21.48, "step": 16154, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.5248026508137609, "grad_norm": 0.26228851079940796, "learning_rate": 5.003491493791467e-06, "loss": 0.01403247844427824, "memory(GiB)": 21.48, "step": 16155, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954956 }, { "epoch": 0.5248351362765162, "grad_norm": 0.5517885684967041, "learning_rate": 5.002954340968665e-06, "loss": 0.0235136728733778, "memory(GiB)": 21.48, "step": 16156, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.5248676217392717, "grad_norm": 0.4231211245059967, "learning_rate": 5.002417188111769e-06, "loss": 0.024396399036049843, "memory(GiB)": 21.48, "step": 16157, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.954981 }, { "epoch": 0.5249001072020271, "grad_norm": 0.43138155341148376, "learning_rate": 5.001880035226973e-06, "loss": 0.025346754118800163, "memory(GiB)": 21.48, "step": 16158, "token_acc": 0.9897260273972602, "train_speed(iter/s)": 0.954993 }, { "epoch": 0.5249325926647825, "grad_norm": 0.5810091495513916, "learning_rate": 5.00134288232048e-06, "loss": 0.026543835178017616, "memory(GiB)": 21.48, "step": 16159, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955006 }, { "epoch": 0.5249650781275379, "grad_norm": 0.45045334100723267, "learning_rate": 5.000805729398488e-06, "loss": 0.022730320692062378, "memory(GiB)": 21.48, "step": 16160, "token_acc": 1.0, "train_speed(iter/s)": 0.955019 }, { "epoch": 0.5249975635902934, "grad_norm": 0.35526853799819946, "learning_rate": 5.000268576467197e-06, "loss": 0.02078825980424881, "memory(GiB)": 21.48, "step": 16161, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.5250300490530487, "grad_norm": 0.5000081658363342, "learning_rate": 4.999731423532806e-06, "loss": 0.01691942662000656, "memory(GiB)": 21.48, "step": 16162, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.5250625345158042, "grad_norm": 1.1722166538238525, "learning_rate": 4.999194270601514e-06, "loss": 0.029259461909532547, "memory(GiB)": 21.48, "step": 16163, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.5250950199785596, "grad_norm": 0.20195743441581726, "learning_rate": 4.998657117679523e-06, "loss": 0.008525056764483452, "memory(GiB)": 21.48, "step": 16164, "token_acc": 1.0, "train_speed(iter/s)": 0.95507 }, { "epoch": 0.525127505441315, "grad_norm": 0.2916161119937897, "learning_rate": 4.998119964773029e-06, "loss": 0.016873471438884735, "memory(GiB)": 21.48, "step": 16165, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955082 }, { "epoch": 0.5251599909040704, "grad_norm": 0.2841629683971405, "learning_rate": 4.997582811888234e-06, "loss": 0.016908099874854088, "memory(GiB)": 21.48, "step": 16166, "token_acc": 1.0, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.5251924763668259, "grad_norm": 0.39344027638435364, "learning_rate": 4.997045659031336e-06, "loss": 0.021895788609981537, "memory(GiB)": 21.48, "step": 16167, "token_acc": 1.0, "train_speed(iter/s)": 0.955107 }, { "epoch": 0.5252249618295812, "grad_norm": 0.3168019652366638, "learning_rate": 4.996508506208536e-06, "loss": 0.018444688990712166, "memory(GiB)": 21.48, "step": 16168, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.5252574472923367, "grad_norm": 0.2726091742515564, "learning_rate": 4.99597135342603e-06, "loss": 0.012833889573812485, "memory(GiB)": 21.48, "step": 16169, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.5252899327550921, "grad_norm": 0.3106713593006134, "learning_rate": 4.995434200690022e-06, "loss": 0.023042283952236176, "memory(GiB)": 21.48, "step": 16170, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955145 }, { "epoch": 0.5253224182178475, "grad_norm": 0.350700318813324, "learning_rate": 4.99489704800671e-06, "loss": 0.01903209276497364, "memory(GiB)": 21.48, "step": 16171, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955157 }, { "epoch": 0.5253549036806029, "grad_norm": 0.5915251970291138, "learning_rate": 4.994359895382291e-06, "loss": 0.0265298280864954, "memory(GiB)": 21.48, "step": 16172, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.5253873891433584, "grad_norm": 0.47737252712249756, "learning_rate": 4.993822742822968e-06, "loss": 0.018485810607671738, "memory(GiB)": 21.48, "step": 16173, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.955181 }, { "epoch": 0.5254198746061137, "grad_norm": 0.3574310839176178, "learning_rate": 4.993285590334938e-06, "loss": 0.018855154514312744, "memory(GiB)": 21.48, "step": 16174, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955193 }, { "epoch": 0.5254523600688692, "grad_norm": 0.6705458760261536, "learning_rate": 4.992748437924401e-06, "loss": 0.016058795154094696, "memory(GiB)": 21.48, "step": 16175, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.5254848455316246, "grad_norm": 0.3575431704521179, "learning_rate": 4.992211285597557e-06, "loss": 0.02319246158003807, "memory(GiB)": 21.48, "step": 16176, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955218 }, { "epoch": 0.52551733099438, "grad_norm": 0.266491174697876, "learning_rate": 4.991674133360606e-06, "loss": 0.019749969244003296, "memory(GiB)": 21.48, "step": 16177, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955231 }, { "epoch": 0.5255498164571354, "grad_norm": 0.40149012207984924, "learning_rate": 4.991136981219745e-06, "loss": 0.024466747418045998, "memory(GiB)": 21.48, "step": 16178, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955243 }, { "epoch": 0.5255823019198909, "grad_norm": 0.37097007036209106, "learning_rate": 4.9905998291811755e-06, "loss": 0.024204585701227188, "memory(GiB)": 21.48, "step": 16179, "token_acc": 0.992, "train_speed(iter/s)": 0.955255 }, { "epoch": 0.5256147873826462, "grad_norm": 0.39223387837409973, "learning_rate": 4.990062677251096e-06, "loss": 0.020294712856411934, "memory(GiB)": 21.48, "step": 16180, "token_acc": 1.0, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.5256472728454017, "grad_norm": 0.377334326505661, "learning_rate": 4.989525525435708e-06, "loss": 0.020487193018198013, "memory(GiB)": 21.48, "step": 16181, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.95528 }, { "epoch": 0.525679758308157, "grad_norm": 0.49078235030174255, "learning_rate": 4.988988373741208e-06, "loss": 0.032960593700408936, "memory(GiB)": 21.48, "step": 16182, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955292 }, { "epoch": 0.5257122437709125, "grad_norm": 0.518949031829834, "learning_rate": 4.988451222173799e-06, "loss": 0.018561583012342453, "memory(GiB)": 21.48, "step": 16183, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955302 }, { "epoch": 0.5257447292336679, "grad_norm": 0.31047868728637695, "learning_rate": 4.987914070739675e-06, "loss": 0.020169097930192947, "memory(GiB)": 21.48, "step": 16184, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955311 }, { "epoch": 0.5257772146964234, "grad_norm": 0.4114791750907898, "learning_rate": 4.98737691944504e-06, "loss": 0.01967761293053627, "memory(GiB)": 21.48, "step": 16185, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.5258097001591787, "grad_norm": 0.35464948415756226, "learning_rate": 4.986839768296093e-06, "loss": 0.016539115458726883, "memory(GiB)": 21.48, "step": 16186, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.955331 }, { "epoch": 0.5258421856219342, "grad_norm": 0.692946195602417, "learning_rate": 4.986302617299031e-06, "loss": 0.021175896748900414, "memory(GiB)": 21.48, "step": 16187, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.5258746710846895, "grad_norm": 0.3485991954803467, "learning_rate": 4.985765466460058e-06, "loss": 0.019201654940843582, "memory(GiB)": 21.48, "step": 16188, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.955349 }, { "epoch": 0.525907156547445, "grad_norm": 0.4157334864139557, "learning_rate": 4.985228315785367e-06, "loss": 0.023048456758260727, "memory(GiB)": 21.48, "step": 16189, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955358 }, { "epoch": 0.5259396420102004, "grad_norm": 0.46946772933006287, "learning_rate": 4.984691165281164e-06, "loss": 0.02658623829483986, "memory(GiB)": 21.48, "step": 16190, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.5259721274729559, "grad_norm": 0.44205421209335327, "learning_rate": 4.984154014953642e-06, "loss": 0.02276073396205902, "memory(GiB)": 21.48, "step": 16191, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.5260046129357113, "grad_norm": 0.3057050108909607, "learning_rate": 4.983616864809007e-06, "loss": 0.01722722314298153, "memory(GiB)": 21.48, "step": 16192, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.5260370983984667, "grad_norm": 0.39961758255958557, "learning_rate": 4.983079714853453e-06, "loss": 0.022206785157322884, "memory(GiB)": 21.48, "step": 16193, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955393 }, { "epoch": 0.5260695838612222, "grad_norm": 0.29110342264175415, "learning_rate": 4.982542565093183e-06, "loss": 0.016966933384537697, "memory(GiB)": 21.48, "step": 16194, "token_acc": 1.0, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.5261020693239775, "grad_norm": 0.3752358555793762, "learning_rate": 4.9820054155343945e-06, "loss": 0.019756777212023735, "memory(GiB)": 21.48, "step": 16195, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.955409 }, { "epoch": 0.526134554786733, "grad_norm": 0.41232284903526306, "learning_rate": 4.981468266183287e-06, "loss": 0.02441425621509552, "memory(GiB)": 21.48, "step": 16196, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955417 }, { "epoch": 0.5261670402494883, "grad_norm": 0.29669857025146484, "learning_rate": 4.980931117046061e-06, "loss": 0.019498102366924286, "memory(GiB)": 21.48, "step": 16197, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.955426 }, { "epoch": 0.5261995257122438, "grad_norm": 0.22218136489391327, "learning_rate": 4.980393968128916e-06, "loss": 0.015031429007649422, "memory(GiB)": 21.48, "step": 16198, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955435 }, { "epoch": 0.5262320111749992, "grad_norm": 0.2581884562969208, "learning_rate": 4.979856819438048e-06, "loss": 0.014916897751390934, "memory(GiB)": 21.48, "step": 16199, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955443 }, { "epoch": 0.5262644966377547, "grad_norm": 0.412176251411438, "learning_rate": 4.979319670979662e-06, "loss": 0.02455555647611618, "memory(GiB)": 21.48, "step": 16200, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.52629698210051, "grad_norm": 0.21569493412971497, "learning_rate": 4.9787825227599525e-06, "loss": 0.018787086009979248, "memory(GiB)": 21.48, "step": 16201, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.5263294675632655, "grad_norm": 0.3424663543701172, "learning_rate": 4.978245374785121e-06, "loss": 0.018077025189995766, "memory(GiB)": 21.48, "step": 16202, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955468 }, { "epoch": 0.5263619530260208, "grad_norm": 0.30856582522392273, "learning_rate": 4.977708227061368e-06, "loss": 0.01783077046275139, "memory(GiB)": 21.48, "step": 16203, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.5263944384887763, "grad_norm": 0.6140968203544617, "learning_rate": 4.977171079594891e-06, "loss": 0.027197277173399925, "memory(GiB)": 21.48, "step": 16204, "token_acc": 1.0, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.5264269239515317, "grad_norm": 0.41646692156791687, "learning_rate": 4.976633932391891e-06, "loss": 0.01787387952208519, "memory(GiB)": 21.48, "step": 16205, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.5264594094142871, "grad_norm": 0.36002612113952637, "learning_rate": 4.976096785458565e-06, "loss": 0.01824173331260681, "memory(GiB)": 21.48, "step": 16206, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.5264918948770425, "grad_norm": 0.30512961745262146, "learning_rate": 4.975559638801116e-06, "loss": 0.019132237881422043, "memory(GiB)": 21.48, "step": 16207, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.526524380339798, "grad_norm": 0.4079890251159668, "learning_rate": 4.97502249242574e-06, "loss": 0.023596029728651047, "memory(GiB)": 21.48, "step": 16208, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.5265568658025533, "grad_norm": 0.2535816729068756, "learning_rate": 4.974485346338637e-06, "loss": 0.014830639585852623, "memory(GiB)": 21.48, "step": 16209, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.5265893512653088, "grad_norm": 0.3266950249671936, "learning_rate": 4.973948200546008e-06, "loss": 0.021463563665747643, "memory(GiB)": 21.48, "step": 16210, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.5266218367280642, "grad_norm": 0.33476853370666504, "learning_rate": 4.973411055054053e-06, "loss": 0.025350553914904594, "memory(GiB)": 21.48, "step": 16211, "token_acc": 1.0, "train_speed(iter/s)": 0.955553 }, { "epoch": 0.5266543221908196, "grad_norm": 0.5628129243850708, "learning_rate": 4.972873909868966e-06, "loss": 0.029625102877616882, "memory(GiB)": 21.48, "step": 16212, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.955562 }, { "epoch": 0.526686807653575, "grad_norm": 0.3893716633319855, "learning_rate": 4.972336764996954e-06, "loss": 0.02745361253619194, "memory(GiB)": 21.48, "step": 16213, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.5267192931163305, "grad_norm": 0.252084344625473, "learning_rate": 4.97179962044421e-06, "loss": 0.0160911213606596, "memory(GiB)": 21.48, "step": 16214, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.5267517785790858, "grad_norm": 0.3710465133190155, "learning_rate": 4.971262476216938e-06, "loss": 0.020779643207788467, "memory(GiB)": 21.48, "step": 16215, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.5267842640418413, "grad_norm": 0.49319198727607727, "learning_rate": 4.970725332321334e-06, "loss": 0.02507345750927925, "memory(GiB)": 21.48, "step": 16216, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.5268167495045967, "grad_norm": 0.33166712522506714, "learning_rate": 4.970188188763597e-06, "loss": 0.02105499431490898, "memory(GiB)": 21.48, "step": 16217, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.5268492349673521, "grad_norm": 0.39952918887138367, "learning_rate": 4.969651045549931e-06, "loss": 0.0189376138150692, "memory(GiB)": 21.48, "step": 16218, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.5268817204301075, "grad_norm": 0.2946290969848633, "learning_rate": 4.96911390268653e-06, "loss": 0.021409042179584503, "memory(GiB)": 21.48, "step": 16219, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.526914205892863, "grad_norm": 0.30892425775527954, "learning_rate": 4.968576760179598e-06, "loss": 0.023106249049305916, "memory(GiB)": 21.48, "step": 16220, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955653 }, { "epoch": 0.5269466913556183, "grad_norm": 0.475227415561676, "learning_rate": 4.96803961803533e-06, "loss": 0.03208870440721512, "memory(GiB)": 21.48, "step": 16221, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955666 }, { "epoch": 0.5269791768183738, "grad_norm": 0.3270283043384552, "learning_rate": 4.9675024762599285e-06, "loss": 0.015260564163327217, "memory(GiB)": 21.48, "step": 16222, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.5270116622811292, "grad_norm": 0.3721867501735687, "learning_rate": 4.966965334859591e-06, "loss": 0.02250261977314949, "memory(GiB)": 21.48, "step": 16223, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.5270441477438846, "grad_norm": 0.39464932680130005, "learning_rate": 4.966428193840518e-06, "loss": 0.022876178845763206, "memory(GiB)": 21.48, "step": 16224, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.52707663320664, "grad_norm": 0.33799460530281067, "learning_rate": 4.965891053208908e-06, "loss": 0.024201439693570137, "memory(GiB)": 21.48, "step": 16225, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.5271091186693955, "grad_norm": 0.2576005756855011, "learning_rate": 4.965353912970961e-06, "loss": 0.018374739214777946, "memory(GiB)": 21.48, "step": 16226, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.955728 }, { "epoch": 0.5271416041321508, "grad_norm": 0.21430756151676178, "learning_rate": 4.964816773132875e-06, "loss": 0.014101996086537838, "memory(GiB)": 21.48, "step": 16227, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.5271740895949063, "grad_norm": 0.3710618317127228, "learning_rate": 4.964279633700852e-06, "loss": 0.020030219107866287, "memory(GiB)": 21.48, "step": 16228, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.5272065750576617, "grad_norm": 0.7329292297363281, "learning_rate": 4.963742494681087e-06, "loss": 0.029773082584142685, "memory(GiB)": 21.48, "step": 16229, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.5272390605204171, "grad_norm": 0.46680358052253723, "learning_rate": 4.963205356079784e-06, "loss": 0.023478958755731583, "memory(GiB)": 21.48, "step": 16230, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.5272715459831725, "grad_norm": 0.4118156433105469, "learning_rate": 4.96266821790314e-06, "loss": 0.017966004088521004, "memory(GiB)": 21.48, "step": 16231, "token_acc": 1.0, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.527304031445928, "grad_norm": 0.3459497094154358, "learning_rate": 4.962131080157353e-06, "loss": 0.02231820486485958, "memory(GiB)": 21.48, "step": 16232, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.5273365169086833, "grad_norm": 0.3612702786922455, "learning_rate": 4.961593942848626e-06, "loss": 0.02341165766119957, "memory(GiB)": 21.48, "step": 16233, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.5273690023714388, "grad_norm": 0.2742403745651245, "learning_rate": 4.961056805983153e-06, "loss": 0.01617124304175377, "memory(GiB)": 21.48, "step": 16234, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955825 }, { "epoch": 0.5274014878341942, "grad_norm": 0.3485272526741028, "learning_rate": 4.96051966956714e-06, "loss": 0.020589441061019897, "memory(GiB)": 21.48, "step": 16235, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955836 }, { "epoch": 0.5274339732969496, "grad_norm": 0.3175720274448395, "learning_rate": 4.959982533606779e-06, "loss": 0.018657159060239792, "memory(GiB)": 21.48, "step": 16236, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955847 }, { "epoch": 0.527466458759705, "grad_norm": 0.42170509696006775, "learning_rate": 4.959445398108275e-06, "loss": 0.024166874587535858, "memory(GiB)": 21.48, "step": 16237, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.5274989442224605, "grad_norm": 0.38100722432136536, "learning_rate": 4.958908263077824e-06, "loss": 0.016868755221366882, "memory(GiB)": 21.48, "step": 16238, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.5275314296852158, "grad_norm": 0.34602198004722595, "learning_rate": 4.9583711285216284e-06, "loss": 0.01890830509364605, "memory(GiB)": 21.48, "step": 16239, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.5275639151479713, "grad_norm": 0.33388853073120117, "learning_rate": 4.957833994445883e-06, "loss": 0.022449307143688202, "memory(GiB)": 21.48, "step": 16240, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.5275964006107267, "grad_norm": 0.31525906920433044, "learning_rate": 4.957296860856791e-06, "loss": 0.01931234449148178, "memory(GiB)": 21.48, "step": 16241, "token_acc": 1.0, "train_speed(iter/s)": 0.955907 }, { "epoch": 0.5276288860734821, "grad_norm": 0.36139318346977234, "learning_rate": 4.956759727760548e-06, "loss": 0.02949051931500435, "memory(GiB)": 21.48, "step": 16242, "token_acc": 1.0, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.5276613715362375, "grad_norm": 0.31189998984336853, "learning_rate": 4.956222595163358e-06, "loss": 0.02064792811870575, "memory(GiB)": 21.48, "step": 16243, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.527693856998993, "grad_norm": 0.47797906398773193, "learning_rate": 4.955685463071416e-06, "loss": 0.028283370658755302, "memory(GiB)": 21.48, "step": 16244, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955938 }, { "epoch": 0.5277263424617483, "grad_norm": 0.4119974374771118, "learning_rate": 4.955148331490923e-06, "loss": 0.021536560729146004, "memory(GiB)": 21.48, "step": 16245, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.5277588279245038, "grad_norm": 0.2555552124977112, "learning_rate": 4.954611200428078e-06, "loss": 0.01625032164156437, "memory(GiB)": 21.48, "step": 16246, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955958 }, { "epoch": 0.5277913133872592, "grad_norm": 0.3586328625679016, "learning_rate": 4.9540740698890775e-06, "loss": 0.02213229611515999, "memory(GiB)": 21.48, "step": 16247, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.5278237988500146, "grad_norm": 0.5150686502456665, "learning_rate": 4.953536939880128e-06, "loss": 0.02997010201215744, "memory(GiB)": 21.48, "step": 16248, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.52785628431277, "grad_norm": 0.4708808660507202, "learning_rate": 4.952999810407421e-06, "loss": 0.022417772561311722, "memory(GiB)": 21.48, "step": 16249, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.5278887697755255, "grad_norm": 0.382514625787735, "learning_rate": 4.95246268147716e-06, "loss": 0.013915857300162315, "memory(GiB)": 21.48, "step": 16250, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.5279212552382808, "grad_norm": 0.39155393838882446, "learning_rate": 4.951925553095543e-06, "loss": 0.021098149940371513, "memory(GiB)": 21.48, "step": 16251, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.5279537407010363, "grad_norm": 0.40517833828926086, "learning_rate": 4.951388425268769e-06, "loss": 0.02142241969704628, "memory(GiB)": 21.48, "step": 16252, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.956011 }, { "epoch": 0.5279862261637916, "grad_norm": 0.4089035093784332, "learning_rate": 4.950851298003036e-06, "loss": 0.017908992245793343, "memory(GiB)": 21.48, "step": 16253, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.5280187116265471, "grad_norm": 0.31334230303764343, "learning_rate": 4.950314171304546e-06, "loss": 0.012107121758162975, "memory(GiB)": 21.48, "step": 16254, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.5280511970893025, "grad_norm": 0.29481494426727295, "learning_rate": 4.949777045179496e-06, "loss": 0.02084018848836422, "memory(GiB)": 21.48, "step": 16255, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.528083682552058, "grad_norm": 0.2856338620185852, "learning_rate": 4.949239919634086e-06, "loss": 0.015757670626044273, "memory(GiB)": 21.48, "step": 16256, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956049 }, { "epoch": 0.5281161680148134, "grad_norm": 0.2915746569633484, "learning_rate": 4.948702794674514e-06, "loss": 0.01876104809343815, "memory(GiB)": 21.48, "step": 16257, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.5281486534775688, "grad_norm": 0.42558765411376953, "learning_rate": 4.948165670306981e-06, "loss": 0.021135415881872177, "memory(GiB)": 21.48, "step": 16258, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.5281811389403243, "grad_norm": 0.3772987723350525, "learning_rate": 4.9476285465376845e-06, "loss": 0.023828556761145592, "memory(GiB)": 21.48, "step": 16259, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.5282136244030796, "grad_norm": 0.3729523718357086, "learning_rate": 4.947091423372825e-06, "loss": 0.020928626880049706, "memory(GiB)": 21.48, "step": 16260, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.5282461098658351, "grad_norm": 0.38928544521331787, "learning_rate": 4.9465543008186e-06, "loss": 0.017529943957924843, "memory(GiB)": 21.48, "step": 16261, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.5282785953285905, "grad_norm": 0.39332908391952515, "learning_rate": 4.94601717888121e-06, "loss": 0.020117487758398056, "memory(GiB)": 21.48, "step": 16262, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.5283110807913459, "grad_norm": 0.3329906165599823, "learning_rate": 4.945480057566853e-06, "loss": 0.013035843148827553, "memory(GiB)": 21.48, "step": 16263, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.5283435662541013, "grad_norm": 0.32469090819358826, "learning_rate": 4.944942936881729e-06, "loss": 0.015688009560108185, "memory(GiB)": 21.48, "step": 16264, "token_acc": 0.996, "train_speed(iter/s)": 0.956118 }, { "epoch": 0.5283760517168568, "grad_norm": 0.38787955045700073, "learning_rate": 4.944405816832038e-06, "loss": 0.019940020516514778, "memory(GiB)": 21.48, "step": 16265, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.5284085371796121, "grad_norm": 0.38648897409439087, "learning_rate": 4.943868697423976e-06, "loss": 0.021126993000507355, "memory(GiB)": 21.48, "step": 16266, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.5284410226423676, "grad_norm": 0.30771803855895996, "learning_rate": 4.943331578663747e-06, "loss": 0.014325175434350967, "memory(GiB)": 21.48, "step": 16267, "token_acc": 1.0, "train_speed(iter/s)": 0.956146 }, { "epoch": 0.528473508105123, "grad_norm": 0.428925484418869, "learning_rate": 4.942794460557543e-06, "loss": 0.018786516040563583, "memory(GiB)": 21.48, "step": 16268, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.5285059935678784, "grad_norm": 0.37700778245925903, "learning_rate": 4.94225734311157e-06, "loss": 0.022082053124904633, "memory(GiB)": 21.48, "step": 16269, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.5285384790306338, "grad_norm": 0.3205452263355255, "learning_rate": 4.9417202263320225e-06, "loss": 0.02235950529575348, "memory(GiB)": 21.48, "step": 16270, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.5285709644933893, "grad_norm": 0.30462396144866943, "learning_rate": 4.9411831102251026e-06, "loss": 0.014678527601063251, "memory(GiB)": 21.48, "step": 16271, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.5286034499561446, "grad_norm": 0.31268760561943054, "learning_rate": 4.940645994797006e-06, "loss": 0.02163432538509369, "memory(GiB)": 21.48, "step": 16272, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.5286359354189001, "grad_norm": 0.2994349002838135, "learning_rate": 4.940108880053936e-06, "loss": 0.013780273497104645, "memory(GiB)": 21.48, "step": 16273, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.5286684208816554, "grad_norm": 2.0337162017822266, "learning_rate": 4.9395717660020875e-06, "loss": 0.030297383666038513, "memory(GiB)": 21.48, "step": 16274, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.5287009063444109, "grad_norm": 0.23990857601165771, "learning_rate": 4.939034652647662e-06, "loss": 0.01615450531244278, "memory(GiB)": 21.48, "step": 16275, "token_acc": 1.0, "train_speed(iter/s)": 0.956222 }, { "epoch": 0.5287333918071663, "grad_norm": 0.43383660912513733, "learning_rate": 4.938497539996857e-06, "loss": 0.02204909548163414, "memory(GiB)": 21.48, "step": 16276, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.5287658772699217, "grad_norm": 0.3669760823249817, "learning_rate": 4.937960428055874e-06, "loss": 0.024050595238804817, "memory(GiB)": 21.48, "step": 16277, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.5287983627326771, "grad_norm": 0.5445210337638855, "learning_rate": 4.937423316830908e-06, "loss": 0.029995890334248543, "memory(GiB)": 21.48, "step": 16278, "token_acc": 1.0, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.5288308481954326, "grad_norm": 0.34044012427330017, "learning_rate": 4.93688620632816e-06, "loss": 0.024753667414188385, "memory(GiB)": 21.48, "step": 16279, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.5288633336581879, "grad_norm": 0.3860861361026764, "learning_rate": 4.936349096553831e-06, "loss": 0.020579468458890915, "memory(GiB)": 21.48, "step": 16280, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956272 }, { "epoch": 0.5288958191209434, "grad_norm": 0.34136319160461426, "learning_rate": 4.935811987514117e-06, "loss": 0.02205086499452591, "memory(GiB)": 21.48, "step": 16281, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.5289283045836988, "grad_norm": 0.43347543478012085, "learning_rate": 4.935274879215219e-06, "loss": 0.018170107156038284, "memory(GiB)": 21.48, "step": 16282, "token_acc": 0.9774436090225563, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.5289607900464542, "grad_norm": 0.4170038402080536, "learning_rate": 4.934737771663334e-06, "loss": 0.02423294074833393, "memory(GiB)": 21.48, "step": 16283, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956307 }, { "epoch": 0.5289932755092096, "grad_norm": 0.4691791534423828, "learning_rate": 4.934200664864663e-06, "loss": 0.02177128568291664, "memory(GiB)": 21.48, "step": 16284, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.5290257609719651, "grad_norm": 1.0715441703796387, "learning_rate": 4.933663558825403e-06, "loss": 0.02775881625711918, "memory(GiB)": 21.48, "step": 16285, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956331 }, { "epoch": 0.5290582464347204, "grad_norm": 0.4752189517021179, "learning_rate": 4.933126453551755e-06, "loss": 0.026721032336354256, "memory(GiB)": 21.48, "step": 16286, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956343 }, { "epoch": 0.5290907318974759, "grad_norm": 0.4876650869846344, "learning_rate": 4.932589349049916e-06, "loss": 0.02354222722351551, "memory(GiB)": 21.48, "step": 16287, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956355 }, { "epoch": 0.5291232173602313, "grad_norm": 1.5121057033538818, "learning_rate": 4.932052245326086e-06, "loss": 0.01978118345141411, "memory(GiB)": 21.48, "step": 16288, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.5291557028229867, "grad_norm": 0.32863208651542664, "learning_rate": 4.931515142386462e-06, "loss": 0.019765343517065048, "memory(GiB)": 21.48, "step": 16289, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95638 }, { "epoch": 0.5291881882857421, "grad_norm": 0.4431390166282654, "learning_rate": 4.930978040237246e-06, "loss": 0.023974012583494186, "memory(GiB)": 21.48, "step": 16290, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956391 }, { "epoch": 0.5292206737484976, "grad_norm": 0.3453657329082489, "learning_rate": 4.930440938884635e-06, "loss": 0.01226108893752098, "memory(GiB)": 21.48, "step": 16291, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956404 }, { "epoch": 0.5292531592112529, "grad_norm": 0.35756295919418335, "learning_rate": 4.929903838334829e-06, "loss": 0.025619931519031525, "memory(GiB)": 21.48, "step": 16292, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.5292856446740084, "grad_norm": 0.38127046823501587, "learning_rate": 4.929366738594023e-06, "loss": 0.02194982022047043, "memory(GiB)": 21.48, "step": 16293, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956427 }, { "epoch": 0.5293181301367638, "grad_norm": 0.30672842264175415, "learning_rate": 4.9288296396684206e-06, "loss": 0.017588140442967415, "memory(GiB)": 21.48, "step": 16294, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.5293506155995192, "grad_norm": 0.6906728148460388, "learning_rate": 4.92829254156422e-06, "loss": 0.024751098826527596, "memory(GiB)": 21.48, "step": 16295, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.5293831010622746, "grad_norm": 0.38449540734291077, "learning_rate": 4.927755444287617e-06, "loss": 0.01703782007098198, "memory(GiB)": 21.48, "step": 16296, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956464 }, { "epoch": 0.5294155865250301, "grad_norm": 0.3289647698402405, "learning_rate": 4.927218347844815e-06, "loss": 0.017673060297966003, "memory(GiB)": 21.48, "step": 16297, "token_acc": 1.0, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.5294480719877854, "grad_norm": 0.4012846052646637, "learning_rate": 4.9266812522420074e-06, "loss": 0.022061550989747047, "memory(GiB)": 21.48, "step": 16298, "token_acc": 0.9932659932659933, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.5294805574505409, "grad_norm": 0.21715570986270905, "learning_rate": 4.926144157485398e-06, "loss": 0.011611465364694595, "memory(GiB)": 21.48, "step": 16299, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.5295130429132963, "grad_norm": 0.412019819021225, "learning_rate": 4.925607063581181e-06, "loss": 0.018710220232605934, "memory(GiB)": 21.48, "step": 16300, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.5295455283760517, "grad_norm": 0.3174039125442505, "learning_rate": 4.92506997053556e-06, "loss": 0.02270922251045704, "memory(GiB)": 21.48, "step": 16301, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.5295780138388071, "grad_norm": 0.27756035327911377, "learning_rate": 4.92453287835473e-06, "loss": 0.013515334576368332, "memory(GiB)": 21.48, "step": 16302, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.5296104993015626, "grad_norm": 0.28345537185668945, "learning_rate": 4.923995787044891e-06, "loss": 0.021086111664772034, "memory(GiB)": 21.48, "step": 16303, "token_acc": 1.0, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.5296429847643179, "grad_norm": 0.44320330023765564, "learning_rate": 4.923458696612242e-06, "loss": 0.021677618846297264, "memory(GiB)": 21.48, "step": 16304, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.5296754702270734, "grad_norm": 0.30995845794677734, "learning_rate": 4.922921607062981e-06, "loss": 0.021132707595825195, "memory(GiB)": 21.48, "step": 16305, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956562 }, { "epoch": 0.5297079556898288, "grad_norm": 0.204905703663826, "learning_rate": 4.922384518403308e-06, "loss": 0.01734854094684124, "memory(GiB)": 21.48, "step": 16306, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956571 }, { "epoch": 0.5297404411525842, "grad_norm": 0.46164461970329285, "learning_rate": 4.9218474306394204e-06, "loss": 0.02073173224925995, "memory(GiB)": 21.48, "step": 16307, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.5297729266153396, "grad_norm": 0.3631772994995117, "learning_rate": 4.9213103437775175e-06, "loss": 0.022372514009475708, "memory(GiB)": 21.48, "step": 16308, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.5298054120780951, "grad_norm": 0.398055762052536, "learning_rate": 4.920773257823797e-06, "loss": 0.027984706684947014, "memory(GiB)": 21.48, "step": 16309, "token_acc": 1.0, "train_speed(iter/s)": 0.9566 }, { "epoch": 0.5298378975408504, "grad_norm": 0.2694995701313019, "learning_rate": 4.92023617278446e-06, "loss": 0.01758422702550888, "memory(GiB)": 21.48, "step": 16310, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.956609 }, { "epoch": 0.5298703830036059, "grad_norm": 0.31063687801361084, "learning_rate": 4.919699088665703e-06, "loss": 0.020656827837228775, "memory(GiB)": 21.48, "step": 16311, "token_acc": 0.98046875, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.5299028684663613, "grad_norm": 0.375652015209198, "learning_rate": 4.9191620054737264e-06, "loss": 0.02367505058646202, "memory(GiB)": 21.48, "step": 16312, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.956626 }, { "epoch": 0.5299353539291167, "grad_norm": 0.44007793068885803, "learning_rate": 4.918624923214727e-06, "loss": 0.020769719034433365, "memory(GiB)": 21.48, "step": 16313, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.5299678393918721, "grad_norm": 0.3005051612854004, "learning_rate": 4.9180878418949045e-06, "loss": 0.013999124057590961, "memory(GiB)": 21.48, "step": 16314, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.5300003248546276, "grad_norm": 0.3842286765575409, "learning_rate": 4.9175507615204574e-06, "loss": 0.02551114559173584, "memory(GiB)": 21.48, "step": 16315, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.5300328103173829, "grad_norm": 0.3344385027885437, "learning_rate": 4.917013682097584e-06, "loss": 0.0199847724288702, "memory(GiB)": 21.48, "step": 16316, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956664 }, { "epoch": 0.5300652957801384, "grad_norm": 0.4374436140060425, "learning_rate": 4.916476603632483e-06, "loss": 0.020817618817090988, "memory(GiB)": 21.48, "step": 16317, "token_acc": 1.0, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.5300977812428938, "grad_norm": 0.37188559770584106, "learning_rate": 4.915939526131354e-06, "loss": 0.02522394061088562, "memory(GiB)": 21.48, "step": 16318, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.5301302667056492, "grad_norm": 0.3737264573574066, "learning_rate": 4.915402449600394e-06, "loss": 0.014696182683110237, "memory(GiB)": 21.48, "step": 16319, "token_acc": 1.0, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.5301627521684047, "grad_norm": 0.2854233682155609, "learning_rate": 4.914865374045804e-06, "loss": 0.02197130396962166, "memory(GiB)": 21.48, "step": 16320, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.9567 }, { "epoch": 0.53019523763116, "grad_norm": 0.2769180238246918, "learning_rate": 4.914328299473778e-06, "loss": 0.013189326040446758, "memory(GiB)": 21.48, "step": 16321, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.5302277230939155, "grad_norm": 0.33463117480278015, "learning_rate": 4.913791225890521e-06, "loss": 0.018684983253479004, "memory(GiB)": 21.48, "step": 16322, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.5302602085566709, "grad_norm": 0.3136075437068939, "learning_rate": 4.913254153302223e-06, "loss": 0.017058484256267548, "memory(GiB)": 21.48, "step": 16323, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.5302926940194264, "grad_norm": 0.30316272377967834, "learning_rate": 4.912717081715091e-06, "loss": 0.017041031271219254, "memory(GiB)": 21.48, "step": 16324, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956736 }, { "epoch": 0.5303251794821817, "grad_norm": 0.371501088142395, "learning_rate": 4.912180011135321e-06, "loss": 0.020903851836919785, "memory(GiB)": 21.48, "step": 16325, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956746 }, { "epoch": 0.5303576649449372, "grad_norm": 0.3356177806854248, "learning_rate": 4.9116429415691065e-06, "loss": 0.021359721198678017, "memory(GiB)": 21.48, "step": 16326, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956756 }, { "epoch": 0.5303901504076926, "grad_norm": 0.38419803977012634, "learning_rate": 4.911105873022655e-06, "loss": 0.019012033939361572, "memory(GiB)": 21.48, "step": 16327, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.530422635870448, "grad_norm": 0.2872975468635559, "learning_rate": 4.910568805502155e-06, "loss": 0.016183219850063324, "memory(GiB)": 21.48, "step": 16328, "token_acc": 1.0, "train_speed(iter/s)": 0.956774 }, { "epoch": 0.5304551213332034, "grad_norm": 0.2702539265155792, "learning_rate": 4.9100317390138156e-06, "loss": 0.013237372040748596, "memory(GiB)": 21.48, "step": 16329, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956783 }, { "epoch": 0.5304876067959589, "grad_norm": 0.3764024078845978, "learning_rate": 4.909494673563826e-06, "loss": 0.02150241658091545, "memory(GiB)": 21.48, "step": 16330, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956792 }, { "epoch": 0.5305200922587142, "grad_norm": 0.40569618344306946, "learning_rate": 4.908957609158389e-06, "loss": 0.018231097608804703, "memory(GiB)": 21.48, "step": 16331, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.5305525777214697, "grad_norm": 0.3519703447818756, "learning_rate": 4.908420545803702e-06, "loss": 0.0156698040664196, "memory(GiB)": 21.48, "step": 16332, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.95681 }, { "epoch": 0.530585063184225, "grad_norm": 0.2646985948085785, "learning_rate": 4.907883483505965e-06, "loss": 0.01568150892853737, "memory(GiB)": 21.48, "step": 16333, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.5306175486469805, "grad_norm": 0.425434410572052, "learning_rate": 4.907346422271374e-06, "loss": 0.026475634425878525, "memory(GiB)": 21.48, "step": 16334, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.5306500341097359, "grad_norm": 0.3221217691898346, "learning_rate": 4.90680936210613e-06, "loss": 0.01863652467727661, "memory(GiB)": 21.48, "step": 16335, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.5306825195724914, "grad_norm": 0.14854846894741058, "learning_rate": 4.906272303016428e-06, "loss": 0.012164007872343063, "memory(GiB)": 21.48, "step": 16336, "token_acc": 1.0, "train_speed(iter/s)": 0.956849 }, { "epoch": 0.5307150050352467, "grad_norm": 0.26164308190345764, "learning_rate": 4.90573524500847e-06, "loss": 0.015810592100024223, "memory(GiB)": 21.48, "step": 16337, "token_acc": 1.0, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.5307474904980022, "grad_norm": 0.3823453187942505, "learning_rate": 4.905198188088451e-06, "loss": 0.02187167853116989, "memory(GiB)": 21.48, "step": 16338, "token_acc": 1.0, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.5307799759607575, "grad_norm": 0.5029954314231873, "learning_rate": 4.904661132262573e-06, "loss": 0.029985643923282623, "memory(GiB)": 21.48, "step": 16339, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956875 }, { "epoch": 0.530812461423513, "grad_norm": 0.3757797181606293, "learning_rate": 4.904124077537031e-06, "loss": 0.02044188603758812, "memory(GiB)": 21.48, "step": 16340, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.956885 }, { "epoch": 0.5308449468862684, "grad_norm": 0.36865541338920593, "learning_rate": 4.903587023918025e-06, "loss": 0.023635704070329666, "memory(GiB)": 21.48, "step": 16341, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956894 }, { "epoch": 0.5308774323490238, "grad_norm": 0.48329436779022217, "learning_rate": 4.903049971411755e-06, "loss": 0.01840343326330185, "memory(GiB)": 21.48, "step": 16342, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.5309099178117792, "grad_norm": 0.33535629510879517, "learning_rate": 4.902512920024415e-06, "loss": 0.01778470352292061, "memory(GiB)": 21.48, "step": 16343, "token_acc": 1.0, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.5309424032745347, "grad_norm": 0.3846135437488556, "learning_rate": 4.901975869762206e-06, "loss": 0.022695330902934074, "memory(GiB)": 21.48, "step": 16344, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.53097488873729, "grad_norm": 0.3839251399040222, "learning_rate": 4.901438820631327e-06, "loss": 0.01725134626030922, "memory(GiB)": 21.48, "step": 16345, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956935 }, { "epoch": 0.5310073742000455, "grad_norm": 0.4376799762248993, "learning_rate": 4.900901772637975e-06, "loss": 0.016642669215798378, "memory(GiB)": 21.48, "step": 16346, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.5310398596628009, "grad_norm": 1.2174235582351685, "learning_rate": 4.9003647257883475e-06, "loss": 0.02672387659549713, "memory(GiB)": 21.48, "step": 16347, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956959 }, { "epoch": 0.5310723451255563, "grad_norm": 0.4520404636859894, "learning_rate": 4.899827680088646e-06, "loss": 0.020999688655138016, "memory(GiB)": 21.48, "step": 16348, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956971 }, { "epoch": 0.5311048305883117, "grad_norm": 0.28664910793304443, "learning_rate": 4.899290635545063e-06, "loss": 0.016295991837978363, "memory(GiB)": 21.48, "step": 16349, "token_acc": 1.0, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.5311373160510672, "grad_norm": 0.5985412001609802, "learning_rate": 4.898753592163803e-06, "loss": 0.019947156310081482, "memory(GiB)": 21.48, "step": 16350, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.956996 }, { "epoch": 0.5311698015138225, "grad_norm": 0.3566044569015503, "learning_rate": 4.898216549951059e-06, "loss": 0.01543533243238926, "memory(GiB)": 21.48, "step": 16351, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957007 }, { "epoch": 0.531202286976578, "grad_norm": 0.5335527062416077, "learning_rate": 4.897679508913034e-06, "loss": 0.022509336471557617, "memory(GiB)": 21.48, "step": 16352, "token_acc": 1.0, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.5312347724393334, "grad_norm": 0.37677764892578125, "learning_rate": 4.897142469055922e-06, "loss": 0.01914406381547451, "memory(GiB)": 21.48, "step": 16353, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.5312672579020888, "grad_norm": 0.46684861183166504, "learning_rate": 4.896605430385924e-06, "loss": 0.021456053480505943, "memory(GiB)": 21.48, "step": 16354, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.5312997433648442, "grad_norm": 0.45093590021133423, "learning_rate": 4.896068392909235e-06, "loss": 0.028440941125154495, "memory(GiB)": 21.48, "step": 16355, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.5313322288275997, "grad_norm": 0.5056983232498169, "learning_rate": 4.895531356632054e-06, "loss": 0.02447843924164772, "memory(GiB)": 21.48, "step": 16356, "token_acc": 1.0, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.531364714290355, "grad_norm": 0.4482913911342621, "learning_rate": 4.894994321560585e-06, "loss": 0.02048645168542862, "memory(GiB)": 21.48, "step": 16357, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.5313971997531105, "grad_norm": 0.38926494121551514, "learning_rate": 4.894457287701017e-06, "loss": 0.018744351342320442, "memory(GiB)": 21.48, "step": 16358, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.5314296852158659, "grad_norm": 0.47512251138687134, "learning_rate": 4.893920255059555e-06, "loss": 0.025685906410217285, "memory(GiB)": 21.48, "step": 16359, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.5314621706786213, "grad_norm": 0.44116753339767456, "learning_rate": 4.893383223642393e-06, "loss": 0.0236147940158844, "memory(GiB)": 21.48, "step": 16360, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.957116 }, { "epoch": 0.5314946561413767, "grad_norm": 0.3439151644706726, "learning_rate": 4.892846193455732e-06, "loss": 0.019733978435397148, "memory(GiB)": 21.48, "step": 16361, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957126 }, { "epoch": 0.5315271416041322, "grad_norm": 0.42246371507644653, "learning_rate": 4.892309164505766e-06, "loss": 0.023976949974894524, "memory(GiB)": 21.48, "step": 16362, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957135 }, { "epoch": 0.5315596270668875, "grad_norm": 0.4807449281215668, "learning_rate": 4.891772136798698e-06, "loss": 0.017633112147450447, "memory(GiB)": 21.48, "step": 16363, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.531592112529643, "grad_norm": 0.3773491382598877, "learning_rate": 4.891235110340722e-06, "loss": 0.022810885682702065, "memory(GiB)": 21.48, "step": 16364, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957154 }, { "epoch": 0.5316245979923984, "grad_norm": 0.2535208463668823, "learning_rate": 4.890698085138039e-06, "loss": 0.02056056447327137, "memory(GiB)": 21.48, "step": 16365, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.957164 }, { "epoch": 0.5316570834551538, "grad_norm": 0.3791359066963196, "learning_rate": 4.890161061196844e-06, "loss": 0.02232368476688862, "memory(GiB)": 21.48, "step": 16366, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.5316895689179092, "grad_norm": 0.3930113911628723, "learning_rate": 4.8896240385233375e-06, "loss": 0.021735070273280144, "memory(GiB)": 21.48, "step": 16367, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957182 }, { "epoch": 0.5317220543806647, "grad_norm": 0.8311419486999512, "learning_rate": 4.889087017123716e-06, "loss": 0.02137969434261322, "memory(GiB)": 21.48, "step": 16368, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957192 }, { "epoch": 0.53175453984342, "grad_norm": 0.6495532393455505, "learning_rate": 4.888549997004179e-06, "loss": 0.030076181516051292, "memory(GiB)": 21.48, "step": 16369, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.5317870253061755, "grad_norm": 0.3678906559944153, "learning_rate": 4.888012978170922e-06, "loss": 0.022251268848776817, "memory(GiB)": 21.48, "step": 16370, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.5318195107689309, "grad_norm": 0.33617162704467773, "learning_rate": 4.8874759606301445e-06, "loss": 0.023966152220964432, "memory(GiB)": 21.48, "step": 16371, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.5318519962316863, "grad_norm": 0.2712152898311615, "learning_rate": 4.886938944388045e-06, "loss": 0.01377926766872406, "memory(GiB)": 21.48, "step": 16372, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.95723 }, { "epoch": 0.5318844816944417, "grad_norm": 0.369373083114624, "learning_rate": 4.88640192945082e-06, "loss": 0.017997097223997116, "memory(GiB)": 21.48, "step": 16373, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957239 }, { "epoch": 0.5319169671571972, "grad_norm": 1.6521276235580444, "learning_rate": 4.885864915824669e-06, "loss": 0.02308277226984501, "memory(GiB)": 21.48, "step": 16374, "token_acc": 0.9583333333333334, "train_speed(iter/s)": 0.957248 }, { "epoch": 0.5319494526199525, "grad_norm": 0.4909634590148926, "learning_rate": 4.885327903515787e-06, "loss": 0.0255591943860054, "memory(GiB)": 21.48, "step": 16375, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.531981938082708, "grad_norm": 0.34047403931617737, "learning_rate": 4.884790892530376e-06, "loss": 0.02535424754023552, "memory(GiB)": 21.48, "step": 16376, "token_acc": 0.9771689497716894, "train_speed(iter/s)": 0.957267 }, { "epoch": 0.5320144235454634, "grad_norm": 0.36904406547546387, "learning_rate": 4.884253882874629e-06, "loss": 0.030430955812335014, "memory(GiB)": 21.48, "step": 16377, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.957277 }, { "epoch": 0.5320469090082188, "grad_norm": 0.3524322211742401, "learning_rate": 4.883716874554748e-06, "loss": 0.017523830756545067, "memory(GiB)": 21.48, "step": 16378, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.5320793944709742, "grad_norm": 0.29974475502967834, "learning_rate": 4.883179867576927e-06, "loss": 0.021853484213352203, "memory(GiB)": 21.48, "step": 16379, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957296 }, { "epoch": 0.5321118799337297, "grad_norm": 0.31322628259658813, "learning_rate": 4.882642861947369e-06, "loss": 0.02187507227063179, "memory(GiB)": 21.48, "step": 16380, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.532144365396485, "grad_norm": 0.36502543091773987, "learning_rate": 4.882105857672266e-06, "loss": 0.023323938250541687, "memory(GiB)": 21.48, "step": 16381, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957319 }, { "epoch": 0.5321768508592405, "grad_norm": 0.5373106002807617, "learning_rate": 4.881568854757819e-06, "loss": 0.02263525128364563, "memory(GiB)": 21.48, "step": 16382, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957332 }, { "epoch": 0.5322093363219959, "grad_norm": 0.29873085021972656, "learning_rate": 4.881031853210225e-06, "loss": 0.015540027990937233, "memory(GiB)": 21.48, "step": 16383, "token_acc": 1.0, "train_speed(iter/s)": 0.957345 }, { "epoch": 0.5322418217847513, "grad_norm": 0.2686860263347626, "learning_rate": 4.880494853035682e-06, "loss": 0.017441514879465103, "memory(GiB)": 21.48, "step": 16384, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.5322743072475068, "grad_norm": 0.318263977766037, "learning_rate": 4.879957854240387e-06, "loss": 0.01829354465007782, "memory(GiB)": 21.48, "step": 16385, "token_acc": 1.0, "train_speed(iter/s)": 0.957367 }, { "epoch": 0.5323067927102622, "grad_norm": 0.458541601896286, "learning_rate": 4.8794208568305375e-06, "loss": 0.02981405146420002, "memory(GiB)": 21.48, "step": 16386, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957376 }, { "epoch": 0.5323392781730176, "grad_norm": 0.30773067474365234, "learning_rate": 4.878883860812334e-06, "loss": 0.017880603671073914, "memory(GiB)": 21.48, "step": 16387, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.532371763635773, "grad_norm": 0.318459689617157, "learning_rate": 4.878346866191969e-06, "loss": 0.02042161300778389, "memory(GiB)": 21.48, "step": 16388, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.5324042490985285, "grad_norm": 0.41139987111091614, "learning_rate": 4.877809872975646e-06, "loss": 0.017383603379130363, "memory(GiB)": 21.48, "step": 16389, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957405 }, { "epoch": 0.5324367345612838, "grad_norm": 0.3497026860713959, "learning_rate": 4.8772728811695574e-06, "loss": 0.021763615310192108, "memory(GiB)": 21.48, "step": 16390, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.5324692200240393, "grad_norm": 0.2945813834667206, "learning_rate": 4.8767358907799045e-06, "loss": 0.018117059022188187, "memory(GiB)": 21.48, "step": 16391, "token_acc": 1.0, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.5325017054867947, "grad_norm": 0.3934534192085266, "learning_rate": 4.876198901812883e-06, "loss": 0.02747276984155178, "memory(GiB)": 21.48, "step": 16392, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957434 }, { "epoch": 0.5325341909495501, "grad_norm": 0.47507986426353455, "learning_rate": 4.875661914274691e-06, "loss": 0.021679406985640526, "memory(GiB)": 21.48, "step": 16393, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.5325666764123055, "grad_norm": 0.40866371989250183, "learning_rate": 4.875124928171526e-06, "loss": 0.01986975036561489, "memory(GiB)": 21.48, "step": 16394, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.532599161875061, "grad_norm": 0.31898853182792664, "learning_rate": 4.874587943509586e-06, "loss": 0.0200902558863163, "memory(GiB)": 21.48, "step": 16395, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.5326316473378163, "grad_norm": 0.34099435806274414, "learning_rate": 4.874050960295067e-06, "loss": 0.020739106461405754, "memory(GiB)": 21.48, "step": 16396, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957472 }, { "epoch": 0.5326641328005718, "grad_norm": 0.3220469653606415, "learning_rate": 4.873513978534169e-06, "loss": 0.019027408212423325, "memory(GiB)": 21.48, "step": 16397, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.5326966182633271, "grad_norm": 0.3701182007789612, "learning_rate": 4.872976998233087e-06, "loss": 0.015864774584770203, "memory(GiB)": 21.48, "step": 16398, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.95749 }, { "epoch": 0.5327291037260826, "grad_norm": 0.34103304147720337, "learning_rate": 4.8724400193980205e-06, "loss": 0.01831737346947193, "memory(GiB)": 21.48, "step": 16399, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.532761589188838, "grad_norm": 0.3992672264575958, "learning_rate": 4.871903042035164e-06, "loss": 0.023866023868322372, "memory(GiB)": 21.48, "step": 16400, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.5327940746515935, "grad_norm": 0.45550212264060974, "learning_rate": 4.871366066150718e-06, "loss": 0.02320638671517372, "memory(GiB)": 21.48, "step": 16401, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.5328265601143488, "grad_norm": 0.41581323742866516, "learning_rate": 4.870829091750879e-06, "loss": 0.020029088482260704, "memory(GiB)": 21.48, "step": 16402, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957532 }, { "epoch": 0.5328590455771043, "grad_norm": 0.27224990725517273, "learning_rate": 4.870292118841844e-06, "loss": 0.018643047660589218, "memory(GiB)": 21.48, "step": 16403, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957539 }, { "epoch": 0.5328915310398596, "grad_norm": 0.5308165550231934, "learning_rate": 4.869755147429812e-06, "loss": 0.02825421839952469, "memory(GiB)": 21.48, "step": 16404, "token_acc": 0.984375, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.5329240165026151, "grad_norm": 0.31236422061920166, "learning_rate": 4.869218177520976e-06, "loss": 0.017833691090345383, "memory(GiB)": 21.48, "step": 16405, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.5329565019653705, "grad_norm": 0.26191362738609314, "learning_rate": 4.8686812091215395e-06, "loss": 0.016394738107919693, "memory(GiB)": 21.48, "step": 16406, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.532988987428126, "grad_norm": 0.34088465571403503, "learning_rate": 4.868144242237694e-06, "loss": 0.01970832422375679, "memory(GiB)": 21.48, "step": 16407, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.5330214728908813, "grad_norm": 0.3495713174343109, "learning_rate": 4.867607276875642e-06, "loss": 0.011396708898246288, "memory(GiB)": 21.48, "step": 16408, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.5330539583536368, "grad_norm": 0.2425379753112793, "learning_rate": 4.8670703130415755e-06, "loss": 0.01628991588950157, "memory(GiB)": 21.48, "step": 16409, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.9576 }, { "epoch": 0.5330864438163921, "grad_norm": 0.4348738491535187, "learning_rate": 4.866533350741697e-06, "loss": 0.03128235414624214, "memory(GiB)": 21.48, "step": 16410, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.5331189292791476, "grad_norm": 0.2961963415145874, "learning_rate": 4.865996389982199e-06, "loss": 0.01926380768418312, "memory(GiB)": 21.48, "step": 16411, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.533151414741903, "grad_norm": 0.4087742567062378, "learning_rate": 4.865459430769283e-06, "loss": 0.030194628983736038, "memory(GiB)": 21.48, "step": 16412, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.5331839002046584, "grad_norm": 0.36051318049430847, "learning_rate": 4.864922473109143e-06, "loss": 0.015978405252099037, "memory(GiB)": 21.48, "step": 16413, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.5332163856674138, "grad_norm": 0.33083784580230713, "learning_rate": 4.8643855170079785e-06, "loss": 0.01942082680761814, "memory(GiB)": 21.48, "step": 16414, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.5332488711301693, "grad_norm": 0.3572314977645874, "learning_rate": 4.863848562471984e-06, "loss": 0.02656031772494316, "memory(GiB)": 21.48, "step": 16415, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.5332813565929246, "grad_norm": 0.4611055850982666, "learning_rate": 4.86331160950736e-06, "loss": 0.022699933499097824, "memory(GiB)": 21.48, "step": 16416, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.5333138420556801, "grad_norm": 0.4283570647239685, "learning_rate": 4.8627746581203e-06, "loss": 0.015969786792993546, "memory(GiB)": 21.48, "step": 16417, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.5333463275184355, "grad_norm": 0.345146507024765, "learning_rate": 4.862237708317004e-06, "loss": 0.018468007445335388, "memory(GiB)": 21.48, "step": 16418, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.5333788129811909, "grad_norm": 0.3656686246395111, "learning_rate": 4.861700760103669e-06, "loss": 0.01671230047941208, "memory(GiB)": 21.48, "step": 16419, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.5334112984439463, "grad_norm": 0.3424414396286011, "learning_rate": 4.861163813486491e-06, "loss": 0.016713574528694153, "memory(GiB)": 21.48, "step": 16420, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.5334437839067018, "grad_norm": 0.42104044556617737, "learning_rate": 4.860626868471667e-06, "loss": 0.020635100081562996, "memory(GiB)": 21.48, "step": 16421, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.5334762693694571, "grad_norm": 0.5961227416992188, "learning_rate": 4.860089925065394e-06, "loss": 0.027836311608552933, "memory(GiB)": 21.48, "step": 16422, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957754 }, { "epoch": 0.5335087548322126, "grad_norm": 0.5485398769378662, "learning_rate": 4.8595529832738706e-06, "loss": 0.019580524414777756, "memory(GiB)": 21.48, "step": 16423, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957764 }, { "epoch": 0.533541240294968, "grad_norm": 0.3477337956428528, "learning_rate": 4.8590160431032925e-06, "loss": 0.014054197818040848, "memory(GiB)": 21.48, "step": 16424, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.5335737257577234, "grad_norm": 0.3748767375946045, "learning_rate": 4.858479104559856e-06, "loss": 0.013814263977110386, "memory(GiB)": 21.48, "step": 16425, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.5336062112204788, "grad_norm": 0.4206070899963379, "learning_rate": 4.85794216764976e-06, "loss": 0.019653186202049255, "memory(GiB)": 21.48, "step": 16426, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.5336386966832343, "grad_norm": 0.31582704186439514, "learning_rate": 4.857405232379201e-06, "loss": 0.01676824316382408, "memory(GiB)": 21.48, "step": 16427, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.5336711821459896, "grad_norm": 0.3209061026573181, "learning_rate": 4.8568682987543735e-06, "loss": 0.0196908637881279, "memory(GiB)": 21.48, "step": 16428, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.5337036676087451, "grad_norm": 0.285788893699646, "learning_rate": 4.856331366781479e-06, "loss": 0.016322236508131027, "memory(GiB)": 21.48, "step": 16429, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957822 }, { "epoch": 0.5337361530715005, "grad_norm": 0.2995230257511139, "learning_rate": 4.855794436466708e-06, "loss": 0.013048638589680195, "memory(GiB)": 21.48, "step": 16430, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.5337686385342559, "grad_norm": 0.3253219425678253, "learning_rate": 4.855257507816265e-06, "loss": 0.015320910140872002, "memory(GiB)": 21.48, "step": 16431, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.5338011239970113, "grad_norm": 0.3291589021682739, "learning_rate": 4.854720580836341e-06, "loss": 0.023104682564735413, "memory(GiB)": 21.48, "step": 16432, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.5338336094597668, "grad_norm": 0.33096668124198914, "learning_rate": 4.854183655533133e-06, "loss": 0.018459532409906387, "memory(GiB)": 21.48, "step": 16433, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.5338660949225221, "grad_norm": 0.32103031873703003, "learning_rate": 4.853646731912844e-06, "loss": 0.01920998841524124, "memory(GiB)": 21.48, "step": 16434, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.95787 }, { "epoch": 0.5338985803852776, "grad_norm": 0.4583573043346405, "learning_rate": 4.8531098099816626e-06, "loss": 0.022835638374090195, "memory(GiB)": 21.48, "step": 16435, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.533931065848033, "grad_norm": 0.2971877455711365, "learning_rate": 4.852572889745792e-06, "loss": 0.016025736927986145, "memory(GiB)": 21.48, "step": 16436, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.5339635513107884, "grad_norm": 0.3435492515563965, "learning_rate": 4.852035971211426e-06, "loss": 0.02084418199956417, "memory(GiB)": 21.48, "step": 16437, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.5339960367735438, "grad_norm": 0.23595888912677765, "learning_rate": 4.8514990543847615e-06, "loss": 0.01852220669388771, "memory(GiB)": 21.48, "step": 16438, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.957908 }, { "epoch": 0.5340285222362993, "grad_norm": 0.43269526958465576, "learning_rate": 4.850962139271996e-06, "loss": 0.016567803919315338, "memory(GiB)": 21.48, "step": 16439, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.5340610076990546, "grad_norm": 0.31645849347114563, "learning_rate": 4.850425225879326e-06, "loss": 0.019315220415592194, "memory(GiB)": 21.48, "step": 16440, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.5340934931618101, "grad_norm": 0.3850494921207428, "learning_rate": 4.849888314212946e-06, "loss": 0.025615790858864784, "memory(GiB)": 21.48, "step": 16441, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957943 }, { "epoch": 0.5341259786245655, "grad_norm": 0.3063304126262665, "learning_rate": 4.849351404279057e-06, "loss": 0.016961675137281418, "memory(GiB)": 21.48, "step": 16442, "token_acc": 1.0, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.5341584640873209, "grad_norm": 0.3615315556526184, "learning_rate": 4.848814496083851e-06, "loss": 0.025375520810484886, "memory(GiB)": 21.48, "step": 16443, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.5341909495500763, "grad_norm": 0.4073669910430908, "learning_rate": 4.8482775896335295e-06, "loss": 0.021294180303812027, "memory(GiB)": 21.48, "step": 16444, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.5342234350128318, "grad_norm": 0.34143489599227905, "learning_rate": 4.847740684934285e-06, "loss": 0.023351717740297318, "memory(GiB)": 21.48, "step": 16445, "token_acc": 0.9937888198757764, "train_speed(iter/s)": 0.957993 }, { "epoch": 0.5342559204755871, "grad_norm": 0.4341824948787689, "learning_rate": 4.847203781992316e-06, "loss": 0.01882108300924301, "memory(GiB)": 21.48, "step": 16446, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.5342884059383426, "grad_norm": 0.5107173323631287, "learning_rate": 4.846666880813818e-06, "loss": 0.020023629069328308, "memory(GiB)": 21.48, "step": 16447, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958017 }, { "epoch": 0.5343208914010981, "grad_norm": 0.43419724702835083, "learning_rate": 4.846129981404989e-06, "loss": 0.019453562796115875, "memory(GiB)": 21.48, "step": 16448, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.95803 }, { "epoch": 0.5343533768638534, "grad_norm": 0.20987258851528168, "learning_rate": 4.845593083772025e-06, "loss": 0.012861503288149834, "memory(GiB)": 21.48, "step": 16449, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.5343858623266089, "grad_norm": 0.27515530586242676, "learning_rate": 4.845056187921122e-06, "loss": 0.02165275812149048, "memory(GiB)": 21.48, "step": 16450, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.5344183477893643, "grad_norm": 0.44480738043785095, "learning_rate": 4.844519293858477e-06, "loss": 0.03299336135387421, "memory(GiB)": 21.48, "step": 16451, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.5344508332521197, "grad_norm": 0.4090331494808197, "learning_rate": 4.843982401590285e-06, "loss": 0.02233530767261982, "memory(GiB)": 21.48, "step": 16452, "token_acc": 1.0, "train_speed(iter/s)": 0.95807 }, { "epoch": 0.5344833187148751, "grad_norm": 0.5838001370429993, "learning_rate": 4.843445511122747e-06, "loss": 0.02933541312813759, "memory(GiB)": 21.48, "step": 16453, "token_acc": 1.0, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.5345158041776306, "grad_norm": 0.3549306392669678, "learning_rate": 4.842908622462052e-06, "loss": 0.019258547574281693, "memory(GiB)": 21.48, "step": 16454, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.5345482896403859, "grad_norm": 0.3018769323825836, "learning_rate": 4.842371735614405e-06, "loss": 0.01732458546757698, "memory(GiB)": 21.48, "step": 16455, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.5345807751031414, "grad_norm": 0.36870625615119934, "learning_rate": 4.841834850585994e-06, "loss": 0.024276215583086014, "memory(GiB)": 21.48, "step": 16456, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.95811 }, { "epoch": 0.5346132605658968, "grad_norm": 0.41432610154151917, "learning_rate": 4.841297967383023e-06, "loss": 0.020286984741687775, "memory(GiB)": 21.48, "step": 16457, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.5346457460286522, "grad_norm": 0.34366172552108765, "learning_rate": 4.840761086011681e-06, "loss": 0.01773020625114441, "memory(GiB)": 21.48, "step": 16458, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.5346782314914076, "grad_norm": 0.27703458070755005, "learning_rate": 4.840224206478171e-06, "loss": 0.015923351049423218, "memory(GiB)": 21.48, "step": 16459, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958137 }, { "epoch": 0.5347107169541631, "grad_norm": 0.44801849126815796, "learning_rate": 4.8396873287886844e-06, "loss": 0.017361655831336975, "memory(GiB)": 21.48, "step": 16460, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.958147 }, { "epoch": 0.5347432024169184, "grad_norm": 0.39155223965644836, "learning_rate": 4.839150452949421e-06, "loss": 0.013069471344351768, "memory(GiB)": 21.48, "step": 16461, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.958156 }, { "epoch": 0.5347756878796739, "grad_norm": 0.36347073316574097, "learning_rate": 4.8386135789665735e-06, "loss": 0.022114573046565056, "memory(GiB)": 21.48, "step": 16462, "token_acc": 0.988, "train_speed(iter/s)": 0.958166 }, { "epoch": 0.5348081733424292, "grad_norm": 0.28943079710006714, "learning_rate": 4.8380767068463395e-06, "loss": 0.019584858790040016, "memory(GiB)": 21.48, "step": 16463, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.5348406588051847, "grad_norm": 0.6587437987327576, "learning_rate": 4.8375398365949195e-06, "loss": 0.024471137672662735, "memory(GiB)": 21.48, "step": 16464, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.5348731442679401, "grad_norm": 0.3521692156791687, "learning_rate": 4.8370029682185025e-06, "loss": 0.022876370698213577, "memory(GiB)": 21.48, "step": 16465, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.5349056297306956, "grad_norm": 0.26399075984954834, "learning_rate": 4.8364661017232915e-06, "loss": 0.017697960138320923, "memory(GiB)": 21.48, "step": 16466, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958207 }, { "epoch": 0.5349381151934509, "grad_norm": 0.551934003829956, "learning_rate": 4.835929237115477e-06, "loss": 0.02959522232413292, "memory(GiB)": 21.48, "step": 16467, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.5349706006562064, "grad_norm": 0.20619772374629974, "learning_rate": 4.835392374401258e-06, "loss": 0.013602152466773987, "memory(GiB)": 21.48, "step": 16468, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958221 }, { "epoch": 0.5350030861189617, "grad_norm": 0.4249570071697235, "learning_rate": 4.8348555135868295e-06, "loss": 0.022779900580644608, "memory(GiB)": 21.48, "step": 16469, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.5350355715817172, "grad_norm": 0.33242759108543396, "learning_rate": 4.834318654678389e-06, "loss": 0.02212509885430336, "memory(GiB)": 21.48, "step": 16470, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.5350680570444726, "grad_norm": 0.3628598749637604, "learning_rate": 4.833781797682132e-06, "loss": 0.030593466013669968, "memory(GiB)": 21.48, "step": 16471, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958249 }, { "epoch": 0.535100542507228, "grad_norm": 0.33164817094802856, "learning_rate": 4.833244942604254e-06, "loss": 0.01897461898624897, "memory(GiB)": 21.48, "step": 16472, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.5351330279699834, "grad_norm": 0.326481431722641, "learning_rate": 4.8327080894509505e-06, "loss": 0.015348980203270912, "memory(GiB)": 21.48, "step": 16473, "token_acc": 1.0, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.5351655134327389, "grad_norm": 0.3538767099380493, "learning_rate": 4.8321712382284195e-06, "loss": 0.012643221765756607, "memory(GiB)": 21.48, "step": 16474, "token_acc": 1.0, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.5351979988954942, "grad_norm": 0.7270895838737488, "learning_rate": 4.831634388942855e-06, "loss": 0.022525355219841003, "memory(GiB)": 21.48, "step": 16475, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.958293 }, { "epoch": 0.5352304843582497, "grad_norm": 0.3400350511074066, "learning_rate": 4.831097541600454e-06, "loss": 0.0177895650267601, "memory(GiB)": 21.48, "step": 16476, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.5352629698210051, "grad_norm": 0.39553365111351013, "learning_rate": 4.830560696207412e-06, "loss": 0.01531820185482502, "memory(GiB)": 21.48, "step": 16477, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.5352954552837605, "grad_norm": 0.3715137839317322, "learning_rate": 4.830023852769927e-06, "loss": 0.01836981438100338, "memory(GiB)": 21.48, "step": 16478, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.5353279407465159, "grad_norm": 0.36073216795921326, "learning_rate": 4.8294870112941895e-06, "loss": 0.02100992761552334, "memory(GiB)": 21.48, "step": 16479, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.5353604262092714, "grad_norm": 0.5857973694801331, "learning_rate": 4.828950171786399e-06, "loss": 0.023716825991868973, "memory(GiB)": 21.48, "step": 16480, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.5353929116720267, "grad_norm": 0.5529085993766785, "learning_rate": 4.828413334252754e-06, "loss": 0.025918954983353615, "memory(GiB)": 21.48, "step": 16481, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.958361 }, { "epoch": 0.5354253971347822, "grad_norm": 0.5224534273147583, "learning_rate": 4.827876498699446e-06, "loss": 0.015822159126400948, "memory(GiB)": 21.48, "step": 16482, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958372 }, { "epoch": 0.5354578825975376, "grad_norm": 0.35564252734184265, "learning_rate": 4.827339665132673e-06, "loss": 0.017781341448426247, "memory(GiB)": 21.48, "step": 16483, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.535490368060293, "grad_norm": 0.32378050684928894, "learning_rate": 4.826802833558629e-06, "loss": 0.016619481146335602, "memory(GiB)": 21.48, "step": 16484, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95839 }, { "epoch": 0.5355228535230484, "grad_norm": 0.4361170530319214, "learning_rate": 4.826266003983513e-06, "loss": 0.02123284339904785, "memory(GiB)": 21.48, "step": 16485, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958399 }, { "epoch": 0.5355553389858039, "grad_norm": 0.4408891797065735, "learning_rate": 4.825729176413516e-06, "loss": 0.02219836413860321, "memory(GiB)": 21.48, "step": 16486, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958408 }, { "epoch": 0.5355878244485592, "grad_norm": 0.5911251902580261, "learning_rate": 4.825192350854838e-06, "loss": 0.02573264390230179, "memory(GiB)": 21.48, "step": 16487, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958416 }, { "epoch": 0.5356203099113147, "grad_norm": 0.2102729082107544, "learning_rate": 4.8246555273136706e-06, "loss": 0.014142964035272598, "memory(GiB)": 21.48, "step": 16488, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958425 }, { "epoch": 0.5356527953740701, "grad_norm": 0.24739494919776917, "learning_rate": 4.824118705796215e-06, "loss": 0.011997279711067677, "memory(GiB)": 21.48, "step": 16489, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958434 }, { "epoch": 0.5356852808368255, "grad_norm": 0.2663577198982239, "learning_rate": 4.823581886308662e-06, "loss": 0.014806946739554405, "memory(GiB)": 21.48, "step": 16490, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958444 }, { "epoch": 0.5357177662995809, "grad_norm": 0.3192915618419647, "learning_rate": 4.823045068857209e-06, "loss": 0.02291717939078808, "memory(GiB)": 21.48, "step": 16491, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.5357502517623364, "grad_norm": 0.43940597772598267, "learning_rate": 4.822508253448051e-06, "loss": 0.026603899896144867, "memory(GiB)": 21.48, "step": 16492, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958461 }, { "epoch": 0.5357827372250917, "grad_norm": 0.36685287952423096, "learning_rate": 4.8219714400873845e-06, "loss": 0.014808891341090202, "memory(GiB)": 21.48, "step": 16493, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.95847 }, { "epoch": 0.5358152226878472, "grad_norm": 0.505321204662323, "learning_rate": 4.821434628781404e-06, "loss": 0.02373463660478592, "memory(GiB)": 21.48, "step": 16494, "token_acc": 1.0, "train_speed(iter/s)": 0.958479 }, { "epoch": 0.5358477081506026, "grad_norm": 0.27640146017074585, "learning_rate": 4.820897819536305e-06, "loss": 0.01974213868379593, "memory(GiB)": 21.48, "step": 16495, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958489 }, { "epoch": 0.535880193613358, "grad_norm": 0.4001140892505646, "learning_rate": 4.820361012358286e-06, "loss": 0.027174850925803185, "memory(GiB)": 21.48, "step": 16496, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.5359126790761134, "grad_norm": 0.3837110996246338, "learning_rate": 4.8198242072535375e-06, "loss": 0.02284691110253334, "memory(GiB)": 21.48, "step": 16497, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958507 }, { "epoch": 0.5359451645388689, "grad_norm": 0.2932373285293579, "learning_rate": 4.819287404228259e-06, "loss": 0.02122684195637703, "memory(GiB)": 21.48, "step": 16498, "token_acc": 1.0, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.5359776500016242, "grad_norm": 0.6177448034286499, "learning_rate": 4.8187506032886426e-06, "loss": 0.021638531237840652, "memory(GiB)": 21.48, "step": 16499, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.958526 }, { "epoch": 0.5360101354643797, "grad_norm": 0.30805566906929016, "learning_rate": 4.818213804440888e-06, "loss": 0.021324045956134796, "memory(GiB)": 21.48, "step": 16500, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.5360101354643797, "eval_loss": 0.02062080428004265, "eval_runtime": 80.2556, "eval_samples_per_second": 123.979, "eval_steps_per_second": 3.875, "eval_token_acc": 0.9916323017862874, "step": 16500 }, { "epoch": 0.5360426209271351, "grad_norm": 0.3932285010814667, "learning_rate": 4.817677007691186e-06, "loss": 0.021393775939941406, "memory(GiB)": 21.48, "step": 16501, "token_acc": 0.9911012540386617, "train_speed(iter/s)": 0.953491 }, { "epoch": 0.5360751063898905, "grad_norm": 0.3282966911792755, "learning_rate": 4.817140213045735e-06, "loss": 0.0130069674924016, "memory(GiB)": 21.48, "step": 16502, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.953498 }, { "epoch": 0.5361075918526459, "grad_norm": 0.2867771089076996, "learning_rate": 4.8166034205107286e-06, "loss": 0.013175928965210915, "memory(GiB)": 21.48, "step": 16503, "token_acc": 1.0, "train_speed(iter/s)": 0.953506 }, { "epoch": 0.5361400773154014, "grad_norm": 0.453279972076416, "learning_rate": 4.816066630092364e-06, "loss": 0.033043909817934036, "memory(GiB)": 21.48, "step": 16504, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953514 }, { "epoch": 0.5361725627781567, "grad_norm": 0.24651123583316803, "learning_rate": 4.8155298417968336e-06, "loss": 0.016407020390033722, "memory(GiB)": 21.48, "step": 16505, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.953523 }, { "epoch": 0.5362050482409122, "grad_norm": 0.3138335049152374, "learning_rate": 4.814993055630336e-06, "loss": 0.023464025929570198, "memory(GiB)": 21.48, "step": 16506, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95353 }, { "epoch": 0.5362375337036676, "grad_norm": 0.34356847405433655, "learning_rate": 4.814456271599064e-06, "loss": 0.015050931833684444, "memory(GiB)": 21.48, "step": 16507, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953537 }, { "epoch": 0.536270019166423, "grad_norm": 0.35536283254623413, "learning_rate": 4.813919489709215e-06, "loss": 0.020298562943935394, "memory(GiB)": 21.48, "step": 16508, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953546 }, { "epoch": 0.5363025046291784, "grad_norm": 0.3198648691177368, "learning_rate": 4.81338270996698e-06, "loss": 0.01892011985182762, "memory(GiB)": 21.48, "step": 16509, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.953556 }, { "epoch": 0.5363349900919339, "grad_norm": 0.371511846780777, "learning_rate": 4.812845932378558e-06, "loss": 0.027803825214505196, "memory(GiB)": 21.48, "step": 16510, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.953566 }, { "epoch": 0.5363674755546892, "grad_norm": 0.3402479588985443, "learning_rate": 4.812309156950144e-06, "loss": 0.019903482869267464, "memory(GiB)": 21.48, "step": 16511, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.953576 }, { "epoch": 0.5363999610174447, "grad_norm": 0.29378795623779297, "learning_rate": 4.811772383687931e-06, "loss": 0.014631852507591248, "memory(GiB)": 21.48, "step": 16512, "token_acc": 1.0, "train_speed(iter/s)": 0.953586 }, { "epoch": 0.5364324464802002, "grad_norm": 0.2808138132095337, "learning_rate": 4.811235612598118e-06, "loss": 0.015694409608840942, "memory(GiB)": 21.48, "step": 16513, "token_acc": 1.0, "train_speed(iter/s)": 0.953582 }, { "epoch": 0.5364649319429555, "grad_norm": 0.326793372631073, "learning_rate": 4.8106988436868926e-06, "loss": 0.01763889566063881, "memory(GiB)": 21.48, "step": 16514, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.953594 }, { "epoch": 0.536497417405711, "grad_norm": 0.30773553252220154, "learning_rate": 4.8101620769604585e-06, "loss": 0.021255914121866226, "memory(GiB)": 21.48, "step": 16515, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.953607 }, { "epoch": 0.5365299028684664, "grad_norm": 0.5219404101371765, "learning_rate": 4.809625312425004e-06, "loss": 0.021937182173132896, "memory(GiB)": 21.48, "step": 16516, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.953619 }, { "epoch": 0.5365623883312218, "grad_norm": 0.3741462528705597, "learning_rate": 4.80908855008673e-06, "loss": 0.01772928610444069, "memory(GiB)": 21.48, "step": 16517, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953632 }, { "epoch": 0.5365948737939772, "grad_norm": 0.3121908903121948, "learning_rate": 4.808551789951826e-06, "loss": 0.022096555680036545, "memory(GiB)": 21.48, "step": 16518, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.953644 }, { "epoch": 0.5366273592567327, "grad_norm": 0.3106257915496826, "learning_rate": 4.808015032026489e-06, "loss": 0.016091736033558846, "memory(GiB)": 21.48, "step": 16519, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.953656 }, { "epoch": 0.536659844719488, "grad_norm": 0.2548435628414154, "learning_rate": 4.807478276316914e-06, "loss": 0.017556196078658104, "memory(GiB)": 21.48, "step": 16520, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.953669 }, { "epoch": 0.5366923301822435, "grad_norm": 0.2905818521976471, "learning_rate": 4.806941522829296e-06, "loss": 0.019556432962417603, "memory(GiB)": 21.48, "step": 16521, "token_acc": 1.0, "train_speed(iter/s)": 0.953681 }, { "epoch": 0.5367248156449989, "grad_norm": 0.27865153551101685, "learning_rate": 4.80640477156983e-06, "loss": 0.01921270042657852, "memory(GiB)": 21.48, "step": 16522, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.953693 }, { "epoch": 0.5367573011077543, "grad_norm": 0.3031059503555298, "learning_rate": 4.805868022544712e-06, "loss": 0.01760372892022133, "memory(GiB)": 21.48, "step": 16523, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.953705 }, { "epoch": 0.5367897865705097, "grad_norm": 0.3218786120414734, "learning_rate": 4.805331275760134e-06, "loss": 0.015715211629867554, "memory(GiB)": 21.48, "step": 16524, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.953718 }, { "epoch": 0.5368222720332652, "grad_norm": 0.2983459234237671, "learning_rate": 4.804794531222291e-06, "loss": 0.016275767236948013, "memory(GiB)": 21.48, "step": 16525, "token_acc": 1.0, "train_speed(iter/s)": 0.95373 }, { "epoch": 0.5368547574960205, "grad_norm": 0.3545669913291931, "learning_rate": 4.8042577889373805e-06, "loss": 0.023537514731287956, "memory(GiB)": 21.48, "step": 16526, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.953743 }, { "epoch": 0.536887242958776, "grad_norm": 0.4150821268558502, "learning_rate": 4.803721048911595e-06, "loss": 0.022101886570453644, "memory(GiB)": 21.48, "step": 16527, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.953755 }, { "epoch": 0.5369197284215314, "grad_norm": 0.2563735544681549, "learning_rate": 4.803184311151131e-06, "loss": 0.01543604489415884, "memory(GiB)": 21.48, "step": 16528, "token_acc": 1.0, "train_speed(iter/s)": 0.953768 }, { "epoch": 0.5369522138842868, "grad_norm": 0.4444626271724701, "learning_rate": 4.802647575662181e-06, "loss": 0.018275201320648193, "memory(GiB)": 21.48, "step": 16529, "token_acc": 1.0, "train_speed(iter/s)": 0.95378 }, { "epoch": 0.5369846993470422, "grad_norm": 0.3881646692752838, "learning_rate": 4.802110842450941e-06, "loss": 0.019025437533855438, "memory(GiB)": 21.48, "step": 16530, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953792 }, { "epoch": 0.5370171848097977, "grad_norm": 0.5140908360481262, "learning_rate": 4.801574111523605e-06, "loss": 0.018534908071160316, "memory(GiB)": 21.48, "step": 16531, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.953804 }, { "epoch": 0.537049670272553, "grad_norm": 0.39526036381721497, "learning_rate": 4.801037382886368e-06, "loss": 0.0246720090508461, "memory(GiB)": 21.48, "step": 16532, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.5370821557353085, "grad_norm": 0.4531046748161316, "learning_rate": 4.800500656545425e-06, "loss": 0.027901601046323776, "memory(GiB)": 21.48, "step": 16533, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.953827 }, { "epoch": 0.5371146411980638, "grad_norm": 0.2814272344112396, "learning_rate": 4.799963932506969e-06, "loss": 0.01344995852559805, "memory(GiB)": 21.48, "step": 16534, "token_acc": 0.99, "train_speed(iter/s)": 0.95384 }, { "epoch": 0.5371471266608193, "grad_norm": 0.33760643005371094, "learning_rate": 4.7994272107771955e-06, "loss": 0.02217867597937584, "memory(GiB)": 21.48, "step": 16535, "token_acc": 0.9939393939393939, "train_speed(iter/s)": 0.953852 }, { "epoch": 0.5371796121235747, "grad_norm": 0.2560555338859558, "learning_rate": 4.798890491362302e-06, "loss": 0.015123551711440086, "memory(GiB)": 21.48, "step": 16536, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.953864 }, { "epoch": 0.5372120975863302, "grad_norm": 0.2675037384033203, "learning_rate": 4.798353774268475e-06, "loss": 0.014965362846851349, "memory(GiB)": 21.48, "step": 16537, "token_acc": 1.0, "train_speed(iter/s)": 0.953877 }, { "epoch": 0.5372445830490855, "grad_norm": 0.45797422528266907, "learning_rate": 4.797817059501917e-06, "loss": 0.022314315661787987, "memory(GiB)": 21.48, "step": 16538, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953889 }, { "epoch": 0.537277068511841, "grad_norm": 0.41370296478271484, "learning_rate": 4.797280347068817e-06, "loss": 0.01921304129064083, "memory(GiB)": 21.48, "step": 16539, "token_acc": 1.0, "train_speed(iter/s)": 0.953901 }, { "epoch": 0.5373095539745963, "grad_norm": 0.3088473975658417, "learning_rate": 4.7967436369753725e-06, "loss": 0.016579575836658478, "memory(GiB)": 21.48, "step": 16540, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.5373420394373518, "grad_norm": 0.28774139285087585, "learning_rate": 4.796206929227779e-06, "loss": 0.014424145221710205, "memory(GiB)": 21.48, "step": 16541, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.953924 }, { "epoch": 0.5373745249001072, "grad_norm": 0.3509754240512848, "learning_rate": 4.795670223832226e-06, "loss": 0.019102536141872406, "memory(GiB)": 21.48, "step": 16542, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953937 }, { "epoch": 0.5374070103628626, "grad_norm": 0.3419157564640045, "learning_rate": 4.795133520794913e-06, "loss": 0.020514270290732384, "memory(GiB)": 21.48, "step": 16543, "token_acc": 1.0, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.537439495825618, "grad_norm": 0.4876999855041504, "learning_rate": 4.794596820122029e-06, "loss": 0.018321271985769272, "memory(GiB)": 21.48, "step": 16544, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.953961 }, { "epoch": 0.5374719812883735, "grad_norm": 0.32986071705818176, "learning_rate": 4.794060121819775e-06, "loss": 0.014989618211984634, "memory(GiB)": 21.48, "step": 16545, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.953972 }, { "epoch": 0.5375044667511288, "grad_norm": 0.4301779866218567, "learning_rate": 4.793523425894338e-06, "loss": 0.032183822244405746, "memory(GiB)": 21.48, "step": 16546, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.953981 }, { "epoch": 0.5375369522138843, "grad_norm": 0.3924970328807831, "learning_rate": 4.792986732351917e-06, "loss": 0.023873478174209595, "memory(GiB)": 21.48, "step": 16547, "token_acc": 0.9965635738831615, "train_speed(iter/s)": 0.953991 }, { "epoch": 0.5375694376766397, "grad_norm": 0.40991005301475525, "learning_rate": 4.7924500411987036e-06, "loss": 0.028087157756090164, "memory(GiB)": 21.48, "step": 16548, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.954001 }, { "epoch": 0.5376019231393951, "grad_norm": 0.4065603017807007, "learning_rate": 4.791913352440894e-06, "loss": 0.019885458052158356, "memory(GiB)": 21.48, "step": 16549, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.954011 }, { "epoch": 0.5376344086021505, "grad_norm": 0.2393762171268463, "learning_rate": 4.79137666608468e-06, "loss": 0.010417637415230274, "memory(GiB)": 21.48, "step": 16550, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.954021 }, { "epoch": 0.537666894064906, "grad_norm": 0.4778406620025635, "learning_rate": 4.790839982136258e-06, "loss": 0.0216786190867424, "memory(GiB)": 21.48, "step": 16551, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954031 }, { "epoch": 0.5376993795276613, "grad_norm": 0.6568092107772827, "learning_rate": 4.7903033006018205e-06, "loss": 0.015112030319869518, "memory(GiB)": 21.48, "step": 16552, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95404 }, { "epoch": 0.5377318649904168, "grad_norm": 0.39230552315711975, "learning_rate": 4.7897666214875626e-06, "loss": 0.015481959097087383, "memory(GiB)": 21.48, "step": 16553, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954049 }, { "epoch": 0.5377643504531722, "grad_norm": 0.422330379486084, "learning_rate": 4.789229944799677e-06, "loss": 0.013594027608633041, "memory(GiB)": 21.48, "step": 16554, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.5377968359159276, "grad_norm": 0.31976011395454407, "learning_rate": 4.788693270544359e-06, "loss": 0.012883979827165604, "memory(GiB)": 21.48, "step": 16555, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954065 }, { "epoch": 0.537829321378683, "grad_norm": 0.5098048448562622, "learning_rate": 4.788156598727802e-06, "loss": 0.027107052505016327, "memory(GiB)": 21.48, "step": 16556, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.954074 }, { "epoch": 0.5378618068414385, "grad_norm": 0.30695751309394836, "learning_rate": 4.787619929356198e-06, "loss": 0.01858292892575264, "memory(GiB)": 21.48, "step": 16557, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954083 }, { "epoch": 0.5378942923041938, "grad_norm": 0.3496299088001251, "learning_rate": 4.787083262435745e-06, "loss": 0.020067598670721054, "memory(GiB)": 21.48, "step": 16558, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954091 }, { "epoch": 0.5379267777669493, "grad_norm": 0.3446558117866516, "learning_rate": 4.786546597972634e-06, "loss": 0.02387414686381817, "memory(GiB)": 21.48, "step": 16559, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.954099 }, { "epoch": 0.5379592632297047, "grad_norm": 0.38046491146087646, "learning_rate": 4.7860099359730585e-06, "loss": 0.020896337926387787, "memory(GiB)": 21.48, "step": 16560, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954107 }, { "epoch": 0.5379917486924601, "grad_norm": 0.3553866744041443, "learning_rate": 4.785473276443214e-06, "loss": 0.01850368268787861, "memory(GiB)": 21.48, "step": 16561, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954115 }, { "epoch": 0.5380242341552155, "grad_norm": 0.2945346534252167, "learning_rate": 4.7849366193892935e-06, "loss": 0.020145181566476822, "memory(GiB)": 21.48, "step": 16562, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954124 }, { "epoch": 0.538056719617971, "grad_norm": 0.5262593030929565, "learning_rate": 4.784399964817489e-06, "loss": 0.0263490192592144, "memory(GiB)": 21.48, "step": 16563, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954132 }, { "epoch": 0.5380892050807263, "grad_norm": 0.5542699098587036, "learning_rate": 4.783863312733999e-06, "loss": 0.02200377732515335, "memory(GiB)": 21.48, "step": 16564, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95414 }, { "epoch": 0.5381216905434818, "grad_norm": 0.43817466497421265, "learning_rate": 4.783326663145011e-06, "loss": 0.026096880435943604, "memory(GiB)": 21.48, "step": 16565, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954149 }, { "epoch": 0.5381541760062372, "grad_norm": 0.35133543610572815, "learning_rate": 4.7827900160567245e-06, "loss": 0.018029894679784775, "memory(GiB)": 21.48, "step": 16566, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954158 }, { "epoch": 0.5381866614689926, "grad_norm": 0.41123420000076294, "learning_rate": 4.782253371475326e-06, "loss": 0.015489412471652031, "memory(GiB)": 21.48, "step": 16567, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954168 }, { "epoch": 0.538219146931748, "grad_norm": 0.4493386447429657, "learning_rate": 4.781716729407019e-06, "loss": 0.023616479709744453, "memory(GiB)": 21.48, "step": 16568, "token_acc": 1.0, "train_speed(iter/s)": 0.954179 }, { "epoch": 0.5382516323945035, "grad_norm": 0.49891307950019836, "learning_rate": 4.781180089857988e-06, "loss": 0.02415461093187332, "memory(GiB)": 21.48, "step": 16569, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.954188 }, { "epoch": 0.5382841178572588, "grad_norm": 0.4392346143722534, "learning_rate": 4.780643452834431e-06, "loss": 0.019973691552877426, "memory(GiB)": 21.48, "step": 16570, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954197 }, { "epoch": 0.5383166033200143, "grad_norm": 0.3754047155380249, "learning_rate": 4.7801068183425394e-06, "loss": 0.01524524949491024, "memory(GiB)": 21.48, "step": 16571, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954205 }, { "epoch": 0.5383490887827697, "grad_norm": 0.38643160462379456, "learning_rate": 4.779570186388507e-06, "loss": 0.022996213287115097, "memory(GiB)": 21.48, "step": 16572, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.954214 }, { "epoch": 0.5383815742455251, "grad_norm": 0.262124627828598, "learning_rate": 4.779033556978531e-06, "loss": 0.016709355637431145, "memory(GiB)": 21.48, "step": 16573, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954224 }, { "epoch": 0.5384140597082805, "grad_norm": 0.6036669015884399, "learning_rate": 4.7784969301188e-06, "loss": 0.024213720113039017, "memory(GiB)": 21.48, "step": 16574, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.954233 }, { "epoch": 0.538446545171036, "grad_norm": 0.33815979957580566, "learning_rate": 4.77796030581551e-06, "loss": 0.016310131177306175, "memory(GiB)": 21.48, "step": 16575, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.5384790306337914, "grad_norm": 0.31879618763923645, "learning_rate": 4.7774236840748524e-06, "loss": 0.018795978277921677, "memory(GiB)": 21.48, "step": 16576, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954253 }, { "epoch": 0.5385115160965468, "grad_norm": 0.2845641076564789, "learning_rate": 4.776887064903024e-06, "loss": 0.01630154438316822, "memory(GiB)": 21.48, "step": 16577, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.954264 }, { "epoch": 0.5385440015593023, "grad_norm": 0.46963977813720703, "learning_rate": 4.776350448306214e-06, "loss": 0.02440691366791725, "memory(GiB)": 21.48, "step": 16578, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954276 }, { "epoch": 0.5385764870220576, "grad_norm": 0.42751461267471313, "learning_rate": 4.775813834290618e-06, "loss": 0.01573757827281952, "memory(GiB)": 21.48, "step": 16579, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.954288 }, { "epoch": 0.5386089724848131, "grad_norm": 0.5850005149841309, "learning_rate": 4.775277222862428e-06, "loss": 0.03442796319723129, "memory(GiB)": 21.48, "step": 16580, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.9543 }, { "epoch": 0.5386414579475685, "grad_norm": 0.4041447937488556, "learning_rate": 4.77474061402784e-06, "loss": 0.021253734827041626, "memory(GiB)": 21.48, "step": 16581, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954313 }, { "epoch": 0.5386739434103239, "grad_norm": 0.34936070442199707, "learning_rate": 4.774204007793043e-06, "loss": 0.023787032812833786, "memory(GiB)": 21.48, "step": 16582, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.954325 }, { "epoch": 0.5387064288730793, "grad_norm": 0.4562794864177704, "learning_rate": 4.773667404164233e-06, "loss": 0.018719535320997238, "memory(GiB)": 21.48, "step": 16583, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.954337 }, { "epoch": 0.5387389143358348, "grad_norm": 0.45713692903518677, "learning_rate": 4.773130803147602e-06, "loss": 0.016450610011816025, "memory(GiB)": 21.48, "step": 16584, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.95435 }, { "epoch": 0.5387713997985901, "grad_norm": 0.282943993806839, "learning_rate": 4.772594204749344e-06, "loss": 0.017834065482020378, "memory(GiB)": 21.48, "step": 16585, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.954362 }, { "epoch": 0.5388038852613456, "grad_norm": 0.48421862721443176, "learning_rate": 4.7720576089756515e-06, "loss": 0.024292659014463425, "memory(GiB)": 21.48, "step": 16586, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954374 }, { "epoch": 0.538836370724101, "grad_norm": 0.22298790514469147, "learning_rate": 4.771521015832718e-06, "loss": 0.012732432223856449, "memory(GiB)": 21.48, "step": 16587, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.954386 }, { "epoch": 0.5388688561868564, "grad_norm": 0.3366277813911438, "learning_rate": 4.770984425326736e-06, "loss": 0.025268934667110443, "memory(GiB)": 21.48, "step": 16588, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.954398 }, { "epoch": 0.5389013416496118, "grad_norm": 0.2944435179233551, "learning_rate": 4.770447837463898e-06, "loss": 0.017812861129641533, "memory(GiB)": 21.48, "step": 16589, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.95441 }, { "epoch": 0.5389338271123673, "grad_norm": 0.29384782910346985, "learning_rate": 4.769911252250398e-06, "loss": 0.017040863633155823, "memory(GiB)": 21.48, "step": 16590, "token_acc": 1.0, "train_speed(iter/s)": 0.954422 }, { "epoch": 0.5389663125751226, "grad_norm": 0.3323363661766052, "learning_rate": 4.769374669692427e-06, "loss": 0.020345021039247513, "memory(GiB)": 21.48, "step": 16591, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954434 }, { "epoch": 0.5389987980378781, "grad_norm": 0.4051830768585205, "learning_rate": 4.7688380897961826e-06, "loss": 0.025562256574630737, "memory(GiB)": 21.48, "step": 16592, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954446 }, { "epoch": 0.5390312835006335, "grad_norm": 0.36228036880493164, "learning_rate": 4.76830151256785e-06, "loss": 0.024068579077720642, "memory(GiB)": 21.48, "step": 16593, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954459 }, { "epoch": 0.5390637689633889, "grad_norm": 0.3935975432395935, "learning_rate": 4.7677649380136304e-06, "loss": 0.022076919674873352, "memory(GiB)": 21.48, "step": 16594, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95447 }, { "epoch": 0.5390962544261443, "grad_norm": 0.3338083028793335, "learning_rate": 4.767228366139708e-06, "loss": 0.020243950188159943, "memory(GiB)": 21.48, "step": 16595, "token_acc": 1.0, "train_speed(iter/s)": 0.954482 }, { "epoch": 0.5391287398888998, "grad_norm": 0.2931303083896637, "learning_rate": 4.766691796952284e-06, "loss": 0.014908827841281891, "memory(GiB)": 21.48, "step": 16596, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954494 }, { "epoch": 0.5391612253516551, "grad_norm": 1.2215330600738525, "learning_rate": 4.766155230457547e-06, "loss": 0.019214343279600143, "memory(GiB)": 21.48, "step": 16597, "token_acc": 1.0, "train_speed(iter/s)": 0.954505 }, { "epoch": 0.5391937108144106, "grad_norm": 0.4198606610298157, "learning_rate": 4.765618666661688e-06, "loss": 0.01672123745083809, "memory(GiB)": 21.48, "step": 16598, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954518 }, { "epoch": 0.539226196277166, "grad_norm": 1.5582507848739624, "learning_rate": 4.765082105570902e-06, "loss": 0.0195547416806221, "memory(GiB)": 21.48, "step": 16599, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95453 }, { "epoch": 0.5392586817399214, "grad_norm": 0.3643204867839813, "learning_rate": 4.764545547191382e-06, "loss": 0.01812339574098587, "memory(GiB)": 21.48, "step": 16600, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954542 }, { "epoch": 0.5392911672026768, "grad_norm": 0.46501657366752625, "learning_rate": 4.764008991529318e-06, "loss": 0.021738586947321892, "memory(GiB)": 21.48, "step": 16601, "token_acc": 1.0, "train_speed(iter/s)": 0.954555 }, { "epoch": 0.5393236526654323, "grad_norm": 0.3016856908798218, "learning_rate": 4.763472438590906e-06, "loss": 0.021286724135279655, "memory(GiB)": 21.48, "step": 16602, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954567 }, { "epoch": 0.5393561381281876, "grad_norm": 0.32760000228881836, "learning_rate": 4.762935888382337e-06, "loss": 0.017009766772389412, "memory(GiB)": 21.48, "step": 16603, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95458 }, { "epoch": 0.5393886235909431, "grad_norm": 0.24102918803691864, "learning_rate": 4.762399340909802e-06, "loss": 0.016077302396297455, "memory(GiB)": 21.48, "step": 16604, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.954592 }, { "epoch": 0.5394211090536984, "grad_norm": 0.3494970202445984, "learning_rate": 4.761862796179496e-06, "loss": 0.02107398211956024, "memory(GiB)": 21.48, "step": 16605, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954602 }, { "epoch": 0.5394535945164539, "grad_norm": 0.49006131291389465, "learning_rate": 4.76132625419761e-06, "loss": 0.018464796245098114, "memory(GiB)": 21.48, "step": 16606, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95461 }, { "epoch": 0.5394860799792093, "grad_norm": 0.3363787531852722, "learning_rate": 4.760789714970337e-06, "loss": 0.019732315093278885, "memory(GiB)": 21.48, "step": 16607, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.954621 }, { "epoch": 0.5395185654419647, "grad_norm": 0.43256860971450806, "learning_rate": 4.760253178503868e-06, "loss": 0.02449575811624527, "memory(GiB)": 21.48, "step": 16608, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.95463 }, { "epoch": 0.5395510509047201, "grad_norm": 1.1989567279815674, "learning_rate": 4.759716644804398e-06, "loss": 0.03802571818232536, "memory(GiB)": 21.48, "step": 16609, "token_acc": 0.9704641350210971, "train_speed(iter/s)": 0.954639 }, { "epoch": 0.5395835363674756, "grad_norm": 0.3758043944835663, "learning_rate": 4.759180113878116e-06, "loss": 0.019710278138518333, "memory(GiB)": 21.48, "step": 16610, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.95465 }, { "epoch": 0.5396160218302309, "grad_norm": 0.3719869554042816, "learning_rate": 4.758643585731218e-06, "loss": 0.02203121781349182, "memory(GiB)": 21.48, "step": 16611, "token_acc": 1.0, "train_speed(iter/s)": 0.954659 }, { "epoch": 0.5396485072929864, "grad_norm": 0.33020439743995667, "learning_rate": 4.7581070603698925e-06, "loss": 0.01593323051929474, "memory(GiB)": 21.48, "step": 16612, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.954669 }, { "epoch": 0.5396809927557418, "grad_norm": 0.37442460656166077, "learning_rate": 4.7575705378003345e-06, "loss": 0.01751299574971199, "memory(GiB)": 21.48, "step": 16613, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954678 }, { "epoch": 0.5397134782184972, "grad_norm": 0.34293919801712036, "learning_rate": 4.757034018028735e-06, "loss": 0.018460318446159363, "memory(GiB)": 21.48, "step": 16614, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954688 }, { "epoch": 0.5397459636812526, "grad_norm": 0.32901886105537415, "learning_rate": 4.756497501061287e-06, "loss": 0.013395590707659721, "memory(GiB)": 21.48, "step": 16615, "token_acc": 0.992, "train_speed(iter/s)": 0.954698 }, { "epoch": 0.5397784491440081, "grad_norm": 0.3240931034088135, "learning_rate": 4.755960986904179e-06, "loss": 0.019608449190855026, "memory(GiB)": 21.48, "step": 16616, "token_acc": 1.0, "train_speed(iter/s)": 0.954708 }, { "epoch": 0.5398109346067634, "grad_norm": 0.38830825686454773, "learning_rate": 4.75542447556361e-06, "loss": 0.021063227206468582, "memory(GiB)": 21.48, "step": 16617, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.954716 }, { "epoch": 0.5398434200695189, "grad_norm": 0.5153012871742249, "learning_rate": 4.754887967045764e-06, "loss": 0.022414371371269226, "memory(GiB)": 21.48, "step": 16618, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.954725 }, { "epoch": 0.5398759055322743, "grad_norm": 0.4626305103302002, "learning_rate": 4.7543514613568395e-06, "loss": 0.019661039113998413, "memory(GiB)": 21.48, "step": 16619, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954734 }, { "epoch": 0.5399083909950297, "grad_norm": 0.23312552273273468, "learning_rate": 4.753814958503028e-06, "loss": 0.012504421174526215, "memory(GiB)": 21.48, "step": 16620, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954742 }, { "epoch": 0.5399408764577851, "grad_norm": 0.3443354368209839, "learning_rate": 4.753278458490515e-06, "loss": 0.022355742752552032, "memory(GiB)": 21.48, "step": 16621, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.954751 }, { "epoch": 0.5399733619205406, "grad_norm": 0.4320737421512604, "learning_rate": 4.7527419613255015e-06, "loss": 0.020706212148070335, "memory(GiB)": 21.48, "step": 16622, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95476 }, { "epoch": 0.5400058473832959, "grad_norm": 1.000551700592041, "learning_rate": 4.752205467014171e-06, "loss": 0.01677858456969261, "memory(GiB)": 21.48, "step": 16623, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.5400383328460514, "grad_norm": 0.37083497643470764, "learning_rate": 4.7516689755627225e-06, "loss": 0.019043458625674248, "memory(GiB)": 21.48, "step": 16624, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.954776 }, { "epoch": 0.5400708183088068, "grad_norm": 0.34634408354759216, "learning_rate": 4.751132486977343e-06, "loss": 0.02109164372086525, "memory(GiB)": 21.48, "step": 16625, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954785 }, { "epoch": 0.5401033037715622, "grad_norm": 0.3562556207180023, "learning_rate": 4.750596001264227e-06, "loss": 0.019427329301834106, "memory(GiB)": 21.48, "step": 16626, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.5401357892343176, "grad_norm": 0.5269557237625122, "learning_rate": 4.750059518429563e-06, "loss": 0.02211235836148262, "memory(GiB)": 21.48, "step": 16627, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.954803 }, { "epoch": 0.5401682746970731, "grad_norm": 0.43149352073669434, "learning_rate": 4.749523038479546e-06, "loss": 0.023164598271250725, "memory(GiB)": 21.48, "step": 16628, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954813 }, { "epoch": 0.5402007601598284, "grad_norm": 0.386042982339859, "learning_rate": 4.748986561420366e-06, "loss": 0.013159448280930519, "memory(GiB)": 21.48, "step": 16629, "token_acc": 1.0, "train_speed(iter/s)": 0.954822 }, { "epoch": 0.5402332456225839, "grad_norm": 0.3377240300178528, "learning_rate": 4.7484500872582165e-06, "loss": 0.01849851943552494, "memory(GiB)": 21.48, "step": 16630, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.954831 }, { "epoch": 0.5402657310853393, "grad_norm": 0.45194607973098755, "learning_rate": 4.747913615999285e-06, "loss": 0.025385845452547073, "memory(GiB)": 21.48, "step": 16631, "token_acc": 1.0, "train_speed(iter/s)": 0.954841 }, { "epoch": 0.5402982165480947, "grad_norm": 0.40650835633277893, "learning_rate": 4.747377147649769e-06, "loss": 0.014530593529343605, "memory(GiB)": 21.48, "step": 16632, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.5403307020108501, "grad_norm": 0.46244317293167114, "learning_rate": 4.746840682215855e-06, "loss": 0.021627817302942276, "memory(GiB)": 21.48, "step": 16633, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95486 }, { "epoch": 0.5403631874736056, "grad_norm": 0.39747026562690735, "learning_rate": 4.746304219703737e-06, "loss": 0.018345247954130173, "memory(GiB)": 21.48, "step": 16634, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.5403956729363609, "grad_norm": 0.44647058844566345, "learning_rate": 4.7457677601196064e-06, "loss": 0.02161797136068344, "memory(GiB)": 21.48, "step": 16635, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954878 }, { "epoch": 0.5404281583991164, "grad_norm": 0.38664594292640686, "learning_rate": 4.745231303469653e-06, "loss": 0.019300971180200577, "memory(GiB)": 21.48, "step": 16636, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.5404606438618718, "grad_norm": 0.42371928691864014, "learning_rate": 4.744694849760071e-06, "loss": 0.021625375375151634, "memory(GiB)": 21.48, "step": 16637, "token_acc": 1.0, "train_speed(iter/s)": 0.954897 }, { "epoch": 0.5404931293246272, "grad_norm": 0.348606675863266, "learning_rate": 4.74415839899705e-06, "loss": 0.016085032373666763, "memory(GiB)": 21.48, "step": 16638, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.954906 }, { "epoch": 0.5405256147873826, "grad_norm": 0.36997032165527344, "learning_rate": 4.743621951186781e-06, "loss": 0.017690561711788177, "memory(GiB)": 21.48, "step": 16639, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.5405581002501381, "grad_norm": 0.45697709918022156, "learning_rate": 4.7430855063354565e-06, "loss": 0.01909775100648403, "memory(GiB)": 21.48, "step": 16640, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954926 }, { "epoch": 0.5405905857128935, "grad_norm": 0.32578474283218384, "learning_rate": 4.742549064449267e-06, "loss": 0.012426473200321198, "memory(GiB)": 21.48, "step": 16641, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954937 }, { "epoch": 0.5406230711756489, "grad_norm": 0.346897691488266, "learning_rate": 4.742012625534403e-06, "loss": 0.01943720132112503, "memory(GiB)": 21.48, "step": 16642, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.5406555566384044, "grad_norm": 0.5306717753410339, "learning_rate": 4.741476189597059e-06, "loss": 0.025829797610640526, "memory(GiB)": 21.48, "step": 16643, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.954961 }, { "epoch": 0.5406880421011597, "grad_norm": 0.4902666211128235, "learning_rate": 4.740939756643421e-06, "loss": 0.03128527104854584, "memory(GiB)": 21.48, "step": 16644, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.954973 }, { "epoch": 0.5407205275639152, "grad_norm": 1.5734304189682007, "learning_rate": 4.740403326679686e-06, "loss": 0.019554264843463898, "memory(GiB)": 21.48, "step": 16645, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.5407530130266706, "grad_norm": 0.37064045667648315, "learning_rate": 4.739866899712039e-06, "loss": 0.021470002830028534, "memory(GiB)": 21.48, "step": 16646, "token_acc": 1.0, "train_speed(iter/s)": 0.954996 }, { "epoch": 0.540785498489426, "grad_norm": 0.27141785621643066, "learning_rate": 4.7393304757466775e-06, "loss": 0.013755748979747295, "memory(GiB)": 21.48, "step": 16647, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955008 }, { "epoch": 0.5408179839521814, "grad_norm": 0.313559889793396, "learning_rate": 4.738794054789787e-06, "loss": 0.017917640507221222, "memory(GiB)": 21.48, "step": 16648, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.5408504694149369, "grad_norm": 0.38494008779525757, "learning_rate": 4.73825763684756e-06, "loss": 0.02279188483953476, "memory(GiB)": 21.48, "step": 16649, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955033 }, { "epoch": 0.5408829548776922, "grad_norm": 0.4205333888530731, "learning_rate": 4.737721221926192e-06, "loss": 0.021877465769648552, "memory(GiB)": 21.48, "step": 16650, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955045 }, { "epoch": 0.5409154403404477, "grad_norm": 0.4643181264400482, "learning_rate": 4.737184810031865e-06, "loss": 0.016501745209097862, "memory(GiB)": 21.48, "step": 16651, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.5409479258032031, "grad_norm": 0.41772374510765076, "learning_rate": 4.73664840117078e-06, "loss": 0.0183489378541708, "memory(GiB)": 21.48, "step": 16652, "token_acc": 1.0, "train_speed(iter/s)": 0.955069 }, { "epoch": 0.5409804112659585, "grad_norm": 0.4237664043903351, "learning_rate": 4.7361119953491206e-06, "loss": 0.024664144963026047, "memory(GiB)": 21.48, "step": 16653, "token_acc": 1.0, "train_speed(iter/s)": 0.955081 }, { "epoch": 0.5410128967287139, "grad_norm": 0.33055636286735535, "learning_rate": 4.73557559257308e-06, "loss": 0.019259363412857056, "memory(GiB)": 21.48, "step": 16654, "token_acc": 0.9934640522875817, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.5410453821914694, "grad_norm": 0.39203372597694397, "learning_rate": 4.735039192848848e-06, "loss": 0.024100638926029205, "memory(GiB)": 21.48, "step": 16655, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.5410778676542247, "grad_norm": 0.39286068081855774, "learning_rate": 4.734502796182619e-06, "loss": 0.024023959413170815, "memory(GiB)": 21.48, "step": 16656, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955118 }, { "epoch": 0.5411103531169802, "grad_norm": 0.664383590221405, "learning_rate": 4.733966402580579e-06, "loss": 0.021210897713899612, "memory(GiB)": 21.48, "step": 16657, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.5411428385797356, "grad_norm": 0.3882468044757843, "learning_rate": 4.733430012048922e-06, "loss": 0.02171376720070839, "memory(GiB)": 21.48, "step": 16658, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955142 }, { "epoch": 0.541175324042491, "grad_norm": 0.36555254459381104, "learning_rate": 4.732893624593837e-06, "loss": 0.016153443604707718, "memory(GiB)": 21.48, "step": 16659, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955154 }, { "epoch": 0.5412078095052464, "grad_norm": 0.385337233543396, "learning_rate": 4.732357240221516e-06, "loss": 0.02924494817852974, "memory(GiB)": 21.48, "step": 16660, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955165 }, { "epoch": 0.5412402949680019, "grad_norm": 0.7544562220573425, "learning_rate": 4.731820858938147e-06, "loss": 0.027869826182723045, "memory(GiB)": 21.48, "step": 16661, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.5412727804307572, "grad_norm": 0.3484193980693817, "learning_rate": 4.731284480749923e-06, "loss": 0.019311100244522095, "memory(GiB)": 21.48, "step": 16662, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955189 }, { "epoch": 0.5413052658935127, "grad_norm": 0.3155559301376343, "learning_rate": 4.7307481056630335e-06, "loss": 0.019807636737823486, "memory(GiB)": 21.48, "step": 16663, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955201 }, { "epoch": 0.541337751356268, "grad_norm": 0.2821364402770996, "learning_rate": 4.730211733683669e-06, "loss": 0.013875050470232964, "memory(GiB)": 21.48, "step": 16664, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.5413702368190235, "grad_norm": 0.38105061650276184, "learning_rate": 4.729675364818021e-06, "loss": 0.018618490546941757, "memory(GiB)": 21.48, "step": 16665, "token_acc": 1.0, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.5414027222817789, "grad_norm": 0.28641343116760254, "learning_rate": 4.729138999072278e-06, "loss": 0.016171278432011604, "memory(GiB)": 21.48, "step": 16666, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.5414352077445344, "grad_norm": 0.2771211862564087, "learning_rate": 4.728602636452633e-06, "loss": 0.01152749266475439, "memory(GiB)": 21.48, "step": 16667, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.5414676932072897, "grad_norm": 0.3783608376979828, "learning_rate": 4.728066276965273e-06, "loss": 0.028837978839874268, "memory(GiB)": 21.48, "step": 16668, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.5415001786700452, "grad_norm": 0.5340381860733032, "learning_rate": 4.727529920616392e-06, "loss": 0.02097461372613907, "memory(GiB)": 21.48, "step": 16669, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.5415326641328005, "grad_norm": 0.34494051337242126, "learning_rate": 4.726993567412177e-06, "loss": 0.013966487720608711, "memory(GiB)": 21.48, "step": 16670, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955267 }, { "epoch": 0.541565149595556, "grad_norm": 0.32850074768066406, "learning_rate": 4.726457217358822e-06, "loss": 0.017548011615872383, "memory(GiB)": 21.48, "step": 16671, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.955276 }, { "epoch": 0.5415976350583114, "grad_norm": 0.3117758631706238, "learning_rate": 4.725920870462511e-06, "loss": 0.023962130770087242, "memory(GiB)": 21.48, "step": 16672, "token_acc": 0.992, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.5416301205210668, "grad_norm": 0.3008842468261719, "learning_rate": 4.7253845267294416e-06, "loss": 0.021614357829093933, "memory(GiB)": 21.48, "step": 16673, "token_acc": 1.0, "train_speed(iter/s)": 0.955295 }, { "epoch": 0.5416626059838222, "grad_norm": 1.9145272970199585, "learning_rate": 4.724848186165797e-06, "loss": 0.026045458391308784, "memory(GiB)": 21.48, "step": 16674, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955305 }, { "epoch": 0.5416950914465777, "grad_norm": 0.25210097432136536, "learning_rate": 4.724311848777773e-06, "loss": 0.011733124032616615, "memory(GiB)": 21.48, "step": 16675, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.541727576909333, "grad_norm": 0.44677624106407166, "learning_rate": 4.723775514571555e-06, "loss": 0.023527907207608223, "memory(GiB)": 21.48, "step": 16676, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.5417600623720885, "grad_norm": 0.49242809414863586, "learning_rate": 4.723239183553339e-06, "loss": 0.021917954087257385, "memory(GiB)": 21.48, "step": 16677, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955334 }, { "epoch": 0.5417925478348439, "grad_norm": 0.29541894793510437, "learning_rate": 4.722702855729308e-06, "loss": 0.017860354855656624, "memory(GiB)": 21.48, "step": 16678, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.5418250332975993, "grad_norm": 0.28078755736351013, "learning_rate": 4.722166531105656e-06, "loss": 0.018364213407039642, "memory(GiB)": 21.48, "step": 16679, "token_acc": 1.0, "train_speed(iter/s)": 0.955351 }, { "epoch": 0.5418575187603547, "grad_norm": 0.3256266713142395, "learning_rate": 4.721630209688571e-06, "loss": 0.02316528558731079, "memory(GiB)": 21.48, "step": 16680, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.95536 }, { "epoch": 0.5418900042231102, "grad_norm": 0.36189818382263184, "learning_rate": 4.721093891484243e-06, "loss": 0.022047128528356552, "memory(GiB)": 21.48, "step": 16681, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.955368 }, { "epoch": 0.5419224896858655, "grad_norm": 0.5587909817695618, "learning_rate": 4.720557576498866e-06, "loss": 0.03162684291601181, "memory(GiB)": 21.48, "step": 16682, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955377 }, { "epoch": 0.541954975148621, "grad_norm": 0.3266066908836365, "learning_rate": 4.720021264738624e-06, "loss": 0.017828645184636116, "memory(GiB)": 21.48, "step": 16683, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.5419874606113764, "grad_norm": 0.24865742027759552, "learning_rate": 4.719484956209711e-06, "loss": 0.014934893697500229, "memory(GiB)": 21.48, "step": 16684, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955393 }, { "epoch": 0.5420199460741318, "grad_norm": 0.40253961086273193, "learning_rate": 4.718948650918314e-06, "loss": 0.01983959600329399, "memory(GiB)": 21.48, "step": 16685, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.955403 }, { "epoch": 0.5420524315368872, "grad_norm": 0.27535390853881836, "learning_rate": 4.718412348870624e-06, "loss": 0.019737012684345245, "memory(GiB)": 21.48, "step": 16686, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.5420849169996427, "grad_norm": 0.3627792000770569, "learning_rate": 4.71787605007283e-06, "loss": 0.017232954502105713, "memory(GiB)": 21.48, "step": 16687, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.542117402462398, "grad_norm": 0.5093158483505249, "learning_rate": 4.717339754531122e-06, "loss": 0.02366437017917633, "memory(GiB)": 21.48, "step": 16688, "token_acc": 0.9900662251655629, "train_speed(iter/s)": 0.955431 }, { "epoch": 0.5421498879251535, "grad_norm": 0.34865278005599976, "learning_rate": 4.7168034622516894e-06, "loss": 0.026480264961719513, "memory(GiB)": 21.48, "step": 16689, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.5421823733879089, "grad_norm": 0.3309377431869507, "learning_rate": 4.716267173240723e-06, "loss": 0.02102893963456154, "memory(GiB)": 21.48, "step": 16690, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95545 }, { "epoch": 0.5422148588506643, "grad_norm": 0.4511929154396057, "learning_rate": 4.7157308875044085e-06, "loss": 0.020902380347251892, "memory(GiB)": 21.48, "step": 16691, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.5422473443134197, "grad_norm": 0.3761442005634308, "learning_rate": 4.715194605048941e-06, "loss": 0.019541997462511063, "memory(GiB)": 21.48, "step": 16692, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955469 }, { "epoch": 0.5422798297761752, "grad_norm": 0.2408856749534607, "learning_rate": 4.714658325880504e-06, "loss": 0.014849920757114887, "memory(GiB)": 21.48, "step": 16693, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.5423123152389305, "grad_norm": 0.5335096120834351, "learning_rate": 4.714122050005291e-06, "loss": 0.031188208609819412, "memory(GiB)": 21.48, "step": 16694, "token_acc": 0.9822695035460993, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.542344800701686, "grad_norm": 0.429259717464447, "learning_rate": 4.713585777429489e-06, "loss": 0.027311043813824654, "memory(GiB)": 21.48, "step": 16695, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.5423772861644414, "grad_norm": 0.30930769443511963, "learning_rate": 4.713049508159289e-06, "loss": 0.02222076803445816, "memory(GiB)": 21.48, "step": 16696, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955508 }, { "epoch": 0.5424097716271968, "grad_norm": 0.3192840814590454, "learning_rate": 4.71251324220088e-06, "loss": 0.013923797756433487, "memory(GiB)": 21.48, "step": 16697, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.5424422570899522, "grad_norm": 0.49512943625450134, "learning_rate": 4.711976979560451e-06, "loss": 0.02840309776365757, "memory(GiB)": 21.48, "step": 16698, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.5424747425527077, "grad_norm": 0.2894686460494995, "learning_rate": 4.711440720244192e-06, "loss": 0.017267923802137375, "memory(GiB)": 21.48, "step": 16699, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.542507228015463, "grad_norm": 0.44373631477355957, "learning_rate": 4.7109044642582885e-06, "loss": 0.01896640658378601, "memory(GiB)": 21.48, "step": 16700, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.5425397134782185, "grad_norm": 0.3766518235206604, "learning_rate": 4.710368211608935e-06, "loss": 0.016335846856236458, "memory(GiB)": 21.48, "step": 16701, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955553 }, { "epoch": 0.5425721989409739, "grad_norm": 0.3638158440589905, "learning_rate": 4.709831962302315e-06, "loss": 0.018835023045539856, "memory(GiB)": 21.48, "step": 16702, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.5426046844037293, "grad_norm": 0.3443963825702667, "learning_rate": 4.709295716344623e-06, "loss": 0.018295947462320328, "memory(GiB)": 21.48, "step": 16703, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.5426371698664848, "grad_norm": 0.2902071177959442, "learning_rate": 4.708759473742044e-06, "loss": 0.01624116115272045, "memory(GiB)": 21.48, "step": 16704, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.5426696553292402, "grad_norm": 0.2353413999080658, "learning_rate": 4.7082232345007696e-06, "loss": 0.013923294842243195, "memory(GiB)": 21.48, "step": 16705, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955599 }, { "epoch": 0.5427021407919956, "grad_norm": 0.3383132219314575, "learning_rate": 4.707686998626986e-06, "loss": 0.02207670360803604, "memory(GiB)": 21.48, "step": 16706, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955612 }, { "epoch": 0.542734626254751, "grad_norm": 0.4261939227581024, "learning_rate": 4.707150766126884e-06, "loss": 0.019785728305578232, "memory(GiB)": 21.48, "step": 16707, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.955624 }, { "epoch": 0.5427671117175065, "grad_norm": 0.5111053586006165, "learning_rate": 4.706614537006651e-06, "loss": 0.025442225858569145, "memory(GiB)": 21.48, "step": 16708, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.5427995971802618, "grad_norm": 0.3304425776004791, "learning_rate": 4.706078311272478e-06, "loss": 0.016650596633553505, "memory(GiB)": 21.48, "step": 16709, "token_acc": 1.0, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.5428320826430173, "grad_norm": 0.2906755208969116, "learning_rate": 4.7055420889305515e-06, "loss": 0.019834572449326515, "memory(GiB)": 21.48, "step": 16710, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95566 }, { "epoch": 0.5428645681057727, "grad_norm": 0.45322948694229126, "learning_rate": 4.705005869987062e-06, "loss": 0.022387385368347168, "memory(GiB)": 21.48, "step": 16711, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.5428970535685281, "grad_norm": 0.42226532101631165, "learning_rate": 4.704469654448197e-06, "loss": 0.021868785843253136, "memory(GiB)": 21.48, "step": 16712, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.5429295390312835, "grad_norm": 0.30184483528137207, "learning_rate": 4.7039334423201445e-06, "loss": 0.017214685678482056, "memory(GiB)": 21.48, "step": 16713, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.542962024494039, "grad_norm": 0.3355475962162018, "learning_rate": 4.7033972336090965e-06, "loss": 0.020713090896606445, "memory(GiB)": 21.48, "step": 16714, "token_acc": 0.9941860465116279, "train_speed(iter/s)": 0.955708 }, { "epoch": 0.5429945099567943, "grad_norm": 0.34380534291267395, "learning_rate": 4.702861028321237e-06, "loss": 0.02004946395754814, "memory(GiB)": 21.48, "step": 16715, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.5430269954195498, "grad_norm": 0.31452640891075134, "learning_rate": 4.7023248264627586e-06, "loss": 0.026253346353769302, "memory(GiB)": 21.48, "step": 16716, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.5430594808823052, "grad_norm": 0.39296287298202515, "learning_rate": 4.701788628039846e-06, "loss": 0.02603534609079361, "memory(GiB)": 21.48, "step": 16717, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.5430919663450606, "grad_norm": 0.48763278126716614, "learning_rate": 4.701252433058692e-06, "loss": 0.023358825594186783, "memory(GiB)": 21.48, "step": 16718, "token_acc": 1.0, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.543124451807816, "grad_norm": 0.3440515100955963, "learning_rate": 4.70071624152548e-06, "loss": 0.021671293303370476, "memory(GiB)": 21.48, "step": 16719, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.5431569372705715, "grad_norm": 0.357538104057312, "learning_rate": 4.700180053446403e-06, "loss": 0.019072147086262703, "memory(GiB)": 21.48, "step": 16720, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.5431894227333268, "grad_norm": 0.3092128336429596, "learning_rate": 4.699643868827645e-06, "loss": 0.019754011183977127, "memory(GiB)": 21.48, "step": 16721, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.955787 }, { "epoch": 0.5432219081960823, "grad_norm": 0.5176153779029846, "learning_rate": 4.6991076876754e-06, "loss": 0.015952248126268387, "memory(GiB)": 21.48, "step": 16722, "token_acc": 1.0, "train_speed(iter/s)": 0.9558 }, { "epoch": 0.5432543936588377, "grad_norm": 0.33313697576522827, "learning_rate": 4.698571509995849e-06, "loss": 0.025341007858514786, "memory(GiB)": 21.48, "step": 16723, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.5432868791215931, "grad_norm": 0.39902058243751526, "learning_rate": 4.6980353357951875e-06, "loss": 0.024859653785824776, "memory(GiB)": 21.48, "step": 16724, "token_acc": 0.9875, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.5433193645843485, "grad_norm": 0.25295984745025635, "learning_rate": 4.697499165079596e-06, "loss": 0.013322275131940842, "memory(GiB)": 21.48, "step": 16725, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.955831 }, { "epoch": 0.543351850047104, "grad_norm": 0.24427606165409088, "learning_rate": 4.696962997855269e-06, "loss": 0.01523341704159975, "memory(GiB)": 21.48, "step": 16726, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.5433843355098593, "grad_norm": 0.3310539424419403, "learning_rate": 4.6964268341283934e-06, "loss": 0.020993294194340706, "memory(GiB)": 21.48, "step": 16727, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.5434168209726148, "grad_norm": 0.42795225977897644, "learning_rate": 4.695890673905154e-06, "loss": 0.01806245557963848, "memory(GiB)": 21.48, "step": 16728, "token_acc": 0.984, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.5434493064353701, "grad_norm": 0.3338731527328491, "learning_rate": 4.695354517191744e-06, "loss": 0.017529023811221123, "memory(GiB)": 21.48, "step": 16729, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.5434817918981256, "grad_norm": 0.2544921636581421, "learning_rate": 4.694818363994345e-06, "loss": 0.015449611470103264, "memory(GiB)": 21.48, "step": 16730, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.543514277360881, "grad_norm": 0.27630963921546936, "learning_rate": 4.694282214319151e-06, "loss": 0.014247184619307518, "memory(GiB)": 21.48, "step": 16731, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955889 }, { "epoch": 0.5435467628236365, "grad_norm": 0.46075814962387085, "learning_rate": 4.693746068172344e-06, "loss": 0.021174784749746323, "memory(GiB)": 21.48, "step": 16732, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955899 }, { "epoch": 0.5435792482863918, "grad_norm": 0.3152956962585449, "learning_rate": 4.693209925560119e-06, "loss": 0.013310285285115242, "memory(GiB)": 21.48, "step": 16733, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.5436117337491473, "grad_norm": 0.5465775728225708, "learning_rate": 4.692673786488657e-06, "loss": 0.02641412988305092, "memory(GiB)": 21.48, "step": 16734, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.5436442192119026, "grad_norm": 0.341967910528183, "learning_rate": 4.69213765096415e-06, "loss": 0.025310881435871124, "memory(GiB)": 21.48, "step": 16735, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.5436767046746581, "grad_norm": 0.5283414125442505, "learning_rate": 4.691601518992783e-06, "loss": 0.02910413220524788, "memory(GiB)": 21.48, "step": 16736, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955936 }, { "epoch": 0.5437091901374135, "grad_norm": 0.4700721204280853, "learning_rate": 4.691065390580746e-06, "loss": 0.01901322975754738, "memory(GiB)": 21.48, "step": 16737, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955945 }, { "epoch": 0.543741675600169, "grad_norm": 0.26457127928733826, "learning_rate": 4.690529265734224e-06, "loss": 0.012699953280389309, "memory(GiB)": 21.48, "step": 16738, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955954 }, { "epoch": 0.5437741610629243, "grad_norm": 0.3557768166065216, "learning_rate": 4.6899931444594075e-06, "loss": 0.021761003881692886, "memory(GiB)": 21.48, "step": 16739, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955963 }, { "epoch": 0.5438066465256798, "grad_norm": 0.41678470373153687, "learning_rate": 4.6894570267624825e-06, "loss": 0.022076072171330452, "memory(GiB)": 21.48, "step": 16740, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.5438391319884351, "grad_norm": 0.38073840737342834, "learning_rate": 4.688920912649636e-06, "loss": 0.02034015581011772, "memory(GiB)": 21.48, "step": 16741, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.5438716174511906, "grad_norm": 0.3531095087528229, "learning_rate": 4.688384802127057e-06, "loss": 0.020777985453605652, "memory(GiB)": 21.48, "step": 16742, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.543904102913946, "grad_norm": 0.34212344884872437, "learning_rate": 4.687848695200932e-06, "loss": 0.018951548263430595, "memory(GiB)": 21.48, "step": 16743, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.5439365883767014, "grad_norm": 0.3766162395477295, "learning_rate": 4.6873125918774495e-06, "loss": 0.015086141414940357, "memory(GiB)": 21.48, "step": 16744, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.5439690738394568, "grad_norm": 0.39276599884033203, "learning_rate": 4.686776492162794e-06, "loss": 0.01972576417028904, "memory(GiB)": 21.48, "step": 16745, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956015 }, { "epoch": 0.5440015593022123, "grad_norm": 0.24671123921871185, "learning_rate": 4.686240396063156e-06, "loss": 0.016665715724229813, "memory(GiB)": 21.48, "step": 16746, "token_acc": 1.0, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.5440340447649676, "grad_norm": 0.32240673899650574, "learning_rate": 4.68570430358472e-06, "loss": 0.014699770137667656, "memory(GiB)": 21.48, "step": 16747, "token_acc": 1.0, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.5440665302277231, "grad_norm": 0.4322228729724884, "learning_rate": 4.685168214733677e-06, "loss": 0.023703940212726593, "memory(GiB)": 21.48, "step": 16748, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.5440990156904785, "grad_norm": 0.39425361156463623, "learning_rate": 4.68463212951621e-06, "loss": 0.021300237625837326, "memory(GiB)": 21.48, "step": 16749, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.5441315011532339, "grad_norm": 0.3153837323188782, "learning_rate": 4.684096047938509e-06, "loss": 0.020202744752168655, "memory(GiB)": 21.48, "step": 16750, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.5441639866159893, "grad_norm": 0.3548915982246399, "learning_rate": 4.683559970006759e-06, "loss": 0.021051649004220963, "memory(GiB)": 21.48, "step": 16751, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.5441964720787448, "grad_norm": 0.41221508383750916, "learning_rate": 4.6830238957271505e-06, "loss": 0.02096519060432911, "memory(GiB)": 21.48, "step": 16752, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.5442289575415001, "grad_norm": 0.2567678689956665, "learning_rate": 4.6824878251058646e-06, "loss": 0.01237793080508709, "memory(GiB)": 21.48, "step": 16753, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956091 }, { "epoch": 0.5442614430042556, "grad_norm": 0.42747369408607483, "learning_rate": 4.6819517581490955e-06, "loss": 0.02405995875597, "memory(GiB)": 21.48, "step": 16754, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.544293928467011, "grad_norm": 0.3754931688308716, "learning_rate": 4.6814156948630225e-06, "loss": 0.013471605256199837, "memory(GiB)": 21.48, "step": 16755, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.5443264139297664, "grad_norm": 0.3006363809108734, "learning_rate": 4.680879635253841e-06, "loss": 0.019808579236268997, "memory(GiB)": 21.48, "step": 16756, "token_acc": 1.0, "train_speed(iter/s)": 0.956119 }, { "epoch": 0.5443588993925218, "grad_norm": 0.4319727122783661, "learning_rate": 4.6803435793277294e-06, "loss": 0.01535620354115963, "memory(GiB)": 21.48, "step": 16757, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956129 }, { "epoch": 0.5443913848552773, "grad_norm": 0.3602900803089142, "learning_rate": 4.6798075270908785e-06, "loss": 0.02050594426691532, "memory(GiB)": 21.48, "step": 16758, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956139 }, { "epoch": 0.5444238703180326, "grad_norm": 0.3634966015815735, "learning_rate": 4.679271478549477e-06, "loss": 0.0189640112221241, "memory(GiB)": 21.48, "step": 16759, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956147 }, { "epoch": 0.5444563557807881, "grad_norm": 0.29624783992767334, "learning_rate": 4.678735433709708e-06, "loss": 0.010455148294568062, "memory(GiB)": 21.48, "step": 16760, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.5444888412435435, "grad_norm": 0.3424312472343445, "learning_rate": 4.678199392577761e-06, "loss": 0.020955899730324745, "memory(GiB)": 21.48, "step": 16761, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956166 }, { "epoch": 0.5445213267062989, "grad_norm": 0.4433539807796478, "learning_rate": 4.67766335515982e-06, "loss": 0.01951025240123272, "memory(GiB)": 21.48, "step": 16762, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.5445538121690543, "grad_norm": 0.31085404753685, "learning_rate": 4.677127321462075e-06, "loss": 0.016691789031028748, "memory(GiB)": 21.48, "step": 16763, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.5445862976318098, "grad_norm": 5.939358711242676, "learning_rate": 4.676591291490708e-06, "loss": 0.025925029069185257, "memory(GiB)": 21.48, "step": 16764, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956196 }, { "epoch": 0.5446187830945651, "grad_norm": 0.45590680837631226, "learning_rate": 4.67605526525191e-06, "loss": 0.021193813532590866, "memory(GiB)": 21.48, "step": 16765, "token_acc": 0.9815950920245399, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.5446512685573206, "grad_norm": 0.31072118878364563, "learning_rate": 4.675519242751863e-06, "loss": 0.020800847560167313, "memory(GiB)": 21.48, "step": 16766, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.544683754020076, "grad_norm": 0.2881220579147339, "learning_rate": 4.674983223996758e-06, "loss": 0.015407131984829903, "memory(GiB)": 21.48, "step": 16767, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.5447162394828314, "grad_norm": 0.4565969705581665, "learning_rate": 4.6744472089927775e-06, "loss": 0.028823193162679672, "memory(GiB)": 21.48, "step": 16768, "token_acc": 0.9691629955947136, "train_speed(iter/s)": 0.956243 }, { "epoch": 0.5447487249455869, "grad_norm": 0.6829410791397095, "learning_rate": 4.673911197746112e-06, "loss": 0.026379279792308807, "memory(GiB)": 21.48, "step": 16769, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.5447812104083423, "grad_norm": 0.3940593898296356, "learning_rate": 4.673375190262943e-06, "loss": 0.019207168370485306, "memory(GiB)": 21.48, "step": 16770, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.5448136958710977, "grad_norm": 0.3218836188316345, "learning_rate": 4.67283918654946e-06, "loss": 0.02246737852692604, "memory(GiB)": 21.48, "step": 16771, "token_acc": 1.0, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.5448461813338531, "grad_norm": 0.6400787234306335, "learning_rate": 4.672303186611847e-06, "loss": 0.016439765691757202, "memory(GiB)": 21.48, "step": 16772, "token_acc": 0.9968354430379747, "train_speed(iter/s)": 0.956291 }, { "epoch": 0.5448786667966086, "grad_norm": 0.3879238963127136, "learning_rate": 4.671767190456292e-06, "loss": 0.0214325413107872, "memory(GiB)": 21.48, "step": 16773, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956303 }, { "epoch": 0.5449111522593639, "grad_norm": 0.3404114246368408, "learning_rate": 4.671231198088981e-06, "loss": 0.029166266322135925, "memory(GiB)": 21.48, "step": 16774, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.5449436377221194, "grad_norm": 0.400534987449646, "learning_rate": 4.6706952095161e-06, "loss": 0.020447060465812683, "memory(GiB)": 21.48, "step": 16775, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956327 }, { "epoch": 0.5449761231848748, "grad_norm": 0.4799930155277252, "learning_rate": 4.670159224743834e-06, "loss": 0.017588932067155838, "memory(GiB)": 21.48, "step": 16776, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956338 }, { "epoch": 0.5450086086476302, "grad_norm": 0.3823026418685913, "learning_rate": 4.669623243778369e-06, "loss": 0.022955594584345818, "memory(GiB)": 21.48, "step": 16777, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.5450410941103856, "grad_norm": 0.2534503638744354, "learning_rate": 4.669087266625892e-06, "loss": 0.01735268346965313, "memory(GiB)": 21.48, "step": 16778, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.5450735795731411, "grad_norm": 0.4172409772872925, "learning_rate": 4.668551293292589e-06, "loss": 0.021421730518341064, "memory(GiB)": 21.48, "step": 16779, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.5451060650358964, "grad_norm": 0.391073077917099, "learning_rate": 4.668015323784646e-06, "loss": 0.026787379756569862, "memory(GiB)": 21.48, "step": 16780, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956386 }, { "epoch": 0.5451385504986519, "grad_norm": 0.43882322311401367, "learning_rate": 4.667479358108245e-06, "loss": 0.018038691952824593, "memory(GiB)": 21.48, "step": 16781, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.956398 }, { "epoch": 0.5451710359614073, "grad_norm": 0.45220327377319336, "learning_rate": 4.666943396269578e-06, "loss": 0.017907531931996346, "memory(GiB)": 21.48, "step": 16782, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.5452035214241627, "grad_norm": 0.29370540380477905, "learning_rate": 4.666407438274824e-06, "loss": 0.019612278789281845, "memory(GiB)": 21.48, "step": 16783, "token_acc": 1.0, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.5452360068869181, "grad_norm": 0.2975381314754486, "learning_rate": 4.665871484130176e-06, "loss": 0.018513765186071396, "memory(GiB)": 21.48, "step": 16784, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.5452684923496736, "grad_norm": 0.2944582998752594, "learning_rate": 4.665335533841814e-06, "loss": 0.016845393925905228, "memory(GiB)": 21.48, "step": 16785, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.5453009778124289, "grad_norm": 0.47532299160957336, "learning_rate": 4.6647995874159255e-06, "loss": 0.027140213176608086, "memory(GiB)": 21.48, "step": 16786, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956448 }, { "epoch": 0.5453334632751844, "grad_norm": 0.3686317503452301, "learning_rate": 4.664263644858695e-06, "loss": 0.023336224257946014, "memory(GiB)": 21.48, "step": 16787, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.5453659487379398, "grad_norm": 0.28644436597824097, "learning_rate": 4.6637277061763085e-06, "loss": 0.01834847778081894, "memory(GiB)": 21.48, "step": 16788, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.5453984342006952, "grad_norm": 0.42111751437187195, "learning_rate": 4.663191771374955e-06, "loss": 0.023258492350578308, "memory(GiB)": 21.48, "step": 16789, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956477 }, { "epoch": 0.5454309196634506, "grad_norm": 0.41616466641426086, "learning_rate": 4.662655840460813e-06, "loss": 0.022882826626300812, "memory(GiB)": 21.48, "step": 16790, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.5454634051262061, "grad_norm": 0.45339488983154297, "learning_rate": 4.662119913440075e-06, "loss": 0.024349380284547806, "memory(GiB)": 21.48, "step": 16791, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.5454958905889614, "grad_norm": 0.41486990451812744, "learning_rate": 4.66158399031892e-06, "loss": 0.020046230405569077, "memory(GiB)": 21.48, "step": 16792, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956504 }, { "epoch": 0.5455283760517169, "grad_norm": 0.3233709931373596, "learning_rate": 4.661048071103538e-06, "loss": 0.018103422597050667, "memory(GiB)": 21.48, "step": 16793, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.5455608615144723, "grad_norm": 0.3877279758453369, "learning_rate": 4.660512155800112e-06, "loss": 0.01770998351275921, "memory(GiB)": 21.48, "step": 16794, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.5455933469772277, "grad_norm": 0.29741451144218445, "learning_rate": 4.659976244414828e-06, "loss": 0.01476688589900732, "memory(GiB)": 21.48, "step": 16795, "token_acc": 1.0, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.5456258324399831, "grad_norm": 0.32060039043426514, "learning_rate": 4.65944033695387e-06, "loss": 0.017358146607875824, "memory(GiB)": 21.48, "step": 16796, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.5456583179027386, "grad_norm": 0.3339194059371948, "learning_rate": 4.658904433423424e-06, "loss": 0.01876836270093918, "memory(GiB)": 21.48, "step": 16797, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.5456908033654939, "grad_norm": 0.3464908003807068, "learning_rate": 4.658368533829674e-06, "loss": 0.021541405469179153, "memory(GiB)": 21.48, "step": 16798, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95656 }, { "epoch": 0.5457232888282494, "grad_norm": 0.36141082644462585, "learning_rate": 4.6578326381788075e-06, "loss": 0.01620221883058548, "memory(GiB)": 21.48, "step": 16799, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.5457557742910047, "grad_norm": 0.33560842275619507, "learning_rate": 4.657296746477007e-06, "loss": 0.014199336990714073, "memory(GiB)": 21.48, "step": 16800, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956578 }, { "epoch": 0.5457882597537602, "grad_norm": 0.4008716642856598, "learning_rate": 4.656760858730458e-06, "loss": 0.018345778807997704, "memory(GiB)": 21.48, "step": 16801, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.5458207452165156, "grad_norm": 0.33581820130348206, "learning_rate": 4.656224974945344e-06, "loss": 0.023462001234292984, "memory(GiB)": 21.48, "step": 16802, "token_acc": 1.0, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.545853230679271, "grad_norm": 0.40801486372947693, "learning_rate": 4.655689095127853e-06, "loss": 0.01781010627746582, "memory(GiB)": 21.48, "step": 16803, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.5458857161420264, "grad_norm": 0.2916942238807678, "learning_rate": 4.655153219284169e-06, "loss": 0.01592589169740677, "memory(GiB)": 21.48, "step": 16804, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.5459182016047819, "grad_norm": 0.3798670172691345, "learning_rate": 4.654617347420475e-06, "loss": 0.013661115430295467, "memory(GiB)": 21.48, "step": 16805, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.5459506870675372, "grad_norm": 0.3435945510864258, "learning_rate": 4.654081479542957e-06, "loss": 0.018855633214116096, "memory(GiB)": 21.48, "step": 16806, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.5459831725302927, "grad_norm": 0.6042318940162659, "learning_rate": 4.653545615657798e-06, "loss": 0.018349718302488327, "memory(GiB)": 21.48, "step": 16807, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956643 }, { "epoch": 0.5460156579930481, "grad_norm": 0.2675512731075287, "learning_rate": 4.6530097557711855e-06, "loss": 0.01741022989153862, "memory(GiB)": 21.48, "step": 16808, "token_acc": 0.9936708860759493, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.5460481434558035, "grad_norm": 0.42067277431488037, "learning_rate": 4.652473899889299e-06, "loss": 0.027905020862817764, "memory(GiB)": 21.48, "step": 16809, "token_acc": 1.0, "train_speed(iter/s)": 0.956661 }, { "epoch": 0.5460806289185589, "grad_norm": 1.408062219619751, "learning_rate": 4.651938048018331e-06, "loss": 0.021598879247903824, "memory(GiB)": 21.48, "step": 16810, "token_acc": 1.0, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.5461131143813144, "grad_norm": 0.4204803705215454, "learning_rate": 4.651402200164457e-06, "loss": 0.02157367393374443, "memory(GiB)": 21.48, "step": 16811, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956681 }, { "epoch": 0.5461455998440697, "grad_norm": 0.40631556510925293, "learning_rate": 4.650866356333868e-06, "loss": 0.018865374848246574, "memory(GiB)": 21.48, "step": 16812, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.5461780853068252, "grad_norm": 0.5962962508201599, "learning_rate": 4.650330516532744e-06, "loss": 0.01716533675789833, "memory(GiB)": 21.48, "step": 16813, "token_acc": 1.0, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.5462105707695806, "grad_norm": 0.36239856481552124, "learning_rate": 4.649794680767273e-06, "loss": 0.01654873602092266, "memory(GiB)": 21.48, "step": 16814, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.546243056232336, "grad_norm": 0.416898638010025, "learning_rate": 4.649258849043637e-06, "loss": 0.023001540452241898, "memory(GiB)": 21.48, "step": 16815, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.5462755416950914, "grad_norm": 0.3492121398448944, "learning_rate": 4.648723021368021e-06, "loss": 0.023307140916585922, "memory(GiB)": 21.48, "step": 16816, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.5463080271578469, "grad_norm": 0.47041070461273193, "learning_rate": 4.648187197746608e-06, "loss": 0.016537168994545937, "memory(GiB)": 21.48, "step": 16817, "token_acc": 1.0, "train_speed(iter/s)": 0.956738 }, { "epoch": 0.5463405126206022, "grad_norm": 0.4761108458042145, "learning_rate": 4.647651378185583e-06, "loss": 0.024604346603155136, "memory(GiB)": 21.48, "step": 16818, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956747 }, { "epoch": 0.5463729980833577, "grad_norm": 0.4152655601501465, "learning_rate": 4.647115562691131e-06, "loss": 0.024587862193584442, "memory(GiB)": 21.48, "step": 16819, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.5464054835461131, "grad_norm": 0.3944617807865143, "learning_rate": 4.646579751269434e-06, "loss": 0.018465010449290276, "memory(GiB)": 21.48, "step": 16820, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.5464379690088685, "grad_norm": 0.4503342807292938, "learning_rate": 4.646043943926678e-06, "loss": 0.017896704375743866, "memory(GiB)": 21.48, "step": 16821, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.5464704544716239, "grad_norm": 0.2535695731639862, "learning_rate": 4.645508140669045e-06, "loss": 0.01526192668825388, "memory(GiB)": 21.48, "step": 16822, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.5465029399343794, "grad_norm": 0.26925522089004517, "learning_rate": 4.644972341502721e-06, "loss": 0.01059349812567234, "memory(GiB)": 21.48, "step": 16823, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.5465354253971347, "grad_norm": 0.37047573924064636, "learning_rate": 4.644436546433888e-06, "loss": 0.021629979833960533, "memory(GiB)": 21.48, "step": 16824, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.956804 }, { "epoch": 0.5465679108598902, "grad_norm": 0.33157074451446533, "learning_rate": 4.643900755468731e-06, "loss": 0.018435712903738022, "memory(GiB)": 21.48, "step": 16825, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.5466003963226456, "grad_norm": 0.3542448580265045, "learning_rate": 4.6433649686134325e-06, "loss": 0.019397713243961334, "memory(GiB)": 21.48, "step": 16826, "token_acc": 1.0, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.546632881785401, "grad_norm": 0.38399815559387207, "learning_rate": 4.642829185874178e-06, "loss": 0.01949142850935459, "memory(GiB)": 21.48, "step": 16827, "token_acc": 1.0, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.5466653672481564, "grad_norm": 0.38350340723991394, "learning_rate": 4.642293407257149e-06, "loss": 0.01674165204167366, "memory(GiB)": 21.48, "step": 16828, "token_acc": 1.0, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.5466978527109119, "grad_norm": 0.4241160750389099, "learning_rate": 4.6417576327685314e-06, "loss": 0.017801877111196518, "memory(GiB)": 21.48, "step": 16829, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.5467303381736672, "grad_norm": 0.4321448802947998, "learning_rate": 4.641221862414507e-06, "loss": 0.028367018327116966, "memory(GiB)": 21.48, "step": 16830, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.5467628236364227, "grad_norm": 0.2831467390060425, "learning_rate": 4.640686096201261e-06, "loss": 0.020388733595609665, "memory(GiB)": 21.48, "step": 16831, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.5467953090991782, "grad_norm": 0.30245068669319153, "learning_rate": 4.640150334134973e-06, "loss": 0.01583092287182808, "memory(GiB)": 21.48, "step": 16832, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.5468277945619335, "grad_norm": 0.3583574891090393, "learning_rate": 4.639614576221832e-06, "loss": 0.019560806453227997, "memory(GiB)": 21.48, "step": 16833, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.546860280024689, "grad_norm": 0.4045851528644562, "learning_rate": 4.639078822468015e-06, "loss": 0.027384117245674133, "memory(GiB)": 21.48, "step": 16834, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.5468927654874444, "grad_norm": 0.3684837818145752, "learning_rate": 4.638543072879711e-06, "loss": 0.021271657198667526, "memory(GiB)": 21.48, "step": 16835, "token_acc": 1.0, "train_speed(iter/s)": 0.956932 }, { "epoch": 0.5469252509501998, "grad_norm": 0.5091747045516968, "learning_rate": 4.638007327463101e-06, "loss": 0.022065650671720505, "memory(GiB)": 21.48, "step": 16836, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956944 }, { "epoch": 0.5469577364129552, "grad_norm": 0.6463502645492554, "learning_rate": 4.637471586224367e-06, "loss": 0.02589694783091545, "memory(GiB)": 21.48, "step": 16837, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956956 }, { "epoch": 0.5469902218757107, "grad_norm": 0.35489189624786377, "learning_rate": 4.636935849169697e-06, "loss": 0.01910863257944584, "memory(GiB)": 21.48, "step": 16838, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.547022707338466, "grad_norm": 0.23498302698135376, "learning_rate": 4.636400116305266e-06, "loss": 0.011372779496014118, "memory(GiB)": 21.48, "step": 16839, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.5470551928012215, "grad_norm": 0.3042210638523102, "learning_rate": 4.635864387637265e-06, "loss": 0.016076665371656418, "memory(GiB)": 21.48, "step": 16840, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.5470876782639769, "grad_norm": 0.2049073874950409, "learning_rate": 4.635328663171871e-06, "loss": 0.01178588904440403, "memory(GiB)": 21.48, "step": 16841, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957004 }, { "epoch": 0.5471201637267323, "grad_norm": 1.2350451946258545, "learning_rate": 4.6347929429152726e-06, "loss": 0.023044954985380173, "memory(GiB)": 21.48, "step": 16842, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.5471526491894877, "grad_norm": 0.42287710309028625, "learning_rate": 4.634257226873647e-06, "loss": 0.025861036032438278, "memory(GiB)": 21.48, "step": 16843, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.5471851346522432, "grad_norm": 0.3048291504383087, "learning_rate": 4.633721515053181e-06, "loss": 0.01619817689061165, "memory(GiB)": 21.48, "step": 16844, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.5472176201149985, "grad_norm": 0.3807629942893982, "learning_rate": 4.633185807460056e-06, "loss": 0.024530567228794098, "memory(GiB)": 21.48, "step": 16845, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.547250105577754, "grad_norm": 0.5959275364875793, "learning_rate": 4.6326501041004555e-06, "loss": 0.02032928541302681, "memory(GiB)": 21.48, "step": 16846, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957054 }, { "epoch": 0.5472825910405094, "grad_norm": 0.4013631045818329, "learning_rate": 4.63211440498056e-06, "loss": 0.026686809957027435, "memory(GiB)": 21.48, "step": 16847, "token_acc": 1.0, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.5473150765032648, "grad_norm": 0.404915452003479, "learning_rate": 4.631578710106557e-06, "loss": 0.023213496431708336, "memory(GiB)": 21.48, "step": 16848, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957072 }, { "epoch": 0.5473475619660202, "grad_norm": 0.506325900554657, "learning_rate": 4.631043019484623e-06, "loss": 0.025666335597634315, "memory(GiB)": 21.48, "step": 16849, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957082 }, { "epoch": 0.5473800474287757, "grad_norm": 0.31176137924194336, "learning_rate": 4.630507333120944e-06, "loss": 0.021114863455295563, "memory(GiB)": 21.48, "step": 16850, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.547412532891531, "grad_norm": 0.2422362118959427, "learning_rate": 4.629971651021704e-06, "loss": 0.01664721965789795, "memory(GiB)": 21.48, "step": 16851, "token_acc": 0.99, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.5474450183542865, "grad_norm": 0.28942447900772095, "learning_rate": 4.629435973193081e-06, "loss": 0.018088843673467636, "memory(GiB)": 21.48, "step": 16852, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.5474775038170419, "grad_norm": 0.3905206322669983, "learning_rate": 4.6289002996412626e-06, "loss": 0.022678317502141, "memory(GiB)": 21.48, "step": 16853, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.5475099892797973, "grad_norm": 0.2647734582424164, "learning_rate": 4.628364630372428e-06, "loss": 0.01998787187039852, "memory(GiB)": 21.48, "step": 16854, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.5475424747425527, "grad_norm": 0.470152884721756, "learning_rate": 4.627828965392761e-06, "loss": 0.026344094425439835, "memory(GiB)": 21.48, "step": 16855, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.5475749602053082, "grad_norm": 0.3847660422325134, "learning_rate": 4.627293304708441e-06, "loss": 0.019319549202919006, "memory(GiB)": 21.48, "step": 16856, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.5476074456680635, "grad_norm": 0.2998385429382324, "learning_rate": 4.6267576483256545e-06, "loss": 0.015454549342393875, "memory(GiB)": 21.48, "step": 16857, "token_acc": 1.0, "train_speed(iter/s)": 0.957152 }, { "epoch": 0.547639931130819, "grad_norm": 0.42495036125183105, "learning_rate": 4.626221996250579e-06, "loss": 0.021789787337183952, "memory(GiB)": 21.48, "step": 16858, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.5476724165935744, "grad_norm": 0.34412455558776855, "learning_rate": 4.6256863484894026e-06, "loss": 0.018508268520236015, "memory(GiB)": 21.48, "step": 16859, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957171 }, { "epoch": 0.5477049020563298, "grad_norm": 0.2744629979133606, "learning_rate": 4.625150705048301e-06, "loss": 0.017945922911167145, "memory(GiB)": 21.48, "step": 16860, "token_acc": 1.0, "train_speed(iter/s)": 0.95718 }, { "epoch": 0.5477373875190852, "grad_norm": 0.4048035144805908, "learning_rate": 4.624615065933461e-06, "loss": 0.024587925523519516, "memory(GiB)": 21.48, "step": 16861, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.5477698729818407, "grad_norm": 0.47953304648399353, "learning_rate": 4.624079431151061e-06, "loss": 0.025662241503596306, "memory(GiB)": 21.48, "step": 16862, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.547802358444596, "grad_norm": 0.27249574661254883, "learning_rate": 4.623543800707287e-06, "loss": 0.016146816313266754, "memory(GiB)": 21.48, "step": 16863, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.5478348439073515, "grad_norm": 0.3996172249317169, "learning_rate": 4.623008174608317e-06, "loss": 0.02122369036078453, "memory(GiB)": 21.48, "step": 16864, "token_acc": 1.0, "train_speed(iter/s)": 0.957222 }, { "epoch": 0.5478673293701068, "grad_norm": 0.47687411308288574, "learning_rate": 4.622472552860333e-06, "loss": 0.019562287256121635, "memory(GiB)": 21.48, "step": 16865, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.957231 }, { "epoch": 0.5478998148328623, "grad_norm": 0.28496578335762024, "learning_rate": 4.621936935469522e-06, "loss": 0.015208248049020767, "memory(GiB)": 21.48, "step": 16866, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.5479323002956177, "grad_norm": 0.38229501247406006, "learning_rate": 4.621401322442058e-06, "loss": 0.025140468031167984, "memory(GiB)": 21.48, "step": 16867, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.5479647857583732, "grad_norm": 0.2778821587562561, "learning_rate": 4.62086571378413e-06, "loss": 0.020145682618021965, "memory(GiB)": 21.48, "step": 16868, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.95726 }, { "epoch": 0.5479972712211285, "grad_norm": 0.3387918770313263, "learning_rate": 4.620330109501915e-06, "loss": 0.01912342943251133, "memory(GiB)": 21.48, "step": 16869, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.548029756683884, "grad_norm": 0.4607792794704437, "learning_rate": 4.619794509601596e-06, "loss": 0.015030164271593094, "memory(GiB)": 21.48, "step": 16870, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957279 }, { "epoch": 0.5480622421466393, "grad_norm": 0.35379651188850403, "learning_rate": 4.619258914089353e-06, "loss": 0.019395949319005013, "memory(GiB)": 21.48, "step": 16871, "token_acc": 1.0, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.5480947276093948, "grad_norm": 0.3175031840801239, "learning_rate": 4.6187233229713715e-06, "loss": 0.0163209680467844, "memory(GiB)": 21.48, "step": 16872, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.5481272130721502, "grad_norm": 0.46093013882637024, "learning_rate": 4.618187736253828e-06, "loss": 0.025086045265197754, "memory(GiB)": 21.48, "step": 16873, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.5481596985349056, "grad_norm": 0.414731502532959, "learning_rate": 4.617652153942907e-06, "loss": 0.0212238859385252, "memory(GiB)": 21.48, "step": 16874, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.548192183997661, "grad_norm": 0.39260026812553406, "learning_rate": 4.61711657604479e-06, "loss": 0.017166800796985626, "memory(GiB)": 21.48, "step": 16875, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957324 }, { "epoch": 0.5482246694604165, "grad_norm": 0.3687138855457306, "learning_rate": 4.6165810025656565e-06, "loss": 0.012444664724171162, "memory(GiB)": 21.48, "step": 16876, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.5482571549231718, "grad_norm": 0.6804946660995483, "learning_rate": 4.616045433511688e-06, "loss": 0.016995809972286224, "memory(GiB)": 21.48, "step": 16877, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957343 }, { "epoch": 0.5482896403859273, "grad_norm": 0.3802049458026886, "learning_rate": 4.615509868889067e-06, "loss": 0.020734000951051712, "memory(GiB)": 21.48, "step": 16878, "token_acc": 1.0, "train_speed(iter/s)": 0.957352 }, { "epoch": 0.5483221258486827, "grad_norm": 0.28866952657699585, "learning_rate": 4.614974308703974e-06, "loss": 0.015450503677129745, "memory(GiB)": 21.48, "step": 16879, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957362 }, { "epoch": 0.5483546113114381, "grad_norm": 0.3474087417125702, "learning_rate": 4.614438752962589e-06, "loss": 0.021386131644248962, "memory(GiB)": 21.48, "step": 16880, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.5483870967741935, "grad_norm": 0.3036988079547882, "learning_rate": 4.613903201671094e-06, "loss": 0.015445267781615257, "memory(GiB)": 21.48, "step": 16881, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.548419582236949, "grad_norm": 0.36357709765434265, "learning_rate": 4.61336765483567e-06, "loss": 0.019810480996966362, "memory(GiB)": 21.48, "step": 16882, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.5484520676997043, "grad_norm": 0.3082219958305359, "learning_rate": 4.612832112462499e-06, "loss": 0.015382295474410057, "memory(GiB)": 21.48, "step": 16883, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957397 }, { "epoch": 0.5484845531624598, "grad_norm": 0.3532159924507141, "learning_rate": 4.61229657455776e-06, "loss": 0.02108953706920147, "memory(GiB)": 21.48, "step": 16884, "token_acc": 1.0, "train_speed(iter/s)": 0.957406 }, { "epoch": 0.5485170386252152, "grad_norm": 0.2879994511604309, "learning_rate": 4.611761041127635e-06, "loss": 0.01614425703883171, "memory(GiB)": 21.48, "step": 16885, "token_acc": 1.0, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.5485495240879706, "grad_norm": 0.3562847673892975, "learning_rate": 4.611225512178304e-06, "loss": 0.01665453240275383, "memory(GiB)": 21.48, "step": 16886, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.548582009550726, "grad_norm": 0.36672189831733704, "learning_rate": 4.6106899877159495e-06, "loss": 0.019810179248452187, "memory(GiB)": 21.48, "step": 16887, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957435 }, { "epoch": 0.5486144950134815, "grad_norm": 0.4266105592250824, "learning_rate": 4.610154467746747e-06, "loss": 0.016227243468165398, "memory(GiB)": 21.48, "step": 16888, "token_acc": 1.0, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.5486469804762368, "grad_norm": 0.3148987293243408, "learning_rate": 4.609618952276885e-06, "loss": 0.016561856493353844, "memory(GiB)": 21.48, "step": 16889, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.5486794659389923, "grad_norm": 0.5952337980270386, "learning_rate": 4.6090834413125365e-06, "loss": 0.035134583711624146, "memory(GiB)": 21.48, "step": 16890, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.5487119514017477, "grad_norm": 0.3221687972545624, "learning_rate": 4.608547934859889e-06, "loss": 0.022569511085748672, "memory(GiB)": 21.48, "step": 16891, "token_acc": 1.0, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.5487444368645031, "grad_norm": 0.43338799476623535, "learning_rate": 4.608012432925115e-06, "loss": 0.030707882717251778, "memory(GiB)": 21.48, "step": 16892, "token_acc": 0.9764705882352941, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.5487769223272585, "grad_norm": 0.3952304720878601, "learning_rate": 4.607476935514403e-06, "loss": 0.015249572694301605, "memory(GiB)": 21.48, "step": 16893, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.548809407790014, "grad_norm": 0.457643061876297, "learning_rate": 4.606941442633928e-06, "loss": 0.030224941670894623, "memory(GiB)": 21.48, "step": 16894, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.5488418932527693, "grad_norm": 0.3355112373828888, "learning_rate": 4.606405954289872e-06, "loss": 0.01399377454072237, "memory(GiB)": 21.48, "step": 16895, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.5488743787155248, "grad_norm": 0.2643108665943146, "learning_rate": 4.605870470488415e-06, "loss": 0.014391136355698109, "memory(GiB)": 21.48, "step": 16896, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.5489068641782803, "grad_norm": 0.2574581503868103, "learning_rate": 4.605334991235736e-06, "loss": 0.016505271196365356, "memory(GiB)": 21.48, "step": 16897, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.5489393496410356, "grad_norm": 0.3596246838569641, "learning_rate": 4.60479951653802e-06, "loss": 0.024735672399401665, "memory(GiB)": 21.48, "step": 16898, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.5489718351037911, "grad_norm": 0.25238141417503357, "learning_rate": 4.604264046401441e-06, "loss": 0.013994617387652397, "memory(GiB)": 21.48, "step": 16899, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.5490043205665465, "grad_norm": 0.2665998637676239, "learning_rate": 4.603728580832183e-06, "loss": 0.017658255994319916, "memory(GiB)": 21.48, "step": 16900, "token_acc": 1.0, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.5490368060293019, "grad_norm": 0.34549641609191895, "learning_rate": 4.603193119836423e-06, "loss": 0.013996107503771782, "memory(GiB)": 21.48, "step": 16901, "token_acc": 1.0, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.5490692914920573, "grad_norm": 0.3059801757335663, "learning_rate": 4.602657663420344e-06, "loss": 0.018276941031217575, "memory(GiB)": 21.48, "step": 16902, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.5491017769548128, "grad_norm": 0.2820185720920563, "learning_rate": 4.602122211590123e-06, "loss": 0.019105853512883186, "memory(GiB)": 21.48, "step": 16903, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.5491342624175681, "grad_norm": 0.33738312125205994, "learning_rate": 4.601586764351943e-06, "loss": 0.02032463438808918, "memory(GiB)": 21.48, "step": 16904, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.5491667478803236, "grad_norm": 0.3857310116291046, "learning_rate": 4.601051321711981e-06, "loss": 0.016612309962511063, "memory(GiB)": 21.48, "step": 16905, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.549199233343079, "grad_norm": 0.31178975105285645, "learning_rate": 4.600515883676418e-06, "loss": 0.014783807098865509, "memory(GiB)": 21.48, "step": 16906, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.5492317188058344, "grad_norm": 0.2875937521457672, "learning_rate": 4.5999804502514324e-06, "loss": 0.017615800723433495, "memory(GiB)": 21.48, "step": 16907, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.5492642042685898, "grad_norm": 0.45030197501182556, "learning_rate": 4.599445021443207e-06, "loss": 0.025384576991200447, "memory(GiB)": 21.48, "step": 16908, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957668 }, { "epoch": 0.5492966897313453, "grad_norm": 0.41560736298561096, "learning_rate": 4.5989095972579166e-06, "loss": 0.02089116908609867, "memory(GiB)": 21.48, "step": 16909, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.5493291751941006, "grad_norm": 0.43333402276039124, "learning_rate": 4.598374177701745e-06, "loss": 0.025031764060258865, "memory(GiB)": 21.48, "step": 16910, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.5493616606568561, "grad_norm": 0.4295622706413269, "learning_rate": 4.5978387627808696e-06, "loss": 0.022379904985427856, "memory(GiB)": 21.48, "step": 16911, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.5493941461196115, "grad_norm": 0.4108676612377167, "learning_rate": 4.597303352501469e-06, "loss": 0.019263071939349174, "memory(GiB)": 21.48, "step": 16912, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957703 }, { "epoch": 0.5494266315823669, "grad_norm": 0.40919673442840576, "learning_rate": 4.596767946869726e-06, "loss": 0.018707606941461563, "memory(GiB)": 21.48, "step": 16913, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.5494591170451223, "grad_norm": 0.35237744450569153, "learning_rate": 4.5962325458918155e-06, "loss": 0.01855590008199215, "memory(GiB)": 21.48, "step": 16914, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.5494916025078778, "grad_norm": 0.48186907172203064, "learning_rate": 4.59569714957392e-06, "loss": 0.027877820655703545, "memory(GiB)": 21.48, "step": 16915, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957729 }, { "epoch": 0.5495240879706331, "grad_norm": 0.34805646538734436, "learning_rate": 4.595161757922217e-06, "loss": 0.02533925697207451, "memory(GiB)": 21.48, "step": 16916, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.957737 }, { "epoch": 0.5495565734333886, "grad_norm": 0.38768142461776733, "learning_rate": 4.594626370942888e-06, "loss": 0.01999293826520443, "memory(GiB)": 21.48, "step": 16917, "token_acc": 1.0, "train_speed(iter/s)": 0.957746 }, { "epoch": 0.549589058896144, "grad_norm": 0.43549036979675293, "learning_rate": 4.594090988642107e-06, "loss": 0.018144886940717697, "memory(GiB)": 21.48, "step": 16918, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957755 }, { "epoch": 0.5496215443588994, "grad_norm": 0.3695293366909027, "learning_rate": 4.59355561102606e-06, "loss": 0.017809676006436348, "memory(GiB)": 21.48, "step": 16919, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957764 }, { "epoch": 0.5496540298216548, "grad_norm": 0.3227598965167999, "learning_rate": 4.593020238100918e-06, "loss": 0.023093681782484055, "memory(GiB)": 21.48, "step": 16920, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.5496865152844103, "grad_norm": 0.42934536933898926, "learning_rate": 4.592484869872868e-06, "loss": 0.026160866022109985, "memory(GiB)": 21.48, "step": 16921, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957782 }, { "epoch": 0.5497190007471656, "grad_norm": 0.38445934653282166, "learning_rate": 4.591949506348083e-06, "loss": 0.02488814853131771, "memory(GiB)": 21.48, "step": 16922, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.5497514862099211, "grad_norm": 0.3312850296497345, "learning_rate": 4.591414147532744e-06, "loss": 0.019471941515803337, "memory(GiB)": 21.48, "step": 16923, "token_acc": 1.0, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.5497839716726765, "grad_norm": 0.38932979106903076, "learning_rate": 4.5908787934330295e-06, "loss": 0.01607702672481537, "memory(GiB)": 21.48, "step": 16924, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957815 }, { "epoch": 0.5498164571354319, "grad_norm": 0.3306044340133667, "learning_rate": 4.590343444055119e-06, "loss": 0.013048054650425911, "memory(GiB)": 21.48, "step": 16925, "token_acc": 1.0, "train_speed(iter/s)": 0.957826 }, { "epoch": 0.5498489425981873, "grad_norm": 0.41356170177459717, "learning_rate": 4.589808099405189e-06, "loss": 0.02000931277871132, "memory(GiB)": 21.48, "step": 16926, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957835 }, { "epoch": 0.5498814280609428, "grad_norm": 0.3529287874698639, "learning_rate": 4.58927275948942e-06, "loss": 0.022901155054569244, "memory(GiB)": 21.48, "step": 16927, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.5499139135236981, "grad_norm": 0.48646512627601624, "learning_rate": 4.588737424313991e-06, "loss": 0.026618240401148796, "memory(GiB)": 21.48, "step": 16928, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.5499463989864536, "grad_norm": 0.3097262978553772, "learning_rate": 4.588202093885078e-06, "loss": 0.016790077090263367, "memory(GiB)": 21.48, "step": 16929, "token_acc": 1.0, "train_speed(iter/s)": 0.957864 }, { "epoch": 0.549978884449209, "grad_norm": 0.2520141899585724, "learning_rate": 4.587666768208863e-06, "loss": 0.013243664056062698, "memory(GiB)": 21.48, "step": 16930, "token_acc": 1.0, "train_speed(iter/s)": 0.957873 }, { "epoch": 0.5500113699119644, "grad_norm": 0.2640310823917389, "learning_rate": 4.58713144729152e-06, "loss": 0.015009650960564613, "memory(GiB)": 21.48, "step": 16931, "token_acc": 1.0, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.5500438553747198, "grad_norm": 0.36983758211135864, "learning_rate": 4.586596131139232e-06, "loss": 0.021923307329416275, "memory(GiB)": 21.48, "step": 16932, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.5500763408374753, "grad_norm": 0.33125364780426025, "learning_rate": 4.586060819758173e-06, "loss": 0.015200862661004066, "memory(GiB)": 21.48, "step": 16933, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.5501088263002306, "grad_norm": 0.4084625244140625, "learning_rate": 4.5855255131545235e-06, "loss": 0.020889893174171448, "memory(GiB)": 21.48, "step": 16934, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957911 }, { "epoch": 0.5501413117629861, "grad_norm": 0.4602504372596741, "learning_rate": 4.5849902113344614e-06, "loss": 0.01735939085483551, "memory(GiB)": 21.48, "step": 16935, "token_acc": 1.0, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.5501737972257414, "grad_norm": 0.3183304965496063, "learning_rate": 4.584454914304165e-06, "loss": 0.015308714471757412, "memory(GiB)": 21.48, "step": 16936, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.5502062826884969, "grad_norm": 0.4607716202735901, "learning_rate": 4.583919622069811e-06, "loss": 0.02114485204219818, "memory(GiB)": 21.48, "step": 16937, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.5502387681512523, "grad_norm": 0.296403706073761, "learning_rate": 4.58338433463758e-06, "loss": 0.015990007668733597, "memory(GiB)": 21.48, "step": 16938, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.5502712536140077, "grad_norm": 0.43230557441711426, "learning_rate": 4.582849052013647e-06, "loss": 0.015541225671768188, "memory(GiB)": 21.48, "step": 16939, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957957 }, { "epoch": 0.5503037390767631, "grad_norm": 0.4584532380104065, "learning_rate": 4.582313774204192e-06, "loss": 0.021582305431365967, "memory(GiB)": 21.48, "step": 16940, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.5503362245395186, "grad_norm": 0.45490172505378723, "learning_rate": 4.58177850121539e-06, "loss": 0.0297923032194376, "memory(GiB)": 21.48, "step": 16941, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.5503687100022739, "grad_norm": 0.4922512173652649, "learning_rate": 4.581243233053421e-06, "loss": 0.023906152695417404, "memory(GiB)": 21.48, "step": 16942, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.5504011954650294, "grad_norm": 0.28612858057022095, "learning_rate": 4.580707969724465e-06, "loss": 0.01642501726746559, "memory(GiB)": 21.48, "step": 16943, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.5504336809277848, "grad_norm": 0.3611474931240082, "learning_rate": 4.5801727112346955e-06, "loss": 0.018855253234505653, "memory(GiB)": 21.48, "step": 16944, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.958005 }, { "epoch": 0.5504661663905402, "grad_norm": 0.30938491225242615, "learning_rate": 4.579637457590294e-06, "loss": 0.018963254988193512, "memory(GiB)": 21.48, "step": 16945, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958012 }, { "epoch": 0.5504986518532956, "grad_norm": 0.31214645504951477, "learning_rate": 4.579102208797431e-06, "loss": 0.020113758742809296, "memory(GiB)": 21.48, "step": 16946, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.5505311373160511, "grad_norm": 0.41852426528930664, "learning_rate": 4.578566964862294e-06, "loss": 0.023692220449447632, "memory(GiB)": 21.48, "step": 16947, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958031 }, { "epoch": 0.5505636227788064, "grad_norm": 0.25481778383255005, "learning_rate": 4.578031725791051e-06, "loss": 0.014728314243257046, "memory(GiB)": 21.48, "step": 16948, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.5505961082415619, "grad_norm": 0.3957161009311676, "learning_rate": 4.577496491589886e-06, "loss": 0.015397804789245129, "memory(GiB)": 21.48, "step": 16949, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95805 }, { "epoch": 0.5506285937043173, "grad_norm": 0.3365034759044647, "learning_rate": 4.576961262264973e-06, "loss": 0.020693372935056686, "memory(GiB)": 21.48, "step": 16950, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.5506610791670727, "grad_norm": 0.43050262331962585, "learning_rate": 4.57642603782249e-06, "loss": 0.024225935339927673, "memory(GiB)": 21.48, "step": 16951, "token_acc": 1.0, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.5506935646298281, "grad_norm": 0.43545448780059814, "learning_rate": 4.575890818268614e-06, "loss": 0.017917010933160782, "memory(GiB)": 21.48, "step": 16952, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.958083 }, { "epoch": 0.5507260500925836, "grad_norm": 0.38609591126441956, "learning_rate": 4.5753556036095235e-06, "loss": 0.013285737484693527, "memory(GiB)": 21.48, "step": 16953, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958095 }, { "epoch": 0.5507585355553389, "grad_norm": 0.6157059669494629, "learning_rate": 4.574820393851394e-06, "loss": 0.036705031991004944, "memory(GiB)": 21.48, "step": 16954, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.5507910210180944, "grad_norm": 0.23007053136825562, "learning_rate": 4.574285189000402e-06, "loss": 0.016234107315540314, "memory(GiB)": 21.48, "step": 16955, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.5508235064808498, "grad_norm": 0.47787049412727356, "learning_rate": 4.573749989062727e-06, "loss": 0.021068425849080086, "memory(GiB)": 21.48, "step": 16956, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.5508559919436052, "grad_norm": 0.40758275985717773, "learning_rate": 4.573214794044544e-06, "loss": 0.022381912916898727, "memory(GiB)": 21.48, "step": 16957, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.5508884774063606, "grad_norm": 0.3724617063999176, "learning_rate": 4.5726796039520295e-06, "loss": 0.0197131484746933, "memory(GiB)": 21.48, "step": 16958, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958155 }, { "epoch": 0.5509209628691161, "grad_norm": 0.32184186577796936, "learning_rate": 4.5721444187913615e-06, "loss": 0.018339665606617928, "memory(GiB)": 21.48, "step": 16959, "token_acc": 0.996, "train_speed(iter/s)": 0.958167 }, { "epoch": 0.5509534483318715, "grad_norm": 0.3803083002567291, "learning_rate": 4.571609238568717e-06, "loss": 0.022463347762823105, "memory(GiB)": 21.48, "step": 16960, "token_acc": 1.0, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.5509859337946269, "grad_norm": 0.3266737759113312, "learning_rate": 4.571074063290272e-06, "loss": 0.01905655488371849, "memory(GiB)": 21.48, "step": 16961, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.5510184192573824, "grad_norm": 0.22702240943908691, "learning_rate": 4.570538892962203e-06, "loss": 0.009640727192163467, "memory(GiB)": 21.48, "step": 16962, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.5510509047201377, "grad_norm": 0.43306612968444824, "learning_rate": 4.570003727590687e-06, "loss": 0.01796267181634903, "memory(GiB)": 21.48, "step": 16963, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.958209 }, { "epoch": 0.5510833901828932, "grad_norm": 0.39749857783317566, "learning_rate": 4.5694685671819e-06, "loss": 0.023262884467840195, "memory(GiB)": 21.48, "step": 16964, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958218 }, { "epoch": 0.5511158756456486, "grad_norm": 0.2861350476741791, "learning_rate": 4.568933411742018e-06, "loss": 0.015344678424298763, "memory(GiB)": 21.48, "step": 16965, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.551148361108404, "grad_norm": 0.49165406823158264, "learning_rate": 4.56839826127722e-06, "loss": 0.02467285469174385, "memory(GiB)": 21.48, "step": 16966, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958236 }, { "epoch": 0.5511808465711594, "grad_norm": 0.24528715014457703, "learning_rate": 4.567863115793679e-06, "loss": 0.01654132828116417, "memory(GiB)": 21.48, "step": 16967, "token_acc": 1.0, "train_speed(iter/s)": 0.958239 }, { "epoch": 0.5512133320339149, "grad_norm": 0.3036417067050934, "learning_rate": 4.567327975297575e-06, "loss": 0.020606044679880142, "memory(GiB)": 21.48, "step": 16968, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.5512458174966702, "grad_norm": 0.25189951062202454, "learning_rate": 4.5667928397950785e-06, "loss": 0.014809135347604752, "memory(GiB)": 21.48, "step": 16969, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.5512783029594257, "grad_norm": 0.39523234963417053, "learning_rate": 4.566257709292373e-06, "loss": 0.01609688438475132, "memory(GiB)": 21.48, "step": 16970, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958254 }, { "epoch": 0.5513107884221811, "grad_norm": 0.34261271357536316, "learning_rate": 4.565722583795628e-06, "loss": 0.02096136473119259, "memory(GiB)": 21.48, "step": 16971, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.5513432738849365, "grad_norm": 0.3809593915939331, "learning_rate": 4.565187463311025e-06, "loss": 0.02621760405600071, "memory(GiB)": 21.48, "step": 16972, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.5513757593476919, "grad_norm": 0.3389165997505188, "learning_rate": 4.5646523478447355e-06, "loss": 0.01760774850845337, "memory(GiB)": 21.48, "step": 16973, "token_acc": 1.0, "train_speed(iter/s)": 0.958275 }, { "epoch": 0.5514082448104474, "grad_norm": 0.4209836423397064, "learning_rate": 4.564117237402936e-06, "loss": 0.022420261055231094, "memory(GiB)": 21.48, "step": 16974, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.5514407302732027, "grad_norm": 0.31482595205307007, "learning_rate": 4.5635821319918064e-06, "loss": 0.017934545874595642, "memory(GiB)": 21.48, "step": 16975, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.5514732157359582, "grad_norm": 0.386996865272522, "learning_rate": 4.563047031617518e-06, "loss": 0.014695988968014717, "memory(GiB)": 21.48, "step": 16976, "token_acc": 1.0, "train_speed(iter/s)": 0.958289 }, { "epoch": 0.5515057011987136, "grad_norm": 0.4570825397968292, "learning_rate": 4.562511936286251e-06, "loss": 0.02195940911769867, "memory(GiB)": 21.48, "step": 16977, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958294 }, { "epoch": 0.551538186661469, "grad_norm": 0.33750084042549133, "learning_rate": 4.561976846004176e-06, "loss": 0.018244538456201553, "memory(GiB)": 21.48, "step": 16978, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.5515706721242244, "grad_norm": 2.1364479064941406, "learning_rate": 4.561441760777473e-06, "loss": 0.018385138362646103, "memory(GiB)": 21.48, "step": 16979, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.5516031575869799, "grad_norm": 0.4626023471355438, "learning_rate": 4.560906680612314e-06, "loss": 0.027868788689374924, "memory(GiB)": 21.48, "step": 16980, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958319 }, { "epoch": 0.5516356430497352, "grad_norm": 0.3409593403339386, "learning_rate": 4.560371605514878e-06, "loss": 0.01640177145600319, "memory(GiB)": 21.48, "step": 16981, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958329 }, { "epoch": 0.5516681285124907, "grad_norm": 0.22753432393074036, "learning_rate": 4.559836535491337e-06, "loss": 0.012098428793251514, "memory(GiB)": 21.48, "step": 16982, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958341 }, { "epoch": 0.5517006139752461, "grad_norm": 0.26553940773010254, "learning_rate": 4.55930147054787e-06, "loss": 0.020446915179491043, "memory(GiB)": 21.48, "step": 16983, "token_acc": 1.0, "train_speed(iter/s)": 0.958353 }, { "epoch": 0.5517330994380015, "grad_norm": 0.33265331387519836, "learning_rate": 4.55876641069065e-06, "loss": 0.015533287078142166, "memory(GiB)": 21.48, "step": 16984, "token_acc": 1.0, "train_speed(iter/s)": 0.958346 }, { "epoch": 0.5517655849007569, "grad_norm": 0.36674508452415466, "learning_rate": 4.558231355925853e-06, "loss": 0.021601971238851547, "memory(GiB)": 21.48, "step": 16985, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95835 }, { "epoch": 0.5517980703635124, "grad_norm": 0.4279349446296692, "learning_rate": 4.557696306259653e-06, "loss": 0.018238738179206848, "memory(GiB)": 21.48, "step": 16986, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.95836 }, { "epoch": 0.5518305558262677, "grad_norm": 0.29333391785621643, "learning_rate": 4.557161261698228e-06, "loss": 0.017131386324763298, "memory(GiB)": 21.48, "step": 16987, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958369 }, { "epoch": 0.5518630412890232, "grad_norm": 0.4017694294452667, "learning_rate": 4.55662622224775e-06, "loss": 0.01763513684272766, "memory(GiB)": 21.48, "step": 16988, "token_acc": 1.0, "train_speed(iter/s)": 0.958372 }, { "epoch": 0.5518955267517786, "grad_norm": 0.9043558239936829, "learning_rate": 4.5560911879143956e-06, "loss": 0.020841050893068314, "memory(GiB)": 21.48, "step": 16989, "token_acc": 1.0, "train_speed(iter/s)": 0.958382 }, { "epoch": 0.551928012214534, "grad_norm": 0.41107863187789917, "learning_rate": 4.555556158704341e-06, "loss": 0.026207678020000458, "memory(GiB)": 21.48, "step": 16990, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.958391 }, { "epoch": 0.5519604976772894, "grad_norm": 0.3878718912601471, "learning_rate": 4.555021134623759e-06, "loss": 0.013713842257857323, "memory(GiB)": 21.48, "step": 16991, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958401 }, { "epoch": 0.5519929831400449, "grad_norm": 0.4459846615791321, "learning_rate": 4.5544861156788265e-06, "loss": 0.015351526439189911, "memory(GiB)": 21.48, "step": 16992, "token_acc": 1.0, "train_speed(iter/s)": 0.95841 }, { "epoch": 0.5520254686028002, "grad_norm": 0.4101387858390808, "learning_rate": 4.5539511018757155e-06, "loss": 0.023287024348974228, "memory(GiB)": 21.48, "step": 16993, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958399 }, { "epoch": 0.5520579540655557, "grad_norm": 0.34158146381378174, "learning_rate": 4.553416093220603e-06, "loss": 0.01690680906176567, "memory(GiB)": 21.48, "step": 16994, "token_acc": 0.988, "train_speed(iter/s)": 0.958403 }, { "epoch": 0.552090439528311, "grad_norm": 0.4611804485321045, "learning_rate": 4.552881089719663e-06, "loss": 0.020015671849250793, "memory(GiB)": 21.48, "step": 16995, "token_acc": 1.0, "train_speed(iter/s)": 0.958409 }, { "epoch": 0.5521229249910665, "grad_norm": 0.3963209390640259, "learning_rate": 4.552346091379073e-06, "loss": 0.01950148493051529, "memory(GiB)": 21.48, "step": 16996, "token_acc": 1.0, "train_speed(iter/s)": 0.958418 }, { "epoch": 0.5521554104538219, "grad_norm": 0.3373129963874817, "learning_rate": 4.551811098205001e-06, "loss": 0.017495622858405113, "memory(GiB)": 21.48, "step": 16997, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.958424 }, { "epoch": 0.5521878959165774, "grad_norm": 0.29523077607154846, "learning_rate": 4.551276110203629e-06, "loss": 0.017814934253692627, "memory(GiB)": 21.48, "step": 16998, "token_acc": 1.0, "train_speed(iter/s)": 0.958432 }, { "epoch": 0.5522203813793327, "grad_norm": 0.5405510067939758, "learning_rate": 4.550741127381125e-06, "loss": 0.01931397244334221, "memory(GiB)": 21.48, "step": 16999, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958441 }, { "epoch": 0.5522528668420882, "grad_norm": 0.42658907175064087, "learning_rate": 4.550206149743669e-06, "loss": 0.014968903735280037, "memory(GiB)": 21.48, "step": 17000, "token_acc": 1.0, "train_speed(iter/s)": 0.95845 }, { "epoch": 0.5522528668420882, "eval_loss": 0.01986846514046192, "eval_runtime": 81.2244, "eval_samples_per_second": 122.5, "eval_steps_per_second": 3.829, "eval_token_acc": 0.992174589037217, "step": 17000 }, { "epoch": 0.5522853523048435, "grad_norm": 0.4090375602245331, "learning_rate": 4.54967117729743e-06, "loss": 0.018601685762405396, "memory(GiB)": 21.48, "step": 17001, "token_acc": 0.9915217093548829, "train_speed(iter/s)": 0.953484 }, { "epoch": 0.552317837767599, "grad_norm": 0.34465643763542175, "learning_rate": 4.549136210048587e-06, "loss": 0.025539105758070946, "memory(GiB)": 21.48, "step": 17002, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953492 }, { "epoch": 0.5523503232303544, "grad_norm": 0.3733673095703125, "learning_rate": 4.548601248003311e-06, "loss": 0.020220983773469925, "memory(GiB)": 21.48, "step": 17003, "token_acc": 1.0, "train_speed(iter/s)": 0.9535 }, { "epoch": 0.5523828086931099, "grad_norm": 0.37895429134368896, "learning_rate": 4.548066291167776e-06, "loss": 0.021224141120910645, "memory(GiB)": 21.48, "step": 17004, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.953509 }, { "epoch": 0.5524152941558652, "grad_norm": 0.37410736083984375, "learning_rate": 4.547531339548161e-06, "loss": 0.016875632107257843, "memory(GiB)": 21.48, "step": 17005, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.953518 }, { "epoch": 0.5524477796186207, "grad_norm": 0.31280994415283203, "learning_rate": 4.546996393150634e-06, "loss": 0.018582377582788467, "memory(GiB)": 21.48, "step": 17006, "token_acc": 0.9835164835164835, "train_speed(iter/s)": 0.953527 }, { "epoch": 0.552480265081376, "grad_norm": 0.5292258262634277, "learning_rate": 4.546461451981374e-06, "loss": 0.02953284978866577, "memory(GiB)": 21.48, "step": 17007, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.953534 }, { "epoch": 0.5525127505441315, "grad_norm": 0.2702510952949524, "learning_rate": 4.54592651604655e-06, "loss": 0.014143266715109348, "memory(GiB)": 21.48, "step": 17008, "token_acc": 1.0, "train_speed(iter/s)": 0.953543 }, { "epoch": 0.5525452360068869, "grad_norm": 0.34100019931793213, "learning_rate": 4.54539158535234e-06, "loss": 0.017348993569612503, "memory(GiB)": 21.48, "step": 17009, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.953553 }, { "epoch": 0.5525777214696423, "grad_norm": 0.2394781857728958, "learning_rate": 4.544856659904916e-06, "loss": 0.019228775054216385, "memory(GiB)": 21.48, "step": 17010, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95356 }, { "epoch": 0.5526102069323977, "grad_norm": 0.3548494875431061, "learning_rate": 4.5443217397104515e-06, "loss": 0.024049606174230576, "memory(GiB)": 21.48, "step": 17011, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.953566 }, { "epoch": 0.5526426923951532, "grad_norm": 0.34469082951545715, "learning_rate": 4.54378682477512e-06, "loss": 0.015409802086651325, "memory(GiB)": 21.48, "step": 17012, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.953575 }, { "epoch": 0.5526751778579085, "grad_norm": 0.30659565329551697, "learning_rate": 4.543251915105097e-06, "loss": 0.016885071992874146, "memory(GiB)": 21.48, "step": 17013, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953584 }, { "epoch": 0.552707663320664, "grad_norm": 0.2516123056411743, "learning_rate": 4.542717010706554e-06, "loss": 0.010255198925733566, "memory(GiB)": 21.48, "step": 17014, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.953596 }, { "epoch": 0.5527401487834194, "grad_norm": 0.3381185829639435, "learning_rate": 4.542182111585666e-06, "loss": 0.017424529418349266, "memory(GiB)": 21.48, "step": 17015, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.953608 }, { "epoch": 0.5527726342461748, "grad_norm": 0.37264734506607056, "learning_rate": 4.541647217748605e-06, "loss": 0.023672742769122124, "memory(GiB)": 21.48, "step": 17016, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.953621 }, { "epoch": 0.5528051197089302, "grad_norm": 0.27839183807373047, "learning_rate": 4.541112329201546e-06, "loss": 0.014946307986974716, "memory(GiB)": 21.48, "step": 17017, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.953633 }, { "epoch": 0.5528376051716857, "grad_norm": 0.4055286645889282, "learning_rate": 4.54057744595066e-06, "loss": 0.02569219470024109, "memory(GiB)": 21.48, "step": 17018, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953646 }, { "epoch": 0.552870090634441, "grad_norm": 0.39626553654670715, "learning_rate": 4.540042568002123e-06, "loss": 0.020003948360681534, "memory(GiB)": 21.48, "step": 17019, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.953657 }, { "epoch": 0.5529025760971965, "grad_norm": 0.35873186588287354, "learning_rate": 4.539507695362107e-06, "loss": 0.022279823198914528, "memory(GiB)": 21.48, "step": 17020, "token_acc": 0.9844559585492227, "train_speed(iter/s)": 0.953669 }, { "epoch": 0.5529350615599519, "grad_norm": 0.4158022105693817, "learning_rate": 4.538972828036784e-06, "loss": 0.023134224116802216, "memory(GiB)": 21.48, "step": 17021, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.953681 }, { "epoch": 0.5529675470227073, "grad_norm": 0.41964107751846313, "learning_rate": 4.53843796603233e-06, "loss": 0.020035352557897568, "memory(GiB)": 21.48, "step": 17022, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.953693 }, { "epoch": 0.5530000324854627, "grad_norm": 0.23622384667396545, "learning_rate": 4.537903109354915e-06, "loss": 0.014303606003522873, "memory(GiB)": 21.48, "step": 17023, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.953705 }, { "epoch": 0.5530325179482182, "grad_norm": 0.30279943346977234, "learning_rate": 4.537368258010715e-06, "loss": 0.013455154374241829, "memory(GiB)": 21.48, "step": 17024, "token_acc": 1.0, "train_speed(iter/s)": 0.953715 }, { "epoch": 0.5530650034109736, "grad_norm": 0.8769996166229248, "learning_rate": 4.536833412005898e-06, "loss": 0.016726713627576828, "memory(GiB)": 21.48, "step": 17025, "token_acc": 1.0, "train_speed(iter/s)": 0.953727 }, { "epoch": 0.553097488873729, "grad_norm": 0.34680357575416565, "learning_rate": 4.536298571346643e-06, "loss": 0.01966327242553234, "memory(GiB)": 21.48, "step": 17026, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953738 }, { "epoch": 0.5531299743364845, "grad_norm": 0.31710946559906006, "learning_rate": 4.535763736039116e-06, "loss": 0.017443262040615082, "memory(GiB)": 21.48, "step": 17027, "token_acc": 0.984, "train_speed(iter/s)": 0.95375 }, { "epoch": 0.5531624597992398, "grad_norm": 0.3644188642501831, "learning_rate": 4.535228906089497e-06, "loss": 0.02138591557741165, "memory(GiB)": 21.48, "step": 17028, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95376 }, { "epoch": 0.5531949452619953, "grad_norm": 0.30742764472961426, "learning_rate": 4.534694081503953e-06, "loss": 0.022539887577295303, "memory(GiB)": 21.48, "step": 17029, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95377 }, { "epoch": 0.5532274307247507, "grad_norm": 0.43105548620224, "learning_rate": 4.53415926228866e-06, "loss": 0.02079186588525772, "memory(GiB)": 21.48, "step": 17030, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95378 }, { "epoch": 0.5532599161875061, "grad_norm": 0.48505789041519165, "learning_rate": 4.533624448449788e-06, "loss": 0.01860867254436016, "memory(GiB)": 21.48, "step": 17031, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953789 }, { "epoch": 0.5532924016502615, "grad_norm": 0.3519599139690399, "learning_rate": 4.533089639993511e-06, "loss": 0.022047024220228195, "memory(GiB)": 21.48, "step": 17032, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.953798 }, { "epoch": 0.553324887113017, "grad_norm": 0.38795992732048035, "learning_rate": 4.532554836926001e-06, "loss": 0.017635608091950417, "memory(GiB)": 21.48, "step": 17033, "token_acc": 1.0, "train_speed(iter/s)": 0.953807 }, { "epoch": 0.5533573725757723, "grad_norm": 0.3645792603492737, "learning_rate": 4.53202003925343e-06, "loss": 0.02304898202419281, "memory(GiB)": 21.48, "step": 17034, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.953817 }, { "epoch": 0.5533898580385278, "grad_norm": 0.33741268515586853, "learning_rate": 4.53148524698197e-06, "loss": 0.01758100837469101, "memory(GiB)": 21.48, "step": 17035, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.953828 }, { "epoch": 0.5534223435012832, "grad_norm": 1.4745348691940308, "learning_rate": 4.530950460117794e-06, "loss": 0.030151601880788803, "memory(GiB)": 21.48, "step": 17036, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953838 }, { "epoch": 0.5534548289640386, "grad_norm": 0.35010772943496704, "learning_rate": 4.530415678667076e-06, "loss": 0.020924124866724014, "memory(GiB)": 21.48, "step": 17037, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.953848 }, { "epoch": 0.553487314426794, "grad_norm": 0.28657791018486023, "learning_rate": 4.5298809026359845e-06, "loss": 0.01738276518881321, "memory(GiB)": 21.48, "step": 17038, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.953857 }, { "epoch": 0.5535197998895495, "grad_norm": 0.33956027030944824, "learning_rate": 4.529346132030694e-06, "loss": 0.012484309263527393, "memory(GiB)": 21.48, "step": 17039, "token_acc": 1.0, "train_speed(iter/s)": 0.953866 }, { "epoch": 0.5535522853523048, "grad_norm": 0.281297892332077, "learning_rate": 4.528811366857374e-06, "loss": 0.01706247217953205, "memory(GiB)": 21.48, "step": 17040, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.953875 }, { "epoch": 0.5535847708150603, "grad_norm": 0.3490903973579407, "learning_rate": 4.5282766071222e-06, "loss": 0.020761478692293167, "memory(GiB)": 21.48, "step": 17041, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.953883 }, { "epoch": 0.5536172562778157, "grad_norm": 0.25557661056518555, "learning_rate": 4.52774185283134e-06, "loss": 0.017427410930395126, "memory(GiB)": 21.48, "step": 17042, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.953893 }, { "epoch": 0.5536497417405711, "grad_norm": 0.49744582176208496, "learning_rate": 4.527207103990969e-06, "loss": 0.02375197596848011, "memory(GiB)": 21.48, "step": 17043, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.953902 }, { "epoch": 0.5536822272033265, "grad_norm": 0.5357193946838379, "learning_rate": 4.526672360607256e-06, "loss": 0.025826910510659218, "memory(GiB)": 21.48, "step": 17044, "token_acc": 1.0, "train_speed(iter/s)": 0.953912 }, { "epoch": 0.553714712666082, "grad_norm": 0.2955452799797058, "learning_rate": 4.526137622686375e-06, "loss": 0.016139857470989227, "memory(GiB)": 21.48, "step": 17045, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.953923 }, { "epoch": 0.5537471981288373, "grad_norm": 0.274110347032547, "learning_rate": 4.525602890234496e-06, "loss": 0.014096285216510296, "memory(GiB)": 21.48, "step": 17046, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.953932 }, { "epoch": 0.5537796835915928, "grad_norm": 0.3217414617538452, "learning_rate": 4.525068163257793e-06, "loss": 0.02255009301006794, "memory(GiB)": 21.48, "step": 17047, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.953942 }, { "epoch": 0.5538121690543482, "grad_norm": 0.5505573749542236, "learning_rate": 4.524533441762432e-06, "loss": 0.025914210826158524, "memory(GiB)": 21.48, "step": 17048, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.953949 }, { "epoch": 0.5538446545171036, "grad_norm": 0.46294739842414856, "learning_rate": 4.523998725754592e-06, "loss": 0.03270241618156433, "memory(GiB)": 21.48, "step": 17049, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.953958 }, { "epoch": 0.553877139979859, "grad_norm": 0.4245390295982361, "learning_rate": 4.523464015240435e-06, "loss": 0.026929564774036407, "memory(GiB)": 21.48, "step": 17050, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953967 }, { "epoch": 0.5539096254426145, "grad_norm": 0.24720719456672668, "learning_rate": 4.522929310226141e-06, "loss": 0.016208400949835777, "memory(GiB)": 21.48, "step": 17051, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.953976 }, { "epoch": 0.5539421109053698, "grad_norm": 0.334491491317749, "learning_rate": 4.522394610717878e-06, "loss": 0.02009960636496544, "memory(GiB)": 21.48, "step": 17052, "token_acc": 1.0, "train_speed(iter/s)": 0.953987 }, { "epoch": 0.5539745963681253, "grad_norm": 0.48933127522468567, "learning_rate": 4.521859916721815e-06, "loss": 0.020165234804153442, "memory(GiB)": 21.48, "step": 17053, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.953995 }, { "epoch": 0.5540070818308807, "grad_norm": 0.3428989052772522, "learning_rate": 4.5213252282441274e-06, "loss": 0.02164684236049652, "memory(GiB)": 21.48, "step": 17054, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954001 }, { "epoch": 0.5540395672936361, "grad_norm": 0.4416296184062958, "learning_rate": 4.52079054529098e-06, "loss": 0.018521394580602646, "memory(GiB)": 21.48, "step": 17055, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954002 }, { "epoch": 0.5540720527563915, "grad_norm": 0.4386883080005646, "learning_rate": 4.5202558678685525e-06, "loss": 0.018185220658779144, "memory(GiB)": 21.48, "step": 17056, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.954011 }, { "epoch": 0.554104538219147, "grad_norm": 0.2929232120513916, "learning_rate": 4.519721195983008e-06, "loss": 0.019410597160458565, "memory(GiB)": 21.48, "step": 17057, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95402 }, { "epoch": 0.5541370236819023, "grad_norm": 0.47297579050064087, "learning_rate": 4.519186529640521e-06, "loss": 0.03001975640654564, "memory(GiB)": 21.48, "step": 17058, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.954029 }, { "epoch": 0.5541695091446578, "grad_norm": 0.39904195070266724, "learning_rate": 4.51865186884726e-06, "loss": 0.018079867586493492, "memory(GiB)": 21.48, "step": 17059, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954021 }, { "epoch": 0.5542019946074132, "grad_norm": 0.355154812335968, "learning_rate": 4.518117213609398e-06, "loss": 0.018457554280757904, "memory(GiB)": 21.48, "step": 17060, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954029 }, { "epoch": 0.5542344800701686, "grad_norm": 0.25638002157211304, "learning_rate": 4.517582563933105e-06, "loss": 0.019796222448349, "memory(GiB)": 21.48, "step": 17061, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.954038 }, { "epoch": 0.554266965532924, "grad_norm": 0.476715624332428, "learning_rate": 4.517047919824552e-06, "loss": 0.029070314019918442, "memory(GiB)": 21.48, "step": 17062, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954047 }, { "epoch": 0.5542994509956795, "grad_norm": 0.3551475703716278, "learning_rate": 4.5165132812899064e-06, "loss": 0.021289974451065063, "memory(GiB)": 21.48, "step": 17063, "token_acc": 1.0, "train_speed(iter/s)": 0.954057 }, { "epoch": 0.5543319364584348, "grad_norm": 0.48719272017478943, "learning_rate": 4.515978648335344e-06, "loss": 0.020528648048639297, "memory(GiB)": 21.48, "step": 17064, "token_acc": 1.0, "train_speed(iter/s)": 0.954066 }, { "epoch": 0.5543644219211903, "grad_norm": 0.29930010437965393, "learning_rate": 4.51544402096703e-06, "loss": 0.014001959003508091, "memory(GiB)": 21.48, "step": 17065, "token_acc": 1.0, "train_speed(iter/s)": 0.954075 }, { "epoch": 0.5543969073839456, "grad_norm": 0.30520614981651306, "learning_rate": 4.514909399191136e-06, "loss": 0.015840010717511177, "memory(GiB)": 21.48, "step": 17066, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954084 }, { "epoch": 0.5544293928467011, "grad_norm": 0.31988662481307983, "learning_rate": 4.514374783013836e-06, "loss": 0.018584750592708588, "memory(GiB)": 21.48, "step": 17067, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954093 }, { "epoch": 0.5544618783094565, "grad_norm": 0.3971313238143921, "learning_rate": 4.513840172441295e-06, "loss": 0.01549476571381092, "memory(GiB)": 21.48, "step": 17068, "token_acc": 1.0, "train_speed(iter/s)": 0.954102 }, { "epoch": 0.554494363772212, "grad_norm": 0.45066991448402405, "learning_rate": 4.5133055674796875e-06, "loss": 0.01746699959039688, "memory(GiB)": 21.48, "step": 17069, "token_acc": 1.0, "train_speed(iter/s)": 0.954111 }, { "epoch": 0.5545268492349673, "grad_norm": 0.3061891198158264, "learning_rate": 4.5127709681351805e-06, "loss": 0.01512294914573431, "memory(GiB)": 21.48, "step": 17070, "token_acc": 1.0, "train_speed(iter/s)": 0.954119 }, { "epoch": 0.5545593346977228, "grad_norm": 0.6169889569282532, "learning_rate": 4.512236374413946e-06, "loss": 0.02026551030576229, "memory(GiB)": 21.48, "step": 17071, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954128 }, { "epoch": 0.5545918201604781, "grad_norm": 0.25807636976242065, "learning_rate": 4.511701786322152e-06, "loss": 0.012316029518842697, "memory(GiB)": 21.48, "step": 17072, "token_acc": 1.0, "train_speed(iter/s)": 0.954138 }, { "epoch": 0.5546243056232336, "grad_norm": 0.34003934264183044, "learning_rate": 4.51116720386597e-06, "loss": 0.024373050779104233, "memory(GiB)": 21.48, "step": 17073, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954147 }, { "epoch": 0.554656791085989, "grad_norm": 0.23353877663612366, "learning_rate": 4.510632627051568e-06, "loss": 0.008856285363435745, "memory(GiB)": 21.48, "step": 17074, "token_acc": 1.0, "train_speed(iter/s)": 0.954159 }, { "epoch": 0.5546892765487444, "grad_norm": 0.3795120120048523, "learning_rate": 4.510098055885119e-06, "loss": 0.01746753603219986, "memory(GiB)": 21.48, "step": 17075, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95417 }, { "epoch": 0.5547217620114998, "grad_norm": 0.3071979284286499, "learning_rate": 4.509563490372787e-06, "loss": 0.016327349469065666, "memory(GiB)": 21.48, "step": 17076, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954182 }, { "epoch": 0.5547542474742553, "grad_norm": 0.23863911628723145, "learning_rate": 4.509028930520749e-06, "loss": 0.012282514944672585, "memory(GiB)": 21.48, "step": 17077, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954193 }, { "epoch": 0.5547867329370106, "grad_norm": 0.329349547624588, "learning_rate": 4.508494376335167e-06, "loss": 0.016341455280780792, "memory(GiB)": 21.48, "step": 17078, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954206 }, { "epoch": 0.5548192183997661, "grad_norm": 0.25593093037605286, "learning_rate": 4.507959827822217e-06, "loss": 0.011179228313267231, "memory(GiB)": 21.48, "step": 17079, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954217 }, { "epoch": 0.5548517038625215, "grad_norm": 0.3313382565975189, "learning_rate": 4.507425284988064e-06, "loss": 0.013937091454863548, "memory(GiB)": 21.48, "step": 17080, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.954229 }, { "epoch": 0.5548841893252769, "grad_norm": 0.26552945375442505, "learning_rate": 4.506890747838877e-06, "loss": 0.01772628352046013, "memory(GiB)": 21.48, "step": 17081, "token_acc": 1.0, "train_speed(iter/s)": 0.95424 }, { "epoch": 0.5549166747880323, "grad_norm": 0.41473153233528137, "learning_rate": 4.506356216380831e-06, "loss": 0.02154945582151413, "memory(GiB)": 21.48, "step": 17082, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954251 }, { "epoch": 0.5549491602507878, "grad_norm": 0.4910028576850891, "learning_rate": 4.505821690620088e-06, "loss": 0.02447783201932907, "memory(GiB)": 21.48, "step": 17083, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.954263 }, { "epoch": 0.5549816457135431, "grad_norm": 0.29663488268852234, "learning_rate": 4.5052871705628235e-06, "loss": 0.01968255825340748, "memory(GiB)": 21.48, "step": 17084, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954275 }, { "epoch": 0.5550141311762986, "grad_norm": 0.35892921686172485, "learning_rate": 4.5047526562152e-06, "loss": 0.01977771706879139, "memory(GiB)": 21.48, "step": 17085, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954287 }, { "epoch": 0.555046616639054, "grad_norm": 0.9176548719406128, "learning_rate": 4.504218147583394e-06, "loss": 0.024645736441016197, "memory(GiB)": 21.48, "step": 17086, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.954298 }, { "epoch": 0.5550791021018094, "grad_norm": 0.5208609700202942, "learning_rate": 4.503683644673568e-06, "loss": 0.023391153663396835, "memory(GiB)": 21.48, "step": 17087, "token_acc": 0.99, "train_speed(iter/s)": 0.95431 }, { "epoch": 0.5551115875645649, "grad_norm": 0.4154762029647827, "learning_rate": 4.503149147491894e-06, "loss": 0.027780460193753242, "memory(GiB)": 21.48, "step": 17088, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.954321 }, { "epoch": 0.5551440730273203, "grad_norm": 0.4016175866127014, "learning_rate": 4.502614656044539e-06, "loss": 0.019267579540610313, "memory(GiB)": 21.48, "step": 17089, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954333 }, { "epoch": 0.5551765584900757, "grad_norm": 0.40947192907333374, "learning_rate": 4.502080170337675e-06, "loss": 0.018897265195846558, "memory(GiB)": 21.48, "step": 17090, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954345 }, { "epoch": 0.5552090439528311, "grad_norm": 0.21131740510463715, "learning_rate": 4.501545690377467e-06, "loss": 0.010227218270301819, "memory(GiB)": 21.48, "step": 17091, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.954358 }, { "epoch": 0.5552415294155866, "grad_norm": 0.4336506128311157, "learning_rate": 4.501011216170086e-06, "loss": 0.01577354595065117, "memory(GiB)": 21.48, "step": 17092, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954369 }, { "epoch": 0.5552740148783419, "grad_norm": 0.3154732882976532, "learning_rate": 4.500476747721699e-06, "loss": 0.008492153137922287, "memory(GiB)": 21.48, "step": 17093, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954379 }, { "epoch": 0.5553065003410974, "grad_norm": 0.33497685194015503, "learning_rate": 4.4999422850384755e-06, "loss": 0.023461751639842987, "memory(GiB)": 21.48, "step": 17094, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.954388 }, { "epoch": 0.5553389858038528, "grad_norm": 0.26701340079307556, "learning_rate": 4.499407828126583e-06, "loss": 0.017026599496603012, "memory(GiB)": 21.48, "step": 17095, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954398 }, { "epoch": 0.5553714712666082, "grad_norm": 0.35337716341018677, "learning_rate": 4.498873376992192e-06, "loss": 0.02115584909915924, "memory(GiB)": 21.48, "step": 17096, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954407 }, { "epoch": 0.5554039567293636, "grad_norm": 0.3260838985443115, "learning_rate": 4.498338931641467e-06, "loss": 0.023919986560940742, "memory(GiB)": 21.48, "step": 17097, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954416 }, { "epoch": 0.5554364421921191, "grad_norm": 0.3289312422275543, "learning_rate": 4.49780449208058e-06, "loss": 0.017558038234710693, "memory(GiB)": 21.48, "step": 17098, "token_acc": 1.0, "train_speed(iter/s)": 0.954425 }, { "epoch": 0.5554689276548744, "grad_norm": 0.334393709897995, "learning_rate": 4.497270058315697e-06, "loss": 0.018338609486818314, "memory(GiB)": 21.48, "step": 17099, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954435 }, { "epoch": 0.5555014131176299, "grad_norm": 0.45758911967277527, "learning_rate": 4.496735630352986e-06, "loss": 0.02436891943216324, "memory(GiB)": 21.48, "step": 17100, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954444 }, { "epoch": 0.5555338985803853, "grad_norm": 0.38258644938468933, "learning_rate": 4.496201208198618e-06, "loss": 0.02580752782523632, "memory(GiB)": 21.48, "step": 17101, "token_acc": 1.0, "train_speed(iter/s)": 0.954454 }, { "epoch": 0.5555663840431407, "grad_norm": 0.37118154764175415, "learning_rate": 4.495666791858756e-06, "loss": 0.018035557121038437, "memory(GiB)": 21.48, "step": 17102, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954463 }, { "epoch": 0.5555988695058961, "grad_norm": 0.4540210962295532, "learning_rate": 4.495132381339572e-06, "loss": 0.019619420170783997, "memory(GiB)": 21.48, "step": 17103, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954473 }, { "epoch": 0.5556313549686516, "grad_norm": 0.33446621894836426, "learning_rate": 4.494597976647232e-06, "loss": 0.01668912172317505, "memory(GiB)": 21.48, "step": 17104, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.954482 }, { "epoch": 0.5556638404314069, "grad_norm": 0.579289436340332, "learning_rate": 4.494063577787905e-06, "loss": 0.03260628506541252, "memory(GiB)": 21.48, "step": 17105, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954491 }, { "epoch": 0.5556963258941624, "grad_norm": 0.3562462031841278, "learning_rate": 4.493529184767755e-06, "loss": 0.023426024243235588, "memory(GiB)": 21.48, "step": 17106, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.9545 }, { "epoch": 0.5557288113569178, "grad_norm": 0.2574521601200104, "learning_rate": 4.492994797592956e-06, "loss": 0.015728404745459557, "memory(GiB)": 21.48, "step": 17107, "token_acc": 1.0, "train_speed(iter/s)": 0.954509 }, { "epoch": 0.5557612968196732, "grad_norm": 0.2461732029914856, "learning_rate": 4.492460416269668e-06, "loss": 0.010490483604371548, "memory(GiB)": 21.48, "step": 17108, "token_acc": 1.0, "train_speed(iter/s)": 0.954518 }, { "epoch": 0.5557937822824286, "grad_norm": 0.23852461576461792, "learning_rate": 4.491926040804067e-06, "loss": 0.012905806303024292, "memory(GiB)": 21.48, "step": 17109, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954528 }, { "epoch": 0.5558262677451841, "grad_norm": 0.4159591495990753, "learning_rate": 4.491391671202312e-06, "loss": 0.024368038401007652, "memory(GiB)": 21.48, "step": 17110, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954538 }, { "epoch": 0.5558587532079394, "grad_norm": 0.38113242387771606, "learning_rate": 4.490857307470575e-06, "loss": 0.028401276096701622, "memory(GiB)": 21.48, "step": 17111, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.954547 }, { "epoch": 0.5558912386706949, "grad_norm": 0.3977821469306946, "learning_rate": 4.490322949615022e-06, "loss": 0.02690233290195465, "memory(GiB)": 21.48, "step": 17112, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954555 }, { "epoch": 0.5559237241334503, "grad_norm": 0.39889761805534363, "learning_rate": 4.4897885976418195e-06, "loss": 0.02929682284593582, "memory(GiB)": 21.48, "step": 17113, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.954562 }, { "epoch": 0.5559562095962057, "grad_norm": 0.3236859142780304, "learning_rate": 4.489254251557138e-06, "loss": 0.016331929713487625, "memory(GiB)": 21.48, "step": 17114, "token_acc": 1.0, "train_speed(iter/s)": 0.95457 }, { "epoch": 0.5559886950589611, "grad_norm": 0.23066461086273193, "learning_rate": 4.488719911367141e-06, "loss": 0.010401478037238121, "memory(GiB)": 21.48, "step": 17115, "token_acc": 1.0, "train_speed(iter/s)": 0.954577 }, { "epoch": 0.5560211805217166, "grad_norm": 0.29911139607429504, "learning_rate": 4.488185577077998e-06, "loss": 0.017156537622213364, "memory(GiB)": 21.48, "step": 17116, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.954586 }, { "epoch": 0.5560536659844719, "grad_norm": 0.31003445386886597, "learning_rate": 4.4876512486958736e-06, "loss": 0.016403499990701675, "memory(GiB)": 21.48, "step": 17117, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954593 }, { "epoch": 0.5560861514472274, "grad_norm": 0.40421149134635925, "learning_rate": 4.487116926226936e-06, "loss": 0.025995982810854912, "memory(GiB)": 21.48, "step": 17118, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.954601 }, { "epoch": 0.5561186369099828, "grad_norm": 0.2958192825317383, "learning_rate": 4.486582609677351e-06, "loss": 0.012627596966922283, "memory(GiB)": 21.48, "step": 17119, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954609 }, { "epoch": 0.5561511223727382, "grad_norm": 0.34631630778312683, "learning_rate": 4.4860482990532876e-06, "loss": 0.025154950097203255, "memory(GiB)": 21.48, "step": 17120, "token_acc": 1.0, "train_speed(iter/s)": 0.954618 }, { "epoch": 0.5561836078354936, "grad_norm": 0.40121543407440186, "learning_rate": 4.485513994360909e-06, "loss": 0.023710552603006363, "memory(GiB)": 21.48, "step": 17121, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954627 }, { "epoch": 0.5562160932982491, "grad_norm": 0.5123015642166138, "learning_rate": 4.484979695606385e-06, "loss": 0.022712599486112595, "memory(GiB)": 21.48, "step": 17122, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954636 }, { "epoch": 0.5562485787610044, "grad_norm": 0.5047634243965149, "learning_rate": 4.48444540279588e-06, "loss": 0.025380320847034454, "memory(GiB)": 21.48, "step": 17123, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.954645 }, { "epoch": 0.5562810642237599, "grad_norm": 0.31555619835853577, "learning_rate": 4.483911115935562e-06, "loss": 0.02207590825855732, "memory(GiB)": 21.48, "step": 17124, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.954653 }, { "epoch": 0.5563135496865153, "grad_norm": 0.35804715752601624, "learning_rate": 4.483376835031596e-06, "loss": 0.02199796587228775, "memory(GiB)": 21.48, "step": 17125, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.5563460351492707, "grad_norm": 0.5120515823364258, "learning_rate": 4.48284256009015e-06, "loss": 0.01787053979933262, "memory(GiB)": 21.48, "step": 17126, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954672 }, { "epoch": 0.5563785206120261, "grad_norm": 0.29512616991996765, "learning_rate": 4.482308291117388e-06, "loss": 0.018179848790168762, "memory(GiB)": 21.48, "step": 17127, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954681 }, { "epoch": 0.5564110060747816, "grad_norm": 0.39063382148742676, "learning_rate": 4.481774028119477e-06, "loss": 0.025007683783769608, "memory(GiB)": 21.48, "step": 17128, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95469 }, { "epoch": 0.5564434915375369, "grad_norm": 0.22103442251682281, "learning_rate": 4.481239771102585e-06, "loss": 0.010915878228843212, "memory(GiB)": 21.48, "step": 17129, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954698 }, { "epoch": 0.5564759770002924, "grad_norm": 0.3287576138973236, "learning_rate": 4.480705520072874e-06, "loss": 0.01948593184351921, "memory(GiB)": 21.48, "step": 17130, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954708 }, { "epoch": 0.5565084624630477, "grad_norm": 0.3544811010360718, "learning_rate": 4.480171275036516e-06, "loss": 0.019604794681072235, "memory(GiB)": 21.48, "step": 17131, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.954718 }, { "epoch": 0.5565409479258032, "grad_norm": 0.4388119876384735, "learning_rate": 4.479637035999671e-06, "loss": 0.02154880203306675, "memory(GiB)": 21.48, "step": 17132, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.954728 }, { "epoch": 0.5565734333885586, "grad_norm": 0.3455806076526642, "learning_rate": 4.479102802968509e-06, "loss": 0.023704927414655685, "memory(GiB)": 21.48, "step": 17133, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.95474 }, { "epoch": 0.556605918851314, "grad_norm": 0.5870881080627441, "learning_rate": 4.478568575949191e-06, "loss": 0.02054583840072155, "memory(GiB)": 21.48, "step": 17134, "token_acc": 1.0, "train_speed(iter/s)": 0.954751 }, { "epoch": 0.5566384043140694, "grad_norm": 0.3134700357913971, "learning_rate": 4.478034354947889e-06, "loss": 0.018334243446588516, "memory(GiB)": 21.48, "step": 17135, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.954763 }, { "epoch": 0.5566708897768249, "grad_norm": 0.3115060031414032, "learning_rate": 4.477500139970762e-06, "loss": 0.016547903418540955, "memory(GiB)": 21.48, "step": 17136, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.5567033752395802, "grad_norm": 0.25990238785743713, "learning_rate": 4.476965931023983e-06, "loss": 0.02046084776520729, "memory(GiB)": 21.48, "step": 17137, "token_acc": 0.9966329966329966, "train_speed(iter/s)": 0.954787 }, { "epoch": 0.5567358607023357, "grad_norm": 0.46083998680114746, "learning_rate": 4.47643172811371e-06, "loss": 0.02361316606402397, "memory(GiB)": 21.48, "step": 17138, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.5567683461650911, "grad_norm": 0.35210758447647095, "learning_rate": 4.4758975312461125e-06, "loss": 0.021649230271577835, "memory(GiB)": 21.48, "step": 17139, "token_acc": 0.9875, "train_speed(iter/s)": 0.954811 }, { "epoch": 0.5568008316278465, "grad_norm": 0.2875318229198456, "learning_rate": 4.475363340427355e-06, "loss": 0.01734786666929722, "memory(GiB)": 21.48, "step": 17140, "token_acc": 1.0, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.5568333170906019, "grad_norm": 0.3958037197589874, "learning_rate": 4.474829155663604e-06, "loss": 0.02020104043185711, "memory(GiB)": 21.48, "step": 17141, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.5568658025533574, "grad_norm": 0.3296760022640228, "learning_rate": 4.474294976961022e-06, "loss": 0.018002375960350037, "memory(GiB)": 21.48, "step": 17142, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954847 }, { "epoch": 0.5568982880161127, "grad_norm": 0.3436897099018097, "learning_rate": 4.473760804325776e-06, "loss": 0.018938817083835602, "memory(GiB)": 21.48, "step": 17143, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.954859 }, { "epoch": 0.5569307734788682, "grad_norm": 0.35266461968421936, "learning_rate": 4.473226637764031e-06, "loss": 0.017759796231985092, "memory(GiB)": 21.48, "step": 17144, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.5569632589416236, "grad_norm": 0.2707744538784027, "learning_rate": 4.472692477281951e-06, "loss": 0.01822381094098091, "memory(GiB)": 21.48, "step": 17145, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.954882 }, { "epoch": 0.556995744404379, "grad_norm": 0.46477562189102173, "learning_rate": 4.472158322885703e-06, "loss": 0.022243335843086243, "memory(GiB)": 21.48, "step": 17146, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.954893 }, { "epoch": 0.5570282298671344, "grad_norm": 0.39508283138275146, "learning_rate": 4.4716241745814495e-06, "loss": 0.019141942262649536, "memory(GiB)": 21.48, "step": 17147, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954904 }, { "epoch": 0.5570607153298899, "grad_norm": 0.27613574266433716, "learning_rate": 4.471090032375357e-06, "loss": 0.017444636672735214, "memory(GiB)": 21.48, "step": 17148, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.5570932007926452, "grad_norm": 0.31504619121551514, "learning_rate": 4.470555896273588e-06, "loss": 0.021387223154306412, "memory(GiB)": 21.48, "step": 17149, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.954928 }, { "epoch": 0.5571256862554007, "grad_norm": 0.34257516264915466, "learning_rate": 4.470021766282311e-06, "loss": 0.020415963605046272, "memory(GiB)": 21.48, "step": 17150, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954939 }, { "epoch": 0.5571581717181561, "grad_norm": 0.44344642758369446, "learning_rate": 4.469487642407686e-06, "loss": 0.024865131825208664, "memory(GiB)": 21.48, "step": 17151, "token_acc": 0.9793103448275862, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.5571906571809115, "grad_norm": 0.4861723482608795, "learning_rate": 4.468953524655882e-06, "loss": 0.021803665906190872, "memory(GiB)": 21.48, "step": 17152, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.557223142643667, "grad_norm": 0.3864729702472687, "learning_rate": 4.468419413033059e-06, "loss": 0.022058289498090744, "memory(GiB)": 21.48, "step": 17153, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954974 }, { "epoch": 0.5572556281064224, "grad_norm": 0.3379465937614441, "learning_rate": 4.467885307545385e-06, "loss": 0.021701961755752563, "memory(GiB)": 21.48, "step": 17154, "token_acc": 1.0, "train_speed(iter/s)": 0.954986 }, { "epoch": 0.5572881135691778, "grad_norm": 0.3583180904388428, "learning_rate": 4.467351208199022e-06, "loss": 0.011699942871928215, "memory(GiB)": 21.48, "step": 17155, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954997 }, { "epoch": 0.5573205990319332, "grad_norm": 0.40322813391685486, "learning_rate": 4.466817115000137e-06, "loss": 0.019042935222387314, "memory(GiB)": 21.48, "step": 17156, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.5573530844946887, "grad_norm": 0.49067452549934387, "learning_rate": 4.466283027954888e-06, "loss": 0.030425013974308968, "memory(GiB)": 21.48, "step": 17157, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.955021 }, { "epoch": 0.557385569957444, "grad_norm": 0.6969674229621887, "learning_rate": 4.465748947069445e-06, "loss": 0.026074115186929703, "memory(GiB)": 21.48, "step": 17158, "token_acc": 0.9943181818181818, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.5574180554201995, "grad_norm": 0.29031893610954285, "learning_rate": 4.465214872349972e-06, "loss": 0.014466765336692333, "memory(GiB)": 21.48, "step": 17159, "token_acc": 1.0, "train_speed(iter/s)": 0.955041 }, { "epoch": 0.5574505408829549, "grad_norm": 0.406734824180603, "learning_rate": 4.464680803802629e-06, "loss": 0.022785602137446404, "memory(GiB)": 21.48, "step": 17160, "token_acc": 1.0, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.5574830263457103, "grad_norm": 0.38939914107322693, "learning_rate": 4.4641467414335845e-06, "loss": 0.01966598816215992, "memory(GiB)": 21.48, "step": 17161, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955059 }, { "epoch": 0.5575155118084657, "grad_norm": 0.3343534767627716, "learning_rate": 4.463612685248997e-06, "loss": 0.019348574802279472, "memory(GiB)": 21.48, "step": 17162, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.5575479972712212, "grad_norm": 0.38749292492866516, "learning_rate": 4.463078635255036e-06, "loss": 0.024386737495660782, "memory(GiB)": 21.48, "step": 17163, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.955077 }, { "epoch": 0.5575804827339765, "grad_norm": 0.35170885920524597, "learning_rate": 4.462544591457859e-06, "loss": 0.020418647676706314, "memory(GiB)": 21.48, "step": 17164, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.557612968196732, "grad_norm": 0.35485267639160156, "learning_rate": 4.462010553863636e-06, "loss": 0.018086891621351242, "memory(GiB)": 21.48, "step": 17165, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.5576454536594874, "grad_norm": 0.3381290137767792, "learning_rate": 4.461476522478525e-06, "loss": 0.017327142879366875, "memory(GiB)": 21.48, "step": 17166, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.5576779391222428, "grad_norm": 0.2992725074291229, "learning_rate": 4.460942497308693e-06, "loss": 0.019143270328640938, "memory(GiB)": 21.48, "step": 17167, "token_acc": 1.0, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.5577104245849982, "grad_norm": 2.2781713008880615, "learning_rate": 4.460408478360301e-06, "loss": 0.02721170336008072, "memory(GiB)": 21.48, "step": 17168, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.955121 }, { "epoch": 0.5577429100477537, "grad_norm": 0.324493408203125, "learning_rate": 4.4598744656395136e-06, "loss": 0.02232518047094345, "memory(GiB)": 21.48, "step": 17169, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.557775395510509, "grad_norm": 0.25484660267829895, "learning_rate": 4.459340459152493e-06, "loss": 0.019336918368935585, "memory(GiB)": 21.48, "step": 17170, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955139 }, { "epoch": 0.5578078809732645, "grad_norm": 0.34864237904548645, "learning_rate": 4.458806458905405e-06, "loss": 0.01892055943608284, "memory(GiB)": 21.48, "step": 17171, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.955148 }, { "epoch": 0.5578403664360199, "grad_norm": 0.25488847494125366, "learning_rate": 4.4582724649044095e-06, "loss": 0.011988525278866291, "memory(GiB)": 21.48, "step": 17172, "token_acc": 0.9883040935672515, "train_speed(iter/s)": 0.955156 }, { "epoch": 0.5578728518987753, "grad_norm": 0.34417232871055603, "learning_rate": 4.457738477155672e-06, "loss": 0.02178647555410862, "memory(GiB)": 21.48, "step": 17173, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955164 }, { "epoch": 0.5579053373615307, "grad_norm": 0.7345989942550659, "learning_rate": 4.457204495665353e-06, "loss": 0.016565851867198944, "memory(GiB)": 21.48, "step": 17174, "token_acc": 1.0, "train_speed(iter/s)": 0.955173 }, { "epoch": 0.5579378228242862, "grad_norm": 0.268032431602478, "learning_rate": 4.456670520439616e-06, "loss": 0.019396714866161346, "memory(GiB)": 21.48, "step": 17175, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955181 }, { "epoch": 0.5579703082870415, "grad_norm": 0.34638088941574097, "learning_rate": 4.456136551484625e-06, "loss": 0.0180472731590271, "memory(GiB)": 21.48, "step": 17176, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955189 }, { "epoch": 0.558002793749797, "grad_norm": 0.2907859981060028, "learning_rate": 4.455602588806542e-06, "loss": 0.015539133921265602, "memory(GiB)": 21.48, "step": 17177, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.5580352792125524, "grad_norm": 0.3295277953147888, "learning_rate": 4.4550686324115314e-06, "loss": 0.023942336440086365, "memory(GiB)": 21.48, "step": 17178, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955203 }, { "epoch": 0.5580677646753078, "grad_norm": 0.3047303557395935, "learning_rate": 4.4545346823057515e-06, "loss": 0.019386539235711098, "memory(GiB)": 21.48, "step": 17179, "token_acc": 1.0, "train_speed(iter/s)": 0.95521 }, { "epoch": 0.5581002501380632, "grad_norm": 0.2869659960269928, "learning_rate": 4.45400073849537e-06, "loss": 0.01455962285399437, "memory(GiB)": 21.48, "step": 17180, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955218 }, { "epoch": 0.5581327356008187, "grad_norm": 0.34510380029678345, "learning_rate": 4.453466800986545e-06, "loss": 0.021041013300418854, "memory(GiB)": 21.48, "step": 17181, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.558165221063574, "grad_norm": 0.3628617823123932, "learning_rate": 4.452932869785441e-06, "loss": 0.020896468311548233, "memory(GiB)": 21.48, "step": 17182, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955236 }, { "epoch": 0.5581977065263295, "grad_norm": 0.48743271827697754, "learning_rate": 4.4523989448982195e-06, "loss": 0.022489506751298904, "memory(GiB)": 21.48, "step": 17183, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955245 }, { "epoch": 0.5582301919890849, "grad_norm": 0.3550039231777191, "learning_rate": 4.451865026331044e-06, "loss": 0.02120007574558258, "memory(GiB)": 21.48, "step": 17184, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955254 }, { "epoch": 0.5582626774518403, "grad_norm": 0.37878555059432983, "learning_rate": 4.4513311140900735e-06, "loss": 0.01989259012043476, "memory(GiB)": 21.48, "step": 17185, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955263 }, { "epoch": 0.5582951629145957, "grad_norm": 0.7498802542686462, "learning_rate": 4.450797208181476e-06, "loss": 0.02675100788474083, "memory(GiB)": 21.48, "step": 17186, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955272 }, { "epoch": 0.5583276483773512, "grad_norm": 0.3882003426551819, "learning_rate": 4.450263308611405e-06, "loss": 0.02467319741845131, "memory(GiB)": 21.48, "step": 17187, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955281 }, { "epoch": 0.5583601338401065, "grad_norm": 0.4213075041770935, "learning_rate": 4.4497294153860315e-06, "loss": 0.018717346712946892, "memory(GiB)": 21.48, "step": 17188, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955289 }, { "epoch": 0.558392619302862, "grad_norm": 0.44041678309440613, "learning_rate": 4.44919552851151e-06, "loss": 0.024361606687307358, "memory(GiB)": 21.48, "step": 17189, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955297 }, { "epoch": 0.5584251047656174, "grad_norm": 0.3782203197479248, "learning_rate": 4.448661647994005e-06, "loss": 0.026806028559803963, "memory(GiB)": 21.48, "step": 17190, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955306 }, { "epoch": 0.5584575902283728, "grad_norm": 0.24077108502388, "learning_rate": 4.44812777383968e-06, "loss": 0.017634496092796326, "memory(GiB)": 21.48, "step": 17191, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.5584900756911282, "grad_norm": 0.3997897505760193, "learning_rate": 4.447593906054694e-06, "loss": 0.02265065908432007, "memory(GiB)": 21.48, "step": 17192, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.5585225611538837, "grad_norm": 0.49578654766082764, "learning_rate": 4.447060044645212e-06, "loss": 0.016351500526070595, "memory(GiB)": 21.48, "step": 17193, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.558555046616639, "grad_norm": 0.3988144099712372, "learning_rate": 4.44652618961739e-06, "loss": 0.02070530131459236, "memory(GiB)": 21.48, "step": 17194, "token_acc": 1.0, "train_speed(iter/s)": 0.955351 }, { "epoch": 0.5585875320793945, "grad_norm": 0.39340177178382874, "learning_rate": 4.445992340977394e-06, "loss": 0.020933203399181366, "memory(GiB)": 21.48, "step": 17195, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.5586200175421498, "grad_norm": 0.30857333540916443, "learning_rate": 4.445458498731383e-06, "loss": 0.014654045924544334, "memory(GiB)": 21.48, "step": 17196, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955374 }, { "epoch": 0.5586525030049053, "grad_norm": 0.29566124081611633, "learning_rate": 4.44492466288552e-06, "loss": 0.023097561672329903, "memory(GiB)": 21.48, "step": 17197, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.955386 }, { "epoch": 0.5586849884676607, "grad_norm": 0.23511797189712524, "learning_rate": 4.444390833445964e-06, "loss": 0.013705722987651825, "memory(GiB)": 21.48, "step": 17198, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.5587174739304162, "grad_norm": 0.2127852737903595, "learning_rate": 4.443857010418879e-06, "loss": 0.010442813858389854, "memory(GiB)": 21.48, "step": 17199, "token_acc": 1.0, "train_speed(iter/s)": 0.955409 }, { "epoch": 0.5587499593931715, "grad_norm": 0.26353248953819275, "learning_rate": 4.443323193810422e-06, "loss": 0.01960904523730278, "memory(GiB)": 21.48, "step": 17200, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.558782444855927, "grad_norm": 0.301967978477478, "learning_rate": 4.442789383626758e-06, "loss": 0.01754167675971985, "memory(GiB)": 21.48, "step": 17201, "token_acc": 1.0, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.5588149303186823, "grad_norm": 0.9093594551086426, "learning_rate": 4.442255579874045e-06, "loss": 0.024889178574085236, "memory(GiB)": 21.48, "step": 17202, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.955445 }, { "epoch": 0.5588474157814378, "grad_norm": 0.2528657019138336, "learning_rate": 4.441721782558447e-06, "loss": 0.012770883738994598, "memory(GiB)": 21.48, "step": 17203, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.5588799012441932, "grad_norm": 0.289296418428421, "learning_rate": 4.4411879916861205e-06, "loss": 0.02270522527396679, "memory(GiB)": 21.48, "step": 17204, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.5589123867069486, "grad_norm": 0.29110217094421387, "learning_rate": 4.440654207263228e-06, "loss": 0.01769118383526802, "memory(GiB)": 21.48, "step": 17205, "token_acc": 1.0, "train_speed(iter/s)": 0.955481 }, { "epoch": 0.558944872169704, "grad_norm": 0.25208139419555664, "learning_rate": 4.4401204292959325e-06, "loss": 0.0160260871052742, "memory(GiB)": 21.48, "step": 17206, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955493 }, { "epoch": 0.5589773576324595, "grad_norm": 0.3871867060661316, "learning_rate": 4.439586657790391e-06, "loss": 0.020031113177537918, "memory(GiB)": 21.48, "step": 17207, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.5590098430952148, "grad_norm": 0.30719664692878723, "learning_rate": 4.439052892752767e-06, "loss": 0.015391005203127861, "memory(GiB)": 21.48, "step": 17208, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.5590423285579703, "grad_norm": 0.3646736741065979, "learning_rate": 4.438519134189218e-06, "loss": 0.02070588991045952, "memory(GiB)": 21.48, "step": 17209, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.5590748140207257, "grad_norm": 0.527480959892273, "learning_rate": 4.437985382105906e-06, "loss": 0.014178290963172913, "memory(GiB)": 21.48, "step": 17210, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955541 }, { "epoch": 0.5591072994834811, "grad_norm": 0.5620855093002319, "learning_rate": 4.4374516365089895e-06, "loss": 0.01987520046532154, "memory(GiB)": 21.48, "step": 17211, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955553 }, { "epoch": 0.5591397849462365, "grad_norm": 0.489801824092865, "learning_rate": 4.4369178974046326e-06, "loss": 0.024291187524795532, "memory(GiB)": 21.48, "step": 17212, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.559172270408992, "grad_norm": 0.24107840657234192, "learning_rate": 4.436384164798989e-06, "loss": 0.015872154384851456, "memory(GiB)": 21.48, "step": 17213, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.5592047558717473, "grad_norm": 0.27537983655929565, "learning_rate": 4.435850438698226e-06, "loss": 0.016250580549240112, "memory(GiB)": 21.48, "step": 17214, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.5592372413345028, "grad_norm": 0.2741008400917053, "learning_rate": 4.435316719108496e-06, "loss": 0.016629567369818687, "memory(GiB)": 21.48, "step": 17215, "token_acc": 1.0, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.5592697267972583, "grad_norm": 0.29190656542778015, "learning_rate": 4.4347830060359666e-06, "loss": 0.020316459238529205, "memory(GiB)": 21.48, "step": 17216, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.955611 }, { "epoch": 0.5593022122600136, "grad_norm": 0.8431104421615601, "learning_rate": 4.434249299486791e-06, "loss": 0.020751487463712692, "memory(GiB)": 21.48, "step": 17217, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.5593346977227691, "grad_norm": 0.2991117835044861, "learning_rate": 4.433715599467133e-06, "loss": 0.017805688083171844, "memory(GiB)": 21.48, "step": 17218, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.5593671831855245, "grad_norm": 0.1963442862033844, "learning_rate": 4.433181905983149e-06, "loss": 0.01037105917930603, "memory(GiB)": 21.48, "step": 17219, "token_acc": 1.0, "train_speed(iter/s)": 0.955647 }, { "epoch": 0.55939966864828, "grad_norm": 0.3487871587276459, "learning_rate": 4.432648219040999e-06, "loss": 0.02005636692047119, "memory(GiB)": 21.48, "step": 17220, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.5594321541110353, "grad_norm": 0.32737937569618225, "learning_rate": 4.432114538646848e-06, "loss": 0.01333470270037651, "memory(GiB)": 21.48, "step": 17221, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.5594646395737908, "grad_norm": 0.3849399983882904, "learning_rate": 4.431580864806848e-06, "loss": 0.01723039150238037, "memory(GiB)": 21.48, "step": 17222, "token_acc": 0.9792387543252595, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.5594971250365461, "grad_norm": 0.43139809370040894, "learning_rate": 4.431047197527162e-06, "loss": 0.023016594350337982, "memory(GiB)": 21.48, "step": 17223, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.5595296104993016, "grad_norm": 0.45831599831581116, "learning_rate": 4.430513536813949e-06, "loss": 0.016166504472494125, "memory(GiB)": 21.48, "step": 17224, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.559562095962057, "grad_norm": 0.383258193731308, "learning_rate": 4.429979882673368e-06, "loss": 0.011660216376185417, "memory(GiB)": 21.48, "step": 17225, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.5595945814248124, "grad_norm": 0.3962269425392151, "learning_rate": 4.429446235111577e-06, "loss": 0.021258678287267685, "memory(GiB)": 21.48, "step": 17226, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.955714 }, { "epoch": 0.5596270668875678, "grad_norm": 0.4115523099899292, "learning_rate": 4.4289125941347365e-06, "loss": 0.018381427973508835, "memory(GiB)": 21.48, "step": 17227, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.5596595523503233, "grad_norm": 0.28265058994293213, "learning_rate": 4.428378959749004e-06, "loss": 0.018544677644968033, "memory(GiB)": 21.48, "step": 17228, "token_acc": 0.996, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.5596920378130786, "grad_norm": 0.3621208667755127, "learning_rate": 4.427845331960541e-06, "loss": 0.01766354590654373, "memory(GiB)": 21.48, "step": 17229, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955742 }, { "epoch": 0.5597245232758341, "grad_norm": 0.23921920359134674, "learning_rate": 4.427311710775502e-06, "loss": 0.01880684494972229, "memory(GiB)": 21.48, "step": 17230, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.5597570087385895, "grad_norm": 0.4569821059703827, "learning_rate": 4.42677809620005e-06, "loss": 0.024604888632893562, "memory(GiB)": 21.48, "step": 17231, "token_acc": 1.0, "train_speed(iter/s)": 0.95576 }, { "epoch": 0.5597894942013449, "grad_norm": 0.4215538501739502, "learning_rate": 4.42624448824034e-06, "loss": 0.01881161704659462, "memory(GiB)": 21.48, "step": 17232, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.5598219796641003, "grad_norm": 0.331074982881546, "learning_rate": 4.425710886902534e-06, "loss": 0.014448842033743858, "memory(GiB)": 21.48, "step": 17233, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.5598544651268558, "grad_norm": 0.47805914282798767, "learning_rate": 4.425177292192787e-06, "loss": 0.021329756826162338, "memory(GiB)": 21.48, "step": 17234, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.5598869505896111, "grad_norm": 0.3425197899341583, "learning_rate": 4.4246437041172625e-06, "loss": 0.019032558426260948, "memory(GiB)": 21.48, "step": 17235, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.5599194360523666, "grad_norm": 0.33118340373039246, "learning_rate": 4.424110122682111e-06, "loss": 0.01616055890917778, "memory(GiB)": 21.48, "step": 17236, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.559951921515122, "grad_norm": 1.0449234247207642, "learning_rate": 4.423576547893497e-06, "loss": 0.02070305123925209, "memory(GiB)": 21.48, "step": 17237, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.5599844069778774, "grad_norm": 0.36739763617515564, "learning_rate": 4.423042979757578e-06, "loss": 0.022441767156124115, "memory(GiB)": 21.48, "step": 17238, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.5600168924406328, "grad_norm": 0.37853583693504333, "learning_rate": 4.422509418280509e-06, "loss": 0.023508794605731964, "memory(GiB)": 21.48, "step": 17239, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.5600493779033883, "grad_norm": 0.6311019062995911, "learning_rate": 4.421975863468452e-06, "loss": 0.017406493425369263, "memory(GiB)": 21.48, "step": 17240, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.5600818633661436, "grad_norm": 0.6923772096633911, "learning_rate": 4.421442315327561e-06, "loss": 0.024724019691348076, "memory(GiB)": 21.48, "step": 17241, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.5601143488288991, "grad_norm": 0.4230770170688629, "learning_rate": 4.420908773863998e-06, "loss": 0.020959148183465004, "memory(GiB)": 21.48, "step": 17242, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.5601468342916545, "grad_norm": 0.31753236055374146, "learning_rate": 4.420375239083915e-06, "loss": 0.022058114409446716, "memory(GiB)": 21.48, "step": 17243, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.5601793197544099, "grad_norm": 0.44426172971725464, "learning_rate": 4.419841710993478e-06, "loss": 0.02317473292350769, "memory(GiB)": 21.48, "step": 17244, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95586 }, { "epoch": 0.5602118052171653, "grad_norm": 0.3261796832084656, "learning_rate": 4.419308189598836e-06, "loss": 0.012630286626517773, "memory(GiB)": 21.48, "step": 17245, "token_acc": 1.0, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.5602442906799208, "grad_norm": 0.4459609389305115, "learning_rate": 4.418774674906152e-06, "loss": 0.01736883632838726, "memory(GiB)": 21.48, "step": 17246, "token_acc": 1.0, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.5602767761426761, "grad_norm": 0.23038269579410553, "learning_rate": 4.418241166921581e-06, "loss": 0.013568875379860401, "memory(GiB)": 21.48, "step": 17247, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.5603092616054316, "grad_norm": 0.3687059283256531, "learning_rate": 4.417707665651282e-06, "loss": 0.024596944451332092, "memory(GiB)": 21.48, "step": 17248, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955894 }, { "epoch": 0.560341747068187, "grad_norm": 1.9463955163955688, "learning_rate": 4.417174171101411e-06, "loss": 0.020237352699041367, "memory(GiB)": 21.48, "step": 17249, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.5603742325309424, "grad_norm": 0.34012484550476074, "learning_rate": 4.416640683278126e-06, "loss": 0.014767255634069443, "memory(GiB)": 21.48, "step": 17250, "token_acc": 1.0, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.5604067179936978, "grad_norm": 0.2576884329319, "learning_rate": 4.4161072021875835e-06, "loss": 0.012643227353692055, "memory(GiB)": 21.48, "step": 17251, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.5604392034564533, "grad_norm": 0.3147308826446533, "learning_rate": 4.415573727835942e-06, "loss": 0.018018046393990517, "memory(GiB)": 21.48, "step": 17252, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955933 }, { "epoch": 0.5604716889192086, "grad_norm": 0.4661919176578522, "learning_rate": 4.415040260229357e-06, "loss": 0.02144288271665573, "memory(GiB)": 21.48, "step": 17253, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.955945 }, { "epoch": 0.5605041743819641, "grad_norm": 0.3494166433811188, "learning_rate": 4.414506799373986e-06, "loss": 0.013972284272313118, "memory(GiB)": 21.48, "step": 17254, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.5605366598447195, "grad_norm": 0.4020952880382538, "learning_rate": 4.413973345275987e-06, "loss": 0.018780551850795746, "memory(GiB)": 21.48, "step": 17255, "token_acc": 1.0, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.5605691453074749, "grad_norm": 0.39032039046287537, "learning_rate": 4.413439897941514e-06, "loss": 0.021599609404802322, "memory(GiB)": 21.48, "step": 17256, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.5606016307702303, "grad_norm": 0.4370061457157135, "learning_rate": 4.412906457376725e-06, "loss": 0.018724573776125908, "memory(GiB)": 21.48, "step": 17257, "token_acc": 1.0, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.5606341162329858, "grad_norm": 0.2649841904640198, "learning_rate": 4.4123730235877786e-06, "loss": 0.014938811771571636, "memory(GiB)": 21.48, "step": 17258, "token_acc": 0.995, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.5606666016957411, "grad_norm": 0.3607844114303589, "learning_rate": 4.411839596580829e-06, "loss": 0.016162065789103508, "memory(GiB)": 21.48, "step": 17259, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956017 }, { "epoch": 0.5606990871584966, "grad_norm": 0.3358725905418396, "learning_rate": 4.411306176362032e-06, "loss": 0.01531297154724598, "memory(GiB)": 21.48, "step": 17260, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956028 }, { "epoch": 0.560731572621252, "grad_norm": 0.3824654519557953, "learning_rate": 4.410772762937547e-06, "loss": 0.020550359040498734, "memory(GiB)": 21.48, "step": 17261, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.5607640580840074, "grad_norm": 0.3909437358379364, "learning_rate": 4.410239356313527e-06, "loss": 0.01620032824575901, "memory(GiB)": 21.48, "step": 17262, "token_acc": 0.9875, "train_speed(iter/s)": 0.956052 }, { "epoch": 0.5607965435467628, "grad_norm": 0.3645482063293457, "learning_rate": 4.409705956496133e-06, "loss": 0.01743950881063938, "memory(GiB)": 21.48, "step": 17263, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.5608290290095183, "grad_norm": 0.467411607503891, "learning_rate": 4.4091725634915135e-06, "loss": 0.022674333304166794, "memory(GiB)": 21.48, "step": 17264, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.5608615144722736, "grad_norm": 0.28788989782333374, "learning_rate": 4.408639177305833e-06, "loss": 0.01243649236857891, "memory(GiB)": 21.48, "step": 17265, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.5608939999350291, "grad_norm": 0.3608552813529968, "learning_rate": 4.4081057979452404e-06, "loss": 0.014100901782512665, "memory(GiB)": 21.48, "step": 17266, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.5609264853977844, "grad_norm": 0.3275024890899658, "learning_rate": 4.407572425415895e-06, "loss": 0.015740511938929558, "memory(GiB)": 21.48, "step": 17267, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.5609589708605399, "grad_norm": 0.3933304250240326, "learning_rate": 4.407039059723955e-06, "loss": 0.016151752322912216, "memory(GiB)": 21.48, "step": 17268, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.5609914563232953, "grad_norm": 0.34058502316474915, "learning_rate": 4.40650570087557e-06, "loss": 0.0144040547311306, "memory(GiB)": 21.48, "step": 17269, "token_acc": 0.9898648648648649, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.5610239417860508, "grad_norm": 0.42421630024909973, "learning_rate": 4.405972348876903e-06, "loss": 0.021027853712439537, "memory(GiB)": 21.48, "step": 17270, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.5610564272488061, "grad_norm": 0.27065902948379517, "learning_rate": 4.405439003734102e-06, "loss": 0.009683292359113693, "memory(GiB)": 21.48, "step": 17271, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.5610889127115616, "grad_norm": 0.3460295498371124, "learning_rate": 4.404905665453329e-06, "loss": 0.018802586942911148, "memory(GiB)": 21.48, "step": 17272, "token_acc": 1.0, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.5611213981743169, "grad_norm": 0.34992915391921997, "learning_rate": 4.404372334040735e-06, "loss": 0.02132233791053295, "memory(GiB)": 21.48, "step": 17273, "token_acc": 1.0, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.5611538836370724, "grad_norm": 0.4501512050628662, "learning_rate": 4.403839009502478e-06, "loss": 0.018791474401950836, "memory(GiB)": 21.48, "step": 17274, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.5611863690998278, "grad_norm": 0.4594528079032898, "learning_rate": 4.4033056918447115e-06, "loss": 0.01668315753340721, "memory(GiB)": 21.48, "step": 17275, "token_acc": 1.0, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.5612188545625832, "grad_norm": 0.27414175868034363, "learning_rate": 4.402772381073593e-06, "loss": 0.01559144351631403, "memory(GiB)": 21.48, "step": 17276, "token_acc": 1.0, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.5612513400253386, "grad_norm": 0.38536107540130615, "learning_rate": 4.402239077195274e-06, "loss": 0.021460149437189102, "memory(GiB)": 21.48, "step": 17277, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956228 }, { "epoch": 0.5612838254880941, "grad_norm": 0.438856840133667, "learning_rate": 4.401705780215914e-06, "loss": 0.022267021238803864, "memory(GiB)": 21.48, "step": 17278, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95624 }, { "epoch": 0.5613163109508494, "grad_norm": 0.29941174387931824, "learning_rate": 4.401172490141664e-06, "loss": 0.017627013847231865, "memory(GiB)": 21.48, "step": 17279, "token_acc": 1.0, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.5613487964136049, "grad_norm": 0.3627564609050751, "learning_rate": 4.4006392069786805e-06, "loss": 0.020420297980308533, "memory(GiB)": 21.48, "step": 17280, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956264 }, { "epoch": 0.5613812818763604, "grad_norm": 0.37238937616348267, "learning_rate": 4.400105930733119e-06, "loss": 0.02122354507446289, "memory(GiB)": 21.48, "step": 17281, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.5614137673391157, "grad_norm": 0.4426075220108032, "learning_rate": 4.399572661411132e-06, "loss": 0.015002502128481865, "memory(GiB)": 21.48, "step": 17282, "token_acc": 1.0, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.5614462528018712, "grad_norm": 0.3113543391227722, "learning_rate": 4.399039399018877e-06, "loss": 0.017702700570225716, "memory(GiB)": 21.48, "step": 17283, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.5614787382646266, "grad_norm": 0.3751703202724457, "learning_rate": 4.398506143562507e-06, "loss": 0.01634218357503414, "memory(GiB)": 21.48, "step": 17284, "token_acc": 1.0, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.561511223727382, "grad_norm": 0.31734171509742737, "learning_rate": 4.397972895048177e-06, "loss": 0.018094081431627274, "memory(GiB)": 21.48, "step": 17285, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.5615437091901374, "grad_norm": 0.35501065850257874, "learning_rate": 4.397439653482041e-06, "loss": 0.014511646702885628, "memory(GiB)": 21.48, "step": 17286, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.5615761946528929, "grad_norm": 0.4484044909477234, "learning_rate": 4.396906418870254e-06, "loss": 0.01920153573155403, "memory(GiB)": 21.48, "step": 17287, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.5616086801156482, "grad_norm": 0.24205611646175385, "learning_rate": 4.396373191218968e-06, "loss": 0.01633145660161972, "memory(GiB)": 21.48, "step": 17288, "token_acc": 1.0, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.5616411655784037, "grad_norm": 0.3873383402824402, "learning_rate": 4.39583997053434e-06, "loss": 0.01689007878303528, "memory(GiB)": 21.48, "step": 17289, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.5616736510411591, "grad_norm": 0.6665573716163635, "learning_rate": 4.395306756822522e-06, "loss": 0.018497709184885025, "memory(GiB)": 21.48, "step": 17290, "token_acc": 1.0, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.5617061365039145, "grad_norm": 0.3823961615562439, "learning_rate": 4.394773550089671e-06, "loss": 0.021106194704771042, "memory(GiB)": 21.48, "step": 17291, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.5617386219666699, "grad_norm": 0.2568527162075043, "learning_rate": 4.394240350341936e-06, "loss": 0.011853998526930809, "memory(GiB)": 21.48, "step": 17292, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95634 }, { "epoch": 0.5617711074294254, "grad_norm": 0.34848153591156006, "learning_rate": 4.393707157585476e-06, "loss": 0.014558413997292519, "memory(GiB)": 21.48, "step": 17293, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956347 }, { "epoch": 0.5618035928921807, "grad_norm": 0.4223417341709137, "learning_rate": 4.39317397182644e-06, "loss": 0.01935564912855625, "memory(GiB)": 21.48, "step": 17294, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.5618360783549362, "grad_norm": 0.3847464621067047, "learning_rate": 4.392640793070987e-06, "loss": 0.01947459578514099, "memory(GiB)": 21.48, "step": 17295, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.95636 }, { "epoch": 0.5618685638176916, "grad_norm": 0.38263756036758423, "learning_rate": 4.392107621325264e-06, "loss": 0.020745472982525826, "memory(GiB)": 21.48, "step": 17296, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.561901049280447, "grad_norm": 0.32726186513900757, "learning_rate": 4.3915744565954295e-06, "loss": 0.01461785938590765, "memory(GiB)": 21.48, "step": 17297, "token_acc": 1.0, "train_speed(iter/s)": 0.956375 }, { "epoch": 0.5619335347432024, "grad_norm": 0.2953425347805023, "learning_rate": 4.391041298887638e-06, "loss": 0.015795059502124786, "memory(GiB)": 21.48, "step": 17298, "token_acc": 1.0, "train_speed(iter/s)": 0.956383 }, { "epoch": 0.5619660202059579, "grad_norm": 0.31766197085380554, "learning_rate": 4.390508148208037e-06, "loss": 0.017579596489667892, "memory(GiB)": 21.48, "step": 17299, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.5619985056687132, "grad_norm": 0.41375869512557983, "learning_rate": 4.389975004562786e-06, "loss": 0.017970111221075058, "memory(GiB)": 21.48, "step": 17300, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.5620309911314687, "grad_norm": 0.35111746191978455, "learning_rate": 4.389441867958033e-06, "loss": 0.02395334653556347, "memory(GiB)": 21.48, "step": 17301, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956405 }, { "epoch": 0.5620634765942241, "grad_norm": 0.4085356593132019, "learning_rate": 4.388908738399936e-06, "loss": 0.022966112941503525, "memory(GiB)": 21.48, "step": 17302, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956411 }, { "epoch": 0.5620959620569795, "grad_norm": 0.3174291253089905, "learning_rate": 4.388375615894643e-06, "loss": 0.023923859000205994, "memory(GiB)": 21.48, "step": 17303, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.956419 }, { "epoch": 0.5621284475197349, "grad_norm": 0.47641217708587646, "learning_rate": 4.387842500448312e-06, "loss": 0.02094629779458046, "memory(GiB)": 21.48, "step": 17304, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956427 }, { "epoch": 0.5621609329824904, "grad_norm": 0.2855171263217926, "learning_rate": 4.3873093920670915e-06, "loss": 0.016444355249404907, "memory(GiB)": 21.48, "step": 17305, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.5621934184452457, "grad_norm": 0.31973013281822205, "learning_rate": 4.386776290757137e-06, "loss": 0.013155958615243435, "memory(GiB)": 21.48, "step": 17306, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.956445 }, { "epoch": 0.5622259039080012, "grad_norm": 0.3066859245300293, "learning_rate": 4.386243196524599e-06, "loss": 0.016192683950066566, "memory(GiB)": 21.48, "step": 17307, "token_acc": 1.0, "train_speed(iter/s)": 0.956453 }, { "epoch": 0.5622583893707566, "grad_norm": 0.3290812373161316, "learning_rate": 4.385710109375633e-06, "loss": 0.017603091895580292, "memory(GiB)": 21.48, "step": 17308, "token_acc": 0.985, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.562290874833512, "grad_norm": 0.3789060413837433, "learning_rate": 4.385177029316388e-06, "loss": 0.02042582258582115, "memory(GiB)": 21.48, "step": 17309, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.5623233602962674, "grad_norm": 0.3268166780471802, "learning_rate": 4.384643956353021e-06, "loss": 0.017769504338502884, "memory(GiB)": 21.48, "step": 17310, "token_acc": 1.0, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.5623558457590229, "grad_norm": 0.35600653290748596, "learning_rate": 4.384110890491679e-06, "loss": 0.02577512338757515, "memory(GiB)": 21.48, "step": 17311, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956493 }, { "epoch": 0.5623883312217782, "grad_norm": 0.38640427589416504, "learning_rate": 4.38357783173852e-06, "loss": 0.02131841517984867, "memory(GiB)": 21.48, "step": 17312, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.5624208166845337, "grad_norm": 0.3655112683773041, "learning_rate": 4.383044780099691e-06, "loss": 0.017010707408189774, "memory(GiB)": 21.48, "step": 17313, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.5624533021472891, "grad_norm": 0.27249276638031006, "learning_rate": 4.3825117355813465e-06, "loss": 0.015322674065828323, "memory(GiB)": 21.48, "step": 17314, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.5624857876100445, "grad_norm": 0.3082468509674072, "learning_rate": 4.38197869818964e-06, "loss": 0.017182687297463417, "memory(GiB)": 21.48, "step": 17315, "token_acc": 1.0, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.5625182730727999, "grad_norm": 0.3557647168636322, "learning_rate": 4.3814456679307205e-06, "loss": 0.021137436851859093, "memory(GiB)": 21.48, "step": 17316, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.5625507585355554, "grad_norm": 0.2936929166316986, "learning_rate": 4.380912644810742e-06, "loss": 0.015468761324882507, "memory(GiB)": 21.48, "step": 17317, "token_acc": 1.0, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.5625832439983107, "grad_norm": 0.27397122979164124, "learning_rate": 4.380379628835855e-06, "loss": 0.012924911454319954, "memory(GiB)": 21.48, "step": 17318, "token_acc": 1.0, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.5626157294610662, "grad_norm": 0.39400872588157654, "learning_rate": 4.3798466200122135e-06, "loss": 0.016688860952854156, "memory(GiB)": 21.48, "step": 17319, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956587 }, { "epoch": 0.5626482149238216, "grad_norm": 0.2943447530269623, "learning_rate": 4.379313618345965e-06, "loss": 0.015613044612109661, "memory(GiB)": 21.48, "step": 17320, "token_acc": 1.0, "train_speed(iter/s)": 0.956599 }, { "epoch": 0.562680700386577, "grad_norm": 0.3957657814025879, "learning_rate": 4.378780623843266e-06, "loss": 0.02301882952451706, "memory(GiB)": 21.48, "step": 17321, "token_acc": 1.0, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.5627131858493324, "grad_norm": 0.3215223550796509, "learning_rate": 4.378247636510263e-06, "loss": 0.018330145627260208, "memory(GiB)": 21.48, "step": 17322, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.5627456713120879, "grad_norm": 0.3304702341556549, "learning_rate": 4.377714656353112e-06, "loss": 0.014737136662006378, "memory(GiB)": 21.48, "step": 17323, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.5627781567748432, "grad_norm": 0.3358224630355835, "learning_rate": 4.37718168337796e-06, "loss": 0.022507939487695694, "memory(GiB)": 21.48, "step": 17324, "token_acc": 0.995, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.5628106422375987, "grad_norm": 0.32958653569221497, "learning_rate": 4.376648717590962e-06, "loss": 0.01931493729352951, "memory(GiB)": 21.48, "step": 17325, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.562843127700354, "grad_norm": 0.4734668433666229, "learning_rate": 4.376115758998267e-06, "loss": 0.032284677028656006, "memory(GiB)": 21.48, "step": 17326, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.5628756131631095, "grad_norm": 0.2059425264596939, "learning_rate": 4.375582807606027e-06, "loss": 0.014299501664936543, "memory(GiB)": 21.48, "step": 17327, "token_acc": 1.0, "train_speed(iter/s)": 0.956681 }, { "epoch": 0.5629080986258649, "grad_norm": 0.40908363461494446, "learning_rate": 4.3750498634203915e-06, "loss": 0.01896090805530548, "memory(GiB)": 21.48, "step": 17328, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.5629405840886204, "grad_norm": 0.26709502935409546, "learning_rate": 4.37451692644751e-06, "loss": 0.014937386848032475, "memory(GiB)": 21.48, "step": 17329, "token_acc": 1.0, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.5629730695513757, "grad_norm": 0.3633054494857788, "learning_rate": 4.373983996693541e-06, "loss": 0.02342771179974079, "memory(GiB)": 21.48, "step": 17330, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.5630055550141312, "grad_norm": 0.29921379685401917, "learning_rate": 4.373451074164627e-06, "loss": 0.017341718077659607, "memory(GiB)": 21.48, "step": 17331, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.5630380404768865, "grad_norm": 0.32875749468803406, "learning_rate": 4.372918158866922e-06, "loss": 0.015050210990011692, "memory(GiB)": 21.48, "step": 17332, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.563070525939642, "grad_norm": 0.4156778156757355, "learning_rate": 4.372385250806576e-06, "loss": 0.014567684382200241, "memory(GiB)": 21.48, "step": 17333, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956751 }, { "epoch": 0.5631030114023974, "grad_norm": 0.5429278016090393, "learning_rate": 4.37185234998974e-06, "loss": 0.016909271478652954, "memory(GiB)": 21.48, "step": 17334, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956763 }, { "epoch": 0.5631354968651529, "grad_norm": 0.46568557620048523, "learning_rate": 4.371319456422562e-06, "loss": 0.025404823943972588, "memory(GiB)": 21.48, "step": 17335, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.5631679823279082, "grad_norm": 0.2581700086593628, "learning_rate": 4.370786570111197e-06, "loss": 0.0122622549533844, "memory(GiB)": 21.48, "step": 17336, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.5632004677906637, "grad_norm": 0.28105902671813965, "learning_rate": 4.37025369106179e-06, "loss": 0.022415637969970703, "memory(GiB)": 21.48, "step": 17337, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.956799 }, { "epoch": 0.563232953253419, "grad_norm": 0.4107425808906555, "learning_rate": 4.369720819280496e-06, "loss": 0.024792732670903206, "memory(GiB)": 21.48, "step": 17338, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.5632654387161745, "grad_norm": 0.3293324112892151, "learning_rate": 4.36918795477346e-06, "loss": 0.020600704476237297, "memory(GiB)": 21.48, "step": 17339, "token_acc": 0.996, "train_speed(iter/s)": 0.956822 }, { "epoch": 0.5632979241789299, "grad_norm": 0.4337792694568634, "learning_rate": 4.368655097546837e-06, "loss": 0.01699286699295044, "memory(GiB)": 21.48, "step": 17340, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.956833 }, { "epoch": 0.5633304096416853, "grad_norm": 1.4643480777740479, "learning_rate": 4.368122247606773e-06, "loss": 0.021841172128915787, "memory(GiB)": 21.48, "step": 17341, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956845 }, { "epoch": 0.5633628951044407, "grad_norm": 0.31825315952301025, "learning_rate": 4.36758940495942e-06, "loss": 0.01419568806886673, "memory(GiB)": 21.48, "step": 17342, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.5633953805671962, "grad_norm": 0.3368339240550995, "learning_rate": 4.367056569610927e-06, "loss": 0.014847713522613049, "memory(GiB)": 21.48, "step": 17343, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956869 }, { "epoch": 0.5634278660299517, "grad_norm": 0.31655073165893555, "learning_rate": 4.366523741567443e-06, "loss": 0.013938823714852333, "memory(GiB)": 21.48, "step": 17344, "token_acc": 1.0, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.563460351492707, "grad_norm": 0.3644876778125763, "learning_rate": 4.365990920835119e-06, "loss": 0.024343006312847137, "memory(GiB)": 21.48, "step": 17345, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.5634928369554625, "grad_norm": 0.47097650170326233, "learning_rate": 4.365458107420103e-06, "loss": 0.021362757310271263, "memory(GiB)": 21.48, "step": 17346, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.9569 }, { "epoch": 0.5635253224182178, "grad_norm": 0.3378269374370575, "learning_rate": 4.364925301328545e-06, "loss": 0.022365476936101913, "memory(GiB)": 21.48, "step": 17347, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956909 }, { "epoch": 0.5635578078809733, "grad_norm": 0.3755309581756592, "learning_rate": 4.364392502566595e-06, "loss": 0.01976107992231846, "memory(GiB)": 21.48, "step": 17348, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.5635902933437287, "grad_norm": 0.3366420269012451, "learning_rate": 4.3638597111404016e-06, "loss": 0.01655525341629982, "memory(GiB)": 21.48, "step": 17349, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956929 }, { "epoch": 0.5636227788064841, "grad_norm": 0.4291292130947113, "learning_rate": 4.363326927056112e-06, "loss": 0.020382529124617577, "memory(GiB)": 21.48, "step": 17350, "token_acc": 1.0, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.5636552642692395, "grad_norm": 0.3378838002681732, "learning_rate": 4.3627941503198804e-06, "loss": 0.019996706396341324, "memory(GiB)": 21.48, "step": 17351, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.563687749731995, "grad_norm": 0.3826693892478943, "learning_rate": 4.362261380937848e-06, "loss": 0.02240592986345291, "memory(GiB)": 21.48, "step": 17352, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.5637202351947503, "grad_norm": 0.3715521991252899, "learning_rate": 4.361728618916173e-06, "loss": 0.020685037598013878, "memory(GiB)": 21.48, "step": 17353, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.5637527206575058, "grad_norm": 0.3065927028656006, "learning_rate": 4.3611958642609955e-06, "loss": 0.016115766018629074, "memory(GiB)": 21.48, "step": 17354, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956971 }, { "epoch": 0.5637852061202612, "grad_norm": 0.4185277819633484, "learning_rate": 4.3606631169784705e-06, "loss": 0.019105467945337296, "memory(GiB)": 21.48, "step": 17355, "token_acc": 1.0, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.5638176915830166, "grad_norm": 0.31911590695381165, "learning_rate": 4.3601303770747426e-06, "loss": 0.019200142472982407, "memory(GiB)": 21.48, "step": 17356, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956986 }, { "epoch": 0.563850177045772, "grad_norm": 0.4031745195388794, "learning_rate": 4.359597644555962e-06, "loss": 0.026608619838953018, "memory(GiB)": 21.48, "step": 17357, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.5638826625085275, "grad_norm": 0.28743937611579895, "learning_rate": 4.359064919428277e-06, "loss": 0.016731079667806625, "memory(GiB)": 21.48, "step": 17358, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.5639151479712828, "grad_norm": 0.5347073078155518, "learning_rate": 4.358532201697835e-06, "loss": 0.01796257123351097, "memory(GiB)": 21.48, "step": 17359, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.5639476334340383, "grad_norm": 0.4284214377403259, "learning_rate": 4.3579994913707865e-06, "loss": 0.021183466538786888, "memory(GiB)": 21.48, "step": 17360, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957017 }, { "epoch": 0.5639801188967937, "grad_norm": 0.3248186409473419, "learning_rate": 4.357466788453277e-06, "loss": 0.017404889687895775, "memory(GiB)": 21.48, "step": 17361, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957025 }, { "epoch": 0.5640126043595491, "grad_norm": 0.389355331659317, "learning_rate": 4.356934092951458e-06, "loss": 0.015326341614127159, "memory(GiB)": 21.48, "step": 17362, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.957034 }, { "epoch": 0.5640450898223045, "grad_norm": 0.2844639718532562, "learning_rate": 4.356401404871474e-06, "loss": 0.017754312604665756, "memory(GiB)": 21.48, "step": 17363, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95704 }, { "epoch": 0.56407757528506, "grad_norm": 0.3963387608528137, "learning_rate": 4.355868724219475e-06, "loss": 0.02181766927242279, "memory(GiB)": 21.48, "step": 17364, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957048 }, { "epoch": 0.5641100607478153, "grad_norm": 0.36583033204078674, "learning_rate": 4.355336051001608e-06, "loss": 0.020752212032675743, "memory(GiB)": 21.48, "step": 17365, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.5641425462105708, "grad_norm": 0.34972310066223145, "learning_rate": 4.354803385224021e-06, "loss": 0.024603327736258507, "memory(GiB)": 21.48, "step": 17366, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957065 }, { "epoch": 0.5641750316733262, "grad_norm": 0.34193480014801025, "learning_rate": 4.3542707268928615e-06, "loss": 0.020349685102701187, "memory(GiB)": 21.48, "step": 17367, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.5642075171360816, "grad_norm": 0.3957913815975189, "learning_rate": 4.353738076014278e-06, "loss": 0.017882809042930603, "memory(GiB)": 21.48, "step": 17368, "token_acc": 0.9966216216216216, "train_speed(iter/s)": 0.957081 }, { "epoch": 0.564240002598837, "grad_norm": 0.37807580828666687, "learning_rate": 4.353205432594417e-06, "loss": 0.018380839377641678, "memory(GiB)": 21.48, "step": 17369, "token_acc": 1.0, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.5642724880615925, "grad_norm": 0.29743918776512146, "learning_rate": 4.352672796639426e-06, "loss": 0.013690445572137833, "memory(GiB)": 21.48, "step": 17370, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.9571 }, { "epoch": 0.5643049735243478, "grad_norm": 0.41111430525779724, "learning_rate": 4.352140168155452e-06, "loss": 0.019635755568742752, "memory(GiB)": 21.48, "step": 17371, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.5643374589871033, "grad_norm": 0.3445240557193756, "learning_rate": 4.351607547148645e-06, "loss": 0.01273147203028202, "memory(GiB)": 21.48, "step": 17372, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.5643699444498587, "grad_norm": 0.4242480993270874, "learning_rate": 4.351074933625147e-06, "loss": 0.02830951288342476, "memory(GiB)": 21.48, "step": 17373, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.5644024299126141, "grad_norm": 0.3737774193286896, "learning_rate": 4.3505423275911104e-06, "loss": 0.023352673277258873, "memory(GiB)": 21.48, "step": 17374, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.5644349153753695, "grad_norm": 0.32567739486694336, "learning_rate": 4.350009729052677e-06, "loss": 0.0148041732609272, "memory(GiB)": 21.48, "step": 17375, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.957157 }, { "epoch": 0.564467400838125, "grad_norm": 0.3795967102050781, "learning_rate": 4.349477138015997e-06, "loss": 0.01956353150308132, "memory(GiB)": 21.48, "step": 17376, "token_acc": 0.996, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.5644998863008803, "grad_norm": 0.49292972683906555, "learning_rate": 4.34894455448722e-06, "loss": 0.022538060322403908, "memory(GiB)": 21.48, "step": 17377, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957179 }, { "epoch": 0.5645323717636358, "grad_norm": 0.42704179883003235, "learning_rate": 4.348411978472486e-06, "loss": 0.018419837579131126, "memory(GiB)": 21.48, "step": 17378, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.5645648572263912, "grad_norm": 0.3258041739463806, "learning_rate": 4.3478794099779475e-06, "loss": 0.017966698855161667, "memory(GiB)": 21.48, "step": 17379, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.5645973426891466, "grad_norm": 0.32562389969825745, "learning_rate": 4.347346849009746e-06, "loss": 0.01681451126933098, "memory(GiB)": 21.48, "step": 17380, "token_acc": 0.995, "train_speed(iter/s)": 0.957213 }, { "epoch": 0.564629828151902, "grad_norm": 0.34296151995658875, "learning_rate": 4.346814295574034e-06, "loss": 0.01947580836713314, "memory(GiB)": 21.48, "step": 17381, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.5646623136146575, "grad_norm": 0.29462048411369324, "learning_rate": 4.34628174967695e-06, "loss": 0.014507659710943699, "memory(GiB)": 21.48, "step": 17382, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957236 }, { "epoch": 0.5646947990774128, "grad_norm": 0.5124883651733398, "learning_rate": 4.345749211324648e-06, "loss": 0.018294675275683403, "memory(GiB)": 21.48, "step": 17383, "token_acc": 1.0, "train_speed(iter/s)": 0.957248 }, { "epoch": 0.5647272845401683, "grad_norm": 0.46919891238212585, "learning_rate": 4.345216680523269e-06, "loss": 0.01944197714328766, "memory(GiB)": 21.48, "step": 17384, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95726 }, { "epoch": 0.5647597700029237, "grad_norm": 0.3579772412776947, "learning_rate": 4.344684157278963e-06, "loss": 0.022273030132055283, "memory(GiB)": 21.48, "step": 17385, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.5647922554656791, "grad_norm": 0.4290415942668915, "learning_rate": 4.344151641597872e-06, "loss": 0.01920231059193611, "memory(GiB)": 21.48, "step": 17386, "token_acc": 1.0, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.5648247409284345, "grad_norm": 0.4645758867263794, "learning_rate": 4.343619133486145e-06, "loss": 0.019658245146274567, "memory(GiB)": 21.48, "step": 17387, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957296 }, { "epoch": 0.56485722639119, "grad_norm": 0.4742984175682068, "learning_rate": 4.343086632949927e-06, "loss": 0.031916435807943344, "memory(GiB)": 21.48, "step": 17388, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.5648897118539453, "grad_norm": 0.3305003345012665, "learning_rate": 4.342554139995363e-06, "loss": 0.02084071934223175, "memory(GiB)": 21.48, "step": 17389, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.5649221973167008, "grad_norm": 0.3648093640804291, "learning_rate": 4.3420216546286e-06, "loss": 0.017022188752889633, "memory(GiB)": 21.48, "step": 17390, "token_acc": 0.993421052631579, "train_speed(iter/s)": 0.957332 }, { "epoch": 0.5649546827794562, "grad_norm": 0.4991373121738434, "learning_rate": 4.3414891768557815e-06, "loss": 0.022447235882282257, "memory(GiB)": 21.48, "step": 17391, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957343 }, { "epoch": 0.5649871682422116, "grad_norm": 0.5742383599281311, "learning_rate": 4.340956706683056e-06, "loss": 0.017453975975513458, "memory(GiB)": 21.48, "step": 17392, "token_acc": 1.0, "train_speed(iter/s)": 0.957354 }, { "epoch": 0.565019653704967, "grad_norm": 0.952390193939209, "learning_rate": 4.3404242441165655e-06, "loss": 0.027362525463104248, "memory(GiB)": 21.48, "step": 17393, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.5650521391677225, "grad_norm": 0.6598585247993469, "learning_rate": 4.3398917891624596e-06, "loss": 0.021626245230436325, "memory(GiB)": 21.48, "step": 17394, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957378 }, { "epoch": 0.5650846246304778, "grad_norm": 0.28258123993873596, "learning_rate": 4.339359341826879e-06, "loss": 0.013091184198856354, "memory(GiB)": 21.48, "step": 17395, "token_acc": 1.0, "train_speed(iter/s)": 0.957389 }, { "epoch": 0.5651171100932333, "grad_norm": 0.4787445664405823, "learning_rate": 4.338826902115972e-06, "loss": 0.025777285918593407, "memory(GiB)": 21.48, "step": 17396, "token_acc": 1.0, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.5651495955559886, "grad_norm": 0.4536420404911041, "learning_rate": 4.338294470035881e-06, "loss": 0.01466324646025896, "memory(GiB)": 21.48, "step": 17397, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957412 }, { "epoch": 0.5651820810187441, "grad_norm": 0.4001735746860504, "learning_rate": 4.337762045592755e-06, "loss": 0.022127831354737282, "memory(GiB)": 21.48, "step": 17398, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957423 }, { "epoch": 0.5652145664814995, "grad_norm": 0.3706139922142029, "learning_rate": 4.337229628792735e-06, "loss": 0.02040894329547882, "memory(GiB)": 21.48, "step": 17399, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957435 }, { "epoch": 0.565247051944255, "grad_norm": 0.42260533571243286, "learning_rate": 4.3366972196419685e-06, "loss": 0.017715368419885635, "memory(GiB)": 21.48, "step": 17400, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.5652795374070103, "grad_norm": 0.3626737892627716, "learning_rate": 4.336164818146596e-06, "loss": 0.014772897586226463, "memory(GiB)": 21.48, "step": 17401, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.5653120228697658, "grad_norm": 0.314340740442276, "learning_rate": 4.335632424312769e-06, "loss": 0.012345383875072002, "memory(GiB)": 21.48, "step": 17402, "token_acc": 1.0, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.5653445083325211, "grad_norm": 0.4799005091190338, "learning_rate": 4.335100038146624e-06, "loss": 0.022130770608782768, "memory(GiB)": 21.48, "step": 17403, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.5653769937952766, "grad_norm": 0.2911141514778137, "learning_rate": 4.334567659654313e-06, "loss": 0.018188683316111565, "memory(GiB)": 21.48, "step": 17404, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957492 }, { "epoch": 0.565409479258032, "grad_norm": 0.2969048023223877, "learning_rate": 4.334035288841975e-06, "loss": 0.016549870371818542, "memory(GiB)": 21.48, "step": 17405, "token_acc": 1.0, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.5654419647207874, "grad_norm": 0.3057597279548645, "learning_rate": 4.333502925715754e-06, "loss": 0.019100718200206757, "memory(GiB)": 21.48, "step": 17406, "token_acc": 1.0, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.5654744501835428, "grad_norm": 0.29629066586494446, "learning_rate": 4.3329705702818015e-06, "loss": 0.020526498556137085, "memory(GiB)": 21.48, "step": 17407, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957525 }, { "epoch": 0.5655069356462983, "grad_norm": 0.3117944896221161, "learning_rate": 4.3324382225462526e-06, "loss": 0.020515430718660355, "memory(GiB)": 21.48, "step": 17408, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.5655394211090538, "grad_norm": 0.32055798172950745, "learning_rate": 4.331905882515258e-06, "loss": 0.02029702626168728, "memory(GiB)": 21.48, "step": 17409, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.5655719065718091, "grad_norm": 0.3793087601661682, "learning_rate": 4.331373550194956e-06, "loss": 0.017903266474604607, "memory(GiB)": 21.48, "step": 17410, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.5656043920345646, "grad_norm": 0.4138706624507904, "learning_rate": 4.330841225591495e-06, "loss": 0.022141773253679276, "memory(GiB)": 21.48, "step": 17411, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957563 }, { "epoch": 0.56563687749732, "grad_norm": 0.3895915150642395, "learning_rate": 4.330308908711016e-06, "loss": 0.019700784236192703, "memory(GiB)": 21.48, "step": 17412, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.95757 }, { "epoch": 0.5656693629600754, "grad_norm": 0.4109303653240204, "learning_rate": 4.329776599559664e-06, "loss": 0.02159765362739563, "memory(GiB)": 21.48, "step": 17413, "token_acc": 1.0, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.5657018484228308, "grad_norm": 0.7229661345481873, "learning_rate": 4.329244298143581e-06, "loss": 0.02271345444023609, "memory(GiB)": 21.48, "step": 17414, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.5657343338855862, "grad_norm": 0.3575822710990906, "learning_rate": 4.328712004468914e-06, "loss": 0.015590879134833813, "memory(GiB)": 21.48, "step": 17415, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957593 }, { "epoch": 0.5657668193483416, "grad_norm": 0.5854810476303101, "learning_rate": 4.328179718541801e-06, "loss": 0.024477936327457428, "memory(GiB)": 21.48, "step": 17416, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.5657993048110971, "grad_norm": 0.29582467675209045, "learning_rate": 4.327647440368391e-06, "loss": 0.015795845538377762, "memory(GiB)": 21.48, "step": 17417, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.5658317902738524, "grad_norm": 0.3115084171295166, "learning_rate": 4.327115169954822e-06, "loss": 0.014343219809234142, "memory(GiB)": 21.48, "step": 17418, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.5658642757366079, "grad_norm": 0.3879605829715729, "learning_rate": 4.326582907307241e-06, "loss": 0.021243896335363388, "memory(GiB)": 21.48, "step": 17419, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.5658967611993633, "grad_norm": 0.3045644462108612, "learning_rate": 4.326050652431789e-06, "loss": 0.018474793061614037, "memory(GiB)": 21.48, "step": 17420, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.5659292466621187, "grad_norm": 0.30051034688949585, "learning_rate": 4.325518405334609e-06, "loss": 0.018051544204354286, "memory(GiB)": 21.48, "step": 17421, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.5659617321248741, "grad_norm": 0.4429610073566437, "learning_rate": 4.324986166021845e-06, "loss": 0.018688876181840897, "memory(GiB)": 21.48, "step": 17422, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957647 }, { "epoch": 0.5659942175876296, "grad_norm": 0.42790472507476807, "learning_rate": 4.324453934499639e-06, "loss": 0.027088649570941925, "memory(GiB)": 21.48, "step": 17423, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.5660267030503849, "grad_norm": 0.31506383419036865, "learning_rate": 4.323921710774134e-06, "loss": 0.017472580075263977, "memory(GiB)": 21.48, "step": 17424, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957662 }, { "epoch": 0.5660591885131404, "grad_norm": 0.364381343126297, "learning_rate": 4.3233894948514724e-06, "loss": 0.014472822658717632, "memory(GiB)": 21.48, "step": 17425, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.5660916739758958, "grad_norm": 0.37781596183776855, "learning_rate": 4.322857286737796e-06, "loss": 0.018924791365861893, "memory(GiB)": 21.48, "step": 17426, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.5661241594386512, "grad_norm": 0.29072627425193787, "learning_rate": 4.322325086439248e-06, "loss": 0.014050399884581566, "memory(GiB)": 21.48, "step": 17427, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.5661566449014066, "grad_norm": 0.3725074529647827, "learning_rate": 4.321792893961972e-06, "loss": 0.02027203142642975, "memory(GiB)": 21.48, "step": 17428, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957693 }, { "epoch": 0.5661891303641621, "grad_norm": 0.22894787788391113, "learning_rate": 4.321260709312106e-06, "loss": 0.014720594510436058, "memory(GiB)": 21.48, "step": 17429, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957702 }, { "epoch": 0.5662216158269174, "grad_norm": 0.20411023497581482, "learning_rate": 4.320728532495798e-06, "loss": 0.008963918313384056, "memory(GiB)": 21.48, "step": 17430, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95771 }, { "epoch": 0.5662541012896729, "grad_norm": 0.47091349959373474, "learning_rate": 4.320196363519183e-06, "loss": 0.025515474379062653, "memory(GiB)": 21.48, "step": 17431, "token_acc": 0.9939393939393939, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.5662865867524283, "grad_norm": 0.4497615694999695, "learning_rate": 4.31966420238841e-06, "loss": 0.026609718799591064, "memory(GiB)": 21.48, "step": 17432, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.5663190722151837, "grad_norm": 0.3939923346042633, "learning_rate": 4.319132049109616e-06, "loss": 0.017243586480617523, "memory(GiB)": 21.48, "step": 17433, "token_acc": 0.992, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.5663515576779391, "grad_norm": 0.3002297282218933, "learning_rate": 4.318599903688946e-06, "loss": 0.014526939019560814, "memory(GiB)": 21.48, "step": 17434, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.5663840431406946, "grad_norm": 0.31481367349624634, "learning_rate": 4.318067766132537e-06, "loss": 0.019268084317445755, "memory(GiB)": 21.48, "step": 17435, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.5664165286034499, "grad_norm": 0.35672909021377563, "learning_rate": 4.3175356364465344e-06, "loss": 0.019760286435484886, "memory(GiB)": 21.48, "step": 17436, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.5664490140662054, "grad_norm": 0.4175233840942383, "learning_rate": 4.317003514637082e-06, "loss": 0.02198585495352745, "memory(GiB)": 21.48, "step": 17437, "token_acc": 1.0, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.5664814995289608, "grad_norm": 0.3128618896007538, "learning_rate": 4.316471400710316e-06, "loss": 0.014313233084976673, "memory(GiB)": 21.48, "step": 17438, "token_acc": 1.0, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.5665139849917162, "grad_norm": 0.8132391571998596, "learning_rate": 4.31593929467238e-06, "loss": 0.02495693787932396, "memory(GiB)": 21.48, "step": 17439, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957814 }, { "epoch": 0.5665464704544716, "grad_norm": 0.33810073137283325, "learning_rate": 4.315407196529414e-06, "loss": 0.01565955951809883, "memory(GiB)": 21.48, "step": 17440, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957826 }, { "epoch": 0.5665789559172271, "grad_norm": 0.26539671421051025, "learning_rate": 4.314875106287562e-06, "loss": 0.010510869324207306, "memory(GiB)": 21.48, "step": 17441, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.5666114413799824, "grad_norm": 0.4151134192943573, "learning_rate": 4.3143430239529614e-06, "loss": 0.02195374295115471, "memory(GiB)": 21.48, "step": 17442, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.5666439268427379, "grad_norm": 0.35733506083488464, "learning_rate": 4.313810949531756e-06, "loss": 0.01777525246143341, "memory(GiB)": 21.48, "step": 17443, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957861 }, { "epoch": 0.5666764123054933, "grad_norm": 0.27045026421546936, "learning_rate": 4.313278883030085e-06, "loss": 0.012321112677454948, "memory(GiB)": 21.48, "step": 17444, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.5667088977682487, "grad_norm": 0.35617583990097046, "learning_rate": 4.312746824454091e-06, "loss": 0.012767201289534569, "memory(GiB)": 21.48, "step": 17445, "token_acc": 1.0, "train_speed(iter/s)": 0.957883 }, { "epoch": 0.5667413832310041, "grad_norm": 0.2639501392841339, "learning_rate": 4.312214773809911e-06, "loss": 0.018985528498888016, "memory(GiB)": 21.48, "step": 17446, "token_acc": 1.0, "train_speed(iter/s)": 0.957895 }, { "epoch": 0.5667738686937596, "grad_norm": 0.46957796812057495, "learning_rate": 4.31168273110369e-06, "loss": 0.021495934575796127, "memory(GiB)": 21.48, "step": 17447, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.5668063541565149, "grad_norm": 0.2887031137943268, "learning_rate": 4.311150696341565e-06, "loss": 0.011490811593830585, "memory(GiB)": 21.48, "step": 17448, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957919 }, { "epoch": 0.5668388396192704, "grad_norm": 0.348541796207428, "learning_rate": 4.31061866952968e-06, "loss": 0.021496273577213287, "memory(GiB)": 21.48, "step": 17449, "token_acc": 0.9764150943396226, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.5668713250820258, "grad_norm": 0.31560397148132324, "learning_rate": 4.310086650674171e-06, "loss": 0.01699158549308777, "memory(GiB)": 21.48, "step": 17450, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957942 }, { "epoch": 0.5669038105447812, "grad_norm": 0.5121086835861206, "learning_rate": 4.309554639781183e-06, "loss": 0.01912565901875496, "memory(GiB)": 21.48, "step": 17451, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.5669362960075366, "grad_norm": 0.35104385018348694, "learning_rate": 4.309022636856849e-06, "loss": 0.01720770075917244, "memory(GiB)": 21.48, "step": 17452, "token_acc": 1.0, "train_speed(iter/s)": 0.957964 }, { "epoch": 0.5669687814702921, "grad_norm": 0.3429177403450012, "learning_rate": 4.308490641907316e-06, "loss": 0.020192358642816544, "memory(GiB)": 21.48, "step": 17453, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.5670012669330474, "grad_norm": 0.31406155228614807, "learning_rate": 4.307958654938721e-06, "loss": 0.01537112332880497, "memory(GiB)": 21.48, "step": 17454, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.957988 }, { "epoch": 0.5670337523958029, "grad_norm": 0.22627811133861542, "learning_rate": 4.307426675957205e-06, "loss": 0.014392618089914322, "memory(GiB)": 21.48, "step": 17455, "token_acc": 1.0, "train_speed(iter/s)": 0.958 }, { "epoch": 0.5670662378585583, "grad_norm": 0.42936787009239197, "learning_rate": 4.306894704968908e-06, "loss": 0.018821369856595993, "memory(GiB)": 21.48, "step": 17456, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.5670987233213137, "grad_norm": 0.3998073935508728, "learning_rate": 4.306362741979966e-06, "loss": 0.021983828395605087, "memory(GiB)": 21.48, "step": 17457, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.5671312087840691, "grad_norm": 0.32292234897613525, "learning_rate": 4.305830786996523e-06, "loss": 0.016682522371411324, "memory(GiB)": 21.48, "step": 17458, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.5671636942468246, "grad_norm": 0.42465537786483765, "learning_rate": 4.305298840024714e-06, "loss": 0.02376730367541313, "memory(GiB)": 21.48, "step": 17459, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.5671961797095799, "grad_norm": 0.4395999014377594, "learning_rate": 4.304766901070685e-06, "loss": 0.02362135425209999, "memory(GiB)": 21.48, "step": 17460, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.5672286651723354, "grad_norm": 0.36266404390335083, "learning_rate": 4.304234970140569e-06, "loss": 0.016519203782081604, "memory(GiB)": 21.48, "step": 17461, "token_acc": 1.0, "train_speed(iter/s)": 0.958069 }, { "epoch": 0.5672611506350907, "grad_norm": 0.3831264078617096, "learning_rate": 4.303703047240509e-06, "loss": 0.018399164080619812, "memory(GiB)": 21.48, "step": 17462, "token_acc": 1.0, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.5672936360978462, "grad_norm": 0.33747801184654236, "learning_rate": 4.3031711323766404e-06, "loss": 0.019590038806200027, "memory(GiB)": 21.48, "step": 17463, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.5673261215606016, "grad_norm": 0.40551984310150146, "learning_rate": 4.302639225555106e-06, "loss": 0.026564668864011765, "memory(GiB)": 21.48, "step": 17464, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958104 }, { "epoch": 0.567358607023357, "grad_norm": 0.44194495677948, "learning_rate": 4.302107326782042e-06, "loss": 0.01894913986325264, "memory(GiB)": 21.48, "step": 17465, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958116 }, { "epoch": 0.5673910924861124, "grad_norm": 0.3618682026863098, "learning_rate": 4.301575436063589e-06, "loss": 0.02275724522769451, "memory(GiB)": 21.48, "step": 17466, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.5674235779488679, "grad_norm": 0.33212438225746155, "learning_rate": 4.3010435534058845e-06, "loss": 0.022780830040574074, "memory(GiB)": 21.48, "step": 17467, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.5674560634116232, "grad_norm": 0.28127843141555786, "learning_rate": 4.3005116788150684e-06, "loss": 0.020944885909557343, "memory(GiB)": 21.48, "step": 17468, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.958152 }, { "epoch": 0.5674885488743787, "grad_norm": 0.3819032609462738, "learning_rate": 4.299979812297278e-06, "loss": 0.015328392386436462, "memory(GiB)": 21.48, "step": 17469, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.5675210343371341, "grad_norm": 0.3037455379962921, "learning_rate": 4.299447953858652e-06, "loss": 0.016782792285084724, "memory(GiB)": 21.48, "step": 17470, "token_acc": 1.0, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.5675535197998895, "grad_norm": 0.5704126358032227, "learning_rate": 4.298916103505331e-06, "loss": 0.016311241313815117, "memory(GiB)": 21.48, "step": 17471, "token_acc": 1.0, "train_speed(iter/s)": 0.95818 }, { "epoch": 0.567586005262645, "grad_norm": 0.43898317217826843, "learning_rate": 4.298384261243449e-06, "loss": 0.02713317610323429, "memory(GiB)": 21.48, "step": 17472, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.958188 }, { "epoch": 0.5676184907254004, "grad_norm": 0.25308793783187866, "learning_rate": 4.297852427079148e-06, "loss": 0.017496634274721146, "memory(GiB)": 21.48, "step": 17473, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.5676509761881559, "grad_norm": 0.4151824414730072, "learning_rate": 4.297320601018564e-06, "loss": 0.022025087848305702, "memory(GiB)": 21.48, "step": 17474, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.958204 }, { "epoch": 0.5676834616509112, "grad_norm": 0.2547301948070526, "learning_rate": 4.296788783067835e-06, "loss": 0.013074158690869808, "memory(GiB)": 21.48, "step": 17475, "token_acc": 1.0, "train_speed(iter/s)": 0.958212 }, { "epoch": 0.5677159471136667, "grad_norm": 0.6317867040634155, "learning_rate": 4.2962569732331e-06, "loss": 0.023086389526724815, "memory(GiB)": 21.48, "step": 17476, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.567748432576422, "grad_norm": 0.2782645523548126, "learning_rate": 4.295725171520497e-06, "loss": 0.01667923666536808, "memory(GiB)": 21.48, "step": 17477, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.5677809180391775, "grad_norm": 0.4277539849281311, "learning_rate": 4.295193377936161e-06, "loss": 0.012827703729271889, "memory(GiB)": 21.48, "step": 17478, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958236 }, { "epoch": 0.5678134035019329, "grad_norm": 0.4443064033985138, "learning_rate": 4.294661592486234e-06, "loss": 0.025306757539510727, "memory(GiB)": 21.48, "step": 17479, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.5678458889646884, "grad_norm": 0.3444497585296631, "learning_rate": 4.294129815176848e-06, "loss": 0.01918129250407219, "memory(GiB)": 21.48, "step": 17480, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.958251 }, { "epoch": 0.5678783744274437, "grad_norm": 0.3343283236026764, "learning_rate": 4.293598046014147e-06, "loss": 0.02128443494439125, "memory(GiB)": 21.48, "step": 17481, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.5679108598901992, "grad_norm": 0.36117973923683167, "learning_rate": 4.293066285004261e-06, "loss": 0.023317191749811172, "memory(GiB)": 21.48, "step": 17482, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.5679433453529545, "grad_norm": 0.4229443669319153, "learning_rate": 4.292534532153333e-06, "loss": 0.028817271813750267, "memory(GiB)": 21.48, "step": 17483, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.56797583081571, "grad_norm": 0.36744001507759094, "learning_rate": 4.292002787467499e-06, "loss": 0.02101062797009945, "memory(GiB)": 21.48, "step": 17484, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.5680083162784654, "grad_norm": 0.3835875988006592, "learning_rate": 4.291471050952892e-06, "loss": 0.020897794514894485, "memory(GiB)": 21.48, "step": 17485, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.958289 }, { "epoch": 0.5680408017412208, "grad_norm": 0.3482365608215332, "learning_rate": 4.290939322615656e-06, "loss": 0.019576063379645348, "memory(GiB)": 21.48, "step": 17486, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.5680732872039762, "grad_norm": 0.4086915850639343, "learning_rate": 4.290407602461921e-06, "loss": 0.022718865424394608, "memory(GiB)": 21.48, "step": 17487, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.5681057726667317, "grad_norm": 0.31638211011886597, "learning_rate": 4.289875890497829e-06, "loss": 0.019038014113903046, "memory(GiB)": 21.48, "step": 17488, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.568138258129487, "grad_norm": 0.23658736050128937, "learning_rate": 4.289344186729514e-06, "loss": 0.016058627516031265, "memory(GiB)": 21.48, "step": 17489, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.958319 }, { "epoch": 0.5681707435922425, "grad_norm": 0.4427541196346283, "learning_rate": 4.288812491163112e-06, "loss": 0.01632075011730194, "memory(GiB)": 21.48, "step": 17490, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.958327 }, { "epoch": 0.5682032290549979, "grad_norm": 0.9871253371238708, "learning_rate": 4.28828080380476e-06, "loss": 0.018363500013947487, "memory(GiB)": 21.48, "step": 17491, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.5682357145177533, "grad_norm": 0.3724105954170227, "learning_rate": 4.287749124660596e-06, "loss": 0.020267417654395103, "memory(GiB)": 21.48, "step": 17492, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.5682681999805087, "grad_norm": 0.31955593824386597, "learning_rate": 4.287217453736754e-06, "loss": 0.01918637752532959, "memory(GiB)": 21.48, "step": 17493, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.958356 }, { "epoch": 0.5683006854432642, "grad_norm": 0.3069517910480499, "learning_rate": 4.286685791039373e-06, "loss": 0.016645897179841995, "memory(GiB)": 21.48, "step": 17494, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.958368 }, { "epoch": 0.5683331709060195, "grad_norm": 0.2757188081741333, "learning_rate": 4.286154136574586e-06, "loss": 0.01592087745666504, "memory(GiB)": 21.48, "step": 17495, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95838 }, { "epoch": 0.568365656368775, "grad_norm": 1.5843154191970825, "learning_rate": 4.285622490348531e-06, "loss": 0.02201620489358902, "memory(GiB)": 21.48, "step": 17496, "token_acc": 0.984, "train_speed(iter/s)": 0.958391 }, { "epoch": 0.5683981418315304, "grad_norm": 0.3636510968208313, "learning_rate": 4.285090852367343e-06, "loss": 0.017180461436510086, "memory(GiB)": 21.48, "step": 17497, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.958402 }, { "epoch": 0.5684306272942858, "grad_norm": 0.41749855875968933, "learning_rate": 4.284559222637157e-06, "loss": 0.021582089364528656, "memory(GiB)": 21.48, "step": 17498, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.958414 }, { "epoch": 0.5684631127570412, "grad_norm": 0.37416601181030273, "learning_rate": 4.284027601164112e-06, "loss": 0.016733981668949127, "memory(GiB)": 21.48, "step": 17499, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958426 }, { "epoch": 0.5684955982197967, "grad_norm": 0.326109915971756, "learning_rate": 4.283495987954339e-06, "loss": 0.014685088768601418, "memory(GiB)": 21.48, "step": 17500, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.958437 }, { "epoch": 0.5684955982197967, "eval_loss": 0.018994595855474472, "eval_runtime": 79.0202, "eval_samples_per_second": 125.917, "eval_steps_per_second": 3.936, "eval_token_acc": 0.992377088706868, "step": 17500 }, { "epoch": 0.568528083682552, "grad_norm": 0.23532705008983612, "learning_rate": 4.282964383013978e-06, "loss": 0.014095744118094444, "memory(GiB)": 21.48, "step": 17501, "token_acc": 0.991877491199474, "train_speed(iter/s)": 0.953773 }, { "epoch": 0.5685605691453075, "grad_norm": 0.40980419516563416, "learning_rate": 4.282432786349161e-06, "loss": 0.022786051034927368, "memory(GiB)": 21.48, "step": 17502, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.953782 }, { "epoch": 0.5685930546080629, "grad_norm": 0.3137959837913513, "learning_rate": 4.281901197966027e-06, "loss": 0.015592342242598534, "memory(GiB)": 21.48, "step": 17503, "token_acc": 1.0, "train_speed(iter/s)": 0.953791 }, { "epoch": 0.5686255400708183, "grad_norm": 0.34882718324661255, "learning_rate": 4.281369617870706e-06, "loss": 0.020709935575723648, "memory(GiB)": 21.48, "step": 17504, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.9538 }, { "epoch": 0.5686580255335737, "grad_norm": 0.3211698830127716, "learning_rate": 4.280838046069339e-06, "loss": 0.01626194640994072, "memory(GiB)": 21.48, "step": 17505, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953808 }, { "epoch": 0.5686905109963292, "grad_norm": 0.46522268652915955, "learning_rate": 4.280306482568056e-06, "loss": 0.018695881590247154, "memory(GiB)": 21.48, "step": 17506, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.5687229964590845, "grad_norm": 0.3109317719936371, "learning_rate": 4.279774927372996e-06, "loss": 0.02025599218904972, "memory(GiB)": 21.48, "step": 17507, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.953824 }, { "epoch": 0.56875548192184, "grad_norm": 0.3819800019264221, "learning_rate": 4.27924338049029e-06, "loss": 0.023807700723409653, "memory(GiB)": 21.48, "step": 17508, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.953832 }, { "epoch": 0.5687879673845954, "grad_norm": 0.2970024347305298, "learning_rate": 4.278711841926077e-06, "loss": 0.017676781862974167, "memory(GiB)": 21.48, "step": 17509, "token_acc": 1.0, "train_speed(iter/s)": 0.953839 }, { "epoch": 0.5688204528473508, "grad_norm": 0.43629395961761475, "learning_rate": 4.278180311686486e-06, "loss": 0.019679885357618332, "memory(GiB)": 21.48, "step": 17510, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.953847 }, { "epoch": 0.5688529383101062, "grad_norm": 0.24773085117340088, "learning_rate": 4.277648789777658e-06, "loss": 0.010075431317090988, "memory(GiB)": 21.48, "step": 17511, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.953854 }, { "epoch": 0.5688854237728617, "grad_norm": 0.270534485578537, "learning_rate": 4.277117276205722e-06, "loss": 0.015626706182956696, "memory(GiB)": 21.48, "step": 17512, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.953862 }, { "epoch": 0.568917909235617, "grad_norm": 0.48587772250175476, "learning_rate": 4.2765857709768176e-06, "loss": 0.030995182693004608, "memory(GiB)": 21.48, "step": 17513, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.95387 }, { "epoch": 0.5689503946983725, "grad_norm": 0.22494378685951233, "learning_rate": 4.276054274097074e-06, "loss": 0.01325454842299223, "memory(GiB)": 21.48, "step": 17514, "token_acc": 1.0, "train_speed(iter/s)": 0.953877 }, { "epoch": 0.5689828801611279, "grad_norm": 0.26211145520210266, "learning_rate": 4.275522785572626e-06, "loss": 0.013623601756989956, "memory(GiB)": 21.48, "step": 17515, "token_acc": 1.0, "train_speed(iter/s)": 0.953885 }, { "epoch": 0.5690153656238833, "grad_norm": 0.439299076795578, "learning_rate": 4.274991305409614e-06, "loss": 0.025345945730805397, "memory(GiB)": 21.48, "step": 17516, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.953893 }, { "epoch": 0.5690478510866387, "grad_norm": 0.3153909742832184, "learning_rate": 4.274459833614163e-06, "loss": 0.018905390053987503, "memory(GiB)": 21.48, "step": 17517, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.953901 }, { "epoch": 0.5690803365493942, "grad_norm": 0.4270707070827484, "learning_rate": 4.273928370192415e-06, "loss": 0.0154973603785038, "memory(GiB)": 21.48, "step": 17518, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.953909 }, { "epoch": 0.5691128220121495, "grad_norm": 0.31426772475242615, "learning_rate": 4.273396915150497e-06, "loss": 0.019062906503677368, "memory(GiB)": 21.48, "step": 17519, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.953918 }, { "epoch": 0.569145307474905, "grad_norm": 0.32885122299194336, "learning_rate": 4.272865468494548e-06, "loss": 0.01979626901447773, "memory(GiB)": 21.48, "step": 17520, "token_acc": 0.9966555183946488, "train_speed(iter/s)": 0.953926 }, { "epoch": 0.5691777929376604, "grad_norm": 0.4032055735588074, "learning_rate": 4.272334030230698e-06, "loss": 0.01550304889678955, "memory(GiB)": 21.48, "step": 17521, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.953935 }, { "epoch": 0.5692102784004158, "grad_norm": 0.6418893933296204, "learning_rate": 4.271802600365082e-06, "loss": 0.029032640159130096, "memory(GiB)": 21.48, "step": 17522, "token_acc": 0.9695431472081218, "train_speed(iter/s)": 0.95393 }, { "epoch": 0.5692427638631712, "grad_norm": 0.3764403760433197, "learning_rate": 4.271271178903833e-06, "loss": 0.0195521991699934, "memory(GiB)": 21.48, "step": 17523, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.953939 }, { "epoch": 0.5692752493259267, "grad_norm": 0.36395376920700073, "learning_rate": 4.270739765853086e-06, "loss": 0.019765285775065422, "memory(GiB)": 21.48, "step": 17524, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.953947 }, { "epoch": 0.569307734788682, "grad_norm": 0.3660145401954651, "learning_rate": 4.27020836121897e-06, "loss": 0.02198811247944832, "memory(GiB)": 21.48, "step": 17525, "token_acc": 1.0, "train_speed(iter/s)": 0.953954 }, { "epoch": 0.5693402202514375, "grad_norm": 0.3178136646747589, "learning_rate": 4.2696769650076235e-06, "loss": 0.017680197954177856, "memory(GiB)": 21.48, "step": 17526, "token_acc": 1.0, "train_speed(iter/s)": 0.953963 }, { "epoch": 0.5693727057141929, "grad_norm": 0.2623632550239563, "learning_rate": 4.269145577225174e-06, "loss": 0.016382742673158646, "memory(GiB)": 21.48, "step": 17527, "token_acc": 1.0, "train_speed(iter/s)": 0.953972 }, { "epoch": 0.5694051911769483, "grad_norm": 0.3351217806339264, "learning_rate": 4.26861419787776e-06, "loss": 0.02234831638634205, "memory(GiB)": 21.48, "step": 17528, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.953981 }, { "epoch": 0.5694376766397037, "grad_norm": 0.2946822941303253, "learning_rate": 4.26808282697151e-06, "loss": 0.014331884682178497, "memory(GiB)": 21.48, "step": 17529, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.953991 }, { "epoch": 0.5694701621024592, "grad_norm": 0.4007357954978943, "learning_rate": 4.267551464512559e-06, "loss": 0.013909555971622467, "memory(GiB)": 21.48, "step": 17530, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.954001 }, { "epoch": 0.5695026475652145, "grad_norm": 0.4583926200866699, "learning_rate": 4.267020110507039e-06, "loss": 0.02633468061685562, "memory(GiB)": 21.48, "step": 17531, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954013 }, { "epoch": 0.56953513302797, "grad_norm": 0.3461771607398987, "learning_rate": 4.266488764961083e-06, "loss": 0.01621018722653389, "memory(GiB)": 21.48, "step": 17532, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.954024 }, { "epoch": 0.5695676184907253, "grad_norm": 0.38924840092658997, "learning_rate": 4.265957427880822e-06, "loss": 0.015557852573692799, "memory(GiB)": 21.48, "step": 17533, "token_acc": 1.0, "train_speed(iter/s)": 0.954036 }, { "epoch": 0.5696001039534808, "grad_norm": 0.27784883975982666, "learning_rate": 4.2654260992723895e-06, "loss": 0.015693390741944313, "memory(GiB)": 21.48, "step": 17534, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954048 }, { "epoch": 0.5696325894162362, "grad_norm": 0.28023990988731384, "learning_rate": 4.264894779141919e-06, "loss": 0.01863500103354454, "memory(GiB)": 21.48, "step": 17535, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.95406 }, { "epoch": 0.5696650748789917, "grad_norm": 0.39146146178245544, "learning_rate": 4.26436346749554e-06, "loss": 0.0264409352093935, "memory(GiB)": 21.48, "step": 17536, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954071 }, { "epoch": 0.5696975603417471, "grad_norm": 0.3774860203266144, "learning_rate": 4.2638321643393866e-06, "loss": 0.016574911773204803, "memory(GiB)": 21.48, "step": 17537, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954083 }, { "epoch": 0.5697300458045025, "grad_norm": 0.31416499614715576, "learning_rate": 4.263300869679587e-06, "loss": 0.01820235699415207, "memory(GiB)": 21.48, "step": 17538, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954095 }, { "epoch": 0.569762531267258, "grad_norm": 0.28483471274375916, "learning_rate": 4.262769583522279e-06, "loss": 0.01751520298421383, "memory(GiB)": 21.48, "step": 17539, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954106 }, { "epoch": 0.5697950167300133, "grad_norm": 0.3282434344291687, "learning_rate": 4.262238305873589e-06, "loss": 0.015678685158491135, "memory(GiB)": 21.48, "step": 17540, "token_acc": 0.988, "train_speed(iter/s)": 0.954118 }, { "epoch": 0.5698275021927688, "grad_norm": 0.5152119398117065, "learning_rate": 4.261707036739654e-06, "loss": 0.02716844528913498, "memory(GiB)": 21.48, "step": 17541, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954128 }, { "epoch": 0.5698599876555241, "grad_norm": 0.30097222328186035, "learning_rate": 4.261175776126599e-06, "loss": 0.020010584965348244, "memory(GiB)": 21.48, "step": 17542, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95414 }, { "epoch": 0.5698924731182796, "grad_norm": 0.3112972676753998, "learning_rate": 4.2606445240405615e-06, "loss": 0.016728119924664497, "memory(GiB)": 21.48, "step": 17543, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954152 }, { "epoch": 0.569924958581035, "grad_norm": 0.253206729888916, "learning_rate": 4.260113280487669e-06, "loss": 0.01137581467628479, "memory(GiB)": 21.48, "step": 17544, "token_acc": 1.0, "train_speed(iter/s)": 0.954163 }, { "epoch": 0.5699574440437905, "grad_norm": 0.2609087824821472, "learning_rate": 4.259582045474052e-06, "loss": 0.011056715622544289, "memory(GiB)": 21.48, "step": 17545, "token_acc": 1.0, "train_speed(iter/s)": 0.954175 }, { "epoch": 0.5699899295065458, "grad_norm": 0.38628265261650085, "learning_rate": 4.259050819005848e-06, "loss": 0.018680550158023834, "memory(GiB)": 21.48, "step": 17546, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954187 }, { "epoch": 0.5700224149693013, "grad_norm": 0.4455409646034241, "learning_rate": 4.258519601089181e-06, "loss": 0.02148311212658882, "memory(GiB)": 21.48, "step": 17547, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.954198 }, { "epoch": 0.5700549004320566, "grad_norm": 0.25963979959487915, "learning_rate": 4.257988391730185e-06, "loss": 0.016557350754737854, "memory(GiB)": 21.48, "step": 17548, "token_acc": 1.0, "train_speed(iter/s)": 0.954209 }, { "epoch": 0.5700873858948121, "grad_norm": 0.6512665152549744, "learning_rate": 4.257457190934992e-06, "loss": 0.029282590374350548, "memory(GiB)": 21.48, "step": 17549, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.95422 }, { "epoch": 0.5701198713575675, "grad_norm": 0.31054121255874634, "learning_rate": 4.25692599870973e-06, "loss": 0.014208108186721802, "memory(GiB)": 21.48, "step": 17550, "token_acc": 0.99, "train_speed(iter/s)": 0.954232 }, { "epoch": 0.570152356820323, "grad_norm": 0.48137056827545166, "learning_rate": 4.256394815060531e-06, "loss": 0.021218791604042053, "memory(GiB)": 21.48, "step": 17551, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.954244 }, { "epoch": 0.5701848422830783, "grad_norm": 0.2700752913951874, "learning_rate": 4.255863639993526e-06, "loss": 0.016296526417136192, "memory(GiB)": 21.48, "step": 17552, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.954257 }, { "epoch": 0.5702173277458338, "grad_norm": 0.3168101906776428, "learning_rate": 4.2553324735148445e-06, "loss": 0.014680400490760803, "memory(GiB)": 21.48, "step": 17553, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954269 }, { "epoch": 0.5702498132085891, "grad_norm": 0.4326264560222626, "learning_rate": 4.254801315630619e-06, "loss": 0.025507722049951553, "memory(GiB)": 21.48, "step": 17554, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954281 }, { "epoch": 0.5702822986713446, "grad_norm": 0.8986284732818604, "learning_rate": 4.254270166346976e-06, "loss": 0.02476648986339569, "memory(GiB)": 21.48, "step": 17555, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.954293 }, { "epoch": 0.5703147841341, "grad_norm": 0.28558510541915894, "learning_rate": 4.25373902567005e-06, "loss": 0.016796259209513664, "memory(GiB)": 21.48, "step": 17556, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.954304 }, { "epoch": 0.5703472695968554, "grad_norm": 0.5005124807357788, "learning_rate": 4.2532078936059665e-06, "loss": 0.016725236549973488, "memory(GiB)": 21.48, "step": 17557, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954316 }, { "epoch": 0.5703797550596108, "grad_norm": 0.5749282240867615, "learning_rate": 4.252676770160859e-06, "loss": 0.020294785499572754, "memory(GiB)": 21.48, "step": 17558, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954325 }, { "epoch": 0.5704122405223663, "grad_norm": 0.47265738248825073, "learning_rate": 4.252145655340857e-06, "loss": 0.024271763861179352, "memory(GiB)": 21.48, "step": 17559, "token_acc": 0.99, "train_speed(iter/s)": 0.954334 }, { "epoch": 0.5704447259851216, "grad_norm": 0.44689369201660156, "learning_rate": 4.2516145491520875e-06, "loss": 0.020454464480280876, "memory(GiB)": 21.48, "step": 17560, "token_acc": 1.0, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.5704772114478771, "grad_norm": 0.4422009587287903, "learning_rate": 4.2510834516006845e-06, "loss": 0.017286628484725952, "memory(GiB)": 21.48, "step": 17561, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.95435 }, { "epoch": 0.5705096969106325, "grad_norm": 0.4223676025867462, "learning_rate": 4.250552362692773e-06, "loss": 0.025206658989191055, "memory(GiB)": 21.48, "step": 17562, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954359 }, { "epoch": 0.5705421823733879, "grad_norm": 0.3651163578033447, "learning_rate": 4.250021282434487e-06, "loss": 0.01491832360625267, "memory(GiB)": 21.48, "step": 17563, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954367 }, { "epoch": 0.5705746678361433, "grad_norm": 0.35491880774497986, "learning_rate": 4.249490210831953e-06, "loss": 0.02015448920428753, "memory(GiB)": 21.48, "step": 17564, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.954376 }, { "epoch": 0.5706071532988988, "grad_norm": 0.3260389268398285, "learning_rate": 4.248959147891302e-06, "loss": 0.02038964256644249, "memory(GiB)": 21.48, "step": 17565, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954384 }, { "epoch": 0.5706396387616541, "grad_norm": 0.3043409287929535, "learning_rate": 4.248428093618659e-06, "loss": 0.012788452208042145, "memory(GiB)": 21.48, "step": 17566, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954392 }, { "epoch": 0.5706721242244096, "grad_norm": 0.4249449372291565, "learning_rate": 4.247897048020159e-06, "loss": 0.02052776888012886, "memory(GiB)": 21.48, "step": 17567, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.9544 }, { "epoch": 0.570704609687165, "grad_norm": 0.572562575340271, "learning_rate": 4.247366011101926e-06, "loss": 0.03037886694073677, "memory(GiB)": 21.48, "step": 17568, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.95441 }, { "epoch": 0.5707370951499204, "grad_norm": 0.7105928063392639, "learning_rate": 4.246834982870093e-06, "loss": 0.013868223875761032, "memory(GiB)": 21.48, "step": 17569, "token_acc": 1.0, "train_speed(iter/s)": 0.954418 }, { "epoch": 0.5707695806126758, "grad_norm": 0.35089582204818726, "learning_rate": 4.246303963330785e-06, "loss": 0.01969315856695175, "memory(GiB)": 21.48, "step": 17570, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.954426 }, { "epoch": 0.5708020660754313, "grad_norm": 0.5561642646789551, "learning_rate": 4.245772952490134e-06, "loss": 0.0255817249417305, "memory(GiB)": 21.48, "step": 17571, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.954434 }, { "epoch": 0.5708345515381866, "grad_norm": 0.30352121591567993, "learning_rate": 4.245241950354266e-06, "loss": 0.013931579887866974, "memory(GiB)": 21.48, "step": 17572, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954442 }, { "epoch": 0.5708670370009421, "grad_norm": 0.3397224247455597, "learning_rate": 4.244710956929311e-06, "loss": 0.012814955785870552, "memory(GiB)": 21.48, "step": 17573, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95445 }, { "epoch": 0.5708995224636975, "grad_norm": 0.37029486894607544, "learning_rate": 4.244179972221396e-06, "loss": 0.01688442938029766, "memory(GiB)": 21.48, "step": 17574, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954458 }, { "epoch": 0.5709320079264529, "grad_norm": 0.3582584857940674, "learning_rate": 4.24364899623665e-06, "loss": 0.02300712652504444, "memory(GiB)": 21.48, "step": 17575, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.954466 }, { "epoch": 0.5709644933892083, "grad_norm": 0.44762468338012695, "learning_rate": 4.243118028981203e-06, "loss": 0.014476936310529709, "memory(GiB)": 21.48, "step": 17576, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.954474 }, { "epoch": 0.5709969788519638, "grad_norm": 0.29150304198265076, "learning_rate": 4.242587070461179e-06, "loss": 0.013764479197561741, "memory(GiB)": 21.48, "step": 17577, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.954481 }, { "epoch": 0.5710294643147191, "grad_norm": 0.20429874956607819, "learning_rate": 4.24205612068271e-06, "loss": 0.01382933184504509, "memory(GiB)": 21.48, "step": 17578, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.954489 }, { "epoch": 0.5710619497774746, "grad_norm": 0.3648642599582672, "learning_rate": 4.2415251796519205e-06, "loss": 0.024674251675605774, "memory(GiB)": 21.48, "step": 17579, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.954498 }, { "epoch": 0.57109443524023, "grad_norm": 0.3346526622772217, "learning_rate": 4.240994247374942e-06, "loss": 0.018395094200968742, "memory(GiB)": 21.48, "step": 17580, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954507 }, { "epoch": 0.5711269207029854, "grad_norm": 0.23103052377700806, "learning_rate": 4.240463323857899e-06, "loss": 0.012775545939803123, "memory(GiB)": 21.48, "step": 17581, "token_acc": 1.0, "train_speed(iter/s)": 0.954516 }, { "epoch": 0.5711594061657408, "grad_norm": 0.33393895626068115, "learning_rate": 4.239932409106921e-06, "loss": 0.019070783630013466, "memory(GiB)": 21.48, "step": 17582, "token_acc": 1.0, "train_speed(iter/s)": 0.954525 }, { "epoch": 0.5711918916284963, "grad_norm": 0.43308746814727783, "learning_rate": 4.239401503128134e-06, "loss": 0.0228425282984972, "memory(GiB)": 21.48, "step": 17583, "token_acc": 1.0, "train_speed(iter/s)": 0.954534 }, { "epoch": 0.5712243770912516, "grad_norm": 0.47358670830726624, "learning_rate": 4.2388706059276664e-06, "loss": 0.021725282073020935, "memory(GiB)": 21.48, "step": 17584, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.954544 }, { "epoch": 0.5712568625540071, "grad_norm": 0.9644396901130676, "learning_rate": 4.238339717511644e-06, "loss": 0.02813943848013878, "memory(GiB)": 21.48, "step": 17585, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954554 }, { "epoch": 0.5712893480167625, "grad_norm": 0.39943134784698486, "learning_rate": 4.2378088378861964e-06, "loss": 0.012651992961764336, "memory(GiB)": 21.48, "step": 17586, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.954563 }, { "epoch": 0.5713218334795179, "grad_norm": 0.3015534579753876, "learning_rate": 4.237277967057449e-06, "loss": 0.018182355910539627, "memory(GiB)": 21.48, "step": 17587, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954571 }, { "epoch": 0.5713543189422733, "grad_norm": 0.2949695289134979, "learning_rate": 4.23674710503153e-06, "loss": 0.01765189878642559, "memory(GiB)": 21.48, "step": 17588, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.954578 }, { "epoch": 0.5713868044050288, "grad_norm": 0.4858829975128174, "learning_rate": 4.236216251814562e-06, "loss": 0.022106647491455078, "memory(GiB)": 21.48, "step": 17589, "token_acc": 1.0, "train_speed(iter/s)": 0.954587 }, { "epoch": 0.5714192898677841, "grad_norm": 0.2956286668777466, "learning_rate": 4.235685407412679e-06, "loss": 0.01543091144412756, "memory(GiB)": 21.48, "step": 17590, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.5714517753305396, "grad_norm": 0.36594143509864807, "learning_rate": 4.235154571832e-06, "loss": 0.015610744245350361, "memory(GiB)": 21.48, "step": 17591, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954605 }, { "epoch": 0.571484260793295, "grad_norm": 1.1844699382781982, "learning_rate": 4.2346237450786565e-06, "loss": 0.021132266148924828, "memory(GiB)": 21.48, "step": 17592, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.954614 }, { "epoch": 0.5715167462560504, "grad_norm": 0.3950144350528717, "learning_rate": 4.2340929271587764e-06, "loss": 0.021720439195632935, "memory(GiB)": 21.48, "step": 17593, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954622 }, { "epoch": 0.5715492317188058, "grad_norm": 0.39537176489830017, "learning_rate": 4.233562118078479e-06, "loss": 0.020710766315460205, "memory(GiB)": 21.48, "step": 17594, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954633 }, { "epoch": 0.5715817171815613, "grad_norm": 0.5840418934822083, "learning_rate": 4.233031317843899e-06, "loss": 0.017611023038625717, "memory(GiB)": 21.48, "step": 17595, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954644 }, { "epoch": 0.5716142026443166, "grad_norm": 0.3331671953201294, "learning_rate": 4.2325005264611545e-06, "loss": 0.020522478967905045, "memory(GiB)": 21.48, "step": 17596, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.954654 }, { "epoch": 0.5716466881070721, "grad_norm": 0.3950153887271881, "learning_rate": 4.231969743936379e-06, "loss": 0.020208653062582016, "memory(GiB)": 21.48, "step": 17597, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.954666 }, { "epoch": 0.5716791735698274, "grad_norm": 0.35761862993240356, "learning_rate": 4.231438970275693e-06, "loss": 0.019191930070519447, "memory(GiB)": 21.48, "step": 17598, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954678 }, { "epoch": 0.5717116590325829, "grad_norm": 0.3363244831562042, "learning_rate": 4.230908205485225e-06, "loss": 0.018673844635486603, "memory(GiB)": 21.48, "step": 17599, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954689 }, { "epoch": 0.5717441444953384, "grad_norm": 0.2987484633922577, "learning_rate": 4.230377449571099e-06, "loss": 0.01916244626045227, "memory(GiB)": 21.48, "step": 17600, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.9547 }, { "epoch": 0.5717766299580938, "grad_norm": 0.48111990094184875, "learning_rate": 4.229846702539443e-06, "loss": 0.020104505121707916, "memory(GiB)": 21.48, "step": 17601, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954711 }, { "epoch": 0.5718091154208492, "grad_norm": 0.40601563453674316, "learning_rate": 4.2293159643963794e-06, "loss": 0.016001813113689423, "memory(GiB)": 21.48, "step": 17602, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.954722 }, { "epoch": 0.5718416008836046, "grad_norm": 0.37719085812568665, "learning_rate": 4.228785235148037e-06, "loss": 0.022769387811422348, "memory(GiB)": 21.48, "step": 17603, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.954734 }, { "epoch": 0.57187408634636, "grad_norm": 0.3242563009262085, "learning_rate": 4.228254514800537e-06, "loss": 0.016237856820225716, "memory(GiB)": 21.48, "step": 17604, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954745 }, { "epoch": 0.5719065718091154, "grad_norm": 0.3512616455554962, "learning_rate": 4.2277238033600104e-06, "loss": 0.01392374187707901, "memory(GiB)": 21.48, "step": 17605, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.954756 }, { "epoch": 0.5719390572718709, "grad_norm": 0.268652081489563, "learning_rate": 4.2271931008325765e-06, "loss": 0.013078557327389717, "memory(GiB)": 21.48, "step": 17606, "token_acc": 0.9779735682819384, "train_speed(iter/s)": 0.954767 }, { "epoch": 0.5719715427346262, "grad_norm": 0.34481266140937805, "learning_rate": 4.226662407224363e-06, "loss": 0.015205137431621552, "memory(GiB)": 21.48, "step": 17607, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.954778 }, { "epoch": 0.5720040281973817, "grad_norm": 0.6679655909538269, "learning_rate": 4.226131722541496e-06, "loss": 0.024351298809051514, "memory(GiB)": 21.48, "step": 17608, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954788 }, { "epoch": 0.5720365136601371, "grad_norm": 0.39206090569496155, "learning_rate": 4.2256010467900975e-06, "loss": 0.017714526504278183, "memory(GiB)": 21.48, "step": 17609, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.5720689991228926, "grad_norm": 0.361311137676239, "learning_rate": 4.225070379976295e-06, "loss": 0.015566688030958176, "memory(GiB)": 21.48, "step": 17610, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954812 }, { "epoch": 0.5721014845856479, "grad_norm": 0.46552902460098267, "learning_rate": 4.224539722106211e-06, "loss": 0.020056327804923058, "memory(GiB)": 21.48, "step": 17611, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.5721339700484034, "grad_norm": 0.4309994578361511, "learning_rate": 4.224009073185972e-06, "loss": 0.02244061417877674, "memory(GiB)": 21.48, "step": 17612, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.5721664555111587, "grad_norm": 1.0977214574813843, "learning_rate": 4.2234784332216985e-06, "loss": 0.020729346200823784, "memory(GiB)": 21.48, "step": 17613, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954846 }, { "epoch": 0.5721989409739142, "grad_norm": 0.29260876774787903, "learning_rate": 4.222947802219521e-06, "loss": 0.014336774125695229, "memory(GiB)": 21.48, "step": 17614, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.5722314264366696, "grad_norm": 0.2835201919078827, "learning_rate": 4.222417180185557e-06, "loss": 0.01881069503724575, "memory(GiB)": 21.48, "step": 17615, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954869 }, { "epoch": 0.572263911899425, "grad_norm": 0.2624404728412628, "learning_rate": 4.2218865671259354e-06, "loss": 0.012995610013604164, "memory(GiB)": 21.48, "step": 17616, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954881 }, { "epoch": 0.5722963973621804, "grad_norm": 0.30982398986816406, "learning_rate": 4.221355963046777e-06, "loss": 0.020528050139546394, "memory(GiB)": 21.48, "step": 17617, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954891 }, { "epoch": 0.5723288828249359, "grad_norm": 0.29264169931411743, "learning_rate": 4.220825367954209e-06, "loss": 0.014348562806844711, "memory(GiB)": 21.48, "step": 17618, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954899 }, { "epoch": 0.5723613682876912, "grad_norm": 0.36402854323387146, "learning_rate": 4.220294781854351e-06, "loss": 0.023073021322488785, "memory(GiB)": 21.48, "step": 17619, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.5723938537504467, "grad_norm": 0.21119849383831024, "learning_rate": 4.2197642047533325e-06, "loss": 0.013443493284285069, "memory(GiB)": 21.48, "step": 17620, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954917 }, { "epoch": 0.5724263392132021, "grad_norm": 0.3062569797039032, "learning_rate": 4.21923363665727e-06, "loss": 0.014778532087802887, "memory(GiB)": 21.48, "step": 17621, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.954925 }, { "epoch": 0.5724588246759575, "grad_norm": 0.5287043452262878, "learning_rate": 4.218703077572291e-06, "loss": 0.020764000713825226, "memory(GiB)": 21.48, "step": 17622, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.5724913101387129, "grad_norm": 0.3532344400882721, "learning_rate": 4.218172527504521e-06, "loss": 0.018496476113796234, "memory(GiB)": 21.48, "step": 17623, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.5725237956014684, "grad_norm": 0.3613120913505554, "learning_rate": 4.217641986460077e-06, "loss": 0.020063860341906548, "memory(GiB)": 21.48, "step": 17624, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95495 }, { "epoch": 0.5725562810642237, "grad_norm": 0.41014358401298523, "learning_rate": 4.21711145444509e-06, "loss": 0.019427116960287094, "memory(GiB)": 21.48, "step": 17625, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.5725887665269792, "grad_norm": 0.3437415063381195, "learning_rate": 4.216580931465676e-06, "loss": 0.022524692118167877, "memory(GiB)": 21.48, "step": 17626, "token_acc": 1.0, "train_speed(iter/s)": 0.954967 }, { "epoch": 0.5726212519897346, "grad_norm": 0.38296234607696533, "learning_rate": 4.216050417527962e-06, "loss": 0.017632078379392624, "memory(GiB)": 21.48, "step": 17627, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954976 }, { "epoch": 0.57265373745249, "grad_norm": 0.333981454372406, "learning_rate": 4.215519912638068e-06, "loss": 0.01716996729373932, "memory(GiB)": 21.48, "step": 17628, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.5726862229152454, "grad_norm": 0.43755003809928894, "learning_rate": 4.21498941680212e-06, "loss": 0.02567298710346222, "memory(GiB)": 21.48, "step": 17629, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.954994 }, { "epoch": 0.5727187083780009, "grad_norm": 0.3633398115634918, "learning_rate": 4.214458930026238e-06, "loss": 0.02145245112478733, "memory(GiB)": 21.48, "step": 17630, "token_acc": 1.0, "train_speed(iter/s)": 0.955003 }, { "epoch": 0.5727511938407562, "grad_norm": 0.4572961628437042, "learning_rate": 4.213928452316546e-06, "loss": 0.020839009433984756, "memory(GiB)": 21.48, "step": 17631, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955012 }, { "epoch": 0.5727836793035117, "grad_norm": 0.3161812424659729, "learning_rate": 4.213397983679166e-06, "loss": 0.016822045668959618, "memory(GiB)": 21.48, "step": 17632, "token_acc": 1.0, "train_speed(iter/s)": 0.955021 }, { "epoch": 0.5728161647662671, "grad_norm": 0.3433167636394501, "learning_rate": 4.21286752412022e-06, "loss": 0.017787277698516846, "memory(GiB)": 21.48, "step": 17633, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95503 }, { "epoch": 0.5728486502290225, "grad_norm": 0.6042144298553467, "learning_rate": 4.21233707364583e-06, "loss": 0.016769010573625565, "memory(GiB)": 21.48, "step": 17634, "token_acc": 0.9899665551839465, "train_speed(iter/s)": 0.955037 }, { "epoch": 0.5728811356917779, "grad_norm": 0.28445085883140564, "learning_rate": 4.21180663226212e-06, "loss": 0.017537539824843407, "memory(GiB)": 21.48, "step": 17635, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.955045 }, { "epoch": 0.5729136211545334, "grad_norm": 0.3090813159942627, "learning_rate": 4.211276199975208e-06, "loss": 0.02244878001511097, "memory(GiB)": 21.48, "step": 17636, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.955052 }, { "epoch": 0.5729461066172887, "grad_norm": 0.4186623692512512, "learning_rate": 4.21074577679122e-06, "loss": 0.02272997796535492, "memory(GiB)": 21.48, "step": 17637, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.955061 }, { "epoch": 0.5729785920800442, "grad_norm": 0.30451932549476624, "learning_rate": 4.210215362716277e-06, "loss": 0.013857296667993069, "memory(GiB)": 21.48, "step": 17638, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.95507 }, { "epoch": 0.5730110775427996, "grad_norm": 0.4568559527397156, "learning_rate": 4.209684957756498e-06, "loss": 0.02136809006333351, "memory(GiB)": 21.48, "step": 17639, "token_acc": 0.9748953974895398, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.573043563005555, "grad_norm": 0.42478686571121216, "learning_rate": 4.2091545619180085e-06, "loss": 0.029891550540924072, "memory(GiB)": 21.48, "step": 17640, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955088 }, { "epoch": 0.5730760484683104, "grad_norm": 0.40793174505233765, "learning_rate": 4.208624175206927e-06, "loss": 0.02012757956981659, "memory(GiB)": 21.48, "step": 17641, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.955097 }, { "epoch": 0.5731085339310659, "grad_norm": 0.39669069647789, "learning_rate": 4.208093797629376e-06, "loss": 0.02507110871374607, "memory(GiB)": 21.48, "step": 17642, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.5731410193938212, "grad_norm": 0.43042460083961487, "learning_rate": 4.2075634291914756e-06, "loss": 0.023030675947666168, "memory(GiB)": 21.48, "step": 17643, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955115 }, { "epoch": 0.5731735048565767, "grad_norm": 0.37193024158477783, "learning_rate": 4.207033069899351e-06, "loss": 0.016676675528287888, "memory(GiB)": 21.48, "step": 17644, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955124 }, { "epoch": 0.5732059903193321, "grad_norm": 0.28700941801071167, "learning_rate": 4.2065027197591165e-06, "loss": 0.01315760612487793, "memory(GiB)": 21.48, "step": 17645, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.5732384757820875, "grad_norm": 0.3706410825252533, "learning_rate": 4.2059723787769e-06, "loss": 0.017198674380779266, "memory(GiB)": 21.48, "step": 17646, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.955142 }, { "epoch": 0.5732709612448429, "grad_norm": 0.2386375367641449, "learning_rate": 4.205442046958815e-06, "loss": 0.018837830051779747, "memory(GiB)": 21.48, "step": 17647, "token_acc": 0.9862068965517241, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.5733034467075984, "grad_norm": 0.3474581241607666, "learning_rate": 4.204911724310991e-06, "loss": 0.018323104828596115, "memory(GiB)": 21.48, "step": 17648, "token_acc": 1.0, "train_speed(iter/s)": 0.955159 }, { "epoch": 0.5733359321703537, "grad_norm": 0.35434651374816895, "learning_rate": 4.204381410839542e-06, "loss": 0.021779531612992287, "memory(GiB)": 21.48, "step": 17649, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.5733684176331092, "grad_norm": 0.45042455196380615, "learning_rate": 4.20385110655059e-06, "loss": 0.018054403364658356, "memory(GiB)": 21.48, "step": 17650, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955178 }, { "epoch": 0.5734009030958646, "grad_norm": 0.41835933923721313, "learning_rate": 4.203320811450256e-06, "loss": 0.0276119876652956, "memory(GiB)": 21.48, "step": 17651, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.57343338855862, "grad_norm": 0.42550137639045715, "learning_rate": 4.202790525544661e-06, "loss": 0.02077043429017067, "memory(GiB)": 21.48, "step": 17652, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955196 }, { "epoch": 0.5734658740213754, "grad_norm": 0.45885491371154785, "learning_rate": 4.202260248839923e-06, "loss": 0.029819848015904427, "memory(GiB)": 21.48, "step": 17653, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.5734983594841309, "grad_norm": 0.3388366997241974, "learning_rate": 4.201729981342163e-06, "loss": 0.016790669411420822, "memory(GiB)": 21.48, "step": 17654, "token_acc": 1.0, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.5735308449468862, "grad_norm": 0.2523488700389862, "learning_rate": 4.201199723057504e-06, "loss": 0.01748865284025669, "memory(GiB)": 21.48, "step": 17655, "token_acc": 1.0, "train_speed(iter/s)": 0.955223 }, { "epoch": 0.5735633304096417, "grad_norm": 0.34896376729011536, "learning_rate": 4.200669473992062e-06, "loss": 0.023387424647808075, "memory(GiB)": 21.48, "step": 17656, "token_acc": 0.988, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.573595815872397, "grad_norm": 0.5312727689743042, "learning_rate": 4.200139234151958e-06, "loss": 0.018734650686383247, "memory(GiB)": 21.48, "step": 17657, "token_acc": 1.0, "train_speed(iter/s)": 0.955243 }, { "epoch": 0.5736283013351525, "grad_norm": 0.42471957206726074, "learning_rate": 4.199609003543312e-06, "loss": 0.020479846745729446, "memory(GiB)": 21.48, "step": 17658, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.955255 }, { "epoch": 0.5736607867979079, "grad_norm": 0.257050484418869, "learning_rate": 4.199078782172244e-06, "loss": 0.013815095648169518, "memory(GiB)": 21.48, "step": 17659, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955266 }, { "epoch": 0.5736932722606634, "grad_norm": 0.34324246644973755, "learning_rate": 4.198548570044872e-06, "loss": 0.023420870304107666, "memory(GiB)": 21.48, "step": 17660, "token_acc": 1.0, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.5737257577234187, "grad_norm": 0.35082292556762695, "learning_rate": 4.1980183671673175e-06, "loss": 0.018744954839348793, "memory(GiB)": 21.48, "step": 17661, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.5737582431861742, "grad_norm": 0.43665361404418945, "learning_rate": 4.197488173545697e-06, "loss": 0.025392111390829086, "memory(GiB)": 21.48, "step": 17662, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.5737907286489295, "grad_norm": 0.2870311141014099, "learning_rate": 4.196957989186132e-06, "loss": 0.017477992922067642, "memory(GiB)": 21.48, "step": 17663, "token_acc": 1.0, "train_speed(iter/s)": 0.955312 }, { "epoch": 0.573823214111685, "grad_norm": 0.30830737948417664, "learning_rate": 4.19642781409474e-06, "loss": 0.01508341170847416, "memory(GiB)": 21.48, "step": 17664, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.5738556995744405, "grad_norm": 0.3695983588695526, "learning_rate": 4.195897648277642e-06, "loss": 0.020836809650063515, "memory(GiB)": 21.48, "step": 17665, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.955335 }, { "epoch": 0.5738881850371959, "grad_norm": 0.31259971857070923, "learning_rate": 4.1953674917409535e-06, "loss": 0.017593853175640106, "memory(GiB)": 21.48, "step": 17666, "token_acc": 1.0, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.5739206704999513, "grad_norm": 0.34052902460098267, "learning_rate": 4.194837344490798e-06, "loss": 0.01704646460711956, "memory(GiB)": 21.48, "step": 17667, "token_acc": 1.0, "train_speed(iter/s)": 0.955358 }, { "epoch": 0.5739531559627067, "grad_norm": 0.3389895558357239, "learning_rate": 4.194307206533288e-06, "loss": 0.021771647036075592, "memory(GiB)": 21.48, "step": 17668, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.5739856414254622, "grad_norm": 0.41211459040641785, "learning_rate": 4.193777077874546e-06, "loss": 0.02067263424396515, "memory(GiB)": 21.48, "step": 17669, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.5740181268882175, "grad_norm": 0.3805123567581177, "learning_rate": 4.19324695852069e-06, "loss": 0.021472442895174026, "memory(GiB)": 21.48, "step": 17670, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955392 }, { "epoch": 0.574050612350973, "grad_norm": 0.3416929543018341, "learning_rate": 4.192716848477838e-06, "loss": 0.016195327043533325, "memory(GiB)": 21.48, "step": 17671, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955403 }, { "epoch": 0.5740830978137283, "grad_norm": 0.5203222632408142, "learning_rate": 4.192186747752109e-06, "loss": 0.020917288959026337, "memory(GiB)": 21.48, "step": 17672, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.5741155832764838, "grad_norm": 0.4825177490711212, "learning_rate": 4.191656656349618e-06, "loss": 0.020166222006082535, "memory(GiB)": 21.48, "step": 17673, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.5741480687392392, "grad_norm": 0.22417177259922028, "learning_rate": 4.191126574276488e-06, "loss": 0.01212350931018591, "memory(GiB)": 21.48, "step": 17674, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.955437 }, { "epoch": 0.5741805542019947, "grad_norm": 0.2917250394821167, "learning_rate": 4.190596501538831e-06, "loss": 0.015114100649952888, "memory(GiB)": 21.48, "step": 17675, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955448 }, { "epoch": 0.57421303966475, "grad_norm": 0.38837605714797974, "learning_rate": 4.19006643814277e-06, "loss": 0.01541139092296362, "memory(GiB)": 21.48, "step": 17676, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.5742455251275055, "grad_norm": 0.30578091740608215, "learning_rate": 4.189536384094419e-06, "loss": 0.018723076209425926, "memory(GiB)": 21.48, "step": 17677, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955469 }, { "epoch": 0.5742780105902608, "grad_norm": 0.5008018016815186, "learning_rate": 4.189006339399897e-06, "loss": 0.019957054406404495, "memory(GiB)": 21.48, "step": 17678, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.5743104960530163, "grad_norm": 0.30547648668289185, "learning_rate": 4.188476304065321e-06, "loss": 0.02374260127544403, "memory(GiB)": 21.48, "step": 17679, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.5743429815157717, "grad_norm": 0.3818790316581726, "learning_rate": 4.18794627809681e-06, "loss": 0.019095227122306824, "memory(GiB)": 21.48, "step": 17680, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.5743754669785271, "grad_norm": 0.38138481974601746, "learning_rate": 4.187416261500477e-06, "loss": 0.016124099493026733, "memory(GiB)": 21.48, "step": 17681, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.5744079524412825, "grad_norm": 0.37569379806518555, "learning_rate": 4.186886254282445e-06, "loss": 0.021784819662570953, "memory(GiB)": 21.48, "step": 17682, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.955515 }, { "epoch": 0.574440437904038, "grad_norm": 0.4273584187030792, "learning_rate": 4.186356256448826e-06, "loss": 0.027238968759775162, "memory(GiB)": 21.48, "step": 17683, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.5744729233667933, "grad_norm": 0.3990817666053772, "learning_rate": 4.1858262680057384e-06, "loss": 0.019163809716701508, "memory(GiB)": 21.48, "step": 17684, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955533 }, { "epoch": 0.5745054088295488, "grad_norm": 0.34672287106513977, "learning_rate": 4.185296288959301e-06, "loss": 0.017077405005693436, "memory(GiB)": 21.48, "step": 17685, "token_acc": 1.0, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.5745378942923042, "grad_norm": 0.5137193202972412, "learning_rate": 4.184766319315627e-06, "loss": 0.012798549607396126, "memory(GiB)": 21.48, "step": 17686, "token_acc": 1.0, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.5745703797550596, "grad_norm": 0.4083113670349121, "learning_rate": 4.1842363590808366e-06, "loss": 0.029724061489105225, "memory(GiB)": 21.48, "step": 17687, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.574602865217815, "grad_norm": 0.40991511940956116, "learning_rate": 4.183706408261043e-06, "loss": 0.015691813081502914, "memory(GiB)": 21.48, "step": 17688, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.5746353506805705, "grad_norm": 0.28415852785110474, "learning_rate": 4.183176466862365e-06, "loss": 0.01477043516933918, "memory(GiB)": 21.48, "step": 17689, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.5746678361433258, "grad_norm": 0.30521371960639954, "learning_rate": 4.182646534890916e-06, "loss": 0.015652909874916077, "memory(GiB)": 21.48, "step": 17690, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.5747003216060813, "grad_norm": 0.49586108326911926, "learning_rate": 4.182116612352815e-06, "loss": 0.024804364889860153, "memory(GiB)": 21.48, "step": 17691, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.95559 }, { "epoch": 0.5747328070688367, "grad_norm": 0.4166782796382904, "learning_rate": 4.181586699254178e-06, "loss": 0.02305566892027855, "memory(GiB)": 21.48, "step": 17692, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955598 }, { "epoch": 0.5747652925315921, "grad_norm": 0.2754296064376831, "learning_rate": 4.181056795601119e-06, "loss": 0.012485666200518608, "memory(GiB)": 21.48, "step": 17693, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.5747977779943475, "grad_norm": 0.4518143832683563, "learning_rate": 4.180526901399753e-06, "loss": 0.022945698350667953, "memory(GiB)": 21.48, "step": 17694, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.574830263457103, "grad_norm": 0.3595356047153473, "learning_rate": 4.179997016656201e-06, "loss": 0.020066790282726288, "memory(GiB)": 21.48, "step": 17695, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.5748627489198583, "grad_norm": 0.4875364899635315, "learning_rate": 4.179467141376571e-06, "loss": 0.02197101339697838, "memory(GiB)": 21.48, "step": 17696, "token_acc": 1.0, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.5748952343826138, "grad_norm": 0.2692987620830536, "learning_rate": 4.178937275566985e-06, "loss": 0.015411414206027985, "memory(GiB)": 21.48, "step": 17697, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.5749277198453692, "grad_norm": 0.3860624134540558, "learning_rate": 4.178407419233553e-06, "loss": 0.016268562525510788, "memory(GiB)": 21.48, "step": 17698, "token_acc": 1.0, "train_speed(iter/s)": 0.955626 }, { "epoch": 0.5749602053081246, "grad_norm": 0.4325045347213745, "learning_rate": 4.177877572382394e-06, "loss": 0.02379985712468624, "memory(GiB)": 21.48, "step": 17699, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.57499269077088, "grad_norm": 0.3231373131275177, "learning_rate": 4.177347735019623e-06, "loss": 0.01661725342273712, "memory(GiB)": 21.48, "step": 17700, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.5750251762336355, "grad_norm": 0.28774896264076233, "learning_rate": 4.176817907151352e-06, "loss": 0.013971563428640366, "memory(GiB)": 21.48, "step": 17701, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.5750576616963908, "grad_norm": 0.44212979078292847, "learning_rate": 4.176288088783702e-06, "loss": 0.018568094819784164, "memory(GiB)": 21.48, "step": 17702, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.5750901471591463, "grad_norm": 0.4936479926109314, "learning_rate": 4.1757582799227796e-06, "loss": 0.02355000749230385, "memory(GiB)": 21.48, "step": 17703, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.5751226326219017, "grad_norm": 0.3238447904586792, "learning_rate": 4.175228480574707e-06, "loss": 0.021386846899986267, "memory(GiB)": 21.48, "step": 17704, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.5751551180846571, "grad_norm": 0.333539217710495, "learning_rate": 4.1746986907455924e-06, "loss": 0.019920714199543, "memory(GiB)": 21.48, "step": 17705, "token_acc": 1.0, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.5751876035474125, "grad_norm": 0.36032137274742126, "learning_rate": 4.1741689104415564e-06, "loss": 0.01986898109316826, "memory(GiB)": 21.48, "step": 17706, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.575220089010168, "grad_norm": 0.39272215962409973, "learning_rate": 4.173639139668708e-06, "loss": 0.019713832065463066, "memory(GiB)": 21.48, "step": 17707, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.5752525744729233, "grad_norm": 0.3483603298664093, "learning_rate": 4.173109378433166e-06, "loss": 0.020671898499131203, "memory(GiB)": 21.48, "step": 17708, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955633 }, { "epoch": 0.5752850599356788, "grad_norm": 0.5204705595970154, "learning_rate": 4.17257962674104e-06, "loss": 0.023582229390740395, "memory(GiB)": 21.48, "step": 17709, "token_acc": 1.0, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.5753175453984342, "grad_norm": 0.31587645411491394, "learning_rate": 4.172049884598448e-06, "loss": 0.01798253506422043, "memory(GiB)": 21.48, "step": 17710, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955645 }, { "epoch": 0.5753500308611896, "grad_norm": 0.5052196979522705, "learning_rate": 4.171520152011502e-06, "loss": 0.026051390916109085, "memory(GiB)": 21.48, "step": 17711, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955652 }, { "epoch": 0.575382516323945, "grad_norm": 0.40294715762138367, "learning_rate": 4.1709904289863165e-06, "loss": 0.02021738514304161, "memory(GiB)": 21.48, "step": 17712, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.5754150017867005, "grad_norm": 0.3732145428657532, "learning_rate": 4.1704607155290035e-06, "loss": 0.019996371120214462, "memory(GiB)": 21.48, "step": 17713, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.955663 }, { "epoch": 0.5754474872494558, "grad_norm": 0.3125431537628174, "learning_rate": 4.169931011645679e-06, "loss": 0.020382724702358246, "memory(GiB)": 21.48, "step": 17714, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.5754799727122113, "grad_norm": 0.29014143347740173, "learning_rate": 4.1694013173424544e-06, "loss": 0.018170583993196487, "memory(GiB)": 21.48, "step": 17715, "token_acc": 1.0, "train_speed(iter/s)": 0.955679 }, { "epoch": 0.5755124581749667, "grad_norm": 0.36031201481819153, "learning_rate": 4.1688716326254445e-06, "loss": 0.02345927618443966, "memory(GiB)": 21.48, "step": 17716, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.5755449436377221, "grad_norm": 0.4533676505088806, "learning_rate": 4.168341957500763e-06, "loss": 0.022038951516151428, "memory(GiB)": 21.48, "step": 17717, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955694 }, { "epoch": 0.5755774291004775, "grad_norm": 0.34177348017692566, "learning_rate": 4.1678122919745215e-06, "loss": 0.018317341804504395, "memory(GiB)": 21.48, "step": 17718, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.575609914563233, "grad_norm": 0.3544635474681854, "learning_rate": 4.167282636052835e-06, "loss": 0.022730719298124313, "memory(GiB)": 21.48, "step": 17719, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.955714 }, { "epoch": 0.5756424000259883, "grad_norm": 0.2714892327785492, "learning_rate": 4.166752989741814e-06, "loss": 0.02109682187438011, "memory(GiB)": 21.48, "step": 17720, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955724 }, { "epoch": 0.5756748854887438, "grad_norm": 0.28190818428993225, "learning_rate": 4.166223353047573e-06, "loss": 0.02050340548157692, "memory(GiB)": 21.48, "step": 17721, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.5757073709514992, "grad_norm": 0.34143373370170593, "learning_rate": 4.165693725976224e-06, "loss": 0.013049481436610222, "memory(GiB)": 21.48, "step": 17722, "token_acc": 1.0, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.5757398564142546, "grad_norm": 0.3095753788948059, "learning_rate": 4.165164108533881e-06, "loss": 0.01632305607199669, "memory(GiB)": 21.48, "step": 17723, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.57577234187701, "grad_norm": 0.3294905126094818, "learning_rate": 4.164634500726653e-06, "loss": 0.015439599752426147, "memory(GiB)": 21.48, "step": 17724, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.5758048273397655, "grad_norm": 0.26022231578826904, "learning_rate": 4.164104902560659e-06, "loss": 0.015153508633375168, "memory(GiB)": 21.48, "step": 17725, "token_acc": 0.99, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.5758373128025208, "grad_norm": 0.19409623742103577, "learning_rate": 4.163575314042001e-06, "loss": 0.01154148019850254, "memory(GiB)": 21.48, "step": 17726, "token_acc": 1.0, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.5758697982652763, "grad_norm": 0.3913823366165161, "learning_rate": 4.163045735176803e-06, "loss": 0.021835800260305405, "memory(GiB)": 21.48, "step": 17727, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.5759022837280318, "grad_norm": 0.3687421381473541, "learning_rate": 4.162516165971167e-06, "loss": 0.020600924268364906, "memory(GiB)": 21.48, "step": 17728, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955796 }, { "epoch": 0.5759347691907871, "grad_norm": 0.339470237493515, "learning_rate": 4.161986606431212e-06, "loss": 0.016237514093518257, "memory(GiB)": 21.48, "step": 17729, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.5759672546535426, "grad_norm": 0.4024738669395447, "learning_rate": 4.161457056563045e-06, "loss": 0.019579803571105003, "memory(GiB)": 21.48, "step": 17730, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.575999740116298, "grad_norm": 0.3382214903831482, "learning_rate": 4.160927516372778e-06, "loss": 0.0222216434776783, "memory(GiB)": 21.48, "step": 17731, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.955817 }, { "epoch": 0.5760322255790534, "grad_norm": 0.39063483476638794, "learning_rate": 4.160397985866528e-06, "loss": 0.022297564893960953, "memory(GiB)": 21.48, "step": 17732, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.95582 }, { "epoch": 0.5760647110418088, "grad_norm": 0.3727354109287262, "learning_rate": 4.1598684650504e-06, "loss": 0.01849820464849472, "memory(GiB)": 21.48, "step": 17733, "token_acc": 1.0, "train_speed(iter/s)": 0.955818 }, { "epoch": 0.5760971965045643, "grad_norm": 0.239115908741951, "learning_rate": 4.159338953930511e-06, "loss": 0.012755636125802994, "memory(GiB)": 21.48, "step": 17734, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.5761296819673196, "grad_norm": 0.5667934417724609, "learning_rate": 4.158809452512967e-06, "loss": 0.020744720473885536, "memory(GiB)": 21.48, "step": 17735, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955824 }, { "epoch": 0.5761621674300751, "grad_norm": 0.2845095098018646, "learning_rate": 4.158279960803883e-06, "loss": 0.01752261072397232, "memory(GiB)": 21.48, "step": 17736, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.5761946528928305, "grad_norm": 0.2698136866092682, "learning_rate": 4.1577504788093675e-06, "loss": 0.01496817171573639, "memory(GiB)": 21.48, "step": 17737, "token_acc": 1.0, "train_speed(iter/s)": 0.955834 }, { "epoch": 0.5762271383555859, "grad_norm": 0.29989922046661377, "learning_rate": 4.157221006535533e-06, "loss": 0.017761580646038055, "memory(GiB)": 21.48, "step": 17738, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955837 }, { "epoch": 0.5762596238183413, "grad_norm": 0.2624722719192505, "learning_rate": 4.15669154398849e-06, "loss": 0.019963916391134262, "memory(GiB)": 21.48, "step": 17739, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.5762921092810968, "grad_norm": 0.39478686451911926, "learning_rate": 4.156162091174349e-06, "loss": 0.021246252581477165, "memory(GiB)": 21.48, "step": 17740, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.5763245947438521, "grad_norm": 0.34180670976638794, "learning_rate": 4.15563264809922e-06, "loss": 0.022312527522444725, "memory(GiB)": 21.48, "step": 17741, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.95585 }, { "epoch": 0.5763570802066076, "grad_norm": 0.30118414759635925, "learning_rate": 4.155103214769215e-06, "loss": 0.015321406535804272, "memory(GiB)": 21.48, "step": 17742, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955855 }, { "epoch": 0.576389565669363, "grad_norm": 0.3756585121154785, "learning_rate": 4.154573791190442e-06, "loss": 0.027068492025136948, "memory(GiB)": 21.48, "step": 17743, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.5764220511321184, "grad_norm": 0.7005519270896912, "learning_rate": 4.154044377369014e-06, "loss": 0.015202764421701431, "memory(GiB)": 21.48, "step": 17744, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.5764545365948738, "grad_norm": 0.3775584101676941, "learning_rate": 4.153514973311039e-06, "loss": 0.01765594445168972, "memory(GiB)": 21.48, "step": 17745, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.5764870220576293, "grad_norm": 0.27015119791030884, "learning_rate": 4.152985579022628e-06, "loss": 0.012112262658774853, "memory(GiB)": 21.48, "step": 17746, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.5765195075203846, "grad_norm": 0.40237104892730713, "learning_rate": 4.1524561945098916e-06, "loss": 0.02070540562272072, "memory(GiB)": 21.48, "step": 17747, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.5765519929831401, "grad_norm": 0.29522421956062317, "learning_rate": 4.151926819778937e-06, "loss": 0.014001602306962013, "memory(GiB)": 21.48, "step": 17748, "token_acc": 1.0, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.5765844784458954, "grad_norm": 0.29368728399276733, "learning_rate": 4.151397454835878e-06, "loss": 0.017730869352817535, "memory(GiB)": 21.48, "step": 17749, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.5766169639086509, "grad_norm": 0.5458894968032837, "learning_rate": 4.150868099686821e-06, "loss": 0.02446989342570305, "memory(GiB)": 21.48, "step": 17750, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955912 }, { "epoch": 0.5766494493714063, "grad_norm": 0.36590108275413513, "learning_rate": 4.150338754337877e-06, "loss": 0.021919220685958862, "memory(GiB)": 21.48, "step": 17751, "token_acc": 1.0, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.5766819348341617, "grad_norm": 0.5801164507865906, "learning_rate": 4.149809418795153e-06, "loss": 0.016711922362446785, "memory(GiB)": 21.48, "step": 17752, "token_acc": 1.0, "train_speed(iter/s)": 0.955931 }, { "epoch": 0.5767144202969171, "grad_norm": 0.18542057275772095, "learning_rate": 4.149280093064762e-06, "loss": 0.010536184534430504, "memory(GiB)": 21.48, "step": 17753, "token_acc": 1.0, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.5767469057596726, "grad_norm": 0.40171316266059875, "learning_rate": 4.148750777152809e-06, "loss": 0.013300798833370209, "memory(GiB)": 21.48, "step": 17754, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955949 }, { "epoch": 0.5767793912224279, "grad_norm": 0.35741907358169556, "learning_rate": 4.148221471065407e-06, "loss": 0.01651928946375847, "memory(GiB)": 21.48, "step": 17755, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.5768118766851834, "grad_norm": 0.3764823079109192, "learning_rate": 4.14769217480866e-06, "loss": 0.01835612766444683, "memory(GiB)": 21.48, "step": 17756, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.5768443621479388, "grad_norm": 0.36907729506492615, "learning_rate": 4.147162888388683e-06, "loss": 0.019960850477218628, "memory(GiB)": 21.48, "step": 17757, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955972 }, { "epoch": 0.5768768476106942, "grad_norm": 0.27932387590408325, "learning_rate": 4.14663361181158e-06, "loss": 0.011207877658307552, "memory(GiB)": 21.48, "step": 17758, "token_acc": 1.0, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.5769093330734496, "grad_norm": 0.30337804555892944, "learning_rate": 4.146104345083461e-06, "loss": 0.018414128571748734, "memory(GiB)": 21.48, "step": 17759, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.5769418185362051, "grad_norm": 0.2897014319896698, "learning_rate": 4.1455750882104335e-06, "loss": 0.015428660437464714, "memory(GiB)": 21.48, "step": 17760, "token_acc": 1.0, "train_speed(iter/s)": 0.955995 }, { "epoch": 0.5769743039989604, "grad_norm": 0.23919044435024261, "learning_rate": 4.145045841198606e-06, "loss": 0.010398836806416512, "memory(GiB)": 21.48, "step": 17761, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.5770067894617159, "grad_norm": 0.3893774151802063, "learning_rate": 4.14451660405409e-06, "loss": 0.023951701819896698, "memory(GiB)": 21.48, "step": 17762, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956008 }, { "epoch": 0.5770392749244713, "grad_norm": 0.3506937325000763, "learning_rate": 4.14398737678299e-06, "loss": 0.01665215939283371, "memory(GiB)": 21.48, "step": 17763, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.5770717603872267, "grad_norm": 0.2363608479499817, "learning_rate": 4.143458159391415e-06, "loss": 0.012599069625139236, "memory(GiB)": 21.48, "step": 17764, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.5771042458499821, "grad_norm": 0.4736770987510681, "learning_rate": 4.142928951885472e-06, "loss": 0.017475081607699394, "memory(GiB)": 21.48, "step": 17765, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.5771367313127376, "grad_norm": 0.2958519160747528, "learning_rate": 4.142399754271271e-06, "loss": 0.01709231175482273, "memory(GiB)": 21.48, "step": 17766, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956033 }, { "epoch": 0.5771692167754929, "grad_norm": 0.4846011698246002, "learning_rate": 4.141870566554918e-06, "loss": 0.024715550243854523, "memory(GiB)": 21.48, "step": 17767, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.5772017022382484, "grad_norm": 0.28174856305122375, "learning_rate": 4.141341388742521e-06, "loss": 0.01816692017018795, "memory(GiB)": 21.48, "step": 17768, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.5772341877010038, "grad_norm": 0.4001566171646118, "learning_rate": 4.1408122208401865e-06, "loss": 0.017899004742503166, "memory(GiB)": 21.48, "step": 17769, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.5772666731637592, "grad_norm": 0.2977926731109619, "learning_rate": 4.140283062854023e-06, "loss": 0.016533609479665756, "memory(GiB)": 21.48, "step": 17770, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.5772991586265146, "grad_norm": 0.41628313064575195, "learning_rate": 4.139753914790137e-06, "loss": 0.01993940956890583, "memory(GiB)": 21.48, "step": 17771, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956075 }, { "epoch": 0.5773316440892701, "grad_norm": 0.3192238211631775, "learning_rate": 4.139224776654636e-06, "loss": 0.01830974593758583, "memory(GiB)": 21.48, "step": 17772, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.956084 }, { "epoch": 0.5773641295520254, "grad_norm": 0.3965754806995392, "learning_rate": 4.138695648453627e-06, "loss": 0.02385413460433483, "memory(GiB)": 21.48, "step": 17773, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956091 }, { "epoch": 0.5773966150147809, "grad_norm": 0.30197858810424805, "learning_rate": 4.138166530193216e-06, "loss": 0.015528829768300056, "memory(GiB)": 21.48, "step": 17774, "token_acc": 1.0, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.5774291004775363, "grad_norm": 0.27000486850738525, "learning_rate": 4.137637421879511e-06, "loss": 0.013910900801420212, "memory(GiB)": 21.48, "step": 17775, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956104 }, { "epoch": 0.5774615859402917, "grad_norm": 0.4461078345775604, "learning_rate": 4.137108323518617e-06, "loss": 0.021034518256783485, "memory(GiB)": 21.48, "step": 17776, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.5774940714030471, "grad_norm": 0.414816677570343, "learning_rate": 4.136579235116642e-06, "loss": 0.017862414941191673, "memory(GiB)": 21.48, "step": 17777, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.5775265568658026, "grad_norm": 4.184261322021484, "learning_rate": 4.1360501566796915e-06, "loss": 0.03093847446143627, "memory(GiB)": 21.48, "step": 17778, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.95613 }, { "epoch": 0.5775590423285579, "grad_norm": 0.3866554796695709, "learning_rate": 4.135521088213873e-06, "loss": 0.022312577813863754, "memory(GiB)": 21.48, "step": 17779, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.5775915277913134, "grad_norm": 0.42920875549316406, "learning_rate": 4.134992029725291e-06, "loss": 0.023433364927768707, "memory(GiB)": 21.48, "step": 17780, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956153 }, { "epoch": 0.5776240132540688, "grad_norm": 0.4483366310596466, "learning_rate": 4.134462981220054e-06, "loss": 0.02063482068479061, "memory(GiB)": 21.48, "step": 17781, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956164 }, { "epoch": 0.5776564987168242, "grad_norm": 0.5187035799026489, "learning_rate": 4.133933942704264e-06, "loss": 0.019225116819143295, "memory(GiB)": 21.48, "step": 17782, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.5776889841795796, "grad_norm": 0.24021370708942413, "learning_rate": 4.1334049141840305e-06, "loss": 0.014094606041908264, "memory(GiB)": 21.48, "step": 17783, "token_acc": 1.0, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.5777214696423351, "grad_norm": 0.4946506917476654, "learning_rate": 4.1328758956654555e-06, "loss": 0.02133089303970337, "memory(GiB)": 21.48, "step": 17784, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956198 }, { "epoch": 0.5777539551050904, "grad_norm": 0.3013932704925537, "learning_rate": 4.13234688715465e-06, "loss": 0.0185299813747406, "memory(GiB)": 21.48, "step": 17785, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956209 }, { "epoch": 0.5777864405678459, "grad_norm": 0.4596160054206848, "learning_rate": 4.131817888657714e-06, "loss": 0.019840482622385025, "memory(GiB)": 21.48, "step": 17786, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956221 }, { "epoch": 0.5778189260306013, "grad_norm": 0.3373243510723114, "learning_rate": 4.1312889001807565e-06, "loss": 0.016419686377048492, "memory(GiB)": 21.48, "step": 17787, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.5778514114933567, "grad_norm": 0.44273239374160767, "learning_rate": 4.13075992172988e-06, "loss": 0.021213598549365997, "memory(GiB)": 21.48, "step": 17788, "token_acc": 0.988, "train_speed(iter/s)": 0.956245 }, { "epoch": 0.5778838969561121, "grad_norm": 0.34173810482025146, "learning_rate": 4.130230953311192e-06, "loss": 0.013278276659548283, "memory(GiB)": 21.48, "step": 17789, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.5779163824188676, "grad_norm": 0.45115306973457336, "learning_rate": 4.129701994930796e-06, "loss": 0.01753547601401806, "memory(GiB)": 21.48, "step": 17790, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956265 }, { "epoch": 0.5779488678816229, "grad_norm": 0.36429527401924133, "learning_rate": 4.129173046594798e-06, "loss": 0.01720828004181385, "memory(GiB)": 21.48, "step": 17791, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956275 }, { "epoch": 0.5779813533443784, "grad_norm": 0.6099551916122437, "learning_rate": 4.1286441083093e-06, "loss": 0.021235087886452675, "memory(GiB)": 21.48, "step": 17792, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956285 }, { "epoch": 0.5780138388071339, "grad_norm": 0.5852341055870056, "learning_rate": 4.1281151800804105e-06, "loss": 0.025976108387112617, "memory(GiB)": 21.48, "step": 17793, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.5780463242698892, "grad_norm": 0.40113234519958496, "learning_rate": 4.1275862619142324e-06, "loss": 0.024852348491549492, "memory(GiB)": 21.48, "step": 17794, "token_acc": 1.0, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.5780788097326447, "grad_norm": 0.4188641607761383, "learning_rate": 4.127057353816869e-06, "loss": 0.01865316741168499, "memory(GiB)": 21.48, "step": 17795, "token_acc": 1.0, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.5781112951954, "grad_norm": 0.4314280152320862, "learning_rate": 4.126528455794427e-06, "loss": 0.017193520441651344, "memory(GiB)": 21.48, "step": 17796, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.5781437806581555, "grad_norm": 0.2430480271577835, "learning_rate": 4.125999567853008e-06, "loss": 0.013004561886191368, "memory(GiB)": 21.48, "step": 17797, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.5781762661209109, "grad_norm": 0.34098920226097107, "learning_rate": 4.1254706899987175e-06, "loss": 0.019392594695091248, "memory(GiB)": 21.48, "step": 17798, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.5782087515836664, "grad_norm": 0.3909202218055725, "learning_rate": 4.124941822237659e-06, "loss": 0.02173035591840744, "memory(GiB)": 21.48, "step": 17799, "token_acc": 1.0, "train_speed(iter/s)": 0.95635 }, { "epoch": 0.5782412370464217, "grad_norm": 0.2685650587081909, "learning_rate": 4.124412964575938e-06, "loss": 0.010415373370051384, "memory(GiB)": 21.48, "step": 17800, "token_acc": 1.0, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.5782737225091772, "grad_norm": 0.3004150092601776, "learning_rate": 4.123884117019655e-06, "loss": 0.016342775896191597, "memory(GiB)": 21.48, "step": 17801, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.5783062079719326, "grad_norm": 0.258186012506485, "learning_rate": 4.123355279574917e-06, "loss": 0.010055817663669586, "memory(GiB)": 21.48, "step": 17802, "token_acc": 1.0, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.578338693434688, "grad_norm": 0.3152242600917816, "learning_rate": 4.1228264522478244e-06, "loss": 0.014980453997850418, "memory(GiB)": 21.48, "step": 17803, "token_acc": 1.0, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.5783711788974434, "grad_norm": 0.29579395055770874, "learning_rate": 4.122297635044484e-06, "loss": 0.019643869251012802, "memory(GiB)": 21.48, "step": 17804, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.5784036643601989, "grad_norm": 0.4434329867362976, "learning_rate": 4.121768827970995e-06, "loss": 0.020683161914348602, "memory(GiB)": 21.48, "step": 17805, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.5784361498229542, "grad_norm": 0.48160338401794434, "learning_rate": 4.121240031033465e-06, "loss": 0.026128139346837997, "memory(GiB)": 21.48, "step": 17806, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956413 }, { "epoch": 0.5784686352857097, "grad_norm": 0.3603796064853668, "learning_rate": 4.120711244237992e-06, "loss": 0.017656929790973663, "memory(GiB)": 21.48, "step": 17807, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.578501120748465, "grad_norm": 0.36902910470962524, "learning_rate": 4.120182467590682e-06, "loss": 0.01732797920703888, "memory(GiB)": 21.48, "step": 17808, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956432 }, { "epoch": 0.5785336062112205, "grad_norm": 0.31986379623413086, "learning_rate": 4.119653701097639e-06, "loss": 0.014809741638600826, "memory(GiB)": 21.48, "step": 17809, "token_acc": 1.0, "train_speed(iter/s)": 0.956443 }, { "epoch": 0.5785660916739759, "grad_norm": 0.3240376114845276, "learning_rate": 4.119124944764962e-06, "loss": 0.014743717387318611, "memory(GiB)": 21.48, "step": 17810, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956455 }, { "epoch": 0.5785985771367314, "grad_norm": 0.20821169018745422, "learning_rate": 4.118596198598759e-06, "loss": 0.012832634150981903, "memory(GiB)": 21.48, "step": 17811, "token_acc": 1.0, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.5786310625994867, "grad_norm": 0.28442251682281494, "learning_rate": 4.118067462605125e-06, "loss": 0.012726083397865295, "memory(GiB)": 21.48, "step": 17812, "token_acc": 1.0, "train_speed(iter/s)": 0.956477 }, { "epoch": 0.5786635480622422, "grad_norm": 0.4019194543361664, "learning_rate": 4.11753873679017e-06, "loss": 0.027284516021609306, "memory(GiB)": 21.48, "step": 17813, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.5786960335249975, "grad_norm": 0.43781328201293945, "learning_rate": 4.11701002115999e-06, "loss": 0.032318033277988434, "memory(GiB)": 21.48, "step": 17814, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.9565 }, { "epoch": 0.578728518987753, "grad_norm": 0.582221269607544, "learning_rate": 4.116481315720691e-06, "loss": 0.01894405484199524, "memory(GiB)": 21.48, "step": 17815, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.95651 }, { "epoch": 0.5787610044505084, "grad_norm": 0.3989321291446686, "learning_rate": 4.115952620478372e-06, "loss": 0.022524012252688408, "memory(GiB)": 21.48, "step": 17816, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.5787934899132638, "grad_norm": 0.4547237157821655, "learning_rate": 4.115423935439137e-06, "loss": 0.0199529267847538, "memory(GiB)": 21.48, "step": 17817, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.5788259753760192, "grad_norm": 0.47973334789276123, "learning_rate": 4.114895260609087e-06, "loss": 0.020555388182401657, "memory(GiB)": 21.48, "step": 17818, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.5788584608387747, "grad_norm": 0.3022423982620239, "learning_rate": 4.114366595994324e-06, "loss": 0.015769898891448975, "memory(GiB)": 21.48, "step": 17819, "token_acc": 1.0, "train_speed(iter/s)": 0.95655 }, { "epoch": 0.57889094630153, "grad_norm": 0.4663471579551697, "learning_rate": 4.11383794160095e-06, "loss": 0.022654995322227478, "memory(GiB)": 21.48, "step": 17820, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.5789234317642855, "grad_norm": 0.43927332758903503, "learning_rate": 4.113309297435065e-06, "loss": 0.021800504997372627, "memory(GiB)": 21.48, "step": 17821, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.5789559172270409, "grad_norm": 0.41663795709609985, "learning_rate": 4.1127806635027705e-06, "loss": 0.01812085509300232, "memory(GiB)": 21.48, "step": 17822, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.5789884026897963, "grad_norm": 0.5803524851799011, "learning_rate": 4.112252039810168e-06, "loss": 0.018678149208426476, "memory(GiB)": 21.48, "step": 17823, "token_acc": 0.984, "train_speed(iter/s)": 0.956584 }, { "epoch": 0.5790208881525517, "grad_norm": 0.38977763056755066, "learning_rate": 4.11172342636336e-06, "loss": 0.018999800086021423, "memory(GiB)": 21.48, "step": 17824, "token_acc": 1.0, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.5790533736153072, "grad_norm": 0.2909921407699585, "learning_rate": 4.111194823168444e-06, "loss": 0.0159149207174778, "memory(GiB)": 21.48, "step": 17825, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956601 }, { "epoch": 0.5790858590780625, "grad_norm": 0.3163514733314514, "learning_rate": 4.110666230231525e-06, "loss": 0.016256824135780334, "memory(GiB)": 21.48, "step": 17826, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.956609 }, { "epoch": 0.579118344540818, "grad_norm": 0.2817569375038147, "learning_rate": 4.110137647558699e-06, "loss": 0.01592564582824707, "memory(GiB)": 21.48, "step": 17827, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.5791508300035734, "grad_norm": 0.29766687750816345, "learning_rate": 4.1096090751560705e-06, "loss": 0.015976261347532272, "memory(GiB)": 21.48, "step": 17828, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.5791833154663288, "grad_norm": 0.2942585349082947, "learning_rate": 4.109080513029737e-06, "loss": 0.021673526614904404, "memory(GiB)": 21.48, "step": 17829, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.5792158009290842, "grad_norm": 0.2885202169418335, "learning_rate": 4.108551961185802e-06, "loss": 0.016239039599895477, "memory(GiB)": 21.48, "step": 17830, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.5792482863918397, "grad_norm": 0.3450653553009033, "learning_rate": 4.108023419630363e-06, "loss": 0.020420197397470474, "memory(GiB)": 21.48, "step": 17831, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.579280771854595, "grad_norm": 0.40760353207588196, "learning_rate": 4.107494888369522e-06, "loss": 0.022795481607317924, "memory(GiB)": 21.48, "step": 17832, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.5793132573173505, "grad_norm": 0.31376510858535767, "learning_rate": 4.1069663674093756e-06, "loss": 0.015543483197689056, "memory(GiB)": 21.48, "step": 17833, "token_acc": 1.0, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.5793457427801059, "grad_norm": 0.5015935301780701, "learning_rate": 4.106437856756029e-06, "loss": 0.017351854592561722, "memory(GiB)": 21.48, "step": 17834, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.5793782282428613, "grad_norm": 0.3511975407600403, "learning_rate": 4.105909356415575e-06, "loss": 0.018093328922986984, "memory(GiB)": 21.48, "step": 17835, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956688 }, { "epoch": 0.5794107137056167, "grad_norm": 0.2774590253829956, "learning_rate": 4.1053808663941206e-06, "loss": 0.021037273108959198, "memory(GiB)": 21.48, "step": 17836, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956695 }, { "epoch": 0.5794431991683722, "grad_norm": 0.31104397773742676, "learning_rate": 4.1048523866977605e-06, "loss": 0.017976928502321243, "memory(GiB)": 21.48, "step": 17837, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956703 }, { "epoch": 0.5794756846311275, "grad_norm": 0.26538902521133423, "learning_rate": 4.1043239173325935e-06, "loss": 0.018726078793406487, "memory(GiB)": 21.48, "step": 17838, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.579508170093883, "grad_norm": 0.3316468894481659, "learning_rate": 4.103795458304723e-06, "loss": 0.02326258271932602, "memory(GiB)": 21.48, "step": 17839, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95672 }, { "epoch": 0.5795406555566384, "grad_norm": 0.3450066149234772, "learning_rate": 4.1032670096202435e-06, "loss": 0.013815289363265038, "memory(GiB)": 21.48, "step": 17840, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956729 }, { "epoch": 0.5795731410193938, "grad_norm": 0.3511641025543213, "learning_rate": 4.1027385712852595e-06, "loss": 0.01559827197343111, "memory(GiB)": 21.48, "step": 17841, "token_acc": 0.996, "train_speed(iter/s)": 0.956738 }, { "epoch": 0.5796056264821492, "grad_norm": 0.2912188172340393, "learning_rate": 4.102210143305865e-06, "loss": 0.01982859894633293, "memory(GiB)": 21.48, "step": 17842, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.5796381119449047, "grad_norm": 0.34084028005599976, "learning_rate": 4.101681725688161e-06, "loss": 0.02041030302643776, "memory(GiB)": 21.48, "step": 17843, "token_acc": 1.0, "train_speed(iter/s)": 0.956761 }, { "epoch": 0.57967059740766, "grad_norm": 0.35104936361312866, "learning_rate": 4.101153318438244e-06, "loss": 0.023503830656409264, "memory(GiB)": 21.48, "step": 17844, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.5797030828704155, "grad_norm": 0.4054100215435028, "learning_rate": 4.100624921562215e-06, "loss": 0.020491985604166985, "memory(GiB)": 21.48, "step": 17845, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956783 }, { "epoch": 0.5797355683331709, "grad_norm": 0.4134499430656433, "learning_rate": 4.100096535066171e-06, "loss": 0.024901024997234344, "memory(GiB)": 21.48, "step": 17846, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.5797680537959263, "grad_norm": 0.4309886693954468, "learning_rate": 4.099568158956212e-06, "loss": 0.023512162268161774, "memory(GiB)": 21.48, "step": 17847, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956802 }, { "epoch": 0.5798005392586817, "grad_norm": 0.33364415168762207, "learning_rate": 4.099039793238433e-06, "loss": 0.014185503125190735, "memory(GiB)": 21.48, "step": 17848, "token_acc": 1.0, "train_speed(iter/s)": 0.956812 }, { "epoch": 0.5798330247214372, "grad_norm": 0.19152957201004028, "learning_rate": 4.0985114379189356e-06, "loss": 0.009637653827667236, "memory(GiB)": 21.48, "step": 17849, "token_acc": 1.0, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.5798655101841925, "grad_norm": 0.36855950951576233, "learning_rate": 4.097983093003815e-06, "loss": 0.024474922567605972, "memory(GiB)": 21.48, "step": 17850, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.579897995646948, "grad_norm": 0.3730427622795105, "learning_rate": 4.097454758499171e-06, "loss": 0.020762909203767776, "memory(GiB)": 21.48, "step": 17851, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.5799304811097034, "grad_norm": 0.2772914171218872, "learning_rate": 4.0969264344110995e-06, "loss": 0.011423199437558651, "memory(GiB)": 21.48, "step": 17852, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.956848 }, { "epoch": 0.5799629665724588, "grad_norm": 0.5177430510520935, "learning_rate": 4.0963981207457e-06, "loss": 0.026084324344992638, "memory(GiB)": 21.48, "step": 17853, "token_acc": 0.9759036144578314, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.5799954520352142, "grad_norm": 0.31181827187538147, "learning_rate": 4.095869817509067e-06, "loss": 0.009700178168714046, "memory(GiB)": 21.48, "step": 17854, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.956866 }, { "epoch": 0.5800279374979697, "grad_norm": 0.26226645708084106, "learning_rate": 4.095341524707301e-06, "loss": 0.014091040939092636, "memory(GiB)": 21.48, "step": 17855, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.5800604229607251, "grad_norm": 0.32410579919815063, "learning_rate": 4.094813242346498e-06, "loss": 0.021709758788347244, "memory(GiB)": 21.48, "step": 17856, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.5800929084234805, "grad_norm": 0.3457126021385193, "learning_rate": 4.0942849704327545e-06, "loss": 0.024019815027713776, "memory(GiB)": 21.48, "step": 17857, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.580125393886236, "grad_norm": 0.2915540337562561, "learning_rate": 4.093756708972168e-06, "loss": 0.025944480672478676, "memory(GiB)": 21.48, "step": 17858, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.956899 }, { "epoch": 0.5801578793489913, "grad_norm": 0.2771093547344208, "learning_rate": 4.0932284579708355e-06, "loss": 0.015611650422215462, "memory(GiB)": 21.48, "step": 17859, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956907 }, { "epoch": 0.5801903648117468, "grad_norm": 0.36241161823272705, "learning_rate": 4.092700217434856e-06, "loss": 0.021579202264547348, "memory(GiB)": 21.48, "step": 17860, "token_acc": 1.0, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.5802228502745022, "grad_norm": 0.44847312569618225, "learning_rate": 4.092171987370319e-06, "loss": 0.02149035781621933, "memory(GiB)": 21.48, "step": 17861, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956925 }, { "epoch": 0.5802553357372576, "grad_norm": 0.3491835296154022, "learning_rate": 4.09164376778333e-06, "loss": 0.01905781403183937, "memory(GiB)": 21.48, "step": 17862, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.580287821200013, "grad_norm": 0.45133575797080994, "learning_rate": 4.0911155586799764e-06, "loss": 0.01928466558456421, "memory(GiB)": 21.48, "step": 17863, "token_acc": 1.0, "train_speed(iter/s)": 0.956943 }, { "epoch": 0.5803203066627685, "grad_norm": 0.3139030337333679, "learning_rate": 4.0905873600663635e-06, "loss": 0.018622055649757385, "memory(GiB)": 21.48, "step": 17864, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.5803527921255238, "grad_norm": 0.28972098231315613, "learning_rate": 4.09005917194858e-06, "loss": 0.014616848900914192, "memory(GiB)": 21.48, "step": 17865, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.5803852775882793, "grad_norm": 0.5436308979988098, "learning_rate": 4.0895309943327255e-06, "loss": 0.02994176372885704, "memory(GiB)": 21.48, "step": 17866, "token_acc": 0.9675925925925926, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.5804177630510347, "grad_norm": 0.31018176674842834, "learning_rate": 4.089002827224895e-06, "loss": 0.014521792531013489, "memory(GiB)": 21.48, "step": 17867, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.956981 }, { "epoch": 0.5804502485137901, "grad_norm": 0.22757293283939362, "learning_rate": 4.088474670631185e-06, "loss": 0.011584384366869926, "memory(GiB)": 21.48, "step": 17868, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.5804827339765455, "grad_norm": 0.2752530872821808, "learning_rate": 4.087946524557689e-06, "loss": 0.013134617358446121, "memory(GiB)": 21.48, "step": 17869, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957003 }, { "epoch": 0.580515219439301, "grad_norm": 0.2877548038959503, "learning_rate": 4.087418389010505e-06, "loss": 0.02030652016401291, "memory(GiB)": 21.48, "step": 17870, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.5805477049020563, "grad_norm": 0.4007224142551422, "learning_rate": 4.0868902639957285e-06, "loss": 0.02083456702530384, "memory(GiB)": 21.48, "step": 17871, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.5805801903648118, "grad_norm": 0.34489697217941284, "learning_rate": 4.086362149519452e-06, "loss": 0.018921922892332077, "memory(GiB)": 21.48, "step": 17872, "token_acc": 1.0, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.5806126758275671, "grad_norm": 0.36700019240379333, "learning_rate": 4.085834045587774e-06, "loss": 0.020716505125164986, "memory(GiB)": 21.48, "step": 17873, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957046 }, { "epoch": 0.5806451612903226, "grad_norm": 0.2837221920490265, "learning_rate": 4.085305952206785e-06, "loss": 0.011072395369410515, "memory(GiB)": 21.48, "step": 17874, "token_acc": 1.0, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.580677646753078, "grad_norm": 0.3704777657985687, "learning_rate": 4.084777869382586e-06, "loss": 0.015189066529273987, "memory(GiB)": 21.48, "step": 17875, "token_acc": 1.0, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.5807101322158335, "grad_norm": 0.3049744963645935, "learning_rate": 4.084249797121267e-06, "loss": 0.015722550451755524, "memory(GiB)": 21.48, "step": 17876, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.5807426176785888, "grad_norm": 0.516472578048706, "learning_rate": 4.083721735428925e-06, "loss": 0.019575128331780434, "memory(GiB)": 21.48, "step": 17877, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.5807751031413443, "grad_norm": 0.3652142286300659, "learning_rate": 4.083193684311653e-06, "loss": 0.01745595782995224, "memory(GiB)": 21.48, "step": 17878, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.5808075886040996, "grad_norm": 0.2725379168987274, "learning_rate": 4.082665643775547e-06, "loss": 0.018594522029161453, "memory(GiB)": 21.48, "step": 17879, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.957112 }, { "epoch": 0.5808400740668551, "grad_norm": 0.3870638310909271, "learning_rate": 4.0821376138266985e-06, "loss": 0.026648467406630516, "memory(GiB)": 21.48, "step": 17880, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95712 }, { "epoch": 0.5808725595296105, "grad_norm": 0.3831632435321808, "learning_rate": 4.081609594471205e-06, "loss": 0.01726454868912697, "memory(GiB)": 21.48, "step": 17881, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957129 }, { "epoch": 0.580905044992366, "grad_norm": 0.31161606311798096, "learning_rate": 4.081081585715159e-06, "loss": 0.013461525551974773, "memory(GiB)": 21.48, "step": 17882, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957138 }, { "epoch": 0.5809375304551213, "grad_norm": 0.4817628264427185, "learning_rate": 4.0805535875646566e-06, "loss": 0.025494739413261414, "memory(GiB)": 21.48, "step": 17883, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.957147 }, { "epoch": 0.5809700159178768, "grad_norm": 0.38900026679039, "learning_rate": 4.080025600025786e-06, "loss": 0.018438253551721573, "memory(GiB)": 21.48, "step": 17884, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.5810025013806321, "grad_norm": 0.32364553213119507, "learning_rate": 4.079497623104645e-06, "loss": 0.015032719820737839, "memory(GiB)": 21.48, "step": 17885, "token_acc": 1.0, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.5810349868433876, "grad_norm": 0.8292691707611084, "learning_rate": 4.0789696568073286e-06, "loss": 0.028549812734127045, "memory(GiB)": 21.48, "step": 17886, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957174 }, { "epoch": 0.581067472306143, "grad_norm": 0.6105666756629944, "learning_rate": 4.078441701139927e-06, "loss": 0.02353394404053688, "memory(GiB)": 21.48, "step": 17887, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.5810999577688984, "grad_norm": 0.4408772885799408, "learning_rate": 4.077913756108536e-06, "loss": 0.01857389509677887, "memory(GiB)": 21.48, "step": 17888, "token_acc": 1.0, "train_speed(iter/s)": 0.957192 }, { "epoch": 0.5811324432316538, "grad_norm": 0.4465709626674652, "learning_rate": 4.077385821719246e-06, "loss": 0.02539854869246483, "memory(GiB)": 21.48, "step": 17889, "token_acc": 1.0, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.5811649286944093, "grad_norm": 0.30092519521713257, "learning_rate": 4.076857897978153e-06, "loss": 0.015752380713820457, "memory(GiB)": 21.48, "step": 17890, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.5811974141571646, "grad_norm": 0.28396832942962646, "learning_rate": 4.076329984891346e-06, "loss": 0.015577958896756172, "memory(GiB)": 21.48, "step": 17891, "token_acc": 1.0, "train_speed(iter/s)": 0.957216 }, { "epoch": 0.5812298996199201, "grad_norm": 0.3062639832496643, "learning_rate": 4.075802082464924e-06, "loss": 0.015250629745423794, "memory(GiB)": 21.48, "step": 17892, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.5812623850826755, "grad_norm": 0.4102991819381714, "learning_rate": 4.075274190704974e-06, "loss": 0.020771615207195282, "memory(GiB)": 21.48, "step": 17893, "token_acc": 1.0, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.5812948705454309, "grad_norm": 0.5726698637008667, "learning_rate": 4.074746309617591e-06, "loss": 0.02128264307975769, "memory(GiB)": 21.48, "step": 17894, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.5813273560081863, "grad_norm": 0.34842121601104736, "learning_rate": 4.074218439208867e-06, "loss": 0.016196835786104202, "memory(GiB)": 21.48, "step": 17895, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957251 }, { "epoch": 0.5813598414709418, "grad_norm": 0.36712929606437683, "learning_rate": 4.0736905794848945e-06, "loss": 0.01920958235859871, "memory(GiB)": 21.48, "step": 17896, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.5813923269336971, "grad_norm": 0.3654061257839203, "learning_rate": 4.073162730451765e-06, "loss": 0.019996415823698044, "memory(GiB)": 21.48, "step": 17897, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957267 }, { "epoch": 0.5814248123964526, "grad_norm": 0.3818773925304413, "learning_rate": 4.072634892115572e-06, "loss": 0.013042725622653961, "memory(GiB)": 21.48, "step": 17898, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.581457297859208, "grad_norm": 0.3376466631889343, "learning_rate": 4.072107064482406e-06, "loss": 0.01758413389325142, "memory(GiB)": 21.48, "step": 17899, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957283 }, { "epoch": 0.5814897833219634, "grad_norm": 0.4926754832267761, "learning_rate": 4.071579247558359e-06, "loss": 0.018267765641212463, "memory(GiB)": 21.48, "step": 17900, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.5815222687847188, "grad_norm": 0.48716187477111816, "learning_rate": 4.071051441349525e-06, "loss": 0.018316073343157768, "memory(GiB)": 21.48, "step": 17901, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.9573 }, { "epoch": 0.5815547542474743, "grad_norm": 0.30108723044395447, "learning_rate": 4.070523645861993e-06, "loss": 0.019284896552562714, "memory(GiB)": 21.48, "step": 17902, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.5815872397102296, "grad_norm": 0.2880621552467346, "learning_rate": 4.069995861101855e-06, "loss": 0.016984717920422554, "memory(GiB)": 21.48, "step": 17903, "token_acc": 1.0, "train_speed(iter/s)": 0.957318 }, { "epoch": 0.5816197251729851, "grad_norm": 0.33296072483062744, "learning_rate": 4.069468087075203e-06, "loss": 0.020286835730075836, "memory(GiB)": 21.48, "step": 17904, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957329 }, { "epoch": 0.5816522106357405, "grad_norm": 0.33615297079086304, "learning_rate": 4.068940323788128e-06, "loss": 0.014822516590356827, "memory(GiB)": 21.48, "step": 17905, "token_acc": 0.9963898916967509, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.5816846960984959, "grad_norm": 0.3144247531890869, "learning_rate": 4.06841257124672e-06, "loss": 0.01639278419315815, "memory(GiB)": 21.48, "step": 17906, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.5817171815612513, "grad_norm": 0.3081929385662079, "learning_rate": 4.067884829457071e-06, "loss": 0.015428893268108368, "memory(GiB)": 21.48, "step": 17907, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.5817496670240068, "grad_norm": 0.3461502194404602, "learning_rate": 4.067357098425272e-06, "loss": 0.018961025401949883, "memory(GiB)": 21.48, "step": 17908, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.5817821524867621, "grad_norm": 0.380496084690094, "learning_rate": 4.0668293781574145e-06, "loss": 0.021005187183618546, "memory(GiB)": 21.48, "step": 17909, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.5818146379495176, "grad_norm": 0.36604416370391846, "learning_rate": 4.066301668659586e-06, "loss": 0.019173823297023773, "memory(GiB)": 21.48, "step": 17910, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957382 }, { "epoch": 0.581847123412273, "grad_norm": 0.35028091073036194, "learning_rate": 4.0657739699378815e-06, "loss": 0.01643221825361252, "memory(GiB)": 21.48, "step": 17911, "token_acc": 1.0, "train_speed(iter/s)": 0.957391 }, { "epoch": 0.5818796088750284, "grad_norm": 0.38409775495529175, "learning_rate": 4.065246281998386e-06, "loss": 0.015281571075320244, "memory(GiB)": 21.48, "step": 17912, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957399 }, { "epoch": 0.5819120943377838, "grad_norm": 0.3679667115211487, "learning_rate": 4.064718604847195e-06, "loss": 0.017078004777431488, "memory(GiB)": 21.48, "step": 17913, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.5819445798005393, "grad_norm": 0.43013912439346313, "learning_rate": 4.064190938490395e-06, "loss": 0.027215585112571716, "memory(GiB)": 21.48, "step": 17914, "token_acc": 0.984375, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.5819770652632946, "grad_norm": 1.082668423652649, "learning_rate": 4.063663282934076e-06, "loss": 0.018109802156686783, "memory(GiB)": 21.48, "step": 17915, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.5820095507260501, "grad_norm": 0.23503287136554718, "learning_rate": 4.0631356381843305e-06, "loss": 0.01347165834158659, "memory(GiB)": 21.48, "step": 17916, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957434 }, { "epoch": 0.5820420361888055, "grad_norm": 0.34434974193573, "learning_rate": 4.062608004247246e-06, "loss": 0.021392783150076866, "memory(GiB)": 21.48, "step": 17917, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.5820745216515609, "grad_norm": 0.3115222156047821, "learning_rate": 4.0620803811289145e-06, "loss": 0.01558731123805046, "memory(GiB)": 21.48, "step": 17918, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957451 }, { "epoch": 0.5821070071143163, "grad_norm": 0.42690783739089966, "learning_rate": 4.061552768835421e-06, "loss": 0.017821649089455605, "memory(GiB)": 21.48, "step": 17919, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.5821394925770718, "grad_norm": 0.30962398648262024, "learning_rate": 4.061025167372861e-06, "loss": 0.015243305824697018, "memory(GiB)": 21.48, "step": 17920, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957469 }, { "epoch": 0.5821719780398272, "grad_norm": 0.6817634701728821, "learning_rate": 4.060497576747317e-06, "loss": 0.028185304254293442, "memory(GiB)": 21.48, "step": 17921, "token_acc": 0.985, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.5822044635025826, "grad_norm": 0.4708654284477234, "learning_rate": 4.059969996964883e-06, "loss": 0.025096211582422256, "memory(GiB)": 21.48, "step": 17922, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.5822369489653381, "grad_norm": 0.3953198194503784, "learning_rate": 4.059442428031647e-06, "loss": 0.019087649881839752, "memory(GiB)": 21.48, "step": 17923, "token_acc": 0.9702127659574468, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.5822694344280934, "grad_norm": 0.35935351252555847, "learning_rate": 4.058914869953696e-06, "loss": 0.019390849396586418, "memory(GiB)": 21.48, "step": 17924, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.5823019198908489, "grad_norm": 0.33224037289619446, "learning_rate": 4.058387322737121e-06, "loss": 0.02301361784338951, "memory(GiB)": 21.48, "step": 17925, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.5823344053536043, "grad_norm": 0.35059502720832825, "learning_rate": 4.057859786388009e-06, "loss": 0.020106010138988495, "memory(GiB)": 21.48, "step": 17926, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957525 }, { "epoch": 0.5823668908163597, "grad_norm": 0.42755791544914246, "learning_rate": 4.057332260912449e-06, "loss": 0.01892486959695816, "memory(GiB)": 21.48, "step": 17927, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.5823993762791151, "grad_norm": 0.33225780725479126, "learning_rate": 4.056804746316529e-06, "loss": 0.013595061376690865, "memory(GiB)": 21.48, "step": 17928, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.5824318617418706, "grad_norm": 0.35313257575035095, "learning_rate": 4.056277242606338e-06, "loss": 0.021057605743408203, "memory(GiB)": 21.48, "step": 17929, "token_acc": 1.0, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.5824643472046259, "grad_norm": 0.41674748063087463, "learning_rate": 4.055749749787965e-06, "loss": 0.02785816602408886, "memory(GiB)": 21.48, "step": 17930, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957569 }, { "epoch": 0.5824968326673814, "grad_norm": 0.29790011048316956, "learning_rate": 4.055222267867495e-06, "loss": 0.018972471356391907, "memory(GiB)": 21.48, "step": 17931, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.5825293181301368, "grad_norm": 0.25747358798980713, "learning_rate": 4.054694796851018e-06, "loss": 0.013721725903451443, "memory(GiB)": 21.48, "step": 17932, "token_acc": 0.9845360824742269, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.5825618035928922, "grad_norm": 0.2772577702999115, "learning_rate": 4.054167336744621e-06, "loss": 0.013975435867905617, "memory(GiB)": 21.48, "step": 17933, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.5825942890556476, "grad_norm": 0.3266869783401489, "learning_rate": 4.0536398875543915e-06, "loss": 0.018912330269813538, "memory(GiB)": 21.48, "step": 17934, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957614 }, { "epoch": 0.5826267745184031, "grad_norm": 0.36856210231781006, "learning_rate": 4.053112449286419e-06, "loss": 0.022848740220069885, "memory(GiB)": 21.48, "step": 17935, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.5826592599811584, "grad_norm": 0.34759601950645447, "learning_rate": 4.052585021946787e-06, "loss": 0.015155586414039135, "memory(GiB)": 21.48, "step": 17936, "token_acc": 1.0, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.5826917454439139, "grad_norm": 0.36420413851737976, "learning_rate": 4.052057605541587e-06, "loss": 0.016825824975967407, "memory(GiB)": 21.48, "step": 17937, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.5827242309066692, "grad_norm": 0.5246602892875671, "learning_rate": 4.051530200076902e-06, "loss": 0.02951822802424431, "memory(GiB)": 21.48, "step": 17938, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.5827567163694247, "grad_norm": 0.3918364346027374, "learning_rate": 4.051002805558822e-06, "loss": 0.011969422921538353, "memory(GiB)": 21.48, "step": 17939, "token_acc": 1.0, "train_speed(iter/s)": 0.957671 }, { "epoch": 0.5827892018321801, "grad_norm": 0.41024598479270935, "learning_rate": 4.0504754219934325e-06, "loss": 0.021753495559096336, "memory(GiB)": 21.48, "step": 17940, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.5828216872949356, "grad_norm": 0.31966543197631836, "learning_rate": 4.049948049386821e-06, "loss": 0.01722702942788601, "memory(GiB)": 21.48, "step": 17941, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.5828541727576909, "grad_norm": 0.23593270778656006, "learning_rate": 4.049420687745072e-06, "loss": 0.015356375835835934, "memory(GiB)": 21.48, "step": 17942, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.5828866582204464, "grad_norm": 0.2482208013534546, "learning_rate": 4.0488933370742765e-06, "loss": 0.0138302743434906, "memory(GiB)": 21.48, "step": 17943, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.5829191436832017, "grad_norm": 0.3800484240055084, "learning_rate": 4.048365997380514e-06, "loss": 0.02175934985280037, "memory(GiB)": 21.48, "step": 17944, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.5829516291459572, "grad_norm": 0.41841036081314087, "learning_rate": 4.047838668669878e-06, "loss": 0.01506401039659977, "memory(GiB)": 21.48, "step": 17945, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957726 }, { "epoch": 0.5829841146087126, "grad_norm": 0.3324195146560669, "learning_rate": 4.047311350948448e-06, "loss": 0.01934068277478218, "memory(GiB)": 21.48, "step": 17946, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.583016600071468, "grad_norm": 0.36676082015037537, "learning_rate": 4.046784044222313e-06, "loss": 0.019315659999847412, "memory(GiB)": 21.48, "step": 17947, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.5830490855342234, "grad_norm": 0.28290337324142456, "learning_rate": 4.046256748497562e-06, "loss": 0.013975166715681553, "memory(GiB)": 21.48, "step": 17948, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.5830815709969789, "grad_norm": 0.43437981605529785, "learning_rate": 4.045729463780275e-06, "loss": 0.01777302287518978, "memory(GiB)": 21.48, "step": 17949, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.5831140564597342, "grad_norm": 0.3971967399120331, "learning_rate": 4.045202190076542e-06, "loss": 0.015071242116391659, "memory(GiB)": 21.48, "step": 17950, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.5831465419224897, "grad_norm": 0.3016667366027832, "learning_rate": 4.044674927392445e-06, "loss": 0.016059882938861847, "memory(GiB)": 21.48, "step": 17951, "token_acc": 1.0, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.5831790273852451, "grad_norm": 0.381027489900589, "learning_rate": 4.044147675734072e-06, "loss": 0.013855689205229282, "memory(GiB)": 21.48, "step": 17952, "token_acc": 1.0, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.5832115128480005, "grad_norm": 0.4197940528392792, "learning_rate": 4.043620435107506e-06, "loss": 0.02038523182272911, "memory(GiB)": 21.48, "step": 17953, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957794 }, { "epoch": 0.5832439983107559, "grad_norm": 0.36051279306411743, "learning_rate": 4.043093205518834e-06, "loss": 0.02007041499018669, "memory(GiB)": 21.48, "step": 17954, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.5832764837735114, "grad_norm": 0.32930788397789, "learning_rate": 4.042565986974139e-06, "loss": 0.017315387725830078, "memory(GiB)": 21.48, "step": 17955, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95781 }, { "epoch": 0.5833089692362667, "grad_norm": 0.27041178941726685, "learning_rate": 4.042038779479508e-06, "loss": 0.013978185132145882, "memory(GiB)": 21.48, "step": 17956, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.5833414546990222, "grad_norm": 0.40947407484054565, "learning_rate": 4.041511583041023e-06, "loss": 0.02260240912437439, "memory(GiB)": 21.48, "step": 17957, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957828 }, { "epoch": 0.5833739401617776, "grad_norm": 0.38443902134895325, "learning_rate": 4.040984397664773e-06, "loss": 0.019036155194044113, "memory(GiB)": 21.48, "step": 17958, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.583406425624533, "grad_norm": 3.0638391971588135, "learning_rate": 4.040457223356836e-06, "loss": 0.018648719415068626, "memory(GiB)": 21.48, "step": 17959, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.5834389110872884, "grad_norm": 0.33847784996032715, "learning_rate": 4.039930060123301e-06, "loss": 0.018414968624711037, "memory(GiB)": 21.48, "step": 17960, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.5834713965500439, "grad_norm": 0.40767380595207214, "learning_rate": 4.039402907970251e-06, "loss": 0.02005091682076454, "memory(GiB)": 21.48, "step": 17961, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.5835038820127992, "grad_norm": 0.23092542588710785, "learning_rate": 4.0388757669037706e-06, "loss": 0.018443387001752853, "memory(GiB)": 21.48, "step": 17962, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.5835363674755547, "grad_norm": 0.28534460067749023, "learning_rate": 4.038348636929943e-06, "loss": 0.01683133840560913, "memory(GiB)": 21.48, "step": 17963, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.5835688529383101, "grad_norm": 0.32680466771125793, "learning_rate": 4.037821518054851e-06, "loss": 0.019851256161928177, "memory(GiB)": 21.48, "step": 17964, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.5836013384010655, "grad_norm": 0.34522533416748047, "learning_rate": 4.037294410284581e-06, "loss": 0.022950246930122375, "memory(GiB)": 21.48, "step": 17965, "token_acc": 1.0, "train_speed(iter/s)": 0.957894 }, { "epoch": 0.5836338238638209, "grad_norm": 0.23084309697151184, "learning_rate": 4.036767313625214e-06, "loss": 0.010622777976095676, "memory(GiB)": 21.48, "step": 17966, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.5836663093265764, "grad_norm": 0.2697427272796631, "learning_rate": 4.036240228082834e-06, "loss": 0.011883232742547989, "memory(GiB)": 21.48, "step": 17967, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957913 }, { "epoch": 0.5836987947893317, "grad_norm": 0.3312506675720215, "learning_rate": 4.035713153663525e-06, "loss": 0.013876611366868019, "memory(GiB)": 21.48, "step": 17968, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.5837312802520872, "grad_norm": 0.29749196767807007, "learning_rate": 4.0351860903733705e-06, "loss": 0.012430100701749325, "memory(GiB)": 21.48, "step": 17969, "token_acc": 0.9963768115942029, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.5837637657148426, "grad_norm": 0.40929940342903137, "learning_rate": 4.03465903821845e-06, "loss": 0.02002708427608013, "memory(GiB)": 21.48, "step": 17970, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.583796251177598, "grad_norm": 0.4144401550292969, "learning_rate": 4.034131997204853e-06, "loss": 0.02060737833380699, "memory(GiB)": 21.48, "step": 17971, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.95795 }, { "epoch": 0.5838287366403534, "grad_norm": 0.2308010309934616, "learning_rate": 4.0336049673386545e-06, "loss": 0.011447741650044918, "memory(GiB)": 21.48, "step": 17972, "token_acc": 1.0, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.5838612221031089, "grad_norm": 0.21646346151828766, "learning_rate": 4.033077948625944e-06, "loss": 0.01071571558713913, "memory(GiB)": 21.48, "step": 17973, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.5838937075658642, "grad_norm": 0.2722100615501404, "learning_rate": 4.032550941072799e-06, "loss": 0.01553051732480526, "memory(GiB)": 21.48, "step": 17974, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.5839261930286197, "grad_norm": 0.331848680973053, "learning_rate": 4.032023944685305e-06, "loss": 0.013989974744617939, "memory(GiB)": 21.48, "step": 17975, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.5839586784913751, "grad_norm": 0.2278028279542923, "learning_rate": 4.031496959469542e-06, "loss": 0.011637182906270027, "memory(GiB)": 21.48, "step": 17976, "token_acc": 1.0, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.5839911639541305, "grad_norm": 0.36864086985588074, "learning_rate": 4.030969985431591e-06, "loss": 0.02192213013768196, "memory(GiB)": 21.48, "step": 17977, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.958002 }, { "epoch": 0.5840236494168859, "grad_norm": 0.4468734860420227, "learning_rate": 4.030443022577541e-06, "loss": 0.030294742435216904, "memory(GiB)": 21.48, "step": 17978, "token_acc": 0.9773584905660377, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.5840561348796414, "grad_norm": 0.3111996054649353, "learning_rate": 4.029916070913466e-06, "loss": 0.014212053269147873, "memory(GiB)": 21.48, "step": 17979, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958018 }, { "epoch": 0.5840886203423967, "grad_norm": 0.22616393864154816, "learning_rate": 4.029389130445452e-06, "loss": 0.009304221719503403, "memory(GiB)": 21.48, "step": 17980, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.5841211058051522, "grad_norm": 0.3461655080318451, "learning_rate": 4.028862201179578e-06, "loss": 0.015446127392351627, "memory(GiB)": 21.48, "step": 17981, "token_acc": 1.0, "train_speed(iter/s)": 0.958036 }, { "epoch": 0.5841535912679076, "grad_norm": 0.29669228196144104, "learning_rate": 4.028335283121928e-06, "loss": 0.01842082478106022, "memory(GiB)": 21.48, "step": 17982, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.584186076730663, "grad_norm": 0.3924560546875, "learning_rate": 4.027808376278581e-06, "loss": 0.01789937913417816, "memory(GiB)": 21.48, "step": 17983, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958054 }, { "epoch": 0.5842185621934185, "grad_norm": 0.2740674912929535, "learning_rate": 4.0272814806556206e-06, "loss": 0.017555266618728638, "memory(GiB)": 21.48, "step": 17984, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.5842510476561739, "grad_norm": 0.3854826092720032, "learning_rate": 4.026754596259125e-06, "loss": 0.019228648394346237, "memory(GiB)": 21.48, "step": 17985, "token_acc": 1.0, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.5842835331189293, "grad_norm": 0.7304428815841675, "learning_rate": 4.026227723095178e-06, "loss": 0.020487740635871887, "memory(GiB)": 21.48, "step": 17986, "token_acc": 1.0, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.5843160185816847, "grad_norm": 0.5229054093360901, "learning_rate": 4.025700861169859e-06, "loss": 0.022859003394842148, "memory(GiB)": 21.48, "step": 17987, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958096 }, { "epoch": 0.5843485040444402, "grad_norm": 0.5611777305603027, "learning_rate": 4.025174010489249e-06, "loss": 0.02114267274737358, "memory(GiB)": 21.48, "step": 17988, "token_acc": 0.99, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.5843809895071955, "grad_norm": 0.40358811616897583, "learning_rate": 4.0246471710594275e-06, "loss": 0.020861081779003143, "memory(GiB)": 21.48, "step": 17989, "token_acc": 0.9896551724137931, "train_speed(iter/s)": 0.958118 }, { "epoch": 0.584413474969951, "grad_norm": 0.4173888862133026, "learning_rate": 4.024120342886477e-06, "loss": 0.020275242626667023, "memory(GiB)": 21.48, "step": 17990, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958129 }, { "epoch": 0.5844459604327064, "grad_norm": 0.5811846852302551, "learning_rate": 4.023593525976476e-06, "loss": 0.02764095924794674, "memory(GiB)": 21.48, "step": 17991, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.5844784458954618, "grad_norm": 0.33352920413017273, "learning_rate": 4.023066720335506e-06, "loss": 0.014313096180558205, "memory(GiB)": 21.48, "step": 17992, "token_acc": 1.0, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.5845109313582172, "grad_norm": 0.3515847325325012, "learning_rate": 4.022539925969644e-06, "loss": 0.017044270411133766, "memory(GiB)": 21.48, "step": 17993, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.5845434168209727, "grad_norm": 0.3857652544975281, "learning_rate": 4.022013142884974e-06, "loss": 0.01628216728568077, "memory(GiB)": 21.48, "step": 17994, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.584575902283728, "grad_norm": 0.46057918667793274, "learning_rate": 4.021486371087575e-06, "loss": 0.021737568080425262, "memory(GiB)": 21.48, "step": 17995, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.5846083877464835, "grad_norm": 0.4540485441684723, "learning_rate": 4.0209596105835235e-06, "loss": 0.024981677532196045, "memory(GiB)": 21.48, "step": 17996, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.5846408732092389, "grad_norm": 0.3441314697265625, "learning_rate": 4.020432861378904e-06, "loss": 0.022008424624800682, "memory(GiB)": 21.48, "step": 17997, "token_acc": 0.995, "train_speed(iter/s)": 0.958206 }, { "epoch": 0.5846733586719943, "grad_norm": 0.3699261546134949, "learning_rate": 4.01990612347979e-06, "loss": 0.015354765579104424, "memory(GiB)": 21.48, "step": 17998, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.958217 }, { "epoch": 0.5847058441347497, "grad_norm": 0.6662094593048096, "learning_rate": 4.0193793968922664e-06, "loss": 0.01232939213514328, "memory(GiB)": 21.48, "step": 17999, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.5847383295975052, "grad_norm": 0.3269939720630646, "learning_rate": 4.018852681622407e-06, "loss": 0.015183750540018082, "memory(GiB)": 21.48, "step": 18000, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.5847383295975052, "eval_loss": 0.018399925902485847, "eval_runtime": 80.0136, "eval_samples_per_second": 124.354, "eval_steps_per_second": 3.887, "eval_token_acc": 0.992624206947798, "step": 18000 }, { "epoch": 0.5847708150602605, "grad_norm": 0.4299006760120392, "learning_rate": 4.018325977676297e-06, "loss": 0.021656541153788567, "memory(GiB)": 21.48, "step": 18001, "token_acc": 0.9925371090540615, "train_speed(iter/s)": 0.953617 }, { "epoch": 0.584803300523016, "grad_norm": 0.33104318380355835, "learning_rate": 4.0177992850600095e-06, "loss": 0.020625650882720947, "memory(GiB)": 21.48, "step": 18002, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.953625 }, { "epoch": 0.5848357859857714, "grad_norm": 0.37297552824020386, "learning_rate": 4.017272603779626e-06, "loss": 0.019572073593735695, "memory(GiB)": 21.48, "step": 18003, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.953631 }, { "epoch": 0.5848682714485268, "grad_norm": 0.4497710168361664, "learning_rate": 4.016745933841225e-06, "loss": 0.020006023347377777, "memory(GiB)": 21.48, "step": 18004, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.953638 }, { "epoch": 0.5849007569112822, "grad_norm": 0.4241560995578766, "learning_rate": 4.016219275250885e-06, "loss": 0.018018197268247604, "memory(GiB)": 21.48, "step": 18005, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953646 }, { "epoch": 0.5849332423740377, "grad_norm": 0.3099880516529083, "learning_rate": 4.015692628014683e-06, "loss": 0.020114067941904068, "memory(GiB)": 21.48, "step": 18006, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.953654 }, { "epoch": 0.584965727836793, "grad_norm": 0.33316153287887573, "learning_rate": 4.015165992138699e-06, "loss": 0.01623111218214035, "memory(GiB)": 21.48, "step": 18007, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.953661 }, { "epoch": 0.5849982132995485, "grad_norm": 0.45444613695144653, "learning_rate": 4.014639367629009e-06, "loss": 0.0162123441696167, "memory(GiB)": 21.48, "step": 18008, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.953668 }, { "epoch": 0.5850306987623038, "grad_norm": 0.32039156556129456, "learning_rate": 4.014112754491692e-06, "loss": 0.016211124137043953, "memory(GiB)": 21.48, "step": 18009, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.953677 }, { "epoch": 0.5850631842250593, "grad_norm": 0.6646155714988708, "learning_rate": 4.013586152732828e-06, "loss": 0.03279522806406021, "memory(GiB)": 21.48, "step": 18010, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.953686 }, { "epoch": 0.5850956696878147, "grad_norm": 0.36423957347869873, "learning_rate": 4.013059562358491e-06, "loss": 0.02229149639606476, "memory(GiB)": 21.48, "step": 18011, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.953695 }, { "epoch": 0.5851281551505702, "grad_norm": 0.2911926209926605, "learning_rate": 4.012532983374762e-06, "loss": 0.01294107735157013, "memory(GiB)": 21.48, "step": 18012, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.953703 }, { "epoch": 0.5851606406133255, "grad_norm": 0.17637835443019867, "learning_rate": 4.012006415787715e-06, "loss": 0.009817807003855705, "memory(GiB)": 21.48, "step": 18013, "token_acc": 1.0, "train_speed(iter/s)": 0.953712 }, { "epoch": 0.585193126076081, "grad_norm": 0.44986265897750854, "learning_rate": 4.01147985960343e-06, "loss": 0.019382618367671967, "memory(GiB)": 21.48, "step": 18014, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.953721 }, { "epoch": 0.5852256115388363, "grad_norm": 0.27110254764556885, "learning_rate": 4.010953314827982e-06, "loss": 0.01468352135270834, "memory(GiB)": 21.48, "step": 18015, "token_acc": 1.0, "train_speed(iter/s)": 0.95373 }, { "epoch": 0.5852580970015918, "grad_norm": 0.386049747467041, "learning_rate": 4.010426781467451e-06, "loss": 0.016632068902254105, "memory(GiB)": 21.48, "step": 18016, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.953739 }, { "epoch": 0.5852905824643472, "grad_norm": 0.298522412776947, "learning_rate": 4.0099002595279104e-06, "loss": 0.015669573098421097, "memory(GiB)": 21.48, "step": 18017, "token_acc": 1.0, "train_speed(iter/s)": 0.953748 }, { "epoch": 0.5853230679271026, "grad_norm": 0.5287238955497742, "learning_rate": 4.009373749015439e-06, "loss": 0.024802232161164284, "memory(GiB)": 21.48, "step": 18018, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.95376 }, { "epoch": 0.585355553389858, "grad_norm": 0.4240511357784271, "learning_rate": 4.008847249936113e-06, "loss": 0.024993369355797768, "memory(GiB)": 21.48, "step": 18019, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953771 }, { "epoch": 0.5853880388526135, "grad_norm": 0.28299540281295776, "learning_rate": 4.008320762296012e-06, "loss": 0.010894373059272766, "memory(GiB)": 21.48, "step": 18020, "token_acc": 1.0, "train_speed(iter/s)": 0.953782 }, { "epoch": 0.5854205243153688, "grad_norm": 0.2982803285121918, "learning_rate": 4.007794286101204e-06, "loss": 0.01604478806257248, "memory(GiB)": 21.48, "step": 18021, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.953793 }, { "epoch": 0.5854530097781243, "grad_norm": 0.35514143109321594, "learning_rate": 4.007267821357775e-06, "loss": 0.020458795130252838, "memory(GiB)": 21.48, "step": 18022, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.953804 }, { "epoch": 0.5854854952408797, "grad_norm": 0.4270724356174469, "learning_rate": 4.006741368071793e-06, "loss": 0.02529304474592209, "memory(GiB)": 21.48, "step": 18023, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.953816 }, { "epoch": 0.5855179807036351, "grad_norm": 0.4794609546661377, "learning_rate": 4.0062149262493386e-06, "loss": 0.018785767257213593, "memory(GiB)": 21.48, "step": 18024, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.953827 }, { "epoch": 0.5855504661663905, "grad_norm": 0.5089452266693115, "learning_rate": 4.005688495896489e-06, "loss": 0.02547057345509529, "memory(GiB)": 21.48, "step": 18025, "token_acc": 0.99609375, "train_speed(iter/s)": 0.953839 }, { "epoch": 0.585582951629146, "grad_norm": 0.4440440833568573, "learning_rate": 4.005162077019315e-06, "loss": 0.02248632162809372, "memory(GiB)": 21.48, "step": 18026, "token_acc": 1.0, "train_speed(iter/s)": 0.953849 }, { "epoch": 0.5856154370919013, "grad_norm": 0.3318813145160675, "learning_rate": 4.004635669623898e-06, "loss": 0.01771015301346779, "memory(GiB)": 21.48, "step": 18027, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.953861 }, { "epoch": 0.5856479225546568, "grad_norm": 0.2977485954761505, "learning_rate": 4.004109273716306e-06, "loss": 0.021066611632704735, "memory(GiB)": 21.48, "step": 18028, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.953872 }, { "epoch": 0.5856804080174122, "grad_norm": 0.3158288300037384, "learning_rate": 4.003582889302622e-06, "loss": 0.011839560233056545, "memory(GiB)": 21.48, "step": 18029, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.953884 }, { "epoch": 0.5857128934801676, "grad_norm": 0.3494381308555603, "learning_rate": 4.003056516388916e-06, "loss": 0.020943760871887207, "memory(GiB)": 21.48, "step": 18030, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.953895 }, { "epoch": 0.585745378942923, "grad_norm": 0.2539989650249481, "learning_rate": 4.002530154981266e-06, "loss": 0.017252299934625626, "memory(GiB)": 21.48, "step": 18031, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.953906 }, { "epoch": 0.5857778644056785, "grad_norm": 0.5183565020561218, "learning_rate": 4.0020038050857436e-06, "loss": 0.016584038734436035, "memory(GiB)": 21.48, "step": 18032, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.953917 }, { "epoch": 0.5858103498684338, "grad_norm": 0.36576613783836365, "learning_rate": 4.001477466708427e-06, "loss": 0.02130875363945961, "memory(GiB)": 21.48, "step": 18033, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.953928 }, { "epoch": 0.5858428353311893, "grad_norm": 0.30599984526634216, "learning_rate": 4.000951139855388e-06, "loss": 0.014098014682531357, "memory(GiB)": 21.48, "step": 18034, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.953939 }, { "epoch": 0.5858753207939447, "grad_norm": 0.2634853720664978, "learning_rate": 4.000424824532705e-06, "loss": 0.015313589945435524, "memory(GiB)": 21.48, "step": 18035, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95395 }, { "epoch": 0.5859078062567001, "grad_norm": 0.3110628128051758, "learning_rate": 3.999898520746448e-06, "loss": 0.01656481623649597, "memory(GiB)": 21.48, "step": 18036, "token_acc": 1.0, "train_speed(iter/s)": 0.953961 }, { "epoch": 0.5859402917194555, "grad_norm": 0.3664599061012268, "learning_rate": 3.999372228502694e-06, "loss": 0.024529164656996727, "memory(GiB)": 21.48, "step": 18037, "token_acc": 0.979253112033195, "train_speed(iter/s)": 0.953972 }, { "epoch": 0.585972777182211, "grad_norm": 0.28719666600227356, "learning_rate": 3.998845947807516e-06, "loss": 0.01921514980494976, "memory(GiB)": 21.48, "step": 18038, "token_acc": 0.9921875, "train_speed(iter/s)": 0.953983 }, { "epoch": 0.5860052626449663, "grad_norm": 0.341496080160141, "learning_rate": 3.9983196786669875e-06, "loss": 0.013868514448404312, "memory(GiB)": 21.48, "step": 18039, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.953994 }, { "epoch": 0.5860377481077218, "grad_norm": 0.4533975124359131, "learning_rate": 3.997793421087185e-06, "loss": 0.020701967179775238, "memory(GiB)": 21.48, "step": 18040, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.954005 }, { "epoch": 0.5860702335704772, "grad_norm": 0.31774571537971497, "learning_rate": 3.997267175074179e-06, "loss": 0.013886203989386559, "memory(GiB)": 21.48, "step": 18041, "token_acc": 1.0, "train_speed(iter/s)": 0.954016 }, { "epoch": 0.5861027190332326, "grad_norm": 0.35196560621261597, "learning_rate": 3.996740940634045e-06, "loss": 0.01925239898264408, "memory(GiB)": 21.48, "step": 18042, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.954028 }, { "epoch": 0.586135204495988, "grad_norm": 0.37590885162353516, "learning_rate": 3.996214717772856e-06, "loss": 0.02373392879962921, "memory(GiB)": 21.48, "step": 18043, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.954038 }, { "epoch": 0.5861676899587435, "grad_norm": 0.22316697239875793, "learning_rate": 3.995688506496685e-06, "loss": 0.013192864134907722, "memory(GiB)": 21.48, "step": 18044, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95405 }, { "epoch": 0.5862001754214988, "grad_norm": 0.4752982258796692, "learning_rate": 3.995162306811605e-06, "loss": 0.01661931350827217, "memory(GiB)": 21.48, "step": 18045, "token_acc": 1.0, "train_speed(iter/s)": 0.954061 }, { "epoch": 0.5862326608842543, "grad_norm": 0.2256576120853424, "learning_rate": 3.99463611872369e-06, "loss": 0.015226852148771286, "memory(GiB)": 21.48, "step": 18046, "token_acc": 1.0, "train_speed(iter/s)": 0.954072 }, { "epoch": 0.5862651463470097, "grad_norm": 0.4542772173881531, "learning_rate": 3.994109942239012e-06, "loss": 0.028804637491703033, "memory(GiB)": 21.48, "step": 18047, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.954082 }, { "epoch": 0.5862976318097651, "grad_norm": 0.3394663333892822, "learning_rate": 3.993583777363646e-06, "loss": 0.01603151485323906, "memory(GiB)": 21.48, "step": 18048, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.954091 }, { "epoch": 0.5863301172725206, "grad_norm": 0.45398038625717163, "learning_rate": 3.9930576241036586e-06, "loss": 0.02172168530523777, "memory(GiB)": 21.48, "step": 18049, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.9541 }, { "epoch": 0.586362602735276, "grad_norm": 0.32646673917770386, "learning_rate": 3.99253148246513e-06, "loss": 0.019685927778482437, "memory(GiB)": 21.48, "step": 18050, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954109 }, { "epoch": 0.5863950881980314, "grad_norm": 0.3679508566856384, "learning_rate": 3.9920053524541265e-06, "loss": 0.03018377535045147, "memory(GiB)": 21.48, "step": 18051, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954116 }, { "epoch": 0.5864275736607868, "grad_norm": 0.20387203991413116, "learning_rate": 3.9914792340767264e-06, "loss": 0.011070779524743557, "memory(GiB)": 21.48, "step": 18052, "token_acc": 1.0, "train_speed(iter/s)": 0.954125 }, { "epoch": 0.5864600591235423, "grad_norm": 0.2936336398124695, "learning_rate": 3.990953127338995e-06, "loss": 0.015524902381002903, "memory(GiB)": 21.48, "step": 18053, "token_acc": 1.0, "train_speed(iter/s)": 0.954134 }, { "epoch": 0.5864925445862976, "grad_norm": 0.347939133644104, "learning_rate": 3.990427032247007e-06, "loss": 0.014349005185067654, "memory(GiB)": 21.48, "step": 18054, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954143 }, { "epoch": 0.5865250300490531, "grad_norm": 1.826869249343872, "learning_rate": 3.989900948806838e-06, "loss": 0.017767509445548058, "memory(GiB)": 21.48, "step": 18055, "token_acc": 1.0, "train_speed(iter/s)": 0.954152 }, { "epoch": 0.5865575155118085, "grad_norm": 0.39456209540367126, "learning_rate": 3.9893748770245535e-06, "loss": 0.017188318073749542, "memory(GiB)": 21.48, "step": 18056, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.954161 }, { "epoch": 0.5865900009745639, "grad_norm": 0.3119831681251526, "learning_rate": 3.988848816906231e-06, "loss": 0.011302047409117222, "memory(GiB)": 21.48, "step": 18057, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954169 }, { "epoch": 0.5866224864373193, "grad_norm": 0.34640198945999146, "learning_rate": 3.9883227684579365e-06, "loss": 0.018948812037706375, "memory(GiB)": 21.48, "step": 18058, "token_acc": 1.0, "train_speed(iter/s)": 0.954177 }, { "epoch": 0.5866549719000748, "grad_norm": 0.3474668860435486, "learning_rate": 3.9877967316857454e-06, "loss": 0.01650121435523033, "memory(GiB)": 21.48, "step": 18059, "token_acc": 1.0, "train_speed(iter/s)": 0.954184 }, { "epoch": 0.5866874573628301, "grad_norm": 0.3916251063346863, "learning_rate": 3.987270706595726e-06, "loss": 0.01749076135456562, "memory(GiB)": 21.48, "step": 18060, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.954191 }, { "epoch": 0.5867199428255856, "grad_norm": 0.49446532130241394, "learning_rate": 3.986744693193952e-06, "loss": 0.02181384526193142, "memory(GiB)": 21.48, "step": 18061, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954199 }, { "epoch": 0.586752428288341, "grad_norm": 0.30579903721809387, "learning_rate": 3.986218691486491e-06, "loss": 0.015355118550360203, "memory(GiB)": 21.48, "step": 18062, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954207 }, { "epoch": 0.5867849137510964, "grad_norm": 0.3440016806125641, "learning_rate": 3.985692701479418e-06, "loss": 0.017414163798093796, "memory(GiB)": 21.48, "step": 18063, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.954215 }, { "epoch": 0.5868173992138518, "grad_norm": 0.2691468596458435, "learning_rate": 3.9851667231787986e-06, "loss": 0.012834547087550163, "memory(GiB)": 21.48, "step": 18064, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954221 }, { "epoch": 0.5868498846766073, "grad_norm": 0.3437962830066681, "learning_rate": 3.984640756590709e-06, "loss": 0.016732433810830116, "memory(GiB)": 21.48, "step": 18065, "token_acc": 1.0, "train_speed(iter/s)": 0.954229 }, { "epoch": 0.5868823701393626, "grad_norm": 0.4426964521408081, "learning_rate": 3.984114801721213e-06, "loss": 0.01950536109507084, "memory(GiB)": 21.48, "step": 18066, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954236 }, { "epoch": 0.5869148556021181, "grad_norm": 0.34674161672592163, "learning_rate": 3.983588858576387e-06, "loss": 0.01994560845196247, "memory(GiB)": 21.48, "step": 18067, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954243 }, { "epoch": 0.5869473410648735, "grad_norm": 0.3632790446281433, "learning_rate": 3.983062927162297e-06, "loss": 0.021709470078349113, "memory(GiB)": 21.48, "step": 18068, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.954236 }, { "epoch": 0.5869798265276289, "grad_norm": 0.7937578558921814, "learning_rate": 3.982537007485016e-06, "loss": 0.024303652346134186, "memory(GiB)": 21.48, "step": 18069, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954244 }, { "epoch": 0.5870123119903843, "grad_norm": 0.4210250973701477, "learning_rate": 3.98201109955061e-06, "loss": 0.02184612676501274, "memory(GiB)": 21.48, "step": 18070, "token_acc": 1.0, "train_speed(iter/s)": 0.954252 }, { "epoch": 0.5870447974531398, "grad_norm": 0.30715614557266235, "learning_rate": 3.981485203365152e-06, "loss": 0.011871330440044403, "memory(GiB)": 21.48, "step": 18071, "token_acc": 1.0, "train_speed(iter/s)": 0.954259 }, { "epoch": 0.5870772829158951, "grad_norm": 0.4723368287086487, "learning_rate": 3.980959318934711e-06, "loss": 0.02402031607925892, "memory(GiB)": 21.48, "step": 18072, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.954267 }, { "epoch": 0.5871097683786506, "grad_norm": 0.48668521642684937, "learning_rate": 3.980433446265354e-06, "loss": 0.022478792816400528, "memory(GiB)": 21.48, "step": 18073, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954276 }, { "epoch": 0.587142253841406, "grad_norm": 0.3127303421497345, "learning_rate": 3.979907585363153e-06, "loss": 0.012297684326767921, "memory(GiB)": 21.48, "step": 18074, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954284 }, { "epoch": 0.5871747393041614, "grad_norm": 0.5294030904769897, "learning_rate": 3.979381736234177e-06, "loss": 0.01860208436846733, "memory(GiB)": 21.48, "step": 18075, "token_acc": 1.0, "train_speed(iter/s)": 0.954292 }, { "epoch": 0.5872072247669168, "grad_norm": 0.2661786675453186, "learning_rate": 3.978855898884495e-06, "loss": 0.01787375845015049, "memory(GiB)": 21.48, "step": 18076, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954301 }, { "epoch": 0.5872397102296723, "grad_norm": 0.4232367277145386, "learning_rate": 3.9783300733201715e-06, "loss": 0.014817148447036743, "memory(GiB)": 21.48, "step": 18077, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.954312 }, { "epoch": 0.5872721956924276, "grad_norm": 0.5905805826187134, "learning_rate": 3.977804259547283e-06, "loss": 0.01651129126548767, "memory(GiB)": 21.48, "step": 18078, "token_acc": 1.0, "train_speed(iter/s)": 0.954324 }, { "epoch": 0.5873046811551831, "grad_norm": 0.49346888065338135, "learning_rate": 3.9772784575718905e-06, "loss": 0.021714819595217705, "memory(GiB)": 21.48, "step": 18079, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954334 }, { "epoch": 0.5873371666179384, "grad_norm": 0.3450584411621094, "learning_rate": 3.976752667400068e-06, "loss": 0.024956686422228813, "memory(GiB)": 21.48, "step": 18080, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954345 }, { "epoch": 0.5873696520806939, "grad_norm": 0.4078642725944519, "learning_rate": 3.976226889037881e-06, "loss": 0.016839848831295967, "memory(GiB)": 21.48, "step": 18081, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.954356 }, { "epoch": 0.5874021375434493, "grad_norm": 0.30470365285873413, "learning_rate": 3.975701122491399e-06, "loss": 0.01924785226583481, "memory(GiB)": 21.48, "step": 18082, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.954367 }, { "epoch": 0.5874346230062047, "grad_norm": 0.31877392530441284, "learning_rate": 3.975175367766687e-06, "loss": 0.016384785994887352, "memory(GiB)": 21.48, "step": 18083, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954378 }, { "epoch": 0.5874671084689601, "grad_norm": 0.30544254183769226, "learning_rate": 3.974649624869818e-06, "loss": 0.01835111528635025, "memory(GiB)": 21.48, "step": 18084, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.954388 }, { "epoch": 0.5874995939317156, "grad_norm": 0.5193422436714172, "learning_rate": 3.974123893806856e-06, "loss": 0.016490750014781952, "memory(GiB)": 21.48, "step": 18085, "token_acc": 0.98828125, "train_speed(iter/s)": 0.9544 }, { "epoch": 0.5875320793944709, "grad_norm": 0.3480611741542816, "learning_rate": 3.973598174583869e-06, "loss": 0.023839250206947327, "memory(GiB)": 21.48, "step": 18086, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954411 }, { "epoch": 0.5875645648572264, "grad_norm": 0.34343209862709045, "learning_rate": 3.973072467206926e-06, "loss": 0.01419034507125616, "memory(GiB)": 21.48, "step": 18087, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954422 }, { "epoch": 0.5875970503199818, "grad_norm": 0.3414858877658844, "learning_rate": 3.972546771682093e-06, "loss": 0.016768669709563255, "memory(GiB)": 21.48, "step": 18088, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.954433 }, { "epoch": 0.5876295357827372, "grad_norm": 0.4707390367984772, "learning_rate": 3.9720210880154385e-06, "loss": 0.02015688829123974, "memory(GiB)": 21.48, "step": 18089, "token_acc": 1.0, "train_speed(iter/s)": 0.954445 }, { "epoch": 0.5876620212454926, "grad_norm": 0.3359869122505188, "learning_rate": 3.971495416213028e-06, "loss": 0.016832303255796432, "memory(GiB)": 21.48, "step": 18090, "token_acc": 1.0, "train_speed(iter/s)": 0.954455 }, { "epoch": 0.5876945067082481, "grad_norm": 0.3874569535255432, "learning_rate": 3.97096975628093e-06, "loss": 0.013625022023916245, "memory(GiB)": 21.48, "step": 18091, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.954467 }, { "epoch": 0.5877269921710034, "grad_norm": 0.480656236410141, "learning_rate": 3.9704441082252104e-06, "loss": 0.02668224461376667, "memory(GiB)": 21.48, "step": 18092, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954474 }, { "epoch": 0.5877594776337589, "grad_norm": 0.28202468156814575, "learning_rate": 3.969918472051937e-06, "loss": 0.01935187727212906, "memory(GiB)": 21.48, "step": 18093, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954485 }, { "epoch": 0.5877919630965143, "grad_norm": 0.39197108149528503, "learning_rate": 3.969392847767174e-06, "loss": 0.016279203817248344, "memory(GiB)": 21.48, "step": 18094, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954496 }, { "epoch": 0.5878244485592697, "grad_norm": 0.4330074191093445, "learning_rate": 3.968867235376991e-06, "loss": 0.019006872549653053, "memory(GiB)": 21.48, "step": 18095, "token_acc": 1.0, "train_speed(iter/s)": 0.954507 }, { "epoch": 0.5878569340220251, "grad_norm": 0.35072991251945496, "learning_rate": 3.968341634887451e-06, "loss": 0.019807394593954086, "memory(GiB)": 21.48, "step": 18096, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.954518 }, { "epoch": 0.5878894194847806, "grad_norm": 0.3195476531982422, "learning_rate": 3.967816046304622e-06, "loss": 0.015247074887156487, "memory(GiB)": 21.48, "step": 18097, "token_acc": 1.0, "train_speed(iter/s)": 0.954529 }, { "epoch": 0.5879219049475359, "grad_norm": 0.3836337924003601, "learning_rate": 3.96729046963457e-06, "loss": 0.012444110587239265, "memory(GiB)": 21.48, "step": 18098, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95454 }, { "epoch": 0.5879543904102914, "grad_norm": 0.49895158410072327, "learning_rate": 3.966764904883362e-06, "loss": 0.022993039339780807, "memory(GiB)": 21.48, "step": 18099, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954551 }, { "epoch": 0.5879868758730468, "grad_norm": 0.472043514251709, "learning_rate": 3.9662393520570585e-06, "loss": 0.018663857132196426, "memory(GiB)": 21.48, "step": 18100, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954562 }, { "epoch": 0.5880193613358022, "grad_norm": 0.28828588128089905, "learning_rate": 3.965713811161731e-06, "loss": 0.015764029696583748, "memory(GiB)": 21.48, "step": 18101, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.954573 }, { "epoch": 0.5880518467985576, "grad_norm": 0.3473997712135315, "learning_rate": 3.965188282203443e-06, "loss": 0.01814984530210495, "memory(GiB)": 21.48, "step": 18102, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954584 }, { "epoch": 0.5880843322613131, "grad_norm": 0.3296620845794678, "learning_rate": 3.964662765188259e-06, "loss": 0.01836184225976467, "memory(GiB)": 21.48, "step": 18103, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.954595 }, { "epoch": 0.5881168177240684, "grad_norm": 0.4946385324001312, "learning_rate": 3.964137260122246e-06, "loss": 0.02124965377151966, "memory(GiB)": 21.48, "step": 18104, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.954606 }, { "epoch": 0.5881493031868239, "grad_norm": 0.4383469820022583, "learning_rate": 3.963611767011465e-06, "loss": 0.02393443137407303, "memory(GiB)": 21.48, "step": 18105, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.954617 }, { "epoch": 0.5881817886495793, "grad_norm": 0.44839590787887573, "learning_rate": 3.963086285861988e-06, "loss": 0.021628083661198616, "memory(GiB)": 21.48, "step": 18106, "token_acc": 0.996, "train_speed(iter/s)": 0.954628 }, { "epoch": 0.5882142741123347, "grad_norm": 0.34961774945259094, "learning_rate": 3.96256081667987e-06, "loss": 0.017722437158226967, "memory(GiB)": 21.48, "step": 18107, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.954639 }, { "epoch": 0.5882467595750901, "grad_norm": 0.4678654372692108, "learning_rate": 3.962035359471186e-06, "loss": 0.0207025445997715, "memory(GiB)": 21.48, "step": 18108, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.954651 }, { "epoch": 0.5882792450378456, "grad_norm": 0.42223185300827026, "learning_rate": 3.9615099142419925e-06, "loss": 0.014468569308519363, "memory(GiB)": 21.48, "step": 18109, "token_acc": 1.0, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.5883117305006009, "grad_norm": 0.5396583080291748, "learning_rate": 3.960984480998358e-06, "loss": 0.018102645874023438, "memory(GiB)": 21.48, "step": 18110, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.954673 }, { "epoch": 0.5883442159633564, "grad_norm": 0.31390929222106934, "learning_rate": 3.960459059746345e-06, "loss": 0.017601467669010162, "memory(GiB)": 21.48, "step": 18111, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954684 }, { "epoch": 0.5883767014261119, "grad_norm": 0.2586558163166046, "learning_rate": 3.9599336504920184e-06, "loss": 0.015358474105596542, "memory(GiB)": 21.48, "step": 18112, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954694 }, { "epoch": 0.5884091868888672, "grad_norm": 0.3102448284626007, "learning_rate": 3.959408253241442e-06, "loss": 0.022286586463451385, "memory(GiB)": 21.48, "step": 18113, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954703 }, { "epoch": 0.5884416723516227, "grad_norm": 4.724839687347412, "learning_rate": 3.958882868000679e-06, "loss": 0.023223910480737686, "memory(GiB)": 21.48, "step": 18114, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.954712 }, { "epoch": 0.5884741578143781, "grad_norm": 0.4582686126232147, "learning_rate": 3.958357494775793e-06, "loss": 0.02334168553352356, "memory(GiB)": 21.48, "step": 18115, "token_acc": 0.996, "train_speed(iter/s)": 0.95472 }, { "epoch": 0.5885066432771335, "grad_norm": 0.5224421620368958, "learning_rate": 3.957832133572847e-06, "loss": 0.014677333645522594, "memory(GiB)": 21.48, "step": 18116, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.5885391287398889, "grad_norm": 0.5301399827003479, "learning_rate": 3.9573067843979075e-06, "loss": 0.023114673793315887, "memory(GiB)": 21.48, "step": 18117, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.5885716142026444, "grad_norm": 0.35606512427330017, "learning_rate": 3.956781447257034e-06, "loss": 0.016975777223706245, "memory(GiB)": 21.48, "step": 18118, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.954745 }, { "epoch": 0.5886040996653997, "grad_norm": 0.3241302967071533, "learning_rate": 3.956256122156292e-06, "loss": 0.019792180508375168, "memory(GiB)": 21.48, "step": 18119, "token_acc": 0.98046875, "train_speed(iter/s)": 0.954752 }, { "epoch": 0.5886365851281552, "grad_norm": 0.35615047812461853, "learning_rate": 3.9557308091017426e-06, "loss": 0.018749350681900978, "memory(GiB)": 21.48, "step": 18120, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95476 }, { "epoch": 0.5886690705909106, "grad_norm": 0.35101738572120667, "learning_rate": 3.955205508099451e-06, "loss": 0.018323104828596115, "memory(GiB)": 21.48, "step": 18121, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.588701556053666, "grad_norm": 0.3703711926937103, "learning_rate": 3.954680219155477e-06, "loss": 0.014825290068984032, "memory(GiB)": 21.48, "step": 18122, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.5887340415164214, "grad_norm": 0.3155408799648285, "learning_rate": 3.954154942275886e-06, "loss": 0.018802093341946602, "memory(GiB)": 21.48, "step": 18123, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954783 }, { "epoch": 0.5887665269791769, "grad_norm": 0.4268850088119507, "learning_rate": 3.953629677466738e-06, "loss": 0.026590049266815186, "memory(GiB)": 21.48, "step": 18124, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.95479 }, { "epoch": 0.5887990124419322, "grad_norm": 0.3456876277923584, "learning_rate": 3.953104424734097e-06, "loss": 0.022437121719121933, "memory(GiB)": 21.48, "step": 18125, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954797 }, { "epoch": 0.5888314979046877, "grad_norm": 0.2645052671432495, "learning_rate": 3.952579184084023e-06, "loss": 0.013427709229290485, "memory(GiB)": 21.48, "step": 18126, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.954804 }, { "epoch": 0.5888639833674431, "grad_norm": 0.2690275013446808, "learning_rate": 3.952053955522583e-06, "loss": 0.016542669385671616, "memory(GiB)": 21.48, "step": 18127, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954811 }, { "epoch": 0.5888964688301985, "grad_norm": 0.22944378852844238, "learning_rate": 3.95152873905583e-06, "loss": 0.011981939896941185, "memory(GiB)": 21.48, "step": 18128, "token_acc": 1.0, "train_speed(iter/s)": 0.954819 }, { "epoch": 0.5889289542929539, "grad_norm": 0.23862548172473907, "learning_rate": 3.951003534689835e-06, "loss": 0.00954889040440321, "memory(GiB)": 21.48, "step": 18129, "token_acc": 1.0, "train_speed(iter/s)": 0.954827 }, { "epoch": 0.5889614397557094, "grad_norm": 0.4433458745479584, "learning_rate": 3.950478342430652e-06, "loss": 0.025094717741012573, "memory(GiB)": 21.48, "step": 18130, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.5889939252184647, "grad_norm": 0.33903077244758606, "learning_rate": 3.949953162284349e-06, "loss": 0.021989984437823296, "memory(GiB)": 21.48, "step": 18131, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954841 }, { "epoch": 0.5890264106812202, "grad_norm": 0.3597310483455658, "learning_rate": 3.949427994256982e-06, "loss": 0.02328665554523468, "memory(GiB)": 21.48, "step": 18132, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954848 }, { "epoch": 0.5890588961439756, "grad_norm": 0.3150378465652466, "learning_rate": 3.948902838354614e-06, "loss": 0.015155065804719925, "memory(GiB)": 21.48, "step": 18133, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954856 }, { "epoch": 0.589091381606731, "grad_norm": 0.3754810392856598, "learning_rate": 3.948377694583308e-06, "loss": 0.020877225324511528, "memory(GiB)": 21.48, "step": 18134, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.5891238670694864, "grad_norm": 0.47508004307746887, "learning_rate": 3.94785256294912e-06, "loss": 0.017476234585046768, "memory(GiB)": 21.48, "step": 18135, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.5891563525322419, "grad_norm": 0.24231193959712982, "learning_rate": 3.947327443458119e-06, "loss": 0.013935904949903488, "memory(GiB)": 21.48, "step": 18136, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.954876 }, { "epoch": 0.5891888379949972, "grad_norm": 0.37476515769958496, "learning_rate": 3.946802336116355e-06, "loss": 0.017160959541797638, "memory(GiB)": 21.48, "step": 18137, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.5892213234577527, "grad_norm": 0.3631085157394409, "learning_rate": 3.9462772409298984e-06, "loss": 0.027721194550395012, "memory(GiB)": 21.48, "step": 18138, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.589253808920508, "grad_norm": 0.2868824303150177, "learning_rate": 3.945752157904802e-06, "loss": 0.013894874602556229, "memory(GiB)": 21.48, "step": 18139, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954909 }, { "epoch": 0.5892862943832635, "grad_norm": 0.3903043270111084, "learning_rate": 3.94522708704713e-06, "loss": 0.01723124459385872, "memory(GiB)": 21.48, "step": 18140, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95492 }, { "epoch": 0.5893187798460189, "grad_norm": 0.40757083892822266, "learning_rate": 3.944702028362941e-06, "loss": 0.019782526418566704, "memory(GiB)": 21.48, "step": 18141, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.5893512653087744, "grad_norm": 0.526135265827179, "learning_rate": 3.944176981858296e-06, "loss": 0.026757976040244102, "memory(GiB)": 21.48, "step": 18142, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.5893837507715297, "grad_norm": 0.2230120599269867, "learning_rate": 3.943651947539253e-06, "loss": 0.013078310526907444, "memory(GiB)": 21.48, "step": 18143, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.954952 }, { "epoch": 0.5894162362342852, "grad_norm": 0.22477979958057404, "learning_rate": 3.943126925411873e-06, "loss": 0.013449102640151978, "memory(GiB)": 21.48, "step": 18144, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954964 }, { "epoch": 0.5894487216970405, "grad_norm": 0.46040061116218567, "learning_rate": 3.942601915482214e-06, "loss": 0.027621962130069733, "memory(GiB)": 21.48, "step": 18145, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954975 }, { "epoch": 0.589481207159796, "grad_norm": 0.35639244318008423, "learning_rate": 3.942076917756338e-06, "loss": 0.019089385867118835, "memory(GiB)": 21.48, "step": 18146, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954986 }, { "epoch": 0.5895136926225514, "grad_norm": 0.37273016571998596, "learning_rate": 3.941551932240301e-06, "loss": 0.013768540695309639, "memory(GiB)": 21.48, "step": 18147, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954997 }, { "epoch": 0.5895461780853068, "grad_norm": 0.32152271270751953, "learning_rate": 3.941026958940163e-06, "loss": 0.01567605696618557, "memory(GiB)": 21.48, "step": 18148, "token_acc": 0.992, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.5895786635480622, "grad_norm": 0.3242108225822449, "learning_rate": 3.940501997861985e-06, "loss": 0.013795793056488037, "memory(GiB)": 21.48, "step": 18149, "token_acc": 1.0, "train_speed(iter/s)": 0.955019 }, { "epoch": 0.5896111490108177, "grad_norm": 0.32075610756874084, "learning_rate": 3.9399770490118225e-06, "loss": 0.021517086774110794, "memory(GiB)": 21.48, "step": 18150, "token_acc": 1.0, "train_speed(iter/s)": 0.95503 }, { "epoch": 0.589643634473573, "grad_norm": 0.40366852283477783, "learning_rate": 3.939452112395737e-06, "loss": 0.020356707274913788, "memory(GiB)": 21.48, "step": 18151, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.95504 }, { "epoch": 0.5896761199363285, "grad_norm": 0.2762310206890106, "learning_rate": 3.9389271880197855e-06, "loss": 0.018582280725240707, "memory(GiB)": 21.48, "step": 18152, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955051 }, { "epoch": 0.5897086053990839, "grad_norm": 0.34098511934280396, "learning_rate": 3.938402275890027e-06, "loss": 0.018021468073129654, "memory(GiB)": 21.48, "step": 18153, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955062 }, { "epoch": 0.5897410908618393, "grad_norm": 0.308651864528656, "learning_rate": 3.9378773760125176e-06, "loss": 0.025092771276831627, "memory(GiB)": 21.48, "step": 18154, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955073 }, { "epoch": 0.5897735763245947, "grad_norm": 0.408708781003952, "learning_rate": 3.937352488393319e-06, "loss": 0.028338657692074776, "memory(GiB)": 21.48, "step": 18155, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.5898060617873502, "grad_norm": 0.27970826625823975, "learning_rate": 3.936827613038485e-06, "loss": 0.012918880209326744, "memory(GiB)": 21.48, "step": 18156, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.5898385472501055, "grad_norm": 0.4206036925315857, "learning_rate": 3.936302749954079e-06, "loss": 0.019630661234259605, "memory(GiB)": 21.48, "step": 18157, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.955106 }, { "epoch": 0.589871032712861, "grad_norm": 0.26797154545783997, "learning_rate": 3.9357778991461505e-06, "loss": 0.01347995176911354, "memory(GiB)": 21.48, "step": 18158, "token_acc": 1.0, "train_speed(iter/s)": 0.955117 }, { "epoch": 0.5899035181756164, "grad_norm": 0.35730865597724915, "learning_rate": 3.935253060620765e-06, "loss": 0.01650466024875641, "memory(GiB)": 21.48, "step": 18159, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.5899360036383718, "grad_norm": 0.26283302903175354, "learning_rate": 3.934728234383973e-06, "loss": 0.009930022060871124, "memory(GiB)": 21.48, "step": 18160, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955139 }, { "epoch": 0.5899684891011272, "grad_norm": 0.33223214745521545, "learning_rate": 3.934203420441839e-06, "loss": 0.018160581588745117, "memory(GiB)": 21.48, "step": 18161, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.5900009745638827, "grad_norm": 0.4716564118862152, "learning_rate": 3.933678618800414e-06, "loss": 0.029453923925757408, "memory(GiB)": 21.48, "step": 18162, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955161 }, { "epoch": 0.590033460026638, "grad_norm": 0.24998223781585693, "learning_rate": 3.9331538294657555e-06, "loss": 0.011032755486667156, "memory(GiB)": 21.48, "step": 18163, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955172 }, { "epoch": 0.5900659454893935, "grad_norm": 0.30138805508613586, "learning_rate": 3.932629052443925e-06, "loss": 0.011841069906949997, "memory(GiB)": 21.48, "step": 18164, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.5900984309521489, "grad_norm": 0.37334761023521423, "learning_rate": 3.932104287740972e-06, "loss": 0.01759553886950016, "memory(GiB)": 21.48, "step": 18165, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955194 }, { "epoch": 0.5901309164149043, "grad_norm": 0.527877688407898, "learning_rate": 3.931579535362961e-06, "loss": 0.02018778771162033, "memory(GiB)": 21.48, "step": 18166, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955204 }, { "epoch": 0.5901634018776597, "grad_norm": 0.4088326394557953, "learning_rate": 3.9310547953159416e-06, "loss": 0.012344805523753166, "memory(GiB)": 21.48, "step": 18167, "token_acc": 1.0, "train_speed(iter/s)": 0.955215 }, { "epoch": 0.5901958873404152, "grad_norm": 0.28519171476364136, "learning_rate": 3.930530067605974e-06, "loss": 0.013115671463310719, "memory(GiB)": 21.48, "step": 18168, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955226 }, { "epoch": 0.5902283728031705, "grad_norm": 0.4252801537513733, "learning_rate": 3.930005352239113e-06, "loss": 0.020896688103675842, "memory(GiB)": 21.48, "step": 18169, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.590260858265926, "grad_norm": 0.260260671377182, "learning_rate": 3.929480649221413e-06, "loss": 0.014437505044043064, "memory(GiB)": 21.48, "step": 18170, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.5902933437286814, "grad_norm": 0.43590179085731506, "learning_rate": 3.928955958558931e-06, "loss": 0.021931976079940796, "memory(GiB)": 21.48, "step": 18171, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.5903258291914368, "grad_norm": 0.436955988407135, "learning_rate": 3.928431280257725e-06, "loss": 0.02212735079228878, "memory(GiB)": 21.48, "step": 18172, "token_acc": 1.0, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.5903583146541922, "grad_norm": 0.32788917422294617, "learning_rate": 3.927906614323847e-06, "loss": 0.015486306510865688, "memory(GiB)": 21.48, "step": 18173, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.5903908001169477, "grad_norm": 0.3045804798603058, "learning_rate": 3.927381960763353e-06, "loss": 0.014004518277943134, "memory(GiB)": 21.48, "step": 18174, "token_acc": 1.0, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.590423285579703, "grad_norm": 0.32270586490631104, "learning_rate": 3.926857319582299e-06, "loss": 0.012990908697247505, "memory(GiB)": 21.48, "step": 18175, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955304 }, { "epoch": 0.5904557710424585, "grad_norm": 0.3506185710430145, "learning_rate": 3.926332690786741e-06, "loss": 0.015764091163873672, "memory(GiB)": 21.48, "step": 18176, "token_acc": 1.0, "train_speed(iter/s)": 0.955315 }, { "epoch": 0.590488256505214, "grad_norm": 0.4643322825431824, "learning_rate": 3.925808074382732e-06, "loss": 0.0179063118994236, "memory(GiB)": 21.48, "step": 18177, "token_acc": 1.0, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.5905207419679693, "grad_norm": 0.2877691388130188, "learning_rate": 3.9252834703763265e-06, "loss": 0.01967187225818634, "memory(GiB)": 21.48, "step": 18178, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955331 }, { "epoch": 0.5905532274307248, "grad_norm": 0.42628827691078186, "learning_rate": 3.924758878773582e-06, "loss": 0.02119390293955803, "memory(GiB)": 21.48, "step": 18179, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.5905857128934802, "grad_norm": 0.40747323632240295, "learning_rate": 3.92423429958055e-06, "loss": 0.02045762538909912, "memory(GiB)": 21.48, "step": 18180, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.5906181983562356, "grad_norm": 0.2892860472202301, "learning_rate": 3.923709732803287e-06, "loss": 0.02031959779560566, "memory(GiB)": 21.48, "step": 18181, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.590650683818991, "grad_norm": 0.436480313539505, "learning_rate": 3.923185178447845e-06, "loss": 0.022535856813192368, "memory(GiB)": 21.48, "step": 18182, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955362 }, { "epoch": 0.5906831692817465, "grad_norm": 0.3232188820838928, "learning_rate": 3.92266063652028e-06, "loss": 0.013365405611693859, "memory(GiB)": 21.48, "step": 18183, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955369 }, { "epoch": 0.5907156547445018, "grad_norm": 0.4517001509666443, "learning_rate": 3.922136107026646e-06, "loss": 0.0242448840290308, "memory(GiB)": 21.48, "step": 18184, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.5907481402072573, "grad_norm": 0.44528260827064514, "learning_rate": 3.921611589972996e-06, "loss": 0.017475249245762825, "memory(GiB)": 21.48, "step": 18185, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955383 }, { "epoch": 0.5907806256700127, "grad_norm": 0.3459785580635071, "learning_rate": 3.921087085365381e-06, "loss": 0.015982870012521744, "memory(GiB)": 21.48, "step": 18186, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.5908131111327681, "grad_norm": 0.2897588908672333, "learning_rate": 3.92056259320986e-06, "loss": 0.01638410985469818, "memory(GiB)": 21.48, "step": 18187, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955399 }, { "epoch": 0.5908455965955235, "grad_norm": 0.3724408745765686, "learning_rate": 3.9200381135124814e-06, "loss": 0.019876990467309952, "memory(GiB)": 21.48, "step": 18188, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955406 }, { "epoch": 0.590878082058279, "grad_norm": 0.3904549181461334, "learning_rate": 3.919513646279302e-06, "loss": 0.02160004712641239, "memory(GiB)": 21.48, "step": 18189, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.5909105675210343, "grad_norm": 0.3231983780860901, "learning_rate": 3.918989191516373e-06, "loss": 0.0216355063021183, "memory(GiB)": 21.48, "step": 18190, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955419 }, { "epoch": 0.5909430529837898, "grad_norm": 0.4460696280002594, "learning_rate": 3.918464749229747e-06, "loss": 0.01755109615623951, "memory(GiB)": 21.48, "step": 18191, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955426 }, { "epoch": 0.5909755384465452, "grad_norm": 0.4344032108783722, "learning_rate": 3.917940319425477e-06, "loss": 0.01999102719128132, "memory(GiB)": 21.48, "step": 18192, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.5910080239093006, "grad_norm": 0.29803794622421265, "learning_rate": 3.917415902109615e-06, "loss": 0.01632506400346756, "memory(GiB)": 21.48, "step": 18193, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.591040509372056, "grad_norm": 0.396940141916275, "learning_rate": 3.916891497288217e-06, "loss": 0.019278788939118385, "memory(GiB)": 21.48, "step": 18194, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.5910729948348115, "grad_norm": 0.35844624042510986, "learning_rate": 3.9163671049673315e-06, "loss": 0.015804728493094444, "memory(GiB)": 21.48, "step": 18195, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.5911054802975668, "grad_norm": 0.26762109994888306, "learning_rate": 3.915842725153012e-06, "loss": 0.01503379549831152, "memory(GiB)": 21.48, "step": 18196, "token_acc": 1.0, "train_speed(iter/s)": 0.955461 }, { "epoch": 0.5911379657603223, "grad_norm": 0.31747859716415405, "learning_rate": 3.91531835785131e-06, "loss": 0.02031855657696724, "memory(GiB)": 21.48, "step": 18197, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.5911704512230777, "grad_norm": 0.6935649514198303, "learning_rate": 3.914794003068278e-06, "loss": 0.014991520904004574, "memory(GiB)": 21.48, "step": 18198, "token_acc": 1.0, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.5912029366858331, "grad_norm": 0.42051196098327637, "learning_rate": 3.914269660809967e-06, "loss": 0.027144283056259155, "memory(GiB)": 21.48, "step": 18199, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.5912354221485885, "grad_norm": 0.42120155692100525, "learning_rate": 3.91374533108243e-06, "loss": 0.014116746373474598, "memory(GiB)": 21.48, "step": 18200, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.591267907611344, "grad_norm": 0.37785589694976807, "learning_rate": 3.913221013891718e-06, "loss": 0.021978868171572685, "memory(GiB)": 21.48, "step": 18201, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955508 }, { "epoch": 0.5913003930740993, "grad_norm": 0.36795419454574585, "learning_rate": 3.9126967092438824e-06, "loss": 0.017658794298768044, "memory(GiB)": 21.48, "step": 18202, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.5913328785368548, "grad_norm": 0.7299537062644958, "learning_rate": 3.912172417144973e-06, "loss": 0.015810489654541016, "memory(GiB)": 21.48, "step": 18203, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.5913653639996101, "grad_norm": 0.28776857256889343, "learning_rate": 3.911648137601042e-06, "loss": 0.018259411677718163, "memory(GiB)": 21.48, "step": 18204, "token_acc": 1.0, "train_speed(iter/s)": 0.955538 }, { "epoch": 0.5913978494623656, "grad_norm": 0.3327987790107727, "learning_rate": 3.91112387061814e-06, "loss": 0.014183166436851025, "memory(GiB)": 21.48, "step": 18205, "token_acc": 1.0, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.591430334925121, "grad_norm": 0.7420908212661743, "learning_rate": 3.910599616202319e-06, "loss": 0.024428894743323326, "memory(GiB)": 21.48, "step": 18206, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955559 }, { "epoch": 0.5914628203878765, "grad_norm": 0.6326125860214233, "learning_rate": 3.9100753743596255e-06, "loss": 0.020173095166683197, "memory(GiB)": 21.48, "step": 18207, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95557 }, { "epoch": 0.5914953058506318, "grad_norm": 0.4163416028022766, "learning_rate": 3.909551145096117e-06, "loss": 0.020275302231311798, "memory(GiB)": 21.48, "step": 18208, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.5915277913133873, "grad_norm": 0.714594304561615, "learning_rate": 3.909026928417836e-06, "loss": 0.02626800909638405, "memory(GiB)": 21.48, "step": 18209, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.5915602767761426, "grad_norm": 0.3068542778491974, "learning_rate": 3.908502724330837e-06, "loss": 0.012637237086892128, "memory(GiB)": 21.48, "step": 18210, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.5915927622388981, "grad_norm": 0.6245975494384766, "learning_rate": 3.907978532841171e-06, "loss": 0.01713845133781433, "memory(GiB)": 21.48, "step": 18211, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955613 }, { "epoch": 0.5916252477016535, "grad_norm": 0.41790321469306946, "learning_rate": 3.907454353954884e-06, "loss": 0.022941797971725464, "memory(GiB)": 21.48, "step": 18212, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955624 }, { "epoch": 0.591657733164409, "grad_norm": 0.3660658001899719, "learning_rate": 3.9069301876780305e-06, "loss": 0.016541672870516777, "memory(GiB)": 21.48, "step": 18213, "token_acc": 1.0, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.5916902186271643, "grad_norm": 0.4293355941772461, "learning_rate": 3.906406034016655e-06, "loss": 0.016334660351276398, "memory(GiB)": 21.48, "step": 18214, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.5917227040899198, "grad_norm": 0.3356965184211731, "learning_rate": 3.905881892976813e-06, "loss": 0.013754214160144329, "memory(GiB)": 21.48, "step": 18215, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.5917551895526751, "grad_norm": 0.4276822805404663, "learning_rate": 3.905357764564545e-06, "loss": 0.017440151423215866, "memory(GiB)": 21.48, "step": 18216, "token_acc": 1.0, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.5917876750154306, "grad_norm": 0.29428526759147644, "learning_rate": 3.904833648785911e-06, "loss": 0.018472302705049515, "memory(GiB)": 21.48, "step": 18217, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95568 }, { "epoch": 0.591820160478186, "grad_norm": 0.23179641366004944, "learning_rate": 3.9043095456469505e-06, "loss": 0.008489903062582016, "memory(GiB)": 21.48, "step": 18218, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.5918526459409414, "grad_norm": 0.39602336287498474, "learning_rate": 3.903785455153718e-06, "loss": 0.02053375542163849, "memory(GiB)": 21.48, "step": 18219, "token_acc": 0.99, "train_speed(iter/s)": 0.955702 }, { "epoch": 0.5918851314036968, "grad_norm": 0.24259574711322784, "learning_rate": 3.903261377312259e-06, "loss": 0.010355123318731785, "memory(GiB)": 21.48, "step": 18220, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.5919176168664523, "grad_norm": 0.45228493213653564, "learning_rate": 3.902737312128625e-06, "loss": 0.022147364914417267, "memory(GiB)": 21.48, "step": 18221, "token_acc": 0.98, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.5919501023292076, "grad_norm": 0.23919077217578888, "learning_rate": 3.902213259608861e-06, "loss": 0.010106700472533703, "memory(GiB)": 21.48, "step": 18222, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.5919825877919631, "grad_norm": 0.2469261884689331, "learning_rate": 3.901689219759019e-06, "loss": 0.013211878016591072, "memory(GiB)": 21.48, "step": 18223, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.5920150732547185, "grad_norm": 2.425014019012451, "learning_rate": 3.901165192585144e-06, "loss": 0.02460315451025963, "memory(GiB)": 21.48, "step": 18224, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.5920475587174739, "grad_norm": 0.24511846899986267, "learning_rate": 3.900641178093285e-06, "loss": 0.01080562174320221, "memory(GiB)": 21.48, "step": 18225, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.5920800441802293, "grad_norm": 0.24806654453277588, "learning_rate": 3.900117176289491e-06, "loss": 0.010272457264363766, "memory(GiB)": 21.48, "step": 18226, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.955779 }, { "epoch": 0.5921125296429848, "grad_norm": 0.2916363775730133, "learning_rate": 3.899593187179808e-06, "loss": 0.014079015702009201, "memory(GiB)": 21.48, "step": 18227, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.5921450151057401, "grad_norm": 0.3225375711917877, "learning_rate": 3.899069210770285e-06, "loss": 0.015448624268174171, "memory(GiB)": 21.48, "step": 18228, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.5921775005684956, "grad_norm": 0.37906402349472046, "learning_rate": 3.8985452470669685e-06, "loss": 0.018694089725613594, "memory(GiB)": 21.48, "step": 18229, "token_acc": 1.0, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.592209986031251, "grad_norm": 0.4205176830291748, "learning_rate": 3.898021296075906e-06, "loss": 0.02505050040781498, "memory(GiB)": 21.48, "step": 18230, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955824 }, { "epoch": 0.5922424714940064, "grad_norm": 0.38634923100471497, "learning_rate": 3.897497357803143e-06, "loss": 0.024900630116462708, "memory(GiB)": 21.48, "step": 18231, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.5922749569567618, "grad_norm": 0.36401331424713135, "learning_rate": 3.89697343225473e-06, "loss": 0.016194332391023636, "memory(GiB)": 21.48, "step": 18232, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.5923074424195173, "grad_norm": 0.35590818524360657, "learning_rate": 3.89644951943671e-06, "loss": 0.018659546971321106, "memory(GiB)": 21.48, "step": 18233, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955857 }, { "epoch": 0.5923399278822726, "grad_norm": 0.4692496955394745, "learning_rate": 3.895925619355133e-06, "loss": 0.019610164687037468, "memory(GiB)": 21.48, "step": 18234, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.955869 }, { "epoch": 0.5923724133450281, "grad_norm": 0.2936708629131317, "learning_rate": 3.895401732016043e-06, "loss": 0.016390301287174225, "memory(GiB)": 21.48, "step": 18235, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.5924048988077835, "grad_norm": 0.408013254404068, "learning_rate": 3.894877857425489e-06, "loss": 0.016971899196505547, "memory(GiB)": 21.48, "step": 18236, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955891 }, { "epoch": 0.5924373842705389, "grad_norm": 0.4058293104171753, "learning_rate": 3.894353995589512e-06, "loss": 0.018020831048488617, "memory(GiB)": 21.48, "step": 18237, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.9559 }, { "epoch": 0.5924698697332943, "grad_norm": 0.35074323415756226, "learning_rate": 3.893830146514164e-06, "loss": 0.01944560557603836, "memory(GiB)": 21.48, "step": 18238, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.5925023551960498, "grad_norm": 0.29972949624061584, "learning_rate": 3.893306310205487e-06, "loss": 0.018500899896025658, "memory(GiB)": 21.48, "step": 18239, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.5925348406588052, "grad_norm": 0.4448241889476776, "learning_rate": 3.8927824866695275e-06, "loss": 0.023546064272522926, "memory(GiB)": 21.48, "step": 18240, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.5925673261215606, "grad_norm": 0.33771729469299316, "learning_rate": 3.892258675912335e-06, "loss": 0.02047434076666832, "memory(GiB)": 21.48, "step": 18241, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.5925998115843161, "grad_norm": 0.45225822925567627, "learning_rate": 3.891734877939948e-06, "loss": 0.02365201897919178, "memory(GiB)": 21.48, "step": 18242, "token_acc": 0.984375, "train_speed(iter/s)": 0.955943 }, { "epoch": 0.5926322970470714, "grad_norm": 0.7258397340774536, "learning_rate": 3.89121109275842e-06, "loss": 0.020917581394314766, "memory(GiB)": 21.48, "step": 18243, "token_acc": 1.0, "train_speed(iter/s)": 0.955952 }, { "epoch": 0.5926647825098269, "grad_norm": 0.29391658306121826, "learning_rate": 3.890687320373787e-06, "loss": 0.015336697921156883, "memory(GiB)": 21.48, "step": 18244, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955959 }, { "epoch": 0.5926972679725823, "grad_norm": 0.3819568455219269, "learning_rate": 3.890163560792103e-06, "loss": 0.01629805937409401, "memory(GiB)": 21.48, "step": 18245, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.5927297534353377, "grad_norm": 0.28767120838165283, "learning_rate": 3.889639814019406e-06, "loss": 0.018760252743959427, "memory(GiB)": 21.48, "step": 18246, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.5927622388980931, "grad_norm": 0.3094782531261444, "learning_rate": 3.889116080061745e-06, "loss": 0.013600524514913559, "memory(GiB)": 21.48, "step": 18247, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955983 }, { "epoch": 0.5927947243608486, "grad_norm": 0.4171872138977051, "learning_rate": 3.8885923589251615e-06, "loss": 0.01916489005088806, "memory(GiB)": 21.48, "step": 18248, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955991 }, { "epoch": 0.5928272098236039, "grad_norm": 0.28788629174232483, "learning_rate": 3.888068650615703e-06, "loss": 0.012374373152852058, "memory(GiB)": 21.48, "step": 18249, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.5928596952863594, "grad_norm": 0.3876199722290039, "learning_rate": 3.887544955139411e-06, "loss": 0.01928754150867462, "memory(GiB)": 21.48, "step": 18250, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.5928921807491148, "grad_norm": 0.17424362897872925, "learning_rate": 3.8870212725023315e-06, "loss": 0.01135145965963602, "memory(GiB)": 21.48, "step": 18251, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956016 }, { "epoch": 0.5929246662118702, "grad_norm": 0.32511240243911743, "learning_rate": 3.886497602710507e-06, "loss": 0.01320862676948309, "memory(GiB)": 21.48, "step": 18252, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.5929571516746256, "grad_norm": 0.2879191040992737, "learning_rate": 3.885973945769984e-06, "loss": 0.00932802353054285, "memory(GiB)": 21.48, "step": 18253, "token_acc": 1.0, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.5929896371373811, "grad_norm": 0.28277701139450073, "learning_rate": 3.885450301686803e-06, "loss": 0.015299467369914055, "memory(GiB)": 21.48, "step": 18254, "token_acc": 1.0, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.5930221226001364, "grad_norm": 0.3402753472328186, "learning_rate": 3.8849266704670085e-06, "loss": 0.015833627432584763, "memory(GiB)": 21.48, "step": 18255, "token_acc": 1.0, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.5930546080628919, "grad_norm": 0.4649849534034729, "learning_rate": 3.884403052116646e-06, "loss": 0.02364792674779892, "memory(GiB)": 21.48, "step": 18256, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956052 }, { "epoch": 0.5930870935256473, "grad_norm": 0.36209574341773987, "learning_rate": 3.883879446641757e-06, "loss": 0.013671832159161568, "memory(GiB)": 21.48, "step": 18257, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956061 }, { "epoch": 0.5931195789884027, "grad_norm": 0.3389972150325775, "learning_rate": 3.8833558540483835e-06, "loss": 0.016853226348757744, "memory(GiB)": 21.48, "step": 18258, "token_acc": 1.0, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.5931520644511581, "grad_norm": 0.3335943818092346, "learning_rate": 3.88283227434257e-06, "loss": 0.01133216917514801, "memory(GiB)": 21.48, "step": 18259, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.5931845499139136, "grad_norm": 0.3487350642681122, "learning_rate": 3.88230870753036e-06, "loss": 0.016483642160892487, "memory(GiB)": 21.48, "step": 18260, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956085 }, { "epoch": 0.5932170353766689, "grad_norm": 0.547464907169342, "learning_rate": 3.881785153617793e-06, "loss": 0.025565708056092262, "memory(GiB)": 21.48, "step": 18261, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.5932495208394244, "grad_norm": 0.32252591848373413, "learning_rate": 3.881261612610916e-06, "loss": 0.025087006390094757, "memory(GiB)": 21.48, "step": 18262, "token_acc": 1.0, "train_speed(iter/s)": 0.956104 }, { "epoch": 0.5932820063021798, "grad_norm": 0.4278361201286316, "learning_rate": 3.880738084515768e-06, "loss": 0.020721612498164177, "memory(GiB)": 21.48, "step": 18263, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.5933144917649352, "grad_norm": 0.31922516226768494, "learning_rate": 3.880214569338394e-06, "loss": 0.01021839864552021, "memory(GiB)": 21.48, "step": 18264, "token_acc": 1.0, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.5933469772276906, "grad_norm": 0.34954652190208435, "learning_rate": 3.879691067084831e-06, "loss": 0.017351612448692322, "memory(GiB)": 21.48, "step": 18265, "token_acc": 1.0, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.5933794626904461, "grad_norm": 0.26308000087738037, "learning_rate": 3.879167577761128e-06, "loss": 0.015532130375504494, "memory(GiB)": 21.48, "step": 18266, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.5934119481532014, "grad_norm": 0.2922770082950592, "learning_rate": 3.878644101373319e-06, "loss": 0.015941165387630463, "memory(GiB)": 21.48, "step": 18267, "token_acc": 1.0, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.5934444336159569, "grad_norm": 0.3310059905052185, "learning_rate": 3.878120637927453e-06, "loss": 0.018340300768613815, "memory(GiB)": 21.48, "step": 18268, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.5934769190787123, "grad_norm": 0.5117852091789246, "learning_rate": 3.877597187429567e-06, "loss": 0.022998925298452377, "memory(GiB)": 21.48, "step": 18269, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.5935094045414677, "grad_norm": 0.3693371117115021, "learning_rate": 3.8770737498857035e-06, "loss": 0.019771549850702286, "memory(GiB)": 21.48, "step": 18270, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.5935418900042231, "grad_norm": 0.31562405824661255, "learning_rate": 3.8765503253019024e-06, "loss": 0.018134817481040955, "memory(GiB)": 21.48, "step": 18271, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.5935743754669786, "grad_norm": 0.43225905299186707, "learning_rate": 3.876026913684205e-06, "loss": 0.015122046694159508, "memory(GiB)": 21.48, "step": 18272, "token_acc": 1.0, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.5936068609297339, "grad_norm": 0.355646014213562, "learning_rate": 3.875503515038656e-06, "loss": 0.021513212472200394, "memory(GiB)": 21.48, "step": 18273, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.95622 }, { "epoch": 0.5936393463924894, "grad_norm": 0.47061049938201904, "learning_rate": 3.8749801293712905e-06, "loss": 0.01930200308561325, "memory(GiB)": 21.48, "step": 18274, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95623 }, { "epoch": 0.5936718318552447, "grad_norm": 0.46820470690727234, "learning_rate": 3.874456756688153e-06, "loss": 0.02087607979774475, "memory(GiB)": 21.48, "step": 18275, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.5937043173180002, "grad_norm": 0.2239871770143509, "learning_rate": 3.873933396995282e-06, "loss": 0.010453009977936745, "memory(GiB)": 21.48, "step": 18276, "token_acc": 1.0, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.5937368027807556, "grad_norm": 0.40192994475364685, "learning_rate": 3.87341005029872e-06, "loss": 0.01845928095281124, "memory(GiB)": 21.48, "step": 18277, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.593769288243511, "grad_norm": 1.7460404634475708, "learning_rate": 3.872886716604503e-06, "loss": 0.019017016515135765, "memory(GiB)": 21.48, "step": 18278, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.5938017737062664, "grad_norm": 0.3290775418281555, "learning_rate": 3.872363395918676e-06, "loss": 0.01621638797223568, "memory(GiB)": 21.48, "step": 18279, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.5938342591690219, "grad_norm": 0.3845241665840149, "learning_rate": 3.871840088247274e-06, "loss": 0.023853078484535217, "memory(GiB)": 21.48, "step": 18280, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.5938667446317772, "grad_norm": 0.39468497037887573, "learning_rate": 3.871316793596341e-06, "loss": 0.018109457567334175, "memory(GiB)": 21.48, "step": 18281, "token_acc": 1.0, "train_speed(iter/s)": 0.956306 }, { "epoch": 0.5938992300945327, "grad_norm": 0.41585296392440796, "learning_rate": 3.870793511971913e-06, "loss": 0.017961286008358, "memory(GiB)": 21.48, "step": 18282, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.956317 }, { "epoch": 0.5939317155572881, "grad_norm": 0.46435582637786865, "learning_rate": 3.870270243380031e-06, "loss": 0.025199472904205322, "memory(GiB)": 21.48, "step": 18283, "token_acc": 1.0, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.5939642010200435, "grad_norm": 0.3786892592906952, "learning_rate": 3.8697469878267345e-06, "loss": 0.023724209517240524, "memory(GiB)": 21.48, "step": 18284, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95634 }, { "epoch": 0.5939966864827989, "grad_norm": 0.31978678703308105, "learning_rate": 3.869223745318063e-06, "loss": 0.016277560964226723, "memory(GiB)": 21.48, "step": 18285, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956352 }, { "epoch": 0.5940291719455544, "grad_norm": 0.4452967643737793, "learning_rate": 3.868700515860053e-06, "loss": 0.02401217818260193, "memory(GiB)": 21.48, "step": 18286, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.5940616574083097, "grad_norm": 0.4090365171432495, "learning_rate": 3.868177299458745e-06, "loss": 0.022400662302970886, "memory(GiB)": 21.48, "step": 18287, "token_acc": 1.0, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.5940941428710652, "grad_norm": 0.424152135848999, "learning_rate": 3.867654096120178e-06, "loss": 0.02146698534488678, "memory(GiB)": 21.48, "step": 18288, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956384 }, { "epoch": 0.5941266283338206, "grad_norm": 0.3738202154636383, "learning_rate": 3.867130905850389e-06, "loss": 0.01626812480390072, "memory(GiB)": 21.48, "step": 18289, "token_acc": 1.0, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.594159113796576, "grad_norm": 0.5328394770622253, "learning_rate": 3.8666077286554185e-06, "loss": 0.02258148044347763, "memory(GiB)": 21.48, "step": 18290, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.5941915992593314, "grad_norm": 0.34355491399765015, "learning_rate": 3.866084564541303e-06, "loss": 0.01602434180676937, "memory(GiB)": 21.48, "step": 18291, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.5942240847220869, "grad_norm": 0.38197970390319824, "learning_rate": 3.865561413514082e-06, "loss": 0.015746288001537323, "memory(GiB)": 21.48, "step": 18292, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956428 }, { "epoch": 0.5942565701848422, "grad_norm": 0.33857014775276184, "learning_rate": 3.865038275579788e-06, "loss": 0.023231465369462967, "memory(GiB)": 21.48, "step": 18293, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.5942890556475977, "grad_norm": 0.7128979563713074, "learning_rate": 3.8645151507444675e-06, "loss": 0.015930024906992912, "memory(GiB)": 21.48, "step": 18294, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.5943215411103531, "grad_norm": 0.29739487171173096, "learning_rate": 3.863992039014151e-06, "loss": 0.015834547579288483, "memory(GiB)": 21.48, "step": 18295, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.5943540265731085, "grad_norm": 0.41364786028862, "learning_rate": 3.863468940394881e-06, "loss": 0.019577590748667717, "memory(GiB)": 21.48, "step": 18296, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.5943865120358639, "grad_norm": 0.6120500564575195, "learning_rate": 3.862945854892689e-06, "loss": 0.022191552445292473, "memory(GiB)": 21.48, "step": 18297, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.5944189974986194, "grad_norm": 0.28674110770225525, "learning_rate": 3.862422782513617e-06, "loss": 0.01756840944290161, "memory(GiB)": 21.48, "step": 18298, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.5944514829613747, "grad_norm": 0.33077120780944824, "learning_rate": 3.861899723263699e-06, "loss": 0.024346213787794113, "memory(GiB)": 21.48, "step": 18299, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.956499 }, { "epoch": 0.5944839684241302, "grad_norm": 0.28540682792663574, "learning_rate": 3.8613766771489744e-06, "loss": 0.01964370347559452, "memory(GiB)": 21.48, "step": 18300, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956507 }, { "epoch": 0.5945164538868856, "grad_norm": 0.32626041769981384, "learning_rate": 3.860853644175477e-06, "loss": 0.01874728687107563, "memory(GiB)": 21.48, "step": 18301, "token_acc": 1.0, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.594548939349641, "grad_norm": 0.467624694108963, "learning_rate": 3.8603306243492455e-06, "loss": 0.025159625336527824, "memory(GiB)": 21.48, "step": 18302, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.5945814248123964, "grad_norm": 0.38544610142707825, "learning_rate": 3.859807617676316e-06, "loss": 0.017072707414627075, "memory(GiB)": 21.48, "step": 18303, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.5946139102751519, "grad_norm": 0.3851698338985443, "learning_rate": 3.859284624162723e-06, "loss": 0.022231439128518105, "memory(GiB)": 21.48, "step": 18304, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.5946463957379073, "grad_norm": 0.24993588030338287, "learning_rate": 3.858761643814505e-06, "loss": 0.012120607309043407, "memory(GiB)": 21.48, "step": 18305, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.5946788812006627, "grad_norm": 0.40548425912857056, "learning_rate": 3.858238676637695e-06, "loss": 0.018175311386585236, "memory(GiB)": 21.48, "step": 18306, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956554 }, { "epoch": 0.5947113666634182, "grad_norm": 0.4926624000072479, "learning_rate": 3.857715722638333e-06, "loss": 0.018839938566088676, "memory(GiB)": 21.48, "step": 18307, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.5947438521261735, "grad_norm": 0.3396211266517639, "learning_rate": 3.85719278182245e-06, "loss": 0.012773394584655762, "memory(GiB)": 21.48, "step": 18308, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.594776337588929, "grad_norm": 0.39272820949554443, "learning_rate": 3.856669854196084e-06, "loss": 0.02295631356537342, "memory(GiB)": 21.48, "step": 18309, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.5948088230516844, "grad_norm": 0.3996562659740448, "learning_rate": 3.856146939765268e-06, "loss": 0.016959741711616516, "memory(GiB)": 21.48, "step": 18310, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956584 }, { "epoch": 0.5948413085144398, "grad_norm": 0.2367173284292221, "learning_rate": 3.855624038536041e-06, "loss": 0.01372838206589222, "memory(GiB)": 21.48, "step": 18311, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.5948737939771952, "grad_norm": 0.22146625816822052, "learning_rate": 3.855101150514435e-06, "loss": 0.01326039806008339, "memory(GiB)": 21.48, "step": 18312, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956599 }, { "epoch": 0.5949062794399507, "grad_norm": 0.7124513983726501, "learning_rate": 3.854578275706485e-06, "loss": 0.021025020629167557, "memory(GiB)": 21.48, "step": 18313, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.594938764902706, "grad_norm": 0.285506933927536, "learning_rate": 3.854055414118227e-06, "loss": 0.015584650449454784, "memory(GiB)": 21.48, "step": 18314, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.5949712503654615, "grad_norm": 0.3897801339626312, "learning_rate": 3.853532565755696e-06, "loss": 0.016581125557422638, "memory(GiB)": 21.48, "step": 18315, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956621 }, { "epoch": 0.5950037358282169, "grad_norm": 0.42813554406166077, "learning_rate": 3.853009730624922e-06, "loss": 0.01656808704137802, "memory(GiB)": 21.48, "step": 18316, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956629 }, { "epoch": 0.5950362212909723, "grad_norm": 0.28980302810668945, "learning_rate": 3.8524869087319435e-06, "loss": 0.015127879567444324, "memory(GiB)": 21.48, "step": 18317, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.5950687067537277, "grad_norm": 0.38122040033340454, "learning_rate": 3.8519641000827944e-06, "loss": 0.019804034382104874, "memory(GiB)": 21.48, "step": 18318, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.5951011922164832, "grad_norm": 0.5783669948577881, "learning_rate": 3.8514413046835065e-06, "loss": 0.02310056984424591, "memory(GiB)": 21.48, "step": 18319, "token_acc": 1.0, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.5951336776792385, "grad_norm": 0.4068397283554077, "learning_rate": 3.850918522540117e-06, "loss": 0.019594503566622734, "memory(GiB)": 21.48, "step": 18320, "token_acc": 1.0, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.595166163141994, "grad_norm": 0.4029597342014313, "learning_rate": 3.850395753658654e-06, "loss": 0.01930738240480423, "memory(GiB)": 21.48, "step": 18321, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.5951986486047494, "grad_norm": 0.34632954001426697, "learning_rate": 3.849872998045157e-06, "loss": 0.015441039577126503, "memory(GiB)": 21.48, "step": 18322, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956683 }, { "epoch": 0.5952311340675048, "grad_norm": 0.3980066776275635, "learning_rate": 3.849350255705653e-06, "loss": 0.023924466222524643, "memory(GiB)": 21.48, "step": 18323, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.5952636195302602, "grad_norm": 0.4400063157081604, "learning_rate": 3.848827526646182e-06, "loss": 0.02382376417517662, "memory(GiB)": 21.48, "step": 18324, "token_acc": 1.0, "train_speed(iter/s)": 0.9567 }, { "epoch": 0.5952961049930157, "grad_norm": 0.33343973755836487, "learning_rate": 3.848304810872771e-06, "loss": 0.024054378271102905, "memory(GiB)": 21.48, "step": 18325, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.595328590455771, "grad_norm": 0.3404904305934906, "learning_rate": 3.847782108391458e-06, "loss": 0.012603392824530602, "memory(GiB)": 21.48, "step": 18326, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.5953610759185265, "grad_norm": 0.37126240134239197, "learning_rate": 3.847259419208272e-06, "loss": 0.01151550654321909, "memory(GiB)": 21.48, "step": 18327, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956726 }, { "epoch": 0.5953935613812819, "grad_norm": 0.39857929944992065, "learning_rate": 3.846736743329246e-06, "loss": 0.02367999590933323, "memory(GiB)": 21.48, "step": 18328, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.5954260468440373, "grad_norm": 0.38657712936401367, "learning_rate": 3.846214080760413e-06, "loss": 0.013967229053378105, "memory(GiB)": 21.48, "step": 18329, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956748 }, { "epoch": 0.5954585323067927, "grad_norm": 1.3098710775375366, "learning_rate": 3.845691431507805e-06, "loss": 0.02152319997549057, "memory(GiB)": 21.48, "step": 18330, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.5954910177695482, "grad_norm": 0.3129602372646332, "learning_rate": 3.845168795577454e-06, "loss": 0.013753235340118408, "memory(GiB)": 21.48, "step": 18331, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.5955235032323035, "grad_norm": 0.7176083922386169, "learning_rate": 3.844646172975393e-06, "loss": 0.019575249403715134, "memory(GiB)": 21.48, "step": 18332, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.595555988695059, "grad_norm": 0.24657806754112244, "learning_rate": 3.844123563707653e-06, "loss": 0.011118723079562187, "memory(GiB)": 21.48, "step": 18333, "token_acc": 1.0, "train_speed(iter/s)": 0.956792 }, { "epoch": 0.5955884741578144, "grad_norm": 0.3392619788646698, "learning_rate": 3.8436009677802645e-06, "loss": 0.01939368061721325, "memory(GiB)": 21.48, "step": 18334, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.5956209596205698, "grad_norm": 0.5817772150039673, "learning_rate": 3.8430783851992615e-06, "loss": 0.026242133229970932, "memory(GiB)": 21.48, "step": 18335, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.5956534450833252, "grad_norm": 0.35739824175834656, "learning_rate": 3.842555815970672e-06, "loss": 0.013995162211358547, "memory(GiB)": 21.48, "step": 18336, "token_acc": 1.0, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.5956859305460807, "grad_norm": 0.34166792035102844, "learning_rate": 3.842033260100531e-06, "loss": 0.016605325043201447, "memory(GiB)": 21.48, "step": 18337, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.595718416008836, "grad_norm": 0.33667466044425964, "learning_rate": 3.8415107175948655e-06, "loss": 0.018540501594543457, "memory(GiB)": 21.48, "step": 18338, "token_acc": 1.0, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.5957509014715915, "grad_norm": 0.47125136852264404, "learning_rate": 3.84098818845971e-06, "loss": 0.024082496762275696, "memory(GiB)": 21.48, "step": 18339, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.5957833869343468, "grad_norm": 0.30683159828186035, "learning_rate": 3.840465672701092e-06, "loss": 0.016413897275924683, "memory(GiB)": 21.48, "step": 18340, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.5958158723971023, "grad_norm": 0.3803374469280243, "learning_rate": 3.839943170325045e-06, "loss": 0.01723967120051384, "memory(GiB)": 21.48, "step": 18341, "token_acc": 1.0, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.5958483578598577, "grad_norm": 0.387236088514328, "learning_rate": 3.839420681337596e-06, "loss": 0.021207481622695923, "memory(GiB)": 21.48, "step": 18342, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.5958808433226132, "grad_norm": 0.28318503499031067, "learning_rate": 3.838898205744779e-06, "loss": 0.01284240186214447, "memory(GiB)": 21.48, "step": 18343, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.5959133287853685, "grad_norm": 0.33665570616722107, "learning_rate": 3.8383757435526206e-06, "loss": 0.011957624927163124, "memory(GiB)": 21.48, "step": 18344, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.595945814248124, "grad_norm": 0.3357461392879486, "learning_rate": 3.837853294767154e-06, "loss": 0.019166599959135056, "memory(GiB)": 21.48, "step": 18345, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.956922 }, { "epoch": 0.5959782997108793, "grad_norm": 0.28855857253074646, "learning_rate": 3.837330859394404e-06, "loss": 0.01656065136194229, "memory(GiB)": 21.48, "step": 18346, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.5960107851736348, "grad_norm": 0.34819743037223816, "learning_rate": 3.836808437440407e-06, "loss": 0.01629798300564289, "memory(GiB)": 21.48, "step": 18347, "token_acc": 1.0, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.5960432706363902, "grad_norm": 0.4099716544151306, "learning_rate": 3.836286028911186e-06, "loss": 0.015501494519412518, "memory(GiB)": 21.48, "step": 18348, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.5960757560991456, "grad_norm": 0.26412418484687805, "learning_rate": 3.835763633812773e-06, "loss": 0.013590593822300434, "memory(GiB)": 21.48, "step": 18349, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.596108241561901, "grad_norm": 0.2789568305015564, "learning_rate": 3.835241252151201e-06, "loss": 0.017830846831202507, "memory(GiB)": 21.48, "step": 18350, "token_acc": 1.0, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.5961407270246565, "grad_norm": 0.24175626039505005, "learning_rate": 3.834718883932491e-06, "loss": 0.011135273613035679, "memory(GiB)": 21.48, "step": 18351, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956989 }, { "epoch": 0.5961732124874118, "grad_norm": 0.3371800184249878, "learning_rate": 3.8341965291626795e-06, "loss": 0.018084663897752762, "memory(GiB)": 21.48, "step": 18352, "token_acc": 1.0, "train_speed(iter/s)": 0.957 }, { "epoch": 0.5962056979501673, "grad_norm": 0.2751951217651367, "learning_rate": 3.833674187847788e-06, "loss": 0.018275490030646324, "memory(GiB)": 21.48, "step": 18353, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957011 }, { "epoch": 0.5962381834129227, "grad_norm": 0.32427722215652466, "learning_rate": 3.8331518599938525e-06, "loss": 0.01342684030532837, "memory(GiB)": 21.48, "step": 18354, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.5962706688756781, "grad_norm": 0.42268455028533936, "learning_rate": 3.832629545606896e-06, "loss": 0.017497442662715912, "memory(GiB)": 21.48, "step": 18355, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957033 }, { "epoch": 0.5963031543384335, "grad_norm": 0.2707284688949585, "learning_rate": 3.832107244692949e-06, "loss": 0.019037775695323944, "memory(GiB)": 21.48, "step": 18356, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.596335639801189, "grad_norm": 0.36178359389305115, "learning_rate": 3.831584957258037e-06, "loss": 0.01747279427945614, "memory(GiB)": 21.48, "step": 18357, "token_acc": 1.0, "train_speed(iter/s)": 0.957054 }, { "epoch": 0.5963681252639443, "grad_norm": 0.2762938141822815, "learning_rate": 3.831062683308191e-06, "loss": 0.019402727484703064, "memory(GiB)": 21.48, "step": 18358, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.5964006107266998, "grad_norm": 0.3120114207267761, "learning_rate": 3.830540422849437e-06, "loss": 0.013684054836630821, "memory(GiB)": 21.48, "step": 18359, "token_acc": 0.992, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.5964330961894552, "grad_norm": 0.33880698680877686, "learning_rate": 3.830018175887804e-06, "loss": 0.012635217979550362, "memory(GiB)": 21.48, "step": 18360, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.5964655816522106, "grad_norm": 0.1918289214372635, "learning_rate": 3.829495942429318e-06, "loss": 0.011042816564440727, "memory(GiB)": 21.48, "step": 18361, "token_acc": 1.0, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.596498067114966, "grad_norm": 0.6571418046951294, "learning_rate": 3.828973722480007e-06, "loss": 0.01732998713850975, "memory(GiB)": 21.48, "step": 18362, "token_acc": 0.9941176470588236, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.5965305525777215, "grad_norm": 0.4402299225330353, "learning_rate": 3.828451516045897e-06, "loss": 0.0220863688737154, "memory(GiB)": 21.48, "step": 18363, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.5965630380404768, "grad_norm": 0.41185733675956726, "learning_rate": 3.827929323133016e-06, "loss": 0.017554230988025665, "memory(GiB)": 21.48, "step": 18364, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957112 }, { "epoch": 0.5965955235032323, "grad_norm": 0.3196592330932617, "learning_rate": 3.8274071437473915e-06, "loss": 0.013334987685084343, "memory(GiB)": 21.48, "step": 18365, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95712 }, { "epoch": 0.5966280089659877, "grad_norm": 0.3606712222099304, "learning_rate": 3.826884977895048e-06, "loss": 0.013841248117387295, "memory(GiB)": 21.48, "step": 18366, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.5966604944287431, "grad_norm": 0.4194527864456177, "learning_rate": 3.8263628255820154e-06, "loss": 0.023127764463424683, "memory(GiB)": 21.48, "step": 18367, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.5966929798914986, "grad_norm": 0.43519142270088196, "learning_rate": 3.825840686814316e-06, "loss": 0.020356589928269386, "memory(GiB)": 21.48, "step": 18368, "token_acc": 1.0, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.596725465354254, "grad_norm": 0.3352220356464386, "learning_rate": 3.8253185615979785e-06, "loss": 0.018883485347032547, "memory(GiB)": 21.48, "step": 18369, "token_acc": 0.9963898916967509, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.5967579508170094, "grad_norm": 0.34083202481269836, "learning_rate": 3.824796449939028e-06, "loss": 0.01801772229373455, "memory(GiB)": 21.48, "step": 18370, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.5967904362797648, "grad_norm": 0.3253859579563141, "learning_rate": 3.824274351843491e-06, "loss": 0.017819317057728767, "memory(GiB)": 21.48, "step": 18371, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957163 }, { "epoch": 0.5968229217425203, "grad_norm": 0.35538408160209656, "learning_rate": 3.823752267317392e-06, "loss": 0.007100886199623346, "memory(GiB)": 21.48, "step": 18372, "token_acc": 1.0, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.5968554072052756, "grad_norm": 0.45107996463775635, "learning_rate": 3.82323019636676e-06, "loss": 0.016654105857014656, "memory(GiB)": 21.48, "step": 18373, "token_acc": 1.0, "train_speed(iter/s)": 0.957179 }, { "epoch": 0.5968878926680311, "grad_norm": 0.5023375153541565, "learning_rate": 3.822708138997615e-06, "loss": 0.018585633486509323, "memory(GiB)": 21.48, "step": 18374, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957186 }, { "epoch": 0.5969203781307865, "grad_norm": 0.3130742311477661, "learning_rate": 3.822186095215987e-06, "loss": 0.017631975933909416, "memory(GiB)": 21.48, "step": 18375, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.5969528635935419, "grad_norm": 0.3021019995212555, "learning_rate": 3.821664065027897e-06, "loss": 0.011470569297671318, "memory(GiB)": 21.48, "step": 18376, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.5969853490562973, "grad_norm": 0.378129780292511, "learning_rate": 3.821142048439374e-06, "loss": 0.023001965135335922, "memory(GiB)": 21.48, "step": 18377, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.5970178345190528, "grad_norm": 0.2839696705341339, "learning_rate": 3.82062004545644e-06, "loss": 0.016108045354485512, "memory(GiB)": 21.48, "step": 18378, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.5970503199818081, "grad_norm": 0.33905029296875, "learning_rate": 3.820098056085119e-06, "loss": 0.017630048096179962, "memory(GiB)": 21.48, "step": 18379, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.5970828054445636, "grad_norm": 0.3808140456676483, "learning_rate": 3.81957608033144e-06, "loss": 0.01741502434015274, "memory(GiB)": 21.48, "step": 18380, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.597115290907319, "grad_norm": 0.27585330605506897, "learning_rate": 3.81905411820142e-06, "loss": 0.018430817872285843, "memory(GiB)": 21.48, "step": 18381, "token_acc": 1.0, "train_speed(iter/s)": 0.957247 }, { "epoch": 0.5971477763700744, "grad_norm": 0.29331153631210327, "learning_rate": 3.818532169701093e-06, "loss": 0.016087036579847336, "memory(GiB)": 21.48, "step": 18382, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957256 }, { "epoch": 0.5971802618328298, "grad_norm": 0.3632010817527771, "learning_rate": 3.818010234836473e-06, "loss": 0.01843392476439476, "memory(GiB)": 21.48, "step": 18383, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.5972127472955853, "grad_norm": 0.43672314286231995, "learning_rate": 3.8174883136135895e-06, "loss": 0.01793554425239563, "memory(GiB)": 21.48, "step": 18384, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.957274 }, { "epoch": 0.5972452327583406, "grad_norm": 0.3871716856956482, "learning_rate": 3.816966406038464e-06, "loss": 0.014973444864153862, "memory(GiB)": 21.48, "step": 18385, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.5972777182210961, "grad_norm": 0.2871021032333374, "learning_rate": 3.816444512117122e-06, "loss": 0.0132759939879179, "memory(GiB)": 21.48, "step": 18386, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.5973102036838515, "grad_norm": 0.3864302933216095, "learning_rate": 3.815922631855584e-06, "loss": 0.024230998009443283, "memory(GiB)": 21.48, "step": 18387, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.5973426891466069, "grad_norm": 0.2815859913825989, "learning_rate": 3.8154007652598765e-06, "loss": 0.018427850678563118, "memory(GiB)": 21.48, "step": 18388, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.957306 }, { "epoch": 0.5973751746093623, "grad_norm": 0.3437252938747406, "learning_rate": 3.81487891233602e-06, "loss": 0.021728230640292168, "memory(GiB)": 21.48, "step": 18389, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.5974076600721178, "grad_norm": 0.42788466811180115, "learning_rate": 3.814357073090039e-06, "loss": 0.016015667468309402, "memory(GiB)": 21.48, "step": 18390, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.5974401455348731, "grad_norm": 0.45785170793533325, "learning_rate": 3.8138352475279542e-06, "loss": 0.020165711641311646, "memory(GiB)": 21.48, "step": 18391, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957336 }, { "epoch": 0.5974726309976286, "grad_norm": 0.4755629301071167, "learning_rate": 3.8133134356557904e-06, "loss": 0.021634342148900032, "memory(GiB)": 21.48, "step": 18392, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.957347 }, { "epoch": 0.597505116460384, "grad_norm": 0.247777059674263, "learning_rate": 3.8127916374795682e-06, "loss": 0.012992789968848228, "memory(GiB)": 21.48, "step": 18393, "token_acc": 0.9841772151898734, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.5975376019231394, "grad_norm": 0.32620635628700256, "learning_rate": 3.812269853005311e-06, "loss": 0.02049718238413334, "memory(GiB)": 21.48, "step": 18394, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.5975700873858948, "grad_norm": 0.2530500292778015, "learning_rate": 3.811748082239041e-06, "loss": 0.014623553492128849, "memory(GiB)": 21.48, "step": 18395, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.5976025728486503, "grad_norm": 0.3186897337436676, "learning_rate": 3.8112263251867794e-06, "loss": 0.015660110861063004, "memory(GiB)": 21.48, "step": 18396, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.957391 }, { "epoch": 0.5976350583114056, "grad_norm": 0.2797468900680542, "learning_rate": 3.8107045818545485e-06, "loss": 0.01886827126145363, "memory(GiB)": 21.48, "step": 18397, "token_acc": 0.99, "train_speed(iter/s)": 0.957401 }, { "epoch": 0.5976675437741611, "grad_norm": 0.263530969619751, "learning_rate": 3.810182852248369e-06, "loss": 0.012092106975615025, "memory(GiB)": 21.48, "step": 18398, "token_acc": 1.0, "train_speed(iter/s)": 0.957412 }, { "epoch": 0.5977000292369165, "grad_norm": 0.3410889208316803, "learning_rate": 3.8096611363742642e-06, "loss": 0.016568001359701157, "memory(GiB)": 21.48, "step": 18399, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.5977325146996719, "grad_norm": 0.291233092546463, "learning_rate": 3.8091394342382528e-06, "loss": 0.014507204294204712, "memory(GiB)": 21.48, "step": 18400, "token_acc": 1.0, "train_speed(iter/s)": 0.957433 }, { "epoch": 0.5977650001624273, "grad_norm": 0.2964802086353302, "learning_rate": 3.8086177458463597e-06, "loss": 0.010566510260105133, "memory(GiB)": 21.48, "step": 18401, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.5977974856251828, "grad_norm": 0.29081177711486816, "learning_rate": 3.8080960712046e-06, "loss": 0.014847762882709503, "memory(GiB)": 21.48, "step": 18402, "token_acc": 1.0, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.5978299710879381, "grad_norm": 0.5116187930107117, "learning_rate": 3.8075744103190022e-06, "loss": 0.020676087588071823, "memory(GiB)": 21.48, "step": 18403, "token_acc": 1.0, "train_speed(iter/s)": 0.957467 }, { "epoch": 0.5978624565506936, "grad_norm": 0.3243648111820221, "learning_rate": 3.807052763195579e-06, "loss": 0.01817021518945694, "memory(GiB)": 21.48, "step": 18404, "token_acc": 1.0, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.597894942013449, "grad_norm": 0.42953386902809143, "learning_rate": 3.8065311298403575e-06, "loss": 0.01618148945271969, "memory(GiB)": 21.48, "step": 18405, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.5979274274762044, "grad_norm": 0.2825144827365875, "learning_rate": 3.8060095102593535e-06, "loss": 0.017575331032276154, "memory(GiB)": 21.48, "step": 18406, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957499 }, { "epoch": 0.5979599129389598, "grad_norm": 0.28723666071891785, "learning_rate": 3.80548790445859e-06, "loss": 0.009934395551681519, "memory(GiB)": 21.48, "step": 18407, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.95751 }, { "epoch": 0.5979923984017153, "grad_norm": 0.4475480616092682, "learning_rate": 3.804966312444085e-06, "loss": 0.020670723170042038, "memory(GiB)": 21.48, "step": 18408, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.5980248838644706, "grad_norm": 0.4056227207183838, "learning_rate": 3.8044447342218606e-06, "loss": 0.014219594188034534, "memory(GiB)": 21.48, "step": 18409, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.5980573693272261, "grad_norm": 0.39849671721458435, "learning_rate": 3.803923169797934e-06, "loss": 0.016004158183932304, "memory(GiB)": 21.48, "step": 18410, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957541 }, { "epoch": 0.5980898547899814, "grad_norm": 0.4371981918811798, "learning_rate": 3.803401619178326e-06, "loss": 0.02585631236433983, "memory(GiB)": 21.48, "step": 18411, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.5981223402527369, "grad_norm": 0.36106953024864197, "learning_rate": 3.802880082369057e-06, "loss": 0.012128127738833427, "memory(GiB)": 21.48, "step": 18412, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957563 }, { "epoch": 0.5981548257154923, "grad_norm": 0.3811551630496979, "learning_rate": 3.8023585593761444e-06, "loss": 0.01636703684926033, "memory(GiB)": 21.48, "step": 18413, "token_acc": 1.0, "train_speed(iter/s)": 0.957573 }, { "epoch": 0.5981873111782477, "grad_norm": 0.2785488963127136, "learning_rate": 3.801837050205609e-06, "loss": 0.015042094513773918, "memory(GiB)": 21.48, "step": 18414, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.5982197966410031, "grad_norm": 0.38382887840270996, "learning_rate": 3.8013155548634673e-06, "loss": 0.017832133919000626, "memory(GiB)": 21.48, "step": 18415, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.5982522821037586, "grad_norm": 0.6098207831382751, "learning_rate": 3.8007940733557412e-06, "loss": 0.0338507704436779, "memory(GiB)": 21.48, "step": 18416, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.5982847675665139, "grad_norm": 0.4704051911830902, "learning_rate": 3.8002726056884467e-06, "loss": 0.024640455842018127, "memory(GiB)": 21.48, "step": 18417, "token_acc": 0.988, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.5983172530292694, "grad_norm": 0.37383517622947693, "learning_rate": 3.7997511518676035e-06, "loss": 0.01835152879357338, "memory(GiB)": 21.48, "step": 18418, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.5983497384920248, "grad_norm": 0.3506520986557007, "learning_rate": 3.7992297118992283e-06, "loss": 0.013997260481119156, "memory(GiB)": 21.48, "step": 18419, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.5983822239547802, "grad_norm": 0.33540788292884827, "learning_rate": 3.798708285789342e-06, "loss": 0.017014173790812492, "memory(GiB)": 21.48, "step": 18420, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.957638 }, { "epoch": 0.5984147094175356, "grad_norm": 0.4719250500202179, "learning_rate": 3.7981868735439604e-06, "loss": 0.01873769983649254, "memory(GiB)": 21.48, "step": 18421, "token_acc": 0.996, "train_speed(iter/s)": 0.957647 }, { "epoch": 0.5984471948802911, "grad_norm": 0.3672448396682739, "learning_rate": 3.7976654751691024e-06, "loss": 0.015715789049863815, "memory(GiB)": 21.48, "step": 18422, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.5984796803430464, "grad_norm": 0.3406708538532257, "learning_rate": 3.797144090670784e-06, "loss": 0.013159187510609627, "memory(GiB)": 21.48, "step": 18423, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.5985121658058019, "grad_norm": 0.3306277394294739, "learning_rate": 3.7966227200550267e-06, "loss": 0.018402855843305588, "memory(GiB)": 21.48, "step": 18424, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.5985446512685573, "grad_norm": 0.7642738223075867, "learning_rate": 3.796101363327841e-06, "loss": 0.017238035798072815, "memory(GiB)": 21.48, "step": 18425, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.5985771367313127, "grad_norm": 0.40498241782188416, "learning_rate": 3.795580020495249e-06, "loss": 0.01490348856896162, "memory(GiB)": 21.48, "step": 18426, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.5986096221940681, "grad_norm": 0.38954535126686096, "learning_rate": 3.7950586915632677e-06, "loss": 0.015521189197897911, "memory(GiB)": 21.48, "step": 18427, "token_acc": 1.0, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.5986421076568236, "grad_norm": 0.31315040588378906, "learning_rate": 3.7945373765379116e-06, "loss": 0.01780039630830288, "memory(GiB)": 21.48, "step": 18428, "token_acc": 1.0, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.5986745931195789, "grad_norm": 0.3738812208175659, "learning_rate": 3.7940160754252016e-06, "loss": 0.017169471830129623, "memory(GiB)": 21.48, "step": 18429, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.5987070785823344, "grad_norm": 0.27035367488861084, "learning_rate": 3.793494788231147e-06, "loss": 0.011131983250379562, "memory(GiB)": 21.48, "step": 18430, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.5987395640450898, "grad_norm": 0.6022470593452454, "learning_rate": 3.7929735149617727e-06, "loss": 0.017389707267284393, "memory(GiB)": 21.48, "step": 18431, "token_acc": 1.0, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.5987720495078452, "grad_norm": 0.3501394987106323, "learning_rate": 3.7924522556230866e-06, "loss": 0.01946643926203251, "memory(GiB)": 21.48, "step": 18432, "token_acc": 1.0, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.5988045349706007, "grad_norm": 0.2854262590408325, "learning_rate": 3.791931010221112e-06, "loss": 0.011677930131554604, "memory(GiB)": 21.48, "step": 18433, "token_acc": 1.0, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.5988370204333561, "grad_norm": 0.3787858784198761, "learning_rate": 3.79140977876186e-06, "loss": 0.017925601452589035, "memory(GiB)": 21.48, "step": 18434, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957749 }, { "epoch": 0.5988695058961115, "grad_norm": 0.4105556011199951, "learning_rate": 3.790888561251348e-06, "loss": 0.01939459517598152, "memory(GiB)": 21.48, "step": 18435, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957757 }, { "epoch": 0.5989019913588669, "grad_norm": 0.4636725187301636, "learning_rate": 3.790367357695591e-06, "loss": 0.020396698266267776, "memory(GiB)": 21.48, "step": 18436, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957764 }, { "epoch": 0.5989344768216224, "grad_norm": 0.38605359196662903, "learning_rate": 3.789846168100605e-06, "loss": 0.0190263744443655, "memory(GiB)": 21.48, "step": 18437, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.957773 }, { "epoch": 0.5989669622843777, "grad_norm": 0.2920357882976532, "learning_rate": 3.789324992472404e-06, "loss": 0.015814295038580894, "memory(GiB)": 21.48, "step": 18438, "token_acc": 1.0, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.5989994477471332, "grad_norm": 0.3433573246002197, "learning_rate": 3.7888038308170054e-06, "loss": 0.016993673518300056, "memory(GiB)": 21.48, "step": 18439, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.95779 }, { "epoch": 0.5990319332098886, "grad_norm": 0.5078611969947815, "learning_rate": 3.7882826831404217e-06, "loss": 0.014863662421703339, "memory(GiB)": 21.48, "step": 18440, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957798 }, { "epoch": 0.599064418672644, "grad_norm": 0.46156948804855347, "learning_rate": 3.7877615494486684e-06, "loss": 0.016652995720505714, "memory(GiB)": 21.48, "step": 18441, "token_acc": 1.0, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.5990969041353994, "grad_norm": 0.37756556272506714, "learning_rate": 3.7872404297477615e-06, "loss": 0.019416389986872673, "memory(GiB)": 21.48, "step": 18442, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957816 }, { "epoch": 0.5991293895981549, "grad_norm": 0.4294869005680084, "learning_rate": 3.7867193240437127e-06, "loss": 0.015768110752105713, "memory(GiB)": 21.48, "step": 18443, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.5991618750609102, "grad_norm": 0.3036177158355713, "learning_rate": 3.7861982323425395e-06, "loss": 0.018642373383045197, "memory(GiB)": 21.48, "step": 18444, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957833 }, { "epoch": 0.5991943605236657, "grad_norm": 0.4401935040950775, "learning_rate": 3.785677154650253e-06, "loss": 0.02043086662888527, "memory(GiB)": 21.48, "step": 18445, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957842 }, { "epoch": 0.5992268459864211, "grad_norm": 0.48281219601631165, "learning_rate": 3.785156090972869e-06, "loss": 0.016776470467448235, "memory(GiB)": 21.48, "step": 18446, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.5992593314491765, "grad_norm": 0.3306262493133545, "learning_rate": 3.7846350413164e-06, "loss": 0.015156509354710579, "memory(GiB)": 21.48, "step": 18447, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.5992918169119319, "grad_norm": 0.33620965480804443, "learning_rate": 3.7841140056868618e-06, "loss": 0.020264022052288055, "memory(GiB)": 21.48, "step": 18448, "token_acc": 1.0, "train_speed(iter/s)": 0.957865 }, { "epoch": 0.5993243023746874, "grad_norm": 0.4016658663749695, "learning_rate": 3.7835929840902645e-06, "loss": 0.015951603651046753, "memory(GiB)": 21.48, "step": 18449, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.5993567878374427, "grad_norm": 0.3144795894622803, "learning_rate": 3.7830719765326247e-06, "loss": 0.017913546413183212, "memory(GiB)": 21.48, "step": 18450, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957883 }, { "epoch": 0.5993892733001982, "grad_norm": 0.4397820234298706, "learning_rate": 3.7825509830199525e-06, "loss": 0.018631547689437866, "memory(GiB)": 21.48, "step": 18451, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.957891 }, { "epoch": 0.5994217587629536, "grad_norm": 0.7410096526145935, "learning_rate": 3.782030003558265e-06, "loss": 0.026490308344364166, "memory(GiB)": 21.48, "step": 18452, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.599454244225709, "grad_norm": 0.44175559282302856, "learning_rate": 3.781509038153569e-06, "loss": 0.026845166459679604, "memory(GiB)": 21.48, "step": 18453, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.957912 }, { "epoch": 0.5994867296884644, "grad_norm": 0.27904245257377625, "learning_rate": 3.7809880868118837e-06, "loss": 0.0161756444722414, "memory(GiB)": 21.48, "step": 18454, "token_acc": 1.0, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.5995192151512199, "grad_norm": 0.3558316230773926, "learning_rate": 3.7804671495392155e-06, "loss": 0.01713109388947487, "memory(GiB)": 21.48, "step": 18455, "token_acc": 1.0, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.5995517006139752, "grad_norm": 0.4669761061668396, "learning_rate": 3.7799462263415798e-06, "loss": 0.025623489171266556, "memory(GiB)": 21.48, "step": 18456, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.5995841860767307, "grad_norm": 0.45734211802482605, "learning_rate": 3.779425317224991e-06, "loss": 0.01817585900425911, "memory(GiB)": 21.48, "step": 18457, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957956 }, { "epoch": 0.5996166715394861, "grad_norm": 0.3365108370780945, "learning_rate": 3.778904422195456e-06, "loss": 0.01937483251094818, "memory(GiB)": 21.48, "step": 18458, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.5996491570022415, "grad_norm": 0.34987571835517883, "learning_rate": 3.7783835412589918e-06, "loss": 0.014737145975232124, "memory(GiB)": 21.48, "step": 18459, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957979 }, { "epoch": 0.5996816424649969, "grad_norm": 0.3523600101470947, "learning_rate": 3.7778626744216045e-06, "loss": 0.018476776778697968, "memory(GiB)": 21.48, "step": 18460, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.5997141279277524, "grad_norm": 1.115429401397705, "learning_rate": 3.7773418216893122e-06, "loss": 0.026365285739302635, "memory(GiB)": 21.48, "step": 18461, "token_acc": 1.0, "train_speed(iter/s)": 0.958 }, { "epoch": 0.5997466133905077, "grad_norm": 0.4102684557437897, "learning_rate": 3.77682098306812e-06, "loss": 0.01783210039138794, "memory(GiB)": 21.48, "step": 18462, "token_acc": 1.0, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.5997790988532632, "grad_norm": 0.538691520690918, "learning_rate": 3.776300158564043e-06, "loss": 0.021565861999988556, "memory(GiB)": 21.48, "step": 18463, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.5998115843160186, "grad_norm": 0.29186776280403137, "learning_rate": 3.7757793481830895e-06, "loss": 0.01151338778436184, "memory(GiB)": 21.48, "step": 18464, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958033 }, { "epoch": 0.599844069778774, "grad_norm": 0.27252325415611267, "learning_rate": 3.7752585519312735e-06, "loss": 0.015164094977080822, "memory(GiB)": 21.48, "step": 18465, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.5998765552415294, "grad_norm": 0.2767678499221802, "learning_rate": 3.774737769814602e-06, "loss": 0.011929803527891636, "memory(GiB)": 21.48, "step": 18466, "token_acc": 1.0, "train_speed(iter/s)": 0.958055 }, { "epoch": 0.5999090407042849, "grad_norm": 0.4488244652748108, "learning_rate": 3.774217001839089e-06, "loss": 0.016356755048036575, "memory(GiB)": 21.48, "step": 18467, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.958065 }, { "epoch": 0.5999415261670402, "grad_norm": 0.4600907862186432, "learning_rate": 3.7736962480107424e-06, "loss": 0.022754022851586342, "memory(GiB)": 21.48, "step": 18468, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.5999740116297957, "grad_norm": 0.4685324728488922, "learning_rate": 3.7731755083355744e-06, "loss": 0.019070304930210114, "memory(GiB)": 21.48, "step": 18469, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.600006497092551, "grad_norm": 0.3284624218940735, "learning_rate": 3.7726547828195924e-06, "loss": 0.01059524342417717, "memory(GiB)": 21.48, "step": 18470, "token_acc": 1.0, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.6000389825553065, "grad_norm": 0.3068932294845581, "learning_rate": 3.772134071468808e-06, "loss": 0.013494878076016903, "memory(GiB)": 21.48, "step": 18471, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958108 }, { "epoch": 0.6000714680180619, "grad_norm": 0.27473053336143494, "learning_rate": 3.7716133742892317e-06, "loss": 0.01739323139190674, "memory(GiB)": 21.48, "step": 18472, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.6001039534808174, "grad_norm": 0.38494858145713806, "learning_rate": 3.771092691286871e-06, "loss": 0.015101799741387367, "memory(GiB)": 21.48, "step": 18473, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.958129 }, { "epoch": 0.6001364389435727, "grad_norm": 0.29621848464012146, "learning_rate": 3.770572022467738e-06, "loss": 0.017137957736849785, "memory(GiB)": 21.48, "step": 18474, "token_acc": 1.0, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.6001689244063282, "grad_norm": 0.282470703125, "learning_rate": 3.770051367837838e-06, "loss": 0.009947095066308975, "memory(GiB)": 21.48, "step": 18475, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958151 }, { "epoch": 0.6002014098690835, "grad_norm": 0.3257531523704529, "learning_rate": 3.7695307274031846e-06, "loss": 0.01646912470459938, "memory(GiB)": 21.48, "step": 18476, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.600233895331839, "grad_norm": 0.5878206491470337, "learning_rate": 3.769010101169782e-06, "loss": 0.020648520439863205, "memory(GiB)": 21.48, "step": 18477, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958169 }, { "epoch": 0.6002663807945944, "grad_norm": 0.4684877395629883, "learning_rate": 3.7684894891436435e-06, "loss": 0.029588347300887108, "memory(GiB)": 21.48, "step": 18478, "token_acc": 0.995, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.6002988662573499, "grad_norm": 0.3770171105861664, "learning_rate": 3.7679688913307737e-06, "loss": 0.015535054728388786, "memory(GiB)": 21.48, "step": 18479, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958185 }, { "epoch": 0.6003313517201052, "grad_norm": 0.3556128740310669, "learning_rate": 3.7674483077371848e-06, "loss": 0.011854125186800957, "memory(GiB)": 21.48, "step": 18480, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958193 }, { "epoch": 0.6003638371828607, "grad_norm": 0.26609501242637634, "learning_rate": 3.7669277383688802e-06, "loss": 0.011723553761839867, "memory(GiB)": 21.48, "step": 18481, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.600396322645616, "grad_norm": 0.43450161814689636, "learning_rate": 3.766407183231874e-06, "loss": 0.022080469876527786, "memory(GiB)": 21.48, "step": 18482, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958209 }, { "epoch": 0.6004288081083715, "grad_norm": 0.3442428410053253, "learning_rate": 3.765886642332167e-06, "loss": 0.02014078199863434, "memory(GiB)": 21.48, "step": 18483, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.6004612935711269, "grad_norm": 0.44444021582603455, "learning_rate": 3.7653661156757743e-06, "loss": 0.02336299791932106, "memory(GiB)": 21.48, "step": 18484, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.6004937790338823, "grad_norm": 0.38840940594673157, "learning_rate": 3.7648456032686974e-06, "loss": 0.016837120056152344, "memory(GiB)": 21.48, "step": 18485, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.6005262644966377, "grad_norm": 0.553118109703064, "learning_rate": 3.764325105116948e-06, "loss": 0.02000357210636139, "memory(GiB)": 21.48, "step": 18486, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.6005587499593932, "grad_norm": 0.2681540548801422, "learning_rate": 3.76380462122653e-06, "loss": 0.009076415561139584, "memory(GiB)": 21.48, "step": 18487, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958248 }, { "epoch": 0.6005912354221485, "grad_norm": 0.9212161302566528, "learning_rate": 3.763284151603451e-06, "loss": 0.017946161329746246, "memory(GiB)": 21.48, "step": 18488, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958255 }, { "epoch": 0.600623720884904, "grad_norm": 0.43365761637687683, "learning_rate": 3.762763696253723e-06, "loss": 0.028787778690457344, "memory(GiB)": 21.48, "step": 18489, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958263 }, { "epoch": 0.6006562063476594, "grad_norm": 0.3505837917327881, "learning_rate": 3.7622432551833454e-06, "loss": 0.020631715655326843, "memory(GiB)": 21.48, "step": 18490, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958271 }, { "epoch": 0.6006886918104148, "grad_norm": 0.27154314517974854, "learning_rate": 3.7617228283983297e-06, "loss": 0.010633712634444237, "memory(GiB)": 21.48, "step": 18491, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.6007211772731702, "grad_norm": 0.34202849864959717, "learning_rate": 3.7612024159046795e-06, "loss": 0.018837662413716316, "memory(GiB)": 21.48, "step": 18492, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.6007536627359257, "grad_norm": 0.32735997438430786, "learning_rate": 3.7606820177084034e-06, "loss": 0.020059704780578613, "memory(GiB)": 21.48, "step": 18493, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.958295 }, { "epoch": 0.600786148198681, "grad_norm": 0.41045403480529785, "learning_rate": 3.760161633815505e-06, "loss": 0.01727999560534954, "memory(GiB)": 21.48, "step": 18494, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.6008186336614365, "grad_norm": 0.5110583305358887, "learning_rate": 3.7596412642319927e-06, "loss": 0.02202475257217884, "memory(GiB)": 21.48, "step": 18495, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958311 }, { "epoch": 0.600851119124192, "grad_norm": 0.28450658917427063, "learning_rate": 3.7591209089638703e-06, "loss": 0.01497914083302021, "memory(GiB)": 21.48, "step": 18496, "token_acc": 1.0, "train_speed(iter/s)": 0.958319 }, { "epoch": 0.6008836045869473, "grad_norm": 0.4221402406692505, "learning_rate": 3.7586005680171455e-06, "loss": 0.01813594251871109, "memory(GiB)": 21.48, "step": 18497, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958327 }, { "epoch": 0.6009160900497028, "grad_norm": 0.36740633845329285, "learning_rate": 3.758080241397821e-06, "loss": 0.014995909295976162, "memory(GiB)": 21.48, "step": 18498, "token_acc": 1.0, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.6009485755124582, "grad_norm": 0.7629613876342773, "learning_rate": 3.7575599291119045e-06, "loss": 0.027508264407515526, "memory(GiB)": 21.48, "step": 18499, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.958344 }, { "epoch": 0.6009810609752136, "grad_norm": 0.4453912675380707, "learning_rate": 3.757039631165399e-06, "loss": 0.024851977825164795, "memory(GiB)": 21.48, "step": 18500, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958353 }, { "epoch": 0.6009810609752136, "eval_loss": 0.01909034512937069, "eval_runtime": 80.7033, "eval_samples_per_second": 123.291, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9922741227731472, "step": 18500 }, { "epoch": 0.601013546437969, "grad_norm": 0.3137615919113159, "learning_rate": 3.756519347564312e-06, "loss": 0.01574038341641426, "memory(GiB)": 21.48, "step": 18501, "token_acc": 0.9920265234546252, "train_speed(iter/s)": 0.953818 }, { "epoch": 0.6010460319007245, "grad_norm": 0.4819556176662445, "learning_rate": 3.7559990783146456e-06, "loss": 0.018408121541142464, "memory(GiB)": 21.48, "step": 18502, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.953825 }, { "epoch": 0.6010785173634798, "grad_norm": 0.49701637029647827, "learning_rate": 3.7554788234224054e-06, "loss": 0.01703847199678421, "memory(GiB)": 21.48, "step": 18503, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.953833 }, { "epoch": 0.6011110028262353, "grad_norm": 0.24459147453308105, "learning_rate": 3.7549585828935977e-06, "loss": 0.012799711897969246, "memory(GiB)": 21.48, "step": 18504, "token_acc": 1.0, "train_speed(iter/s)": 0.953842 }, { "epoch": 0.6011434882889907, "grad_norm": 0.4123072028160095, "learning_rate": 3.7544383567342228e-06, "loss": 0.023806916549801826, "memory(GiB)": 21.48, "step": 18505, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.95385 }, { "epoch": 0.6011759737517461, "grad_norm": 0.45491907000541687, "learning_rate": 3.753918144950289e-06, "loss": 0.021772045642137527, "memory(GiB)": 21.48, "step": 18506, "token_acc": 1.0, "train_speed(iter/s)": 0.953857 }, { "epoch": 0.6012084592145015, "grad_norm": 0.3650760352611542, "learning_rate": 3.7533979475477976e-06, "loss": 0.020070362836122513, "memory(GiB)": 21.48, "step": 18507, "token_acc": 1.0, "train_speed(iter/s)": 0.953866 }, { "epoch": 0.601240944677257, "grad_norm": 0.9733743071556091, "learning_rate": 3.752877764532755e-06, "loss": 0.021564211696386337, "memory(GiB)": 21.48, "step": 18508, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.953877 }, { "epoch": 0.6012734301400123, "grad_norm": 0.4529646337032318, "learning_rate": 3.752357595911159e-06, "loss": 0.018255945295095444, "memory(GiB)": 21.48, "step": 18509, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.953887 }, { "epoch": 0.6013059156027678, "grad_norm": 0.3401227593421936, "learning_rate": 3.7518374416890206e-06, "loss": 0.013688872568309307, "memory(GiB)": 21.48, "step": 18510, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.953899 }, { "epoch": 0.6013384010655232, "grad_norm": 0.3894263505935669, "learning_rate": 3.7513173018723363e-06, "loss": 0.014504615217447281, "memory(GiB)": 21.48, "step": 18511, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95391 }, { "epoch": 0.6013708865282786, "grad_norm": 0.4578000605106354, "learning_rate": 3.750797176467116e-06, "loss": 0.026174601167440414, "memory(GiB)": 21.48, "step": 18512, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.953921 }, { "epoch": 0.601403371991034, "grad_norm": 0.441090852022171, "learning_rate": 3.7502770654793563e-06, "loss": 0.018995501101017, "memory(GiB)": 21.48, "step": 18513, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.953932 }, { "epoch": 0.6014358574537895, "grad_norm": 0.4509897530078888, "learning_rate": 3.7497569689150635e-06, "loss": 0.020832261070609093, "memory(GiB)": 21.48, "step": 18514, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.953943 }, { "epoch": 0.6014683429165448, "grad_norm": 0.5069862008094788, "learning_rate": 3.7492368867802387e-06, "loss": 0.0186960119754076, "memory(GiB)": 21.48, "step": 18515, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.953954 }, { "epoch": 0.6015008283793003, "grad_norm": 0.43673422932624817, "learning_rate": 3.7487168190808865e-06, "loss": 0.025741204619407654, "memory(GiB)": 21.48, "step": 18516, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.953964 }, { "epoch": 0.6015333138420557, "grad_norm": 0.5628594756126404, "learning_rate": 3.7481967658230056e-06, "loss": 0.014992522075772285, "memory(GiB)": 21.48, "step": 18517, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.953974 }, { "epoch": 0.6015657993048111, "grad_norm": 0.31546854972839355, "learning_rate": 3.7476767270125993e-06, "loss": 0.013197742402553558, "memory(GiB)": 21.48, "step": 18518, "token_acc": 1.0, "train_speed(iter/s)": 0.953984 }, { "epoch": 0.6015982847675665, "grad_norm": 0.65447598695755, "learning_rate": 3.7471567026556744e-06, "loss": 0.02468121238052845, "memory(GiB)": 21.48, "step": 18519, "token_acc": 1.0, "train_speed(iter/s)": 0.953995 }, { "epoch": 0.601630770230322, "grad_norm": 0.3702625334262848, "learning_rate": 3.7466366927582253e-06, "loss": 0.023757707327604294, "memory(GiB)": 21.48, "step": 18520, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.954007 }, { "epoch": 0.6016632556930773, "grad_norm": 0.2906745970249176, "learning_rate": 3.7461166973262592e-06, "loss": 0.0179887767881155, "memory(GiB)": 21.48, "step": 18521, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954018 }, { "epoch": 0.6016957411558328, "grad_norm": 0.3707658052444458, "learning_rate": 3.7455967163657735e-06, "loss": 0.020228184759616852, "memory(GiB)": 21.48, "step": 18522, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.954029 }, { "epoch": 0.6017282266185882, "grad_norm": 0.4831944406032562, "learning_rate": 3.745076749882773e-06, "loss": 0.02524634823203087, "memory(GiB)": 21.48, "step": 18523, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95404 }, { "epoch": 0.6017607120813436, "grad_norm": 0.38148459792137146, "learning_rate": 3.7445567978832555e-06, "loss": 0.021742533892393112, "memory(GiB)": 21.48, "step": 18524, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.954051 }, { "epoch": 0.601793197544099, "grad_norm": 0.48239582777023315, "learning_rate": 3.7440368603732247e-06, "loss": 0.03141840174794197, "memory(GiB)": 21.48, "step": 18525, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954062 }, { "epoch": 0.6018256830068545, "grad_norm": 0.2785313129425049, "learning_rate": 3.7435169373586792e-06, "loss": 0.011853142641484737, "memory(GiB)": 21.48, "step": 18526, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954073 }, { "epoch": 0.6018581684696098, "grad_norm": 0.35513144731521606, "learning_rate": 3.742997028845622e-06, "loss": 0.01678159460425377, "memory(GiB)": 21.48, "step": 18527, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954084 }, { "epoch": 0.6018906539323653, "grad_norm": 0.39354172348976135, "learning_rate": 3.742477134840051e-06, "loss": 0.024924051016569138, "memory(GiB)": 21.48, "step": 18528, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.954094 }, { "epoch": 0.6019231393951207, "grad_norm": 0.26729926466941833, "learning_rate": 3.7419572553479683e-06, "loss": 0.014505771920084953, "memory(GiB)": 21.48, "step": 18529, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.954105 }, { "epoch": 0.6019556248578761, "grad_norm": 0.24587121605873108, "learning_rate": 3.7414373903753726e-06, "loss": 0.014395173639059067, "memory(GiB)": 21.48, "step": 18530, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954103 }, { "epoch": 0.6019881103206315, "grad_norm": 0.31366777420043945, "learning_rate": 3.7409175399282666e-06, "loss": 0.017493702471256256, "memory(GiB)": 21.48, "step": 18531, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954114 }, { "epoch": 0.602020595783387, "grad_norm": 0.33277037739753723, "learning_rate": 3.740397704012645e-06, "loss": 0.017158102244138718, "memory(GiB)": 21.48, "step": 18532, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.954125 }, { "epoch": 0.6020530812461423, "grad_norm": 0.3428885340690613, "learning_rate": 3.7398778826345107e-06, "loss": 0.016865063458681107, "memory(GiB)": 21.48, "step": 18533, "token_acc": 0.9942857142857143, "train_speed(iter/s)": 0.954136 }, { "epoch": 0.6020855667088978, "grad_norm": 0.3363772928714752, "learning_rate": 3.739358075799865e-06, "loss": 0.019579501822590828, "memory(GiB)": 21.48, "step": 18534, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.954147 }, { "epoch": 0.6021180521716532, "grad_norm": 0.30974680185317993, "learning_rate": 3.738838283514703e-06, "loss": 0.01852806657552719, "memory(GiB)": 21.48, "step": 18535, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954157 }, { "epoch": 0.6021505376344086, "grad_norm": 0.3975396156311035, "learning_rate": 3.738318505785028e-06, "loss": 0.02357013151049614, "memory(GiB)": 21.48, "step": 18536, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954165 }, { "epoch": 0.602183023097164, "grad_norm": 0.3845890164375305, "learning_rate": 3.737798742616835e-06, "loss": 0.020444506779313087, "memory(GiB)": 21.48, "step": 18537, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954174 }, { "epoch": 0.6022155085599195, "grad_norm": 0.2774568200111389, "learning_rate": 3.737278994016126e-06, "loss": 0.013530894182622433, "memory(GiB)": 21.48, "step": 18538, "token_acc": 1.0, "train_speed(iter/s)": 0.954182 }, { "epoch": 0.6022479940226748, "grad_norm": 0.2938855290412903, "learning_rate": 3.736759259988896e-06, "loss": 0.014220686629414558, "memory(GiB)": 21.48, "step": 18539, "token_acc": 1.0, "train_speed(iter/s)": 0.95419 }, { "epoch": 0.6022804794854303, "grad_norm": 0.3406785726547241, "learning_rate": 3.736239540541148e-06, "loss": 0.01814216375350952, "memory(GiB)": 21.48, "step": 18540, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.954198 }, { "epoch": 0.6023129649481856, "grad_norm": 0.29948803782463074, "learning_rate": 3.735719835678875e-06, "loss": 0.013731978833675385, "memory(GiB)": 21.48, "step": 18541, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.954208 }, { "epoch": 0.6023454504109411, "grad_norm": 0.36921435594558716, "learning_rate": 3.7352001454080804e-06, "loss": 0.01865721121430397, "memory(GiB)": 21.48, "step": 18542, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954217 }, { "epoch": 0.6023779358736965, "grad_norm": 1.4253654479980469, "learning_rate": 3.734680469734758e-06, "loss": 0.021384354680776596, "memory(GiB)": 21.48, "step": 18543, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.954226 }, { "epoch": 0.602410421336452, "grad_norm": 0.5544731616973877, "learning_rate": 3.7341608086649085e-06, "loss": 0.020217591896653175, "memory(GiB)": 21.48, "step": 18544, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.954234 }, { "epoch": 0.6024429067992073, "grad_norm": 0.42612504959106445, "learning_rate": 3.7336411622045264e-06, "loss": 0.021707208827137947, "memory(GiB)": 21.48, "step": 18545, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954242 }, { "epoch": 0.6024753922619628, "grad_norm": 0.3187912404537201, "learning_rate": 3.733121530359612e-06, "loss": 0.021422699093818665, "memory(GiB)": 21.48, "step": 18546, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95425 }, { "epoch": 0.6025078777247181, "grad_norm": 0.29683053493499756, "learning_rate": 3.7326019131361602e-06, "loss": 0.01966366544365883, "memory(GiB)": 21.48, "step": 18547, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954258 }, { "epoch": 0.6025403631874736, "grad_norm": 0.386200487613678, "learning_rate": 3.7320823105401705e-06, "loss": 0.02043284848332405, "memory(GiB)": 21.48, "step": 18548, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954266 }, { "epoch": 0.602572848650229, "grad_norm": 5.0120439529418945, "learning_rate": 3.7315627225776376e-06, "loss": 0.021221863105893135, "memory(GiB)": 21.48, "step": 18549, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954273 }, { "epoch": 0.6026053341129844, "grad_norm": 0.4600837230682373, "learning_rate": 3.7310431492545585e-06, "loss": 0.02762531489133835, "memory(GiB)": 21.48, "step": 18550, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.954281 }, { "epoch": 0.6026378195757398, "grad_norm": 0.3335503041744232, "learning_rate": 3.7305235905769323e-06, "loss": 0.016265980899333954, "memory(GiB)": 21.48, "step": 18551, "token_acc": 1.0, "train_speed(iter/s)": 0.954288 }, { "epoch": 0.6026703050384953, "grad_norm": 0.3642900288105011, "learning_rate": 3.730004046550752e-06, "loss": 0.016284391283988953, "memory(GiB)": 21.48, "step": 18552, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.954296 }, { "epoch": 0.6027027905012506, "grad_norm": 0.3778214752674103, "learning_rate": 3.7294845171820165e-06, "loss": 0.013417474925518036, "memory(GiB)": 21.48, "step": 18553, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954303 }, { "epoch": 0.6027352759640061, "grad_norm": 0.23291732370853424, "learning_rate": 3.72896500247672e-06, "loss": 0.01746443845331669, "memory(GiB)": 21.48, "step": 18554, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954309 }, { "epoch": 0.6027677614267615, "grad_norm": 0.29515182971954346, "learning_rate": 3.7284455024408593e-06, "loss": 0.021916761994361877, "memory(GiB)": 21.48, "step": 18555, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.954316 }, { "epoch": 0.6028002468895169, "grad_norm": 0.4751301407814026, "learning_rate": 3.7279260170804298e-06, "loss": 0.02770417556166649, "memory(GiB)": 21.48, "step": 18556, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.954324 }, { "epoch": 0.6028327323522723, "grad_norm": 0.33475127816200256, "learning_rate": 3.727406546401428e-06, "loss": 0.01958080753684044, "memory(GiB)": 21.48, "step": 18557, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.954331 }, { "epoch": 0.6028652178150278, "grad_norm": 0.24975170195102692, "learning_rate": 3.7268870904098477e-06, "loss": 0.013669885694980621, "memory(GiB)": 21.48, "step": 18558, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.954339 }, { "epoch": 0.6028977032777831, "grad_norm": 0.2391609400510788, "learning_rate": 3.726367649111685e-06, "loss": 0.01565028540790081, "memory(GiB)": 21.48, "step": 18559, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954347 }, { "epoch": 0.6029301887405386, "grad_norm": 0.29194653034210205, "learning_rate": 3.7258482225129345e-06, "loss": 0.019402917474508286, "memory(GiB)": 21.48, "step": 18560, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954354 }, { "epoch": 0.6029626742032941, "grad_norm": 0.5053460597991943, "learning_rate": 3.725328810619594e-06, "loss": 0.022627778351306915, "memory(GiB)": 21.48, "step": 18561, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954362 }, { "epoch": 0.6029951596660494, "grad_norm": 0.30867788195610046, "learning_rate": 3.724809413437651e-06, "loss": 0.019589047878980637, "memory(GiB)": 21.48, "step": 18562, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.954371 }, { "epoch": 0.6030276451288049, "grad_norm": 0.43948546051979065, "learning_rate": 3.724290030973109e-06, "loss": 0.020692333579063416, "memory(GiB)": 21.48, "step": 18563, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954379 }, { "epoch": 0.6030601305915603, "grad_norm": 0.31168055534362793, "learning_rate": 3.7237706632319547e-06, "loss": 0.018572501838207245, "memory(GiB)": 21.48, "step": 18564, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954387 }, { "epoch": 0.6030926160543157, "grad_norm": 0.363116979598999, "learning_rate": 3.723251310220186e-06, "loss": 0.015690818428993225, "memory(GiB)": 21.48, "step": 18565, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.954396 }, { "epoch": 0.6031251015170711, "grad_norm": 0.42998507618904114, "learning_rate": 3.7227319719437992e-06, "loss": 0.019144058227539062, "memory(GiB)": 21.48, "step": 18566, "token_acc": 0.9941520467836257, "train_speed(iter/s)": 0.954404 }, { "epoch": 0.6031575869798266, "grad_norm": 0.3319721519947052, "learning_rate": 3.722212648408782e-06, "loss": 0.020176783204078674, "memory(GiB)": 21.48, "step": 18567, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954413 }, { "epoch": 0.6031900724425819, "grad_norm": 0.4057100713253021, "learning_rate": 3.721693339621135e-06, "loss": 0.020498234778642654, "memory(GiB)": 21.48, "step": 18568, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.954424 }, { "epoch": 0.6032225579053374, "grad_norm": 0.3437201678752899, "learning_rate": 3.721174045586845e-06, "loss": 0.020068982616066933, "memory(GiB)": 21.48, "step": 18569, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954434 }, { "epoch": 0.6032550433680928, "grad_norm": 0.4439484179019928, "learning_rate": 3.720654766311912e-06, "loss": 0.023040182888507843, "memory(GiB)": 21.48, "step": 18570, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954445 }, { "epoch": 0.6032875288308482, "grad_norm": 0.333909273147583, "learning_rate": 3.7201355018023233e-06, "loss": 0.02016582526266575, "memory(GiB)": 21.48, "step": 18571, "token_acc": 1.0, "train_speed(iter/s)": 0.954456 }, { "epoch": 0.6033200142936036, "grad_norm": 0.40376508235931396, "learning_rate": 3.719616252064076e-06, "loss": 0.019867105409502983, "memory(GiB)": 21.48, "step": 18572, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.954466 }, { "epoch": 0.6033524997563591, "grad_norm": 0.2824440598487854, "learning_rate": 3.7190970171031603e-06, "loss": 0.014410636387765408, "memory(GiB)": 21.48, "step": 18573, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954477 }, { "epoch": 0.6033849852191144, "grad_norm": 0.4058716297149658, "learning_rate": 3.7185777969255705e-06, "loss": 0.019135741516947746, "memory(GiB)": 21.48, "step": 18574, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954489 }, { "epoch": 0.6034174706818699, "grad_norm": 0.3875103294849396, "learning_rate": 3.7180585915372974e-06, "loss": 0.017814401537179947, "memory(GiB)": 21.48, "step": 18575, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.954499 }, { "epoch": 0.6034499561446253, "grad_norm": 0.6522023677825928, "learning_rate": 3.7175394009443356e-06, "loss": 0.018985167145729065, "memory(GiB)": 21.48, "step": 18576, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95451 }, { "epoch": 0.6034824416073807, "grad_norm": 0.269993394613266, "learning_rate": 3.7170202251526753e-06, "loss": 0.016884667798876762, "memory(GiB)": 21.48, "step": 18577, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.954521 }, { "epoch": 0.6035149270701361, "grad_norm": 0.5847439169883728, "learning_rate": 3.7165010641683096e-06, "loss": 0.03363237529993057, "memory(GiB)": 21.48, "step": 18578, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.954532 }, { "epoch": 0.6035474125328916, "grad_norm": 0.32594940066337585, "learning_rate": 3.7159819179972297e-06, "loss": 0.01484211441129446, "memory(GiB)": 21.48, "step": 18579, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954543 }, { "epoch": 0.6035798979956469, "grad_norm": 0.38289299607276917, "learning_rate": 3.7154627866454274e-06, "loss": 0.019129885360598564, "memory(GiB)": 21.48, "step": 18580, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954554 }, { "epoch": 0.6036123834584024, "grad_norm": 0.31833067536354065, "learning_rate": 3.7149436701188946e-06, "loss": 0.013535883277654648, "memory(GiB)": 21.48, "step": 18581, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954565 }, { "epoch": 0.6036448689211578, "grad_norm": 0.6097259521484375, "learning_rate": 3.7144245684236212e-06, "loss": 0.0198195930570364, "memory(GiB)": 21.48, "step": 18582, "token_acc": 1.0, "train_speed(iter/s)": 0.954576 }, { "epoch": 0.6036773543839132, "grad_norm": 0.7437098026275635, "learning_rate": 3.7139054815656016e-06, "loss": 0.019150622189044952, "memory(GiB)": 21.48, "step": 18583, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954586 }, { "epoch": 0.6037098398466686, "grad_norm": 0.2553795278072357, "learning_rate": 3.7133864095508226e-06, "loss": 0.01482400856912136, "memory(GiB)": 21.48, "step": 18584, "token_acc": 1.0, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.6037423253094241, "grad_norm": 0.2359156459569931, "learning_rate": 3.7128673523852782e-06, "loss": 0.010530360043048859, "memory(GiB)": 21.48, "step": 18585, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954608 }, { "epoch": 0.6037748107721794, "grad_norm": 0.2951580882072449, "learning_rate": 3.712348310074956e-06, "loss": 0.016035370528697968, "memory(GiB)": 21.48, "step": 18586, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.6038072962349349, "grad_norm": 0.3423806428909302, "learning_rate": 3.7118292826258505e-06, "loss": 0.01890876144170761, "memory(GiB)": 21.48, "step": 18587, "token_acc": 1.0, "train_speed(iter/s)": 0.954628 }, { "epoch": 0.6038397816976903, "grad_norm": 0.30709826946258545, "learning_rate": 3.7113102700439476e-06, "loss": 0.02340155467391014, "memory(GiB)": 21.48, "step": 18588, "token_acc": 1.0, "train_speed(iter/s)": 0.954639 }, { "epoch": 0.6038722671604457, "grad_norm": 0.285329669713974, "learning_rate": 3.710791272335242e-06, "loss": 0.016709022223949432, "memory(GiB)": 21.48, "step": 18589, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.6039047526232011, "grad_norm": 0.41400524973869324, "learning_rate": 3.7102722895057177e-06, "loss": 0.026193393394351006, "memory(GiB)": 21.48, "step": 18590, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95466 }, { "epoch": 0.6039372380859566, "grad_norm": 0.26805606484413147, "learning_rate": 3.7097533215613707e-06, "loss": 0.015965431928634644, "memory(GiB)": 21.48, "step": 18591, "token_acc": 1.0, "train_speed(iter/s)": 0.954671 }, { "epoch": 0.6039697235487119, "grad_norm": 0.2992396950721741, "learning_rate": 3.7092343685081848e-06, "loss": 0.014246770180761814, "memory(GiB)": 21.48, "step": 18592, "token_acc": 1.0, "train_speed(iter/s)": 0.954682 }, { "epoch": 0.6040022090114674, "grad_norm": 0.3110624849796295, "learning_rate": 3.7087154303521557e-06, "loss": 0.013275514356791973, "memory(GiB)": 21.48, "step": 18593, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.6040346944742228, "grad_norm": 0.37524208426475525, "learning_rate": 3.7081965070992664e-06, "loss": 0.019997961819171906, "memory(GiB)": 21.48, "step": 18594, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954704 }, { "epoch": 0.6040671799369782, "grad_norm": 0.3930652439594269, "learning_rate": 3.707677598755508e-06, "loss": 0.020782502368092537, "memory(GiB)": 21.48, "step": 18595, "token_acc": 0.988, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.6040996653997336, "grad_norm": 0.2953687012195587, "learning_rate": 3.707158705326874e-06, "loss": 0.012925590388476849, "memory(GiB)": 21.48, "step": 18596, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.954726 }, { "epoch": 0.6041321508624891, "grad_norm": 0.40446725487709045, "learning_rate": 3.706639826819346e-06, "loss": 0.02008928544819355, "memory(GiB)": 21.48, "step": 18597, "token_acc": 1.0, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.6041646363252444, "grad_norm": 0.2828834652900696, "learning_rate": 3.7061209632389182e-06, "loss": 0.016155902296304703, "memory(GiB)": 21.48, "step": 18598, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954748 }, { "epoch": 0.6041971217879999, "grad_norm": 0.4360370635986328, "learning_rate": 3.7056021145915754e-06, "loss": 0.018673833459615707, "memory(GiB)": 21.48, "step": 18599, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954758 }, { "epoch": 0.6042296072507553, "grad_norm": 0.44061243534088135, "learning_rate": 3.705083280883308e-06, "loss": 0.020503494888544083, "memory(GiB)": 21.48, "step": 18600, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954769 }, { "epoch": 0.6042620927135107, "grad_norm": 0.47975149750709534, "learning_rate": 3.7045644621201018e-06, "loss": 0.021562501788139343, "memory(GiB)": 21.48, "step": 18601, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.954778 }, { "epoch": 0.6042945781762661, "grad_norm": 4.366147994995117, "learning_rate": 3.7040456583079463e-06, "loss": 0.02030225843191147, "memory(GiB)": 21.48, "step": 18602, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954786 }, { "epoch": 0.6043270636390216, "grad_norm": 0.3485536575317383, "learning_rate": 3.7035268694528286e-06, "loss": 0.017757734283804893, "memory(GiB)": 21.48, "step": 18603, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954795 }, { "epoch": 0.6043595491017769, "grad_norm": 0.5068630576133728, "learning_rate": 3.7030080955607373e-06, "loss": 0.018469981849193573, "memory(GiB)": 21.48, "step": 18604, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954804 }, { "epoch": 0.6043920345645324, "grad_norm": 0.29556792974472046, "learning_rate": 3.702489336637658e-06, "loss": 0.015340670011937618, "memory(GiB)": 21.48, "step": 18605, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954812 }, { "epoch": 0.6044245200272877, "grad_norm": 0.2935270667076111, "learning_rate": 3.701970592689579e-06, "loss": 0.015943404287099838, "memory(GiB)": 21.48, "step": 18606, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.954821 }, { "epoch": 0.6044570054900432, "grad_norm": 0.2695549428462982, "learning_rate": 3.7014518637224874e-06, "loss": 0.01379287987947464, "memory(GiB)": 21.48, "step": 18607, "token_acc": 1.0, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.6044894909527986, "grad_norm": 0.3934289515018463, "learning_rate": 3.7009331497423696e-06, "loss": 0.020155441015958786, "memory(GiB)": 21.48, "step": 18608, "token_acc": 0.9786324786324786, "train_speed(iter/s)": 0.954836 }, { "epoch": 0.604521976415554, "grad_norm": 0.3189184367656708, "learning_rate": 3.700414450755211e-06, "loss": 0.014690954238176346, "memory(GiB)": 21.48, "step": 18609, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954843 }, { "epoch": 0.6045544618783094, "grad_norm": 0.29134368896484375, "learning_rate": 3.699895766767001e-06, "loss": 0.012657036073505878, "memory(GiB)": 21.48, "step": 18610, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.6045869473410649, "grad_norm": 0.3772670328617096, "learning_rate": 3.6993770977837235e-06, "loss": 0.01570013538002968, "memory(GiB)": 21.48, "step": 18611, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.954858 }, { "epoch": 0.6046194328038202, "grad_norm": 0.4121182858943939, "learning_rate": 3.6988584438113652e-06, "loss": 0.014526196755468845, "memory(GiB)": 21.48, "step": 18612, "token_acc": 1.0, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.6046519182665757, "grad_norm": 0.3227559030056, "learning_rate": 3.6983398048559126e-06, "loss": 0.01707155629992485, "memory(GiB)": 21.48, "step": 18613, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954872 }, { "epoch": 0.6046844037293311, "grad_norm": 0.548416018486023, "learning_rate": 3.697821180923351e-06, "loss": 0.01608147844672203, "memory(GiB)": 21.48, "step": 18614, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.954879 }, { "epoch": 0.6047168891920865, "grad_norm": 0.4380189776420593, "learning_rate": 3.6973025720196674e-06, "loss": 0.018610022962093353, "memory(GiB)": 21.48, "step": 18615, "token_acc": 0.9939393939393939, "train_speed(iter/s)": 0.954886 }, { "epoch": 0.6047493746548419, "grad_norm": 0.2850392460823059, "learning_rate": 3.6967839781508443e-06, "loss": 0.013068713247776031, "memory(GiB)": 21.48, "step": 18616, "token_acc": 1.0, "train_speed(iter/s)": 0.954893 }, { "epoch": 0.6047818601175974, "grad_norm": 0.3805454969406128, "learning_rate": 3.6962653993228708e-06, "loss": 0.021967139095067978, "memory(GiB)": 21.48, "step": 18617, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.9549 }, { "epoch": 0.6048143455803527, "grad_norm": 0.39430707693099976, "learning_rate": 3.695746835541727e-06, "loss": 0.014114022254943848, "memory(GiB)": 21.48, "step": 18618, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.6048468310431082, "grad_norm": 0.8520832061767578, "learning_rate": 3.6952282868134037e-06, "loss": 0.02269364520907402, "memory(GiB)": 21.48, "step": 18619, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954914 }, { "epoch": 0.6048793165058636, "grad_norm": 0.40253469347953796, "learning_rate": 3.6947097531438796e-06, "loss": 0.013705814257264137, "memory(GiB)": 21.48, "step": 18620, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954921 }, { "epoch": 0.604911801968619, "grad_norm": 0.37188151478767395, "learning_rate": 3.6941912345391453e-06, "loss": 0.017096154391765594, "memory(GiB)": 21.48, "step": 18621, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954929 }, { "epoch": 0.6049442874313744, "grad_norm": 0.3194570541381836, "learning_rate": 3.69367273100518e-06, "loss": 0.02132660523056984, "memory(GiB)": 21.48, "step": 18622, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954937 }, { "epoch": 0.6049767728941299, "grad_norm": 0.38462960720062256, "learning_rate": 3.6931542425479718e-06, "loss": 0.028031297028064728, "memory(GiB)": 21.48, "step": 18623, "token_acc": 0.9867109634551495, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.6050092583568853, "grad_norm": 0.32105958461761475, "learning_rate": 3.692635769173501e-06, "loss": 0.016078408807516098, "memory(GiB)": 21.48, "step": 18624, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954952 }, { "epoch": 0.6050417438196407, "grad_norm": 0.3740910291671753, "learning_rate": 3.692117310887755e-06, "loss": 0.020877983421087265, "memory(GiB)": 21.48, "step": 18625, "token_acc": 0.9760765550239234, "train_speed(iter/s)": 0.954961 }, { "epoch": 0.6050742292823962, "grad_norm": 0.34804949164390564, "learning_rate": 3.6915988676967153e-06, "loss": 0.02067723497748375, "memory(GiB)": 21.48, "step": 18626, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.6051067147451515, "grad_norm": 0.3992592692375183, "learning_rate": 3.691080439606366e-06, "loss": 0.015311047434806824, "memory(GiB)": 21.48, "step": 18627, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.954981 }, { "epoch": 0.605139200207907, "grad_norm": 0.25805041193962097, "learning_rate": 3.690562026622691e-06, "loss": 0.011021215468645096, "memory(GiB)": 21.48, "step": 18628, "token_acc": 1.0, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.6051716856706624, "grad_norm": 0.47996285557746887, "learning_rate": 3.690043628751673e-06, "loss": 0.026756955310702324, "memory(GiB)": 21.48, "step": 18629, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955003 }, { "epoch": 0.6052041711334178, "grad_norm": 0.3601956367492676, "learning_rate": 3.689525245999295e-06, "loss": 0.01913665235042572, "memory(GiB)": 21.48, "step": 18630, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955014 }, { "epoch": 0.6052366565961732, "grad_norm": 0.24574802815914154, "learning_rate": 3.6890068783715395e-06, "loss": 0.013810310512781143, "memory(GiB)": 21.48, "step": 18631, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955024 }, { "epoch": 0.6052691420589287, "grad_norm": 0.24396273493766785, "learning_rate": 3.688488525874391e-06, "loss": 0.011023404076695442, "memory(GiB)": 21.48, "step": 18632, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.955035 }, { "epoch": 0.605301627521684, "grad_norm": 0.2807929515838623, "learning_rate": 3.6879701885138288e-06, "loss": 0.020406175404787064, "memory(GiB)": 21.48, "step": 18633, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.955046 }, { "epoch": 0.6053341129844395, "grad_norm": 0.32420846819877625, "learning_rate": 3.687451866295838e-06, "loss": 0.014184210449457169, "memory(GiB)": 21.48, "step": 18634, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.6053665984471949, "grad_norm": 0.2716328501701355, "learning_rate": 3.686933559226399e-06, "loss": 0.017423346638679504, "memory(GiB)": 21.48, "step": 18635, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.6053990839099503, "grad_norm": 0.31666702032089233, "learning_rate": 3.686415267311495e-06, "loss": 0.01634625345468521, "memory(GiB)": 21.48, "step": 18636, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.6054315693727057, "grad_norm": 0.34461236000061035, "learning_rate": 3.6858969905571067e-06, "loss": 0.01714874990284443, "memory(GiB)": 21.48, "step": 18637, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.6054640548354612, "grad_norm": 0.2864653766155243, "learning_rate": 3.6853787289692173e-06, "loss": 0.014084620401263237, "memory(GiB)": 21.48, "step": 18638, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.9551 }, { "epoch": 0.6054965402982165, "grad_norm": 0.3113546371459961, "learning_rate": 3.684860482553806e-06, "loss": 0.017311349511146545, "memory(GiB)": 21.48, "step": 18639, "token_acc": 1.0, "train_speed(iter/s)": 0.955102 }, { "epoch": 0.605529025760972, "grad_norm": 0.43903422355651855, "learning_rate": 3.684342251316857e-06, "loss": 0.02020515687763691, "memory(GiB)": 21.48, "step": 18640, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.6055615112237274, "grad_norm": 0.37369364500045776, "learning_rate": 3.683824035264346e-06, "loss": 0.018365902826189995, "memory(GiB)": 21.48, "step": 18641, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955124 }, { "epoch": 0.6055939966864828, "grad_norm": 0.33943694829940796, "learning_rate": 3.6833058344022586e-06, "loss": 0.01690964587032795, "memory(GiB)": 21.48, "step": 18642, "token_acc": 1.0, "train_speed(iter/s)": 0.955134 }, { "epoch": 0.6056264821492382, "grad_norm": 0.27860239148139954, "learning_rate": 3.6827876487365767e-06, "loss": 0.01819329522550106, "memory(GiB)": 21.48, "step": 18643, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955145 }, { "epoch": 0.6056589676119937, "grad_norm": 0.3810541033744812, "learning_rate": 3.682269478273276e-06, "loss": 0.017406269907951355, "memory(GiB)": 21.48, "step": 18644, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955156 }, { "epoch": 0.605691453074749, "grad_norm": 0.40547993779182434, "learning_rate": 3.6817513230183423e-06, "loss": 0.015964139252901077, "memory(GiB)": 21.48, "step": 18645, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.6057239385375045, "grad_norm": 0.351285845041275, "learning_rate": 3.681233182977749e-06, "loss": 0.017819520086050034, "memory(GiB)": 21.48, "step": 18646, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955178 }, { "epoch": 0.6057564240002599, "grad_norm": 0.4023198187351227, "learning_rate": 3.6807150581574846e-06, "loss": 0.019486047327518463, "memory(GiB)": 21.48, "step": 18647, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.6057889094630153, "grad_norm": 0.40818604826927185, "learning_rate": 3.6801969485635203e-06, "loss": 0.021444030106067657, "memory(GiB)": 21.48, "step": 18648, "token_acc": 1.0, "train_speed(iter/s)": 0.955199 }, { "epoch": 0.6058213949257707, "grad_norm": 0.2731699049472809, "learning_rate": 3.679678854201843e-06, "loss": 0.01868768036365509, "memory(GiB)": 21.48, "step": 18649, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95521 }, { "epoch": 0.6058538803885262, "grad_norm": 0.4513761103153229, "learning_rate": 3.6791607750784276e-06, "loss": 0.02144809067249298, "memory(GiB)": 21.48, "step": 18650, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.6058863658512815, "grad_norm": 0.25083887577056885, "learning_rate": 3.678642711199255e-06, "loss": 0.010522321797907352, "memory(GiB)": 21.48, "step": 18651, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955232 }, { "epoch": 0.605918851314037, "grad_norm": 0.3767009675502777, "learning_rate": 3.6781246625703038e-06, "loss": 0.01695949397981167, "memory(GiB)": 21.48, "step": 18652, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955243 }, { "epoch": 0.6059513367767924, "grad_norm": 0.28283482789993286, "learning_rate": 3.6776066291975543e-06, "loss": 0.012246795929968357, "memory(GiB)": 21.48, "step": 18653, "token_acc": 1.0, "train_speed(iter/s)": 0.955253 }, { "epoch": 0.6059838222395478, "grad_norm": 0.3581472635269165, "learning_rate": 3.677088611086983e-06, "loss": 0.018714508041739464, "memory(GiB)": 21.48, "step": 18654, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.6060163077023032, "grad_norm": 0.3989756405353546, "learning_rate": 3.6765706082445716e-06, "loss": 0.02498818002641201, "memory(GiB)": 21.48, "step": 18655, "token_acc": 0.9748743718592965, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.6060487931650587, "grad_norm": 0.374419629573822, "learning_rate": 3.676052620676295e-06, "loss": 0.026282934471964836, "memory(GiB)": 21.48, "step": 18656, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.606081278627814, "grad_norm": 0.4143742024898529, "learning_rate": 3.675534648388134e-06, "loss": 0.012358024716377258, "memory(GiB)": 21.48, "step": 18657, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.6061137640905695, "grad_norm": 0.4459090828895569, "learning_rate": 3.6750166913860675e-06, "loss": 0.02338145487010479, "memory(GiB)": 21.48, "step": 18658, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.955305 }, { "epoch": 0.6061462495533249, "grad_norm": 0.3057563602924347, "learning_rate": 3.6744987496760697e-06, "loss": 0.020086374133825302, "memory(GiB)": 21.48, "step": 18659, "token_acc": 1.0, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.6061787350160803, "grad_norm": 0.3644844889640808, "learning_rate": 3.6739808232641225e-06, "loss": 0.019903231412172318, "memory(GiB)": 21.48, "step": 18660, "token_acc": 1.0, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.6062112204788357, "grad_norm": 0.3501110374927521, "learning_rate": 3.6734629121562003e-06, "loss": 0.018230782821774483, "memory(GiB)": 21.48, "step": 18661, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.6062437059415912, "grad_norm": 0.3471417725086212, "learning_rate": 3.6729450163582825e-06, "loss": 0.016413889825344086, "memory(GiB)": 21.48, "step": 18662, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955349 }, { "epoch": 0.6062761914043465, "grad_norm": 0.3365020453929901, "learning_rate": 3.672427135876344e-06, "loss": 0.020630083978176117, "memory(GiB)": 21.48, "step": 18663, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955359 }, { "epoch": 0.606308676867102, "grad_norm": 0.3239592909812927, "learning_rate": 3.6719092707163657e-06, "loss": 0.01639370247721672, "memory(GiB)": 21.48, "step": 18664, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.6063411623298574, "grad_norm": 0.21422281861305237, "learning_rate": 3.6713914208843205e-06, "loss": 0.009342426434159279, "memory(GiB)": 21.48, "step": 18665, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95538 }, { "epoch": 0.6063736477926128, "grad_norm": 0.42543545365333557, "learning_rate": 3.6708735863861873e-06, "loss": 0.01814686693251133, "memory(GiB)": 21.48, "step": 18666, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.6064061332553682, "grad_norm": 0.34289759397506714, "learning_rate": 3.670355767227941e-06, "loss": 0.021414071321487427, "memory(GiB)": 21.48, "step": 18667, "token_acc": 1.0, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.6064386187181237, "grad_norm": 0.2272530198097229, "learning_rate": 3.6698379634155613e-06, "loss": 0.011201499029994011, "memory(GiB)": 21.48, "step": 18668, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955405 }, { "epoch": 0.606471104180879, "grad_norm": 0.24395766854286194, "learning_rate": 3.669320174955019e-06, "loss": 0.013104944489896297, "memory(GiB)": 21.48, "step": 18669, "token_acc": 1.0, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.6065035896436345, "grad_norm": 0.3644627332687378, "learning_rate": 3.668802401852296e-06, "loss": 0.015547179616987705, "memory(GiB)": 21.48, "step": 18670, "token_acc": 1.0, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.6065360751063898, "grad_norm": 0.46238797903060913, "learning_rate": 3.6682846441133613e-06, "loss": 0.02408408932387829, "memory(GiB)": 21.48, "step": 18671, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.955428 }, { "epoch": 0.6065685605691453, "grad_norm": 0.4887121915817261, "learning_rate": 3.6677669017441963e-06, "loss": 0.021895186975598335, "memory(GiB)": 21.48, "step": 18672, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.6066010460319007, "grad_norm": 0.2575857937335968, "learning_rate": 3.6672491747507762e-06, "loss": 0.01395207829773426, "memory(GiB)": 21.48, "step": 18673, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.6066335314946562, "grad_norm": 0.3394295275211334, "learning_rate": 3.666731463139071e-06, "loss": 0.017305191606283188, "memory(GiB)": 21.48, "step": 18674, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.6066660169574115, "grad_norm": 0.355610191822052, "learning_rate": 3.6662137669150627e-06, "loss": 0.018838070333003998, "memory(GiB)": 21.48, "step": 18675, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.606698502420167, "grad_norm": 0.21843060851097107, "learning_rate": 3.6656960860847197e-06, "loss": 0.010045399889349937, "memory(GiB)": 21.48, "step": 18676, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.6067309878829223, "grad_norm": 0.5044189095497131, "learning_rate": 3.665178420654023e-06, "loss": 0.022892512381076813, "memory(GiB)": 21.48, "step": 18677, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.6067634733456778, "grad_norm": 0.35712310671806335, "learning_rate": 3.6646607706289415e-06, "loss": 0.013840146362781525, "memory(GiB)": 21.48, "step": 18678, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.6067959588084332, "grad_norm": 0.2788080871105194, "learning_rate": 3.6641431360154532e-06, "loss": 0.012441031634807587, "memory(GiB)": 21.48, "step": 18679, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955483 }, { "epoch": 0.6068284442711886, "grad_norm": 0.301045686006546, "learning_rate": 3.6636255168195312e-06, "loss": 0.013700579293072224, "memory(GiB)": 21.48, "step": 18680, "token_acc": 1.0, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.606860929733944, "grad_norm": 0.37052297592163086, "learning_rate": 3.663107913047149e-06, "loss": 0.01901846192777157, "memory(GiB)": 21.48, "step": 18681, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.6068934151966995, "grad_norm": 0.2590942680835724, "learning_rate": 3.6625903247042803e-06, "loss": 0.016845859587192535, "memory(GiB)": 21.48, "step": 18682, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.6069259006594548, "grad_norm": 0.2856394350528717, "learning_rate": 3.662072751796901e-06, "loss": 0.009163097478449345, "memory(GiB)": 21.48, "step": 18683, "token_acc": 1.0, "train_speed(iter/s)": 0.955513 }, { "epoch": 0.6069583861222103, "grad_norm": 0.3503013253211975, "learning_rate": 3.661555194330981e-06, "loss": 0.015170453116297722, "memory(GiB)": 21.48, "step": 18684, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.6069908715849657, "grad_norm": 0.4349915385246277, "learning_rate": 3.6610376523124967e-06, "loss": 0.024642139673233032, "memory(GiB)": 21.48, "step": 18685, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.6070233570477211, "grad_norm": 0.34680813550949097, "learning_rate": 3.6605201257474197e-06, "loss": 0.018059372901916504, "memory(GiB)": 21.48, "step": 18686, "token_acc": 1.0, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.6070558425104765, "grad_norm": 0.42516395449638367, "learning_rate": 3.660002614641724e-06, "loss": 0.013267137110233307, "memory(GiB)": 21.48, "step": 18687, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.607088327973232, "grad_norm": 0.39336493611335754, "learning_rate": 3.6594851190013804e-06, "loss": 0.021171934902668, "memory(GiB)": 21.48, "step": 18688, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955553 }, { "epoch": 0.6071208134359874, "grad_norm": 0.3479732573032379, "learning_rate": 3.6589676388323625e-06, "loss": 0.016692552715539932, "memory(GiB)": 21.48, "step": 18689, "token_acc": 1.0, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.6071532988987428, "grad_norm": 0.36062073707580566, "learning_rate": 3.658450174140645e-06, "loss": 0.019619252532720566, "memory(GiB)": 21.48, "step": 18690, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955575 }, { "epoch": 0.6071857843614983, "grad_norm": 0.25135526061058044, "learning_rate": 3.657932724932196e-06, "loss": 0.014448752626776695, "memory(GiB)": 21.48, "step": 18691, "token_acc": 1.0, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.6072182698242536, "grad_norm": 0.3969516456127167, "learning_rate": 3.657415291212991e-06, "loss": 0.013645874336361885, "memory(GiB)": 21.48, "step": 18692, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.955596 }, { "epoch": 0.6072507552870091, "grad_norm": 0.35404640436172485, "learning_rate": 3.656897872989e-06, "loss": 0.018282698467373848, "memory(GiB)": 21.48, "step": 18693, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.6072832407497645, "grad_norm": 0.2646973431110382, "learning_rate": 3.6563804702661953e-06, "loss": 0.013440674170851707, "memory(GiB)": 21.48, "step": 18694, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955617 }, { "epoch": 0.60731572621252, "grad_norm": 0.3465231955051422, "learning_rate": 3.655863083050548e-06, "loss": 0.01707535609602928, "memory(GiB)": 21.48, "step": 18695, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.6073482116752753, "grad_norm": 0.3594816327095032, "learning_rate": 3.6553457113480317e-06, "loss": 0.01713375374674797, "memory(GiB)": 21.48, "step": 18696, "token_acc": 1.0, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.6073806971380308, "grad_norm": 0.8093082904815674, "learning_rate": 3.6548283551646126e-06, "loss": 0.018650714308023453, "memory(GiB)": 21.48, "step": 18697, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.6074131826007861, "grad_norm": 0.3051692843437195, "learning_rate": 3.6543110145062676e-06, "loss": 0.019536573439836502, "memory(GiB)": 21.48, "step": 18698, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95566 }, { "epoch": 0.6074456680635416, "grad_norm": 0.3766787052154541, "learning_rate": 3.6537936893789616e-06, "loss": 0.016657400876283646, "memory(GiB)": 21.48, "step": 18699, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.607478153526297, "grad_norm": 0.3979181945323944, "learning_rate": 3.6532763797886717e-06, "loss": 0.011191224679350853, "memory(GiB)": 21.48, "step": 18700, "token_acc": 1.0, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.6075106389890524, "grad_norm": 0.3942022919654846, "learning_rate": 3.652759085741362e-06, "loss": 0.017121825367212296, "memory(GiB)": 21.48, "step": 18701, "token_acc": 1.0, "train_speed(iter/s)": 0.955692 }, { "epoch": 0.6075431244518078, "grad_norm": 0.5787861943244934, "learning_rate": 3.6522418072430066e-06, "loss": 0.021795108914375305, "memory(GiB)": 21.48, "step": 18702, "token_acc": 0.9825581395348837, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.6075756099145633, "grad_norm": 0.26492375135421753, "learning_rate": 3.651724544299574e-06, "loss": 0.013345731422305107, "memory(GiB)": 21.48, "step": 18703, "token_acc": 1.0, "train_speed(iter/s)": 0.955714 }, { "epoch": 0.6076080953773186, "grad_norm": 7.606639862060547, "learning_rate": 3.6512072969170344e-06, "loss": 0.030761154368519783, "memory(GiB)": 21.48, "step": 18704, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.955724 }, { "epoch": 0.6076405808400741, "grad_norm": 0.3761414885520935, "learning_rate": 3.65069006510136e-06, "loss": 0.01730484887957573, "memory(GiB)": 21.48, "step": 18705, "token_acc": 1.0, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.6076730663028295, "grad_norm": 0.5056548714637756, "learning_rate": 3.6501728488585166e-06, "loss": 0.02379436232149601, "memory(GiB)": 21.48, "step": 18706, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.6077055517655849, "grad_norm": 0.4749663174152374, "learning_rate": 3.6496556481944754e-06, "loss": 0.021779732778668404, "memory(GiB)": 21.48, "step": 18707, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.6077380372283403, "grad_norm": 0.5164421796798706, "learning_rate": 3.6491384631152047e-06, "loss": 0.0195272509008646, "memory(GiB)": 21.48, "step": 18708, "token_acc": 1.0, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.6077705226910958, "grad_norm": 0.27418726682662964, "learning_rate": 3.6486212936266754e-06, "loss": 0.014986942522227764, "memory(GiB)": 21.48, "step": 18709, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95578 }, { "epoch": 0.6078030081538511, "grad_norm": 0.2638675272464752, "learning_rate": 3.6481041397348537e-06, "loss": 0.014080215245485306, "memory(GiB)": 21.48, "step": 18710, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.6078354936166066, "grad_norm": 0.34918612241744995, "learning_rate": 3.647587001445711e-06, "loss": 0.020999837666749954, "memory(GiB)": 21.48, "step": 18711, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.9558 }, { "epoch": 0.607867979079362, "grad_norm": 0.409135103225708, "learning_rate": 3.647069878765213e-06, "loss": 0.01600055769085884, "memory(GiB)": 21.48, "step": 18712, "token_acc": 1.0, "train_speed(iter/s)": 0.955811 }, { "epoch": 0.6079004645421174, "grad_norm": 0.3750356137752533, "learning_rate": 3.646552771699331e-06, "loss": 0.016542542725801468, "memory(GiB)": 21.48, "step": 18713, "token_acc": 1.0, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.6079329500048728, "grad_norm": 0.5605825185775757, "learning_rate": 3.6460356802540296e-06, "loss": 0.020249158143997192, "memory(GiB)": 21.48, "step": 18714, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.6079654354676283, "grad_norm": 0.2943144738674164, "learning_rate": 3.64551860443528e-06, "loss": 0.018083855509757996, "memory(GiB)": 21.48, "step": 18715, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.6079979209303836, "grad_norm": 0.6165067553520203, "learning_rate": 3.645001544249047e-06, "loss": 0.025375347584486008, "memory(GiB)": 21.48, "step": 18716, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955855 }, { "epoch": 0.6080304063931391, "grad_norm": 0.3312909007072449, "learning_rate": 3.644484499701302e-06, "loss": 0.018700938671827316, "memory(GiB)": 21.48, "step": 18717, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955864 }, { "epoch": 0.6080628918558945, "grad_norm": 0.320840060710907, "learning_rate": 3.6439674707980077e-06, "loss": 0.015591113828122616, "memory(GiB)": 21.48, "step": 18718, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.6080953773186499, "grad_norm": 0.24789312481880188, "learning_rate": 3.643450457545134e-06, "loss": 0.011513404548168182, "memory(GiB)": 21.48, "step": 18719, "token_acc": 1.0, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.6081278627814053, "grad_norm": 0.5450847148895264, "learning_rate": 3.6429334599486488e-06, "loss": 0.018803022801876068, "memory(GiB)": 21.48, "step": 18720, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955897 }, { "epoch": 0.6081603482441608, "grad_norm": 0.5075673460960388, "learning_rate": 3.6424164780145168e-06, "loss": 0.019303593784570694, "memory(GiB)": 21.48, "step": 18721, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.6081928337069161, "grad_norm": 0.32657817006111145, "learning_rate": 3.6418995117487062e-06, "loss": 0.022159814834594727, "memory(GiB)": 21.48, "step": 18722, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.6082253191696716, "grad_norm": 0.3850356638431549, "learning_rate": 3.641382561157182e-06, "loss": 0.013561490923166275, "memory(GiB)": 21.48, "step": 18723, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.608257804632427, "grad_norm": 0.31528863310813904, "learning_rate": 3.6408656262459138e-06, "loss": 0.011885542422533035, "memory(GiB)": 21.48, "step": 18724, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955939 }, { "epoch": 0.6082902900951824, "grad_norm": 1.106268048286438, "learning_rate": 3.6403487070208614e-06, "loss": 0.018820863217115402, "memory(GiB)": 21.48, "step": 18725, "token_acc": 1.0, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.6083227755579378, "grad_norm": 0.28919172286987305, "learning_rate": 3.6398318034879987e-06, "loss": 0.01587979681789875, "memory(GiB)": 21.48, "step": 18726, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955961 }, { "epoch": 0.6083552610206933, "grad_norm": 0.34229299426078796, "learning_rate": 3.6393149156532835e-06, "loss": 0.01429988443851471, "memory(GiB)": 21.48, "step": 18727, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.6083877464834486, "grad_norm": 0.3007717430591583, "learning_rate": 3.6387980435226888e-06, "loss": 0.013326744548976421, "memory(GiB)": 21.48, "step": 18728, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955979 }, { "epoch": 0.6084202319462041, "grad_norm": 0.36126166582107544, "learning_rate": 3.6382811871021738e-06, "loss": 0.02347935363650322, "memory(GiB)": 21.48, "step": 18729, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.955985 }, { "epoch": 0.6084527174089595, "grad_norm": 0.32561957836151123, "learning_rate": 3.6377643463977076e-06, "loss": 0.018591906875371933, "memory(GiB)": 21.48, "step": 18730, "token_acc": 1.0, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.6084852028717149, "grad_norm": 0.2991170287132263, "learning_rate": 3.6372475214152537e-06, "loss": 0.016753027215600014, "memory(GiB)": 21.48, "step": 18731, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956 }, { "epoch": 0.6085176883344703, "grad_norm": 0.3686046898365021, "learning_rate": 3.6367307121607777e-06, "loss": 0.02452903427183628, "memory(GiB)": 21.48, "step": 18732, "token_acc": 1.0, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.6085501737972258, "grad_norm": 0.36612668633461, "learning_rate": 3.6362139186402425e-06, "loss": 0.014907900243997574, "memory(GiB)": 21.48, "step": 18733, "token_acc": 1.0, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.6085826592599811, "grad_norm": 0.4089517891407013, "learning_rate": 3.6356971408596126e-06, "loss": 0.021659350022673607, "memory(GiB)": 21.48, "step": 18734, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.6086151447227366, "grad_norm": 0.365267813205719, "learning_rate": 3.6351803788248573e-06, "loss": 0.02148863859474659, "memory(GiB)": 21.48, "step": 18735, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.608647630185492, "grad_norm": 0.4668882489204407, "learning_rate": 3.634663632541935e-06, "loss": 0.02030111849308014, "memory(GiB)": 21.48, "step": 18736, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.6086801156482474, "grad_norm": 0.38034048676490784, "learning_rate": 3.6341469020168117e-06, "loss": 0.015125869773328304, "memory(GiB)": 21.48, "step": 18737, "token_acc": 1.0, "train_speed(iter/s)": 0.956043 }, { "epoch": 0.6087126011110028, "grad_norm": 0.2749518156051636, "learning_rate": 3.6336301872554513e-06, "loss": 0.013221446424722672, "memory(GiB)": 21.48, "step": 18738, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.6087450865737583, "grad_norm": 0.2285282015800476, "learning_rate": 3.6331134882638175e-06, "loss": 0.006779520772397518, "memory(GiB)": 21.48, "step": 18739, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.6087775720365136, "grad_norm": 0.4019842743873596, "learning_rate": 3.632596805047872e-06, "loss": 0.020899642258882523, "memory(GiB)": 21.48, "step": 18740, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.6088100574992691, "grad_norm": 0.3085187077522278, "learning_rate": 3.6320801376135815e-06, "loss": 0.0209587924182415, "memory(GiB)": 21.48, "step": 18741, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.6088425429620244, "grad_norm": 0.4577634930610657, "learning_rate": 3.631563485966905e-06, "loss": 0.024581879377365112, "memory(GiB)": 21.48, "step": 18742, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.6088750284247799, "grad_norm": 0.5007502436637878, "learning_rate": 3.6310468501138086e-06, "loss": 0.01847909763455391, "memory(GiB)": 21.48, "step": 18743, "token_acc": 1.0, "train_speed(iter/s)": 0.956084 }, { "epoch": 0.6089075138875353, "grad_norm": 0.3474586606025696, "learning_rate": 3.630530230060253e-06, "loss": 0.013532204553484917, "memory(GiB)": 21.48, "step": 18744, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956091 }, { "epoch": 0.6089399993502908, "grad_norm": 0.37887731194496155, "learning_rate": 3.6300136258122023e-06, "loss": 0.019743919372558594, "memory(GiB)": 21.48, "step": 18745, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956098 }, { "epoch": 0.6089724848130461, "grad_norm": 0.44404616951942444, "learning_rate": 3.6294970373756154e-06, "loss": 0.019030094146728516, "memory(GiB)": 21.48, "step": 18746, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956105 }, { "epoch": 0.6090049702758016, "grad_norm": 0.46235060691833496, "learning_rate": 3.6289804647564603e-06, "loss": 0.024401653558015823, "memory(GiB)": 21.48, "step": 18747, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.6090374557385569, "grad_norm": 0.3166659474372864, "learning_rate": 3.628463907960692e-06, "loss": 0.014112770557403564, "memory(GiB)": 21.48, "step": 18748, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.6090699412013124, "grad_norm": 0.37598589062690735, "learning_rate": 3.6279473669942787e-06, "loss": 0.016101306304335594, "memory(GiB)": 21.48, "step": 18749, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95613 }, { "epoch": 0.6091024266640678, "grad_norm": 0.5201194882392883, "learning_rate": 3.627430841863176e-06, "loss": 0.019696127623319626, "memory(GiB)": 21.48, "step": 18750, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95614 }, { "epoch": 0.6091349121268232, "grad_norm": 0.6154013872146606, "learning_rate": 3.6269143325733493e-06, "loss": 0.021538959816098213, "memory(GiB)": 21.48, "step": 18751, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.6091673975895787, "grad_norm": 0.36235225200653076, "learning_rate": 3.62639783913076e-06, "loss": 0.014430709183216095, "memory(GiB)": 21.48, "step": 18752, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.6091998830523341, "grad_norm": 0.5082060098648071, "learning_rate": 3.625881361541366e-06, "loss": 0.02325565367937088, "memory(GiB)": 21.48, "step": 18753, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956171 }, { "epoch": 0.6092323685150896, "grad_norm": 0.3725392818450928, "learning_rate": 3.625364899811133e-06, "loss": 0.015548992902040482, "memory(GiB)": 21.48, "step": 18754, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956182 }, { "epoch": 0.6092648539778449, "grad_norm": 0.3332049548625946, "learning_rate": 3.624848453946015e-06, "loss": 0.015156300738453865, "memory(GiB)": 21.48, "step": 18755, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.6092973394406004, "grad_norm": 1.3753763437271118, "learning_rate": 3.62433202395198e-06, "loss": 0.016098927706480026, "memory(GiB)": 21.48, "step": 18756, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.6093298249033557, "grad_norm": 0.3819550573825836, "learning_rate": 3.6238156098349806e-06, "loss": 0.01941675879061222, "memory(GiB)": 21.48, "step": 18757, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.6093623103661112, "grad_norm": 0.4283914864063263, "learning_rate": 3.6232992116009846e-06, "loss": 0.022811707109212875, "memory(GiB)": 21.48, "step": 18758, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.6093947958288666, "grad_norm": 0.3817741274833679, "learning_rate": 3.622782829255945e-06, "loss": 0.016797099262475967, "memory(GiB)": 21.48, "step": 18759, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956235 }, { "epoch": 0.609427281291622, "grad_norm": 0.329703688621521, "learning_rate": 3.622266462805827e-06, "loss": 0.017209073528647423, "memory(GiB)": 21.48, "step": 18760, "token_acc": 1.0, "train_speed(iter/s)": 0.956246 }, { "epoch": 0.6094597667543774, "grad_norm": 0.47556814551353455, "learning_rate": 3.621750112256586e-06, "loss": 0.02250538021326065, "memory(GiB)": 21.48, "step": 18761, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956257 }, { "epoch": 0.6094922522171329, "grad_norm": 0.35441532731056213, "learning_rate": 3.6212337776141838e-06, "loss": 0.017161153256893158, "memory(GiB)": 21.48, "step": 18762, "token_acc": 1.0, "train_speed(iter/s)": 0.956268 }, { "epoch": 0.6095247376798882, "grad_norm": 0.3482293486595154, "learning_rate": 3.6207174588845785e-06, "loss": 0.014816351234912872, "memory(GiB)": 21.48, "step": 18763, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956279 }, { "epoch": 0.6095572231426437, "grad_norm": 0.3413408398628235, "learning_rate": 3.6202011560737304e-06, "loss": 0.020448867231607437, "memory(GiB)": 21.48, "step": 18764, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.6095897086053991, "grad_norm": 0.4360896646976471, "learning_rate": 3.6196848691875965e-06, "loss": 0.0160544291138649, "memory(GiB)": 21.48, "step": 18765, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.9563 }, { "epoch": 0.6096221940681545, "grad_norm": 0.34401237964630127, "learning_rate": 3.6191685982321355e-06, "loss": 0.016736827790737152, "memory(GiB)": 21.48, "step": 18766, "token_acc": 1.0, "train_speed(iter/s)": 0.956311 }, { "epoch": 0.6096546795309099, "grad_norm": 0.3923735022544861, "learning_rate": 3.6186523432133094e-06, "loss": 0.01907266676425934, "memory(GiB)": 21.48, "step": 18767, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956322 }, { "epoch": 0.6096871649936654, "grad_norm": 0.4435395300388336, "learning_rate": 3.6181361041370718e-06, "loss": 0.022357460111379623, "memory(GiB)": 21.48, "step": 18768, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.6097196504564207, "grad_norm": 0.2535928785800934, "learning_rate": 3.617619881009384e-06, "loss": 0.012276813387870789, "memory(GiB)": 21.48, "step": 18769, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956344 }, { "epoch": 0.6097521359191762, "grad_norm": 0.38009944558143616, "learning_rate": 3.617103673836202e-06, "loss": 0.021336238831281662, "memory(GiB)": 21.48, "step": 18770, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956355 }, { "epoch": 0.6097846213819316, "grad_norm": 2.8126018047332764, "learning_rate": 3.616587482623485e-06, "loss": 0.01699574664235115, "memory(GiB)": 21.48, "step": 18771, "token_acc": 1.0, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.609817106844687, "grad_norm": 0.36940184235572815, "learning_rate": 3.616071307377188e-06, "loss": 0.016284875571727753, "memory(GiB)": 21.48, "step": 18772, "token_acc": 1.0, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.6098495923074424, "grad_norm": 0.4686487913131714, "learning_rate": 3.6155551481032725e-06, "loss": 0.018300175666809082, "memory(GiB)": 21.48, "step": 18773, "token_acc": 1.0, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.6098820777701979, "grad_norm": 0.48948410153388977, "learning_rate": 3.615039004807691e-06, "loss": 0.02151200920343399, "memory(GiB)": 21.48, "step": 18774, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956398 }, { "epoch": 0.6099145632329532, "grad_norm": 0.41313913464546204, "learning_rate": 3.6145228774964047e-06, "loss": 0.018884412944316864, "memory(GiB)": 21.48, "step": 18775, "token_acc": 1.0, "train_speed(iter/s)": 0.956408 }, { "epoch": 0.6099470486957087, "grad_norm": 0.40840670466423035, "learning_rate": 3.6140067661753674e-06, "loss": 0.021769993007183075, "memory(GiB)": 21.48, "step": 18776, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956419 }, { "epoch": 0.6099795341584641, "grad_norm": 0.3669808804988861, "learning_rate": 3.6134906708505384e-06, "loss": 0.01150069572031498, "memory(GiB)": 21.48, "step": 18777, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.6100120196212195, "grad_norm": 0.21532300114631653, "learning_rate": 3.6129745915278695e-06, "loss": 0.011997245252132416, "memory(GiB)": 21.48, "step": 18778, "token_acc": 0.9967105263157895, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.6100445050839749, "grad_norm": 0.40029117465019226, "learning_rate": 3.6124585282133236e-06, "loss": 0.01428220234811306, "memory(GiB)": 21.48, "step": 18779, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956451 }, { "epoch": 0.6100769905467304, "grad_norm": 0.5094454884529114, "learning_rate": 3.611942480912849e-06, "loss": 0.013395940884947777, "memory(GiB)": 21.48, "step": 18780, "token_acc": 1.0, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.6101094760094857, "grad_norm": 0.2247198522090912, "learning_rate": 3.6114264496324075e-06, "loss": 0.012258190661668777, "memory(GiB)": 21.48, "step": 18781, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.6101419614722412, "grad_norm": 0.31674373149871826, "learning_rate": 3.6109104343779545e-06, "loss": 0.015623985789716244, "memory(GiB)": 21.48, "step": 18782, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.6101744469349966, "grad_norm": 0.3686622679233551, "learning_rate": 3.610394435155441e-06, "loss": 0.01349938940256834, "memory(GiB)": 21.48, "step": 18783, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.956493 }, { "epoch": 0.610206932397752, "grad_norm": 0.3835509121417999, "learning_rate": 3.6098784519708287e-06, "loss": 0.022721905261278152, "memory(GiB)": 21.48, "step": 18784, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.6102394178605074, "grad_norm": 0.5027972459793091, "learning_rate": 3.6093624848300664e-06, "loss": 0.015125628560781479, "memory(GiB)": 21.48, "step": 18785, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.6102719033232629, "grad_norm": 0.30904296040534973, "learning_rate": 3.6088465337391144e-06, "loss": 0.016991224139928818, "memory(GiB)": 21.48, "step": 18786, "token_acc": 1.0, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.6103043887860182, "grad_norm": 0.26106420159339905, "learning_rate": 3.608330598703923e-06, "loss": 0.01263486035168171, "memory(GiB)": 21.48, "step": 18787, "token_acc": 1.0, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.6103368742487737, "grad_norm": 0.42612844705581665, "learning_rate": 3.6078146797304503e-06, "loss": 0.02193944901227951, "memory(GiB)": 21.48, "step": 18788, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.6103693597115291, "grad_norm": 0.34086066484451294, "learning_rate": 3.607298776824648e-06, "loss": 0.011561544612050056, "memory(GiB)": 21.48, "step": 18789, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.6104018451742845, "grad_norm": 0.38391128182411194, "learning_rate": 3.6067828899924723e-06, "loss": 0.01944540999829769, "memory(GiB)": 21.48, "step": 18790, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956558 }, { "epoch": 0.6104343306370399, "grad_norm": 0.3167806565761566, "learning_rate": 3.606267019239875e-06, "loss": 0.01649821549654007, "memory(GiB)": 21.48, "step": 18791, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956566 }, { "epoch": 0.6104668160997954, "grad_norm": 0.43697676062583923, "learning_rate": 3.605751164572814e-06, "loss": 0.019044555723667145, "memory(GiB)": 21.48, "step": 18792, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.956573 }, { "epoch": 0.6104993015625507, "grad_norm": 0.286466509103775, "learning_rate": 3.6052353259972377e-06, "loss": 0.014879540540277958, "memory(GiB)": 21.48, "step": 18793, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.6105317870253062, "grad_norm": 0.49412640929222107, "learning_rate": 3.6047195035191042e-06, "loss": 0.018996987491846085, "memory(GiB)": 21.48, "step": 18794, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.6105642724880616, "grad_norm": 0.451438844203949, "learning_rate": 3.6042036971443627e-06, "loss": 0.024587081745266914, "memory(GiB)": 21.48, "step": 18795, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.610596757950817, "grad_norm": 0.2663244307041168, "learning_rate": 3.603687906878969e-06, "loss": 0.010957682505249977, "memory(GiB)": 21.48, "step": 18796, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.6106292434135724, "grad_norm": 0.2814485728740692, "learning_rate": 3.603172132728876e-06, "loss": 0.016132235527038574, "memory(GiB)": 21.48, "step": 18797, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.6106617288763279, "grad_norm": 0.307987779378891, "learning_rate": 3.602656374700035e-06, "loss": 0.016025114804506302, "memory(GiB)": 21.48, "step": 18798, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956616 }, { "epoch": 0.6106942143390832, "grad_norm": 0.3424227237701416, "learning_rate": 3.6021406327984e-06, "loss": 0.014311794191598892, "memory(GiB)": 21.48, "step": 18799, "token_acc": 0.996, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.6107266998018387, "grad_norm": 0.3107275664806366, "learning_rate": 3.601624907029922e-06, "loss": 0.015887394547462463, "memory(GiB)": 21.48, "step": 18800, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956631 }, { "epoch": 0.610759185264594, "grad_norm": 0.34160321950912476, "learning_rate": 3.601109197400555e-06, "loss": 0.01875622756779194, "memory(GiB)": 21.48, "step": 18801, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.6107916707273495, "grad_norm": 0.3517504632472992, "learning_rate": 3.6005935039162484e-06, "loss": 0.016612377017736435, "memory(GiB)": 21.48, "step": 18802, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.6108241561901049, "grad_norm": 0.4043050706386566, "learning_rate": 3.6000778265829563e-06, "loss": 0.021973498165607452, "memory(GiB)": 21.48, "step": 18803, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956651 }, { "epoch": 0.6108566416528604, "grad_norm": 0.323747456073761, "learning_rate": 3.599562165406628e-06, "loss": 0.012552937492728233, "memory(GiB)": 21.48, "step": 18804, "token_acc": 1.0, "train_speed(iter/s)": 0.956658 }, { "epoch": 0.6108891271156157, "grad_norm": 0.37170639634132385, "learning_rate": 3.599046520393219e-06, "loss": 0.014965381473302841, "memory(GiB)": 21.48, "step": 18805, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956665 }, { "epoch": 0.6109216125783712, "grad_norm": 0.3821735084056854, "learning_rate": 3.5985308915486748e-06, "loss": 0.021440789103507996, "memory(GiB)": 21.48, "step": 18806, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.6109540980411265, "grad_norm": 0.39259305596351624, "learning_rate": 3.5980152788789524e-06, "loss": 0.020593905821442604, "memory(GiB)": 21.48, "step": 18807, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.610986583503882, "grad_norm": 0.3444501757621765, "learning_rate": 3.5974996823899966e-06, "loss": 0.016836343333125114, "memory(GiB)": 21.48, "step": 18808, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.6110190689666374, "grad_norm": 0.5473147034645081, "learning_rate": 3.5969841020877642e-06, "loss": 0.020074693486094475, "memory(GiB)": 21.48, "step": 18809, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956698 }, { "epoch": 0.6110515544293929, "grad_norm": 0.39249128103256226, "learning_rate": 3.5964685379782006e-06, "loss": 0.019925322383642197, "memory(GiB)": 21.48, "step": 18810, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.6110840398921482, "grad_norm": 0.2668576240539551, "learning_rate": 3.5959529900672573e-06, "loss": 0.014795660972595215, "memory(GiB)": 21.48, "step": 18811, "token_acc": 1.0, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.6111165253549037, "grad_norm": 0.8136157393455505, "learning_rate": 3.5954374583608886e-06, "loss": 0.020231865346431732, "memory(GiB)": 21.48, "step": 18812, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956726 }, { "epoch": 0.611149010817659, "grad_norm": 0.2665066421031952, "learning_rate": 3.5949219428650373e-06, "loss": 0.019163738936185837, "memory(GiB)": 21.48, "step": 18813, "token_acc": 1.0, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.6111814962804145, "grad_norm": 0.36772620677948, "learning_rate": 3.5944064435856606e-06, "loss": 0.017919626086950302, "memory(GiB)": 21.48, "step": 18814, "token_acc": 1.0, "train_speed(iter/s)": 0.956745 }, { "epoch": 0.6112139817431699, "grad_norm": 0.38625288009643555, "learning_rate": 3.593890960528702e-06, "loss": 0.018984457477927208, "memory(GiB)": 21.48, "step": 18815, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.6112464672059253, "grad_norm": 0.3440413773059845, "learning_rate": 3.5933754937001144e-06, "loss": 0.012645509093999863, "memory(GiB)": 21.48, "step": 18816, "token_acc": 1.0, "train_speed(iter/s)": 0.956766 }, { "epoch": 0.6112789526686808, "grad_norm": 0.3172987103462219, "learning_rate": 3.5928600431058448e-06, "loss": 0.016386013478040695, "memory(GiB)": 21.48, "step": 18817, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956777 }, { "epoch": 0.6113114381314362, "grad_norm": 0.3461250066757202, "learning_rate": 3.592344608751844e-06, "loss": 0.01378727052360773, "memory(GiB)": 21.48, "step": 18818, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.6113439235941917, "grad_norm": 0.35419172048568726, "learning_rate": 3.5918291906440596e-06, "loss": 0.017525481060147285, "memory(GiB)": 21.48, "step": 18819, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.611376409056947, "grad_norm": 0.4439052939414978, "learning_rate": 3.591313788788441e-06, "loss": 0.020806748420000076, "memory(GiB)": 21.48, "step": 18820, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.6114088945197025, "grad_norm": 0.3821120858192444, "learning_rate": 3.5907984031909354e-06, "loss": 0.015866782516241074, "memory(GiB)": 21.48, "step": 18821, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.6114413799824578, "grad_norm": 0.28426337242126465, "learning_rate": 3.590283033857493e-06, "loss": 0.010732220485806465, "memory(GiB)": 21.48, "step": 18822, "token_acc": 1.0, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.6114738654452133, "grad_norm": 0.3541412353515625, "learning_rate": 3.5897676807940595e-06, "loss": 0.01723191700875759, "memory(GiB)": 21.48, "step": 18823, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.6115063509079687, "grad_norm": 0.2888028025627136, "learning_rate": 3.5892523440065853e-06, "loss": 0.01110941544175148, "memory(GiB)": 21.48, "step": 18824, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.6115388363707241, "grad_norm": 0.42740246653556824, "learning_rate": 3.5887370235010155e-06, "loss": 0.01790982484817505, "memory(GiB)": 21.48, "step": 18825, "token_acc": 1.0, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.6115713218334795, "grad_norm": 0.2590847611427307, "learning_rate": 3.5882217192833e-06, "loss": 0.013736468739807606, "memory(GiB)": 21.48, "step": 18826, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.611603807296235, "grad_norm": 0.38030293583869934, "learning_rate": 3.5877064313593835e-06, "loss": 0.01625591330230236, "memory(GiB)": 21.48, "step": 18827, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.6116362927589903, "grad_norm": 0.27979281544685364, "learning_rate": 3.587191159735215e-06, "loss": 0.011776389554142952, "memory(GiB)": 21.48, "step": 18828, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.6116687782217458, "grad_norm": 0.31698840856552124, "learning_rate": 3.586675904416742e-06, "loss": 0.01750139147043228, "memory(GiB)": 21.48, "step": 18829, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.6117012636845012, "grad_norm": 0.35981419682502747, "learning_rate": 3.586160665409909e-06, "loss": 0.017591672018170357, "memory(GiB)": 21.48, "step": 18830, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.6117337491472566, "grad_norm": 0.4040205776691437, "learning_rate": 3.5856454427206644e-06, "loss": 0.018086202442646027, "memory(GiB)": 21.48, "step": 18831, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956927 }, { "epoch": 0.611766234610012, "grad_norm": 0.3033148944377899, "learning_rate": 3.5851302363549536e-06, "loss": 0.018761709332466125, "memory(GiB)": 21.48, "step": 18832, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.6117987200727675, "grad_norm": 0.2698417007923126, "learning_rate": 3.5846150463187246e-06, "loss": 0.01104426383972168, "memory(GiB)": 21.48, "step": 18833, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.6118312055355228, "grad_norm": 0.3314579427242279, "learning_rate": 3.5840998726179187e-06, "loss": 0.014481989666819572, "memory(GiB)": 21.48, "step": 18834, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956959 }, { "epoch": 0.6118636909982783, "grad_norm": 0.36196306347846985, "learning_rate": 3.583584715258488e-06, "loss": 0.017303263768553734, "memory(GiB)": 21.48, "step": 18835, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.6118961764610337, "grad_norm": 0.3312572240829468, "learning_rate": 3.5830695742463722e-06, "loss": 0.014644441194832325, "memory(GiB)": 21.48, "step": 18836, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956981 }, { "epoch": 0.6119286619237891, "grad_norm": 4.080439567565918, "learning_rate": 3.582554449587522e-06, "loss": 0.022425616160035133, "memory(GiB)": 21.48, "step": 18837, "token_acc": 0.985, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.6119611473865445, "grad_norm": 0.3476437032222748, "learning_rate": 3.5820393412878786e-06, "loss": 0.016413964331150055, "memory(GiB)": 21.48, "step": 18838, "token_acc": 0.9770114942528736, "train_speed(iter/s)": 0.957002 }, { "epoch": 0.6119936328493, "grad_norm": 0.3471963703632355, "learning_rate": 3.581524249353389e-06, "loss": 0.018601778894662857, "memory(GiB)": 21.48, "step": 18839, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.6120261183120553, "grad_norm": 0.3344959020614624, "learning_rate": 3.581009173789997e-06, "loss": 0.01713259518146515, "memory(GiB)": 21.48, "step": 18840, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.6120586037748108, "grad_norm": 0.42561593651771545, "learning_rate": 3.5804941146036485e-06, "loss": 0.01995089277625084, "memory(GiB)": 21.48, "step": 18841, "token_acc": 1.0, "train_speed(iter/s)": 0.957034 }, { "epoch": 0.6120910892375662, "grad_norm": 0.38023871183395386, "learning_rate": 3.5799790718002865e-06, "loss": 0.023153789341449738, "memory(GiB)": 21.48, "step": 18842, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.6121235747003216, "grad_norm": 0.39709779620170593, "learning_rate": 3.579464045385856e-06, "loss": 0.01726485788822174, "memory(GiB)": 21.48, "step": 18843, "token_acc": 0.9898648648648649, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.612156060163077, "grad_norm": 0.5618738532066345, "learning_rate": 3.578949035366302e-06, "loss": 0.023369193077087402, "memory(GiB)": 21.48, "step": 18844, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957066 }, { "epoch": 0.6121885456258325, "grad_norm": 0.2855224609375, "learning_rate": 3.5784340417475668e-06, "loss": 0.013670871965587139, "memory(GiB)": 21.48, "step": 18845, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957077 }, { "epoch": 0.6122210310885878, "grad_norm": 0.279364675283432, "learning_rate": 3.5779190645355957e-06, "loss": 0.01662374660372734, "memory(GiB)": 21.48, "step": 18846, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.6122535165513433, "grad_norm": 0.28147876262664795, "learning_rate": 3.57740410373633e-06, "loss": 0.016059406101703644, "memory(GiB)": 21.48, "step": 18847, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.6122860020140987, "grad_norm": 0.31957489252090454, "learning_rate": 3.5768891593557155e-06, "loss": 0.016895629465579987, "memory(GiB)": 21.48, "step": 18848, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957106 }, { "epoch": 0.6123184874768541, "grad_norm": 0.2907545268535614, "learning_rate": 3.576374231399694e-06, "loss": 0.01693674363195896, "memory(GiB)": 21.48, "step": 18849, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.6123509729396095, "grad_norm": 0.33785495162010193, "learning_rate": 3.5758593198742086e-06, "loss": 0.020630445331335068, "memory(GiB)": 21.48, "step": 18850, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.612383458402365, "grad_norm": 0.32996615767478943, "learning_rate": 3.575344424785202e-06, "loss": 0.01702856831252575, "memory(GiB)": 21.48, "step": 18851, "token_acc": 0.992, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.6124159438651203, "grad_norm": 0.5894606113433838, "learning_rate": 3.574829546138618e-06, "loss": 0.02461618185043335, "memory(GiB)": 21.48, "step": 18852, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957138 }, { "epoch": 0.6124484293278758, "grad_norm": 0.36568623781204224, "learning_rate": 3.5743146839403965e-06, "loss": 0.016758020967245102, "memory(GiB)": 21.48, "step": 18853, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.6124809147906312, "grad_norm": 0.2662532329559326, "learning_rate": 3.573799838196482e-06, "loss": 0.011922497302293777, "memory(GiB)": 21.48, "step": 18854, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957152 }, { "epoch": 0.6125134002533866, "grad_norm": 0.2795102596282959, "learning_rate": 3.573285008912815e-06, "loss": 0.019625594839453697, "memory(GiB)": 21.48, "step": 18855, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.957158 }, { "epoch": 0.612545885716142, "grad_norm": 0.570106029510498, "learning_rate": 3.5727701960953406e-06, "loss": 0.02027374878525734, "memory(GiB)": 21.48, "step": 18856, "token_acc": 1.0, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.6125783711788975, "grad_norm": 0.3931308090686798, "learning_rate": 3.5722553997499936e-06, "loss": 0.013107274658977985, "memory(GiB)": 21.48, "step": 18857, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.6126108566416528, "grad_norm": 0.30754703283309937, "learning_rate": 3.571740619882721e-06, "loss": 0.018539026379585266, "memory(GiB)": 21.48, "step": 18858, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.6126433421044083, "grad_norm": 0.4147406816482544, "learning_rate": 3.571225856499464e-06, "loss": 0.01642315648496151, "memory(GiB)": 21.48, "step": 18859, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.6126758275671637, "grad_norm": 0.3517254590988159, "learning_rate": 3.570711109606161e-06, "loss": 0.01975388079881668, "memory(GiB)": 21.48, "step": 18860, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.6127083130299191, "grad_norm": 0.22567088901996613, "learning_rate": 3.570196379208756e-06, "loss": 0.010207269340753555, "memory(GiB)": 21.48, "step": 18861, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957203 }, { "epoch": 0.6127407984926745, "grad_norm": 0.38164791464805603, "learning_rate": 3.569681665313185e-06, "loss": 0.023563522845506668, "memory(GiB)": 21.48, "step": 18862, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.61277328395543, "grad_norm": 0.379940003156662, "learning_rate": 3.5691669679253947e-06, "loss": 0.016766324639320374, "memory(GiB)": 21.48, "step": 18863, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957217 }, { "epoch": 0.6128057694181853, "grad_norm": 0.34214022755622864, "learning_rate": 3.5686522870513184e-06, "loss": 0.019663091748952866, "memory(GiB)": 21.48, "step": 18864, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957225 }, { "epoch": 0.6128382548809408, "grad_norm": 0.3628939390182495, "learning_rate": 3.5681376226969032e-06, "loss": 0.015279969200491905, "memory(GiB)": 21.48, "step": 18865, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957232 }, { "epoch": 0.6128707403436962, "grad_norm": 2.7203378677368164, "learning_rate": 3.5676229748680835e-06, "loss": 0.018494997173547745, "memory(GiB)": 21.48, "step": 18866, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.6129032258064516, "grad_norm": 0.6620031595230103, "learning_rate": 3.567108343570803e-06, "loss": 0.02366994321346283, "memory(GiB)": 21.48, "step": 18867, "token_acc": 1.0, "train_speed(iter/s)": 0.957245 }, { "epoch": 0.612935711269207, "grad_norm": 0.42626091837882996, "learning_rate": 3.5665937288109976e-06, "loss": 0.019201792776584625, "memory(GiB)": 21.48, "step": 18868, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.957253 }, { "epoch": 0.6129681967319625, "grad_norm": 0.29130426049232483, "learning_rate": 3.566079130594609e-06, "loss": 0.013034515082836151, "memory(GiB)": 21.48, "step": 18869, "token_acc": 1.0, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.6130006821947178, "grad_norm": 0.27409330010414124, "learning_rate": 3.565564548927575e-06, "loss": 0.011964036151766777, "memory(GiB)": 21.48, "step": 18870, "token_acc": 1.0, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.6130331676574733, "grad_norm": 0.5271716713905334, "learning_rate": 3.565049983815837e-06, "loss": 0.01806000992655754, "memory(GiB)": 21.48, "step": 18871, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957277 }, { "epoch": 0.6130656531202286, "grad_norm": 0.25695762038230896, "learning_rate": 3.5645354352653303e-06, "loss": 0.015316962264478207, "memory(GiB)": 21.48, "step": 18872, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957285 }, { "epoch": 0.6130981385829841, "grad_norm": 0.2680458128452301, "learning_rate": 3.564020903281996e-06, "loss": 0.012448013760149479, "memory(GiB)": 21.48, "step": 18873, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957294 }, { "epoch": 0.6131306240457395, "grad_norm": 0.30438879132270813, "learning_rate": 3.5635063878717725e-06, "loss": 0.01582830399274826, "memory(GiB)": 21.48, "step": 18874, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.613163109508495, "grad_norm": 0.3196723163127899, "learning_rate": 3.5629918890405955e-06, "loss": 0.018016912043094635, "memory(GiB)": 21.48, "step": 18875, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.6131955949712503, "grad_norm": 0.3291792869567871, "learning_rate": 3.562477406794407e-06, "loss": 0.020739663392305374, "memory(GiB)": 21.48, "step": 18876, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.6132280804340058, "grad_norm": 0.4005974233150482, "learning_rate": 3.561962941139141e-06, "loss": 0.018555477261543274, "memory(GiB)": 21.48, "step": 18877, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.957332 }, { "epoch": 0.6132605658967611, "grad_norm": 0.30954575538635254, "learning_rate": 3.561448492080738e-06, "loss": 0.015711093321442604, "memory(GiB)": 21.48, "step": 18878, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.6132930513595166, "grad_norm": 0.34833449125289917, "learning_rate": 3.5609340596251324e-06, "loss": 0.01688724011182785, "memory(GiB)": 21.48, "step": 18879, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.6133255368222721, "grad_norm": 0.3463108539581299, "learning_rate": 3.5604196437782645e-06, "loss": 0.015437418594956398, "memory(GiB)": 21.48, "step": 18880, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957364 }, { "epoch": 0.6133580222850274, "grad_norm": 0.30930691957473755, "learning_rate": 3.559905244546069e-06, "loss": 0.013480860739946365, "memory(GiB)": 21.48, "step": 18881, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.6133905077477829, "grad_norm": 0.35445383191108704, "learning_rate": 3.559390861934485e-06, "loss": 0.014238354749977589, "memory(GiB)": 21.48, "step": 18882, "token_acc": 1.0, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.6134229932105383, "grad_norm": 0.31556782126426697, "learning_rate": 3.5588764959494463e-06, "loss": 0.016825705766677856, "memory(GiB)": 21.48, "step": 18883, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.6134554786732938, "grad_norm": 0.4289160370826721, "learning_rate": 3.5583621465968933e-06, "loss": 0.0218784399330616, "memory(GiB)": 21.48, "step": 18884, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957407 }, { "epoch": 0.6134879641360491, "grad_norm": 0.3833056688308716, "learning_rate": 3.557847813882757e-06, "loss": 0.01556671317666769, "memory(GiB)": 21.48, "step": 18885, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957418 }, { "epoch": 0.6135204495988046, "grad_norm": 0.2514863610267639, "learning_rate": 3.5573334978129793e-06, "loss": 0.010246265679597855, "memory(GiB)": 21.48, "step": 18886, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.61355293506156, "grad_norm": 0.404863178730011, "learning_rate": 3.556819198393491e-06, "loss": 0.030491454526782036, "memory(GiB)": 21.48, "step": 18887, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.6135854205243154, "grad_norm": 0.3833479881286621, "learning_rate": 3.556304915630232e-06, "loss": 0.020706482231616974, "memory(GiB)": 21.48, "step": 18888, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95745 }, { "epoch": 0.6136179059870708, "grad_norm": 0.43033307790756226, "learning_rate": 3.555790649529134e-06, "loss": 0.018240943551063538, "memory(GiB)": 21.48, "step": 18889, "token_acc": 1.0, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.6136503914498262, "grad_norm": 0.4107314646244049, "learning_rate": 3.555276400096133e-06, "loss": 0.023318812251091003, "memory(GiB)": 21.48, "step": 18890, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.6136828769125816, "grad_norm": 0.3468146026134491, "learning_rate": 3.554762167337169e-06, "loss": 0.019701380282640457, "memory(GiB)": 21.48, "step": 18891, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.6137153623753371, "grad_norm": 0.25781407952308655, "learning_rate": 3.5542479512581695e-06, "loss": 0.011642620898783207, "memory(GiB)": 21.48, "step": 18892, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957493 }, { "epoch": 0.6137478478380924, "grad_norm": 0.5131552219390869, "learning_rate": 3.553733751865076e-06, "loss": 0.021936994045972824, "memory(GiB)": 21.48, "step": 18893, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.6137803333008479, "grad_norm": 0.4612901508808136, "learning_rate": 3.553219569163818e-06, "loss": 0.021200623363256454, "memory(GiB)": 21.48, "step": 18894, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.6138128187636033, "grad_norm": 0.30013900995254517, "learning_rate": 3.5527054031603337e-06, "loss": 0.01400262862443924, "memory(GiB)": 21.48, "step": 18895, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957526 }, { "epoch": 0.6138453042263587, "grad_norm": 0.33024483919143677, "learning_rate": 3.5521912538605537e-06, "loss": 0.018345696851611137, "memory(GiB)": 21.48, "step": 18896, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957537 }, { "epoch": 0.6138777896891141, "grad_norm": 0.42461904883384705, "learning_rate": 3.551677121270415e-06, "loss": 0.010991515591740608, "memory(GiB)": 21.48, "step": 18897, "token_acc": 1.0, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.6139102751518696, "grad_norm": 0.42597758769989014, "learning_rate": 3.551163005395848e-06, "loss": 0.022288799285888672, "memory(GiB)": 21.48, "step": 18898, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.6139427606146249, "grad_norm": 0.3926216661930084, "learning_rate": 3.5506489062427902e-06, "loss": 0.017063643783330917, "memory(GiB)": 21.48, "step": 18899, "token_acc": 0.995, "train_speed(iter/s)": 0.95757 }, { "epoch": 0.6139752460773804, "grad_norm": 0.4337112009525299, "learning_rate": 3.550134823817172e-06, "loss": 0.01856835186481476, "memory(GiB)": 21.48, "step": 18900, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.6140077315401358, "grad_norm": 0.5007444024085999, "learning_rate": 3.549620758124929e-06, "loss": 0.018768977373838425, "memory(GiB)": 21.48, "step": 18901, "token_acc": 1.0, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.6140402170028912, "grad_norm": 0.5698326826095581, "learning_rate": 3.549106709171991e-06, "loss": 0.01910923793911934, "memory(GiB)": 21.48, "step": 18902, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.6140727024656466, "grad_norm": 0.3857550621032715, "learning_rate": 3.5485926769642944e-06, "loss": 0.01862497255206108, "memory(GiB)": 21.48, "step": 18903, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.6141051879284021, "grad_norm": 0.4983278810977936, "learning_rate": 3.548078661507769e-06, "loss": 0.016857638955116272, "memory(GiB)": 21.48, "step": 18904, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957623 }, { "epoch": 0.6141376733911574, "grad_norm": 0.24176384508609772, "learning_rate": 3.547564662808348e-06, "loss": 0.009253434836864471, "memory(GiB)": 21.48, "step": 18905, "token_acc": 1.0, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.6141701588539129, "grad_norm": 0.3431847393512726, "learning_rate": 3.547050680871965e-06, "loss": 0.017946256324648857, "memory(GiB)": 21.48, "step": 18906, "token_acc": 1.0, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.6142026443166683, "grad_norm": 0.335890531539917, "learning_rate": 3.5465367157045503e-06, "loss": 0.00954398699104786, "memory(GiB)": 21.48, "step": 18907, "token_acc": 1.0, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.6142351297794237, "grad_norm": 0.3466051518917084, "learning_rate": 3.5460227673120377e-06, "loss": 0.02043592371046543, "memory(GiB)": 21.48, "step": 18908, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.6142676152421791, "grad_norm": 0.4729793667793274, "learning_rate": 3.5455088357003562e-06, "loss": 0.013804569840431213, "memory(GiB)": 21.48, "step": 18909, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.6143001007049346, "grad_norm": 0.42944228649139404, "learning_rate": 3.5449949208754405e-06, "loss": 0.023465260863304138, "memory(GiB)": 21.48, "step": 18910, "token_acc": 1.0, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.6143325861676899, "grad_norm": 0.39154818654060364, "learning_rate": 3.544481022843218e-06, "loss": 0.019381195306777954, "memory(GiB)": 21.48, "step": 18911, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957691 }, { "epoch": 0.6143650716304454, "grad_norm": 0.37559643387794495, "learning_rate": 3.5439671416096244e-06, "loss": 0.015591887757182121, "memory(GiB)": 21.48, "step": 18912, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957698 }, { "epoch": 0.6143975570932008, "grad_norm": 2.506133794784546, "learning_rate": 3.543453277180584e-06, "loss": 0.012514817528426647, "memory(GiB)": 21.48, "step": 18913, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.6144300425559562, "grad_norm": 0.40486016869544983, "learning_rate": 3.5429394295620358e-06, "loss": 0.01870875060558319, "memory(GiB)": 21.48, "step": 18914, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.957715 }, { "epoch": 0.6144625280187116, "grad_norm": 0.4258400499820709, "learning_rate": 3.542425598759902e-06, "loss": 0.019250325858592987, "memory(GiB)": 21.48, "step": 18915, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.6144950134814671, "grad_norm": 0.30296823382377625, "learning_rate": 3.5419117847801198e-06, "loss": 0.015987619757652283, "memory(GiB)": 21.48, "step": 18916, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.957729 }, { "epoch": 0.6145274989442224, "grad_norm": 0.28205838799476624, "learning_rate": 3.5413979876286146e-06, "loss": 0.019556600600481033, "memory(GiB)": 21.48, "step": 18917, "token_acc": 1.0, "train_speed(iter/s)": 0.957736 }, { "epoch": 0.6145599844069779, "grad_norm": 0.40544629096984863, "learning_rate": 3.540884207311319e-06, "loss": 0.021437477320432663, "memory(GiB)": 21.48, "step": 18918, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.6145924698697333, "grad_norm": 0.3562398850917816, "learning_rate": 3.5403704438341615e-06, "loss": 0.01124824583530426, "memory(GiB)": 21.48, "step": 18919, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.6146249553324887, "grad_norm": 0.521029531955719, "learning_rate": 3.5398566972030695e-06, "loss": 0.01902887597680092, "memory(GiB)": 21.48, "step": 18920, "token_acc": 1.0, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.6146574407952441, "grad_norm": 0.4105325937271118, "learning_rate": 3.5393429674239784e-06, "loss": 0.017245400696992874, "memory(GiB)": 21.48, "step": 18921, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.6146899262579996, "grad_norm": 0.3141980469226837, "learning_rate": 3.5388292545028114e-06, "loss": 0.022173820063471794, "memory(GiB)": 21.48, "step": 18922, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.6147224117207549, "grad_norm": 0.45756837725639343, "learning_rate": 3.5383155584455008e-06, "loss": 0.024046415463089943, "memory(GiB)": 21.48, "step": 18923, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957782 }, { "epoch": 0.6147548971835104, "grad_norm": 0.3894558250904083, "learning_rate": 3.5378018792579727e-06, "loss": 0.013489757664501667, "memory(GiB)": 21.48, "step": 18924, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.6147873826462658, "grad_norm": 0.3016802966594696, "learning_rate": 3.5372882169461587e-06, "loss": 0.014835559763014317, "memory(GiB)": 21.48, "step": 18925, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.957796 }, { "epoch": 0.6148198681090212, "grad_norm": 0.3217826783657074, "learning_rate": 3.536774571515984e-06, "loss": 0.019895773380994797, "memory(GiB)": 21.48, "step": 18926, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.6148523535717766, "grad_norm": 0.3363994061946869, "learning_rate": 3.5362609429733797e-06, "loss": 0.021531302481889725, "memory(GiB)": 21.48, "step": 18927, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.6148848390345321, "grad_norm": 0.3571660816669464, "learning_rate": 3.5357473313242706e-06, "loss": 0.016083672642707825, "memory(GiB)": 21.48, "step": 18928, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.6149173244972874, "grad_norm": 0.40453100204467773, "learning_rate": 3.5352337365745882e-06, "loss": 0.01931677758693695, "memory(GiB)": 21.48, "step": 18929, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957829 }, { "epoch": 0.6149498099600429, "grad_norm": 0.1835615187883377, "learning_rate": 3.5347201587302566e-06, "loss": 0.009745866991579533, "memory(GiB)": 21.48, "step": 18930, "token_acc": 1.0, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.6149822954227983, "grad_norm": 0.4163980782032013, "learning_rate": 3.5342065977972055e-06, "loss": 0.01582941599190235, "memory(GiB)": 21.48, "step": 18931, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.6150147808855537, "grad_norm": 0.19586323201656342, "learning_rate": 3.5336930537813606e-06, "loss": 0.010956743732094765, "memory(GiB)": 21.48, "step": 18932, "token_acc": 1.0, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.6150472663483091, "grad_norm": 0.3088167607784271, "learning_rate": 3.5331795266886505e-06, "loss": 0.012828841805458069, "memory(GiB)": 21.48, "step": 18933, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.6150797518110646, "grad_norm": 0.47556114196777344, "learning_rate": 3.532666016524999e-06, "loss": 0.021545715630054474, "memory(GiB)": 21.48, "step": 18934, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.6151122372738199, "grad_norm": 0.46176955103874207, "learning_rate": 3.5321525232963363e-06, "loss": 0.02636788599193096, "memory(GiB)": 21.48, "step": 18935, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.6151447227365754, "grad_norm": 0.33793193101882935, "learning_rate": 3.531639047008587e-06, "loss": 0.015247308649122715, "memory(GiB)": 21.48, "step": 18936, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.6151772081993307, "grad_norm": 0.3301132321357727, "learning_rate": 3.531125587667677e-06, "loss": 0.016986355185508728, "memory(GiB)": 21.48, "step": 18937, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.6152096936620862, "grad_norm": 0.3595598340034485, "learning_rate": 3.530612145279534e-06, "loss": 0.017352569848299026, "memory(GiB)": 21.48, "step": 18938, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.6152421791248416, "grad_norm": 0.30907368659973145, "learning_rate": 3.5300987198500814e-06, "loss": 0.013387630693614483, "memory(GiB)": 21.48, "step": 18939, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.615274664587597, "grad_norm": 0.3577842712402344, "learning_rate": 3.5295853113852473e-06, "loss": 0.016891639679670334, "memory(GiB)": 21.48, "step": 18940, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.957927 }, { "epoch": 0.6153071500503524, "grad_norm": 0.3276364505290985, "learning_rate": 3.529071919890954e-06, "loss": 0.016805574297904968, "memory(GiB)": 21.48, "step": 18941, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957937 }, { "epoch": 0.6153396355131079, "grad_norm": 0.28008899092674255, "learning_rate": 3.528558545373131e-06, "loss": 0.013957315124571323, "memory(GiB)": 21.48, "step": 18942, "token_acc": 1.0, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.6153721209758632, "grad_norm": 0.3256593942642212, "learning_rate": 3.5280451878376974e-06, "loss": 0.01756085269153118, "memory(GiB)": 21.48, "step": 18943, "token_acc": 1.0, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.6154046064386187, "grad_norm": 0.2818518280982971, "learning_rate": 3.527531847290585e-06, "loss": 0.019560454413294792, "memory(GiB)": 21.48, "step": 18944, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.6154370919013742, "grad_norm": 0.23501136898994446, "learning_rate": 3.5270185237377117e-06, "loss": 0.013100746087729931, "memory(GiB)": 21.48, "step": 18945, "token_acc": 1.0, "train_speed(iter/s)": 0.957979 }, { "epoch": 0.6154695773641295, "grad_norm": 0.36473748087882996, "learning_rate": 3.5265052171850083e-06, "loss": 0.020278044044971466, "memory(GiB)": 21.48, "step": 18946, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.615502062826885, "grad_norm": 0.4581555128097534, "learning_rate": 3.525991927638394e-06, "loss": 0.018071353435516357, "memory(GiB)": 21.48, "step": 18947, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957999 }, { "epoch": 0.6155345482896404, "grad_norm": 0.35711976885795593, "learning_rate": 3.5254786551037956e-06, "loss": 0.02281683310866356, "memory(GiB)": 21.48, "step": 18948, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.6155670337523959, "grad_norm": 0.3238499164581299, "learning_rate": 3.5249653995871354e-06, "loss": 0.021535204723477364, "memory(GiB)": 21.48, "step": 18949, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.95802 }, { "epoch": 0.6155995192151512, "grad_norm": 0.4093150496482849, "learning_rate": 3.524452161094336e-06, "loss": 0.02257831022143364, "memory(GiB)": 21.48, "step": 18950, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.958031 }, { "epoch": 0.6156320046779067, "grad_norm": 0.2538325786590576, "learning_rate": 3.523938939631327e-06, "loss": 0.01467101275920868, "memory(GiB)": 21.48, "step": 18951, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.615664490140662, "grad_norm": 0.27287420630455017, "learning_rate": 3.5234257352040243e-06, "loss": 0.02008724957704544, "memory(GiB)": 21.48, "step": 18952, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.6156969756034175, "grad_norm": 0.44749391078948975, "learning_rate": 3.5229125478183546e-06, "loss": 0.016594242304563522, "memory(GiB)": 21.48, "step": 18953, "token_acc": 0.9968354430379747, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.6157294610661729, "grad_norm": 0.3679117262363434, "learning_rate": 3.5223993774802403e-06, "loss": 0.013827240094542503, "memory(GiB)": 21.48, "step": 18954, "token_acc": 0.9966442953020134, "train_speed(iter/s)": 0.958072 }, { "epoch": 0.6157619465289284, "grad_norm": 0.5011137127876282, "learning_rate": 3.5218862241956037e-06, "loss": 0.016132861375808716, "memory(GiB)": 21.48, "step": 18955, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.958083 }, { "epoch": 0.6157944319916837, "grad_norm": 0.2900855541229248, "learning_rate": 3.5213730879703666e-06, "loss": 0.018193306401371956, "memory(GiB)": 21.48, "step": 18956, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958094 }, { "epoch": 0.6158269174544392, "grad_norm": 0.314704030752182, "learning_rate": 3.520859968810454e-06, "loss": 0.014866476878523827, "memory(GiB)": 21.48, "step": 18957, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958104 }, { "epoch": 0.6158594029171945, "grad_norm": 0.3966570198535919, "learning_rate": 3.5203468667217843e-06, "loss": 0.017602983862161636, "memory(GiB)": 21.48, "step": 18958, "token_acc": 1.0, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.61589188837995, "grad_norm": 0.38641127943992615, "learning_rate": 3.5198337817102822e-06, "loss": 0.01706170104444027, "memory(GiB)": 21.48, "step": 18959, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.6159243738427054, "grad_norm": 0.7177236080169678, "learning_rate": 3.5193207137818673e-06, "loss": 0.016914820298552513, "memory(GiB)": 21.48, "step": 18960, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.6159568593054608, "grad_norm": 0.39963045716285706, "learning_rate": 3.518807662942463e-06, "loss": 0.019601481035351753, "memory(GiB)": 21.48, "step": 18961, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958147 }, { "epoch": 0.6159893447682162, "grad_norm": 0.338201642036438, "learning_rate": 3.518294629197989e-06, "loss": 0.015394298359751701, "memory(GiB)": 21.48, "step": 18962, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.6160218302309717, "grad_norm": 0.3738439679145813, "learning_rate": 3.5177816125543685e-06, "loss": 0.018891507759690285, "memory(GiB)": 21.48, "step": 18963, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.616054315693727, "grad_norm": 0.6477363705635071, "learning_rate": 3.5172686130175198e-06, "loss": 0.013919403776526451, "memory(GiB)": 21.48, "step": 18964, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958178 }, { "epoch": 0.6160868011564825, "grad_norm": 0.35586753487586975, "learning_rate": 3.516755630593367e-06, "loss": 0.0226246677339077, "memory(GiB)": 21.48, "step": 18965, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.6161192866192379, "grad_norm": 0.4009687304496765, "learning_rate": 3.5162426652878247e-06, "loss": 0.021679675206542015, "memory(GiB)": 21.48, "step": 18966, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958199 }, { "epoch": 0.6161517720819933, "grad_norm": 0.5059086680412292, "learning_rate": 3.5157297171068182e-06, "loss": 0.024986449629068375, "memory(GiB)": 21.48, "step": 18967, "token_acc": 1.0, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.6161842575447487, "grad_norm": 0.6276668906211853, "learning_rate": 3.515216786056268e-06, "loss": 0.02182295359671116, "memory(GiB)": 21.48, "step": 18968, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.958219 }, { "epoch": 0.6162167430075042, "grad_norm": 0.3463151454925537, "learning_rate": 3.5147038721420907e-06, "loss": 0.022501112893223763, "memory(GiB)": 21.48, "step": 18969, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.6162492284702595, "grad_norm": 0.3329320251941681, "learning_rate": 3.51419097537021e-06, "loss": 0.026806864887475967, "memory(GiB)": 21.48, "step": 18970, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.958235 }, { "epoch": 0.616281713933015, "grad_norm": 0.45753976702690125, "learning_rate": 3.5136780957465394e-06, "loss": 0.017223715782165527, "memory(GiB)": 21.48, "step": 18971, "token_acc": 0.9831460674157303, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.6163141993957704, "grad_norm": 0.35677477717399597, "learning_rate": 3.5131652332770057e-06, "loss": 0.01575314998626709, "memory(GiB)": 21.48, "step": 18972, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.958251 }, { "epoch": 0.6163466848585258, "grad_norm": 0.253756046295166, "learning_rate": 3.51265238796752e-06, "loss": 0.012074057012796402, "memory(GiB)": 21.48, "step": 18973, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.6163791703212812, "grad_norm": 0.5043709874153137, "learning_rate": 3.512139559824009e-06, "loss": 0.017913836985826492, "memory(GiB)": 21.48, "step": 18974, "token_acc": 0.9875, "train_speed(iter/s)": 0.958267 }, { "epoch": 0.6164116557840367, "grad_norm": 0.22744104266166687, "learning_rate": 3.511626748852387e-06, "loss": 0.008938584476709366, "memory(GiB)": 21.48, "step": 18975, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958275 }, { "epoch": 0.616444141246792, "grad_norm": 0.29440534114837646, "learning_rate": 3.511113955058573e-06, "loss": 0.012238170951604843, "memory(GiB)": 21.48, "step": 18976, "token_acc": 1.0, "train_speed(iter/s)": 0.958282 }, { "epoch": 0.6164766267095475, "grad_norm": 0.530629575252533, "learning_rate": 3.510601178448485e-06, "loss": 0.017719123512506485, "memory(GiB)": 21.48, "step": 18977, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95829 }, { "epoch": 0.6165091121723029, "grad_norm": 0.35664787888526917, "learning_rate": 3.510088419028043e-06, "loss": 0.01596955955028534, "memory(GiB)": 21.48, "step": 18978, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.6165415976350583, "grad_norm": 0.36784571409225464, "learning_rate": 3.509575676803162e-06, "loss": 0.019027331843972206, "memory(GiB)": 21.48, "step": 18979, "token_acc": 1.0, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.6165740830978137, "grad_norm": 0.3039588928222656, "learning_rate": 3.5090629517797632e-06, "loss": 0.014415564015507698, "memory(GiB)": 21.48, "step": 18980, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958313 }, { "epoch": 0.6166065685605692, "grad_norm": 0.30490872263908386, "learning_rate": 3.508550243963761e-06, "loss": 0.013186334632337093, "memory(GiB)": 21.48, "step": 18981, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95832 }, { "epoch": 0.6166390540233245, "grad_norm": 0.39403316378593445, "learning_rate": 3.508037553361074e-06, "loss": 0.022432712838053703, "memory(GiB)": 21.48, "step": 18982, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958328 }, { "epoch": 0.61667153948608, "grad_norm": 0.37096747756004333, "learning_rate": 3.507524879977621e-06, "loss": 0.018884683027863503, "memory(GiB)": 21.48, "step": 18983, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.6167040249488354, "grad_norm": 0.7394672632217407, "learning_rate": 3.5070122238193155e-06, "loss": 0.01968695968389511, "memory(GiB)": 21.48, "step": 18984, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.6167365104115908, "grad_norm": 0.28168460726737976, "learning_rate": 3.506499584892078e-06, "loss": 0.020508423447608948, "memory(GiB)": 21.48, "step": 18985, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958349 }, { "epoch": 0.6167689958743462, "grad_norm": 0.33433598279953003, "learning_rate": 3.505986963201822e-06, "loss": 0.02232210896909237, "memory(GiB)": 21.48, "step": 18986, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.6168014813371017, "grad_norm": 0.35037270188331604, "learning_rate": 3.505474358754465e-06, "loss": 0.015872851014137268, "memory(GiB)": 21.48, "step": 18987, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958364 }, { "epoch": 0.616833966799857, "grad_norm": 0.410469651222229, "learning_rate": 3.5049617715559224e-06, "loss": 0.02005944401025772, "memory(GiB)": 21.48, "step": 18988, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.958372 }, { "epoch": 0.6168664522626125, "grad_norm": 0.32241177558898926, "learning_rate": 3.5044492016121125e-06, "loss": 0.019458241760730743, "memory(GiB)": 21.48, "step": 18989, "token_acc": 1.0, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.6168989377253679, "grad_norm": 0.31606999039649963, "learning_rate": 3.5039366489289482e-06, "loss": 0.009897610172629356, "memory(GiB)": 21.48, "step": 18990, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.6169314231881233, "grad_norm": 0.2722300887107849, "learning_rate": 3.503424113512347e-06, "loss": 0.015316499397158623, "memory(GiB)": 21.48, "step": 18991, "token_acc": 1.0, "train_speed(iter/s)": 0.958397 }, { "epoch": 0.6169639086508787, "grad_norm": 0.290477991104126, "learning_rate": 3.5029115953682224e-06, "loss": 0.01371388603001833, "memory(GiB)": 21.48, "step": 18992, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.6169963941136342, "grad_norm": 0.3464351296424866, "learning_rate": 3.5023990945024934e-06, "loss": 0.02200021967291832, "memory(GiB)": 21.48, "step": 18993, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958415 }, { "epoch": 0.6170288795763895, "grad_norm": 0.29624655842781067, "learning_rate": 3.501886610921068e-06, "loss": 0.013086557388305664, "memory(GiB)": 21.48, "step": 18994, "token_acc": 1.0, "train_speed(iter/s)": 0.958423 }, { "epoch": 0.617061365039145, "grad_norm": 0.307604044675827, "learning_rate": 3.501374144629869e-06, "loss": 0.012871984392404556, "memory(GiB)": 21.48, "step": 18995, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958431 }, { "epoch": 0.6170938505019004, "grad_norm": 0.5852853059768677, "learning_rate": 3.500861695634804e-06, "loss": 0.02806873433291912, "memory(GiB)": 21.48, "step": 18996, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958439 }, { "epoch": 0.6171263359646558, "grad_norm": 0.3415408730506897, "learning_rate": 3.5003492639417908e-06, "loss": 0.02701311558485031, "memory(GiB)": 21.48, "step": 18997, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958447 }, { "epoch": 0.6171588214274112, "grad_norm": 0.22429490089416504, "learning_rate": 3.499836849556746e-06, "loss": 0.014241734519600868, "memory(GiB)": 21.48, "step": 18998, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958456 }, { "epoch": 0.6171913068901667, "grad_norm": 0.3963547945022583, "learning_rate": 3.4993244524855763e-06, "loss": 0.01897789165377617, "memory(GiB)": 21.48, "step": 18999, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.958464 }, { "epoch": 0.617223792352922, "grad_norm": 0.32839006185531616, "learning_rate": 3.498812072734204e-06, "loss": 0.017376556992530823, "memory(GiB)": 21.48, "step": 19000, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958473 }, { "epoch": 0.617223792352922, "eval_loss": 0.016956916078925133, "eval_runtime": 79.4599, "eval_samples_per_second": 125.22, "eval_steps_per_second": 3.914, "eval_token_acc": 0.9932952016158787, "step": 19000 }, { "epoch": 0.6172562778156775, "grad_norm": 0.4578840732574463, "learning_rate": 3.4982997103085348e-06, "loss": 0.02372625842690468, "memory(GiB)": 21.48, "step": 19001, "token_acc": 0.9930422396318414, "train_speed(iter/s)": 0.954137 }, { "epoch": 0.6172887632784329, "grad_norm": 0.21552728116512299, "learning_rate": 3.497787365214489e-06, "loss": 0.012243775650858879, "memory(GiB)": 21.48, "step": 19002, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.954145 }, { "epoch": 0.6173212487411883, "grad_norm": 0.5081011652946472, "learning_rate": 3.4972750374579746e-06, "loss": 0.02023901231586933, "memory(GiB)": 21.48, "step": 19003, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.954153 }, { "epoch": 0.6173537342039437, "grad_norm": 0.2627216577529907, "learning_rate": 3.4967627270449073e-06, "loss": 0.015755202621221542, "memory(GiB)": 21.48, "step": 19004, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954162 }, { "epoch": 0.6173862196666992, "grad_norm": 0.552423894405365, "learning_rate": 3.4962504339811974e-06, "loss": 0.01618729904294014, "memory(GiB)": 21.48, "step": 19005, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95417 }, { "epoch": 0.6174187051294545, "grad_norm": 0.35675397515296936, "learning_rate": 3.4957381582727607e-06, "loss": 0.01713588461279869, "memory(GiB)": 21.48, "step": 19006, "token_acc": 0.996, "train_speed(iter/s)": 0.954178 }, { "epoch": 0.61745119059221, "grad_norm": 0.22777512669563293, "learning_rate": 3.4952258999255064e-06, "loss": 0.008238391019403934, "memory(GiB)": 21.48, "step": 19007, "token_acc": 1.0, "train_speed(iter/s)": 0.954186 }, { "epoch": 0.6174836760549655, "grad_norm": 0.3532312214374542, "learning_rate": 3.494713658945349e-06, "loss": 0.018300775438547134, "memory(GiB)": 21.48, "step": 19008, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954195 }, { "epoch": 0.6175161615177208, "grad_norm": 0.3666350841522217, "learning_rate": 3.4942014353381992e-06, "loss": 0.018360022455453873, "memory(GiB)": 21.48, "step": 19009, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.954202 }, { "epoch": 0.6175486469804763, "grad_norm": 0.24861066043376923, "learning_rate": 3.4936892291099693e-06, "loss": 0.015247654169797897, "memory(GiB)": 21.48, "step": 19010, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95421 }, { "epoch": 0.6175811324432317, "grad_norm": 0.24231691658496857, "learning_rate": 3.4931770402665697e-06, "loss": 0.010969117283821106, "memory(GiB)": 21.48, "step": 19011, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954219 }, { "epoch": 0.6176136179059871, "grad_norm": 0.3008994460105896, "learning_rate": 3.492664868813913e-06, "loss": 0.009919891133904457, "memory(GiB)": 21.48, "step": 19012, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954226 }, { "epoch": 0.6176461033687425, "grad_norm": 0.31998011469841003, "learning_rate": 3.4921527147579102e-06, "loss": 0.01632126420736313, "memory(GiB)": 21.48, "step": 19013, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954236 }, { "epoch": 0.617678588831498, "grad_norm": 0.32174500823020935, "learning_rate": 3.491640578104472e-06, "loss": 0.015322929248213768, "memory(GiB)": 21.48, "step": 19014, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.954245 }, { "epoch": 0.6177110742942533, "grad_norm": 0.341781884431839, "learning_rate": 3.4911284588595098e-06, "loss": 0.017284251749515533, "memory(GiB)": 21.48, "step": 19015, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954254 }, { "epoch": 0.6177435597570088, "grad_norm": 0.3492952585220337, "learning_rate": 3.490616357028932e-06, "loss": 0.0192917138338089, "memory(GiB)": 21.48, "step": 19016, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954263 }, { "epoch": 0.6177760452197641, "grad_norm": 0.5864691734313965, "learning_rate": 3.4901042726186518e-06, "loss": 0.021840035915374756, "memory(GiB)": 21.48, "step": 19017, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954273 }, { "epoch": 0.6178085306825196, "grad_norm": 0.30086392164230347, "learning_rate": 3.4895922056345776e-06, "loss": 0.017657088115811348, "memory(GiB)": 21.48, "step": 19018, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.954282 }, { "epoch": 0.617841016145275, "grad_norm": 0.5337748527526855, "learning_rate": 3.48908015608262e-06, "loss": 0.02237294800579548, "memory(GiB)": 21.48, "step": 19019, "token_acc": 1.0, "train_speed(iter/s)": 0.95429 }, { "epoch": 0.6178735016080305, "grad_norm": 0.3668578565120697, "learning_rate": 3.4885681239686877e-06, "loss": 0.023320745676755905, "memory(GiB)": 21.48, "step": 19020, "token_acc": 0.995, "train_speed(iter/s)": 0.954299 }, { "epoch": 0.6179059870707858, "grad_norm": 0.3497684597969055, "learning_rate": 3.4880561092986927e-06, "loss": 0.021908894181251526, "memory(GiB)": 21.48, "step": 19021, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954308 }, { "epoch": 0.6179384725335413, "grad_norm": 0.36228713393211365, "learning_rate": 3.487544112078539e-06, "loss": 0.012373894453048706, "memory(GiB)": 21.48, "step": 19022, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.954318 }, { "epoch": 0.6179709579962966, "grad_norm": 0.36494097113609314, "learning_rate": 3.4870321323141433e-06, "loss": 0.01360882818698883, "memory(GiB)": 21.48, "step": 19023, "token_acc": 0.9746192893401016, "train_speed(iter/s)": 0.954326 }, { "epoch": 0.6180034434590521, "grad_norm": 0.3316727578639984, "learning_rate": 3.486520170011407e-06, "loss": 0.01343989185988903, "memory(GiB)": 21.48, "step": 19024, "token_acc": 1.0, "train_speed(iter/s)": 0.954334 }, { "epoch": 0.6180359289218075, "grad_norm": 0.3124313950538635, "learning_rate": 3.486008225176246e-06, "loss": 0.012829504907131195, "memory(GiB)": 21.48, "step": 19025, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954342 }, { "epoch": 0.618068414384563, "grad_norm": 0.2715457081794739, "learning_rate": 3.4854962978145623e-06, "loss": 0.01333341933786869, "memory(GiB)": 21.48, "step": 19026, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.95435 }, { "epoch": 0.6181008998473183, "grad_norm": 0.49197256565093994, "learning_rate": 3.4849843879322685e-06, "loss": 0.02344556525349617, "memory(GiB)": 21.48, "step": 19027, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.954359 }, { "epoch": 0.6181333853100738, "grad_norm": 0.3975617587566376, "learning_rate": 3.484472495535271e-06, "loss": 0.013920508325099945, "memory(GiB)": 21.48, "step": 19028, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.954368 }, { "epoch": 0.6181658707728291, "grad_norm": 0.40019941329956055, "learning_rate": 3.4839606206294762e-06, "loss": 0.021098289638757706, "memory(GiB)": 21.48, "step": 19029, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.954377 }, { "epoch": 0.6181983562355846, "grad_norm": 0.26941150426864624, "learning_rate": 3.483448763220797e-06, "loss": 0.011153855361044407, "memory(GiB)": 21.48, "step": 19030, "token_acc": 1.0, "train_speed(iter/s)": 0.954387 }, { "epoch": 0.61823084169834, "grad_norm": 0.4437316358089447, "learning_rate": 3.4829369233151356e-06, "loss": 0.018334902822971344, "memory(GiB)": 21.48, "step": 19031, "token_acc": 1.0, "train_speed(iter/s)": 0.954396 }, { "epoch": 0.6182633271610954, "grad_norm": 0.32699570059776306, "learning_rate": 3.482425100918403e-06, "loss": 0.01953798346221447, "memory(GiB)": 21.48, "step": 19032, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.954404 }, { "epoch": 0.6182958126238508, "grad_norm": 1.0145838260650635, "learning_rate": 3.4819132960365025e-06, "loss": 0.02029889076948166, "memory(GiB)": 21.48, "step": 19033, "token_acc": 1.0, "train_speed(iter/s)": 0.954412 }, { "epoch": 0.6183282980866063, "grad_norm": 0.2893979251384735, "learning_rate": 3.481401508675345e-06, "loss": 0.010521351359784603, "memory(GiB)": 21.48, "step": 19034, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.954421 }, { "epoch": 0.6183607835493616, "grad_norm": 0.30817773938179016, "learning_rate": 3.4808897388408338e-06, "loss": 0.017983058467507362, "memory(GiB)": 21.48, "step": 19035, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954431 }, { "epoch": 0.6183932690121171, "grad_norm": 0.5532962679862976, "learning_rate": 3.4803779865388774e-06, "loss": 0.01750693842768669, "memory(GiB)": 21.48, "step": 19036, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95444 }, { "epoch": 0.6184257544748725, "grad_norm": 0.44342824816703796, "learning_rate": 3.4798662517753816e-06, "loss": 0.01480550691485405, "memory(GiB)": 21.48, "step": 19037, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954449 }, { "epoch": 0.6184582399376279, "grad_norm": 0.4550263285636902, "learning_rate": 3.4793545345562534e-06, "loss": 0.020715245977044106, "memory(GiB)": 21.48, "step": 19038, "token_acc": 1.0, "train_speed(iter/s)": 0.954457 }, { "epoch": 0.6184907254003833, "grad_norm": 0.5789401531219482, "learning_rate": 3.4788428348873972e-06, "loss": 0.029661579057574272, "memory(GiB)": 21.48, "step": 19039, "token_acc": 0.9715302491103203, "train_speed(iter/s)": 0.954466 }, { "epoch": 0.6185232108631388, "grad_norm": 0.3033928871154785, "learning_rate": 3.4783311527747194e-06, "loss": 0.017125006765127182, "memory(GiB)": 21.48, "step": 19040, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.954477 }, { "epoch": 0.6185556963258941, "grad_norm": 0.2851567268371582, "learning_rate": 3.4778194882241244e-06, "loss": 0.015855439007282257, "memory(GiB)": 21.48, "step": 19041, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.954488 }, { "epoch": 0.6185881817886496, "grad_norm": 0.282055526971817, "learning_rate": 3.4773078412415205e-06, "loss": 0.015552612021565437, "memory(GiB)": 21.48, "step": 19042, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954498 }, { "epoch": 0.618620667251405, "grad_norm": 0.46850109100341797, "learning_rate": 3.476796211832809e-06, "loss": 0.019350556656718254, "memory(GiB)": 21.48, "step": 19043, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.954508 }, { "epoch": 0.6186531527141604, "grad_norm": 0.32194212079048157, "learning_rate": 3.4762846000038973e-06, "loss": 0.01485456246882677, "memory(GiB)": 21.48, "step": 19044, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954518 }, { "epoch": 0.6186856381769158, "grad_norm": 0.2218204140663147, "learning_rate": 3.47577300576069e-06, "loss": 0.010042141191661358, "memory(GiB)": 21.48, "step": 19045, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954528 }, { "epoch": 0.6187181236396713, "grad_norm": 0.35291367769241333, "learning_rate": 3.4752614291090895e-06, "loss": 0.02127043530344963, "memory(GiB)": 21.48, "step": 19046, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954539 }, { "epoch": 0.6187506091024266, "grad_norm": 0.4049261212348938, "learning_rate": 3.474749870055003e-06, "loss": 0.02036678045988083, "memory(GiB)": 21.48, "step": 19047, "token_acc": 0.99375, "train_speed(iter/s)": 0.954549 }, { "epoch": 0.6187830945651821, "grad_norm": 0.3448660373687744, "learning_rate": 3.4742383286043324e-06, "loss": 0.02100684493780136, "memory(GiB)": 21.48, "step": 19048, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95456 }, { "epoch": 0.6188155800279375, "grad_norm": 0.49552208185195923, "learning_rate": 3.473726804762984e-06, "loss": 0.024986324831843376, "memory(GiB)": 21.48, "step": 19049, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.954571 }, { "epoch": 0.6188480654906929, "grad_norm": 0.39831891655921936, "learning_rate": 3.473215298536856e-06, "loss": 0.021402092650532722, "memory(GiB)": 21.48, "step": 19050, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954581 }, { "epoch": 0.6188805509534483, "grad_norm": 0.3093123137950897, "learning_rate": 3.47270380993186e-06, "loss": 0.014488086104393005, "memory(GiB)": 21.48, "step": 19051, "token_acc": 1.0, "train_speed(iter/s)": 0.954589 }, { "epoch": 0.6189130364162038, "grad_norm": 0.5113843083381653, "learning_rate": 3.4721923389538907e-06, "loss": 0.021512335166335106, "memory(GiB)": 21.48, "step": 19052, "token_acc": 1.0, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.6189455218789591, "grad_norm": 0.4011135995388031, "learning_rate": 3.471680885608859e-06, "loss": 0.019602037966251373, "memory(GiB)": 21.48, "step": 19053, "token_acc": 0.984, "train_speed(iter/s)": 0.954605 }, { "epoch": 0.6189780073417146, "grad_norm": 0.25818654894828796, "learning_rate": 3.4711694499026618e-06, "loss": 0.016427982598543167, "memory(GiB)": 21.48, "step": 19054, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954613 }, { "epoch": 0.61901049280447, "grad_norm": 0.2971278727054596, "learning_rate": 3.4706580318412053e-06, "loss": 0.014380890876054764, "memory(GiB)": 21.48, "step": 19055, "token_acc": 1.0, "train_speed(iter/s)": 0.95462 }, { "epoch": 0.6190429782672254, "grad_norm": 0.3671371042728424, "learning_rate": 3.4701466314303888e-06, "loss": 0.013149000704288483, "memory(GiB)": 21.48, "step": 19056, "token_acc": 1.0, "train_speed(iter/s)": 0.954629 }, { "epoch": 0.6190754637299808, "grad_norm": 0.3625037968158722, "learning_rate": 3.469635248676119e-06, "loss": 0.021341480314731598, "memory(GiB)": 21.48, "step": 19057, "token_acc": 0.9875, "train_speed(iter/s)": 0.954637 }, { "epoch": 0.6191079491927363, "grad_norm": 0.32303735613822937, "learning_rate": 3.469123883584293e-06, "loss": 0.011968324892222881, "memory(GiB)": 21.48, "step": 19058, "token_acc": 1.0, "train_speed(iter/s)": 0.954646 }, { "epoch": 0.6191404346554916, "grad_norm": 0.23297075927257538, "learning_rate": 3.4686125361608146e-06, "loss": 0.013613294810056686, "memory(GiB)": 21.48, "step": 19059, "token_acc": 0.9921875, "train_speed(iter/s)": 0.954655 }, { "epoch": 0.6191729201182471, "grad_norm": 0.2443230301141739, "learning_rate": 3.4681012064115884e-06, "loss": 0.01025715097784996, "memory(GiB)": 21.48, "step": 19060, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.6192054055810025, "grad_norm": 0.3626115620136261, "learning_rate": 3.4675898943425114e-06, "loss": 0.020103121176362038, "memory(GiB)": 21.48, "step": 19061, "token_acc": 1.0, "train_speed(iter/s)": 0.95467 }, { "epoch": 0.6192378910437579, "grad_norm": 0.3739258646965027, "learning_rate": 3.467078599959487e-06, "loss": 0.013380290940403938, "memory(GiB)": 21.48, "step": 19062, "token_acc": 1.0, "train_speed(iter/s)": 0.954678 }, { "epoch": 0.6192703765065133, "grad_norm": 0.29686981439590454, "learning_rate": 3.466567323268416e-06, "loss": 0.021887099370360374, "memory(GiB)": 21.48, "step": 19063, "token_acc": 1.0, "train_speed(iter/s)": 0.954687 }, { "epoch": 0.6193028619692688, "grad_norm": 0.2937301695346832, "learning_rate": 3.4660560642751993e-06, "loss": 0.019064147025346756, "memory(GiB)": 21.48, "step": 19064, "token_acc": 1.0, "train_speed(iter/s)": 0.954695 }, { "epoch": 0.6193353474320241, "grad_norm": 0.4489496648311615, "learning_rate": 3.465544822985737e-06, "loss": 0.023296911269426346, "memory(GiB)": 21.48, "step": 19065, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.954703 }, { "epoch": 0.6193678328947796, "grad_norm": 0.44687148928642273, "learning_rate": 3.4650335994059302e-06, "loss": 0.018584327772259712, "memory(GiB)": 21.48, "step": 19066, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954712 }, { "epoch": 0.619400318357535, "grad_norm": 0.3326648473739624, "learning_rate": 3.464522393541677e-06, "loss": 0.014582859352231026, "memory(GiB)": 21.48, "step": 19067, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95472 }, { "epoch": 0.6194328038202904, "grad_norm": 0.34611669182777405, "learning_rate": 3.464011205398882e-06, "loss": 0.013763777911663055, "memory(GiB)": 21.48, "step": 19068, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954728 }, { "epoch": 0.6194652892830458, "grad_norm": 0.4146789014339447, "learning_rate": 3.46350003498344e-06, "loss": 0.019267790019512177, "memory(GiB)": 21.48, "step": 19069, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.6194977747458013, "grad_norm": 0.4209214448928833, "learning_rate": 3.4629888823012534e-06, "loss": 0.022362299263477325, "memory(GiB)": 21.48, "step": 19070, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954742 }, { "epoch": 0.6195302602085566, "grad_norm": 0.2579928934574127, "learning_rate": 3.46247774735822e-06, "loss": 0.013943667523562908, "memory(GiB)": 21.48, "step": 19071, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954753 }, { "epoch": 0.6195627456713121, "grad_norm": 0.5854091048240662, "learning_rate": 3.4619666301602425e-06, "loss": 0.0174972303211689, "memory(GiB)": 21.48, "step": 19072, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.6195952311340676, "grad_norm": 0.3455525040626526, "learning_rate": 3.4614555307132137e-06, "loss": 0.013714531436562538, "memory(GiB)": 21.48, "step": 19073, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.6196277165968229, "grad_norm": 0.4096132516860962, "learning_rate": 3.4609444490230363e-06, "loss": 0.020002298057079315, "memory(GiB)": 21.48, "step": 19074, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954786 }, { "epoch": 0.6196602020595784, "grad_norm": 0.9092133641242981, "learning_rate": 3.46043338509561e-06, "loss": 0.019942421466112137, "memory(GiB)": 21.48, "step": 19075, "token_acc": 1.0, "train_speed(iter/s)": 0.954796 }, { "epoch": 0.6196926875223338, "grad_norm": 0.6574597358703613, "learning_rate": 3.4599223389368308e-06, "loss": 0.01943942904472351, "memory(GiB)": 21.48, "step": 19076, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954794 }, { "epoch": 0.6197251729850892, "grad_norm": 0.28269535303115845, "learning_rate": 3.4594113105525997e-06, "loss": 0.016449768096208572, "memory(GiB)": 21.48, "step": 19077, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.954805 }, { "epoch": 0.6197576584478446, "grad_norm": 0.3596780300140381, "learning_rate": 3.458900299948809e-06, "loss": 0.017578307539224625, "memory(GiB)": 21.48, "step": 19078, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.954815 }, { "epoch": 0.6197901439106, "grad_norm": 0.32353198528289795, "learning_rate": 3.4583893071313634e-06, "loss": 0.024769287556409836, "memory(GiB)": 21.48, "step": 19079, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.954823 }, { "epoch": 0.6198226293733554, "grad_norm": 0.43452247977256775, "learning_rate": 3.4578783321061537e-06, "loss": 0.016704950481653214, "memory(GiB)": 21.48, "step": 19080, "token_acc": 1.0, "train_speed(iter/s)": 0.954832 }, { "epoch": 0.6198551148361109, "grad_norm": 0.41344690322875977, "learning_rate": 3.457367374879085e-06, "loss": 0.013049155473709106, "memory(GiB)": 21.48, "step": 19081, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.954841 }, { "epoch": 0.6198876002988662, "grad_norm": 0.35559436678886414, "learning_rate": 3.4568564354560463e-06, "loss": 0.01442713662981987, "memory(GiB)": 21.48, "step": 19082, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.6199200857616217, "grad_norm": 0.3736937642097473, "learning_rate": 3.45634551384294e-06, "loss": 0.021339885890483856, "memory(GiB)": 21.48, "step": 19083, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.954858 }, { "epoch": 0.6199525712243771, "grad_norm": 0.3699527978897095, "learning_rate": 3.45583461004566e-06, "loss": 0.017953889444470406, "memory(GiB)": 21.48, "step": 19084, "token_acc": 0.995, "train_speed(iter/s)": 0.954867 }, { "epoch": 0.6199850566871326, "grad_norm": 0.4554787576198578, "learning_rate": 3.4553237240701054e-06, "loss": 0.012648707255721092, "memory(GiB)": 21.48, "step": 19085, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954875 }, { "epoch": 0.6200175421498879, "grad_norm": 0.41489502787590027, "learning_rate": 3.4548128559221695e-06, "loss": 0.019885201007127762, "memory(GiB)": 21.48, "step": 19086, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.6200500276126434, "grad_norm": 0.3629918098449707, "learning_rate": 3.4543020056077505e-06, "loss": 0.016848472878336906, "memory(GiB)": 21.48, "step": 19087, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954893 }, { "epoch": 0.6200825130753987, "grad_norm": 0.45299032330513, "learning_rate": 3.4537911731327435e-06, "loss": 0.0174470916390419, "memory(GiB)": 21.48, "step": 19088, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954901 }, { "epoch": 0.6201149985381542, "grad_norm": 0.44160494208335876, "learning_rate": 3.4532803585030437e-06, "loss": 0.019875651225447655, "memory(GiB)": 21.48, "step": 19089, "token_acc": 1.0, "train_speed(iter/s)": 0.954909 }, { "epoch": 0.6201474840009096, "grad_norm": 0.31195926666259766, "learning_rate": 3.4527695617245484e-06, "loss": 0.016825411468744278, "memory(GiB)": 21.48, "step": 19090, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.954917 }, { "epoch": 0.620179969463665, "grad_norm": 0.4338504374027252, "learning_rate": 3.4522587828031504e-06, "loss": 0.01817493885755539, "memory(GiB)": 21.48, "step": 19091, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954926 }, { "epoch": 0.6202124549264204, "grad_norm": 0.2923707067966461, "learning_rate": 3.4517480217447473e-06, "loss": 0.017977438867092133, "memory(GiB)": 21.48, "step": 19092, "token_acc": 1.0, "train_speed(iter/s)": 0.954934 }, { "epoch": 0.6202449403891759, "grad_norm": 0.3225628435611725, "learning_rate": 3.4512372785552322e-06, "loss": 0.01932995766401291, "memory(GiB)": 21.48, "step": 19093, "token_acc": 1.0, "train_speed(iter/s)": 0.954943 }, { "epoch": 0.6202774258519312, "grad_norm": 0.42706793546676636, "learning_rate": 3.450726553240501e-06, "loss": 0.01752777397632599, "memory(GiB)": 21.48, "step": 19094, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954952 }, { "epoch": 0.6203099113146867, "grad_norm": 0.34946098923683167, "learning_rate": 3.4502158458064467e-06, "loss": 0.013479320332407951, "memory(GiB)": 21.48, "step": 19095, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954961 }, { "epoch": 0.6203423967774421, "grad_norm": 0.2603797912597656, "learning_rate": 3.4497051562589645e-06, "loss": 0.01520673930644989, "memory(GiB)": 21.48, "step": 19096, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.6203748822401975, "grad_norm": 0.30074450373649597, "learning_rate": 3.4491944846039483e-06, "loss": 0.01261205319315195, "memory(GiB)": 21.48, "step": 19097, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.6204073677029529, "grad_norm": 0.39680030941963196, "learning_rate": 3.4486838308472927e-06, "loss": 0.021548481658101082, "memory(GiB)": 21.48, "step": 19098, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.6204398531657084, "grad_norm": 0.34484437108039856, "learning_rate": 3.4481731949948887e-06, "loss": 0.012416976504027843, "memory(GiB)": 21.48, "step": 19099, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954994 }, { "epoch": 0.6204723386284637, "grad_norm": 0.32711079716682434, "learning_rate": 3.447662577052634e-06, "loss": 0.020572971552610397, "memory(GiB)": 21.48, "step": 19100, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955003 }, { "epoch": 0.6205048240912192, "grad_norm": 0.43413299322128296, "learning_rate": 3.447151977026416e-06, "loss": 0.021442731842398643, "memory(GiB)": 21.48, "step": 19101, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955012 }, { "epoch": 0.6205373095539746, "grad_norm": 0.528634250164032, "learning_rate": 3.4466413949221345e-06, "loss": 0.024032320827245712, "memory(GiB)": 21.48, "step": 19102, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955021 }, { "epoch": 0.62056979501673, "grad_norm": 0.30193713307380676, "learning_rate": 3.4461308307456755e-06, "loss": 0.014610398560762405, "memory(GiB)": 21.48, "step": 19103, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.6206022804794854, "grad_norm": 0.7062066197395325, "learning_rate": 3.4456202845029385e-06, "loss": 0.021222367882728577, "memory(GiB)": 21.48, "step": 19104, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.6206347659422409, "grad_norm": 0.3226583003997803, "learning_rate": 3.4451097561998092e-06, "loss": 0.016768652945756912, "memory(GiB)": 21.48, "step": 19105, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.6206672514049962, "grad_norm": 0.28226953744888306, "learning_rate": 3.444599245842183e-06, "loss": 0.014523716643452644, "memory(GiB)": 21.48, "step": 19106, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.955063 }, { "epoch": 0.6206997368677517, "grad_norm": 0.45401731133461, "learning_rate": 3.4440887534359543e-06, "loss": 0.023083530366420746, "memory(GiB)": 21.48, "step": 19107, "token_acc": 0.98125, "train_speed(iter/s)": 0.955074 }, { "epoch": 0.6207322223305071, "grad_norm": 0.26618367433547974, "learning_rate": 3.4435782789870087e-06, "loss": 0.015669111162424088, "memory(GiB)": 21.48, "step": 19108, "token_acc": 0.975609756097561, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.6207647077932625, "grad_norm": 0.29499512910842896, "learning_rate": 3.443067822501245e-06, "loss": 0.012059498578310013, "memory(GiB)": 21.48, "step": 19109, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.6207971932560179, "grad_norm": 0.2779463231563568, "learning_rate": 3.4425573839845487e-06, "loss": 0.011759212240576744, "memory(GiB)": 21.48, "step": 19110, "token_acc": 1.0, "train_speed(iter/s)": 0.955103 }, { "epoch": 0.6208296787187734, "grad_norm": 0.41787102818489075, "learning_rate": 3.4420469634428145e-06, "loss": 0.01963573694229126, "memory(GiB)": 21.48, "step": 19111, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.6208621641815287, "grad_norm": 0.2881034016609192, "learning_rate": 3.4415365608819308e-06, "loss": 0.018826451152563095, "memory(GiB)": 21.48, "step": 19112, "token_acc": 0.9932432432432432, "train_speed(iter/s)": 0.955118 }, { "epoch": 0.6208946496442842, "grad_norm": 0.4030798375606537, "learning_rate": 3.44102617630779e-06, "loss": 0.020796993747353554, "memory(GiB)": 21.48, "step": 19113, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955126 }, { "epoch": 0.6209271351070396, "grad_norm": 0.3282473087310791, "learning_rate": 3.4405158097262816e-06, "loss": 0.015319577418267727, "memory(GiB)": 21.48, "step": 19114, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955135 }, { "epoch": 0.620959620569795, "grad_norm": 0.34987911581993103, "learning_rate": 3.4400054611432975e-06, "loss": 0.01480552926659584, "memory(GiB)": 21.48, "step": 19115, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.6209921060325504, "grad_norm": 0.2737952172756195, "learning_rate": 3.439495130564725e-06, "loss": 0.01185663603246212, "memory(GiB)": 21.48, "step": 19116, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955152 }, { "epoch": 0.6210245914953059, "grad_norm": 0.3814541697502136, "learning_rate": 3.438984817996458e-06, "loss": 0.015291870571672916, "memory(GiB)": 21.48, "step": 19117, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955161 }, { "epoch": 0.6210570769580612, "grad_norm": 0.2960009276866913, "learning_rate": 3.4384745234443816e-06, "loss": 0.010723106563091278, "memory(GiB)": 21.48, "step": 19118, "token_acc": 1.0, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.6210895624208167, "grad_norm": 0.3915761709213257, "learning_rate": 3.437964246914389e-06, "loss": 0.02271365560591221, "memory(GiB)": 21.48, "step": 19119, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955176 }, { "epoch": 0.6211220478835721, "grad_norm": 0.40179556608200073, "learning_rate": 3.4374539884123675e-06, "loss": 0.021443773061037064, "memory(GiB)": 21.48, "step": 19120, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955185 }, { "epoch": 0.6211545333463275, "grad_norm": 0.3909373879432678, "learning_rate": 3.4369437479442063e-06, "loss": 0.020774900913238525, "memory(GiB)": 21.48, "step": 19121, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.955194 }, { "epoch": 0.6211870188090829, "grad_norm": 0.3232308626174927, "learning_rate": 3.436433525515796e-06, "loss": 0.0162834282964468, "memory(GiB)": 21.48, "step": 19122, "token_acc": 1.0, "train_speed(iter/s)": 0.955203 }, { "epoch": 0.6212195042718384, "grad_norm": 0.3064877390861511, "learning_rate": 3.435923321133023e-06, "loss": 0.01214313879609108, "memory(GiB)": 21.48, "step": 19123, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.6212519897345937, "grad_norm": 0.35169902443885803, "learning_rate": 3.435413134801777e-06, "loss": 0.01876853033900261, "memory(GiB)": 21.48, "step": 19124, "token_acc": 1.0, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.6212844751973492, "grad_norm": 0.24971356987953186, "learning_rate": 3.4349029665279454e-06, "loss": 0.012435223907232285, "memory(GiB)": 21.48, "step": 19125, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.6213169606601046, "grad_norm": 0.8599168062210083, "learning_rate": 3.434392816317418e-06, "loss": 0.016373174265027046, "memory(GiB)": 21.48, "step": 19126, "token_acc": 1.0, "train_speed(iter/s)": 0.955235 }, { "epoch": 0.62134944612286, "grad_norm": 0.3367452323436737, "learning_rate": 3.4338826841760797e-06, "loss": 0.01775027997791767, "memory(GiB)": 21.48, "step": 19127, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955243 }, { "epoch": 0.6213819315856154, "grad_norm": 0.3373224139213562, "learning_rate": 3.433372570109822e-06, "loss": 0.016970135271549225, "memory(GiB)": 21.48, "step": 19128, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955252 }, { "epoch": 0.6214144170483709, "grad_norm": 0.3460316061973572, "learning_rate": 3.4328624741245264e-06, "loss": 0.018859386444091797, "memory(GiB)": 21.48, "step": 19129, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.6214469025111262, "grad_norm": 0.23116087913513184, "learning_rate": 3.4323523962260876e-06, "loss": 0.014240124262869358, "memory(GiB)": 21.48, "step": 19130, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95527 }, { "epoch": 0.6214793879738817, "grad_norm": 0.4074990153312683, "learning_rate": 3.431842336420385e-06, "loss": 0.01718303933739662, "memory(GiB)": 21.48, "step": 19131, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955281 }, { "epoch": 0.621511873436637, "grad_norm": 0.3071555495262146, "learning_rate": 3.431332294713313e-06, "loss": 0.0141874048858881, "memory(GiB)": 21.48, "step": 19132, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955292 }, { "epoch": 0.6215443588993925, "grad_norm": 0.24960929155349731, "learning_rate": 3.4308222711107507e-06, "loss": 0.01600884646177292, "memory(GiB)": 21.48, "step": 19133, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955302 }, { "epoch": 0.6215768443621479, "grad_norm": 0.4407437741756439, "learning_rate": 3.4303122656185906e-06, "loss": 0.017999716103076935, "memory(GiB)": 21.48, "step": 19134, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.6216093298249034, "grad_norm": 0.37863394618034363, "learning_rate": 3.429802278242714e-06, "loss": 0.02069842629134655, "memory(GiB)": 21.48, "step": 19135, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.6216418152876588, "grad_norm": 0.26466104388237, "learning_rate": 3.4292923089890084e-06, "loss": 0.01630524918437004, "memory(GiB)": 21.48, "step": 19136, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955335 }, { "epoch": 0.6216743007504142, "grad_norm": 0.28079918026924133, "learning_rate": 3.4287823578633633e-06, "loss": 0.012808965519070625, "memory(GiB)": 21.48, "step": 19137, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.6217067862131697, "grad_norm": 0.41253936290740967, "learning_rate": 3.428272424871658e-06, "loss": 0.013944975100457668, "memory(GiB)": 21.48, "step": 19138, "token_acc": 1.0, "train_speed(iter/s)": 0.955356 }, { "epoch": 0.621739271675925, "grad_norm": 0.3409872353076935, "learning_rate": 3.4277625100197843e-06, "loss": 0.01187003031373024, "memory(GiB)": 21.48, "step": 19139, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.6217717571386805, "grad_norm": 0.32455432415008545, "learning_rate": 3.427252613313621e-06, "loss": 0.017129182815551758, "memory(GiB)": 21.48, "step": 19140, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.6218042426014359, "grad_norm": 0.2499258816242218, "learning_rate": 3.4267427347590575e-06, "loss": 0.014071213081479073, "memory(GiB)": 21.48, "step": 19141, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.6218367280641913, "grad_norm": 0.45373624563217163, "learning_rate": 3.4262328743619757e-06, "loss": 0.016768205910921097, "memory(GiB)": 21.48, "step": 19142, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.9554 }, { "epoch": 0.6218692135269467, "grad_norm": 0.5971105694770813, "learning_rate": 3.425723032128262e-06, "loss": 0.02283848449587822, "memory(GiB)": 21.48, "step": 19143, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.6219016989897022, "grad_norm": 0.24465787410736084, "learning_rate": 3.425213208063799e-06, "loss": 0.011070329695940018, "memory(GiB)": 21.48, "step": 19144, "token_acc": 1.0, "train_speed(iter/s)": 0.955419 }, { "epoch": 0.6219341844524575, "grad_norm": 0.29427477717399597, "learning_rate": 3.424703402174473e-06, "loss": 0.014252850785851479, "memory(GiB)": 21.48, "step": 19145, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.621966669915213, "grad_norm": 0.3598318099975586, "learning_rate": 3.4241936144661647e-06, "loss": 0.0162479467689991, "memory(GiB)": 21.48, "step": 19146, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.6219991553779683, "grad_norm": 0.7441145777702332, "learning_rate": 3.4236838449447613e-06, "loss": 0.02142801322042942, "memory(GiB)": 21.48, "step": 19147, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.6220316408407238, "grad_norm": 0.4189569652080536, "learning_rate": 3.423174093616143e-06, "loss": 0.015646835789084435, "memory(GiB)": 21.48, "step": 19148, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.6220641263034792, "grad_norm": 1.20085871219635, "learning_rate": 3.4226643604861954e-06, "loss": 0.018460936844348907, "memory(GiB)": 21.48, "step": 19149, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955461 }, { "epoch": 0.6220966117662347, "grad_norm": 0.3063655495643616, "learning_rate": 3.422154645560799e-06, "loss": 0.01526677142828703, "memory(GiB)": 21.48, "step": 19150, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.62212909722899, "grad_norm": 0.4413079619407654, "learning_rate": 3.421644948845839e-06, "loss": 0.018973078578710556, "memory(GiB)": 21.48, "step": 19151, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955479 }, { "epoch": 0.6221615826917455, "grad_norm": 0.38147032260894775, "learning_rate": 3.421135270347198e-06, "loss": 0.02289782650768757, "memory(GiB)": 21.48, "step": 19152, "token_acc": 0.98, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.6221940681545008, "grad_norm": 0.31276917457580566, "learning_rate": 3.420625610070756e-06, "loss": 0.016709426417946815, "memory(GiB)": 21.48, "step": 19153, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.6222265536172563, "grad_norm": 0.3725022077560425, "learning_rate": 3.4201159680223977e-06, "loss": 0.016592448577284813, "memory(GiB)": 21.48, "step": 19154, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.955503 }, { "epoch": 0.6222590390800117, "grad_norm": 0.28537100553512573, "learning_rate": 3.419606344208003e-06, "loss": 0.017049819231033325, "memory(GiB)": 21.48, "step": 19155, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.6222915245427671, "grad_norm": 0.3366432189941406, "learning_rate": 3.4190967386334555e-06, "loss": 0.016209539026021957, "memory(GiB)": 21.48, "step": 19156, "token_acc": 1.0, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.6223240100055225, "grad_norm": 0.44156768918037415, "learning_rate": 3.418587151304635e-06, "loss": 0.020906632766127586, "memory(GiB)": 21.48, "step": 19157, "token_acc": 0.9776536312849162, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.622356495468278, "grad_norm": 0.37360307574272156, "learning_rate": 3.4180775822274255e-06, "loss": 0.015105542726814747, "memory(GiB)": 21.48, "step": 19158, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.6223889809310333, "grad_norm": 0.32083776593208313, "learning_rate": 3.417568031407703e-06, "loss": 0.016213076189160347, "memory(GiB)": 21.48, "step": 19159, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.6224214663937888, "grad_norm": 0.35677316784858704, "learning_rate": 3.4170584988513557e-06, "loss": 0.01687629148364067, "memory(GiB)": 21.48, "step": 19160, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.6224539518565442, "grad_norm": 0.5534908175468445, "learning_rate": 3.4165489845642557e-06, "loss": 0.018468229100108147, "memory(GiB)": 21.48, "step": 19161, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955561 }, { "epoch": 0.6224864373192996, "grad_norm": 0.24987007677555084, "learning_rate": 3.416039488552292e-06, "loss": 0.012671593576669693, "memory(GiB)": 21.48, "step": 19162, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.622518922782055, "grad_norm": 0.3421264886856079, "learning_rate": 3.4155300108213386e-06, "loss": 0.023431526497006416, "memory(GiB)": 21.48, "step": 19163, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.6225514082448105, "grad_norm": 0.2621375024318695, "learning_rate": 3.4150205513772782e-06, "loss": 0.011699273250997066, "memory(GiB)": 21.48, "step": 19164, "token_acc": 1.0, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.6225838937075658, "grad_norm": 0.3434818685054779, "learning_rate": 3.4145111102259896e-06, "loss": 0.013943982310593128, "memory(GiB)": 21.48, "step": 19165, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.6226163791703213, "grad_norm": 0.3714751601219177, "learning_rate": 3.414001687373355e-06, "loss": 0.019387220963835716, "memory(GiB)": 21.48, "step": 19166, "token_acc": 1.0, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.6226488646330767, "grad_norm": 0.20112431049346924, "learning_rate": 3.41349228282525e-06, "loss": 0.007372095249593258, "memory(GiB)": 21.48, "step": 19167, "token_acc": 1.0, "train_speed(iter/s)": 0.955611 }, { "epoch": 0.6226813500958321, "grad_norm": 0.32462283968925476, "learning_rate": 3.4129828965875554e-06, "loss": 0.012471238151192665, "memory(GiB)": 21.48, "step": 19168, "token_acc": 1.0, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.6227138355585875, "grad_norm": 0.42680948972702026, "learning_rate": 3.4124735286661525e-06, "loss": 0.020418304949998856, "memory(GiB)": 21.48, "step": 19169, "token_acc": 1.0, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.622746321021343, "grad_norm": 0.3603937327861786, "learning_rate": 3.4119641790669163e-06, "loss": 0.020829742774367332, "memory(GiB)": 21.48, "step": 19170, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955637 }, { "epoch": 0.6227788064840983, "grad_norm": 0.3590390980243683, "learning_rate": 3.4114548477957282e-06, "loss": 0.01745505817234516, "memory(GiB)": 21.48, "step": 19171, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.6228112919468538, "grad_norm": 0.4308986961841583, "learning_rate": 3.4109455348584645e-06, "loss": 0.01499335840344429, "memory(GiB)": 21.48, "step": 19172, "token_acc": 1.0, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.6228437774096092, "grad_norm": 0.41186439990997314, "learning_rate": 3.4104362402610057e-06, "loss": 0.02142970636487007, "memory(GiB)": 21.48, "step": 19173, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.955662 }, { "epoch": 0.6228762628723646, "grad_norm": 0.4094727337360382, "learning_rate": 3.409926964009227e-06, "loss": 0.024113427847623825, "memory(GiB)": 21.48, "step": 19174, "token_acc": 1.0, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.62290874833512, "grad_norm": 0.28400227427482605, "learning_rate": 3.409417706109009e-06, "loss": 0.014080110937356949, "memory(GiB)": 21.48, "step": 19175, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.6229412337978755, "grad_norm": 0.3440425992012024, "learning_rate": 3.4089084665662264e-06, "loss": 0.014567104168236256, "memory(GiB)": 21.48, "step": 19176, "token_acc": 1.0, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.6229737192606308, "grad_norm": 0.3038627803325653, "learning_rate": 3.4083992453867586e-06, "loss": 0.014154102653265, "memory(GiB)": 21.48, "step": 19177, "token_acc": 1.0, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.6230062047233863, "grad_norm": 0.5875183343887329, "learning_rate": 3.4078900425764814e-06, "loss": 0.028294021263718605, "memory(GiB)": 21.48, "step": 19178, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.9557 }, { "epoch": 0.6230386901861417, "grad_norm": 0.2957417964935303, "learning_rate": 3.407380858141273e-06, "loss": 0.01790086179971695, "memory(GiB)": 21.48, "step": 19179, "token_acc": 1.0, "train_speed(iter/s)": 0.955708 }, { "epoch": 0.6230711756488971, "grad_norm": 0.35661202669143677, "learning_rate": 3.4068716920870082e-06, "loss": 0.019278064370155334, "memory(GiB)": 21.48, "step": 19180, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955717 }, { "epoch": 0.6231036611116525, "grad_norm": 0.22608275711536407, "learning_rate": 3.406362544419567e-06, "loss": 0.009414441883563995, "memory(GiB)": 21.48, "step": 19181, "token_acc": 1.0, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.623136146574408, "grad_norm": 0.38664332032203674, "learning_rate": 3.4058534151448197e-06, "loss": 0.016523122787475586, "memory(GiB)": 21.48, "step": 19182, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.6231686320371633, "grad_norm": 0.40129315853118896, "learning_rate": 3.4053443042686463e-06, "loss": 0.014102807268500328, "memory(GiB)": 21.48, "step": 19183, "token_acc": 1.0, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.6232011174999188, "grad_norm": 0.38301557302474976, "learning_rate": 3.404835211796924e-06, "loss": 0.022325603291392326, "memory(GiB)": 21.48, "step": 19184, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955748 }, { "epoch": 0.6232336029626742, "grad_norm": 0.48627930879592896, "learning_rate": 3.4043261377355253e-06, "loss": 0.02828509360551834, "memory(GiB)": 21.48, "step": 19185, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.955755 }, { "epoch": 0.6232660884254296, "grad_norm": 0.393595814704895, "learning_rate": 3.4038170820903283e-06, "loss": 0.015301000326871872, "memory(GiB)": 21.48, "step": 19186, "token_acc": 1.0, "train_speed(iter/s)": 0.955763 }, { "epoch": 0.623298573888185, "grad_norm": 0.3154309093952179, "learning_rate": 3.403308044867204e-06, "loss": 0.020407501608133316, "memory(GiB)": 21.48, "step": 19187, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955772 }, { "epoch": 0.6233310593509405, "grad_norm": 0.3112899363040924, "learning_rate": 3.4027990260720333e-06, "loss": 0.011884592473506927, "memory(GiB)": 21.48, "step": 19188, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.6233635448136958, "grad_norm": 0.49752214550971985, "learning_rate": 3.4022900257106845e-06, "loss": 0.023048873990774155, "memory(GiB)": 21.48, "step": 19189, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.6233960302764513, "grad_norm": 0.5142568945884705, "learning_rate": 3.4017810437890384e-06, "loss": 0.019719498232007027, "memory(GiB)": 21.48, "step": 19190, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955802 }, { "epoch": 0.6234285157392067, "grad_norm": 0.4153995215892792, "learning_rate": 3.401272080312964e-06, "loss": 0.01911068521440029, "memory(GiB)": 21.48, "step": 19191, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.6234610012019621, "grad_norm": 0.31780385971069336, "learning_rate": 3.4007631352883397e-06, "loss": 0.017849892377853394, "memory(GiB)": 21.48, "step": 19192, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.6234934866647175, "grad_norm": 0.30081188678741455, "learning_rate": 3.4002542087210353e-06, "loss": 0.013673335313796997, "memory(GiB)": 21.48, "step": 19193, "token_acc": 1.0, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.623525972127473, "grad_norm": 0.4543898105621338, "learning_rate": 3.399745300616928e-06, "loss": 0.023825082927942276, "memory(GiB)": 21.48, "step": 19194, "token_acc": 0.9899665551839465, "train_speed(iter/s)": 0.955843 }, { "epoch": 0.6235584575902283, "grad_norm": 0.3750530183315277, "learning_rate": 3.399236410981889e-06, "loss": 0.014316053129732609, "memory(GiB)": 21.48, "step": 19195, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955854 }, { "epoch": 0.6235909430529838, "grad_norm": 0.676935076713562, "learning_rate": 3.398727539821793e-06, "loss": 0.018473461270332336, "memory(GiB)": 21.48, "step": 19196, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955864 }, { "epoch": 0.6236234285157392, "grad_norm": 0.34990984201431274, "learning_rate": 3.3982186871425117e-06, "loss": 0.01786050572991371, "memory(GiB)": 21.48, "step": 19197, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.955874 }, { "epoch": 0.6236559139784946, "grad_norm": 0.38337892293930054, "learning_rate": 3.3977098529499185e-06, "loss": 0.01805686578154564, "memory(GiB)": 21.48, "step": 19198, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.62368839944125, "grad_norm": 0.5757330060005188, "learning_rate": 3.3972010372498875e-06, "loss": 0.022029362618923187, "memory(GiB)": 21.48, "step": 19199, "token_acc": 0.9817073170731707, "train_speed(iter/s)": 0.955894 }, { "epoch": 0.6237208849040055, "grad_norm": 0.33264607191085815, "learning_rate": 3.3966922400482887e-06, "loss": 0.014605406671762466, "memory(GiB)": 21.48, "step": 19200, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955904 }, { "epoch": 0.6237533703667609, "grad_norm": 0.44850388169288635, "learning_rate": 3.396183461350997e-06, "loss": 0.014659841544926167, "memory(GiB)": 21.48, "step": 19201, "token_acc": 1.0, "train_speed(iter/s)": 0.955915 }, { "epoch": 0.6237858558295163, "grad_norm": 0.37229591608047485, "learning_rate": 3.3956747011638814e-06, "loss": 0.01540185697376728, "memory(GiB)": 21.48, "step": 19202, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.6238183412922718, "grad_norm": 0.4734838306903839, "learning_rate": 3.395165959492816e-06, "loss": 0.02494635432958603, "memory(GiB)": 21.48, "step": 19203, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955936 }, { "epoch": 0.6238508267550271, "grad_norm": 0.4039296805858612, "learning_rate": 3.3946572363436702e-06, "loss": 0.021750999614596367, "memory(GiB)": 21.48, "step": 19204, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.6238833122177826, "grad_norm": 0.3497501313686371, "learning_rate": 3.394148531722318e-06, "loss": 0.023312732577323914, "memory(GiB)": 21.48, "step": 19205, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.623915797680538, "grad_norm": 0.9700083136558533, "learning_rate": 3.393639845634629e-06, "loss": 0.014026612043380737, "memory(GiB)": 21.48, "step": 19206, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.6239482831432934, "grad_norm": 0.42400628328323364, "learning_rate": 3.393131178086474e-06, "loss": 0.023651570081710815, "memory(GiB)": 21.48, "step": 19207, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.6239807686060488, "grad_norm": 0.3817603886127472, "learning_rate": 3.3926225290837227e-06, "loss": 0.01900913193821907, "memory(GiB)": 21.48, "step": 19208, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.6240132540688043, "grad_norm": 0.2944360375404358, "learning_rate": 3.39211389863225e-06, "loss": 0.013388924300670624, "memory(GiB)": 21.48, "step": 19209, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.6240457395315596, "grad_norm": 0.393996000289917, "learning_rate": 3.391605286737919e-06, "loss": 0.020554888993501663, "memory(GiB)": 21.48, "step": 19210, "token_acc": 0.992, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.6240782249943151, "grad_norm": 0.5320947170257568, "learning_rate": 3.391096693406607e-06, "loss": 0.023146729916334152, "memory(GiB)": 21.48, "step": 19211, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.6241107104570705, "grad_norm": 0.37201347947120667, "learning_rate": 3.390588118644178e-06, "loss": 0.014032270759344101, "memory(GiB)": 21.48, "step": 19212, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.6241431959198259, "grad_norm": 0.38563042879104614, "learning_rate": 3.390079562456505e-06, "loss": 0.01881692185997963, "memory(GiB)": 21.48, "step": 19213, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.6241756813825813, "grad_norm": 0.5080373287200928, "learning_rate": 3.3895710248494585e-06, "loss": 0.0183612909168005, "memory(GiB)": 21.48, "step": 19214, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956031 }, { "epoch": 0.6242081668453368, "grad_norm": 0.5526166558265686, "learning_rate": 3.3890625058289028e-06, "loss": 0.018924260511994362, "memory(GiB)": 21.48, "step": 19215, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.6242406523080921, "grad_norm": 0.3917330205440521, "learning_rate": 3.388554005400714e-06, "loss": 0.017842495813965797, "memory(GiB)": 21.48, "step": 19216, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.6242731377708476, "grad_norm": 0.2888984978199005, "learning_rate": 3.388045523570752e-06, "loss": 0.018471531569957733, "memory(GiB)": 21.48, "step": 19217, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956055 }, { "epoch": 0.624305623233603, "grad_norm": 0.28420326113700867, "learning_rate": 3.3875370603448944e-06, "loss": 0.015845993533730507, "memory(GiB)": 21.48, "step": 19218, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.6243381086963584, "grad_norm": 0.532132625579834, "learning_rate": 3.3870286157290032e-06, "loss": 0.013504558242857456, "memory(GiB)": 21.48, "step": 19219, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.6243705941591138, "grad_norm": 0.290345698595047, "learning_rate": 3.386520189728949e-06, "loss": 0.015406240709125996, "memory(GiB)": 21.48, "step": 19220, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956081 }, { "epoch": 0.6244030796218693, "grad_norm": 0.3159969747066498, "learning_rate": 3.386011782350599e-06, "loss": 0.014607695862650871, "memory(GiB)": 21.48, "step": 19221, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.6244355650846246, "grad_norm": 0.31588709354400635, "learning_rate": 3.385503393599822e-06, "loss": 0.018180547282099724, "memory(GiB)": 21.48, "step": 19222, "token_acc": 1.0, "train_speed(iter/s)": 0.956098 }, { "epoch": 0.6244680505473801, "grad_norm": 0.31829899549484253, "learning_rate": 3.3849950234824836e-06, "loss": 0.01506242249161005, "memory(GiB)": 21.48, "step": 19223, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.6245005360101354, "grad_norm": 0.2998839020729065, "learning_rate": 3.384486672004453e-06, "loss": 0.020726971328258514, "memory(GiB)": 21.48, "step": 19224, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.6245330214728909, "grad_norm": 0.2993139922618866, "learning_rate": 3.383978339171596e-06, "loss": 0.02146235480904579, "memory(GiB)": 21.48, "step": 19225, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.6245655069356463, "grad_norm": 0.43642425537109375, "learning_rate": 3.3834700249897813e-06, "loss": 0.014254345558583736, "memory(GiB)": 21.48, "step": 19226, "token_acc": 0.9887005649717514, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.6245979923984017, "grad_norm": 0.31902799010276794, "learning_rate": 3.382961729464872e-06, "loss": 0.024302925914525986, "memory(GiB)": 21.48, "step": 19227, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956139 }, { "epoch": 0.6246304778611571, "grad_norm": 0.37644731998443604, "learning_rate": 3.3824534526027376e-06, "loss": 0.014348618686199188, "memory(GiB)": 21.48, "step": 19228, "token_acc": 0.984, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.6246629633239126, "grad_norm": 0.26725533604621887, "learning_rate": 3.381945194409244e-06, "loss": 0.013853314332664013, "memory(GiB)": 21.48, "step": 19229, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956156 }, { "epoch": 0.6246954487866679, "grad_norm": 0.2416571080684662, "learning_rate": 3.381436954890255e-06, "loss": 0.008915325626730919, "memory(GiB)": 21.48, "step": 19230, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956164 }, { "epoch": 0.6247279342494234, "grad_norm": 0.42004260420799255, "learning_rate": 3.3809287340516393e-06, "loss": 0.024075068533420563, "memory(GiB)": 21.48, "step": 19231, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.6247604197121788, "grad_norm": 0.33758488297462463, "learning_rate": 3.38042053189926e-06, "loss": 0.016412777826189995, "memory(GiB)": 21.48, "step": 19232, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.6247929051749342, "grad_norm": 0.27687081694602966, "learning_rate": 3.379912348438984e-06, "loss": 0.009004756808280945, "memory(GiB)": 21.48, "step": 19233, "token_acc": 1.0, "train_speed(iter/s)": 0.956188 }, { "epoch": 0.6248253906376896, "grad_norm": 0.6176642179489136, "learning_rate": 3.379404183676675e-06, "loss": 0.021685119718313217, "memory(GiB)": 21.48, "step": 19234, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.6248578761004451, "grad_norm": 0.31169062852859497, "learning_rate": 3.3788960376182e-06, "loss": 0.019034193828701973, "memory(GiB)": 21.48, "step": 19235, "token_acc": 1.0, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.6248903615632004, "grad_norm": 0.6974354982376099, "learning_rate": 3.378387910269421e-06, "loss": 0.024448979645967484, "memory(GiB)": 21.48, "step": 19236, "token_acc": 0.9776357827476039, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.6249228470259559, "grad_norm": 0.45713457465171814, "learning_rate": 3.3778798016362062e-06, "loss": 0.015532540157437325, "memory(GiB)": 21.48, "step": 19237, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.956217 }, { "epoch": 0.6249553324887113, "grad_norm": 0.3434675335884094, "learning_rate": 3.3773717117244144e-06, "loss": 0.010273287072777748, "memory(GiB)": 21.48, "step": 19238, "token_acc": 1.0, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.6249878179514667, "grad_norm": 0.3394533097743988, "learning_rate": 3.3768636405399158e-06, "loss": 0.020135775208473206, "memory(GiB)": 21.48, "step": 19239, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.6250203034142221, "grad_norm": 0.23640812933444977, "learning_rate": 3.3763555880885677e-06, "loss": 0.012523893266916275, "memory(GiB)": 21.48, "step": 19240, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.95624 }, { "epoch": 0.6250527888769776, "grad_norm": 3.2100353240966797, "learning_rate": 3.3758475543762413e-06, "loss": 0.020525652915239334, "memory(GiB)": 21.48, "step": 19241, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.6250852743397329, "grad_norm": 0.49313846230506897, "learning_rate": 3.375339539408793e-06, "loss": 0.017650917172431946, "memory(GiB)": 21.48, "step": 19242, "token_acc": 1.0, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.6251177598024884, "grad_norm": 0.2743469774723053, "learning_rate": 3.3748315431920897e-06, "loss": 0.008780548349022865, "memory(GiB)": 21.48, "step": 19243, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.6251502452652438, "grad_norm": 0.30021923780441284, "learning_rate": 3.374323565731993e-06, "loss": 0.013974826782941818, "memory(GiB)": 21.48, "step": 19244, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.6251827307279992, "grad_norm": 0.266865998506546, "learning_rate": 3.373815607034365e-06, "loss": 0.014912914484739304, "memory(GiB)": 21.48, "step": 19245, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.6252152161907546, "grad_norm": 0.26288020610809326, "learning_rate": 3.3733076671050722e-06, "loss": 0.011148837395012379, "memory(GiB)": 21.48, "step": 19246, "token_acc": 1.0, "train_speed(iter/s)": 0.956286 }, { "epoch": 0.6252477016535101, "grad_norm": 0.3184545338153839, "learning_rate": 3.3727997459499724e-06, "loss": 0.014669914729893208, "memory(GiB)": 21.48, "step": 19247, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.6252801871162654, "grad_norm": 0.391066312789917, "learning_rate": 3.3722918435749296e-06, "loss": 0.020971540361642838, "memory(GiB)": 21.48, "step": 19248, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.6253126725790209, "grad_norm": 0.2508005201816559, "learning_rate": 3.371783959985805e-06, "loss": 0.011575132608413696, "memory(GiB)": 21.48, "step": 19249, "token_acc": 1.0, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.6253451580417763, "grad_norm": 0.35365310311317444, "learning_rate": 3.3712760951884615e-06, "loss": 0.019298579543828964, "memory(GiB)": 21.48, "step": 19250, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.956324 }, { "epoch": 0.6253776435045317, "grad_norm": 0.4431253671646118, "learning_rate": 3.370768249188759e-06, "loss": 0.020901568233966827, "memory(GiB)": 21.48, "step": 19251, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.6254101289672871, "grad_norm": 0.3914965093135834, "learning_rate": 3.3702604219925605e-06, "loss": 0.02935810759663582, "memory(GiB)": 21.48, "step": 19252, "token_acc": 1.0, "train_speed(iter/s)": 0.956344 }, { "epoch": 0.6254426144300426, "grad_norm": 0.3846149146556854, "learning_rate": 3.369752613605724e-06, "loss": 0.013989592902362347, "memory(GiB)": 21.48, "step": 19253, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956354 }, { "epoch": 0.6254750998927979, "grad_norm": 0.313696950674057, "learning_rate": 3.3692448240341137e-06, "loss": 0.01111937128007412, "memory(GiB)": 21.48, "step": 19254, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.6255075853555534, "grad_norm": 0.4622584283351898, "learning_rate": 3.368737053283588e-06, "loss": 0.0241091288626194, "memory(GiB)": 21.48, "step": 19255, "token_acc": 1.0, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.6255400708183088, "grad_norm": 0.44229593873023987, "learning_rate": 3.3682293013600086e-06, "loss": 0.01522394735366106, "memory(GiB)": 21.48, "step": 19256, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956386 }, { "epoch": 0.6255725562810642, "grad_norm": 0.4825470745563507, "learning_rate": 3.367721568269234e-06, "loss": 0.022836724296212196, "memory(GiB)": 21.48, "step": 19257, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.6256050417438196, "grad_norm": 0.31067216396331787, "learning_rate": 3.3672138540171255e-06, "loss": 0.015331035479903221, "memory(GiB)": 21.48, "step": 19258, "token_acc": 1.0, "train_speed(iter/s)": 0.956408 }, { "epoch": 0.6256375272065751, "grad_norm": 0.5908838510513306, "learning_rate": 3.366706158609542e-06, "loss": 0.023180721327662468, "memory(GiB)": 21.48, "step": 19259, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.6256700126693304, "grad_norm": 0.32696619629859924, "learning_rate": 3.366198482052343e-06, "loss": 0.013679256662726402, "memory(GiB)": 21.48, "step": 19260, "token_acc": 1.0, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.6257024981320859, "grad_norm": 0.3067084550857544, "learning_rate": 3.36569082435139e-06, "loss": 0.01659514755010605, "memory(GiB)": 21.48, "step": 19261, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.6257349835948413, "grad_norm": 0.24367006123065948, "learning_rate": 3.3651831855125382e-06, "loss": 0.010000238195061684, "memory(GiB)": 21.48, "step": 19262, "token_acc": 1.0, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.6257674690575967, "grad_norm": 0.22012023627758026, "learning_rate": 3.3646755655416495e-06, "loss": 0.011115407571196556, "memory(GiB)": 21.48, "step": 19263, "token_acc": 1.0, "train_speed(iter/s)": 0.95646 }, { "epoch": 0.6257999545203522, "grad_norm": 0.31993043422698975, "learning_rate": 3.3641679644445802e-06, "loss": 0.012493068352341652, "memory(GiB)": 21.48, "step": 19264, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95647 }, { "epoch": 0.6258324399831076, "grad_norm": 0.313802570104599, "learning_rate": 3.363660382227192e-06, "loss": 0.014634679071605206, "memory(GiB)": 21.48, "step": 19265, "token_acc": 1.0, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.625864925445863, "grad_norm": 0.3465121388435364, "learning_rate": 3.363152818895339e-06, "loss": 0.021325059235095978, "memory(GiB)": 21.48, "step": 19266, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.6258974109086184, "grad_norm": 0.27933940291404724, "learning_rate": 3.3626452744548833e-06, "loss": 0.014086984097957611, "memory(GiB)": 21.48, "step": 19267, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.9565 }, { "epoch": 0.6259298963713739, "grad_norm": 0.4007299840450287, "learning_rate": 3.362137748911677e-06, "loss": 0.018682312220335007, "memory(GiB)": 21.48, "step": 19268, "token_acc": 1.0, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.6259623818341292, "grad_norm": 0.43508073687553406, "learning_rate": 3.361630242271584e-06, "loss": 0.014515711925923824, "memory(GiB)": 21.48, "step": 19269, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.956516 }, { "epoch": 0.6259948672968847, "grad_norm": 0.32855862379074097, "learning_rate": 3.3611227545404575e-06, "loss": 0.018317848443984985, "memory(GiB)": 21.48, "step": 19270, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.62602735275964, "grad_norm": 0.33801472187042236, "learning_rate": 3.3606152857241563e-06, "loss": 0.02058650180697441, "memory(GiB)": 21.48, "step": 19271, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.6260598382223955, "grad_norm": 0.2202044278383255, "learning_rate": 3.3601078358285354e-06, "loss": 0.011010130867362022, "memory(GiB)": 21.48, "step": 19272, "token_acc": 1.0, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.6260923236851509, "grad_norm": 0.6234036087989807, "learning_rate": 3.3596004048594544e-06, "loss": 0.022054407745599747, "memory(GiB)": 21.48, "step": 19273, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.6261248091479064, "grad_norm": 0.7816380262374878, "learning_rate": 3.359092992822767e-06, "loss": 0.009782830253243446, "memory(GiB)": 21.48, "step": 19274, "token_acc": 1.0, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.6261572946106617, "grad_norm": 0.47547805309295654, "learning_rate": 3.35858559972433e-06, "loss": 0.01785796694457531, "memory(GiB)": 21.48, "step": 19275, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.6261897800734172, "grad_norm": 0.40226006507873535, "learning_rate": 3.3580782255700017e-06, "loss": 0.020482279360294342, "memory(GiB)": 21.48, "step": 19276, "token_acc": 0.9752066115702479, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.6262222655361726, "grad_norm": 0.28332406282424927, "learning_rate": 3.3575708703656347e-06, "loss": 0.012755781412124634, "memory(GiB)": 21.48, "step": 19277, "token_acc": 1.0, "train_speed(iter/s)": 0.956585 }, { "epoch": 0.626254750998928, "grad_norm": 0.33338454365730286, "learning_rate": 3.3570635341170867e-06, "loss": 0.018275562673807144, "memory(GiB)": 21.48, "step": 19278, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.6262872364616834, "grad_norm": 0.2857774794101715, "learning_rate": 3.3565562168302115e-06, "loss": 0.01295866072177887, "memory(GiB)": 21.48, "step": 19279, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.6263197219244389, "grad_norm": 0.37901467084884644, "learning_rate": 3.356048918510866e-06, "loss": 0.01315453089773655, "memory(GiB)": 21.48, "step": 19280, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.6263522073871942, "grad_norm": 0.3945715129375458, "learning_rate": 3.3555416391649028e-06, "loss": 0.0184542965143919, "memory(GiB)": 21.48, "step": 19281, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.6263846928499497, "grad_norm": 0.32990944385528564, "learning_rate": 3.3550343787981787e-06, "loss": 0.015823354944586754, "memory(GiB)": 21.48, "step": 19282, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956629 }, { "epoch": 0.626417178312705, "grad_norm": 0.47884291410446167, "learning_rate": 3.3545271374165466e-06, "loss": 0.017663732171058655, "memory(GiB)": 21.48, "step": 19283, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.6264496637754605, "grad_norm": 0.482601523399353, "learning_rate": 3.354019915025863e-06, "loss": 0.027152089402079582, "memory(GiB)": 21.48, "step": 19284, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.6264821492382159, "grad_norm": 0.2963466942310333, "learning_rate": 3.3535127116319787e-06, "loss": 0.01177951693534851, "memory(GiB)": 21.48, "step": 19285, "token_acc": 1.0, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.6265146347009714, "grad_norm": 0.2680601477622986, "learning_rate": 3.3530055272407502e-06, "loss": 0.014318708330392838, "memory(GiB)": 21.48, "step": 19286, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.6265471201637267, "grad_norm": 0.4608510136604309, "learning_rate": 3.352498361858029e-06, "loss": 0.026918858289718628, "memory(GiB)": 21.48, "step": 19287, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.6265796056264822, "grad_norm": 0.3051365315914154, "learning_rate": 3.3519912154896717e-06, "loss": 0.01232529804110527, "memory(GiB)": 21.48, "step": 19288, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.6266120910892375, "grad_norm": 0.34984368085861206, "learning_rate": 3.351484088141527e-06, "loss": 0.018354376778006554, "memory(GiB)": 21.48, "step": 19289, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956686 }, { "epoch": 0.626644576551993, "grad_norm": 0.37778764963150024, "learning_rate": 3.3509769798194504e-06, "loss": 0.011714947409927845, "memory(GiB)": 21.48, "step": 19290, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.6266770620147484, "grad_norm": 0.29186907410621643, "learning_rate": 3.350469890529295e-06, "loss": 0.012622914277017117, "memory(GiB)": 21.48, "step": 19291, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.6267095474775038, "grad_norm": 0.4626004993915558, "learning_rate": 3.349962820276912e-06, "loss": 0.016563480719923973, "memory(GiB)": 21.48, "step": 19292, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95671 }, { "epoch": 0.6267420329402592, "grad_norm": 0.3352077305316925, "learning_rate": 3.3494557690681563e-06, "loss": 0.011601351201534271, "memory(GiB)": 21.48, "step": 19293, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.6267745184030147, "grad_norm": 0.33136042952537537, "learning_rate": 3.348948736908874e-06, "loss": 0.011067409068346024, "memory(GiB)": 21.48, "step": 19294, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.62680700386577, "grad_norm": 0.6345282196998596, "learning_rate": 3.348441723804925e-06, "loss": 0.01973455771803856, "memory(GiB)": 21.48, "step": 19295, "token_acc": 1.0, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.6268394893285255, "grad_norm": 0.27128803730010986, "learning_rate": 3.347934729762153e-06, "loss": 0.015234168618917465, "memory(GiB)": 21.48, "step": 19296, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956743 }, { "epoch": 0.6268719747912809, "grad_norm": 0.4384164810180664, "learning_rate": 3.3474277547864163e-06, "loss": 0.015861060470342636, "memory(GiB)": 21.48, "step": 19297, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956751 }, { "epoch": 0.6269044602540363, "grad_norm": 0.3847762644290924, "learning_rate": 3.3469207988835608e-06, "loss": 0.01729843020439148, "memory(GiB)": 21.48, "step": 19298, "token_acc": 1.0, "train_speed(iter/s)": 0.956758 }, { "epoch": 0.6269369457167917, "grad_norm": 0.41210630536079407, "learning_rate": 3.346413862059441e-06, "loss": 0.018880309537053108, "memory(GiB)": 21.48, "step": 19299, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.6269694311795472, "grad_norm": 0.39392587542533875, "learning_rate": 3.3459069443199042e-06, "loss": 0.018530594184994698, "memory(GiB)": 21.48, "step": 19300, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956773 }, { "epoch": 0.6270019166423025, "grad_norm": 0.20205499231815338, "learning_rate": 3.3454000456708046e-06, "loss": 0.011960458941757679, "memory(GiB)": 21.48, "step": 19301, "token_acc": 1.0, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.627034402105058, "grad_norm": 0.3368678390979767, "learning_rate": 3.3448931661179895e-06, "loss": 0.017221098765730858, "memory(GiB)": 21.48, "step": 19302, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.6270668875678134, "grad_norm": 0.3764020800590515, "learning_rate": 3.344386305667312e-06, "loss": 0.028107067570090294, "memory(GiB)": 21.48, "step": 19303, "token_acc": 1.0, "train_speed(iter/s)": 0.956798 }, { "epoch": 0.6270993730305688, "grad_norm": 0.38032591342926025, "learning_rate": 3.343879464324618e-06, "loss": 0.017736613750457764, "memory(GiB)": 21.48, "step": 19304, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956805 }, { "epoch": 0.6271318584933242, "grad_norm": 0.33205896615982056, "learning_rate": 3.3433726420957614e-06, "loss": 0.01738830655813217, "memory(GiB)": 21.48, "step": 19305, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.6271643439560797, "grad_norm": 0.37448805570602417, "learning_rate": 3.3428658389865877e-06, "loss": 0.01805589720606804, "memory(GiB)": 21.48, "step": 19306, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.627196829418835, "grad_norm": 0.3341817855834961, "learning_rate": 3.342359055002947e-06, "loss": 0.021907245740294456, "memory(GiB)": 21.48, "step": 19307, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.6272293148815905, "grad_norm": 0.38324424624443054, "learning_rate": 3.3418522901506915e-06, "loss": 0.016318276524543762, "memory(GiB)": 21.48, "step": 19308, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.6272618003443459, "grad_norm": 0.4674995243549347, "learning_rate": 3.3413455444356664e-06, "loss": 0.021962203085422516, "memory(GiB)": 21.48, "step": 19309, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.6272942858071013, "grad_norm": 0.3309858739376068, "learning_rate": 3.340838817863722e-06, "loss": 0.01381571963429451, "memory(GiB)": 21.48, "step": 19310, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.6273267712698567, "grad_norm": 0.6236051321029663, "learning_rate": 3.340332110440705e-06, "loss": 0.016081131994724274, "memory(GiB)": 21.48, "step": 19311, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.6273592567326122, "grad_norm": 0.2639448046684265, "learning_rate": 3.339825422172466e-06, "loss": 0.01124314684420824, "memory(GiB)": 21.48, "step": 19312, "token_acc": 1.0, "train_speed(iter/s)": 0.956882 }, { "epoch": 0.6273917421953675, "grad_norm": 0.3601839542388916, "learning_rate": 3.33931875306485e-06, "loss": 0.015149777755141258, "memory(GiB)": 21.48, "step": 19313, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.627424227658123, "grad_norm": 0.3336644172668457, "learning_rate": 3.3388121031237063e-06, "loss": 0.016580678522586823, "memory(GiB)": 21.48, "step": 19314, "token_acc": 0.996, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.6274567131208784, "grad_norm": 0.49538281559944153, "learning_rate": 3.338305472354881e-06, "loss": 0.012916898354887962, "memory(GiB)": 21.48, "step": 19315, "token_acc": 1.0, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.6274891985836338, "grad_norm": 0.3606204688549042, "learning_rate": 3.3377988607642255e-06, "loss": 0.021960198879241943, "memory(GiB)": 21.48, "step": 19316, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.6275216840463892, "grad_norm": 0.3146428167819977, "learning_rate": 3.3372922683575797e-06, "loss": 0.015027385205030441, "memory(GiB)": 21.48, "step": 19317, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.6275541695091447, "grad_norm": 0.40367257595062256, "learning_rate": 3.3367856951407975e-06, "loss": 0.015802528709173203, "memory(GiB)": 21.48, "step": 19318, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.6275866549719, "grad_norm": 0.40255436301231384, "learning_rate": 3.3362791411197185e-06, "loss": 0.01902218908071518, "memory(GiB)": 21.48, "step": 19319, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.6276191404346555, "grad_norm": 0.33443596959114075, "learning_rate": 3.335772606300196e-06, "loss": 0.016401279717683792, "memory(GiB)": 21.48, "step": 19320, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.6276516258974109, "grad_norm": 0.3251338005065918, "learning_rate": 3.335266090688071e-06, "loss": 0.0213964581489563, "memory(GiB)": 21.48, "step": 19321, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.6276841113601663, "grad_norm": 0.28239595890045166, "learning_rate": 3.3347595942891897e-06, "loss": 0.017549384385347366, "memory(GiB)": 21.48, "step": 19322, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.6277165968229217, "grad_norm": 0.4532777667045593, "learning_rate": 3.3342531171094024e-06, "loss": 0.013706284575164318, "memory(GiB)": 21.48, "step": 19323, "token_acc": 1.0, "train_speed(iter/s)": 0.956997 }, { "epoch": 0.6277490822856772, "grad_norm": 0.31968894600868225, "learning_rate": 3.3337466591545474e-06, "loss": 0.013809707015752792, "memory(GiB)": 21.48, "step": 19324, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.6277815677484325, "grad_norm": 0.25090524554252625, "learning_rate": 3.333240220430478e-06, "loss": 0.014198919758200645, "memory(GiB)": 21.48, "step": 19325, "token_acc": 1.0, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.627814053211188, "grad_norm": 0.352858304977417, "learning_rate": 3.332733800943032e-06, "loss": 0.02040661871433258, "memory(GiB)": 21.48, "step": 19326, "token_acc": 1.0, "train_speed(iter/s)": 0.957029 }, { "epoch": 0.6278465386739434, "grad_norm": 0.3466394543647766, "learning_rate": 3.332227400698058e-06, "loss": 0.014934983104467392, "memory(GiB)": 21.48, "step": 19327, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.957039 }, { "epoch": 0.6278790241366988, "grad_norm": 0.3439391553401947, "learning_rate": 3.3317210197013995e-06, "loss": 0.013042844831943512, "memory(GiB)": 21.48, "step": 19328, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957049 }, { "epoch": 0.6279115095994543, "grad_norm": 0.407844215631485, "learning_rate": 3.331214657958901e-06, "loss": 0.02556789293885231, "memory(GiB)": 21.48, "step": 19329, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957058 }, { "epoch": 0.6279439950622097, "grad_norm": 0.32846498489379883, "learning_rate": 3.3307083154764064e-06, "loss": 0.01891472563147545, "memory(GiB)": 21.48, "step": 19330, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957067 }, { "epoch": 0.6279764805249651, "grad_norm": 0.32563096284866333, "learning_rate": 3.3302019922597595e-06, "loss": 0.012738282792270184, "memory(GiB)": 21.48, "step": 19331, "token_acc": 1.0, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.6280089659877205, "grad_norm": 0.4258756637573242, "learning_rate": 3.3296956883148037e-06, "loss": 0.021443091332912445, "memory(GiB)": 21.48, "step": 19332, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.628041451450476, "grad_norm": 0.4852651357650757, "learning_rate": 3.329189403647384e-06, "loss": 0.02021881565451622, "memory(GiB)": 21.48, "step": 19333, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.6280739369132313, "grad_norm": 0.3765459358692169, "learning_rate": 3.3286831382633413e-06, "loss": 0.016642451286315918, "memory(GiB)": 21.48, "step": 19334, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.6281064223759868, "grad_norm": 0.396698534488678, "learning_rate": 3.328176892168521e-06, "loss": 0.01735564135015011, "memory(GiB)": 21.48, "step": 19335, "token_acc": 1.0, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.6281389078387422, "grad_norm": 0.4888014495372772, "learning_rate": 3.327670665368763e-06, "loss": 0.01993633806705475, "memory(GiB)": 21.48, "step": 19336, "token_acc": 0.9813953488372092, "train_speed(iter/s)": 0.95712 }, { "epoch": 0.6281713933014976, "grad_norm": 0.43578165769577026, "learning_rate": 3.3271644578699115e-06, "loss": 0.024365339428186417, "memory(GiB)": 21.48, "step": 19337, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.628203878764253, "grad_norm": 0.3337503969669342, "learning_rate": 3.3266582696778093e-06, "loss": 0.015023890882730484, "memory(GiB)": 21.48, "step": 19338, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.6282363642270085, "grad_norm": 0.30381274223327637, "learning_rate": 3.326152100798298e-06, "loss": 0.013954748399555683, "memory(GiB)": 21.48, "step": 19339, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.6282688496897638, "grad_norm": 0.34735405445098877, "learning_rate": 3.3256459512372195e-06, "loss": 0.02343362383544445, "memory(GiB)": 21.48, "step": 19340, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.6283013351525193, "grad_norm": 0.3693333566188812, "learning_rate": 3.3251398210004147e-06, "loss": 0.018347978591918945, "memory(GiB)": 21.48, "step": 19341, "token_acc": 0.996, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.6283338206152747, "grad_norm": 0.2323547899723053, "learning_rate": 3.3246337100937263e-06, "loss": 0.011784590780735016, "memory(GiB)": 21.48, "step": 19342, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95717 }, { "epoch": 0.6283663060780301, "grad_norm": 0.33245643973350525, "learning_rate": 3.324127618522994e-06, "loss": 0.015895014628767967, "memory(GiB)": 21.48, "step": 19343, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.6283987915407855, "grad_norm": 0.44068989157676697, "learning_rate": 3.3236215462940613e-06, "loss": 0.017976926639676094, "memory(GiB)": 21.48, "step": 19344, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.628431277003541, "grad_norm": 0.2856706976890564, "learning_rate": 3.323115493412764e-06, "loss": 0.011434989050030708, "memory(GiB)": 21.48, "step": 19345, "token_acc": 1.0, "train_speed(iter/s)": 0.957196 }, { "epoch": 0.6284637624662963, "grad_norm": 0.2676917016506195, "learning_rate": 3.3226094598849494e-06, "loss": 0.014340969733893871, "memory(GiB)": 21.48, "step": 19346, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957206 }, { "epoch": 0.6284962479290518, "grad_norm": 0.36503612995147705, "learning_rate": 3.3221034457164514e-06, "loss": 0.0209454707801342, "memory(GiB)": 21.48, "step": 19347, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.6285287333918071, "grad_norm": 0.3336755633354187, "learning_rate": 3.3215974509131154e-06, "loss": 0.01596573367714882, "memory(GiB)": 21.48, "step": 19348, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957221 }, { "epoch": 0.6285612188545626, "grad_norm": 0.2958220839500427, "learning_rate": 3.3210914754807767e-06, "loss": 0.015446377918124199, "memory(GiB)": 21.48, "step": 19349, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.628593704317318, "grad_norm": 0.36064228415489197, "learning_rate": 3.3205855194252784e-06, "loss": 0.020384356379508972, "memory(GiB)": 21.48, "step": 19350, "token_acc": 1.0, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.6286261897800735, "grad_norm": 0.4648190438747406, "learning_rate": 3.3200795827524575e-06, "loss": 0.02542959153652191, "memory(GiB)": 21.48, "step": 19351, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.6286586752428288, "grad_norm": 0.3977019190788269, "learning_rate": 3.3195736654681526e-06, "loss": 0.013798053376376629, "memory(GiB)": 21.48, "step": 19352, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.6286911607055843, "grad_norm": 0.289559006690979, "learning_rate": 3.3190677675782078e-06, "loss": 0.014936232939362526, "memory(GiB)": 21.48, "step": 19353, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.6287236461683396, "grad_norm": 0.38965141773223877, "learning_rate": 3.3185618890884552e-06, "loss": 0.020469531416893005, "memory(GiB)": 21.48, "step": 19354, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.6287561316310951, "grad_norm": 0.4123070240020752, "learning_rate": 3.318056030004739e-06, "loss": 0.016339313238859177, "memory(GiB)": 21.48, "step": 19355, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.6287886170938505, "grad_norm": 0.8464921712875366, "learning_rate": 3.3175501903328934e-06, "loss": 0.013676781207323074, "memory(GiB)": 21.48, "step": 19356, "token_acc": 1.0, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.628821102556606, "grad_norm": 0.3091243803501129, "learning_rate": 3.3170443700787584e-06, "loss": 0.018523171544075012, "memory(GiB)": 21.48, "step": 19357, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957289 }, { "epoch": 0.6288535880193613, "grad_norm": 0.2964591979980469, "learning_rate": 3.316538569248171e-06, "loss": 0.01837698183953762, "memory(GiB)": 21.48, "step": 19358, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.957296 }, { "epoch": 0.6288860734821168, "grad_norm": 0.4552536606788635, "learning_rate": 3.31603278784697e-06, "loss": 0.023984305560588837, "memory(GiB)": 21.48, "step": 19359, "token_acc": 0.984, "train_speed(iter/s)": 0.957304 }, { "epoch": 0.6289185589448721, "grad_norm": 0.45146286487579346, "learning_rate": 3.315527025880991e-06, "loss": 0.02114325761795044, "memory(GiB)": 21.48, "step": 19360, "token_acc": 1.0, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.6289510444076276, "grad_norm": 0.2867717146873474, "learning_rate": 3.3150212833560726e-06, "loss": 0.015567095950245857, "memory(GiB)": 21.48, "step": 19361, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.628983529870383, "grad_norm": 0.300670862197876, "learning_rate": 3.314515560278051e-06, "loss": 0.014075109735131264, "memory(GiB)": 21.48, "step": 19362, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.6290160153331384, "grad_norm": 0.4462341070175171, "learning_rate": 3.3140098566527635e-06, "loss": 0.01608891598880291, "memory(GiB)": 21.48, "step": 19363, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.957336 }, { "epoch": 0.6290485007958938, "grad_norm": 0.4704740047454834, "learning_rate": 3.313504172486046e-06, "loss": 0.027148988097906113, "memory(GiB)": 21.48, "step": 19364, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.6290809862586493, "grad_norm": 0.36691930890083313, "learning_rate": 3.3129985077837355e-06, "loss": 0.022949080914258957, "memory(GiB)": 21.48, "step": 19365, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.957352 }, { "epoch": 0.6291134717214046, "grad_norm": 0.4915093779563904, "learning_rate": 3.3124928625516673e-06, "loss": 0.020245496183633804, "memory(GiB)": 21.48, "step": 19366, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.6291459571841601, "grad_norm": 0.34644952416419983, "learning_rate": 3.3119872367956774e-06, "loss": 0.017101259902119637, "memory(GiB)": 21.48, "step": 19367, "token_acc": 0.996, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.6291784426469155, "grad_norm": 0.3452398478984833, "learning_rate": 3.311481630521602e-06, "loss": 0.018347905948758125, "memory(GiB)": 21.48, "step": 19368, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.6292109281096709, "grad_norm": 0.623030424118042, "learning_rate": 3.310976043735275e-06, "loss": 0.01461072824895382, "memory(GiB)": 21.48, "step": 19369, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.6292434135724263, "grad_norm": 0.3618251085281372, "learning_rate": 3.310470476442534e-06, "loss": 0.012426892295479774, "memory(GiB)": 21.48, "step": 19370, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.6292758990351818, "grad_norm": 0.3356562852859497, "learning_rate": 3.309964928649211e-06, "loss": 0.010797997936606407, "memory(GiB)": 21.48, "step": 19371, "token_acc": 1.0, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.6293083844979371, "grad_norm": 0.40506166219711304, "learning_rate": 3.3094594003611434e-06, "loss": 0.021100524812936783, "memory(GiB)": 21.48, "step": 19372, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.6293408699606926, "grad_norm": 0.2629495859146118, "learning_rate": 3.3089538915841636e-06, "loss": 0.014840088784694672, "memory(GiB)": 21.48, "step": 19373, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.629373355423448, "grad_norm": 0.40972819924354553, "learning_rate": 3.3084484023241087e-06, "loss": 0.016148561611771584, "memory(GiB)": 21.48, "step": 19374, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.6294058408862034, "grad_norm": 0.3188055455684662, "learning_rate": 3.3079429325868075e-06, "loss": 0.01632082834839821, "memory(GiB)": 21.48, "step": 19375, "token_acc": 1.0, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.6294383263489588, "grad_norm": 0.28823283314704895, "learning_rate": 3.3074374823780995e-06, "loss": 0.012077790684998035, "memory(GiB)": 21.48, "step": 19376, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.6294708118117143, "grad_norm": 0.2556329071521759, "learning_rate": 3.306932051703814e-06, "loss": 0.012207274325191975, "memory(GiB)": 21.48, "step": 19377, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.6295032972744696, "grad_norm": 0.5519229769706726, "learning_rate": 3.3064266405697875e-06, "loss": 0.022333746775984764, "memory(GiB)": 21.48, "step": 19378, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957485 }, { "epoch": 0.6295357827372251, "grad_norm": 0.33240699768066406, "learning_rate": 3.305921248981851e-06, "loss": 0.018165607005357742, "memory(GiB)": 21.48, "step": 19379, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.6295682681999805, "grad_norm": 0.274262934923172, "learning_rate": 3.305415876945839e-06, "loss": 0.01271408423781395, "memory(GiB)": 21.48, "step": 19380, "token_acc": 1.0, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.6296007536627359, "grad_norm": 0.3222455382347107, "learning_rate": 3.3049105244675816e-06, "loss": 0.01767788827419281, "memory(GiB)": 21.48, "step": 19381, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.6296332391254913, "grad_norm": 0.2788352072238922, "learning_rate": 3.3044051915529143e-06, "loss": 0.0168171264231205, "memory(GiB)": 21.48, "step": 19382, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957527 }, { "epoch": 0.6296657245882468, "grad_norm": 0.4354971647262573, "learning_rate": 3.303899878207667e-06, "loss": 0.018730804324150085, "memory(GiB)": 21.48, "step": 19383, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.957537 }, { "epoch": 0.6296982100510021, "grad_norm": 0.189187154173851, "learning_rate": 3.3033945844376723e-06, "loss": 0.011686671525239944, "memory(GiB)": 21.48, "step": 19384, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.6297306955137576, "grad_norm": 0.4674932360649109, "learning_rate": 3.3028893102487636e-06, "loss": 0.020803477615118027, "memory(GiB)": 21.48, "step": 19385, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.629763180976513, "grad_norm": 0.42369839549064636, "learning_rate": 3.3023840556467696e-06, "loss": 0.015952853485941887, "memory(GiB)": 21.48, "step": 19386, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957569 }, { "epoch": 0.6297956664392684, "grad_norm": 0.32420313358306885, "learning_rate": 3.301878820637524e-06, "loss": 0.022015012800693512, "memory(GiB)": 21.48, "step": 19387, "token_acc": 1.0, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.6298281519020238, "grad_norm": 0.4419931173324585, "learning_rate": 3.3013736052268563e-06, "loss": 0.022989939898252487, "memory(GiB)": 21.48, "step": 19388, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.6298606373647793, "grad_norm": 0.36149072647094727, "learning_rate": 3.3008684094206002e-06, "loss": 0.015040500089526176, "memory(GiB)": 21.48, "step": 19389, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.6298931228275346, "grad_norm": 0.2123737782239914, "learning_rate": 3.3003632332245816e-06, "loss": 0.00842125155031681, "memory(GiB)": 21.48, "step": 19390, "token_acc": 1.0, "train_speed(iter/s)": 0.957612 }, { "epoch": 0.6299256082902901, "grad_norm": 0.4541833698749542, "learning_rate": 3.299858076644635e-06, "loss": 0.024687346071004868, "memory(GiB)": 21.48, "step": 19391, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.6299580937530456, "grad_norm": 0.32040512561798096, "learning_rate": 3.2993529396865886e-06, "loss": 0.011093540117144585, "memory(GiB)": 21.48, "step": 19392, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.6299905792158009, "grad_norm": 0.425802081823349, "learning_rate": 3.298847822356273e-06, "loss": 0.015656698495149612, "memory(GiB)": 21.48, "step": 19393, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957641 }, { "epoch": 0.6300230646785564, "grad_norm": 0.4704173505306244, "learning_rate": 3.2983427246595174e-06, "loss": 0.0231047160923481, "memory(GiB)": 21.48, "step": 19394, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.6300555501413118, "grad_norm": 0.37053316831588745, "learning_rate": 3.2978376466021523e-06, "loss": 0.021840453147888184, "memory(GiB)": 21.48, "step": 19395, "token_acc": 1.0, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.6300880356040672, "grad_norm": 0.3503018021583557, "learning_rate": 3.2973325881900054e-06, "loss": 0.01403089240193367, "memory(GiB)": 21.48, "step": 19396, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.6301205210668226, "grad_norm": 0.34522172808647156, "learning_rate": 3.2968275494289093e-06, "loss": 0.01802867464721203, "memory(GiB)": 21.48, "step": 19397, "token_acc": 1.0, "train_speed(iter/s)": 0.957675 }, { "epoch": 0.6301530065295781, "grad_norm": 0.2562965452671051, "learning_rate": 3.296322530324686e-06, "loss": 0.013320960104465485, "memory(GiB)": 21.48, "step": 19398, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.6301854919923334, "grad_norm": 0.47783589363098145, "learning_rate": 3.2958175308831696e-06, "loss": 0.018002241849899292, "memory(GiB)": 21.48, "step": 19399, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.6302179774550889, "grad_norm": 0.3259393572807312, "learning_rate": 3.295312551110189e-06, "loss": 0.014219182543456554, "memory(GiB)": 21.48, "step": 19400, "token_acc": 1.0, "train_speed(iter/s)": 0.957701 }, { "epoch": 0.6302504629178443, "grad_norm": 0.41364577412605286, "learning_rate": 3.294807591011569e-06, "loss": 0.01367949228733778, "memory(GiB)": 21.48, "step": 19401, "token_acc": 1.0, "train_speed(iter/s)": 0.95771 }, { "epoch": 0.6302829483805997, "grad_norm": 0.3486126661300659, "learning_rate": 3.2943026505931408e-06, "loss": 0.019251424819231033, "memory(GiB)": 21.48, "step": 19402, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.6303154338433551, "grad_norm": 0.3807089626789093, "learning_rate": 3.2937977298607276e-06, "loss": 0.016340745612978935, "memory(GiB)": 21.48, "step": 19403, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.6303479193061106, "grad_norm": 0.4002757966518402, "learning_rate": 3.2932928288201626e-06, "loss": 0.012919281609356403, "memory(GiB)": 21.48, "step": 19404, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.6303804047688659, "grad_norm": 0.4689130485057831, "learning_rate": 3.2927879474772663e-06, "loss": 0.01878054253757, "memory(GiB)": 21.48, "step": 19405, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.6304128902316214, "grad_norm": 0.4074993133544922, "learning_rate": 3.292283085837873e-06, "loss": 0.016375456005334854, "memory(GiB)": 21.48, "step": 19406, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957751 }, { "epoch": 0.6304453756943768, "grad_norm": 0.48487553000450134, "learning_rate": 3.2917782439078037e-06, "loss": 0.019565463066101074, "memory(GiB)": 21.48, "step": 19407, "token_acc": 0.992, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.6304778611571322, "grad_norm": 0.4096079170703888, "learning_rate": 3.2912734216928884e-06, "loss": 0.017744164913892746, "memory(GiB)": 21.48, "step": 19408, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.6305103466198876, "grad_norm": 0.3930855393409729, "learning_rate": 3.2907686191989508e-06, "loss": 0.01451796106994152, "memory(GiB)": 21.48, "step": 19409, "token_acc": 1.0, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.6305428320826431, "grad_norm": 0.4021383821964264, "learning_rate": 3.2902638364318197e-06, "loss": 0.01418856717646122, "memory(GiB)": 21.48, "step": 19410, "token_acc": 1.0, "train_speed(iter/s)": 0.957782 }, { "epoch": 0.6305753175453984, "grad_norm": 0.4999736547470093, "learning_rate": 3.2897590733973177e-06, "loss": 0.021310053765773773, "memory(GiB)": 21.48, "step": 19411, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.6306078030081539, "grad_norm": 0.41339653730392456, "learning_rate": 3.2892543301012744e-06, "loss": 0.018931608647108078, "memory(GiB)": 21.48, "step": 19412, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.6306402884709092, "grad_norm": 0.8874545097351074, "learning_rate": 3.288749606549511e-06, "loss": 0.02163798362016678, "memory(GiB)": 21.48, "step": 19413, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.6306727739336647, "grad_norm": 0.41166219115257263, "learning_rate": 3.2882449027478557e-06, "loss": 0.019309256225824356, "memory(GiB)": 21.48, "step": 19414, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.6307052593964201, "grad_norm": 0.3883289694786072, "learning_rate": 3.2877402187021334e-06, "loss": 0.0219790730625391, "memory(GiB)": 21.48, "step": 19415, "token_acc": 1.0, "train_speed(iter/s)": 0.957815 }, { "epoch": 0.6307377448591756, "grad_norm": 0.2230672985315323, "learning_rate": 3.2872355544181667e-06, "loss": 0.012758441269397736, "memory(GiB)": 21.48, "step": 19416, "token_acc": 1.0, "train_speed(iter/s)": 0.957823 }, { "epoch": 0.6307702303219309, "grad_norm": 0.26027923822402954, "learning_rate": 3.2867309099017826e-06, "loss": 0.008849920704960823, "memory(GiB)": 21.48, "step": 19417, "token_acc": 1.0, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.6308027157846864, "grad_norm": 0.3684064745903015, "learning_rate": 3.2862262851588034e-06, "loss": 0.015033123083412647, "memory(GiB)": 21.48, "step": 19418, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.6308352012474417, "grad_norm": 0.31490108370780945, "learning_rate": 3.2857216801950543e-06, "loss": 0.01884324848651886, "memory(GiB)": 21.48, "step": 19419, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.6308676867101972, "grad_norm": 0.4577102065086365, "learning_rate": 3.2852170950163577e-06, "loss": 0.02473646029829979, "memory(GiB)": 21.48, "step": 19420, "token_acc": 1.0, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.6309001721729526, "grad_norm": 0.2718101441860199, "learning_rate": 3.2847125296285396e-06, "loss": 0.01637493446469307, "memory(GiB)": 21.48, "step": 19421, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957861 }, { "epoch": 0.630932657635708, "grad_norm": 0.43098771572113037, "learning_rate": 3.2842079840374206e-06, "loss": 0.014395153149962425, "memory(GiB)": 21.48, "step": 19422, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.6309651430984634, "grad_norm": 0.35259509086608887, "learning_rate": 3.283703458248827e-06, "loss": 0.01771710067987442, "memory(GiB)": 21.48, "step": 19423, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.6309976285612189, "grad_norm": 0.408014178276062, "learning_rate": 3.2831989522685782e-06, "loss": 0.01900579035282135, "memory(GiB)": 21.48, "step": 19424, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957883 }, { "epoch": 0.6310301140239742, "grad_norm": 0.4436539113521576, "learning_rate": 3.2826944661025007e-06, "loss": 0.017164509743452072, "memory(GiB)": 21.48, "step": 19425, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957891 }, { "epoch": 0.6310625994867297, "grad_norm": 0.32903963327407837, "learning_rate": 3.2821899997564118e-06, "loss": 0.02128206193447113, "memory(GiB)": 21.48, "step": 19426, "token_acc": 0.995, "train_speed(iter/s)": 0.957899 }, { "epoch": 0.6310950849494851, "grad_norm": 0.25598496198654175, "learning_rate": 3.281685553236139e-06, "loss": 0.013459820300340652, "memory(GiB)": 21.48, "step": 19427, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.6311275704122405, "grad_norm": 0.8551139235496521, "learning_rate": 3.2811811265475e-06, "loss": 0.02184521034359932, "memory(GiB)": 21.48, "step": 19428, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.6311600558749959, "grad_norm": 0.3566961884498596, "learning_rate": 3.2806767196963197e-06, "loss": 0.017640072852373123, "memory(GiB)": 21.48, "step": 19429, "token_acc": 1.0, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.6311925413377514, "grad_norm": 0.3995538055896759, "learning_rate": 3.280172332688419e-06, "loss": 0.017796354368329048, "memory(GiB)": 21.48, "step": 19430, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.6312250268005067, "grad_norm": 0.28462761640548706, "learning_rate": 3.279667965529617e-06, "loss": 0.011609663255512714, "memory(GiB)": 21.48, "step": 19431, "token_acc": 1.0, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.6312575122632622, "grad_norm": 0.4740790128707886, "learning_rate": 3.279163618225739e-06, "loss": 0.023369602859020233, "memory(GiB)": 21.48, "step": 19432, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957959 }, { "epoch": 0.6312899977260176, "grad_norm": 0.4531625211238861, "learning_rate": 3.2786592907826004e-06, "loss": 0.018365737050771713, "memory(GiB)": 21.48, "step": 19433, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.631322483188773, "grad_norm": 0.4009227752685547, "learning_rate": 3.2781549832060277e-06, "loss": 0.016968926414847374, "memory(GiB)": 21.48, "step": 19434, "token_acc": 1.0, "train_speed(iter/s)": 0.957979 }, { "epoch": 0.6313549686515284, "grad_norm": 2.7140052318573, "learning_rate": 3.277650695501836e-06, "loss": 0.01599392294883728, "memory(GiB)": 21.48, "step": 19435, "token_acc": 1.0, "train_speed(iter/s)": 0.95799 }, { "epoch": 0.6313874541142839, "grad_norm": 0.3332362473011017, "learning_rate": 3.277146427675849e-06, "loss": 0.015255233272910118, "memory(GiB)": 21.48, "step": 19436, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.958 }, { "epoch": 0.6314199395770392, "grad_norm": 0.40688109397888184, "learning_rate": 3.2766421797338845e-06, "loss": 0.014778923243284225, "memory(GiB)": 21.48, "step": 19437, "token_acc": 0.992, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.6314524250397947, "grad_norm": 0.2412552386522293, "learning_rate": 3.2761379516817637e-06, "loss": 0.008118970319628716, "memory(GiB)": 21.48, "step": 19438, "token_acc": 1.0, "train_speed(iter/s)": 0.95802 }, { "epoch": 0.6314849105025501, "grad_norm": 0.4570654034614563, "learning_rate": 3.2756337435253046e-06, "loss": 0.019498765468597412, "memory(GiB)": 21.48, "step": 19439, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.958031 }, { "epoch": 0.6315173959653055, "grad_norm": 0.31815066933631897, "learning_rate": 3.2751295552703275e-06, "loss": 0.024547548964619637, "memory(GiB)": 21.48, "step": 19440, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.6315498814280609, "grad_norm": 0.4246569871902466, "learning_rate": 3.274625386922651e-06, "loss": 0.01964995265007019, "memory(GiB)": 21.48, "step": 19441, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.6315823668908164, "grad_norm": 0.31009772419929504, "learning_rate": 3.274121238488095e-06, "loss": 0.013984501361846924, "memory(GiB)": 21.48, "step": 19442, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.958061 }, { "epoch": 0.6316148523535717, "grad_norm": 0.32239434123039246, "learning_rate": 3.2736171099724756e-06, "loss": 0.0155660230666399, "memory(GiB)": 21.48, "step": 19443, "token_acc": 1.0, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.6316473378163272, "grad_norm": 0.5665917992591858, "learning_rate": 3.273113001381614e-06, "loss": 0.01957590878009796, "memory(GiB)": 21.48, "step": 19444, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.6316798232790826, "grad_norm": 0.45151734352111816, "learning_rate": 3.272608912721325e-06, "loss": 0.01740153320133686, "memory(GiB)": 21.48, "step": 19445, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.631712308741838, "grad_norm": 0.3857385516166687, "learning_rate": 3.272104843997429e-06, "loss": 0.00968865305185318, "memory(GiB)": 21.48, "step": 19446, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.6317447942045934, "grad_norm": 0.803504228591919, "learning_rate": 3.2716007952157437e-06, "loss": 0.015561744570732117, "memory(GiB)": 21.48, "step": 19447, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.6317772796673489, "grad_norm": 0.3702898323535919, "learning_rate": 3.2710967663820845e-06, "loss": 0.016374582424759865, "memory(GiB)": 21.48, "step": 19448, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.6318097651301042, "grad_norm": 0.40136611461639404, "learning_rate": 3.270592757502271e-06, "loss": 0.020015008747577667, "memory(GiB)": 21.48, "step": 19449, "token_acc": 1.0, "train_speed(iter/s)": 0.958132 }, { "epoch": 0.6318422505928597, "grad_norm": 0.38406670093536377, "learning_rate": 3.270088768582118e-06, "loss": 0.02435903251171112, "memory(GiB)": 21.48, "step": 19450, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.958142 }, { "epoch": 0.6318747360556151, "grad_norm": 0.39978715777397156, "learning_rate": 3.2695847996274443e-06, "loss": 0.019117727875709534, "memory(GiB)": 21.48, "step": 19451, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.958153 }, { "epoch": 0.6319072215183705, "grad_norm": 0.33950045704841614, "learning_rate": 3.269080850644064e-06, "loss": 0.014738968573510647, "memory(GiB)": 21.48, "step": 19452, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.6319397069811259, "grad_norm": 0.24391549825668335, "learning_rate": 3.268576921637797e-06, "loss": 0.008713613264262676, "memory(GiB)": 21.48, "step": 19453, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.6319721924438814, "grad_norm": 0.4498447775840759, "learning_rate": 3.2680730126144534e-06, "loss": 0.022029943764209747, "memory(GiB)": 21.48, "step": 19454, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.6320046779066367, "grad_norm": 0.48134294152259827, "learning_rate": 3.2675691235798552e-06, "loss": 0.017140919342637062, "memory(GiB)": 21.48, "step": 19455, "token_acc": 1.0, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.6320371633693922, "grad_norm": 0.33750492334365845, "learning_rate": 3.2670652545398124e-06, "loss": 0.016139114275574684, "memory(GiB)": 21.48, "step": 19456, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958204 }, { "epoch": 0.6320696488321477, "grad_norm": 0.33252108097076416, "learning_rate": 3.266561405500146e-06, "loss": 0.01745053008198738, "memory(GiB)": 21.48, "step": 19457, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958213 }, { "epoch": 0.632102134294903, "grad_norm": 0.27512747049331665, "learning_rate": 3.2660575764666662e-06, "loss": 0.0143839530646801, "memory(GiB)": 21.48, "step": 19458, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.6321346197576585, "grad_norm": 0.40926456451416016, "learning_rate": 3.265553767445191e-06, "loss": 0.022482287138700485, "memory(GiB)": 21.48, "step": 19459, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.6321671052204139, "grad_norm": 0.3655448257923126, "learning_rate": 3.2650499784415323e-06, "loss": 0.01633654348552227, "memory(GiB)": 21.48, "step": 19460, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958235 }, { "epoch": 0.6321995906831693, "grad_norm": 0.5407334566116333, "learning_rate": 3.2645462094615055e-06, "loss": 0.02255016379058361, "memory(GiB)": 21.48, "step": 19461, "token_acc": 0.9896551724137931, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.6322320761459247, "grad_norm": 0.36849892139434814, "learning_rate": 3.264042460510929e-06, "loss": 0.012903804890811443, "memory(GiB)": 21.48, "step": 19462, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.95825 }, { "epoch": 0.6322645616086802, "grad_norm": 0.47700849175453186, "learning_rate": 3.26353873159561e-06, "loss": 0.025919057428836823, "memory(GiB)": 21.48, "step": 19463, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958257 }, { "epoch": 0.6322970470714355, "grad_norm": 0.3831954896450043, "learning_rate": 3.263035022721367e-06, "loss": 0.01599973998963833, "memory(GiB)": 21.48, "step": 19464, "token_acc": 1.0, "train_speed(iter/s)": 0.958265 }, { "epoch": 0.632329532534191, "grad_norm": 0.4688126742839813, "learning_rate": 3.26253133389401e-06, "loss": 0.01914236880838871, "memory(GiB)": 21.48, "step": 19465, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.6323620179969464, "grad_norm": 0.4178242087364197, "learning_rate": 3.2620276651193556e-06, "loss": 0.015052756294608116, "memory(GiB)": 21.48, "step": 19466, "token_acc": 1.0, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.6323945034597018, "grad_norm": 0.317392498254776, "learning_rate": 3.2615240164032134e-06, "loss": 0.012799128890037537, "memory(GiB)": 21.48, "step": 19467, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.6324269889224572, "grad_norm": 0.46917322278022766, "learning_rate": 3.2610203877514e-06, "loss": 0.020751796662807465, "memory(GiB)": 21.48, "step": 19468, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958295 }, { "epoch": 0.6324594743852127, "grad_norm": 0.27702751755714417, "learning_rate": 3.2605167791697246e-06, "loss": 0.01798049919307232, "memory(GiB)": 21.48, "step": 19469, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.632491959847968, "grad_norm": 0.43200820684432983, "learning_rate": 3.2600131906640013e-06, "loss": 0.01704729162156582, "memory(GiB)": 21.48, "step": 19470, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.95831 }, { "epoch": 0.6325244453107235, "grad_norm": 0.2155260443687439, "learning_rate": 3.2595096222400414e-06, "loss": 0.011801982298493385, "memory(GiB)": 21.48, "step": 19471, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958317 }, { "epoch": 0.6325569307734789, "grad_norm": 0.4624413549900055, "learning_rate": 3.259006073903658e-06, "loss": 0.019715651869773865, "memory(GiB)": 21.48, "step": 19472, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.6325894162362343, "grad_norm": 0.25278592109680176, "learning_rate": 3.25850254566066e-06, "loss": 0.012489227578043938, "memory(GiB)": 21.48, "step": 19473, "token_acc": 1.0, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.6326219016989897, "grad_norm": 0.3215491473674774, "learning_rate": 3.257999037516862e-06, "loss": 0.014668799936771393, "memory(GiB)": 21.48, "step": 19474, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.958337 }, { "epoch": 0.6326543871617452, "grad_norm": 0.23858779668807983, "learning_rate": 3.2574955494780726e-06, "loss": 0.009534728713333607, "memory(GiB)": 21.48, "step": 19475, "token_acc": 1.0, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.6326868726245005, "grad_norm": 0.22420941293239594, "learning_rate": 3.2569920815501037e-06, "loss": 0.011565223336219788, "memory(GiB)": 21.48, "step": 19476, "token_acc": 1.0, "train_speed(iter/s)": 0.958349 }, { "epoch": 0.632719358087256, "grad_norm": 0.37749016284942627, "learning_rate": 3.2564886337387676e-06, "loss": 0.01573997363448143, "memory(GiB)": 21.48, "step": 19477, "token_acc": 1.0, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.6327518435500114, "grad_norm": 0.9925293326377869, "learning_rate": 3.255985206049872e-06, "loss": 0.017731139436364174, "memory(GiB)": 21.48, "step": 19478, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958364 }, { "epoch": 0.6327843290127668, "grad_norm": 0.4077300429344177, "learning_rate": 3.2554817984892295e-06, "loss": 0.011100970208644867, "memory(GiB)": 21.48, "step": 19479, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.958371 }, { "epoch": 0.6328168144755222, "grad_norm": 0.48862671852111816, "learning_rate": 3.254978411062648e-06, "loss": 0.02461252547800541, "memory(GiB)": 21.48, "step": 19480, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958378 }, { "epoch": 0.6328492999382777, "grad_norm": 0.4514828622341156, "learning_rate": 3.25447504377594e-06, "loss": 0.014019734226167202, "memory(GiB)": 21.48, "step": 19481, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.958386 }, { "epoch": 0.632881785401033, "grad_norm": 0.37702468037605286, "learning_rate": 3.25397169663491e-06, "loss": 0.020106781274080276, "memory(GiB)": 21.48, "step": 19482, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958394 }, { "epoch": 0.6329142708637885, "grad_norm": 0.1807369738817215, "learning_rate": 3.2534683696453745e-06, "loss": 0.007549292407929897, "memory(GiB)": 21.48, "step": 19483, "token_acc": 1.0, "train_speed(iter/s)": 0.9584 }, { "epoch": 0.6329467563265438, "grad_norm": 0.37919795513153076, "learning_rate": 3.252965062813135e-06, "loss": 0.01567859575152397, "memory(GiB)": 21.48, "step": 19484, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.958409 }, { "epoch": 0.6329792417892993, "grad_norm": 0.38401755690574646, "learning_rate": 3.252461776144007e-06, "loss": 0.019359339028596878, "memory(GiB)": 21.48, "step": 19485, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958417 }, { "epoch": 0.6330117272520547, "grad_norm": 0.5600930452346802, "learning_rate": 3.2519585096437943e-06, "loss": 0.035329803824424744, "memory(GiB)": 21.48, "step": 19486, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.958425 }, { "epoch": 0.6330442127148102, "grad_norm": 0.3591575026512146, "learning_rate": 3.2514552633183084e-06, "loss": 0.013366295024752617, "memory(GiB)": 21.48, "step": 19487, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.958433 }, { "epoch": 0.6330766981775655, "grad_norm": 0.455304354429245, "learning_rate": 3.2509520371733545e-06, "loss": 0.017037374898791313, "memory(GiB)": 21.48, "step": 19488, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958443 }, { "epoch": 0.633109183640321, "grad_norm": 0.22974736988544464, "learning_rate": 3.2504488312147432e-06, "loss": 0.012927491217851639, "memory(GiB)": 21.48, "step": 19489, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.958453 }, { "epoch": 0.6331416691030763, "grad_norm": 0.39805254340171814, "learning_rate": 3.2499456454482798e-06, "loss": 0.02224934659898281, "memory(GiB)": 21.48, "step": 19490, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.958462 }, { "epoch": 0.6331741545658318, "grad_norm": 0.36635762453079224, "learning_rate": 3.249442479879773e-06, "loss": 0.013640609569847584, "memory(GiB)": 21.48, "step": 19491, "token_acc": 1.0, "train_speed(iter/s)": 0.958472 }, { "epoch": 0.6332066400285872, "grad_norm": 0.19135257601737976, "learning_rate": 3.2489393345150313e-06, "loss": 0.011214550584554672, "memory(GiB)": 21.48, "step": 19492, "token_acc": 1.0, "train_speed(iter/s)": 0.958482 }, { "epoch": 0.6332391254913426, "grad_norm": 0.2904645502567291, "learning_rate": 3.2484362093598595e-06, "loss": 0.015453441999852657, "memory(GiB)": 21.48, "step": 19493, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958492 }, { "epoch": 0.633271610954098, "grad_norm": 0.35678985714912415, "learning_rate": 3.2479331044200657e-06, "loss": 0.021145161241292953, "memory(GiB)": 21.48, "step": 19494, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.958502 }, { "epoch": 0.6333040964168535, "grad_norm": 0.31080853939056396, "learning_rate": 3.2474300197014553e-06, "loss": 0.013863612897694111, "memory(GiB)": 21.48, "step": 19495, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958513 }, { "epoch": 0.6333365818796088, "grad_norm": 0.30533188581466675, "learning_rate": 3.2469269552098357e-06, "loss": 0.010426511988043785, "memory(GiB)": 21.48, "step": 19496, "token_acc": 1.0, "train_speed(iter/s)": 0.958524 }, { "epoch": 0.6333690673423643, "grad_norm": 0.47302547097206116, "learning_rate": 3.246423910951011e-06, "loss": 0.025105025619268417, "memory(GiB)": 21.48, "step": 19497, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.958534 }, { "epoch": 0.6334015528051197, "grad_norm": 0.36195313930511475, "learning_rate": 3.24592088693079e-06, "loss": 0.011974954977631569, "memory(GiB)": 21.48, "step": 19498, "token_acc": 1.0, "train_speed(iter/s)": 0.958545 }, { "epoch": 0.6334340382678751, "grad_norm": 0.3318219482898712, "learning_rate": 3.2454178831549753e-06, "loss": 0.01835595816373825, "memory(GiB)": 21.48, "step": 19499, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.958555 }, { "epoch": 0.6334665237306305, "grad_norm": 0.29714664816856384, "learning_rate": 3.2449148996293755e-06, "loss": 0.015807680785655975, "memory(GiB)": 21.48, "step": 19500, "token_acc": 1.0, "train_speed(iter/s)": 0.958565 }, { "epoch": 0.6334665237306305, "eval_loss": 0.016786303371191025, "eval_runtime": 80.5595, "eval_samples_per_second": 123.511, "eval_steps_per_second": 3.86, "eval_token_acc": 0.993168210297623, "step": 19500 }, { "epoch": 0.633499009193386, "grad_norm": 0.23382122814655304, "learning_rate": 3.244411936359792e-06, "loss": 0.010840527713298798, "memory(GiB)": 21.48, "step": 19501, "token_acc": 0.9929627601314348, "train_speed(iter/s)": 0.954317 }, { "epoch": 0.6335314946561413, "grad_norm": 0.32312244176864624, "learning_rate": 3.2439089933520327e-06, "loss": 0.01325384434312582, "memory(GiB)": 21.48, "step": 19502, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.954325 }, { "epoch": 0.6335639801188968, "grad_norm": 0.3386339843273163, "learning_rate": 3.2434060706119e-06, "loss": 0.013165615499019623, "memory(GiB)": 21.48, "step": 19503, "token_acc": 1.0, "train_speed(iter/s)": 0.954331 }, { "epoch": 0.6335964655816522, "grad_norm": 0.33880406618118286, "learning_rate": 3.2429031681452005e-06, "loss": 0.016654150560498238, "memory(GiB)": 21.48, "step": 19504, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954338 }, { "epoch": 0.6336289510444076, "grad_norm": 0.22282439470291138, "learning_rate": 3.2424002859577352e-06, "loss": 0.009096583351492882, "memory(GiB)": 21.48, "step": 19505, "token_acc": 1.0, "train_speed(iter/s)": 0.954345 }, { "epoch": 0.633661436507163, "grad_norm": 0.2728418707847595, "learning_rate": 3.24189742405531e-06, "loss": 0.015912074595689774, "memory(GiB)": 21.48, "step": 19506, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.954351 }, { "epoch": 0.6336939219699185, "grad_norm": 0.2338067889213562, "learning_rate": 3.2413945824437297e-06, "loss": 0.009193431586027145, "memory(GiB)": 21.48, "step": 19507, "token_acc": 0.9892857142857143, "train_speed(iter/s)": 0.954358 }, { "epoch": 0.6337264074326738, "grad_norm": 0.2843613624572754, "learning_rate": 3.240891761128796e-06, "loss": 0.02193712815642357, "memory(GiB)": 21.48, "step": 19508, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.954365 }, { "epoch": 0.6337588928954293, "grad_norm": 0.3046072721481323, "learning_rate": 3.240388960116315e-06, "loss": 0.011299552395939827, "memory(GiB)": 21.48, "step": 19509, "token_acc": 1.0, "train_speed(iter/s)": 0.954372 }, { "epoch": 0.6337913783581847, "grad_norm": 0.3403712511062622, "learning_rate": 3.239886179412084e-06, "loss": 0.015930237248539925, "memory(GiB)": 21.48, "step": 19510, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954378 }, { "epoch": 0.6338238638209401, "grad_norm": 0.7527773976325989, "learning_rate": 3.2393834190219127e-06, "loss": 0.02062508463859558, "memory(GiB)": 21.48, "step": 19511, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.954385 }, { "epoch": 0.6338563492836955, "grad_norm": 0.316339910030365, "learning_rate": 3.2388806789515962e-06, "loss": 0.011538809165358543, "memory(GiB)": 21.48, "step": 19512, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.954392 }, { "epoch": 0.633888834746451, "grad_norm": 0.32786551117897034, "learning_rate": 3.238377959206944e-06, "loss": 0.016821537166833878, "memory(GiB)": 21.48, "step": 19513, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954399 }, { "epoch": 0.6339213202092063, "grad_norm": 0.5043200254440308, "learning_rate": 3.2378752597937534e-06, "loss": 0.029543686658143997, "memory(GiB)": 21.48, "step": 19514, "token_acc": 1.0, "train_speed(iter/s)": 0.954405 }, { "epoch": 0.6339538056719618, "grad_norm": 0.38687682151794434, "learning_rate": 3.2373725807178284e-06, "loss": 0.015811050310730934, "memory(GiB)": 21.48, "step": 19515, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.954412 }, { "epoch": 0.6339862911347172, "grad_norm": 0.34613344073295593, "learning_rate": 3.2368699219849692e-06, "loss": 0.013000363484025002, "memory(GiB)": 21.48, "step": 19516, "token_acc": 1.0, "train_speed(iter/s)": 0.95442 }, { "epoch": 0.6340187765974726, "grad_norm": 0.33916449546813965, "learning_rate": 3.2363672836009784e-06, "loss": 0.018720461055636406, "memory(GiB)": 21.48, "step": 19517, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954426 }, { "epoch": 0.634051262060228, "grad_norm": 0.4044911861419678, "learning_rate": 3.235864665571655e-06, "loss": 0.01451699435710907, "memory(GiB)": 21.48, "step": 19518, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.954433 }, { "epoch": 0.6340837475229835, "grad_norm": 0.36138108372688293, "learning_rate": 3.2353620679028034e-06, "loss": 0.014412017539143562, "memory(GiB)": 21.48, "step": 19519, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95444 }, { "epoch": 0.6341162329857389, "grad_norm": 0.43466776609420776, "learning_rate": 3.234859490600221e-06, "loss": 0.030354835093021393, "memory(GiB)": 21.48, "step": 19520, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.954447 }, { "epoch": 0.6341487184484943, "grad_norm": 0.3515855371952057, "learning_rate": 3.2343569336697107e-06, "loss": 0.018346933647990227, "memory(GiB)": 21.48, "step": 19521, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954455 }, { "epoch": 0.6341812039112498, "grad_norm": 0.3946987986564636, "learning_rate": 3.23385439711707e-06, "loss": 0.01819712668657303, "memory(GiB)": 21.48, "step": 19522, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954463 }, { "epoch": 0.6342136893740051, "grad_norm": 0.23095808923244476, "learning_rate": 3.2333518809481003e-06, "loss": 0.011635297909379005, "memory(GiB)": 21.48, "step": 19523, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954471 }, { "epoch": 0.6342461748367606, "grad_norm": 0.29795125126838684, "learning_rate": 3.232849385168603e-06, "loss": 0.012118162587285042, "memory(GiB)": 21.48, "step": 19524, "token_acc": 1.0, "train_speed(iter/s)": 0.954478 }, { "epoch": 0.634278660299516, "grad_norm": 0.19309940934181213, "learning_rate": 3.232346909784374e-06, "loss": 0.009004905819892883, "memory(GiB)": 21.48, "step": 19525, "token_acc": 1.0, "train_speed(iter/s)": 0.954486 }, { "epoch": 0.6343111457622714, "grad_norm": 0.286134272813797, "learning_rate": 3.231844454801215e-06, "loss": 0.01624908298254013, "memory(GiB)": 21.48, "step": 19526, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.954495 }, { "epoch": 0.6343436312250268, "grad_norm": 0.30718275904655457, "learning_rate": 3.2313420202249248e-06, "loss": 0.017197273671627045, "memory(GiB)": 21.48, "step": 19527, "token_acc": 1.0, "train_speed(iter/s)": 0.954503 }, { "epoch": 0.6343761166877823, "grad_norm": 0.30922386050224304, "learning_rate": 3.230839606061302e-06, "loss": 0.012985734269022942, "memory(GiB)": 21.48, "step": 19528, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954511 }, { "epoch": 0.6344086021505376, "grad_norm": 0.3124333620071411, "learning_rate": 3.2303372123161437e-06, "loss": 0.01694313995540142, "memory(GiB)": 21.48, "step": 19529, "token_acc": 0.9783783783783784, "train_speed(iter/s)": 0.954519 }, { "epoch": 0.6344410876132931, "grad_norm": 0.4800291657447815, "learning_rate": 3.2298348389952505e-06, "loss": 0.01917482726275921, "memory(GiB)": 21.48, "step": 19530, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954527 }, { "epoch": 0.6344735730760485, "grad_norm": 0.3549325168132782, "learning_rate": 3.2293324861044185e-06, "loss": 0.014443648047745228, "memory(GiB)": 21.48, "step": 19531, "token_acc": 0.995, "train_speed(iter/s)": 0.954535 }, { "epoch": 0.6345060585388039, "grad_norm": 0.36644047498703003, "learning_rate": 3.228830153649448e-06, "loss": 0.022224776446819305, "memory(GiB)": 21.48, "step": 19532, "token_acc": 1.0, "train_speed(iter/s)": 0.954544 }, { "epoch": 0.6345385440015593, "grad_norm": 0.27058476209640503, "learning_rate": 3.228327841636133e-06, "loss": 0.01044420339167118, "memory(GiB)": 21.48, "step": 19533, "token_acc": 1.0, "train_speed(iter/s)": 0.954552 }, { "epoch": 0.6345710294643148, "grad_norm": 0.31789734959602356, "learning_rate": 3.227825550070275e-06, "loss": 0.018075816333293915, "memory(GiB)": 21.48, "step": 19534, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954561 }, { "epoch": 0.6346035149270701, "grad_norm": 0.36382704973220825, "learning_rate": 3.227323278957666e-06, "loss": 0.015298069454729557, "memory(GiB)": 21.48, "step": 19535, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.95457 }, { "epoch": 0.6346360003898256, "grad_norm": 0.3362162709236145, "learning_rate": 3.226821028304109e-06, "loss": 0.015450283885002136, "memory(GiB)": 21.48, "step": 19536, "token_acc": 0.9942196531791907, "train_speed(iter/s)": 0.95458 }, { "epoch": 0.634668485852581, "grad_norm": 0.3172815442085266, "learning_rate": 3.226318798115395e-06, "loss": 0.011702225543558598, "memory(GiB)": 21.48, "step": 19537, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.954591 }, { "epoch": 0.6347009713153364, "grad_norm": 0.25434550642967224, "learning_rate": 3.225816588397322e-06, "loss": 0.010855735279619694, "memory(GiB)": 21.48, "step": 19538, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954601 }, { "epoch": 0.6347334567780918, "grad_norm": 0.563232421875, "learning_rate": 3.2253143991556903e-06, "loss": 0.012355389073491096, "memory(GiB)": 21.48, "step": 19539, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.6347659422408473, "grad_norm": 0.3920884132385254, "learning_rate": 3.224812230396289e-06, "loss": 0.016176922246813774, "memory(GiB)": 21.48, "step": 19540, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954607 }, { "epoch": 0.6347984277036026, "grad_norm": 0.407642662525177, "learning_rate": 3.2243100821249206e-06, "loss": 0.017992306500673294, "memory(GiB)": 21.48, "step": 19541, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954618 }, { "epoch": 0.6348309131663581, "grad_norm": 0.31381237506866455, "learning_rate": 3.2238079543473756e-06, "loss": 0.011966625228524208, "memory(GiB)": 21.48, "step": 19542, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.954628 }, { "epoch": 0.6348633986291135, "grad_norm": 0.31490591168403625, "learning_rate": 3.223305847069451e-06, "loss": 0.013531005941331387, "memory(GiB)": 21.48, "step": 19543, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.954639 }, { "epoch": 0.6348958840918689, "grad_norm": 0.29634711146354675, "learning_rate": 3.2228037602969413e-06, "loss": 0.013461310416460037, "memory(GiB)": 21.48, "step": 19544, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.6349283695546243, "grad_norm": 0.3037765622138977, "learning_rate": 3.222301694035643e-06, "loss": 0.011585358530282974, "memory(GiB)": 21.48, "step": 19545, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954659 }, { "epoch": 0.6349608550173798, "grad_norm": 0.295172780752182, "learning_rate": 3.2217996482913478e-06, "loss": 0.010723483748733997, "memory(GiB)": 21.48, "step": 19546, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95467 }, { "epoch": 0.6349933404801351, "grad_norm": 0.3006218373775482, "learning_rate": 3.221297623069852e-06, "loss": 0.010317422449588776, "memory(GiB)": 21.48, "step": 19547, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95468 }, { "epoch": 0.6350258259428906, "grad_norm": 0.30399367213249207, "learning_rate": 3.2207956183769497e-06, "loss": 0.01425711065530777, "memory(GiB)": 21.48, "step": 19548, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95469 }, { "epoch": 0.635058311405646, "grad_norm": 0.46304118633270264, "learning_rate": 3.220293634218433e-06, "loss": 0.021076373755931854, "memory(GiB)": 21.48, "step": 19549, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.954701 }, { "epoch": 0.6350907968684014, "grad_norm": 0.44210413098335266, "learning_rate": 3.219791670600097e-06, "loss": 0.02150518074631691, "memory(GiB)": 21.48, "step": 19550, "token_acc": 0.9711934156378601, "train_speed(iter/s)": 0.954711 }, { "epoch": 0.6351232823311568, "grad_norm": 0.3497241139411926, "learning_rate": 3.2192897275277353e-06, "loss": 0.014492526650428772, "memory(GiB)": 21.48, "step": 19551, "token_acc": 1.0, "train_speed(iter/s)": 0.954721 }, { "epoch": 0.6351557677939123, "grad_norm": 0.42622992396354675, "learning_rate": 3.2187878050071393e-06, "loss": 0.014695334248244762, "memory(GiB)": 21.48, "step": 19552, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954731 }, { "epoch": 0.6351882532566676, "grad_norm": 0.48570284247398376, "learning_rate": 3.218285903044103e-06, "loss": 0.022244617342948914, "memory(GiB)": 21.48, "step": 19553, "token_acc": 0.9875, "train_speed(iter/s)": 0.954741 }, { "epoch": 0.6352207387194231, "grad_norm": 0.321394681930542, "learning_rate": 3.217784021644419e-06, "loss": 0.016802094876766205, "memory(GiB)": 21.48, "step": 19554, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.954752 }, { "epoch": 0.6352532241821784, "grad_norm": 0.4660426676273346, "learning_rate": 3.217282160813879e-06, "loss": 0.0199270062148571, "memory(GiB)": 21.48, "step": 19555, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.6352857096449339, "grad_norm": 0.4638020098209381, "learning_rate": 3.2167803205582765e-06, "loss": 0.015819517895579338, "memory(GiB)": 21.48, "step": 19556, "token_acc": 0.9834254143646409, "train_speed(iter/s)": 0.954772 }, { "epoch": 0.6353181951076893, "grad_norm": 0.36475178599357605, "learning_rate": 3.216278500883402e-06, "loss": 0.01617307960987091, "memory(GiB)": 21.48, "step": 19557, "token_acc": 1.0, "train_speed(iter/s)": 0.954782 }, { "epoch": 0.6353506805704447, "grad_norm": 0.3431476950645447, "learning_rate": 3.2157767017950483e-06, "loss": 0.010327770374715328, "memory(GiB)": 21.48, "step": 19558, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.954792 }, { "epoch": 0.6353831660332001, "grad_norm": 0.3501371741294861, "learning_rate": 3.2152749232990053e-06, "loss": 0.01901905983686447, "memory(GiB)": 21.48, "step": 19559, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954802 }, { "epoch": 0.6354156514959556, "grad_norm": 0.43516841530799866, "learning_rate": 3.2147731654010672e-06, "loss": 0.018715310841798782, "memory(GiB)": 21.48, "step": 19560, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.95481 }, { "epoch": 0.6354481369587109, "grad_norm": 0.3950105607509613, "learning_rate": 3.2142714281070196e-06, "loss": 0.0239399466663599, "memory(GiB)": 21.48, "step": 19561, "token_acc": 0.9932432432432432, "train_speed(iter/s)": 0.954818 }, { "epoch": 0.6354806224214664, "grad_norm": 0.37746649980545044, "learning_rate": 3.21376971142266e-06, "loss": 0.017458166927099228, "memory(GiB)": 21.48, "step": 19562, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.954826 }, { "epoch": 0.6355131078842218, "grad_norm": 0.4625117778778076, "learning_rate": 3.213268015353772e-06, "loss": 0.013206924311816692, "memory(GiB)": 21.48, "step": 19563, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954834 }, { "epoch": 0.6355455933469772, "grad_norm": 0.25402742624282837, "learning_rate": 3.2127663399061524e-06, "loss": 0.013430235907435417, "memory(GiB)": 21.48, "step": 19564, "token_acc": 1.0, "train_speed(iter/s)": 0.954842 }, { "epoch": 0.6355780788097326, "grad_norm": 0.5611571073532104, "learning_rate": 3.2122646850855853e-06, "loss": 0.026371922343969345, "memory(GiB)": 21.48, "step": 19565, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.6356105642724881, "grad_norm": 0.3614422678947449, "learning_rate": 3.211763050897866e-06, "loss": 0.016109855845570564, "memory(GiB)": 21.48, "step": 19566, "token_acc": 1.0, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.6356430497352434, "grad_norm": 0.41002020239830017, "learning_rate": 3.211261437348779e-06, "loss": 0.014487079344689846, "memory(GiB)": 21.48, "step": 19567, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.6356755351979989, "grad_norm": 0.45516133308410645, "learning_rate": 3.2107598444441157e-06, "loss": 0.022531576454639435, "memory(GiB)": 21.48, "step": 19568, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.954873 }, { "epoch": 0.6357080206607543, "grad_norm": 0.23753997683525085, "learning_rate": 3.2102582721896674e-06, "loss": 0.010043950751423836, "memory(GiB)": 21.48, "step": 19569, "token_acc": 1.0, "train_speed(iter/s)": 0.954881 }, { "epoch": 0.6357405061235097, "grad_norm": 0.49964919686317444, "learning_rate": 3.2097567205912183e-06, "loss": 0.015987522900104523, "memory(GiB)": 21.48, "step": 19570, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954889 }, { "epoch": 0.6357729915862651, "grad_norm": 0.2861005365848541, "learning_rate": 3.209255189654562e-06, "loss": 0.01353162806481123, "memory(GiB)": 21.48, "step": 19571, "token_acc": 1.0, "train_speed(iter/s)": 0.954896 }, { "epoch": 0.6358054770490206, "grad_norm": 0.40241551399230957, "learning_rate": 3.2087536793854825e-06, "loss": 0.021901048719882965, "memory(GiB)": 21.48, "step": 19572, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.6358379625117759, "grad_norm": 0.2303839772939682, "learning_rate": 3.2082521897897705e-06, "loss": 0.01107160747051239, "memory(GiB)": 21.48, "step": 19573, "token_acc": 1.0, "train_speed(iter/s)": 0.954913 }, { "epoch": 0.6358704479745314, "grad_norm": 0.357657253742218, "learning_rate": 3.207750720873212e-06, "loss": 0.02015773579478264, "memory(GiB)": 21.48, "step": 19574, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954922 }, { "epoch": 0.6359029334372868, "grad_norm": 0.30015942454338074, "learning_rate": 3.207249272641597e-06, "loss": 0.012847714126110077, "memory(GiB)": 21.48, "step": 19575, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.95493 }, { "epoch": 0.6359354189000422, "grad_norm": 0.34543630480766296, "learning_rate": 3.2067478451007105e-06, "loss": 0.01716996543109417, "memory(GiB)": 21.48, "step": 19576, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.954937 }, { "epoch": 0.6359679043627976, "grad_norm": 0.4290592670440674, "learning_rate": 3.206246438256342e-06, "loss": 0.017237845808267593, "memory(GiB)": 21.48, "step": 19577, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.6360003898255531, "grad_norm": 0.28364303708076477, "learning_rate": 3.2057450521142754e-06, "loss": 0.01584659144282341, "memory(GiB)": 21.48, "step": 19578, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.6360328752883084, "grad_norm": 0.2864139676094055, "learning_rate": 3.2052436866803006e-06, "loss": 0.01690458320081234, "memory(GiB)": 21.48, "step": 19579, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954956 }, { "epoch": 0.6360653607510639, "grad_norm": 0.41356027126312256, "learning_rate": 3.204742341960201e-06, "loss": 0.01711832731962204, "memory(GiB)": 21.48, "step": 19580, "token_acc": 1.0, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.6360978462138193, "grad_norm": 0.31535881757736206, "learning_rate": 3.2042410179597648e-06, "loss": 0.02105756103992462, "memory(GiB)": 21.48, "step": 19581, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.6361303316765747, "grad_norm": 0.3209927976131439, "learning_rate": 3.203739714684777e-06, "loss": 0.018606873229146004, "memory(GiB)": 21.48, "step": 19582, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.6361628171393301, "grad_norm": 0.3527302145957947, "learning_rate": 3.203238432141025e-06, "loss": 0.015398349612951279, "memory(GiB)": 21.48, "step": 19583, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.954983 }, { "epoch": 0.6361953026020856, "grad_norm": 0.38454923033714294, "learning_rate": 3.202737170334292e-06, "loss": 0.015438554808497429, "memory(GiB)": 21.48, "step": 19584, "token_acc": 1.0, "train_speed(iter/s)": 0.954989 }, { "epoch": 0.636227788064841, "grad_norm": 0.5308380126953125, "learning_rate": 3.202235929270364e-06, "loss": 0.025207946076989174, "memory(GiB)": 21.48, "step": 19585, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.954997 }, { "epoch": 0.6362602735275964, "grad_norm": 0.43166694045066833, "learning_rate": 3.2017347089550266e-06, "loss": 0.012978690676391125, "memory(GiB)": 21.48, "step": 19586, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.6362927589903519, "grad_norm": 0.32643184065818787, "learning_rate": 3.2012335093940637e-06, "loss": 0.017524896189570427, "memory(GiB)": 21.48, "step": 19587, "token_acc": 1.0, "train_speed(iter/s)": 0.955013 }, { "epoch": 0.6363252444531072, "grad_norm": 0.3608528971672058, "learning_rate": 3.2007323305932613e-06, "loss": 0.020576471462845802, "memory(GiB)": 21.48, "step": 19588, "token_acc": 0.9808612440191388, "train_speed(iter/s)": 0.955021 }, { "epoch": 0.6363577299158627, "grad_norm": 0.751457154750824, "learning_rate": 3.2002311725584013e-06, "loss": 0.014376789331436157, "memory(GiB)": 21.48, "step": 19589, "token_acc": 0.9892857142857143, "train_speed(iter/s)": 0.955028 }, { "epoch": 0.6363902153786181, "grad_norm": 0.2626001834869385, "learning_rate": 3.1997300352952713e-06, "loss": 0.01098647527396679, "memory(GiB)": 21.48, "step": 19590, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955035 }, { "epoch": 0.6364227008413735, "grad_norm": 0.4789159297943115, "learning_rate": 3.1992289188096503e-06, "loss": 0.01787831261754036, "memory(GiB)": 21.48, "step": 19591, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.6364551863041289, "grad_norm": 0.329145610332489, "learning_rate": 3.198727823107327e-06, "loss": 0.018560640513896942, "memory(GiB)": 21.48, "step": 19592, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95505 }, { "epoch": 0.6364876717668844, "grad_norm": 0.436832457780838, "learning_rate": 3.1982267481940787e-06, "loss": 0.027332618832588196, "memory(GiB)": 21.48, "step": 19593, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955058 }, { "epoch": 0.6365201572296397, "grad_norm": 0.40195560455322266, "learning_rate": 3.1977256940756952e-06, "loss": 0.021836675703525543, "memory(GiB)": 21.48, "step": 19594, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955066 }, { "epoch": 0.6365526426923952, "grad_norm": 0.2625274658203125, "learning_rate": 3.1972246607579536e-06, "loss": 0.013835271820425987, "memory(GiB)": 21.48, "step": 19595, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955075 }, { "epoch": 0.6365851281551506, "grad_norm": 0.510539710521698, "learning_rate": 3.1967236482466412e-06, "loss": 0.020634669810533524, "memory(GiB)": 21.48, "step": 19596, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.955086 }, { "epoch": 0.636617613617906, "grad_norm": 0.32863229513168335, "learning_rate": 3.196222656547536e-06, "loss": 0.014899157918989658, "memory(GiB)": 21.48, "step": 19597, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.6366500990806614, "grad_norm": 1.3164716958999634, "learning_rate": 3.195721685666423e-06, "loss": 0.019327664747834206, "memory(GiB)": 21.48, "step": 19598, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955107 }, { "epoch": 0.6366825845434169, "grad_norm": 0.410480260848999, "learning_rate": 3.1952207356090826e-06, "loss": 0.026087764650583267, "memory(GiB)": 21.48, "step": 19599, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.955117 }, { "epoch": 0.6367150700061722, "grad_norm": 0.401959091424942, "learning_rate": 3.194719806381297e-06, "loss": 0.014396914280951023, "memory(GiB)": 21.48, "step": 19600, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.6367475554689277, "grad_norm": 0.3791351616382599, "learning_rate": 3.194218897988849e-06, "loss": 0.01680932752788067, "memory(GiB)": 21.48, "step": 19601, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955139 }, { "epoch": 0.6367800409316831, "grad_norm": 0.24195526540279388, "learning_rate": 3.193718010437517e-06, "loss": 0.012366549111902714, "memory(GiB)": 21.48, "step": 19602, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955149 }, { "epoch": 0.6368125263944385, "grad_norm": 0.26721474528312683, "learning_rate": 3.193217143733085e-06, "loss": 0.011021671816706657, "memory(GiB)": 21.48, "step": 19603, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.6368450118571939, "grad_norm": 0.2732959985733032, "learning_rate": 3.19271629788133e-06, "loss": 0.0161982923746109, "memory(GiB)": 21.48, "step": 19604, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95517 }, { "epoch": 0.6368774973199494, "grad_norm": 0.3954797685146332, "learning_rate": 3.192215472888036e-06, "loss": 0.015507129020988941, "memory(GiB)": 21.48, "step": 19605, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.6369099827827047, "grad_norm": 0.5514589548110962, "learning_rate": 3.1917146687589806e-06, "loss": 0.02563263103365898, "memory(GiB)": 21.48, "step": 19606, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955191 }, { "epoch": 0.6369424682454602, "grad_norm": 0.20283590257167816, "learning_rate": 3.1912138854999454e-06, "loss": 0.008837953209877014, "memory(GiB)": 21.48, "step": 19607, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955202 }, { "epoch": 0.6369749537082156, "grad_norm": 0.39955756068229675, "learning_rate": 3.190713123116709e-06, "loss": 0.017220716923475266, "memory(GiB)": 21.48, "step": 19608, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955213 }, { "epoch": 0.637007439170971, "grad_norm": 0.34228208661079407, "learning_rate": 3.1902123816150517e-06, "loss": 0.01528523676097393, "memory(GiB)": 21.48, "step": 19609, "token_acc": 1.0, "train_speed(iter/s)": 0.955223 }, { "epoch": 0.6370399246337264, "grad_norm": 0.28907260298728943, "learning_rate": 3.1897116610007517e-06, "loss": 0.015748223289847374, "memory(GiB)": 21.48, "step": 19610, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.6370724100964819, "grad_norm": 0.4256741404533386, "learning_rate": 3.1892109612795892e-06, "loss": 0.020394867286086082, "memory(GiB)": 21.48, "step": 19611, "token_acc": 1.0, "train_speed(iter/s)": 0.955244 }, { "epoch": 0.6371048955592372, "grad_norm": 0.3131803274154663, "learning_rate": 3.1887102824573413e-06, "loss": 0.020938601344823837, "memory(GiB)": 21.48, "step": 19612, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955254 }, { "epoch": 0.6371373810219927, "grad_norm": 0.45738404989242554, "learning_rate": 3.1882096245397904e-06, "loss": 0.01681794971227646, "memory(GiB)": 21.48, "step": 19613, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.637169866484748, "grad_norm": 0.32697218656539917, "learning_rate": 3.1877089875327084e-06, "loss": 0.016030561178922653, "memory(GiB)": 21.48, "step": 19614, "token_acc": 1.0, "train_speed(iter/s)": 0.955274 }, { "epoch": 0.6372023519475035, "grad_norm": 0.25696519017219543, "learning_rate": 3.1872083714418777e-06, "loss": 0.011811106465756893, "memory(GiB)": 21.48, "step": 19615, "token_acc": 1.0, "train_speed(iter/s)": 0.955285 }, { "epoch": 0.6372348374102589, "grad_norm": 0.3129672408103943, "learning_rate": 3.1867077762730764e-06, "loss": 0.011109251528978348, "memory(GiB)": 21.48, "step": 19616, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.6372673228730144, "grad_norm": 0.3904101848602295, "learning_rate": 3.18620720203208e-06, "loss": 0.02392912656068802, "memory(GiB)": 21.48, "step": 19617, "token_acc": 1.0, "train_speed(iter/s)": 0.955306 }, { "epoch": 0.6372998083357697, "grad_norm": 0.48277655243873596, "learning_rate": 3.1857066487246687e-06, "loss": 0.017988014966249466, "memory(GiB)": 21.48, "step": 19618, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.6373322937985252, "grad_norm": 0.3339850604534149, "learning_rate": 3.1852061163566138e-06, "loss": 0.01238834485411644, "memory(GiB)": 21.48, "step": 19619, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.6373647792612805, "grad_norm": 0.34618085622787476, "learning_rate": 3.1847056049337e-06, "loss": 0.015873484313488007, "memory(GiB)": 21.48, "step": 19620, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.637397264724036, "grad_norm": 0.3039959967136383, "learning_rate": 3.1842051144616953e-06, "loss": 0.013691780157387257, "memory(GiB)": 21.48, "step": 19621, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.6374297501867914, "grad_norm": 0.34444141387939453, "learning_rate": 3.1837046449463844e-06, "loss": 0.013542638160288334, "memory(GiB)": 21.48, "step": 19622, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955358 }, { "epoch": 0.6374622356495468, "grad_norm": 0.3278160095214844, "learning_rate": 3.183204196393537e-06, "loss": 0.016405360773205757, "memory(GiB)": 21.48, "step": 19623, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955368 }, { "epoch": 0.6374947211123022, "grad_norm": 0.3416953384876251, "learning_rate": 3.1827037688089323e-06, "loss": 0.02118688076734543, "memory(GiB)": 21.48, "step": 19624, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.6375272065750577, "grad_norm": 0.32172006368637085, "learning_rate": 3.182203362198344e-06, "loss": 0.016752667725086212, "memory(GiB)": 21.48, "step": 19625, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.637559692037813, "grad_norm": 0.2888663411140442, "learning_rate": 3.181702976567549e-06, "loss": 0.014194635674357414, "memory(GiB)": 21.48, "step": 19626, "token_acc": 1.0, "train_speed(iter/s)": 0.955392 }, { "epoch": 0.6375921775005685, "grad_norm": 0.27594390511512756, "learning_rate": 3.1812026119223214e-06, "loss": 0.012078513391315937, "memory(GiB)": 21.48, "step": 19627, "token_acc": 1.0, "train_speed(iter/s)": 0.955399 }, { "epoch": 0.6376246629633239, "grad_norm": 0.3359434902667999, "learning_rate": 3.1807022682684375e-06, "loss": 0.010419627651572227, "memory(GiB)": 21.48, "step": 19628, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.6376571484260793, "grad_norm": 0.32100537419319153, "learning_rate": 3.1802019456116694e-06, "loss": 0.017499614506959915, "memory(GiB)": 21.48, "step": 19629, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955417 }, { "epoch": 0.6376896338888347, "grad_norm": 0.3042813241481781, "learning_rate": 3.179701643957793e-06, "loss": 0.015214349143207073, "memory(GiB)": 21.48, "step": 19630, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.6377221193515902, "grad_norm": 0.5006691813468933, "learning_rate": 3.179201363312584e-06, "loss": 0.01936201937496662, "memory(GiB)": 21.48, "step": 19631, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.6377546048143455, "grad_norm": 0.3779570162296295, "learning_rate": 3.1787011036818133e-06, "loss": 0.018071332946419716, "memory(GiB)": 21.48, "step": 19632, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.637787090277101, "grad_norm": 0.38746875524520874, "learning_rate": 3.178200865071257e-06, "loss": 0.019939083606004715, "memory(GiB)": 21.48, "step": 19633, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.955448 }, { "epoch": 0.6378195757398564, "grad_norm": 0.4993587136268616, "learning_rate": 3.177700647486687e-06, "loss": 0.014391878619790077, "memory(GiB)": 21.48, "step": 19634, "token_acc": 1.0, "train_speed(iter/s)": 0.955455 }, { "epoch": 0.6378520612026118, "grad_norm": 0.39517566561698914, "learning_rate": 3.177200450933877e-06, "loss": 0.014893968589603901, "memory(GiB)": 21.48, "step": 19635, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.6378845466653672, "grad_norm": 0.33251407742500305, "learning_rate": 3.1767002754186e-06, "loss": 0.011812100186944008, "memory(GiB)": 21.48, "step": 19636, "token_acc": 1.0, "train_speed(iter/s)": 0.955468 }, { "epoch": 0.6379170321281227, "grad_norm": 0.331778883934021, "learning_rate": 3.1762001209466294e-06, "loss": 0.01506116520613432, "memory(GiB)": 21.48, "step": 19637, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955475 }, { "epoch": 0.637949517590878, "grad_norm": 0.3122147023677826, "learning_rate": 3.1756999875237353e-06, "loss": 0.01149966474622488, "memory(GiB)": 21.48, "step": 19638, "token_acc": 0.992, "train_speed(iter/s)": 0.955482 }, { "epoch": 0.6379820030536335, "grad_norm": 0.287655770778656, "learning_rate": 3.1751998751556933e-06, "loss": 0.016281332820653915, "memory(GiB)": 21.48, "step": 19639, "token_acc": 1.0, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.6380144885163889, "grad_norm": 0.6620687246322632, "learning_rate": 3.1746997838482718e-06, "loss": 0.02286827191710472, "memory(GiB)": 21.48, "step": 19640, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.6380469739791443, "grad_norm": 0.31786566972732544, "learning_rate": 3.1741997136072465e-06, "loss": 0.018838487565517426, "memory(GiB)": 21.48, "step": 19641, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955502 }, { "epoch": 0.6380794594418997, "grad_norm": 0.45185616612434387, "learning_rate": 3.173699664438384e-06, "loss": 0.019212756305933, "memory(GiB)": 21.48, "step": 19642, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955509 }, { "epoch": 0.6381119449046552, "grad_norm": 0.23114971816539764, "learning_rate": 3.173199636347461e-06, "loss": 0.011041766032576561, "memory(GiB)": 21.48, "step": 19643, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.6381444303674105, "grad_norm": 0.5882251858711243, "learning_rate": 3.1726996293402425e-06, "loss": 0.019457729533314705, "memory(GiB)": 21.48, "step": 19644, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.638176915830166, "grad_norm": 0.38211292028427124, "learning_rate": 3.1721996434225055e-06, "loss": 0.017611660063266754, "memory(GiB)": 21.48, "step": 19645, "token_acc": 1.0, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.6382094012929214, "grad_norm": 0.4251064658164978, "learning_rate": 3.1716996786000152e-06, "loss": 0.028328578919172287, "memory(GiB)": 21.48, "step": 19646, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.6382418867556768, "grad_norm": 0.2669176161289215, "learning_rate": 3.1711997348785433e-06, "loss": 0.014689828269183636, "memory(GiB)": 21.48, "step": 19647, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955543 }, { "epoch": 0.6382743722184323, "grad_norm": 0.3537523150444031, "learning_rate": 3.170699812263863e-06, "loss": 0.015432607382535934, "memory(GiB)": 21.48, "step": 19648, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.6383068576811877, "grad_norm": 0.3704160749912262, "learning_rate": 3.1701999107617397e-06, "loss": 0.018851231783628464, "memory(GiB)": 21.48, "step": 19649, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.955558 }, { "epoch": 0.6383393431439431, "grad_norm": 0.3407399356365204, "learning_rate": 3.169700030377947e-06, "loss": 0.016409248113632202, "memory(GiB)": 21.48, "step": 19650, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.6383718286066985, "grad_norm": 0.3619837760925293, "learning_rate": 3.169200171118251e-06, "loss": 0.017524512484669685, "memory(GiB)": 21.48, "step": 19651, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.638404314069454, "grad_norm": 0.2941492795944214, "learning_rate": 3.1687003329884224e-06, "loss": 0.010233269073069096, "memory(GiB)": 21.48, "step": 19652, "token_acc": 1.0, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.6384367995322093, "grad_norm": 0.2874990701675415, "learning_rate": 3.168200515994228e-06, "loss": 0.015385006554424763, "memory(GiB)": 21.48, "step": 19653, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955589 }, { "epoch": 0.6384692849949648, "grad_norm": 0.30175119638442993, "learning_rate": 3.167700720141439e-06, "loss": 0.014446232467889786, "memory(GiB)": 21.48, "step": 19654, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.6385017704577202, "grad_norm": 0.4219108819961548, "learning_rate": 3.1672009454358216e-06, "loss": 0.02529330365359783, "memory(GiB)": 21.48, "step": 19655, "token_acc": 1.0, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.6385342559204756, "grad_norm": 0.26177579164505005, "learning_rate": 3.1667011918831456e-06, "loss": 0.015574997290968895, "memory(GiB)": 21.48, "step": 19656, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955618 }, { "epoch": 0.638566741383231, "grad_norm": 0.280632883310318, "learning_rate": 3.166201459489177e-06, "loss": 0.013516252860426903, "memory(GiB)": 21.48, "step": 19657, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.6385992268459865, "grad_norm": 0.36782437562942505, "learning_rate": 3.165701748259685e-06, "loss": 0.016672387719154358, "memory(GiB)": 21.48, "step": 19658, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.6386317123087418, "grad_norm": 0.41237398982048035, "learning_rate": 3.165202058200436e-06, "loss": 0.014140548184514046, "memory(GiB)": 21.48, "step": 19659, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.6386641977714973, "grad_norm": 0.29515141248703003, "learning_rate": 3.164702389317198e-06, "loss": 0.013588447123765945, "memory(GiB)": 21.48, "step": 19660, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.6386966832342527, "grad_norm": 0.41704031825065613, "learning_rate": 3.164202741615736e-06, "loss": 0.021509047597646713, "memory(GiB)": 21.48, "step": 19661, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.95567 }, { "epoch": 0.6387291686970081, "grad_norm": 0.5729469060897827, "learning_rate": 3.1637031151018183e-06, "loss": 0.02069029025733471, "memory(GiB)": 21.48, "step": 19662, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95568 }, { "epoch": 0.6387616541597635, "grad_norm": 0.3842145800590515, "learning_rate": 3.1632035097812113e-06, "loss": 0.02142471820116043, "memory(GiB)": 21.48, "step": 19663, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955689 }, { "epoch": 0.638794139622519, "grad_norm": 0.37967634201049805, "learning_rate": 3.16270392565968e-06, "loss": 0.01803770661354065, "memory(GiB)": 21.48, "step": 19664, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.9557 }, { "epoch": 0.6388266250852743, "grad_norm": 0.9206739068031311, "learning_rate": 3.162204362742991e-06, "loss": 0.022602923214435577, "memory(GiB)": 21.48, "step": 19665, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.6388591105480298, "grad_norm": 0.39103323221206665, "learning_rate": 3.1617048210369096e-06, "loss": 0.017536606639623642, "memory(GiB)": 21.48, "step": 19666, "token_acc": 1.0, "train_speed(iter/s)": 0.955721 }, { "epoch": 0.6388915960107852, "grad_norm": 0.2973606586456299, "learning_rate": 3.161205300547202e-06, "loss": 0.006941543892025948, "memory(GiB)": 21.48, "step": 19667, "token_acc": 1.0, "train_speed(iter/s)": 0.955731 }, { "epoch": 0.6389240814735406, "grad_norm": 0.32148393988609314, "learning_rate": 3.160705801279632e-06, "loss": 0.01945701241493225, "memory(GiB)": 21.48, "step": 19668, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.638956566936296, "grad_norm": 0.30486926436424255, "learning_rate": 3.160206323239967e-06, "loss": 0.014194920659065247, "memory(GiB)": 21.48, "step": 19669, "token_acc": 1.0, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.6389890523990515, "grad_norm": 0.35062921047210693, "learning_rate": 3.1597068664339665e-06, "loss": 0.012751782312989235, "memory(GiB)": 21.48, "step": 19670, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.6390215378618068, "grad_norm": 0.30545487999916077, "learning_rate": 3.1592074308674014e-06, "loss": 0.012833129614591599, "memory(GiB)": 21.48, "step": 19671, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.955773 }, { "epoch": 0.6390540233245623, "grad_norm": 0.3310616612434387, "learning_rate": 3.1587080165460294e-06, "loss": 0.013031414709985256, "memory(GiB)": 21.48, "step": 19672, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.6390865087873177, "grad_norm": 0.4019676744937897, "learning_rate": 3.1582086234756206e-06, "loss": 0.01718110777437687, "memory(GiB)": 21.48, "step": 19673, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955794 }, { "epoch": 0.6391189942500731, "grad_norm": 0.24250684678554535, "learning_rate": 3.157709251661934e-06, "loss": 0.013985169120132923, "memory(GiB)": 21.48, "step": 19674, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.6391514797128285, "grad_norm": 0.27661558985710144, "learning_rate": 3.157209901110736e-06, "loss": 0.009371662512421608, "memory(GiB)": 21.48, "step": 19675, "token_acc": 1.0, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.639183965175584, "grad_norm": 0.31402096152305603, "learning_rate": 3.1567105718277868e-06, "loss": 0.011543259024620056, "memory(GiB)": 21.48, "step": 19676, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955825 }, { "epoch": 0.6392164506383393, "grad_norm": 0.47256794571876526, "learning_rate": 3.1562112638188503e-06, "loss": 0.02016139030456543, "memory(GiB)": 21.48, "step": 19677, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.6392489361010948, "grad_norm": 0.40848326683044434, "learning_rate": 3.1557119770896926e-06, "loss": 0.014430712908506393, "memory(GiB)": 21.48, "step": 19678, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.6392814215638501, "grad_norm": 0.4620124399662018, "learning_rate": 3.1552127116460718e-06, "loss": 0.021929152309894562, "memory(GiB)": 21.48, "step": 19679, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.6393139070266056, "grad_norm": 0.3174920976161957, "learning_rate": 3.1547134674937528e-06, "loss": 0.015256116166710854, "memory(GiB)": 21.48, "step": 19680, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.639346392489361, "grad_norm": 0.2631489932537079, "learning_rate": 3.1542142446384954e-06, "loss": 0.01113089732825756, "memory(GiB)": 21.48, "step": 19681, "token_acc": 1.0, "train_speed(iter/s)": 0.955878 }, { "epoch": 0.6393788779521165, "grad_norm": 0.3052395284175873, "learning_rate": 3.1537150430860632e-06, "loss": 0.014767924323678017, "memory(GiB)": 21.48, "step": 19682, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.6394113634148718, "grad_norm": 0.28609681129455566, "learning_rate": 3.153215862842216e-06, "loss": 0.013110237196087837, "memory(GiB)": 21.48, "step": 19683, "token_acc": 1.0, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.6394438488776273, "grad_norm": 0.31196722388267517, "learning_rate": 3.152716703912717e-06, "loss": 0.01341527234762907, "memory(GiB)": 21.48, "step": 19684, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.6394763343403826, "grad_norm": 0.39603644609451294, "learning_rate": 3.1522175663033242e-06, "loss": 0.021991323679685593, "memory(GiB)": 21.48, "step": 19685, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.6395088198031381, "grad_norm": 0.279351145029068, "learning_rate": 3.1517184500198016e-06, "loss": 0.016891740262508392, "memory(GiB)": 21.48, "step": 19686, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.6395413052658935, "grad_norm": 0.28260305523872375, "learning_rate": 3.151219355067907e-06, "loss": 0.012378336861729622, "memory(GiB)": 21.48, "step": 19687, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.639573790728649, "grad_norm": 0.6839200258255005, "learning_rate": 3.150720281453403e-06, "loss": 0.02469903789460659, "memory(GiB)": 21.48, "step": 19688, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.6396062761914043, "grad_norm": 0.43044108152389526, "learning_rate": 3.150221229182047e-06, "loss": 0.017548363655805588, "memory(GiB)": 21.48, "step": 19689, "token_acc": 1.0, "train_speed(iter/s)": 0.955951 }, { "epoch": 0.6396387616541598, "grad_norm": 0.2959873378276825, "learning_rate": 3.1497221982596016e-06, "loss": 0.013034235686063766, "memory(GiB)": 21.48, "step": 19690, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.955959 }, { "epoch": 0.6396712471169151, "grad_norm": 0.3133525252342224, "learning_rate": 3.1492231886918236e-06, "loss": 0.01492747850716114, "memory(GiB)": 21.48, "step": 19691, "token_acc": 0.988, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.6397037325796706, "grad_norm": 0.28101983666419983, "learning_rate": 3.1487242004844733e-06, "loss": 0.011982467956840992, "memory(GiB)": 21.48, "step": 19692, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.639736218042426, "grad_norm": 0.27091801166534424, "learning_rate": 3.14822523364331e-06, "loss": 0.0175224170088768, "memory(GiB)": 21.48, "step": 19693, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.6397687035051814, "grad_norm": 0.3322281539440155, "learning_rate": 3.1477262881740924e-06, "loss": 0.011501414701342583, "memory(GiB)": 21.48, "step": 19694, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.6398011889679368, "grad_norm": 0.27975499629974365, "learning_rate": 3.147227364082579e-06, "loss": 0.016340136528015137, "memory(GiB)": 21.48, "step": 19695, "token_acc": 1.0, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.6398336744306923, "grad_norm": 0.45406025648117065, "learning_rate": 3.1467284613745267e-06, "loss": 0.017831504344940186, "memory(GiB)": 21.48, "step": 19696, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956006 }, { "epoch": 0.6398661598934476, "grad_norm": 0.4813087284564972, "learning_rate": 3.1462295800556973e-06, "loss": 0.019040241837501526, "memory(GiB)": 21.48, "step": 19697, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.6398986453562031, "grad_norm": 0.4145694077014923, "learning_rate": 3.1457307201318426e-06, "loss": 0.022492578253149986, "memory(GiB)": 21.48, "step": 19698, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.6399311308189585, "grad_norm": 0.28168386220932007, "learning_rate": 3.145231881608727e-06, "loss": 0.011238893494009972, "memory(GiB)": 21.48, "step": 19699, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956026 }, { "epoch": 0.6399636162817139, "grad_norm": 0.779948353767395, "learning_rate": 3.1447330644921005e-06, "loss": 0.017120197415351868, "memory(GiB)": 21.48, "step": 19700, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956033 }, { "epoch": 0.6399961017444693, "grad_norm": 0.34359756112098694, "learning_rate": 3.1442342687877268e-06, "loss": 0.01336167473345995, "memory(GiB)": 21.48, "step": 19701, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.6400285872072248, "grad_norm": 0.32146912813186646, "learning_rate": 3.143735494501358e-06, "loss": 0.012448567897081375, "memory(GiB)": 21.48, "step": 19702, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.956046 }, { "epoch": 0.6400610726699801, "grad_norm": 0.38663843274116516, "learning_rate": 3.1432367416387526e-06, "loss": 0.016030211001634598, "memory(GiB)": 21.48, "step": 19703, "token_acc": 1.0, "train_speed(iter/s)": 0.956053 }, { "epoch": 0.6400935581327356, "grad_norm": 0.21103660762310028, "learning_rate": 3.142738010205666e-06, "loss": 0.010457135736942291, "memory(GiB)": 21.48, "step": 19704, "token_acc": 1.0, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.640126043595491, "grad_norm": 0.2556998133659363, "learning_rate": 3.142239300207856e-06, "loss": 0.01592109352350235, "memory(GiB)": 21.48, "step": 19705, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.6401585290582464, "grad_norm": 0.3612271845340729, "learning_rate": 3.141740611651076e-06, "loss": 0.014591067098081112, "memory(GiB)": 21.48, "step": 19706, "token_acc": 1.0, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.6401910145210018, "grad_norm": 0.3014379143714905, "learning_rate": 3.1412419445410823e-06, "loss": 0.0190337672829628, "memory(GiB)": 21.48, "step": 19707, "token_acc": 1.0, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.6402234999837573, "grad_norm": 0.30036336183547974, "learning_rate": 3.1407432988836315e-06, "loss": 0.02097773551940918, "memory(GiB)": 21.48, "step": 19708, "token_acc": 1.0, "train_speed(iter/s)": 0.956085 }, { "epoch": 0.6402559854465126, "grad_norm": 0.36526891589164734, "learning_rate": 3.1402446746844777e-06, "loss": 0.014266310259699821, "memory(GiB)": 21.48, "step": 19709, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.6402884709092681, "grad_norm": 0.3098473846912384, "learning_rate": 3.1397460719493763e-06, "loss": 0.015371164306998253, "memory(GiB)": 21.48, "step": 19710, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.6403209563720235, "grad_norm": 0.4286905825138092, "learning_rate": 3.1392474906840795e-06, "loss": 0.016862619668245316, "memory(GiB)": 21.48, "step": 19711, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.6403534418347789, "grad_norm": 0.5075848698616028, "learning_rate": 3.138748930894345e-06, "loss": 0.02494705282151699, "memory(GiB)": 21.48, "step": 19712, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.6403859272975344, "grad_norm": 0.2780309319496155, "learning_rate": 3.138250392585924e-06, "loss": 0.01884561963379383, "memory(GiB)": 21.48, "step": 19713, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.6404184127602898, "grad_norm": 1.9168269634246826, "learning_rate": 3.1377518757645725e-06, "loss": 0.013159357011318207, "memory(GiB)": 21.48, "step": 19714, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956132 }, { "epoch": 0.6404508982230452, "grad_norm": 0.31518587470054626, "learning_rate": 3.137253380436042e-06, "loss": 0.015604717656970024, "memory(GiB)": 21.48, "step": 19715, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956142 }, { "epoch": 0.6404833836858006, "grad_norm": 0.2450474351644516, "learning_rate": 3.1367549066060875e-06, "loss": 0.011167019605636597, "memory(GiB)": 21.48, "step": 19716, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956153 }, { "epoch": 0.6405158691485561, "grad_norm": 0.3681402802467346, "learning_rate": 3.136256454280461e-06, "loss": 0.014519482851028442, "memory(GiB)": 21.48, "step": 19717, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.956163 }, { "epoch": 0.6405483546113114, "grad_norm": 0.28663837909698486, "learning_rate": 3.135758023464916e-06, "loss": 0.013568861410021782, "memory(GiB)": 21.48, "step": 19718, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.6405808400740669, "grad_norm": 0.4111754596233368, "learning_rate": 3.1352596141652037e-06, "loss": 0.015438024885952473, "memory(GiB)": 21.48, "step": 19719, "token_acc": 1.0, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.6406133255368223, "grad_norm": 0.41387292742729187, "learning_rate": 3.13476122638708e-06, "loss": 0.014322185888886452, "memory(GiB)": 21.48, "step": 19720, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.6406458109995777, "grad_norm": 0.32159745693206787, "learning_rate": 3.1342628601362907e-06, "loss": 0.020185764878988266, "memory(GiB)": 21.48, "step": 19721, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.6406782964623331, "grad_norm": 0.32707634568214417, "learning_rate": 3.133764515418595e-06, "loss": 0.019847411662340164, "memory(GiB)": 21.48, "step": 19722, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.6407107819250886, "grad_norm": 0.41188448667526245, "learning_rate": 3.133266192239737e-06, "loss": 0.0166170597076416, "memory(GiB)": 21.48, "step": 19723, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956222 }, { "epoch": 0.6407432673878439, "grad_norm": 0.2722340524196625, "learning_rate": 3.1327678906054732e-06, "loss": 0.01072943676263094, "memory(GiB)": 21.48, "step": 19724, "token_acc": 1.0, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.6407757528505994, "grad_norm": 0.36085885763168335, "learning_rate": 3.132269610521555e-06, "loss": 0.01692849211394787, "memory(GiB)": 21.48, "step": 19725, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956243 }, { "epoch": 0.6408082383133548, "grad_norm": 0.41596007347106934, "learning_rate": 3.1317713519937286e-06, "loss": 0.018474776297807693, "memory(GiB)": 21.48, "step": 19726, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.6408407237761102, "grad_norm": 0.3220629096031189, "learning_rate": 3.1312731150277505e-06, "loss": 0.02296200767159462, "memory(GiB)": 21.48, "step": 19727, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.6408732092388656, "grad_norm": 0.5128156542778015, "learning_rate": 3.130774899629365e-06, "loss": 0.0222385935485363, "memory(GiB)": 21.48, "step": 19728, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.956272 }, { "epoch": 0.6409056947016211, "grad_norm": 0.2924889326095581, "learning_rate": 3.1302767058043277e-06, "loss": 0.015587656758725643, "memory(GiB)": 21.48, "step": 19729, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.6409381801643764, "grad_norm": 0.3378707468509674, "learning_rate": 3.129778533558384e-06, "loss": 0.01916545070707798, "memory(GiB)": 21.48, "step": 19730, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956291 }, { "epoch": 0.6409706656271319, "grad_norm": 0.3776194751262665, "learning_rate": 3.129280382897286e-06, "loss": 0.019147265702486038, "memory(GiB)": 21.48, "step": 19731, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.6410031510898873, "grad_norm": 0.24318553507328033, "learning_rate": 3.128782253826782e-06, "loss": 0.011390553787350655, "memory(GiB)": 21.48, "step": 19732, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.6410356365526427, "grad_norm": 0.5039740204811096, "learning_rate": 3.1282841463526216e-06, "loss": 0.019863175228238106, "memory(GiB)": 21.48, "step": 19733, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956322 }, { "epoch": 0.6410681220153981, "grad_norm": 0.3979838788509369, "learning_rate": 3.1277860604805533e-06, "loss": 0.019442681223154068, "memory(GiB)": 21.48, "step": 19734, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956331 }, { "epoch": 0.6411006074781536, "grad_norm": 0.3335837125778198, "learning_rate": 3.127287996216326e-06, "loss": 0.014601413160562515, "memory(GiB)": 21.48, "step": 19735, "token_acc": 0.9875, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.6411330929409089, "grad_norm": 0.43762606382369995, "learning_rate": 3.126789953565687e-06, "loss": 0.01424107514321804, "memory(GiB)": 21.48, "step": 19736, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956352 }, { "epoch": 0.6411655784036644, "grad_norm": 0.41392359137535095, "learning_rate": 3.126291932534386e-06, "loss": 0.019202418625354767, "memory(GiB)": 21.48, "step": 19737, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.6411980638664198, "grad_norm": 0.3718654215335846, "learning_rate": 3.1257939331281693e-06, "loss": 0.01867634430527687, "memory(GiB)": 21.48, "step": 19738, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.956373 }, { "epoch": 0.6412305493291752, "grad_norm": 0.3982299268245697, "learning_rate": 3.125295955352785e-06, "loss": 0.018898136913776398, "memory(GiB)": 21.48, "step": 19739, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.6412630347919306, "grad_norm": 0.472120463848114, "learning_rate": 3.124797999213982e-06, "loss": 0.014059914276003838, "memory(GiB)": 21.48, "step": 19740, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.956393 }, { "epoch": 0.6412955202546861, "grad_norm": 0.2847209870815277, "learning_rate": 3.1243000647175048e-06, "loss": 0.01694638282060623, "memory(GiB)": 21.48, "step": 19741, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.6413280057174414, "grad_norm": 0.4668169319629669, "learning_rate": 3.123802151869102e-06, "loss": 0.018265459686517715, "memory(GiB)": 21.48, "step": 19742, "token_acc": 1.0, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.6413604911801969, "grad_norm": 0.2923809587955475, "learning_rate": 3.123304260674519e-06, "loss": 0.010869799181818962, "memory(GiB)": 21.48, "step": 19743, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.6413929766429523, "grad_norm": 0.38171201944351196, "learning_rate": 3.1228063911395033e-06, "loss": 0.020941752940416336, "memory(GiB)": 21.48, "step": 19744, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.956434 }, { "epoch": 0.6414254621057077, "grad_norm": 0.304090291261673, "learning_rate": 3.1223085432698e-06, "loss": 0.00946168601512909, "memory(GiB)": 21.48, "step": 19745, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956444 }, { "epoch": 0.6414579475684631, "grad_norm": 0.7071558237075806, "learning_rate": 3.121810717071155e-06, "loss": 0.025556370615959167, "memory(GiB)": 21.48, "step": 19746, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956454 }, { "epoch": 0.6414904330312186, "grad_norm": 0.30662235617637634, "learning_rate": 3.1213129125493148e-06, "loss": 0.0178583562374115, "memory(GiB)": 21.48, "step": 19747, "token_acc": 0.99, "train_speed(iter/s)": 0.956465 }, { "epoch": 0.6415229184939739, "grad_norm": 0.38350728154182434, "learning_rate": 3.1208151297100253e-06, "loss": 0.019557995721697807, "memory(GiB)": 21.48, "step": 19748, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.6415554039567294, "grad_norm": 0.3813777267932892, "learning_rate": 3.1203173685590277e-06, "loss": 0.014334021136164665, "memory(GiB)": 21.48, "step": 19749, "token_acc": 1.0, "train_speed(iter/s)": 0.956483 }, { "epoch": 0.6415878894194847, "grad_norm": 0.3849759101867676, "learning_rate": 3.1198196291020723e-06, "loss": 0.018707696348428726, "memory(GiB)": 21.48, "step": 19750, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.6416203748822402, "grad_norm": 0.404238224029541, "learning_rate": 3.119321911344898e-06, "loss": 0.022454850375652313, "memory(GiB)": 21.48, "step": 19751, "token_acc": 0.984375, "train_speed(iter/s)": 0.956499 }, { "epoch": 0.6416528603449956, "grad_norm": 0.3125172257423401, "learning_rate": 3.1188242152932546e-06, "loss": 0.01417448464781046, "memory(GiB)": 21.48, "step": 19752, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.641685345807751, "grad_norm": 0.3393726050853729, "learning_rate": 3.1183265409528824e-06, "loss": 0.011816105805337429, "memory(GiB)": 21.48, "step": 19753, "token_acc": 0.996, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.6417178312705064, "grad_norm": 0.29582828283309937, "learning_rate": 3.117828888329525e-06, "loss": 0.01423798967152834, "memory(GiB)": 21.48, "step": 19754, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.6417503167332619, "grad_norm": 0.2364729940891266, "learning_rate": 3.1173312574289307e-06, "loss": 0.010295543819665909, "memory(GiB)": 21.48, "step": 19755, "token_acc": 1.0, "train_speed(iter/s)": 0.956531 }, { "epoch": 0.6417828021960172, "grad_norm": 0.31208109855651855, "learning_rate": 3.1168336482568357e-06, "loss": 0.01537844818085432, "memory(GiB)": 21.48, "step": 19756, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.6418152876587727, "grad_norm": 0.2557278871536255, "learning_rate": 3.1163360608189907e-06, "loss": 0.01356389932334423, "memory(GiB)": 21.48, "step": 19757, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.6418477731215281, "grad_norm": 0.2931356132030487, "learning_rate": 3.115838495121131e-06, "loss": 0.010349464602768421, "memory(GiB)": 21.48, "step": 19758, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95655 }, { "epoch": 0.6418802585842835, "grad_norm": 0.27163606882095337, "learning_rate": 3.1153409511690064e-06, "loss": 0.015806086361408234, "memory(GiB)": 21.48, "step": 19759, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.6419127440470389, "grad_norm": 0.504531979560852, "learning_rate": 3.1148434289683537e-06, "loss": 0.023977354168891907, "memory(GiB)": 21.48, "step": 19760, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.6419452295097944, "grad_norm": 0.5919917821884155, "learning_rate": 3.114345928524918e-06, "loss": 0.02252907119691372, "memory(GiB)": 21.48, "step": 19761, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.6419777149725497, "grad_norm": 0.3830644488334656, "learning_rate": 3.113848449844439e-06, "loss": 0.01605541817843914, "memory(GiB)": 21.48, "step": 19762, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.6420102004353052, "grad_norm": 0.4045954942703247, "learning_rate": 3.11335099293266e-06, "loss": 0.024522263556718826, "memory(GiB)": 21.48, "step": 19763, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.6420426858980606, "grad_norm": 0.3710066080093384, "learning_rate": 3.112853557795321e-06, "loss": 0.014485879801213741, "memory(GiB)": 21.48, "step": 19764, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956589 }, { "epoch": 0.642075171360816, "grad_norm": 0.8834846019744873, "learning_rate": 3.1123561444381644e-06, "loss": 0.028069056570529938, "memory(GiB)": 21.48, "step": 19765, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956595 }, { "epoch": 0.6421076568235714, "grad_norm": 0.3195297420024872, "learning_rate": 3.11185875286693e-06, "loss": 0.012030013836920261, "memory(GiB)": 21.48, "step": 19766, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.6421401422863269, "grad_norm": 0.41555675864219666, "learning_rate": 3.111361383087359e-06, "loss": 0.021453402936458588, "memory(GiB)": 21.48, "step": 19767, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.956608 }, { "epoch": 0.6421726277490822, "grad_norm": 0.3329195976257324, "learning_rate": 3.110864035105191e-06, "loss": 0.01484823040664196, "memory(GiB)": 21.48, "step": 19768, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.6422051132118377, "grad_norm": 0.524713397026062, "learning_rate": 3.1103667089261656e-06, "loss": 0.0190825704485178, "memory(GiB)": 21.48, "step": 19769, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956621 }, { "epoch": 0.6422375986745931, "grad_norm": 0.328174352645874, "learning_rate": 3.109869404556025e-06, "loss": 0.015332795679569244, "memory(GiB)": 21.48, "step": 19770, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956628 }, { "epoch": 0.6422700841373485, "grad_norm": 0.23078003525733948, "learning_rate": 3.1093721220005067e-06, "loss": 0.006304787006229162, "memory(GiB)": 21.48, "step": 19771, "token_acc": 1.0, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.6423025696001039, "grad_norm": 0.2148461490869522, "learning_rate": 3.1088748612653515e-06, "loss": 0.011095844209194183, "memory(GiB)": 21.48, "step": 19772, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956641 }, { "epoch": 0.6423350550628594, "grad_norm": 0.2167823761701584, "learning_rate": 3.1083776223562956e-06, "loss": 0.0083152549341321, "memory(GiB)": 21.48, "step": 19773, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95665 }, { "epoch": 0.6423675405256147, "grad_norm": 0.3485601246356964, "learning_rate": 3.107880405279081e-06, "loss": 0.0116486307233572, "memory(GiB)": 21.48, "step": 19774, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956658 }, { "epoch": 0.6424000259883702, "grad_norm": 0.3829362094402313, "learning_rate": 3.1073832100394446e-06, "loss": 0.015648625791072845, "memory(GiB)": 21.48, "step": 19775, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.6424325114511257, "grad_norm": 0.23040953278541565, "learning_rate": 3.1068860366431255e-06, "loss": 0.013264170847833157, "memory(GiB)": 21.48, "step": 19776, "token_acc": 1.0, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.642464996913881, "grad_norm": 0.2767479121685028, "learning_rate": 3.1063888850958602e-06, "loss": 0.013031590729951859, "memory(GiB)": 21.48, "step": 19777, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956688 }, { "epoch": 0.6424974823766365, "grad_norm": 0.410770982503891, "learning_rate": 3.1058917554033897e-06, "loss": 0.014571876265108585, "memory(GiB)": 21.48, "step": 19778, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956698 }, { "epoch": 0.6425299678393919, "grad_norm": 0.8128400444984436, "learning_rate": 3.105394647571447e-06, "loss": 0.028119105845689774, "memory(GiB)": 21.48, "step": 19779, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.6425624533021473, "grad_norm": 0.7099564671516418, "learning_rate": 3.1048975616057743e-06, "loss": 0.0316670686006546, "memory(GiB)": 21.48, "step": 19780, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.6425949387649027, "grad_norm": 0.3130672574043274, "learning_rate": 3.1044004975121024e-06, "loss": 0.011939270421862602, "memory(GiB)": 21.48, "step": 19781, "token_acc": 1.0, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.6426274242276582, "grad_norm": 0.39070188999176025, "learning_rate": 3.1039034552961756e-06, "loss": 0.02073616161942482, "memory(GiB)": 21.48, "step": 19782, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.6426599096904135, "grad_norm": 0.48252925276756287, "learning_rate": 3.1034064349637236e-06, "loss": 0.014314687810838223, "memory(GiB)": 21.48, "step": 19783, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956749 }, { "epoch": 0.642692395153169, "grad_norm": 0.6602339148521423, "learning_rate": 3.102909436520488e-06, "loss": 0.01754804328083992, "memory(GiB)": 21.48, "step": 19784, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.6427248806159244, "grad_norm": 0.2997104227542877, "learning_rate": 3.1024124599722013e-06, "loss": 0.013005629181861877, "memory(GiB)": 21.48, "step": 19785, "token_acc": 1.0, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.6427573660786798, "grad_norm": 0.18899664282798767, "learning_rate": 3.101915505324599e-06, "loss": 0.007573361974209547, "memory(GiB)": 21.48, "step": 19786, "token_acc": 1.0, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.6427898515414352, "grad_norm": 0.3563472330570221, "learning_rate": 3.101418572583421e-06, "loss": 0.012273895554244518, "memory(GiB)": 21.48, "step": 19787, "token_acc": 1.0, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.6428223370041907, "grad_norm": 0.3718791902065277, "learning_rate": 3.100921661754398e-06, "loss": 0.01872958056628704, "memory(GiB)": 21.48, "step": 19788, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.642854822466946, "grad_norm": 0.29061099886894226, "learning_rate": 3.100424772843268e-06, "loss": 0.01144552044570446, "memory(GiB)": 21.48, "step": 19789, "token_acc": 1.0, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.6428873079297015, "grad_norm": 0.570589542388916, "learning_rate": 3.099927905855763e-06, "loss": 0.017744014039635658, "memory(GiB)": 21.48, "step": 19790, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.6429197933924569, "grad_norm": 0.38429346680641174, "learning_rate": 3.0994310607976204e-06, "loss": 0.02071959339082241, "memory(GiB)": 21.48, "step": 19791, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.6429522788552123, "grad_norm": 0.2875785231590271, "learning_rate": 3.098934237674571e-06, "loss": 0.01110378373414278, "memory(GiB)": 21.48, "step": 19792, "token_acc": 1.0, "train_speed(iter/s)": 0.956842 }, { "epoch": 0.6429847643179677, "grad_norm": 0.3060932755470276, "learning_rate": 3.0984374364923533e-06, "loss": 0.01823563501238823, "memory(GiB)": 21.48, "step": 19793, "token_acc": 0.9847908745247148, "train_speed(iter/s)": 0.956852 }, { "epoch": 0.6430172497807232, "grad_norm": 0.4549022316932678, "learning_rate": 3.0979406572566963e-06, "loss": 0.015053166076540947, "memory(GiB)": 21.48, "step": 19794, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.6430497352434785, "grad_norm": 0.28625041246414185, "learning_rate": 3.0974438999733373e-06, "loss": 0.013559524901211262, "memory(GiB)": 21.48, "step": 19795, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.643082220706234, "grad_norm": 0.48316600918769836, "learning_rate": 3.0969471646480072e-06, "loss": 0.022594543173909187, "memory(GiB)": 21.48, "step": 19796, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956883 }, { "epoch": 0.6431147061689894, "grad_norm": 0.3410360515117645, "learning_rate": 3.09645045128644e-06, "loss": 0.014305450022220612, "memory(GiB)": 21.48, "step": 19797, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.6431471916317448, "grad_norm": 0.3390013575553894, "learning_rate": 3.095953759894368e-06, "loss": 0.020057454705238342, "memory(GiB)": 21.48, "step": 19798, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.6431796770945002, "grad_norm": 0.39420628547668457, "learning_rate": 3.095457090477525e-06, "loss": 0.014907445758581161, "memory(GiB)": 21.48, "step": 19799, "token_acc": 1.0, "train_speed(iter/s)": 0.956914 }, { "epoch": 0.6432121625572557, "grad_norm": 0.3571493923664093, "learning_rate": 3.0949604430416403e-06, "loss": 0.018066423013806343, "memory(GiB)": 21.48, "step": 19800, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.643244648020011, "grad_norm": 0.3527889847755432, "learning_rate": 3.094463817592448e-06, "loss": 0.01856350153684616, "memory(GiB)": 21.48, "step": 19801, "token_acc": 1.0, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.6432771334827665, "grad_norm": 0.41179707646369934, "learning_rate": 3.093967214135681e-06, "loss": 0.014944588765501976, "memory(GiB)": 21.48, "step": 19802, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.6433096189455219, "grad_norm": 0.5141392946243286, "learning_rate": 3.093470632677068e-06, "loss": 0.02009940706193447, "memory(GiB)": 21.48, "step": 19803, "token_acc": 1.0, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.6433421044082773, "grad_norm": 0.4314466118812561, "learning_rate": 3.0929740732223425e-06, "loss": 0.0170764047652483, "memory(GiB)": 21.48, "step": 19804, "token_acc": 0.996, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.6433745898710327, "grad_norm": 0.2964382469654083, "learning_rate": 3.092477535777233e-06, "loss": 0.014791414141654968, "memory(GiB)": 21.48, "step": 19805, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.6434070753337882, "grad_norm": 0.41107210516929626, "learning_rate": 3.0919810203474736e-06, "loss": 0.01447276584804058, "memory(GiB)": 21.48, "step": 19806, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.956986 }, { "epoch": 0.6434395607965435, "grad_norm": 0.35240429639816284, "learning_rate": 3.0914845269387907e-06, "loss": 0.01685008779168129, "memory(GiB)": 21.48, "step": 19807, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.643472046259299, "grad_norm": 0.2990121841430664, "learning_rate": 3.0909880555569188e-06, "loss": 0.01973879523575306, "memory(GiB)": 21.48, "step": 19808, "token_acc": 1.0, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.6435045317220544, "grad_norm": 0.5719590187072754, "learning_rate": 3.090491606207583e-06, "loss": 0.021585771813988686, "memory(GiB)": 21.48, "step": 19809, "token_acc": 1.0, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.6435370171848098, "grad_norm": 0.3287409245967865, "learning_rate": 3.089995178896518e-06, "loss": 0.012531538493931293, "memory(GiB)": 21.48, "step": 19810, "token_acc": 1.0, "train_speed(iter/s)": 0.957025 }, { "epoch": 0.6435695026475652, "grad_norm": 0.35032400488853455, "learning_rate": 3.0894987736294496e-06, "loss": 0.016846520826220512, "memory(GiB)": 21.48, "step": 19811, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957032 }, { "epoch": 0.6436019881103207, "grad_norm": 0.4841107726097107, "learning_rate": 3.089002390412109e-06, "loss": 0.02436387538909912, "memory(GiB)": 21.48, "step": 19812, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95704 }, { "epoch": 0.643634473573076, "grad_norm": 0.34449076652526855, "learning_rate": 3.088506029250223e-06, "loss": 0.021927807480096817, "memory(GiB)": 21.48, "step": 19813, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957048 }, { "epoch": 0.6436669590358315, "grad_norm": 0.4742105305194855, "learning_rate": 3.088009690149523e-06, "loss": 0.0195598304271698, "memory(GiB)": 21.48, "step": 19814, "token_acc": 1.0, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.6436994444985868, "grad_norm": 0.3384177088737488, "learning_rate": 3.0875133731157346e-06, "loss": 0.00951104611158371, "memory(GiB)": 21.48, "step": 19815, "token_acc": 1.0, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.6437319299613423, "grad_norm": 0.32981595396995544, "learning_rate": 3.0870170781545883e-06, "loss": 0.01590634509921074, "memory(GiB)": 21.48, "step": 19816, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95707 }, { "epoch": 0.6437644154240977, "grad_norm": 0.4330970346927643, "learning_rate": 3.086520805271811e-06, "loss": 0.020432237535715103, "memory(GiB)": 21.48, "step": 19817, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.957077 }, { "epoch": 0.6437969008868532, "grad_norm": 0.40673744678497314, "learning_rate": 3.0860245544731303e-06, "loss": 0.01970004104077816, "memory(GiB)": 21.48, "step": 19818, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957084 }, { "epoch": 0.6438293863496085, "grad_norm": 0.21894650161266327, "learning_rate": 3.085528325764274e-06, "loss": 0.009742673486471176, "memory(GiB)": 21.48, "step": 19819, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.643861871812364, "grad_norm": 0.3093438148498535, "learning_rate": 3.085032119150969e-06, "loss": 0.012201015837490559, "memory(GiB)": 21.48, "step": 19820, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.6438943572751193, "grad_norm": 0.22357720136642456, "learning_rate": 3.0845359346389423e-06, "loss": 0.009799888357520103, "memory(GiB)": 21.48, "step": 19821, "token_acc": 1.0, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.6439268427378748, "grad_norm": 0.3529603183269501, "learning_rate": 3.08403977223392e-06, "loss": 0.01252718549221754, "memory(GiB)": 21.48, "step": 19822, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.6439593282006302, "grad_norm": 0.24181173741817474, "learning_rate": 3.083543631941629e-06, "loss": 0.011541265994310379, "memory(GiB)": 21.48, "step": 19823, "token_acc": 1.0, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.6439918136633856, "grad_norm": 0.4188484251499176, "learning_rate": 3.083047513767795e-06, "loss": 0.01692816987633705, "memory(GiB)": 21.48, "step": 19824, "token_acc": 1.0, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.644024299126141, "grad_norm": 0.46778443455696106, "learning_rate": 3.082551417718145e-06, "loss": 0.01718379557132721, "memory(GiB)": 21.48, "step": 19825, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.6440567845888965, "grad_norm": 0.3448066711425781, "learning_rate": 3.0820553437984022e-06, "loss": 0.0163213349878788, "memory(GiB)": 21.48, "step": 19826, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.6440892700516518, "grad_norm": 0.25722575187683105, "learning_rate": 3.081559292014296e-06, "loss": 0.012997081503272057, "memory(GiB)": 21.48, "step": 19827, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.6441217555144073, "grad_norm": 0.2629702091217041, "learning_rate": 3.0810632623715463e-06, "loss": 0.010943805798888206, "memory(GiB)": 21.48, "step": 19828, "token_acc": 1.0, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.6441542409771627, "grad_norm": 0.2997111678123474, "learning_rate": 3.0805672548758835e-06, "loss": 0.01449041161686182, "memory(GiB)": 21.48, "step": 19829, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.6441867264399181, "grad_norm": 0.35551440715789795, "learning_rate": 3.0800712695330258e-06, "loss": 0.015557065606117249, "memory(GiB)": 21.48, "step": 19830, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957163 }, { "epoch": 0.6442192119026735, "grad_norm": 0.3858199715614319, "learning_rate": 3.079575306348703e-06, "loss": 0.01823156699538231, "memory(GiB)": 21.48, "step": 19831, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957169 }, { "epoch": 0.644251697365429, "grad_norm": 0.7238380312919617, "learning_rate": 3.0790793653286377e-06, "loss": 0.02441348321735859, "memory(GiB)": 21.48, "step": 19832, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.6442841828281843, "grad_norm": 0.34310784935951233, "learning_rate": 3.078583446478552e-06, "loss": 0.021661117672920227, "memory(GiB)": 21.48, "step": 19833, "token_acc": 1.0, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.6443166682909398, "grad_norm": 0.2897420823574066, "learning_rate": 3.078087549804173e-06, "loss": 0.014333481900393963, "memory(GiB)": 21.48, "step": 19834, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.6443491537536952, "grad_norm": 0.4531475007534027, "learning_rate": 3.077591675311219e-06, "loss": 0.016772359609603882, "memory(GiB)": 21.48, "step": 19835, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.6443816392164506, "grad_norm": 0.23456910252571106, "learning_rate": 3.0770958230054184e-06, "loss": 0.012843215838074684, "memory(GiB)": 21.48, "step": 19836, "token_acc": 1.0, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.644414124679206, "grad_norm": 0.3564227521419525, "learning_rate": 3.0765999928924884e-06, "loss": 0.01779034733772278, "memory(GiB)": 21.48, "step": 19837, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.6444466101419615, "grad_norm": 0.49229806661605835, "learning_rate": 3.0761041849781575e-06, "loss": 0.022593975067138672, "memory(GiB)": 21.48, "step": 19838, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.95723 }, { "epoch": 0.6444790956047168, "grad_norm": 0.411015123128891, "learning_rate": 3.075608399268143e-06, "loss": 0.018513623625040054, "memory(GiB)": 21.48, "step": 19839, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.95724 }, { "epoch": 0.6445115810674723, "grad_norm": 0.35950732231140137, "learning_rate": 3.0751126357681706e-06, "loss": 0.01728467084467411, "memory(GiB)": 21.48, "step": 19840, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.6445440665302278, "grad_norm": 0.28230804204940796, "learning_rate": 3.0746168944839585e-06, "loss": 0.014806875959038734, "memory(GiB)": 21.48, "step": 19841, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.95726 }, { "epoch": 0.6445765519929831, "grad_norm": 0.3026002049446106, "learning_rate": 3.0741211754212314e-06, "loss": 0.010495596565306187, "memory(GiB)": 21.48, "step": 19842, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.6446090374557386, "grad_norm": 0.4078657627105713, "learning_rate": 3.0736254785857084e-06, "loss": 0.015438096597790718, "memory(GiB)": 21.48, "step": 19843, "token_acc": 1.0, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.644641522918494, "grad_norm": 0.46623581647872925, "learning_rate": 3.0731298039831127e-06, "loss": 0.022716820240020752, "memory(GiB)": 21.48, "step": 19844, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.6446740083812494, "grad_norm": 0.26311194896698, "learning_rate": 3.072634151619162e-06, "loss": 0.008756830357015133, "memory(GiB)": 21.48, "step": 19845, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.9573 }, { "epoch": 0.6447064938440048, "grad_norm": 0.3038084805011749, "learning_rate": 3.0721385214995793e-06, "loss": 0.01618197001516819, "memory(GiB)": 21.48, "step": 19846, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.6447389793067603, "grad_norm": 0.311240017414093, "learning_rate": 3.071642913630084e-06, "loss": 0.01784610189497471, "memory(GiB)": 21.48, "step": 19847, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957319 }, { "epoch": 0.6447714647695156, "grad_norm": 0.4876770079135895, "learning_rate": 3.0711473280163963e-06, "loss": 0.020654797554016113, "memory(GiB)": 21.48, "step": 19848, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957329 }, { "epoch": 0.6448039502322711, "grad_norm": 0.5718379616737366, "learning_rate": 3.0706517646642352e-06, "loss": 0.026051275432109833, "memory(GiB)": 21.48, "step": 19849, "token_acc": 1.0, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.6448364356950265, "grad_norm": 0.3969273865222931, "learning_rate": 3.0701562235793214e-06, "loss": 0.017535362392663956, "memory(GiB)": 21.48, "step": 19850, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95735 }, { "epoch": 0.6448689211577819, "grad_norm": 0.2671991288661957, "learning_rate": 3.069660704767373e-06, "loss": 0.010845961980521679, "memory(GiB)": 21.48, "step": 19851, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.6449014066205373, "grad_norm": 0.4243296682834625, "learning_rate": 3.0691652082341095e-06, "loss": 0.019691742956638336, "memory(GiB)": 21.48, "step": 19852, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.6449338920832928, "grad_norm": 0.2893921136856079, "learning_rate": 3.06866973398525e-06, "loss": 0.011855985037982464, "memory(GiB)": 21.48, "step": 19853, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.6449663775460481, "grad_norm": 0.45094776153564453, "learning_rate": 3.068174282026511e-06, "loss": 0.020262517035007477, "memory(GiB)": 21.48, "step": 19854, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957391 }, { "epoch": 0.6449988630088036, "grad_norm": 0.2818153500556946, "learning_rate": 3.0676788523636137e-06, "loss": 0.009970388375222683, "memory(GiB)": 21.48, "step": 19855, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.645031348471559, "grad_norm": 0.3206403851509094, "learning_rate": 3.0671834450022726e-06, "loss": 0.013316642493009567, "memory(GiB)": 21.48, "step": 19856, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.6450638339343144, "grad_norm": 0.4219375252723694, "learning_rate": 3.0666880599482095e-06, "loss": 0.01797809824347496, "memory(GiB)": 21.48, "step": 19857, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.6450963193970698, "grad_norm": 0.42797356843948364, "learning_rate": 3.0661926972071365e-06, "loss": 0.014426219277083874, "memory(GiB)": 21.48, "step": 19858, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.6451288048598253, "grad_norm": 0.477789044380188, "learning_rate": 3.0656973567847763e-06, "loss": 0.024779435247182846, "memory(GiB)": 21.48, "step": 19859, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957442 }, { "epoch": 0.6451612903225806, "grad_norm": 0.3678576648235321, "learning_rate": 3.0652020386868407e-06, "loss": 0.01919444650411606, "memory(GiB)": 21.48, "step": 19860, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.6451937757853361, "grad_norm": 0.318213552236557, "learning_rate": 3.0647067429190512e-06, "loss": 0.014098208397626877, "memory(GiB)": 21.48, "step": 19861, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.6452262612480915, "grad_norm": 0.3401441276073456, "learning_rate": 3.0642114694871204e-06, "loss": 0.01596491038799286, "memory(GiB)": 21.48, "step": 19862, "token_acc": 1.0, "train_speed(iter/s)": 0.957473 }, { "epoch": 0.6452587467108469, "grad_norm": 0.3403850793838501, "learning_rate": 3.063716218396764e-06, "loss": 0.01402898970991373, "memory(GiB)": 21.48, "step": 19863, "token_acc": 1.0, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.6452912321736023, "grad_norm": 0.4038216173648834, "learning_rate": 3.0632209896537036e-06, "loss": 0.016843555495142937, "memory(GiB)": 21.48, "step": 19864, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957492 }, { "epoch": 0.6453237176363578, "grad_norm": 0.31851181387901306, "learning_rate": 3.062725783263647e-06, "loss": 0.0163471270352602, "memory(GiB)": 21.48, "step": 19865, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.6453562030991131, "grad_norm": 0.2935388684272766, "learning_rate": 3.0622305992323164e-06, "loss": 0.013406043872237206, "memory(GiB)": 21.48, "step": 19866, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.6453886885618686, "grad_norm": 0.26636257767677307, "learning_rate": 3.0617354375654224e-06, "loss": 0.01380426436662674, "memory(GiB)": 21.48, "step": 19867, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.645421174024624, "grad_norm": 0.5383852124214172, "learning_rate": 3.0612402982686817e-06, "loss": 0.022049030289053917, "memory(GiB)": 21.48, "step": 19868, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.6454536594873794, "grad_norm": 0.32150998711586, "learning_rate": 3.0607451813478083e-06, "loss": 0.01096462830901146, "memory(GiB)": 21.48, "step": 19869, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957543 }, { "epoch": 0.6454861449501348, "grad_norm": 0.4982832968235016, "learning_rate": 3.060250086808517e-06, "loss": 0.01863105036318302, "memory(GiB)": 21.48, "step": 19870, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.6455186304128903, "grad_norm": 0.4464835226535797, "learning_rate": 3.0597550146565204e-06, "loss": 0.019694842398166656, "memory(GiB)": 21.48, "step": 19871, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.6455511158756456, "grad_norm": 0.40729227662086487, "learning_rate": 3.0592599648975344e-06, "loss": 0.022462695837020874, "memory(GiB)": 21.48, "step": 19872, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.6455836013384011, "grad_norm": 0.2847922444343567, "learning_rate": 3.058764937537271e-06, "loss": 0.010127857327461243, "memory(GiB)": 21.48, "step": 19873, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.6456160868011565, "grad_norm": 0.1847381591796875, "learning_rate": 3.0582699325814447e-06, "loss": 0.008558167144656181, "memory(GiB)": 21.48, "step": 19874, "token_acc": 1.0, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.6456485722639119, "grad_norm": 0.4257666766643524, "learning_rate": 3.0577749500357665e-06, "loss": 0.015127520076930523, "memory(GiB)": 21.48, "step": 19875, "token_acc": 1.0, "train_speed(iter/s)": 0.957597 }, { "epoch": 0.6456810577266673, "grad_norm": 0.3381572961807251, "learning_rate": 3.057279989905952e-06, "loss": 0.018446024507284164, "memory(GiB)": 21.48, "step": 19876, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.6457135431894228, "grad_norm": 0.47184523940086365, "learning_rate": 3.0567850521977103e-06, "loss": 0.018423831090331078, "memory(GiB)": 21.48, "step": 19877, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957611 }, { "epoch": 0.6457460286521781, "grad_norm": 0.765081524848938, "learning_rate": 3.0562901369167565e-06, "loss": 0.019900595769286156, "memory(GiB)": 21.48, "step": 19878, "token_acc": 1.0, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.6457785141149336, "grad_norm": 0.4173842668533325, "learning_rate": 3.055795244068802e-06, "loss": 0.015346027910709381, "memory(GiB)": 21.48, "step": 19879, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.645810999577689, "grad_norm": 0.3007679283618927, "learning_rate": 3.0553003736595576e-06, "loss": 0.012899504043161869, "memory(GiB)": 21.48, "step": 19880, "token_acc": 1.0, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.6458434850404444, "grad_norm": 0.5406575798988342, "learning_rate": 3.0548055256947364e-06, "loss": 0.019949518144130707, "memory(GiB)": 21.48, "step": 19881, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957639 }, { "epoch": 0.6458759705031998, "grad_norm": 0.3349762260913849, "learning_rate": 3.054310700180047e-06, "loss": 0.01370326429605484, "memory(GiB)": 21.48, "step": 19882, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957646 }, { "epoch": 0.6459084559659553, "grad_norm": 0.3825540840625763, "learning_rate": 3.053815897121203e-06, "loss": 0.0231696255505085, "memory(GiB)": 21.48, "step": 19883, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957652 }, { "epoch": 0.6459409414287106, "grad_norm": 0.24883511662483215, "learning_rate": 3.0533211165239127e-06, "loss": 0.013493175618350506, "memory(GiB)": 21.48, "step": 19884, "token_acc": 1.0, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.6459734268914661, "grad_norm": 0.3701396584510803, "learning_rate": 3.05282635839389e-06, "loss": 0.01747826673090458, "memory(GiB)": 21.48, "step": 19885, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.6460059123542214, "grad_norm": 0.3226032555103302, "learning_rate": 3.052331622736839e-06, "loss": 0.0201365165412426, "memory(GiB)": 21.48, "step": 19886, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.6460383978169769, "grad_norm": 0.47437992691993713, "learning_rate": 3.0518369095584777e-06, "loss": 0.02109703980386257, "memory(GiB)": 21.48, "step": 19887, "token_acc": 1.0, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.6460708832797323, "grad_norm": 0.25751566886901855, "learning_rate": 3.051342218864508e-06, "loss": 0.013069690205156803, "memory(GiB)": 21.48, "step": 19888, "token_acc": 1.0, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.6461033687424877, "grad_norm": 0.3009682893753052, "learning_rate": 3.0508475506606455e-06, "loss": 0.011402600444853306, "memory(GiB)": 21.48, "step": 19889, "token_acc": 1.0, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.6461358542052431, "grad_norm": 0.37628334760665894, "learning_rate": 3.0503529049525947e-06, "loss": 0.018572870641946793, "memory(GiB)": 21.48, "step": 19890, "token_acc": 1.0, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.6461683396679986, "grad_norm": 0.36167222261428833, "learning_rate": 3.0498582817460674e-06, "loss": 0.01666106842458248, "memory(GiB)": 21.48, "step": 19891, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957705 }, { "epoch": 0.6462008251307539, "grad_norm": 0.9680859446525574, "learning_rate": 3.0493636810467705e-06, "loss": 0.016470184549689293, "memory(GiB)": 21.48, "step": 19892, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.6462333105935094, "grad_norm": 0.4724836051464081, "learning_rate": 3.048869102860411e-06, "loss": 0.023136988282203674, "memory(GiB)": 21.48, "step": 19893, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.6462657960562648, "grad_norm": 0.37897029519081116, "learning_rate": 3.048374547192703e-06, "loss": 0.02104431763291359, "memory(GiB)": 21.48, "step": 19894, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.6462982815190202, "grad_norm": 0.386849582195282, "learning_rate": 3.047880014049348e-06, "loss": 0.018402770161628723, "memory(GiB)": 21.48, "step": 19895, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.6463307669817756, "grad_norm": 0.3681392967700958, "learning_rate": 3.0473855034360567e-06, "loss": 0.01809862069785595, "memory(GiB)": 21.48, "step": 19896, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.6463632524445311, "grad_norm": 0.25002729892730713, "learning_rate": 3.046891015358535e-06, "loss": 0.011404838413000107, "memory(GiB)": 21.48, "step": 19897, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.6463957379072864, "grad_norm": 0.40743395686149597, "learning_rate": 3.0463965498224913e-06, "loss": 0.022020529955625534, "memory(GiB)": 21.48, "step": 19898, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.6464282233700419, "grad_norm": 0.3868285119533539, "learning_rate": 3.0459021068336305e-06, "loss": 0.024086052551865578, "memory(GiB)": 21.48, "step": 19899, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.6464607088327973, "grad_norm": 0.41021960973739624, "learning_rate": 3.045407686397662e-06, "loss": 0.012602636590600014, "memory(GiB)": 21.48, "step": 19900, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957782 }, { "epoch": 0.6464931942955527, "grad_norm": 0.40265148878097534, "learning_rate": 3.0449132885202885e-06, "loss": 0.018466955050826073, "memory(GiB)": 21.48, "step": 19901, "token_acc": 1.0, "train_speed(iter/s)": 0.957792 }, { "epoch": 0.6465256797583081, "grad_norm": 0.3918878436088562, "learning_rate": 3.0444189132072193e-06, "loss": 0.013069799169898033, "memory(GiB)": 21.48, "step": 19902, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.6465581652210636, "grad_norm": 0.3454248011112213, "learning_rate": 3.0439245604641575e-06, "loss": 0.016075169667601585, "memory(GiB)": 21.48, "step": 19903, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.646590650683819, "grad_norm": 0.48987728357315063, "learning_rate": 3.043430230296811e-06, "loss": 0.018437542021274567, "memory(GiB)": 21.48, "step": 19904, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957822 }, { "epoch": 0.6466231361465744, "grad_norm": 0.8259961009025574, "learning_rate": 3.0429359227108828e-06, "loss": 0.020602110773324966, "memory(GiB)": 21.48, "step": 19905, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.6466556216093299, "grad_norm": 0.34339481592178345, "learning_rate": 3.0424416377120803e-06, "loss": 0.014225040562450886, "memory(GiB)": 21.48, "step": 19906, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.957841 }, { "epoch": 0.6466881070720852, "grad_norm": 0.30597519874572754, "learning_rate": 3.0419473753061046e-06, "loss": 0.0141125014051795, "memory(GiB)": 21.48, "step": 19907, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.6467205925348407, "grad_norm": 0.303911417722702, "learning_rate": 3.041453135498663e-06, "loss": 0.01077249925583601, "memory(GiB)": 21.48, "step": 19908, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.6467530779975961, "grad_norm": 0.6300748586654663, "learning_rate": 3.040958918295461e-06, "loss": 0.019462745636701584, "memory(GiB)": 21.48, "step": 19909, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957871 }, { "epoch": 0.6467855634603515, "grad_norm": 0.31365859508514404, "learning_rate": 3.0404647237021983e-06, "loss": 0.0215504989027977, "memory(GiB)": 21.48, "step": 19910, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.6468180489231069, "grad_norm": 0.25721222162246704, "learning_rate": 3.0399705517245826e-06, "loss": 0.009747141040861607, "memory(GiB)": 21.48, "step": 19911, "token_acc": 1.0, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.6468505343858624, "grad_norm": 0.44688114523887634, "learning_rate": 3.0394764023683137e-06, "loss": 0.02489526756107807, "memory(GiB)": 21.48, "step": 19912, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.6468830198486177, "grad_norm": 0.38058438897132874, "learning_rate": 3.0389822756390993e-06, "loss": 0.021463219076395035, "memory(GiB)": 21.48, "step": 19913, "token_acc": 1.0, "train_speed(iter/s)": 0.957912 }, { "epoch": 0.6469155053113732, "grad_norm": 0.34197962284088135, "learning_rate": 3.038488171542636e-06, "loss": 0.021678775548934937, "memory(GiB)": 21.48, "step": 19914, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.6469479907741286, "grad_norm": 0.3491840958595276, "learning_rate": 3.0379940900846333e-06, "loss": 0.016747338697314262, "memory(GiB)": 21.48, "step": 19915, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957933 }, { "epoch": 0.646980476236884, "grad_norm": 0.41577720642089844, "learning_rate": 3.037500031270787e-06, "loss": 0.013787400908768177, "memory(GiB)": 21.48, "step": 19916, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.6470129616996394, "grad_norm": 0.3781220614910126, "learning_rate": 3.037005995106806e-06, "loss": 0.014656656421720982, "memory(GiB)": 21.48, "step": 19917, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95795 }, { "epoch": 0.6470454471623949, "grad_norm": 0.29386818408966064, "learning_rate": 3.036511981598386e-06, "loss": 0.010575151070952415, "memory(GiB)": 21.48, "step": 19918, "token_acc": 1.0, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.6470779326251502, "grad_norm": 0.30109885334968567, "learning_rate": 3.0360179907512323e-06, "loss": 0.015329468064010143, "memory(GiB)": 21.48, "step": 19919, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95797 }, { "epoch": 0.6471104180879057, "grad_norm": 0.295320063829422, "learning_rate": 3.0355240225710443e-06, "loss": 0.011850578710436821, "memory(GiB)": 21.48, "step": 19920, "token_acc": 1.0, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.6471429035506611, "grad_norm": 0.4934823215007782, "learning_rate": 3.0350300770635244e-06, "loss": 0.027539070695638657, "memory(GiB)": 21.48, "step": 19921, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.95799 }, { "epoch": 0.6471753890134165, "grad_norm": 0.3021596670150757, "learning_rate": 3.034536154234372e-06, "loss": 0.011045404709875584, "memory(GiB)": 21.48, "step": 19922, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.958001 }, { "epoch": 0.6472078744761719, "grad_norm": 0.3460709750652313, "learning_rate": 3.034042254089289e-06, "loss": 0.015285829082131386, "memory(GiB)": 21.48, "step": 19923, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.6472403599389274, "grad_norm": 0.35564038157463074, "learning_rate": 3.0335483766339745e-06, "loss": 0.0151203703135252, "memory(GiB)": 21.48, "step": 19924, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.6472728454016827, "grad_norm": 0.38892215490341187, "learning_rate": 3.033054521874129e-06, "loss": 0.020106937736272812, "memory(GiB)": 21.48, "step": 19925, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.958031 }, { "epoch": 0.6473053308644382, "grad_norm": 0.46829020977020264, "learning_rate": 3.032560689815453e-06, "loss": 0.02047932706773281, "memory(GiB)": 21.48, "step": 19926, "token_acc": 0.98, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.6473378163271936, "grad_norm": 0.2852274179458618, "learning_rate": 3.0320668804636445e-06, "loss": 0.018124382942914963, "memory(GiB)": 21.48, "step": 19927, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.647370301789949, "grad_norm": 0.776432454586029, "learning_rate": 3.031573093824405e-06, "loss": 0.025881968438625336, "memory(GiB)": 21.48, "step": 19928, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958061 }, { "epoch": 0.6474027872527044, "grad_norm": 0.2695057690143585, "learning_rate": 3.03107932990343e-06, "loss": 0.014685245230793953, "memory(GiB)": 21.48, "step": 19929, "token_acc": 1.0, "train_speed(iter/s)": 0.958072 }, { "epoch": 0.6474352727154599, "grad_norm": 0.403343141078949, "learning_rate": 3.0305855887064217e-06, "loss": 0.014291031286120415, "memory(GiB)": 21.48, "step": 19930, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.6474677581782152, "grad_norm": 0.36401283740997314, "learning_rate": 3.0300918702390763e-06, "loss": 0.017725002020597458, "memory(GiB)": 21.48, "step": 19931, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.6475002436409707, "grad_norm": 0.3327259421348572, "learning_rate": 3.0295981745070933e-06, "loss": 0.015093171037733555, "memory(GiB)": 21.48, "step": 19932, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.6475327291037261, "grad_norm": 0.35097041726112366, "learning_rate": 3.0291045015161692e-06, "loss": 0.013921255245804787, "memory(GiB)": 21.48, "step": 19933, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.6475652145664815, "grad_norm": 0.31346622109413147, "learning_rate": 3.0286108512720032e-06, "loss": 0.016161441802978516, "memory(GiB)": 21.48, "step": 19934, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.6475977000292369, "grad_norm": 0.4177229702472687, "learning_rate": 3.0281172237802907e-06, "loss": 0.017877629026770592, "memory(GiB)": 21.48, "step": 19935, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95813 }, { "epoch": 0.6476301854919924, "grad_norm": 0.21883174777030945, "learning_rate": 3.0276236190467323e-06, "loss": 0.009731702506542206, "memory(GiB)": 21.48, "step": 19936, "token_acc": 1.0, "train_speed(iter/s)": 0.958137 }, { "epoch": 0.6476626709547477, "grad_norm": 0.3355138897895813, "learning_rate": 3.0271300370770197e-06, "loss": 0.009365483187139034, "memory(GiB)": 21.48, "step": 19937, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.6476951564175032, "grad_norm": 0.4604863226413727, "learning_rate": 3.0266364778768553e-06, "loss": 0.017815005034208298, "memory(GiB)": 21.48, "step": 19938, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.6477276418802586, "grad_norm": 0.20100028812885284, "learning_rate": 3.026142941451929e-06, "loss": 0.009633034467697144, "memory(GiB)": 21.48, "step": 19939, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.647760127343014, "grad_norm": 0.37009483575820923, "learning_rate": 3.0256494278079417e-06, "loss": 0.01798105053603649, "memory(GiB)": 21.48, "step": 19940, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.6477926128057694, "grad_norm": 0.5925417542457581, "learning_rate": 3.02515593695059e-06, "loss": 0.013714522123336792, "memory(GiB)": 21.48, "step": 19941, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.95817 }, { "epoch": 0.6478250982685249, "grad_norm": 0.6987433433532715, "learning_rate": 3.0246624688855643e-06, "loss": 0.00890594907104969, "memory(GiB)": 21.48, "step": 19942, "token_acc": 1.0, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.6478575837312802, "grad_norm": 0.40119001269340515, "learning_rate": 3.024169023618565e-06, "loss": 0.018572499975562096, "memory(GiB)": 21.48, "step": 19943, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.6478900691940357, "grad_norm": 0.36969977617263794, "learning_rate": 3.023675601155283e-06, "loss": 0.01784404180943966, "memory(GiB)": 21.48, "step": 19944, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95819 }, { "epoch": 0.647922554656791, "grad_norm": 0.31475692987442017, "learning_rate": 3.023182201501418e-06, "loss": 0.01401340402662754, "memory(GiB)": 21.48, "step": 19945, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.6479550401195465, "grad_norm": 0.37246212363243103, "learning_rate": 3.022688824662659e-06, "loss": 0.019316114485263824, "memory(GiB)": 21.48, "step": 19946, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958203 }, { "epoch": 0.6479875255823019, "grad_norm": 0.34122925996780396, "learning_rate": 3.0221954706447042e-06, "loss": 0.016149017959833145, "memory(GiB)": 21.48, "step": 19947, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.6480200110450574, "grad_norm": 0.29451173543930054, "learning_rate": 3.021702139453245e-06, "loss": 0.013302153907716274, "memory(GiB)": 21.48, "step": 19948, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.6480524965078127, "grad_norm": 0.3909754455089569, "learning_rate": 3.021208831093977e-06, "loss": 0.022059595212340355, "memory(GiB)": 21.48, "step": 19949, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.958221 }, { "epoch": 0.6480849819705682, "grad_norm": 0.3404049873352051, "learning_rate": 3.0207155455725923e-06, "loss": 0.01711808145046234, "memory(GiB)": 21.48, "step": 19950, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.6481174674333235, "grad_norm": 0.3838711977005005, "learning_rate": 3.0202222828947853e-06, "loss": 0.021612679585814476, "memory(GiB)": 21.48, "step": 19951, "token_acc": 1.0, "train_speed(iter/s)": 0.958234 }, { "epoch": 0.648149952896079, "grad_norm": 0.2883073091506958, "learning_rate": 3.0197290430662474e-06, "loss": 0.012240199372172356, "memory(GiB)": 21.48, "step": 19952, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.6481824383588344, "grad_norm": 0.2574262022972107, "learning_rate": 3.0192358260926726e-06, "loss": 0.011086409911513329, "memory(GiB)": 21.48, "step": 19953, "token_acc": 1.0, "train_speed(iter/s)": 0.958247 }, { "epoch": 0.6482149238215899, "grad_norm": 0.49804508686065674, "learning_rate": 3.0187426319797517e-06, "loss": 0.017842456698417664, "memory(GiB)": 21.48, "step": 19954, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.6482474092843452, "grad_norm": 0.36878374218940735, "learning_rate": 3.018249460733179e-06, "loss": 0.019068729132413864, "memory(GiB)": 21.48, "step": 19955, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95826 }, { "epoch": 0.6482798947471007, "grad_norm": 0.3419894874095917, "learning_rate": 3.0177563123586452e-06, "loss": 0.014688748866319656, "memory(GiB)": 21.48, "step": 19956, "token_acc": 1.0, "train_speed(iter/s)": 0.958268 }, { "epoch": 0.648312380209856, "grad_norm": 0.4665583670139313, "learning_rate": 3.017263186861841e-06, "loss": 0.021422134712338448, "memory(GiB)": 21.48, "step": 19957, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958276 }, { "epoch": 0.6483448656726115, "grad_norm": 0.31751325726509094, "learning_rate": 3.0167700842484606e-06, "loss": 0.012504065409302711, "memory(GiB)": 21.48, "step": 19958, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958284 }, { "epoch": 0.6483773511353669, "grad_norm": 0.2956107556819916, "learning_rate": 3.016277004524191e-06, "loss": 0.01464561652392149, "memory(GiB)": 21.48, "step": 19959, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.958294 }, { "epoch": 0.6484098365981223, "grad_norm": 0.404096394777298, "learning_rate": 3.0157839476947266e-06, "loss": 0.019352175295352936, "memory(GiB)": 21.48, "step": 19960, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.6484423220608777, "grad_norm": 0.3084684908390045, "learning_rate": 3.0152909137657548e-06, "loss": 0.011606080457568169, "memory(GiB)": 21.48, "step": 19961, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.958315 }, { "epoch": 0.6484748075236332, "grad_norm": 0.35647788643836975, "learning_rate": 3.0147979027429696e-06, "loss": 0.013436902314424515, "memory(GiB)": 21.48, "step": 19962, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958325 }, { "epoch": 0.6485072929863885, "grad_norm": 0.34378084540367126, "learning_rate": 3.0143049146320567e-06, "loss": 0.01658688858151436, "memory(GiB)": 21.48, "step": 19963, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.648539778449144, "grad_norm": 0.4685920178890228, "learning_rate": 3.013811949438711e-06, "loss": 0.018039217218756676, "memory(GiB)": 21.48, "step": 19964, "token_acc": 0.995, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.6485722639118994, "grad_norm": 0.42001253366470337, "learning_rate": 3.013319007168616e-06, "loss": 0.01619243621826172, "memory(GiB)": 21.48, "step": 19965, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.958356 }, { "epoch": 0.6486047493746548, "grad_norm": 0.5310123562812805, "learning_rate": 3.012826087827466e-06, "loss": 0.018306612968444824, "memory(GiB)": 21.48, "step": 19966, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958365 }, { "epoch": 0.6486372348374102, "grad_norm": 0.38577866554260254, "learning_rate": 3.012333191420946e-06, "loss": 0.013971253298223019, "memory(GiB)": 21.48, "step": 19967, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.958375 }, { "epoch": 0.6486697203001657, "grad_norm": 0.684033989906311, "learning_rate": 3.011840317954749e-06, "loss": 0.0316140279173851, "memory(GiB)": 21.48, "step": 19968, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.958385 }, { "epoch": 0.6487022057629211, "grad_norm": 0.5713546276092529, "learning_rate": 3.011347467434559e-06, "loss": 0.023734871298074722, "memory(GiB)": 21.48, "step": 19969, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.6487346912256765, "grad_norm": 0.3059503734111786, "learning_rate": 3.0108546398660647e-06, "loss": 0.016637947410345078, "memory(GiB)": 21.48, "step": 19970, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.648767176688432, "grad_norm": 0.3032495379447937, "learning_rate": 3.010361835254959e-06, "loss": 0.017063289880752563, "memory(GiB)": 21.48, "step": 19971, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958416 }, { "epoch": 0.6487996621511873, "grad_norm": 0.36826902627944946, "learning_rate": 3.009869053606923e-06, "loss": 0.020423702895641327, "memory(GiB)": 21.48, "step": 19972, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958425 }, { "epoch": 0.6488321476139428, "grad_norm": 0.6546205878257751, "learning_rate": 3.0093762949276494e-06, "loss": 0.01773699000477791, "memory(GiB)": 21.48, "step": 19973, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958436 }, { "epoch": 0.6488646330766982, "grad_norm": 0.36243563890457153, "learning_rate": 3.0088835592228204e-06, "loss": 0.02093033865094185, "memory(GiB)": 21.48, "step": 19974, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.958446 }, { "epoch": 0.6488971185394536, "grad_norm": 0.7122193574905396, "learning_rate": 3.008390846498128e-06, "loss": 0.03292841091752052, "memory(GiB)": 21.48, "step": 19975, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.958456 }, { "epoch": 0.648929604002209, "grad_norm": 0.3028494119644165, "learning_rate": 3.007898156759254e-06, "loss": 0.013625399209558964, "memory(GiB)": 21.48, "step": 19976, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958466 }, { "epoch": 0.6489620894649645, "grad_norm": 0.3008747696876526, "learning_rate": 3.0074054900118877e-06, "loss": 0.013699017465114594, "memory(GiB)": 21.48, "step": 19977, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958476 }, { "epoch": 0.6489945749277198, "grad_norm": 0.24582426249980927, "learning_rate": 3.0069128462617135e-06, "loss": 0.009461136534810066, "memory(GiB)": 21.48, "step": 19978, "token_acc": 1.0, "train_speed(iter/s)": 0.958486 }, { "epoch": 0.6490270603904753, "grad_norm": 0.34562963247299194, "learning_rate": 3.006420225514419e-06, "loss": 0.02121451124548912, "memory(GiB)": 21.48, "step": 19979, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.958496 }, { "epoch": 0.6490595458532307, "grad_norm": 0.3128911554813385, "learning_rate": 3.0059276277756868e-06, "loss": 0.017813686281442642, "memory(GiB)": 21.48, "step": 19980, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958507 }, { "epoch": 0.6490920313159861, "grad_norm": 0.31305748224258423, "learning_rate": 3.0054350530512057e-06, "loss": 0.017147552222013474, "memory(GiB)": 21.48, "step": 19981, "token_acc": 0.9900332225913622, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.6491245167787415, "grad_norm": 0.3502157926559448, "learning_rate": 3.0049425013466573e-06, "loss": 0.017905402928590775, "memory(GiB)": 21.48, "step": 19982, "token_acc": 0.9805825242718447, "train_speed(iter/s)": 0.958526 }, { "epoch": 0.649157002241497, "grad_norm": 0.3410698175430298, "learning_rate": 3.0044499726677288e-06, "loss": 0.017011087387800217, "memory(GiB)": 21.48, "step": 19983, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.958537 }, { "epoch": 0.6491894877042523, "grad_norm": 0.4374711811542511, "learning_rate": 3.0039574670201034e-06, "loss": 0.0155624495819211, "memory(GiB)": 21.48, "step": 19984, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958546 }, { "epoch": 0.6492219731670078, "grad_norm": 0.2607925534248352, "learning_rate": 3.0034649844094648e-06, "loss": 0.012171913869678974, "memory(GiB)": 21.48, "step": 19985, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958557 }, { "epoch": 0.6492544586297632, "grad_norm": 0.44875597953796387, "learning_rate": 3.0029725248414986e-06, "loss": 0.015796374529600143, "memory(GiB)": 21.48, "step": 19986, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.958566 }, { "epoch": 0.6492869440925186, "grad_norm": 0.2708147466182709, "learning_rate": 3.0024800883218876e-06, "loss": 0.014590902253985405, "memory(GiB)": 21.48, "step": 19987, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958574 }, { "epoch": 0.649319429555274, "grad_norm": 0.280708909034729, "learning_rate": 3.001987674856315e-06, "loss": 0.008631651289761066, "memory(GiB)": 21.48, "step": 19988, "token_acc": 1.0, "train_speed(iter/s)": 0.958585 }, { "epoch": 0.6493519150180295, "grad_norm": 0.4129965007305145, "learning_rate": 3.001495284450463e-06, "loss": 0.024371430277824402, "memory(GiB)": 21.48, "step": 19989, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958595 }, { "epoch": 0.6493844004807848, "grad_norm": 0.36050981283187866, "learning_rate": 3.001002917110017e-06, "loss": 0.014313019812107086, "memory(GiB)": 21.48, "step": 19990, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958605 }, { "epoch": 0.6494168859435403, "grad_norm": 1.2571231126785278, "learning_rate": 3.000510572840657e-06, "loss": 0.02518545091152191, "memory(GiB)": 21.48, "step": 19991, "token_acc": 1.0, "train_speed(iter/s)": 0.958615 }, { "epoch": 0.6494493714062957, "grad_norm": 0.39256370067596436, "learning_rate": 3.000018251648067e-06, "loss": 0.018636060878634453, "memory(GiB)": 21.48, "step": 19992, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958625 }, { "epoch": 0.6494818568690511, "grad_norm": 0.37568894028663635, "learning_rate": 2.999525953537927e-06, "loss": 0.017222357913851738, "memory(GiB)": 21.48, "step": 19993, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.958635 }, { "epoch": 0.6495143423318065, "grad_norm": 0.3349183201789856, "learning_rate": 2.9990336785159222e-06, "loss": 0.016160886734724045, "memory(GiB)": 21.48, "step": 19994, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958645 }, { "epoch": 0.649546827794562, "grad_norm": 0.321101576089859, "learning_rate": 2.9985414265877294e-06, "loss": 0.015713073313236237, "memory(GiB)": 21.48, "step": 19995, "token_acc": 1.0, "train_speed(iter/s)": 0.958656 }, { "epoch": 0.6495793132573173, "grad_norm": 0.27518486976623535, "learning_rate": 2.9980491977590355e-06, "loss": 0.01792953908443451, "memory(GiB)": 21.48, "step": 19996, "token_acc": 1.0, "train_speed(iter/s)": 0.958664 }, { "epoch": 0.6496117987200728, "grad_norm": 0.2667578160762787, "learning_rate": 2.9975569920355154e-06, "loss": 0.016853127628564835, "memory(GiB)": 21.48, "step": 19997, "token_acc": 1.0, "train_speed(iter/s)": 0.958671 }, { "epoch": 0.6496442841828282, "grad_norm": 0.3047276735305786, "learning_rate": 2.997064809422856e-06, "loss": 0.01884046569466591, "memory(GiB)": 21.48, "step": 19998, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.958678 }, { "epoch": 0.6496767696455836, "grad_norm": 0.31661850214004517, "learning_rate": 2.9965726499267324e-06, "loss": 0.017531532794237137, "memory(GiB)": 21.48, "step": 19999, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.958685 }, { "epoch": 0.649709255108339, "grad_norm": 0.4554053246974945, "learning_rate": 2.9960805135528274e-06, "loss": 0.016470741480588913, "memory(GiB)": 21.48, "step": 20000, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958692 }, { "epoch": 0.649709255108339, "eval_loss": 0.016894664615392685, "eval_runtime": 80.7495, "eval_samples_per_second": 123.221, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9933003499125648, "step": 20000 }, { "epoch": 0.6497417405710945, "grad_norm": 0.34149813652038574, "learning_rate": 2.9955884003068203e-06, "loss": 0.01259641908109188, "memory(GiB)": 21.48, "step": 20001, "token_acc": 0.9930146555266401, "train_speed(iter/s)": 0.954463 }, { "epoch": 0.6497742260338498, "grad_norm": 0.38435477018356323, "learning_rate": 2.9950963101943897e-06, "loss": 0.017931003123521805, "memory(GiB)": 21.48, "step": 20002, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954458 }, { "epoch": 0.6498067114966053, "grad_norm": 0.5019903779029846, "learning_rate": 2.994604243221219e-06, "loss": 0.021641619503498077, "memory(GiB)": 21.48, "step": 20003, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.954464 }, { "epoch": 0.6498391969593607, "grad_norm": 0.342802494764328, "learning_rate": 2.9941121993929828e-06, "loss": 0.015834098681807518, "memory(GiB)": 21.48, "step": 20004, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.954471 }, { "epoch": 0.6498716824221161, "grad_norm": 0.22641228139400482, "learning_rate": 2.993620178715363e-06, "loss": 0.011632042936980724, "memory(GiB)": 21.48, "step": 20005, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.954479 }, { "epoch": 0.6499041678848715, "grad_norm": 0.479498952627182, "learning_rate": 2.9931281811940347e-06, "loss": 0.014400623738765717, "memory(GiB)": 21.48, "step": 20006, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954487 }, { "epoch": 0.649936653347627, "grad_norm": 0.3567809462547302, "learning_rate": 2.9926362068346804e-06, "loss": 0.016224723309278488, "memory(GiB)": 21.48, "step": 20007, "token_acc": 1.0, "train_speed(iter/s)": 0.954495 }, { "epoch": 0.6499691388103823, "grad_norm": 0.5720611214637756, "learning_rate": 2.992144255642975e-06, "loss": 0.02646647021174431, "memory(GiB)": 21.48, "step": 20008, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954502 }, { "epoch": 0.6500016242731378, "grad_norm": 0.6204814910888672, "learning_rate": 2.9916523276245975e-06, "loss": 0.024209273979067802, "memory(GiB)": 21.48, "step": 20009, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95451 }, { "epoch": 0.6500341097358932, "grad_norm": 0.32103249430656433, "learning_rate": 2.991160422785224e-06, "loss": 0.011462515220046043, "memory(GiB)": 21.48, "step": 20010, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954518 }, { "epoch": 0.6500665951986486, "grad_norm": 0.34215256571769714, "learning_rate": 2.990668541130535e-06, "loss": 0.017197493463754654, "memory(GiB)": 21.48, "step": 20011, "token_acc": 0.9936305732484076, "train_speed(iter/s)": 0.954525 }, { "epoch": 0.650099080661404, "grad_norm": 0.23470798134803772, "learning_rate": 2.990176682666205e-06, "loss": 0.013046067208051682, "memory(GiB)": 21.48, "step": 20012, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954533 }, { "epoch": 0.6501315661241595, "grad_norm": 0.26750436425209045, "learning_rate": 2.9896848473979112e-06, "loss": 0.015406322665512562, "memory(GiB)": 21.48, "step": 20013, "token_acc": 1.0, "train_speed(iter/s)": 0.95454 }, { "epoch": 0.6501640515869148, "grad_norm": 0.4088185727596283, "learning_rate": 2.9891930353313287e-06, "loss": 0.014720532111823559, "memory(GiB)": 21.48, "step": 20014, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.954549 }, { "epoch": 0.6501965370496703, "grad_norm": 0.38523203134536743, "learning_rate": 2.9887012464721365e-06, "loss": 0.018305573612451553, "memory(GiB)": 21.48, "step": 20015, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.954558 }, { "epoch": 0.6502290225124256, "grad_norm": 0.38668110966682434, "learning_rate": 2.9882094808260076e-06, "loss": 0.018679603934288025, "memory(GiB)": 21.48, "step": 20016, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.954568 }, { "epoch": 0.6502615079751811, "grad_norm": 0.5095770359039307, "learning_rate": 2.9877177383986188e-06, "loss": 0.015253040939569473, "memory(GiB)": 21.48, "step": 20017, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954579 }, { "epoch": 0.6502939934379365, "grad_norm": 0.43497395515441895, "learning_rate": 2.9872260191956477e-06, "loss": 0.01935194618999958, "memory(GiB)": 21.48, "step": 20018, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954588 }, { "epoch": 0.650326478900692, "grad_norm": 0.2999103367328644, "learning_rate": 2.9867343232227652e-06, "loss": 0.015021687373518944, "memory(GiB)": 21.48, "step": 20019, "token_acc": 1.0, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.6503589643634473, "grad_norm": 0.2943733036518097, "learning_rate": 2.98624265048565e-06, "loss": 0.012567188590765, "memory(GiB)": 21.48, "step": 20020, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954608 }, { "epoch": 0.6503914498262028, "grad_norm": 0.35617226362228394, "learning_rate": 2.9857510009899736e-06, "loss": 0.012980177998542786, "memory(GiB)": 21.48, "step": 20021, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.6504239352889581, "grad_norm": 0.3399268686771393, "learning_rate": 2.9852593747414137e-06, "loss": 0.012516949325799942, "memory(GiB)": 21.48, "step": 20022, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.954629 }, { "epoch": 0.6504564207517136, "grad_norm": 0.38199833035469055, "learning_rate": 2.984767771745639e-06, "loss": 0.014716248959302902, "memory(GiB)": 21.48, "step": 20023, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.95464 }, { "epoch": 0.650488906214469, "grad_norm": 0.28926923871040344, "learning_rate": 2.9842761920083297e-06, "loss": 0.016251539811491966, "memory(GiB)": 21.48, "step": 20024, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95465 }, { "epoch": 0.6505213916772244, "grad_norm": 0.49972108006477356, "learning_rate": 2.983784635535153e-06, "loss": 0.021240703761577606, "memory(GiB)": 21.48, "step": 20025, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.954661 }, { "epoch": 0.6505538771399798, "grad_norm": 0.37376853823661804, "learning_rate": 2.983293102331788e-06, "loss": 0.02126123569905758, "memory(GiB)": 21.48, "step": 20026, "token_acc": 1.0, "train_speed(iter/s)": 0.954672 }, { "epoch": 0.6505863626027353, "grad_norm": 0.3882276713848114, "learning_rate": 2.9828015924039033e-06, "loss": 0.012504234910011292, "memory(GiB)": 21.48, "step": 20027, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954682 }, { "epoch": 0.6506188480654906, "grad_norm": 0.5111439824104309, "learning_rate": 2.9823101057571737e-06, "loss": 0.024797679856419563, "memory(GiB)": 21.48, "step": 20028, "token_acc": 0.9763313609467456, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.6506513335282461, "grad_norm": 0.20993156731128693, "learning_rate": 2.9818186423972708e-06, "loss": 0.010508885607123375, "memory(GiB)": 21.48, "step": 20029, "token_acc": 1.0, "train_speed(iter/s)": 0.954704 }, { "epoch": 0.6506838189910015, "grad_norm": 0.40619173645973206, "learning_rate": 2.9813272023298668e-06, "loss": 0.02262941375374794, "memory(GiB)": 21.48, "step": 20030, "token_acc": 0.995, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.6507163044537569, "grad_norm": 0.4185559153556824, "learning_rate": 2.980835785560633e-06, "loss": 0.016641803085803986, "memory(GiB)": 21.48, "step": 20031, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954725 }, { "epoch": 0.6507487899165124, "grad_norm": 0.407064825296402, "learning_rate": 2.9803443920952423e-06, "loss": 0.016293980181217194, "memory(GiB)": 21.48, "step": 20032, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954736 }, { "epoch": 0.6507812753792678, "grad_norm": 0.34782806038856506, "learning_rate": 2.979853021939366e-06, "loss": 0.014027822762727737, "memory(GiB)": 21.48, "step": 20033, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954746 }, { "epoch": 0.6508137608420232, "grad_norm": 0.2619953155517578, "learning_rate": 2.979361675098674e-06, "loss": 0.011071587912738323, "memory(GiB)": 21.48, "step": 20034, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.954758 }, { "epoch": 0.6508462463047786, "grad_norm": 0.3039558231830597, "learning_rate": 2.9788703515788387e-06, "loss": 0.013336509466171265, "memory(GiB)": 21.48, "step": 20035, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.6508787317675341, "grad_norm": 0.34056201577186584, "learning_rate": 2.9783790513855284e-06, "loss": 0.01767943985760212, "memory(GiB)": 21.48, "step": 20036, "token_acc": 1.0, "train_speed(iter/s)": 0.954779 }, { "epoch": 0.6509112172302894, "grad_norm": 0.310145765542984, "learning_rate": 2.9778877745244157e-06, "loss": 0.01363900676369667, "memory(GiB)": 21.48, "step": 20037, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.954789 }, { "epoch": 0.6509437026930449, "grad_norm": 0.4230009913444519, "learning_rate": 2.977396521001168e-06, "loss": 0.014708297327160835, "memory(GiB)": 21.48, "step": 20038, "token_acc": 1.0, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.6509761881558003, "grad_norm": 0.276431679725647, "learning_rate": 2.976905290821458e-06, "loss": 0.013205772265791893, "memory(GiB)": 21.48, "step": 20039, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95481 }, { "epoch": 0.6510086736185557, "grad_norm": 0.35300999879837036, "learning_rate": 2.9764140839909527e-06, "loss": 0.01758440211415291, "memory(GiB)": 21.48, "step": 20040, "token_acc": 1.0, "train_speed(iter/s)": 0.95482 }, { "epoch": 0.6510411590813111, "grad_norm": 0.39813941717147827, "learning_rate": 2.9759229005153232e-06, "loss": 0.01444944180548191, "memory(GiB)": 21.48, "step": 20041, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954829 }, { "epoch": 0.6510736445440666, "grad_norm": 0.7976861000061035, "learning_rate": 2.975431740400237e-06, "loss": 0.03072933293879032, "memory(GiB)": 21.48, "step": 20042, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.954838 }, { "epoch": 0.6511061300068219, "grad_norm": 0.4238758981227875, "learning_rate": 2.9749406036513638e-06, "loss": 0.015674427151679993, "memory(GiB)": 21.48, "step": 20043, "token_acc": 1.0, "train_speed(iter/s)": 0.954846 }, { "epoch": 0.6511386154695774, "grad_norm": 0.4503813087940216, "learning_rate": 2.9744494902743705e-06, "loss": 0.02308233268558979, "memory(GiB)": 21.48, "step": 20044, "token_acc": 1.0, "train_speed(iter/s)": 0.954855 }, { "epoch": 0.6511711009323328, "grad_norm": 0.3673046827316284, "learning_rate": 2.973958400274928e-06, "loss": 0.021147828549146652, "memory(GiB)": 21.48, "step": 20045, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.6512035863950882, "grad_norm": 0.3230971693992615, "learning_rate": 2.973467333658699e-06, "loss": 0.0162736177444458, "memory(GiB)": 21.48, "step": 20046, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954872 }, { "epoch": 0.6512360718578436, "grad_norm": 0.23315748572349548, "learning_rate": 2.9729762904313553e-06, "loss": 0.011299018748104572, "memory(GiB)": 21.48, "step": 20047, "token_acc": 1.0, "train_speed(iter/s)": 0.954881 }, { "epoch": 0.6512685573205991, "grad_norm": 0.30676841735839844, "learning_rate": 2.972485270598564e-06, "loss": 0.015721190720796585, "memory(GiB)": 21.48, "step": 20048, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.95489 }, { "epoch": 0.6513010427833544, "grad_norm": 0.35538405179977417, "learning_rate": 2.971994274165991e-06, "loss": 0.014034338295459747, "memory(GiB)": 21.48, "step": 20049, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.6513335282461099, "grad_norm": 0.41275787353515625, "learning_rate": 2.9715033011393046e-06, "loss": 0.01815253123641014, "memory(GiB)": 21.48, "step": 20050, "token_acc": 1.0, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.6513660137088653, "grad_norm": 0.411987841129303, "learning_rate": 2.9710123515241673e-06, "loss": 0.017103999853134155, "memory(GiB)": 21.48, "step": 20051, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.6513984991716207, "grad_norm": 0.35280415415763855, "learning_rate": 2.9705214253262516e-06, "loss": 0.01428045704960823, "memory(GiB)": 21.48, "step": 20052, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954925 }, { "epoch": 0.6514309846343761, "grad_norm": 0.40864551067352295, "learning_rate": 2.970030522551216e-06, "loss": 0.016318311914801598, "memory(GiB)": 21.48, "step": 20053, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.6514634700971316, "grad_norm": 0.3729655146598816, "learning_rate": 2.969539643204733e-06, "loss": 0.018099533393979073, "memory(GiB)": 21.48, "step": 20054, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.6514959555598869, "grad_norm": 0.6631321907043457, "learning_rate": 2.969048787292464e-06, "loss": 0.018580922856926918, "memory(GiB)": 21.48, "step": 20055, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.95495 }, { "epoch": 0.6515284410226424, "grad_norm": 0.45603084564208984, "learning_rate": 2.9685579548200753e-06, "loss": 0.014507371932268143, "memory(GiB)": 21.48, "step": 20056, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.954957 }, { "epoch": 0.6515609264853978, "grad_norm": 0.32437416911125183, "learning_rate": 2.968067145793231e-06, "loss": 0.01404858659952879, "memory(GiB)": 21.48, "step": 20057, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.954964 }, { "epoch": 0.6515934119481532, "grad_norm": 0.36677831411361694, "learning_rate": 2.9675763602175977e-06, "loss": 0.014450335875153542, "memory(GiB)": 21.48, "step": 20058, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954971 }, { "epoch": 0.6516258974109086, "grad_norm": 0.5401422381401062, "learning_rate": 2.967085598098837e-06, "loss": 0.018966738134622574, "memory(GiB)": 21.48, "step": 20059, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954979 }, { "epoch": 0.6516583828736641, "grad_norm": 0.46379977464675903, "learning_rate": 2.966594859442615e-06, "loss": 0.02374739944934845, "memory(GiB)": 21.48, "step": 20060, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954984 }, { "epoch": 0.6516908683364194, "grad_norm": 0.40601176023483276, "learning_rate": 2.9661041442545945e-06, "loss": 0.015667978674173355, "memory(GiB)": 21.48, "step": 20061, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.954991 }, { "epoch": 0.6517233537991749, "grad_norm": 0.34531471133232117, "learning_rate": 2.9656134525404393e-06, "loss": 0.01750374212861061, "memory(GiB)": 21.48, "step": 20062, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.954998 }, { "epoch": 0.6517558392619303, "grad_norm": 0.36310848593711853, "learning_rate": 2.9651227843058126e-06, "loss": 0.016969095915555954, "memory(GiB)": 21.48, "step": 20063, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.6517883247246857, "grad_norm": 0.423600971698761, "learning_rate": 2.964632139556377e-06, "loss": 0.015390681102871895, "memory(GiB)": 21.48, "step": 20064, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955012 }, { "epoch": 0.6518208101874411, "grad_norm": 0.3868527114391327, "learning_rate": 2.964141518297796e-06, "loss": 0.013374898582696915, "memory(GiB)": 21.48, "step": 20065, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.6518532956501966, "grad_norm": 0.365448534488678, "learning_rate": 2.963650920535731e-06, "loss": 0.014327381737530231, "memory(GiB)": 21.48, "step": 20066, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955027 }, { "epoch": 0.6518857811129519, "grad_norm": 0.30519843101501465, "learning_rate": 2.9631603462758462e-06, "loss": 0.013944429345428944, "memory(GiB)": 21.48, "step": 20067, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955033 }, { "epoch": 0.6519182665757074, "grad_norm": 0.34143656492233276, "learning_rate": 2.9626697955238008e-06, "loss": 0.013325847685337067, "memory(GiB)": 21.48, "step": 20068, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955041 }, { "epoch": 0.6519507520384628, "grad_norm": 0.8645742535591125, "learning_rate": 2.962179268285259e-06, "loss": 0.015300266444683075, "memory(GiB)": 21.48, "step": 20069, "token_acc": 1.0, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.6519832375012182, "grad_norm": 0.3570471405982971, "learning_rate": 2.961688764565879e-06, "loss": 0.012965207919478416, "memory(GiB)": 21.48, "step": 20070, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.6520157229639736, "grad_norm": 0.4853278696537018, "learning_rate": 2.961198284371325e-06, "loss": 0.0217851921916008, "memory(GiB)": 21.48, "step": 20071, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.955064 }, { "epoch": 0.6520482084267291, "grad_norm": 0.3681451678276062, "learning_rate": 2.9607078277072556e-06, "loss": 0.009364000521600246, "memory(GiB)": 21.48, "step": 20072, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955071 }, { "epoch": 0.6520806938894844, "grad_norm": 0.3166246712207794, "learning_rate": 2.960217394579334e-06, "loss": 0.014894864521920681, "memory(GiB)": 21.48, "step": 20073, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.6521131793522399, "grad_norm": 0.32197922468185425, "learning_rate": 2.9597269849932154e-06, "loss": 0.01697522960603237, "memory(GiB)": 21.48, "step": 20074, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955087 }, { "epoch": 0.6521456648149953, "grad_norm": 0.3498779237270355, "learning_rate": 2.9592365989545663e-06, "loss": 0.01865716278553009, "memory(GiB)": 21.48, "step": 20075, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955098 }, { "epoch": 0.6521781502777507, "grad_norm": 0.2883438766002655, "learning_rate": 2.95874623646904e-06, "loss": 0.012055909261107445, "memory(GiB)": 21.48, "step": 20076, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955109 }, { "epoch": 0.6522106357405061, "grad_norm": 0.27552682161331177, "learning_rate": 2.9582558975423015e-06, "loss": 0.014462050050497055, "memory(GiB)": 21.48, "step": 20077, "token_acc": 1.0, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.6522431212032616, "grad_norm": 0.4216448664665222, "learning_rate": 2.9577655821800065e-06, "loss": 0.018053047358989716, "memory(GiB)": 21.48, "step": 20078, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955129 }, { "epoch": 0.6522756066660169, "grad_norm": 0.37285518646240234, "learning_rate": 2.957275290387813e-06, "loss": 0.02343958057463169, "memory(GiB)": 21.48, "step": 20079, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.955138 }, { "epoch": 0.6523080921287724, "grad_norm": 0.3429754674434662, "learning_rate": 2.956785022171385e-06, "loss": 0.012921301648020744, "memory(GiB)": 21.48, "step": 20080, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955148 }, { "epoch": 0.6523405775915277, "grad_norm": 0.22816479206085205, "learning_rate": 2.9562947775363747e-06, "loss": 0.013256353326141834, "memory(GiB)": 21.48, "step": 20081, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955159 }, { "epoch": 0.6523730630542832, "grad_norm": 0.3090788424015045, "learning_rate": 2.955804556488445e-06, "loss": 0.014636959880590439, "memory(GiB)": 21.48, "step": 20082, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.6524055485170386, "grad_norm": 0.276155024766922, "learning_rate": 2.95531435903325e-06, "loss": 0.013896021991968155, "memory(GiB)": 21.48, "step": 20083, "token_acc": 1.0, "train_speed(iter/s)": 0.955178 }, { "epoch": 0.652438033979794, "grad_norm": 0.3557285666465759, "learning_rate": 2.9548241851764497e-06, "loss": 0.014661330729722977, "memory(GiB)": 21.48, "step": 20084, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.6524705194425494, "grad_norm": 0.5618330836296082, "learning_rate": 2.9543340349236994e-06, "loss": 0.022887663915753365, "memory(GiB)": 21.48, "step": 20085, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.6525030049053049, "grad_norm": 0.3659047484397888, "learning_rate": 2.953843908280658e-06, "loss": 0.014203470200300217, "memory(GiB)": 21.48, "step": 20086, "token_acc": 0.992, "train_speed(iter/s)": 0.955208 }, { "epoch": 0.6525354903680602, "grad_norm": 0.45394596457481384, "learning_rate": 2.9533538052529808e-06, "loss": 0.018017107620835304, "memory(GiB)": 21.48, "step": 20087, "token_acc": 0.9792746113989638, "train_speed(iter/s)": 0.955218 }, { "epoch": 0.6525679758308157, "grad_norm": 0.2708967328071594, "learning_rate": 2.9528637258463254e-06, "loss": 0.01074322871863842, "memory(GiB)": 21.48, "step": 20088, "token_acc": 1.0, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.6526004612935711, "grad_norm": 0.32439887523651123, "learning_rate": 2.9523736700663463e-06, "loss": 0.010399199090898037, "memory(GiB)": 21.48, "step": 20089, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955237 }, { "epoch": 0.6526329467563265, "grad_norm": 0.4329085350036621, "learning_rate": 2.9518836379187016e-06, "loss": 0.01468746643513441, "memory(GiB)": 21.48, "step": 20090, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.6526654322190819, "grad_norm": 0.3330489993095398, "learning_rate": 2.9513936294090444e-06, "loss": 0.01434270478785038, "memory(GiB)": 21.48, "step": 20091, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.6526979176818374, "grad_norm": 0.4299946129322052, "learning_rate": 2.950903644543033e-06, "loss": 0.02669728919863701, "memory(GiB)": 21.48, "step": 20092, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.6527304031445927, "grad_norm": 0.5385056138038635, "learning_rate": 2.95041368332632e-06, "loss": 0.017159270122647285, "memory(GiB)": 21.48, "step": 20093, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955279 }, { "epoch": 0.6527628886073482, "grad_norm": 0.3266362249851227, "learning_rate": 2.9499237457645603e-06, "loss": 0.015593411400914192, "memory(GiB)": 21.48, "step": 20094, "token_acc": 1.0, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.6527953740701036, "grad_norm": 0.3200642168521881, "learning_rate": 2.9494338318634104e-06, "loss": 0.014367099851369858, "memory(GiB)": 21.48, "step": 20095, "token_acc": 1.0, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.652827859532859, "grad_norm": 0.38967108726501465, "learning_rate": 2.9489439416285225e-06, "loss": 0.018762197345495224, "memory(GiB)": 21.48, "step": 20096, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95531 }, { "epoch": 0.6528603449956145, "grad_norm": 0.859624981880188, "learning_rate": 2.948454075065552e-06, "loss": 0.011783964931964874, "memory(GiB)": 21.48, "step": 20097, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.6528928304583699, "grad_norm": 0.28613826632499695, "learning_rate": 2.9479642321801515e-06, "loss": 0.017427222803235054, "memory(GiB)": 21.48, "step": 20098, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95533 }, { "epoch": 0.6529253159211253, "grad_norm": 0.30701714754104614, "learning_rate": 2.9474744129779763e-06, "loss": 0.015540504828095436, "memory(GiB)": 21.48, "step": 20099, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955341 }, { "epoch": 0.6529578013838807, "grad_norm": 0.3623655140399933, "learning_rate": 2.9469846174646775e-06, "loss": 0.016588740050792694, "memory(GiB)": 21.48, "step": 20100, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955351 }, { "epoch": 0.6529902868466362, "grad_norm": 1.9962142705917358, "learning_rate": 2.9464948456459104e-06, "loss": 0.019144278019666672, "memory(GiB)": 21.48, "step": 20101, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955361 }, { "epoch": 0.6530227723093915, "grad_norm": 0.2887267470359802, "learning_rate": 2.9460050975273225e-06, "loss": 0.01791258528828621, "memory(GiB)": 21.48, "step": 20102, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.653055257772147, "grad_norm": 0.433689683675766, "learning_rate": 2.9455153731145737e-06, "loss": 0.01977725327014923, "memory(GiB)": 21.48, "step": 20103, "token_acc": 1.0, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.6530877432349024, "grad_norm": 0.4376325309276581, "learning_rate": 2.9450256724133073e-06, "loss": 0.013893291354179382, "memory(GiB)": 21.48, "step": 20104, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.6531202286976578, "grad_norm": 0.29456889629364014, "learning_rate": 2.944535995429184e-06, "loss": 0.011972388252615929, "memory(GiB)": 21.48, "step": 20105, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.6531527141604132, "grad_norm": 0.35231125354766846, "learning_rate": 2.9440463421678496e-06, "loss": 0.020035408437252045, "memory(GiB)": 21.48, "step": 20106, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.6531851996231687, "grad_norm": 0.3443608582019806, "learning_rate": 2.9435567126349572e-06, "loss": 0.017657840624451637, "memory(GiB)": 21.48, "step": 20107, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.955418 }, { "epoch": 0.653217685085924, "grad_norm": 0.33663684129714966, "learning_rate": 2.9430671068361568e-06, "loss": 0.010731152258813381, "memory(GiB)": 21.48, "step": 20108, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.6532501705486795, "grad_norm": 0.23260089755058289, "learning_rate": 2.9425775247770983e-06, "loss": 0.009566990658640862, "memory(GiB)": 21.48, "step": 20109, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955434 }, { "epoch": 0.6532826560114349, "grad_norm": 0.775000274181366, "learning_rate": 2.9420879664634366e-06, "loss": 0.01943015493452549, "memory(GiB)": 21.48, "step": 20110, "token_acc": 0.978021978021978, "train_speed(iter/s)": 0.955443 }, { "epoch": 0.6533151414741903, "grad_norm": 0.6217572093009949, "learning_rate": 2.9415984319008173e-06, "loss": 0.02282162569463253, "memory(GiB)": 21.48, "step": 20111, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955451 }, { "epoch": 0.6533476269369457, "grad_norm": 0.4147232174873352, "learning_rate": 2.9411089210948924e-06, "loss": 0.01650751754641533, "memory(GiB)": 21.48, "step": 20112, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.6533801123997012, "grad_norm": 0.3219064176082611, "learning_rate": 2.94061943405131e-06, "loss": 0.016972241923213005, "memory(GiB)": 21.48, "step": 20113, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.6534125978624565, "grad_norm": 0.4065382778644562, "learning_rate": 2.9401299707757214e-06, "loss": 0.018137481063604355, "memory(GiB)": 21.48, "step": 20114, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.653445083325212, "grad_norm": 0.4239144027233124, "learning_rate": 2.9396405312737737e-06, "loss": 0.01961573399603367, "memory(GiB)": 21.48, "step": 20115, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.6534775687879674, "grad_norm": 0.40367162227630615, "learning_rate": 2.9391511155511177e-06, "loss": 0.017125407233834267, "memory(GiB)": 21.48, "step": 20116, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.6535100542507228, "grad_norm": 0.42993634939193726, "learning_rate": 2.9386617236133997e-06, "loss": 0.02115456759929657, "memory(GiB)": 21.48, "step": 20117, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.6535425397134782, "grad_norm": 0.3089695870876312, "learning_rate": 2.93817235546627e-06, "loss": 0.016713103279471397, "memory(GiB)": 21.48, "step": 20118, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.6535750251762337, "grad_norm": 0.5219563841819763, "learning_rate": 2.937683011115374e-06, "loss": 0.02049928903579712, "memory(GiB)": 21.48, "step": 20119, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.653607510638989, "grad_norm": 0.3127491772174835, "learning_rate": 2.9371936905663633e-06, "loss": 0.013262571766972542, "memory(GiB)": 21.48, "step": 20120, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.6536399961017445, "grad_norm": 0.47704261541366577, "learning_rate": 2.936704393824881e-06, "loss": 0.019177738577127457, "memory(GiB)": 21.48, "step": 20121, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955522 }, { "epoch": 0.6536724815644999, "grad_norm": 0.857292890548706, "learning_rate": 2.9362151208965773e-06, "loss": 0.01611059531569481, "memory(GiB)": 21.48, "step": 20122, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.6537049670272553, "grad_norm": 0.30499929189682007, "learning_rate": 2.9357258717870974e-06, "loss": 0.010791096836328506, "memory(GiB)": 21.48, "step": 20123, "token_acc": 1.0, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.6537374524900107, "grad_norm": 0.41459378600120544, "learning_rate": 2.9352366465020885e-06, "loss": 0.029230032116174698, "memory(GiB)": 21.48, "step": 20124, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.955541 }, { "epoch": 0.6537699379527662, "grad_norm": 0.4776536226272583, "learning_rate": 2.934747445047198e-06, "loss": 0.011020217090845108, "memory(GiB)": 21.48, "step": 20125, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955548 }, { "epoch": 0.6538024234155215, "grad_norm": 0.28579458594322205, "learning_rate": 2.9342582674280696e-06, "loss": 0.012057057581841946, "memory(GiB)": 21.48, "step": 20126, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.653834908878277, "grad_norm": 0.3898111581802368, "learning_rate": 2.933769113650351e-06, "loss": 0.02137797698378563, "memory(GiB)": 21.48, "step": 20127, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.955561 }, { "epoch": 0.6538673943410324, "grad_norm": 0.2762758135795593, "learning_rate": 2.9332799837196856e-06, "loss": 0.01522355992347002, "memory(GiB)": 21.48, "step": 20128, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955567 }, { "epoch": 0.6538998798037878, "grad_norm": 0.38178351521492004, "learning_rate": 2.932790877641723e-06, "loss": 0.019920576363801956, "memory(GiB)": 21.48, "step": 20129, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.6539323652665432, "grad_norm": 0.2374047040939331, "learning_rate": 2.9323017954221016e-06, "loss": 0.011174896731972694, "memory(GiB)": 21.48, "step": 20130, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.6539648507292987, "grad_norm": 0.4456517994403839, "learning_rate": 2.931812737066473e-06, "loss": 0.014292503707110882, "memory(GiB)": 21.48, "step": 20131, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95559 }, { "epoch": 0.653997336192054, "grad_norm": 0.3252421021461487, "learning_rate": 2.9313237025804746e-06, "loss": 0.014834890142083168, "memory(GiB)": 21.48, "step": 20132, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.6540298216548095, "grad_norm": 0.3591594696044922, "learning_rate": 2.930834691969757e-06, "loss": 0.015661772340536118, "memory(GiB)": 21.48, "step": 20133, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.6540623071175649, "grad_norm": 0.29262158274650574, "learning_rate": 2.9303457052399596e-06, "loss": 0.012417709454894066, "memory(GiB)": 21.48, "step": 20134, "token_acc": 1.0, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.6540947925803203, "grad_norm": 0.27431774139404297, "learning_rate": 2.9298567423967283e-06, "loss": 0.012427953071892262, "memory(GiB)": 21.48, "step": 20135, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.955624 }, { "epoch": 0.6541272780430757, "grad_norm": 0.47255027294158936, "learning_rate": 2.929367803445705e-06, "loss": 0.020267508924007416, "memory(GiB)": 21.48, "step": 20136, "token_acc": 1.0, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.6541597635058312, "grad_norm": 0.3667037785053253, "learning_rate": 2.9288788883925334e-06, "loss": 0.018673736602067947, "memory(GiB)": 21.48, "step": 20137, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955645 }, { "epoch": 0.6541922489685865, "grad_norm": 0.33757737278938293, "learning_rate": 2.928389997242855e-06, "loss": 0.021191168576478958, "memory(GiB)": 21.48, "step": 20138, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.654224734431342, "grad_norm": 0.2876509428024292, "learning_rate": 2.9279011300023154e-06, "loss": 0.015043506398797035, "memory(GiB)": 21.48, "step": 20139, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.6542572198940974, "grad_norm": 0.26550835371017456, "learning_rate": 2.927412286676553e-06, "loss": 0.011751208454370499, "memory(GiB)": 21.48, "step": 20140, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.6542897053568528, "grad_norm": 0.33497893810272217, "learning_rate": 2.9269234672712123e-06, "loss": 0.019654199481010437, "memory(GiB)": 21.48, "step": 20141, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955686 }, { "epoch": 0.6543221908196082, "grad_norm": 0.41156795620918274, "learning_rate": 2.926434671791934e-06, "loss": 0.021896343678236008, "memory(GiB)": 21.48, "step": 20142, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.6543546762823637, "grad_norm": 0.3284519612789154, "learning_rate": 2.925945900244359e-06, "loss": 0.017175566405057907, "memory(GiB)": 21.48, "step": 20143, "token_acc": 1.0, "train_speed(iter/s)": 0.955706 }, { "epoch": 0.654387161745119, "grad_norm": 0.38125351071357727, "learning_rate": 2.9254571526341293e-06, "loss": 0.009628377854824066, "memory(GiB)": 21.48, "step": 20144, "token_acc": 1.0, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.6544196472078745, "grad_norm": 0.4787728190422058, "learning_rate": 2.9249684289668845e-06, "loss": 0.01730302721261978, "memory(GiB)": 21.48, "step": 20145, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.6544521326706298, "grad_norm": 0.3097379207611084, "learning_rate": 2.9244797292482672e-06, "loss": 0.014563914388418198, "memory(GiB)": 21.48, "step": 20146, "token_acc": 1.0, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.6544846181333853, "grad_norm": 0.36804988980293274, "learning_rate": 2.9239910534839154e-06, "loss": 0.013732368126511574, "memory(GiB)": 21.48, "step": 20147, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.6545171035961407, "grad_norm": 0.6742547750473022, "learning_rate": 2.9235024016794705e-06, "loss": 0.017662785947322845, "memory(GiB)": 21.48, "step": 20148, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.6545495890588962, "grad_norm": 0.36846280097961426, "learning_rate": 2.923013773840571e-06, "loss": 0.01618238538503647, "memory(GiB)": 21.48, "step": 20149, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.6545820745216515, "grad_norm": 0.38813602924346924, "learning_rate": 2.922525169972858e-06, "loss": 0.01763208955526352, "memory(GiB)": 21.48, "step": 20150, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.654614559984407, "grad_norm": 0.31818994879722595, "learning_rate": 2.9220365900819688e-06, "loss": 0.014610908925533295, "memory(GiB)": 21.48, "step": 20151, "token_acc": 1.0, "train_speed(iter/s)": 0.955786 }, { "epoch": 0.6546470454471623, "grad_norm": 0.29591822624206543, "learning_rate": 2.9215480341735446e-06, "loss": 0.014645667746663094, "memory(GiB)": 21.48, "step": 20152, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955796 }, { "epoch": 0.6546795309099178, "grad_norm": 0.42921867966651917, "learning_rate": 2.92105950225322e-06, "loss": 0.017583400011062622, "memory(GiB)": 21.48, "step": 20153, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.955806 }, { "epoch": 0.6547120163726732, "grad_norm": 0.2581048905849457, "learning_rate": 2.9205709943266393e-06, "loss": 0.009649227373301983, "memory(GiB)": 21.48, "step": 20154, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.6547445018354286, "grad_norm": 0.3183489143848419, "learning_rate": 2.920082510399433e-06, "loss": 0.012156574055552483, "memory(GiB)": 21.48, "step": 20155, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.654776987298184, "grad_norm": 0.8744084239006042, "learning_rate": 2.919594050477244e-06, "loss": 0.016477111726999283, "memory(GiB)": 21.48, "step": 20156, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955836 }, { "epoch": 0.6548094727609395, "grad_norm": 0.48039430379867554, "learning_rate": 2.9191056145657117e-06, "loss": 0.021664682775735855, "memory(GiB)": 21.48, "step": 20157, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.6548419582236948, "grad_norm": 0.3107694983482361, "learning_rate": 2.9186172026704675e-06, "loss": 0.011512299068272114, "memory(GiB)": 21.48, "step": 20158, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.6548744436864503, "grad_norm": 0.5911591053009033, "learning_rate": 2.918128814797151e-06, "loss": 0.012443131767213345, "memory(GiB)": 21.48, "step": 20159, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955866 }, { "epoch": 0.6549069291492058, "grad_norm": 0.646220326423645, "learning_rate": 2.917640450951399e-06, "loss": 0.023959225043654442, "memory(GiB)": 21.48, "step": 20160, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.6549394146119611, "grad_norm": 0.27915313839912415, "learning_rate": 2.9171521111388497e-06, "loss": 0.00904717855155468, "memory(GiB)": 21.48, "step": 20161, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.6549719000747166, "grad_norm": 0.40602388978004456, "learning_rate": 2.9166637953651356e-06, "loss": 0.016169928014278412, "memory(GiB)": 21.48, "step": 20162, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955896 }, { "epoch": 0.655004385537472, "grad_norm": 0.3412315249443054, "learning_rate": 2.916175503635894e-06, "loss": 0.01911826990544796, "memory(GiB)": 21.48, "step": 20163, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955907 }, { "epoch": 0.6550368710002274, "grad_norm": 0.43003609776496887, "learning_rate": 2.915687235956761e-06, "loss": 0.017927002161741257, "memory(GiB)": 21.48, "step": 20164, "token_acc": 0.9862542955326461, "train_speed(iter/s)": 0.955917 }, { "epoch": 0.6550693564629828, "grad_norm": 0.2694402039051056, "learning_rate": 2.915198992333373e-06, "loss": 0.011883838102221489, "memory(GiB)": 21.48, "step": 20165, "token_acc": 1.0, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.6551018419257383, "grad_norm": 0.4333679676055908, "learning_rate": 2.914710772771362e-06, "loss": 0.016625257208943367, "memory(GiB)": 21.48, "step": 20166, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.6551343273884936, "grad_norm": 0.360306978225708, "learning_rate": 2.9142225772763653e-06, "loss": 0.012658599764108658, "memory(GiB)": 21.48, "step": 20167, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.6551668128512491, "grad_norm": 0.5429027676582336, "learning_rate": 2.913734405854013e-06, "loss": 0.028837332502007484, "memory(GiB)": 21.48, "step": 20168, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955956 }, { "epoch": 0.6551992983140045, "grad_norm": 0.32023125886917114, "learning_rate": 2.9132462585099462e-06, "loss": 0.016225147992372513, "memory(GiB)": 21.48, "step": 20169, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955965 }, { "epoch": 0.65523178377676, "grad_norm": 0.3453003466129303, "learning_rate": 2.9127581352497936e-06, "loss": 0.011072712019085884, "memory(GiB)": 21.48, "step": 20170, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955974 }, { "epoch": 0.6552642692395153, "grad_norm": 0.5048293471336365, "learning_rate": 2.91227003607919e-06, "loss": 0.02057744190096855, "memory(GiB)": 21.48, "step": 20171, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955982 }, { "epoch": 0.6552967547022708, "grad_norm": 0.3206731379032135, "learning_rate": 2.91178196100377e-06, "loss": 0.015639716759324074, "memory(GiB)": 21.48, "step": 20172, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955991 }, { "epoch": 0.6553292401650261, "grad_norm": 0.39503318071365356, "learning_rate": 2.9112939100291626e-06, "loss": 0.021047178655862808, "memory(GiB)": 21.48, "step": 20173, "token_acc": 1.0, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.6553617256277816, "grad_norm": 0.35984140634536743, "learning_rate": 2.9108058831610074e-06, "loss": 0.020593496039509773, "memory(GiB)": 21.48, "step": 20174, "token_acc": 1.0, "train_speed(iter/s)": 0.956008 }, { "epoch": 0.655394211090537, "grad_norm": 0.4115903079509735, "learning_rate": 2.910317880404931e-06, "loss": 0.017802489921450615, "memory(GiB)": 21.48, "step": 20175, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956016 }, { "epoch": 0.6554266965532924, "grad_norm": 0.3113752007484436, "learning_rate": 2.90982990176657e-06, "loss": 0.02104467898607254, "memory(GiB)": 21.48, "step": 20176, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.6554591820160478, "grad_norm": 0.38681602478027344, "learning_rate": 2.909341947251551e-06, "loss": 0.01972895860671997, "memory(GiB)": 21.48, "step": 20177, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.6554916674788033, "grad_norm": 0.43783819675445557, "learning_rate": 2.9088540168655087e-06, "loss": 0.023904183879494667, "memory(GiB)": 21.48, "step": 20178, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956036 }, { "epoch": 0.6555241529415586, "grad_norm": 0.332283616065979, "learning_rate": 2.908366110614075e-06, "loss": 0.018621385097503662, "memory(GiB)": 21.48, "step": 20179, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956043 }, { "epoch": 0.6555566384043141, "grad_norm": 0.24527595937252045, "learning_rate": 2.9078782285028817e-06, "loss": 0.013096791692078114, "memory(GiB)": 21.48, "step": 20180, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.6555891238670695, "grad_norm": 0.4459436237812042, "learning_rate": 2.907390370537557e-06, "loss": 0.02081248164176941, "memory(GiB)": 21.48, "step": 20181, "token_acc": 1.0, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.6556216093298249, "grad_norm": 0.39135241508483887, "learning_rate": 2.9069025367237324e-06, "loss": 0.018101699650287628, "memory(GiB)": 21.48, "step": 20182, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.6556540947925803, "grad_norm": 0.30518314242362976, "learning_rate": 2.9064147270670384e-06, "loss": 0.013925068080425262, "memory(GiB)": 21.48, "step": 20183, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.6556865802553358, "grad_norm": 0.33168864250183105, "learning_rate": 2.905926941573107e-06, "loss": 0.02077564038336277, "memory(GiB)": 21.48, "step": 20184, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.6557190657180911, "grad_norm": 0.681703507900238, "learning_rate": 2.9054391802475636e-06, "loss": 0.02271544747054577, "memory(GiB)": 21.48, "step": 20185, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.6557515511808466, "grad_norm": 0.6548945903778076, "learning_rate": 2.9049514430960403e-06, "loss": 0.017877710983157158, "memory(GiB)": 21.48, "step": 20186, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956089 }, { "epoch": 0.655784036643602, "grad_norm": 0.41219091415405273, "learning_rate": 2.904463730124165e-06, "loss": 0.010613733902573586, "memory(GiB)": 21.48, "step": 20187, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.6558165221063574, "grad_norm": 0.46283969283103943, "learning_rate": 2.9039760413375683e-06, "loss": 0.012896222062408924, "memory(GiB)": 21.48, "step": 20188, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.6558490075691128, "grad_norm": 0.3139675259590149, "learning_rate": 2.9034883767418797e-06, "loss": 0.014231283217668533, "memory(GiB)": 21.48, "step": 20189, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.6558814930318683, "grad_norm": 0.4290238916873932, "learning_rate": 2.9030007363427236e-06, "loss": 0.01829826831817627, "memory(GiB)": 21.48, "step": 20190, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.6559139784946236, "grad_norm": 0.2569482624530792, "learning_rate": 2.9025131201457302e-06, "loss": 0.015983151271939278, "memory(GiB)": 21.48, "step": 20191, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.6559464639573791, "grad_norm": 0.4528897702693939, "learning_rate": 2.9020255281565272e-06, "loss": 0.017355378717184067, "memory(GiB)": 21.48, "step": 20192, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95613 }, { "epoch": 0.6559789494201345, "grad_norm": 0.3232085108757019, "learning_rate": 2.9015379603807436e-06, "loss": 0.010225681588053703, "memory(GiB)": 21.48, "step": 20193, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.6560114348828899, "grad_norm": 0.3240613341331482, "learning_rate": 2.901050416824004e-06, "loss": 0.01241380162537098, "memory(GiB)": 21.48, "step": 20194, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956145 }, { "epoch": 0.6560439203456453, "grad_norm": 0.5209274888038635, "learning_rate": 2.9005628974919375e-06, "loss": 0.01808817684650421, "memory(GiB)": 21.48, "step": 20195, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.6560764058084008, "grad_norm": 0.5049550533294678, "learning_rate": 2.9000754023901657e-06, "loss": 0.013763013295829296, "memory(GiB)": 21.48, "step": 20196, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956166 }, { "epoch": 0.6561088912711561, "grad_norm": 0.3720379173755646, "learning_rate": 2.8995879315243224e-06, "loss": 0.018509987741708755, "memory(GiB)": 21.48, "step": 20197, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.6561413767339116, "grad_norm": 0.37112170457839966, "learning_rate": 2.899100484900028e-06, "loss": 0.018329329788684845, "memory(GiB)": 21.48, "step": 20198, "token_acc": 1.0, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.656173862196667, "grad_norm": 0.31590017676353455, "learning_rate": 2.8986130625229126e-06, "loss": 0.013567079789936543, "memory(GiB)": 21.48, "step": 20199, "token_acc": 1.0, "train_speed(iter/s)": 0.956197 }, { "epoch": 0.6562063476594224, "grad_norm": 0.5190714597702026, "learning_rate": 2.898125664398598e-06, "loss": 0.017087172716856003, "memory(GiB)": 21.48, "step": 20200, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.6562388331221778, "grad_norm": 0.26542988419532776, "learning_rate": 2.8976382905327094e-06, "loss": 0.01747342385351658, "memory(GiB)": 21.48, "step": 20201, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956217 }, { "epoch": 0.6562713185849333, "grad_norm": 0.38306328654289246, "learning_rate": 2.897150940930874e-06, "loss": 0.013973118737339973, "memory(GiB)": 21.48, "step": 20202, "token_acc": 1.0, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.6563038040476886, "grad_norm": 0.28135401010513306, "learning_rate": 2.896663615598716e-06, "loss": 0.010762276127934456, "memory(GiB)": 21.48, "step": 20203, "token_acc": 1.0, "train_speed(iter/s)": 0.956237 }, { "epoch": 0.6563362895104441, "grad_norm": 0.5319742560386658, "learning_rate": 2.8961763145418607e-06, "loss": 0.016254641115665436, "memory(GiB)": 21.48, "step": 20204, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.6563687749731995, "grad_norm": 0.32662495970726013, "learning_rate": 2.895689037765929e-06, "loss": 0.015413510613143444, "memory(GiB)": 21.48, "step": 20205, "token_acc": 1.0, "train_speed(iter/s)": 0.956258 }, { "epoch": 0.6564012604359549, "grad_norm": 0.4311351776123047, "learning_rate": 2.8952017852765468e-06, "loss": 0.02021440863609314, "memory(GiB)": 21.48, "step": 20206, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956268 }, { "epoch": 0.6564337458987103, "grad_norm": 0.30033940076828003, "learning_rate": 2.894714557079338e-06, "loss": 0.011559726670384407, "memory(GiB)": 21.48, "step": 20207, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.6564662313614658, "grad_norm": 0.3258799612522125, "learning_rate": 2.8942273531799265e-06, "loss": 0.013498252257704735, "memory(GiB)": 21.48, "step": 20208, "token_acc": 1.0, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.6564987168242211, "grad_norm": 0.3457922339439392, "learning_rate": 2.893740173583932e-06, "loss": 0.014190170913934708, "memory(GiB)": 21.48, "step": 20209, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.6565312022869766, "grad_norm": 0.3439484238624573, "learning_rate": 2.89325301829698e-06, "loss": 0.01418760884553194, "memory(GiB)": 21.48, "step": 20210, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.656563687749732, "grad_norm": 0.45234209299087524, "learning_rate": 2.8927658873246924e-06, "loss": 0.020268820226192474, "memory(GiB)": 21.48, "step": 20211, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.6565961732124874, "grad_norm": 0.6507185101509094, "learning_rate": 2.8922787806726927e-06, "loss": 0.01879016123712063, "memory(GiB)": 21.48, "step": 20212, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95633 }, { "epoch": 0.6566286586752428, "grad_norm": 0.3903338611125946, "learning_rate": 2.8917916983465993e-06, "loss": 0.017557233572006226, "memory(GiB)": 21.48, "step": 20213, "token_acc": 1.0, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.6566611441379983, "grad_norm": 0.3178448975086212, "learning_rate": 2.8913046403520354e-06, "loss": 0.00962037593126297, "memory(GiB)": 21.48, "step": 20214, "token_acc": 1.0, "train_speed(iter/s)": 0.956348 }, { "epoch": 0.6566936296007536, "grad_norm": 0.3183369040489197, "learning_rate": 2.890817606694624e-06, "loss": 0.015741368755698204, "memory(GiB)": 21.48, "step": 20215, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.6567261150635091, "grad_norm": 0.33856239914894104, "learning_rate": 2.890330597379985e-06, "loss": 0.011326106265187263, "memory(GiB)": 21.48, "step": 20216, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.6567586005262644, "grad_norm": 0.4253609776496887, "learning_rate": 2.8898436124137376e-06, "loss": 0.016224054619669914, "memory(GiB)": 21.48, "step": 20217, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.6567910859890199, "grad_norm": 0.4739415645599365, "learning_rate": 2.889356651801504e-06, "loss": 0.018219061195850372, "memory(GiB)": 21.48, "step": 20218, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.6568235714517753, "grad_norm": 0.27047762274742126, "learning_rate": 2.888869715548903e-06, "loss": 0.01697675883769989, "memory(GiB)": 21.48, "step": 20219, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.6568560569145308, "grad_norm": 0.37416180968284607, "learning_rate": 2.8883828036615557e-06, "loss": 0.01943044923245907, "memory(GiB)": 21.48, "step": 20220, "token_acc": 1.0, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.6568885423772861, "grad_norm": 0.262729287147522, "learning_rate": 2.8878959161450835e-06, "loss": 0.012155594304203987, "memory(GiB)": 21.48, "step": 20221, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.6569210278400416, "grad_norm": 0.28361645340919495, "learning_rate": 2.8874090530051015e-06, "loss": 0.013244959525763988, "memory(GiB)": 21.48, "step": 20222, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.6569535133027969, "grad_norm": 0.2514412999153137, "learning_rate": 2.886922214247233e-06, "loss": 0.011602204293012619, "memory(GiB)": 21.48, "step": 20223, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.6569859987655524, "grad_norm": 0.41103145480155945, "learning_rate": 2.886435399877091e-06, "loss": 0.01907799392938614, "memory(GiB)": 21.48, "step": 20224, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.6570184842283079, "grad_norm": 0.33749106526374817, "learning_rate": 2.885948609900301e-06, "loss": 0.015707433223724365, "memory(GiB)": 21.48, "step": 20225, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.6570509696910632, "grad_norm": 0.6085474491119385, "learning_rate": 2.885461844322476e-06, "loss": 0.02273530140519142, "memory(GiB)": 21.48, "step": 20226, "token_acc": 1.0, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.6570834551538187, "grad_norm": 0.24540355801582336, "learning_rate": 2.884975103149238e-06, "loss": 0.009567433968186378, "memory(GiB)": 21.48, "step": 20227, "token_acc": 1.0, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.6571159406165741, "grad_norm": 0.3565152585506439, "learning_rate": 2.8844883863862005e-06, "loss": 0.019287729635834694, "memory(GiB)": 21.48, "step": 20228, "token_acc": 1.0, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.6571484260793296, "grad_norm": 0.3178061544895172, "learning_rate": 2.8840016940389824e-06, "loss": 0.015466203913092613, "memory(GiB)": 21.48, "step": 20229, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.6571809115420849, "grad_norm": 0.590316653251648, "learning_rate": 2.883515026113202e-06, "loss": 0.020144781097769737, "memory(GiB)": 21.48, "step": 20230, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956512 }, { "epoch": 0.6572133970048404, "grad_norm": 0.39829009771347046, "learning_rate": 2.8830283826144754e-06, "loss": 0.019464660435914993, "memory(GiB)": 21.48, "step": 20231, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.6572458824675957, "grad_norm": 0.402019202709198, "learning_rate": 2.8825417635484186e-06, "loss": 0.018383517861366272, "memory(GiB)": 21.48, "step": 20232, "token_acc": 1.0, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.6572783679303512, "grad_norm": 0.4310768246650696, "learning_rate": 2.8820551689206476e-06, "loss": 0.02149897813796997, "memory(GiB)": 21.48, "step": 20233, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.6573108533931066, "grad_norm": 2.5743191242218018, "learning_rate": 2.881568598736779e-06, "loss": 0.031793106347322464, "memory(GiB)": 21.48, "step": 20234, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956546 }, { "epoch": 0.657343338855862, "grad_norm": 0.44883424043655396, "learning_rate": 2.8810820530024287e-06, "loss": 0.01751505397260189, "memory(GiB)": 21.48, "step": 20235, "token_acc": 1.0, "train_speed(iter/s)": 0.956554 }, { "epoch": 0.6573758243186174, "grad_norm": 0.47211408615112305, "learning_rate": 2.8805955317232132e-06, "loss": 0.021094806492328644, "memory(GiB)": 21.48, "step": 20236, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.6574083097813729, "grad_norm": 0.5149438977241516, "learning_rate": 2.8801090349047444e-06, "loss": 0.0198652446269989, "memory(GiB)": 21.48, "step": 20237, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.6574407952441282, "grad_norm": 0.42901384830474854, "learning_rate": 2.8796225625526384e-06, "loss": 0.016984667629003525, "memory(GiB)": 21.48, "step": 20238, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.6574732807068837, "grad_norm": 0.6177647709846497, "learning_rate": 2.879136114672511e-06, "loss": 0.020194169133901596, "memory(GiB)": 21.48, "step": 20239, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.6575057661696391, "grad_norm": 0.3807366192340851, "learning_rate": 2.878649691269978e-06, "loss": 0.012918584048748016, "memory(GiB)": 21.48, "step": 20240, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.6575382516323945, "grad_norm": 0.2745082676410675, "learning_rate": 2.8781632923506484e-06, "loss": 0.010512640699744225, "memory(GiB)": 21.48, "step": 20241, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.6575707370951499, "grad_norm": 0.25328314304351807, "learning_rate": 2.8776769179201387e-06, "loss": 0.010169618763029575, "memory(GiB)": 21.48, "step": 20242, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.6576032225579054, "grad_norm": 0.3789655566215515, "learning_rate": 2.8771905679840616e-06, "loss": 0.01733633503317833, "memory(GiB)": 21.48, "step": 20243, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.6576357080206607, "grad_norm": 0.3253921866416931, "learning_rate": 2.876704242548034e-06, "loss": 0.01312272623181343, "memory(GiB)": 21.48, "step": 20244, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.6576681934834162, "grad_norm": 0.45983970165252686, "learning_rate": 2.8762179416176626e-06, "loss": 0.012789425440132618, "memory(GiB)": 21.48, "step": 20245, "token_acc": 1.0, "train_speed(iter/s)": 0.956624 }, { "epoch": 0.6577006789461716, "grad_norm": 0.3440702557563782, "learning_rate": 2.8757316651985655e-06, "loss": 0.010234356857836246, "memory(GiB)": 21.48, "step": 20246, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.657733164408927, "grad_norm": 0.2938929796218872, "learning_rate": 2.875245413296348e-06, "loss": 0.011882426217198372, "memory(GiB)": 21.48, "step": 20247, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.6577656498716824, "grad_norm": 0.8001688122749329, "learning_rate": 2.874759185916629e-06, "loss": 0.015738312155008316, "memory(GiB)": 21.48, "step": 20248, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956643 }, { "epoch": 0.6577981353344379, "grad_norm": 0.29737797379493713, "learning_rate": 2.874272983065019e-06, "loss": 0.0130708497017622, "memory(GiB)": 21.48, "step": 20249, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956649 }, { "epoch": 0.6578306207971932, "grad_norm": 0.2858196198940277, "learning_rate": 2.873786804747126e-06, "loss": 0.012929295189678669, "memory(GiB)": 21.48, "step": 20250, "token_acc": 1.0, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.6578631062599487, "grad_norm": 0.35092994570732117, "learning_rate": 2.8733006509685658e-06, "loss": 0.014759429730474949, "memory(GiB)": 21.48, "step": 20251, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.6578955917227041, "grad_norm": 0.3850507140159607, "learning_rate": 2.872814521734943e-06, "loss": 0.01447567343711853, "memory(GiB)": 21.48, "step": 20252, "token_acc": 1.0, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.6579280771854595, "grad_norm": 0.32971739768981934, "learning_rate": 2.872328417051875e-06, "loss": 0.015792978927493095, "memory(GiB)": 21.48, "step": 20253, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956675 }, { "epoch": 0.6579605626482149, "grad_norm": 0.37807944416999817, "learning_rate": 2.8718423369249676e-06, "loss": 0.01566047966480255, "memory(GiB)": 21.48, "step": 20254, "token_acc": 1.0, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.6579930481109704, "grad_norm": 0.5517247319221497, "learning_rate": 2.871356281359834e-06, "loss": 0.023326033726334572, "memory(GiB)": 21.48, "step": 20255, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.6580255335737257, "grad_norm": 0.2994668483734131, "learning_rate": 2.87087025036208e-06, "loss": 0.01197783648967743, "memory(GiB)": 21.48, "step": 20256, "token_acc": 1.0, "train_speed(iter/s)": 0.956701 }, { "epoch": 0.6580580190364812, "grad_norm": 0.4156496822834015, "learning_rate": 2.8703842439373175e-06, "loss": 0.021755486726760864, "memory(GiB)": 21.48, "step": 20257, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.6580905044992366, "grad_norm": 0.34076976776123047, "learning_rate": 2.8698982620911552e-06, "loss": 0.0136195607483387, "memory(GiB)": 21.48, "step": 20258, "token_acc": 1.0, "train_speed(iter/s)": 0.956721 }, { "epoch": 0.658122989961992, "grad_norm": 0.24139396846294403, "learning_rate": 2.8694123048292043e-06, "loss": 0.009842387400567532, "memory(GiB)": 21.48, "step": 20259, "token_acc": 1.0, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.6581554754247474, "grad_norm": 0.388983815908432, "learning_rate": 2.8689263721570693e-06, "loss": 0.013752472586929798, "memory(GiB)": 21.48, "step": 20260, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.6581879608875029, "grad_norm": 0.27960824966430664, "learning_rate": 2.86844046408036e-06, "loss": 0.013756630942225456, "memory(GiB)": 21.48, "step": 20261, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.6582204463502582, "grad_norm": 0.3446361720561981, "learning_rate": 2.867954580604685e-06, "loss": 0.015692908316850662, "memory(GiB)": 21.48, "step": 20262, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.6582529318130137, "grad_norm": 0.3623145520687103, "learning_rate": 2.8674687217356515e-06, "loss": 0.018188390880823135, "memory(GiB)": 21.48, "step": 20263, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956773 }, { "epoch": 0.6582854172757691, "grad_norm": 0.4617268443107605, "learning_rate": 2.8669828874788698e-06, "loss": 0.01914130337536335, "memory(GiB)": 21.48, "step": 20264, "token_acc": 1.0, "train_speed(iter/s)": 0.956783 }, { "epoch": 0.6583179027385245, "grad_norm": 0.4078075587749481, "learning_rate": 2.8664970778399427e-06, "loss": 0.01631843112409115, "memory(GiB)": 21.48, "step": 20265, "token_acc": 0.988, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.6583503882012799, "grad_norm": 0.3348865211009979, "learning_rate": 2.8660112928244786e-06, "loss": 0.012491045519709587, "memory(GiB)": 21.48, "step": 20266, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956799 }, { "epoch": 0.6583828736640354, "grad_norm": 0.5594047904014587, "learning_rate": 2.865525532438085e-06, "loss": 0.021650832146406174, "memory(GiB)": 21.48, "step": 20267, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956807 }, { "epoch": 0.6584153591267907, "grad_norm": 0.3678176701068878, "learning_rate": 2.8650397966863695e-06, "loss": 0.011334901675581932, "memory(GiB)": 21.48, "step": 20268, "token_acc": 1.0, "train_speed(iter/s)": 0.956815 }, { "epoch": 0.6584478445895462, "grad_norm": 0.2759803831577301, "learning_rate": 2.8645540855749345e-06, "loss": 0.00991959497332573, "memory(GiB)": 21.48, "step": 20269, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.6584803300523016, "grad_norm": 0.34613606333732605, "learning_rate": 2.8640683991093876e-06, "loss": 0.012971604242920876, "memory(GiB)": 21.48, "step": 20270, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.658512815515057, "grad_norm": 0.3906777799129486, "learning_rate": 2.863582737295334e-06, "loss": 0.01944759674370289, "memory(GiB)": 21.48, "step": 20271, "token_acc": 0.995, "train_speed(iter/s)": 0.956837 }, { "epoch": 0.6585453009778124, "grad_norm": 0.2500998377799988, "learning_rate": 2.8630971001383813e-06, "loss": 0.00998048484325409, "memory(GiB)": 21.48, "step": 20272, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956844 }, { "epoch": 0.6585777864405679, "grad_norm": 0.246060311794281, "learning_rate": 2.86261148764413e-06, "loss": 0.019701972603797913, "memory(GiB)": 21.48, "step": 20273, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956852 }, { "epoch": 0.6586102719033232, "grad_norm": 0.21051527559757233, "learning_rate": 2.86212589981819e-06, "loss": 0.008840778842568398, "memory(GiB)": 21.48, "step": 20274, "token_acc": 1.0, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.6586427573660787, "grad_norm": 0.3995983302593231, "learning_rate": 2.861640336666157e-06, "loss": 0.019592754542827606, "memory(GiB)": 21.48, "step": 20275, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956867 }, { "epoch": 0.658675242828834, "grad_norm": 0.28490954637527466, "learning_rate": 2.861154798193646e-06, "loss": 0.011833051219582558, "memory(GiB)": 21.48, "step": 20276, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.6587077282915895, "grad_norm": 0.36703968048095703, "learning_rate": 2.8606692844062535e-06, "loss": 0.017128443345427513, "memory(GiB)": 21.48, "step": 20277, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.6587402137543449, "grad_norm": 0.2617102861404419, "learning_rate": 2.8601837953095862e-06, "loss": 0.015427256934344769, "memory(GiB)": 21.48, "step": 20278, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.6587726992171004, "grad_norm": 0.29141765832901, "learning_rate": 2.859698330909244e-06, "loss": 0.01479644887149334, "memory(GiB)": 21.48, "step": 20279, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956897 }, { "epoch": 0.6588051846798557, "grad_norm": 1.0138309001922607, "learning_rate": 2.8592128912108303e-06, "loss": 0.019498219713568687, "memory(GiB)": 21.48, "step": 20280, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.6588376701426112, "grad_norm": 0.4000854790210724, "learning_rate": 2.8587274762199524e-06, "loss": 0.01361675001680851, "memory(GiB)": 21.48, "step": 20281, "token_acc": 1.0, "train_speed(iter/s)": 0.956911 }, { "epoch": 0.6588701556053665, "grad_norm": 0.34281831979751587, "learning_rate": 2.8582420859422082e-06, "loss": 0.014004688709974289, "memory(GiB)": 21.48, "step": 20282, "token_acc": 0.98828125, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.658902641068122, "grad_norm": 0.5010555982589722, "learning_rate": 2.857756720383203e-06, "loss": 0.014331159181892872, "memory(GiB)": 21.48, "step": 20283, "token_acc": 1.0, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.6589351265308774, "grad_norm": 0.3630302846431732, "learning_rate": 2.8572713795485324e-06, "loss": 0.01731187105178833, "memory(GiB)": 21.48, "step": 20284, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956937 }, { "epoch": 0.6589676119936329, "grad_norm": 0.4703822135925293, "learning_rate": 2.856786063443806e-06, "loss": 0.019699322059750557, "memory(GiB)": 21.48, "step": 20285, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.6590000974563882, "grad_norm": 0.39302489161491394, "learning_rate": 2.856300772074619e-06, "loss": 0.020075837150216103, "memory(GiB)": 21.48, "step": 20286, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956956 }, { "epoch": 0.6590325829191437, "grad_norm": 0.3012160062789917, "learning_rate": 2.8558155054465766e-06, "loss": 0.014577122405171394, "memory(GiB)": 21.48, "step": 20287, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.6590650683818992, "grad_norm": 0.3514670133590698, "learning_rate": 2.855330263565275e-06, "loss": 0.012633190490305424, "memory(GiB)": 21.48, "step": 20288, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956974 }, { "epoch": 0.6590975538446545, "grad_norm": 0.3940052390098572, "learning_rate": 2.8548450464363163e-06, "loss": 0.026078173890709877, "memory(GiB)": 21.48, "step": 20289, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956985 }, { "epoch": 0.65913003930741, "grad_norm": 0.34293070435523987, "learning_rate": 2.8543598540653016e-06, "loss": 0.01436430960893631, "memory(GiB)": 21.48, "step": 20290, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956993 }, { "epoch": 0.6591625247701653, "grad_norm": 0.255604088306427, "learning_rate": 2.8538746864578317e-06, "loss": 0.010216610506176949, "memory(GiB)": 21.48, "step": 20291, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957003 }, { "epoch": 0.6591950102329208, "grad_norm": 0.33291107416152954, "learning_rate": 2.8533895436195025e-06, "loss": 0.017957421019673347, "memory(GiB)": 21.48, "step": 20292, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.6592274956956762, "grad_norm": 0.37813103199005127, "learning_rate": 2.8529044255559146e-06, "loss": 0.01584324985742569, "memory(GiB)": 21.48, "step": 20293, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957021 }, { "epoch": 0.6592599811584317, "grad_norm": 0.29612571001052856, "learning_rate": 2.8524193322726678e-06, "loss": 0.015287823043763638, "memory(GiB)": 21.48, "step": 20294, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957026 }, { "epoch": 0.659292466621187, "grad_norm": 0.3528362214565277, "learning_rate": 2.851934263775361e-06, "loss": 0.014889653772115707, "memory(GiB)": 21.48, "step": 20295, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.6593249520839425, "grad_norm": 0.5313100814819336, "learning_rate": 2.8514492200695932e-06, "loss": 0.028732096776366234, "memory(GiB)": 21.48, "step": 20296, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.957033 }, { "epoch": 0.6593574375466978, "grad_norm": 0.37876319885253906, "learning_rate": 2.850964201160959e-06, "loss": 0.014149222522974014, "memory(GiB)": 21.48, "step": 20297, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957039 }, { "epoch": 0.6593899230094533, "grad_norm": 0.3678427040576935, "learning_rate": 2.850479207055058e-06, "loss": 0.016071785241365433, "memory(GiB)": 21.48, "step": 20298, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.6594224084722087, "grad_norm": 0.36555561423301697, "learning_rate": 2.8499942377574884e-06, "loss": 0.01694406196475029, "memory(GiB)": 21.48, "step": 20299, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.6594548939349641, "grad_norm": 1.5056203603744507, "learning_rate": 2.8495092932738493e-06, "loss": 0.018898583948612213, "memory(GiB)": 21.48, "step": 20300, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.6594873793977195, "grad_norm": 0.497464656829834, "learning_rate": 2.849024373609733e-06, "loss": 0.01692385971546173, "memory(GiB)": 21.48, "step": 20301, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.659519864860475, "grad_norm": 0.28536322712898254, "learning_rate": 2.8485394787707383e-06, "loss": 0.011617050506174564, "memory(GiB)": 21.48, "step": 20302, "token_acc": 1.0, "train_speed(iter/s)": 0.95707 }, { "epoch": 0.6595523503232303, "grad_norm": 0.24071648716926575, "learning_rate": 2.8480546087624618e-06, "loss": 0.010449603199958801, "memory(GiB)": 21.48, "step": 20303, "token_acc": 1.0, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.6595848357859858, "grad_norm": 0.36012035608291626, "learning_rate": 2.8475697635905007e-06, "loss": 0.01158122532069683, "memory(GiB)": 21.48, "step": 20304, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957082 }, { "epoch": 0.6596173212487412, "grad_norm": 0.42324507236480713, "learning_rate": 2.847084943260448e-06, "loss": 0.014273343607783318, "memory(GiB)": 21.48, "step": 20305, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.6596498067114966, "grad_norm": 0.6171102523803711, "learning_rate": 2.8466001477779026e-06, "loss": 0.01216871291399002, "memory(GiB)": 21.48, "step": 20306, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957089 }, { "epoch": 0.659682292174252, "grad_norm": 0.3087906241416931, "learning_rate": 2.846115377148453e-06, "loss": 0.015309221111238003, "memory(GiB)": 21.48, "step": 20307, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957093 }, { "epoch": 0.6597147776370075, "grad_norm": 0.43083158135414124, "learning_rate": 2.8456306313777036e-06, "loss": 0.01568145491182804, "memory(GiB)": 21.48, "step": 20308, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.6597472630997628, "grad_norm": 0.28515228629112244, "learning_rate": 2.845145910471242e-06, "loss": 0.014189837500452995, "memory(GiB)": 21.48, "step": 20309, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957099 }, { "epoch": 0.6597797485625183, "grad_norm": 0.3590380847454071, "learning_rate": 2.844661214434664e-06, "loss": 0.019864646717905998, "memory(GiB)": 21.48, "step": 20310, "token_acc": 1.0, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.6598122340252737, "grad_norm": 0.4006940424442291, "learning_rate": 2.844176543273567e-06, "loss": 0.016380513086915016, "memory(GiB)": 21.48, "step": 20311, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.6598447194880291, "grad_norm": 0.2738310694694519, "learning_rate": 2.8436918969935377e-06, "loss": 0.01209631934762001, "memory(GiB)": 21.48, "step": 20312, "token_acc": 1.0, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.6598772049507845, "grad_norm": 0.33595913648605347, "learning_rate": 2.8432072756001783e-06, "loss": 0.016647523269057274, "memory(GiB)": 21.48, "step": 20313, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.65990969041354, "grad_norm": 0.37781423330307007, "learning_rate": 2.842722679099076e-06, "loss": 0.013416038826107979, "memory(GiB)": 21.48, "step": 20314, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.6599421758762953, "grad_norm": 0.35916003584861755, "learning_rate": 2.842238107495826e-06, "loss": 0.014349715784192085, "memory(GiB)": 21.48, "step": 20315, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.6599746613390508, "grad_norm": 4.987963676452637, "learning_rate": 2.8417535607960197e-06, "loss": 0.012556057423353195, "memory(GiB)": 21.48, "step": 20316, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.6600071468018062, "grad_norm": 0.3323471248149872, "learning_rate": 2.841269039005249e-06, "loss": 0.015588698908686638, "memory(GiB)": 21.48, "step": 20317, "token_acc": 1.0, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.6600396322645616, "grad_norm": 0.3421385586261749, "learning_rate": 2.840784542129107e-06, "loss": 0.017787780612707138, "memory(GiB)": 21.48, "step": 20318, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95717 }, { "epoch": 0.660072117727317, "grad_norm": 0.3887091279029846, "learning_rate": 2.8403000701731875e-06, "loss": 0.02316533774137497, "memory(GiB)": 21.48, "step": 20319, "token_acc": 1.0, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.6601046031900725, "grad_norm": 0.2998430132865906, "learning_rate": 2.8398156231430773e-06, "loss": 0.008664997294545174, "memory(GiB)": 21.48, "step": 20320, "token_acc": 1.0, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.6601370886528278, "grad_norm": 0.3336358964443207, "learning_rate": 2.8393312010443706e-06, "loss": 0.01451933104544878, "memory(GiB)": 21.48, "step": 20321, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.6601695741155833, "grad_norm": 0.38866084814071655, "learning_rate": 2.8388468038826578e-06, "loss": 0.019362643361091614, "memory(GiB)": 21.48, "step": 20322, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.6602020595783387, "grad_norm": 0.4170876443386078, "learning_rate": 2.8383624316635304e-06, "loss": 0.0193614698946476, "memory(GiB)": 21.48, "step": 20323, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.6602345450410941, "grad_norm": 0.38039088249206543, "learning_rate": 2.837878084392577e-06, "loss": 0.012409349903464317, "memory(GiB)": 21.48, "step": 20324, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.6602670305038495, "grad_norm": 0.27782654762268066, "learning_rate": 2.8373937620753874e-06, "loss": 0.010513769462704659, "memory(GiB)": 21.48, "step": 20325, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957239 }, { "epoch": 0.660299515966605, "grad_norm": 0.34059253334999084, "learning_rate": 2.836909464717552e-06, "loss": 0.013927919790148735, "memory(GiB)": 21.48, "step": 20326, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.6603320014293603, "grad_norm": 0.35489821434020996, "learning_rate": 2.836425192324661e-06, "loss": 0.017218708992004395, "memory(GiB)": 21.48, "step": 20327, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.6603644868921158, "grad_norm": 0.4898199439048767, "learning_rate": 2.8359409449023045e-06, "loss": 0.020657764747738838, "memory(GiB)": 21.48, "step": 20328, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.6603969723548712, "grad_norm": 0.32282736897468567, "learning_rate": 2.835456722456068e-06, "loss": 0.013115352019667625, "memory(GiB)": 21.48, "step": 20329, "token_acc": 0.9896193771626297, "train_speed(iter/s)": 0.957279 }, { "epoch": 0.6604294578176266, "grad_norm": 0.4282180666923523, "learning_rate": 2.834972524991543e-06, "loss": 0.017735052853822708, "memory(GiB)": 21.48, "step": 20330, "token_acc": 1.0, "train_speed(iter/s)": 0.957289 }, { "epoch": 0.660461943280382, "grad_norm": 0.3245721459388733, "learning_rate": 2.8344883525143153e-06, "loss": 0.014132465235888958, "memory(GiB)": 21.48, "step": 20331, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957299 }, { "epoch": 0.6604944287431375, "grad_norm": 0.3660661280155182, "learning_rate": 2.8340042050299767e-06, "loss": 0.01543472521007061, "memory(GiB)": 21.48, "step": 20332, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.6605269142058928, "grad_norm": 0.45825666189193726, "learning_rate": 2.8335200825441105e-06, "loss": 0.016083255410194397, "memory(GiB)": 21.48, "step": 20333, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957319 }, { "epoch": 0.6605593996686483, "grad_norm": 0.28033366799354553, "learning_rate": 2.833035985062308e-06, "loss": 0.011596126481890678, "memory(GiB)": 21.48, "step": 20334, "token_acc": 1.0, "train_speed(iter/s)": 0.957329 }, { "epoch": 0.6605918851314037, "grad_norm": 0.3505212962627411, "learning_rate": 2.8325519125901514e-06, "loss": 0.01621578074991703, "memory(GiB)": 21.48, "step": 20335, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.957339 }, { "epoch": 0.6606243705941591, "grad_norm": 0.5143652558326721, "learning_rate": 2.8320678651332334e-06, "loss": 0.02014419436454773, "memory(GiB)": 21.48, "step": 20336, "token_acc": 1.0, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.6606568560569145, "grad_norm": 0.4612952172756195, "learning_rate": 2.8315838426971367e-06, "loss": 0.01602325029671192, "memory(GiB)": 21.48, "step": 20337, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.957358 }, { "epoch": 0.66068934151967, "grad_norm": 0.550154983997345, "learning_rate": 2.8310998452874496e-06, "loss": 0.017146429046988487, "memory(GiB)": 21.48, "step": 20338, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.6607218269824253, "grad_norm": 0.5066058039665222, "learning_rate": 2.8306158729097553e-06, "loss": 0.023395828902721405, "memory(GiB)": 21.48, "step": 20339, "token_acc": 0.9767441860465116, "train_speed(iter/s)": 0.957378 }, { "epoch": 0.6607543124451808, "grad_norm": 0.47768476605415344, "learning_rate": 2.8301319255696422e-06, "loss": 0.01759899966418743, "memory(GiB)": 21.48, "step": 20340, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.6607867979079362, "grad_norm": 0.3540281057357788, "learning_rate": 2.8296480032726935e-06, "loss": 0.017699094489216805, "memory(GiB)": 21.48, "step": 20341, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957398 }, { "epoch": 0.6608192833706916, "grad_norm": 0.3142656087875366, "learning_rate": 2.8291641060244966e-06, "loss": 0.017501192167401314, "memory(GiB)": 21.48, "step": 20342, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.660851768833447, "grad_norm": 0.6565952301025391, "learning_rate": 2.8286802338306372e-06, "loss": 0.01997550018131733, "memory(GiB)": 21.48, "step": 20343, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.6608842542962025, "grad_norm": 0.32208502292633057, "learning_rate": 2.8281963866966953e-06, "loss": 0.021215975284576416, "memory(GiB)": 21.48, "step": 20344, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957427 }, { "epoch": 0.6609167397589578, "grad_norm": 0.5121150016784668, "learning_rate": 2.827712564628258e-06, "loss": 0.014326389878988266, "memory(GiB)": 21.48, "step": 20345, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.6609492252217133, "grad_norm": 0.3767763078212738, "learning_rate": 2.8272287676309096e-06, "loss": 0.017456762492656708, "memory(GiB)": 21.48, "step": 20346, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.6609817106844686, "grad_norm": 0.3276887536048889, "learning_rate": 2.8267449957102344e-06, "loss": 0.01274176873266697, "memory(GiB)": 21.48, "step": 20347, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957456 }, { "epoch": 0.6610141961472241, "grad_norm": 0.4025864899158478, "learning_rate": 2.8262612488718133e-06, "loss": 0.017894916236400604, "memory(GiB)": 21.48, "step": 20348, "token_acc": 0.988, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.6610466816099795, "grad_norm": 0.3165297210216522, "learning_rate": 2.8257775271212295e-06, "loss": 0.013148821890354156, "memory(GiB)": 21.48, "step": 20349, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957476 }, { "epoch": 0.661079167072735, "grad_norm": 0.2774081230163574, "learning_rate": 2.8252938304640675e-06, "loss": 0.014734874479472637, "memory(GiB)": 21.48, "step": 20350, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957486 }, { "epoch": 0.6611116525354903, "grad_norm": 0.4742739200592041, "learning_rate": 2.8248101589059117e-06, "loss": 0.017002597451210022, "memory(GiB)": 21.48, "step": 20351, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.6611441379982458, "grad_norm": 0.5942801237106323, "learning_rate": 2.8243265124523396e-06, "loss": 0.01080454234033823, "memory(GiB)": 21.48, "step": 20352, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.6611766234610013, "grad_norm": 0.27205297350883484, "learning_rate": 2.823842891108935e-06, "loss": 0.014427650719881058, "memory(GiB)": 21.48, "step": 20353, "token_acc": 1.0, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.6612091089237566, "grad_norm": 0.3277813494205475, "learning_rate": 2.8233592948812805e-06, "loss": 0.015820929780602455, "memory(GiB)": 21.48, "step": 20354, "token_acc": 1.0, "train_speed(iter/s)": 0.957525 }, { "epoch": 0.6612415943865121, "grad_norm": 0.38445132970809937, "learning_rate": 2.8228757237749583e-06, "loss": 0.01850152388215065, "memory(GiB)": 21.48, "step": 20355, "token_acc": 1.0, "train_speed(iter/s)": 0.957532 }, { "epoch": 0.6612740798492674, "grad_norm": 0.2573932111263275, "learning_rate": 2.822392177795546e-06, "loss": 0.015725160017609596, "memory(GiB)": 21.48, "step": 20356, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957538 }, { "epoch": 0.6613065653120229, "grad_norm": 0.3781011998653412, "learning_rate": 2.8219086569486266e-06, "loss": 0.012219475582242012, "memory(GiB)": 21.48, "step": 20357, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957545 }, { "epoch": 0.6613390507747783, "grad_norm": 0.39548274874687195, "learning_rate": 2.82142516123978e-06, "loss": 0.01611904427409172, "memory(GiB)": 21.48, "step": 20358, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.6613715362375338, "grad_norm": 0.45658060908317566, "learning_rate": 2.820941690674587e-06, "loss": 0.023302435874938965, "memory(GiB)": 21.48, "step": 20359, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.6614040217002891, "grad_norm": 0.28288084268569946, "learning_rate": 2.820458245258629e-06, "loss": 0.011983477510511875, "memory(GiB)": 21.48, "step": 20360, "token_acc": 1.0, "train_speed(iter/s)": 0.957565 }, { "epoch": 0.6614365071630446, "grad_norm": 0.3841990828514099, "learning_rate": 2.8199748249974824e-06, "loss": 0.01684989221394062, "memory(GiB)": 21.48, "step": 20361, "token_acc": 0.976, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.6614689926258, "grad_norm": 0.34966668486595154, "learning_rate": 2.8194914298967292e-06, "loss": 0.01603289321064949, "memory(GiB)": 21.48, "step": 20362, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.6615014780885554, "grad_norm": 0.3537590801715851, "learning_rate": 2.8190080599619433e-06, "loss": 0.01666167750954628, "memory(GiB)": 21.48, "step": 20363, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.6615339635513108, "grad_norm": 0.4329116642475128, "learning_rate": 2.8185247151987116e-06, "loss": 0.02142719365656376, "memory(GiB)": 21.48, "step": 20364, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.6615664490140662, "grad_norm": 0.36892083287239075, "learning_rate": 2.8180413956126063e-06, "loss": 0.015856582671403885, "memory(GiB)": 21.48, "step": 20365, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957597 }, { "epoch": 0.6615989344768216, "grad_norm": 0.3731749653816223, "learning_rate": 2.817558101209209e-06, "loss": 0.01770031824707985, "memory(GiB)": 21.48, "step": 20366, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.6616314199395771, "grad_norm": 0.3174283802509308, "learning_rate": 2.8170748319940943e-06, "loss": 0.013198585249483585, "memory(GiB)": 21.48, "step": 20367, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95761 }, { "epoch": 0.6616639054023324, "grad_norm": 0.3142683207988739, "learning_rate": 2.816591587972841e-06, "loss": 0.012467635795474052, "memory(GiB)": 21.48, "step": 20368, "token_acc": 1.0, "train_speed(iter/s)": 0.957616 }, { "epoch": 0.6616963908650879, "grad_norm": 0.26608264446258545, "learning_rate": 2.816108369151028e-06, "loss": 0.014436621218919754, "memory(GiB)": 21.48, "step": 20369, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.6617288763278433, "grad_norm": 0.2879874110221863, "learning_rate": 2.8156251755342323e-06, "loss": 0.010496594943106174, "memory(GiB)": 21.48, "step": 20370, "token_acc": 1.0, "train_speed(iter/s)": 0.957627 }, { "epoch": 0.6617613617905987, "grad_norm": 0.40542128682136536, "learning_rate": 2.8151420071280277e-06, "loss": 0.01408575102686882, "memory(GiB)": 21.48, "step": 20371, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957634 }, { "epoch": 0.6617938472533541, "grad_norm": 0.5041263103485107, "learning_rate": 2.8146588639379923e-06, "loss": 0.0135630052536726, "memory(GiB)": 21.48, "step": 20372, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.6618263327161096, "grad_norm": 0.43579939007759094, "learning_rate": 2.814175745969702e-06, "loss": 0.019154010340571404, "memory(GiB)": 21.48, "step": 20373, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957646 }, { "epoch": 0.6618588181788649, "grad_norm": 0.352458655834198, "learning_rate": 2.8136926532287334e-06, "loss": 0.01647479645907879, "memory(GiB)": 21.48, "step": 20374, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.6618913036416204, "grad_norm": 0.25465700030326843, "learning_rate": 2.813209585720663e-06, "loss": 0.01120038889348507, "memory(GiB)": 21.48, "step": 20375, "token_acc": 1.0, "train_speed(iter/s)": 0.957662 }, { "epoch": 0.6619237891043758, "grad_norm": 0.33641791343688965, "learning_rate": 2.8127265434510637e-06, "loss": 0.015843786299228668, "memory(GiB)": 21.48, "step": 20376, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.6619562745671312, "grad_norm": 0.39182281494140625, "learning_rate": 2.8122435264255105e-06, "loss": 0.02156836912035942, "memory(GiB)": 21.48, "step": 20377, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.6619887600298866, "grad_norm": 0.2980075478553772, "learning_rate": 2.811760534649579e-06, "loss": 0.012323508970439434, "memory(GiB)": 21.48, "step": 20378, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957688 }, { "epoch": 0.6620212454926421, "grad_norm": 0.27401232719421387, "learning_rate": 2.8112775681288456e-06, "loss": 0.015386591665446758, "memory(GiB)": 21.48, "step": 20379, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.6620537309553974, "grad_norm": 0.349118173122406, "learning_rate": 2.81079462686888e-06, "loss": 0.016399100422859192, "memory(GiB)": 21.48, "step": 20380, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957708 }, { "epoch": 0.6620862164181529, "grad_norm": 0.3806593418121338, "learning_rate": 2.8103117108752586e-06, "loss": 0.015199181623756886, "memory(GiB)": 21.48, "step": 20381, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.6621187018809083, "grad_norm": 0.4289543628692627, "learning_rate": 2.809828820153554e-06, "loss": 0.02069253847002983, "memory(GiB)": 21.48, "step": 20382, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957728 }, { "epoch": 0.6621511873436637, "grad_norm": 0.2904767096042633, "learning_rate": 2.8093459547093426e-06, "loss": 0.01586976647377014, "memory(GiB)": 21.48, "step": 20383, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957738 }, { "epoch": 0.6621836728064191, "grad_norm": 0.4294460713863373, "learning_rate": 2.8088631145481927e-06, "loss": 0.01952877640724182, "memory(GiB)": 21.48, "step": 20384, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.6622161582691746, "grad_norm": 0.3490591049194336, "learning_rate": 2.80838029967568e-06, "loss": 0.01940484531223774, "memory(GiB)": 21.48, "step": 20385, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957758 }, { "epoch": 0.6622486437319299, "grad_norm": 0.3918720781803131, "learning_rate": 2.8078975100973723e-06, "loss": 0.020428698509931564, "memory(GiB)": 21.48, "step": 20386, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.6622811291946854, "grad_norm": 0.3597104847431183, "learning_rate": 2.8074147458188467e-06, "loss": 0.016069354489445686, "memory(GiB)": 21.48, "step": 20387, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.6623136146574408, "grad_norm": 0.4044753313064575, "learning_rate": 2.806932006845675e-06, "loss": 0.01627064310014248, "memory(GiB)": 21.48, "step": 20388, "token_acc": 1.0, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.6623461001201962, "grad_norm": 0.39251309633255005, "learning_rate": 2.8064492931834257e-06, "loss": 0.015948990359902382, "memory(GiB)": 21.48, "step": 20389, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.6623785855829516, "grad_norm": 0.46948498487472534, "learning_rate": 2.805966604837673e-06, "loss": 0.019478287547826767, "memory(GiB)": 21.48, "step": 20390, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.6624110710457071, "grad_norm": 0.3074447512626648, "learning_rate": 2.805483941813981e-06, "loss": 0.018104735761880875, "memory(GiB)": 21.48, "step": 20391, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.6624435565084624, "grad_norm": 0.38046926259994507, "learning_rate": 2.80500130411793e-06, "loss": 0.018185313791036606, "memory(GiB)": 21.48, "step": 20392, "token_acc": 0.988, "train_speed(iter/s)": 0.957829 }, { "epoch": 0.6624760419712179, "grad_norm": 0.36166346073150635, "learning_rate": 2.804518691755084e-06, "loss": 0.020942162722349167, "memory(GiB)": 21.48, "step": 20393, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957839 }, { "epoch": 0.6625085274339733, "grad_norm": 0.36877670884132385, "learning_rate": 2.804036104731016e-06, "loss": 0.017167391255497932, "memory(GiB)": 21.48, "step": 20394, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.6625410128967287, "grad_norm": 0.5001282691955566, "learning_rate": 2.8035535430512915e-06, "loss": 0.014873404987156391, "memory(GiB)": 21.48, "step": 20395, "token_acc": 1.0, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.6625734983594841, "grad_norm": 0.38771185278892517, "learning_rate": 2.803071006721483e-06, "loss": 0.02501954697072506, "memory(GiB)": 21.48, "step": 20396, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957867 }, { "epoch": 0.6626059838222396, "grad_norm": 0.29400911927223206, "learning_rate": 2.8025884957471592e-06, "loss": 0.011157184839248657, "memory(GiB)": 21.48, "step": 20397, "token_acc": 1.0, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.6626384692849949, "grad_norm": 0.3114734888076782, "learning_rate": 2.8021060101338916e-06, "loss": 0.014700127765536308, "memory(GiB)": 21.48, "step": 20398, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957887 }, { "epoch": 0.6626709547477504, "grad_norm": 0.2579261064529419, "learning_rate": 2.8016235498872435e-06, "loss": 0.014685830101370811, "memory(GiB)": 21.48, "step": 20399, "token_acc": 1.0, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.6627034402105058, "grad_norm": 0.45071232318878174, "learning_rate": 2.8011411150127855e-06, "loss": 0.012629814445972443, "memory(GiB)": 21.48, "step": 20400, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.6627359256732612, "grad_norm": 0.4323527216911316, "learning_rate": 2.800658705516086e-06, "loss": 0.016222547739744186, "memory(GiB)": 21.48, "step": 20401, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957917 }, { "epoch": 0.6627684111360166, "grad_norm": 0.3299291133880615, "learning_rate": 2.8001763214027124e-06, "loss": 0.011328051798045635, "memory(GiB)": 21.48, "step": 20402, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957926 }, { "epoch": 0.6628008965987721, "grad_norm": 0.1999979317188263, "learning_rate": 2.7996939626782337e-06, "loss": 0.012229612097144127, "memory(GiB)": 21.48, "step": 20403, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.6628333820615274, "grad_norm": 0.3161972761154175, "learning_rate": 2.799211629348213e-06, "loss": 0.01670074090361595, "memory(GiB)": 21.48, "step": 20404, "token_acc": 1.0, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.6628658675242829, "grad_norm": 0.27300164103507996, "learning_rate": 2.79872932141822e-06, "loss": 0.010831587016582489, "memory(GiB)": 21.48, "step": 20405, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957956 }, { "epoch": 0.6628983529870383, "grad_norm": 0.2974649965763092, "learning_rate": 2.79824703889382e-06, "loss": 0.015323552303016186, "memory(GiB)": 21.48, "step": 20406, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.6629308384497937, "grad_norm": 0.2667929232120514, "learning_rate": 2.797764781780582e-06, "loss": 0.011093534529209137, "memory(GiB)": 21.48, "step": 20407, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.6629633239125491, "grad_norm": 0.39930039644241333, "learning_rate": 2.797282550084067e-06, "loss": 0.014867383986711502, "memory(GiB)": 21.48, "step": 20408, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.6629958093753046, "grad_norm": 0.6389934420585632, "learning_rate": 2.7968003438098445e-06, "loss": 0.009905297309160233, "memory(GiB)": 21.48, "step": 20409, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.6630282948380599, "grad_norm": 0.1794194132089615, "learning_rate": 2.7963181629634774e-06, "loss": 0.009065249934792519, "memory(GiB)": 21.48, "step": 20410, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.6630607803008154, "grad_norm": 0.343671977519989, "learning_rate": 2.795836007550534e-06, "loss": 0.01942596212029457, "memory(GiB)": 21.48, "step": 20411, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.6630932657635707, "grad_norm": 0.25905758142471313, "learning_rate": 2.7953538775765756e-06, "loss": 0.01179293543100357, "memory(GiB)": 21.48, "step": 20412, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.6631257512263262, "grad_norm": 0.3557341396808624, "learning_rate": 2.7948717730471684e-06, "loss": 0.019291184842586517, "memory(GiB)": 21.48, "step": 20413, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.6631582366890816, "grad_norm": 0.5766858458518982, "learning_rate": 2.794389693967873e-06, "loss": 0.016951218247413635, "memory(GiB)": 21.48, "step": 20414, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.663190722151837, "grad_norm": 0.46126919984817505, "learning_rate": 2.79390764034426e-06, "loss": 0.020113326609134674, "memory(GiB)": 21.48, "step": 20415, "token_acc": 0.992, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.6632232076145925, "grad_norm": 0.32308530807495117, "learning_rate": 2.793425612181887e-06, "loss": 0.013038605451583862, "memory(GiB)": 21.48, "step": 20416, "token_acc": 1.0, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.6632556930773479, "grad_norm": 0.27712294459342957, "learning_rate": 2.792943609486321e-06, "loss": 0.010789016261696815, "memory(GiB)": 21.48, "step": 20417, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.6632881785401034, "grad_norm": 0.3868010640144348, "learning_rate": 2.7924616322631216e-06, "loss": 0.015919947996735573, "memory(GiB)": 21.48, "step": 20418, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.6633206640028587, "grad_norm": 0.3629334270954132, "learning_rate": 2.791979680517851e-06, "loss": 0.019260533154010773, "memory(GiB)": 21.48, "step": 20419, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958079 }, { "epoch": 0.6633531494656142, "grad_norm": 0.5072759985923767, "learning_rate": 2.7914977542560784e-06, "loss": 0.017052508890628815, "memory(GiB)": 21.48, "step": 20420, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.6633856349283695, "grad_norm": 0.31698405742645264, "learning_rate": 2.7910158534833588e-06, "loss": 0.011960767209529877, "memory(GiB)": 21.48, "step": 20421, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.663418120391125, "grad_norm": 0.42198553681373596, "learning_rate": 2.7905339782052585e-06, "loss": 0.016553159803152084, "memory(GiB)": 21.48, "step": 20422, "token_acc": 0.9928571428571429, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.6634506058538804, "grad_norm": 0.3439491391181946, "learning_rate": 2.790052128427335e-06, "loss": 0.01237280759960413, "memory(GiB)": 21.48, "step": 20423, "token_acc": 1.0, "train_speed(iter/s)": 0.958106 }, { "epoch": 0.6634830913166359, "grad_norm": 0.3683668076992035, "learning_rate": 2.7895703041551512e-06, "loss": 0.015504974871873856, "memory(GiB)": 21.48, "step": 20424, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.958113 }, { "epoch": 0.6635155767793912, "grad_norm": 0.35421720147132874, "learning_rate": 2.7890885053942687e-06, "loss": 0.013627446256577969, "memory(GiB)": 21.48, "step": 20425, "token_acc": 0.996, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.6635480622421467, "grad_norm": 0.3012458384037018, "learning_rate": 2.7886067321502485e-06, "loss": 0.012774016708135605, "memory(GiB)": 21.48, "step": 20426, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958126 }, { "epoch": 0.663580547704902, "grad_norm": 0.35167545080184937, "learning_rate": 2.7881249844286483e-06, "loss": 0.018802806735038757, "memory(GiB)": 21.48, "step": 20427, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.958133 }, { "epoch": 0.6636130331676575, "grad_norm": 0.3208037316799164, "learning_rate": 2.7876432622350303e-06, "loss": 0.014006132259964943, "memory(GiB)": 21.48, "step": 20428, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958139 }, { "epoch": 0.6636455186304129, "grad_norm": 0.33215010166168213, "learning_rate": 2.7871615655749532e-06, "loss": 0.012087310664355755, "memory(GiB)": 21.48, "step": 20429, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958146 }, { "epoch": 0.6636780040931683, "grad_norm": 0.30828857421875, "learning_rate": 2.7866798944539796e-06, "loss": 0.0102011077105999, "memory(GiB)": 21.48, "step": 20430, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958152 }, { "epoch": 0.6637104895559237, "grad_norm": 1.0605131387710571, "learning_rate": 2.786198248877663e-06, "loss": 0.02279343456029892, "memory(GiB)": 21.48, "step": 20431, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.6637429750186792, "grad_norm": 0.244192972779274, "learning_rate": 2.785716628851566e-06, "loss": 0.014380158856511116, "memory(GiB)": 21.48, "step": 20432, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.6637754604814345, "grad_norm": 0.3599323034286499, "learning_rate": 2.7852350343812456e-06, "loss": 0.015336582437157631, "memory(GiB)": 21.48, "step": 20433, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958171 }, { "epoch": 0.66380794594419, "grad_norm": 0.34554678201675415, "learning_rate": 2.784753465472261e-06, "loss": 0.013787000440061092, "memory(GiB)": 21.48, "step": 20434, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.6638404314069454, "grad_norm": 0.3887459933757782, "learning_rate": 2.784271922130172e-06, "loss": 0.017276853322982788, "memory(GiB)": 21.48, "step": 20435, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.958185 }, { "epoch": 0.6638729168697008, "grad_norm": 0.8938596844673157, "learning_rate": 2.7837904043605322e-06, "loss": 0.02239408530294895, "memory(GiB)": 21.48, "step": 20436, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.958193 }, { "epoch": 0.6639054023324562, "grad_norm": 0.3721007704734802, "learning_rate": 2.783308912168901e-06, "loss": 0.015873178839683533, "memory(GiB)": 21.48, "step": 20437, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.6639378877952117, "grad_norm": 0.37134984135627747, "learning_rate": 2.782827445560835e-06, "loss": 0.011897317133843899, "memory(GiB)": 21.48, "step": 20438, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.663970373257967, "grad_norm": 0.6172013878822327, "learning_rate": 2.7823460045418936e-06, "loss": 0.011122006922960281, "memory(GiB)": 21.48, "step": 20439, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.6640028587207225, "grad_norm": 0.3626737892627716, "learning_rate": 2.781864589117629e-06, "loss": 0.017114102840423584, "memory(GiB)": 21.48, "step": 20440, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.6640353441834779, "grad_norm": 0.31207919120788574, "learning_rate": 2.7813831992936014e-06, "loss": 0.013483097776770592, "memory(GiB)": 21.48, "step": 20441, "token_acc": 1.0, "train_speed(iter/s)": 0.958233 }, { "epoch": 0.6640678296462333, "grad_norm": 0.4324585497379303, "learning_rate": 2.7809018350753604e-06, "loss": 0.018445953726768494, "memory(GiB)": 21.48, "step": 20442, "token_acc": 1.0, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.6641003151089887, "grad_norm": 0.41659557819366455, "learning_rate": 2.78042049646847e-06, "loss": 0.015215601772069931, "memory(GiB)": 21.48, "step": 20443, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958253 }, { "epoch": 0.6641328005717442, "grad_norm": 0.4031558036804199, "learning_rate": 2.7799391834784794e-06, "loss": 0.016036128625273705, "memory(GiB)": 21.48, "step": 20444, "token_acc": 1.0, "train_speed(iter/s)": 0.958263 }, { "epoch": 0.6641652860344995, "grad_norm": 0.29879528284072876, "learning_rate": 2.7794578961109473e-06, "loss": 0.014958811923861504, "memory(GiB)": 21.48, "step": 20445, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.664197771497255, "grad_norm": 0.630695641040802, "learning_rate": 2.778976634371425e-06, "loss": 0.01669694483280182, "memory(GiB)": 21.48, "step": 20446, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.6642302569600104, "grad_norm": 0.29658347368240356, "learning_rate": 2.7784953982654684e-06, "loss": 0.014667140319943428, "memory(GiB)": 21.48, "step": 20447, "token_acc": 1.0, "train_speed(iter/s)": 0.958293 }, { "epoch": 0.6642627424227658, "grad_norm": 0.39487510919570923, "learning_rate": 2.778014187798631e-06, "loss": 0.01674518920481205, "memory(GiB)": 21.48, "step": 20448, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.958302 }, { "epoch": 0.6642952278855212, "grad_norm": 0.3315563499927521, "learning_rate": 2.777533002976468e-06, "loss": 0.023264504969120026, "memory(GiB)": 21.48, "step": 20449, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.958313 }, { "epoch": 0.6643277133482767, "grad_norm": 0.5010443329811096, "learning_rate": 2.7770518438045347e-06, "loss": 0.015596533194184303, "memory(GiB)": 21.48, "step": 20450, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.958323 }, { "epoch": 0.664360198811032, "grad_norm": 0.5473771095275879, "learning_rate": 2.776570710288379e-06, "loss": 0.012711884453892708, "memory(GiB)": 21.48, "step": 20451, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958332 }, { "epoch": 0.6643926842737875, "grad_norm": 0.3021503984928131, "learning_rate": 2.776089602433557e-06, "loss": 0.013025260530412197, "memory(GiB)": 21.48, "step": 20452, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.6644251697365429, "grad_norm": 0.37385979294776917, "learning_rate": 2.7756085202456206e-06, "loss": 0.01446103397756815, "memory(GiB)": 21.48, "step": 20453, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.6644576551992983, "grad_norm": 0.3753691613674164, "learning_rate": 2.775127463730125e-06, "loss": 0.01266641728579998, "memory(GiB)": 21.48, "step": 20454, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.6644901406620537, "grad_norm": 0.240458145737648, "learning_rate": 2.774646432892618e-06, "loss": 0.012373756617307663, "memory(GiB)": 21.48, "step": 20455, "token_acc": 1.0, "train_speed(iter/s)": 0.958372 }, { "epoch": 0.6645226261248092, "grad_norm": 0.304038405418396, "learning_rate": 2.7741654277386526e-06, "loss": 0.012629585340619087, "memory(GiB)": 21.48, "step": 20456, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.6645551115875645, "grad_norm": 0.35558968782424927, "learning_rate": 2.773684448273781e-06, "loss": 0.012123245745897293, "memory(GiB)": 21.48, "step": 20457, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.958391 }, { "epoch": 0.66458759705032, "grad_norm": 0.3915824890136719, "learning_rate": 2.7732034945035557e-06, "loss": 0.019645871594548225, "memory(GiB)": 21.48, "step": 20458, "token_acc": 1.0, "train_speed(iter/s)": 0.9584 }, { "epoch": 0.6646200825130754, "grad_norm": 0.351100355386734, "learning_rate": 2.7727225664335247e-06, "loss": 0.017566869035363197, "memory(GiB)": 21.48, "step": 20459, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958409 }, { "epoch": 0.6646525679758308, "grad_norm": 0.2123222053050995, "learning_rate": 2.7722416640692397e-06, "loss": 0.00918213464319706, "memory(GiB)": 21.48, "step": 20460, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958419 }, { "epoch": 0.6646850534385862, "grad_norm": 0.3041435480117798, "learning_rate": 2.77176078741625e-06, "loss": 0.01256328634917736, "memory(GiB)": 21.48, "step": 20461, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.958429 }, { "epoch": 0.6647175389013417, "grad_norm": 0.25495287775993347, "learning_rate": 2.77127993648011e-06, "loss": 0.013730451464653015, "memory(GiB)": 21.48, "step": 20462, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.958439 }, { "epoch": 0.664750024364097, "grad_norm": 0.36567065119743347, "learning_rate": 2.7707991112663623e-06, "loss": 0.0176406092941761, "memory(GiB)": 21.48, "step": 20463, "token_acc": 0.98828125, "train_speed(iter/s)": 0.958448 }, { "epoch": 0.6647825098268525, "grad_norm": 0.4112792909145355, "learning_rate": 2.770318311780561e-06, "loss": 0.01953795924782753, "memory(GiB)": 21.48, "step": 20464, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958458 }, { "epoch": 0.6648149952896079, "grad_norm": 0.8497412800788879, "learning_rate": 2.7698375380282538e-06, "loss": 0.019829951226711273, "memory(GiB)": 21.48, "step": 20465, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958468 }, { "epoch": 0.6648474807523633, "grad_norm": 0.31921666860580444, "learning_rate": 2.7693567900149902e-06, "loss": 0.011704200878739357, "memory(GiB)": 21.48, "step": 20466, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.958478 }, { "epoch": 0.6648799662151187, "grad_norm": 0.38687095046043396, "learning_rate": 2.76887606774632e-06, "loss": 0.014844690449535847, "memory(GiB)": 21.48, "step": 20467, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958487 }, { "epoch": 0.6649124516778742, "grad_norm": 0.3766040503978729, "learning_rate": 2.7683953712277878e-06, "loss": 0.016422271728515625, "memory(GiB)": 21.48, "step": 20468, "token_acc": 1.0, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.6649449371406295, "grad_norm": 0.30861353874206543, "learning_rate": 2.767914700464944e-06, "loss": 0.011097416281700134, "memory(GiB)": 21.48, "step": 20469, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958507 }, { "epoch": 0.664977422603385, "grad_norm": 0.41324228048324585, "learning_rate": 2.767434055463333e-06, "loss": 0.0207419041544199, "memory(GiB)": 21.48, "step": 20470, "token_acc": 0.992, "train_speed(iter/s)": 0.958517 }, { "epoch": 0.6650099080661404, "grad_norm": 0.2772773504257202, "learning_rate": 2.766953436228508e-06, "loss": 0.005811781622469425, "memory(GiB)": 21.48, "step": 20471, "token_acc": 1.0, "train_speed(iter/s)": 0.958527 }, { "epoch": 0.6650423935288958, "grad_norm": 0.3170490562915802, "learning_rate": 2.7664728427660105e-06, "loss": 0.011771241202950478, "memory(GiB)": 21.48, "step": 20472, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.958511 }, { "epoch": 0.6650748789916512, "grad_norm": 0.3936609625816345, "learning_rate": 2.7659922750813913e-06, "loss": 0.013059355318546295, "memory(GiB)": 21.48, "step": 20473, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958521 }, { "epoch": 0.6651073644544067, "grad_norm": 0.3773048520088196, "learning_rate": 2.765511733180193e-06, "loss": 0.018749836832284927, "memory(GiB)": 21.48, "step": 20474, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958525 }, { "epoch": 0.665139849917162, "grad_norm": 0.23361451923847198, "learning_rate": 2.765031217067963e-06, "loss": 0.008909031748771667, "memory(GiB)": 21.48, "step": 20475, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.958531 }, { "epoch": 0.6651723353799175, "grad_norm": 0.44508805871009827, "learning_rate": 2.764550726750247e-06, "loss": 0.015550374984741211, "memory(GiB)": 21.48, "step": 20476, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.958538 }, { "epoch": 0.6652048208426729, "grad_norm": 0.491895467042923, "learning_rate": 2.7640702622325934e-06, "loss": 0.013950178399682045, "memory(GiB)": 21.48, "step": 20477, "token_acc": 1.0, "train_speed(iter/s)": 0.958545 }, { "epoch": 0.6652373063054283, "grad_norm": 0.3933993875980377, "learning_rate": 2.7635898235205425e-06, "loss": 0.019957834854722023, "memory(GiB)": 21.48, "step": 20478, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.958552 }, { "epoch": 0.6652697917681837, "grad_norm": 0.3842600882053375, "learning_rate": 2.763109410619643e-06, "loss": 0.01657191663980484, "memory(GiB)": 21.48, "step": 20479, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.958559 }, { "epoch": 0.6653022772309392, "grad_norm": 0.46821311116218567, "learning_rate": 2.762629023535437e-06, "loss": 0.021701490506529808, "memory(GiB)": 21.48, "step": 20480, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958565 }, { "epoch": 0.6653347626936946, "grad_norm": 0.3566136956214905, "learning_rate": 2.7621486622734704e-06, "loss": 0.014829877763986588, "memory(GiB)": 21.48, "step": 20481, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958571 }, { "epoch": 0.66536724815645, "grad_norm": 0.41963624954223633, "learning_rate": 2.7616683268392896e-06, "loss": 0.01527400128543377, "memory(GiB)": 21.48, "step": 20482, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.958577 }, { "epoch": 0.6653997336192055, "grad_norm": 0.3181959092617035, "learning_rate": 2.7611880172384325e-06, "loss": 0.013619753532111645, "memory(GiB)": 21.48, "step": 20483, "token_acc": 1.0, "train_speed(iter/s)": 0.958584 }, { "epoch": 0.6654322190819608, "grad_norm": 0.242954283952713, "learning_rate": 2.7607077334764455e-06, "loss": 0.012543529272079468, "memory(GiB)": 21.48, "step": 20484, "token_acc": 1.0, "train_speed(iter/s)": 0.95859 }, { "epoch": 0.6654647045447163, "grad_norm": 0.6679717302322388, "learning_rate": 2.760227475558872e-06, "loss": 0.022273538634181023, "memory(GiB)": 21.48, "step": 20485, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958597 }, { "epoch": 0.6654971900074717, "grad_norm": 0.45888471603393555, "learning_rate": 2.759747243491257e-06, "loss": 0.01715349778532982, "memory(GiB)": 21.48, "step": 20486, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958602 }, { "epoch": 0.6655296754702271, "grad_norm": 0.4833688735961914, "learning_rate": 2.759267037279138e-06, "loss": 0.02472498267889023, "memory(GiB)": 21.48, "step": 20487, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.6655621609329825, "grad_norm": 0.39886581897735596, "learning_rate": 2.75878685692806e-06, "loss": 0.009734990075230598, "memory(GiB)": 21.48, "step": 20488, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.958615 }, { "epoch": 0.665594646395738, "grad_norm": 0.37783733010292053, "learning_rate": 2.758306702443565e-06, "loss": 0.016783231869339943, "memory(GiB)": 21.48, "step": 20489, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958621 }, { "epoch": 0.6656271318584933, "grad_norm": 0.3047468662261963, "learning_rate": 2.7578265738311962e-06, "loss": 0.013194368220865726, "memory(GiB)": 21.48, "step": 20490, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.958628 }, { "epoch": 0.6656596173212488, "grad_norm": 0.26912420988082886, "learning_rate": 2.75734647109649e-06, "loss": 0.012766321189701557, "memory(GiB)": 21.48, "step": 20491, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958634 }, { "epoch": 0.6656921027840041, "grad_norm": 0.29771825671195984, "learning_rate": 2.756866394244994e-06, "loss": 0.012580415233969688, "memory(GiB)": 21.48, "step": 20492, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958641 }, { "epoch": 0.6657245882467596, "grad_norm": 0.3933276832103729, "learning_rate": 2.7563863432822412e-06, "loss": 0.015775609761476517, "memory(GiB)": 21.48, "step": 20493, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958648 }, { "epoch": 0.665757073709515, "grad_norm": 0.3762568533420563, "learning_rate": 2.7559063182137797e-06, "loss": 0.01671781949698925, "memory(GiB)": 21.48, "step": 20494, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958656 }, { "epoch": 0.6657895591722705, "grad_norm": 0.2585345506668091, "learning_rate": 2.7554263190451445e-06, "loss": 0.018501270562410355, "memory(GiB)": 21.48, "step": 20495, "token_acc": 1.0, "train_speed(iter/s)": 0.958664 }, { "epoch": 0.6658220446350258, "grad_norm": 0.5077545046806335, "learning_rate": 2.754946345781877e-06, "loss": 0.021363703534007072, "memory(GiB)": 21.48, "step": 20496, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.958671 }, { "epoch": 0.6658545300977813, "grad_norm": 0.3610813021659851, "learning_rate": 2.7544663984295185e-06, "loss": 0.014912723563611507, "memory(GiB)": 21.48, "step": 20497, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.958678 }, { "epoch": 0.6658870155605366, "grad_norm": 0.4174283444881439, "learning_rate": 2.753986476993603e-06, "loss": 0.018588483333587646, "memory(GiB)": 21.48, "step": 20498, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958685 }, { "epoch": 0.6659195010232921, "grad_norm": 0.4730064868927002, "learning_rate": 2.753506581479677e-06, "loss": 0.0166143961250782, "memory(GiB)": 21.48, "step": 20499, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958693 }, { "epoch": 0.6659519864860475, "grad_norm": 0.4236281216144562, "learning_rate": 2.753026711893273e-06, "loss": 0.018125897273421288, "memory(GiB)": 21.48, "step": 20500, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.958701 }, { "epoch": 0.6659519864860475, "eval_loss": 0.015685908496379852, "eval_runtime": 79.5342, "eval_samples_per_second": 125.103, "eval_steps_per_second": 3.91, "eval_token_acc": 0.993652150186111, "step": 20500 }, { "epoch": 0.665984471948803, "grad_norm": 0.39024654030799866, "learning_rate": 2.752546868239933e-06, "loss": 0.019174227491021156, "memory(GiB)": 21.48, "step": 20501, "token_acc": 0.9936313583696278, "train_speed(iter/s)": 0.95466 }, { "epoch": 0.6660169574115583, "grad_norm": 0.2765217125415802, "learning_rate": 2.752067050525192e-06, "loss": 0.014162156730890274, "memory(GiB)": 21.48, "step": 20502, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.954665 }, { "epoch": 0.6660494428743138, "grad_norm": 0.24463732540607452, "learning_rate": 2.751587258754589e-06, "loss": 0.010617120191454887, "memory(GiB)": 21.48, "step": 20503, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954673 }, { "epoch": 0.6660819283370691, "grad_norm": 0.40182167291641235, "learning_rate": 2.7511074929336623e-06, "loss": 0.01661074161529541, "memory(GiB)": 21.48, "step": 20504, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95468 }, { "epoch": 0.6661144137998246, "grad_norm": 0.3627292513847351, "learning_rate": 2.750627753067949e-06, "loss": 0.01846366375684738, "memory(GiB)": 21.48, "step": 20505, "token_acc": 1.0, "train_speed(iter/s)": 0.954686 }, { "epoch": 0.66614689926258, "grad_norm": 0.2704698145389557, "learning_rate": 2.750148039162985e-06, "loss": 0.012782519683241844, "memory(GiB)": 21.48, "step": 20506, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954694 }, { "epoch": 0.6661793847253354, "grad_norm": 0.35738512873649597, "learning_rate": 2.749668351224306e-06, "loss": 0.017294522374868393, "memory(GiB)": 21.48, "step": 20507, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954702 }, { "epoch": 0.6662118701880908, "grad_norm": 0.2935292422771454, "learning_rate": 2.74918868925745e-06, "loss": 0.012890806421637535, "memory(GiB)": 21.48, "step": 20508, "token_acc": 1.0, "train_speed(iter/s)": 0.954709 }, { "epoch": 0.6662443556508463, "grad_norm": 0.37529000639915466, "learning_rate": 2.748709053267954e-06, "loss": 0.019390910863876343, "memory(GiB)": 21.48, "step": 20509, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954716 }, { "epoch": 0.6662768411136016, "grad_norm": 0.5220459699630737, "learning_rate": 2.7482294432613503e-06, "loss": 0.023508716374635696, "memory(GiB)": 21.48, "step": 20510, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.954724 }, { "epoch": 0.6663093265763571, "grad_norm": 0.34610188007354736, "learning_rate": 2.747749859243176e-06, "loss": 0.017474491149187088, "memory(GiB)": 21.48, "step": 20511, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.954732 }, { "epoch": 0.6663418120391125, "grad_norm": 0.7813646793365479, "learning_rate": 2.7472703012189657e-06, "loss": 0.018651451915502548, "memory(GiB)": 21.48, "step": 20512, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.954739 }, { "epoch": 0.6663742975018679, "grad_norm": 0.34152859449386597, "learning_rate": 2.7467907691942547e-06, "loss": 0.01620541140437126, "memory(GiB)": 21.48, "step": 20513, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954747 }, { "epoch": 0.6664067829646233, "grad_norm": 0.37024572491645813, "learning_rate": 2.746311263174579e-06, "loss": 0.011988452635705471, "memory(GiB)": 21.48, "step": 20514, "token_acc": 1.0, "train_speed(iter/s)": 0.954755 }, { "epoch": 0.6664392684273788, "grad_norm": 0.44224536418914795, "learning_rate": 2.745831783165469e-06, "loss": 0.013904396444559097, "memory(GiB)": 21.48, "step": 20515, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.6664717538901341, "grad_norm": 0.29852426052093506, "learning_rate": 2.745352329172461e-06, "loss": 0.014649253338575363, "memory(GiB)": 21.48, "step": 20516, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.954768 }, { "epoch": 0.6665042393528896, "grad_norm": 0.3492473065853119, "learning_rate": 2.7448729012010878e-06, "loss": 0.014523357152938843, "memory(GiB)": 21.48, "step": 20517, "token_acc": 1.0, "train_speed(iter/s)": 0.954776 }, { "epoch": 0.666536724815645, "grad_norm": 0.337658166885376, "learning_rate": 2.7443934992568845e-06, "loss": 0.01554128061980009, "memory(GiB)": 21.48, "step": 20518, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.954782 }, { "epoch": 0.6665692102784004, "grad_norm": 0.42331549525260925, "learning_rate": 2.743914123345381e-06, "loss": 0.021517612040042877, "memory(GiB)": 21.48, "step": 20519, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95479 }, { "epoch": 0.6666016957411558, "grad_norm": 0.4419403374195099, "learning_rate": 2.7434347734721134e-06, "loss": 0.014968246221542358, "memory(GiB)": 21.48, "step": 20520, "token_acc": 1.0, "train_speed(iter/s)": 0.954797 }, { "epoch": 0.6666341812039113, "grad_norm": 0.38770604133605957, "learning_rate": 2.742955449642607e-06, "loss": 0.012750515714287758, "memory(GiB)": 21.48, "step": 20521, "token_acc": 1.0, "train_speed(iter/s)": 0.954805 }, { "epoch": 0.6666666666666666, "grad_norm": 0.43097442388534546, "learning_rate": 2.7424761518624036e-06, "loss": 0.019872890785336494, "memory(GiB)": 21.48, "step": 20522, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.954812 }, { "epoch": 0.6666991521294221, "grad_norm": 0.2098524421453476, "learning_rate": 2.741996880137028e-06, "loss": 0.011303124018013477, "memory(GiB)": 21.48, "step": 20523, "token_acc": 1.0, "train_speed(iter/s)": 0.95482 }, { "epoch": 0.6667316375921775, "grad_norm": 0.45303964614868164, "learning_rate": 2.741517634472016e-06, "loss": 0.01964547485113144, "memory(GiB)": 21.48, "step": 20524, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.954828 }, { "epoch": 0.6667641230549329, "grad_norm": 0.2478438764810562, "learning_rate": 2.741038414872895e-06, "loss": 0.010870043188333511, "memory(GiB)": 21.48, "step": 20525, "token_acc": 1.0, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.6667966085176883, "grad_norm": 0.35323983430862427, "learning_rate": 2.740559221345195e-06, "loss": 0.014421563595533371, "memory(GiB)": 21.48, "step": 20526, "token_acc": 1.0, "train_speed(iter/s)": 0.954843 }, { "epoch": 0.6668290939804438, "grad_norm": 0.39067697525024414, "learning_rate": 2.740080053894452e-06, "loss": 0.011639509350061417, "memory(GiB)": 21.48, "step": 20527, "token_acc": 0.9844357976653697, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.6668615794431991, "grad_norm": 0.4188612997531891, "learning_rate": 2.739600912526192e-06, "loss": 0.017625197768211365, "memory(GiB)": 21.48, "step": 20528, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954858 }, { "epoch": 0.6668940649059546, "grad_norm": 0.32357266545295715, "learning_rate": 2.739121797245948e-06, "loss": 0.017909720540046692, "memory(GiB)": 21.48, "step": 20529, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.66692655036871, "grad_norm": 0.42708849906921387, "learning_rate": 2.7386427080592456e-06, "loss": 0.019423846155405045, "memory(GiB)": 21.48, "step": 20530, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954872 }, { "epoch": 0.6669590358314654, "grad_norm": 0.44187891483306885, "learning_rate": 2.738163644971616e-06, "loss": 0.023401129990816116, "memory(GiB)": 21.48, "step": 20531, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.95488 }, { "epoch": 0.6669915212942208, "grad_norm": 0.7438791990280151, "learning_rate": 2.737684607988589e-06, "loss": 0.018809199333190918, "memory(GiB)": 21.48, "step": 20532, "token_acc": 1.0, "train_speed(iter/s)": 0.954887 }, { "epoch": 0.6670240067569763, "grad_norm": 0.3608330488204956, "learning_rate": 2.7372055971156942e-06, "loss": 0.016543742269277573, "memory(GiB)": 21.48, "step": 20533, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954895 }, { "epoch": 0.6670564922197316, "grad_norm": 0.34872499108314514, "learning_rate": 2.7367266123584563e-06, "loss": 0.01401593815535307, "memory(GiB)": 21.48, "step": 20534, "token_acc": 1.0, "train_speed(iter/s)": 0.954902 }, { "epoch": 0.6670889776824871, "grad_norm": 0.41441571712493896, "learning_rate": 2.7362476537224057e-06, "loss": 0.01631438545882702, "memory(GiB)": 21.48, "step": 20535, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954909 }, { "epoch": 0.6671214631452425, "grad_norm": 0.2878491282463074, "learning_rate": 2.735768721213071e-06, "loss": 0.012843001633882523, "memory(GiB)": 21.48, "step": 20536, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.6671539486079979, "grad_norm": 0.42566922307014465, "learning_rate": 2.735289814835981e-06, "loss": 0.01570584997534752, "memory(GiB)": 21.48, "step": 20537, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.954923 }, { "epoch": 0.6671864340707533, "grad_norm": 0.4951310455799103, "learning_rate": 2.7348109345966578e-06, "loss": 0.014963338151574135, "memory(GiB)": 21.48, "step": 20538, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.6672189195335088, "grad_norm": 0.5345538258552551, "learning_rate": 2.734332080500632e-06, "loss": 0.027231870219111443, "memory(GiB)": 21.48, "step": 20539, "token_acc": 0.98828125, "train_speed(iter/s)": 0.954939 }, { "epoch": 0.6672514049962641, "grad_norm": 0.3645505905151367, "learning_rate": 2.73385325255343e-06, "loss": 0.01514425314962864, "memory(GiB)": 21.48, "step": 20540, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954946 }, { "epoch": 0.6672838904590196, "grad_norm": 0.32365646958351135, "learning_rate": 2.7333744507605774e-06, "loss": 0.01473605539649725, "memory(GiB)": 21.48, "step": 20541, "token_acc": 1.0, "train_speed(iter/s)": 0.954955 }, { "epoch": 0.667316375921775, "grad_norm": 0.44261687994003296, "learning_rate": 2.7328956751276028e-06, "loss": 0.018752167001366615, "memory(GiB)": 21.48, "step": 20542, "token_acc": 1.0, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.6673488613845304, "grad_norm": 0.332976758480072, "learning_rate": 2.7324169256600265e-06, "loss": 0.016271350905299187, "memory(GiB)": 21.48, "step": 20543, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954973 }, { "epoch": 0.6673813468472859, "grad_norm": 0.3132878839969635, "learning_rate": 2.731938202363378e-06, "loss": 0.017170611768960953, "memory(GiB)": 21.48, "step": 20544, "token_acc": 1.0, "train_speed(iter/s)": 0.954982 }, { "epoch": 0.6674138323100413, "grad_norm": 0.48804256319999695, "learning_rate": 2.7314595052431814e-06, "loss": 0.019568882882595062, "memory(GiB)": 21.48, "step": 20545, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.6674463177727967, "grad_norm": 0.39159470796585083, "learning_rate": 2.730980834304963e-06, "loss": 0.014712469652295113, "memory(GiB)": 21.48, "step": 20546, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955001 }, { "epoch": 0.6674788032355521, "grad_norm": 0.4295531213283539, "learning_rate": 2.7305021895542443e-06, "loss": 0.017284248024225235, "memory(GiB)": 21.48, "step": 20547, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955011 }, { "epoch": 0.6675112886983076, "grad_norm": 0.3225877583026886, "learning_rate": 2.7300235709965528e-06, "loss": 0.014706876128911972, "memory(GiB)": 21.48, "step": 20548, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.6675437741610629, "grad_norm": 0.35776054859161377, "learning_rate": 2.729544978637407e-06, "loss": 0.016976995393633842, "memory(GiB)": 21.48, "step": 20549, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955019 }, { "epoch": 0.6675762596238184, "grad_norm": 0.38460394740104675, "learning_rate": 2.7290664124823374e-06, "loss": 0.01503962092101574, "memory(GiB)": 21.48, "step": 20550, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.955029 }, { "epoch": 0.6676087450865738, "grad_norm": 0.29378172755241394, "learning_rate": 2.7285878725368627e-06, "loss": 0.010763943195343018, "memory(GiB)": 21.48, "step": 20551, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955039 }, { "epoch": 0.6676412305493292, "grad_norm": 0.2883874773979187, "learning_rate": 2.728109358806509e-06, "loss": 0.013952970504760742, "memory(GiB)": 21.48, "step": 20552, "token_acc": 1.0, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.6676737160120846, "grad_norm": 0.39064186811447144, "learning_rate": 2.7276308712967946e-06, "loss": 0.022009754553437233, "memory(GiB)": 21.48, "step": 20553, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955059 }, { "epoch": 0.66770620147484, "grad_norm": 0.3800097405910492, "learning_rate": 2.7271524100132453e-06, "loss": 0.02383836917579174, "memory(GiB)": 21.48, "step": 20554, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955069 }, { "epoch": 0.6677386869375954, "grad_norm": 0.3408905565738678, "learning_rate": 2.7266739749613823e-06, "loss": 0.01064709760248661, "memory(GiB)": 21.48, "step": 20555, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.6677711724003509, "grad_norm": 0.2373833805322647, "learning_rate": 2.7261955661467286e-06, "loss": 0.012151667848229408, "memory(GiB)": 21.48, "step": 20556, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955088 }, { "epoch": 0.6678036578631062, "grad_norm": 0.6935339570045471, "learning_rate": 2.725717183574803e-06, "loss": 0.018259424716234207, "memory(GiB)": 21.48, "step": 20557, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.6678361433258617, "grad_norm": 0.37719616293907166, "learning_rate": 2.725238827251129e-06, "loss": 0.013257887214422226, "memory(GiB)": 21.48, "step": 20558, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.6678686287886171, "grad_norm": 0.24971245229244232, "learning_rate": 2.724760497181225e-06, "loss": 0.011999130249023438, "memory(GiB)": 21.48, "step": 20559, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.6679011142513726, "grad_norm": 0.267304390668869, "learning_rate": 2.724282193370614e-06, "loss": 0.011913023889064789, "memory(GiB)": 21.48, "step": 20560, "token_acc": 1.0, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.6679335997141279, "grad_norm": 0.41512244939804077, "learning_rate": 2.7238039158248174e-06, "loss": 0.024519294500350952, "memory(GiB)": 21.48, "step": 20561, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955127 }, { "epoch": 0.6679660851768834, "grad_norm": 0.4170685112476349, "learning_rate": 2.723325664549352e-06, "loss": 0.01445935107767582, "memory(GiB)": 21.48, "step": 20562, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955136 }, { "epoch": 0.6679985706396387, "grad_norm": 0.30653873085975647, "learning_rate": 2.7228474395497373e-06, "loss": 0.015441851690411568, "memory(GiB)": 21.48, "step": 20563, "token_acc": 1.0, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.6680310561023942, "grad_norm": 0.40099379420280457, "learning_rate": 2.722369240831495e-06, "loss": 0.013526768423616886, "memory(GiB)": 21.48, "step": 20564, "token_acc": 1.0, "train_speed(iter/s)": 0.955151 }, { "epoch": 0.6680635415651496, "grad_norm": 0.31492307782173157, "learning_rate": 2.7218910684001447e-06, "loss": 0.017153654247522354, "memory(GiB)": 21.48, "step": 20565, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955159 }, { "epoch": 0.668096027027905, "grad_norm": 0.33613088726997375, "learning_rate": 2.7214129222612016e-06, "loss": 0.01671052724123001, "memory(GiB)": 21.48, "step": 20566, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.6681285124906604, "grad_norm": 0.4360160231590271, "learning_rate": 2.7209348024201867e-06, "loss": 0.01957428641617298, "memory(GiB)": 21.48, "step": 20567, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955175 }, { "epoch": 0.6681609979534159, "grad_norm": 0.39373308420181274, "learning_rate": 2.7204567088826173e-06, "loss": 0.010664746165275574, "memory(GiB)": 21.48, "step": 20568, "token_acc": 1.0, "train_speed(iter/s)": 0.955182 }, { "epoch": 0.6681934834161712, "grad_norm": 0.5195609927177429, "learning_rate": 2.7199786416540138e-06, "loss": 0.017058081924915314, "memory(GiB)": 21.48, "step": 20569, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.6682259688789267, "grad_norm": 0.3081803619861603, "learning_rate": 2.7195006007398892e-06, "loss": 0.011316336691379547, "memory(GiB)": 21.48, "step": 20570, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.6682584543416821, "grad_norm": 0.33084219694137573, "learning_rate": 2.7190225861457648e-06, "loss": 0.01598609797656536, "memory(GiB)": 21.48, "step": 20571, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.6682909398044375, "grad_norm": 0.30550023913383484, "learning_rate": 2.718544597877152e-06, "loss": 0.018880613148212433, "memory(GiB)": 21.48, "step": 20572, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.6683234252671929, "grad_norm": 0.3447321951389313, "learning_rate": 2.718066635939573e-06, "loss": 0.013900242745876312, "memory(GiB)": 21.48, "step": 20573, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955222 }, { "epoch": 0.6683559107299484, "grad_norm": 0.3024366497993469, "learning_rate": 2.717588700338545e-06, "loss": 0.010220215655863285, "memory(GiB)": 21.48, "step": 20574, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955229 }, { "epoch": 0.6683883961927037, "grad_norm": 0.4153786599636078, "learning_rate": 2.717110791079578e-06, "loss": 0.02147955819964409, "memory(GiB)": 21.48, "step": 20575, "token_acc": 1.0, "train_speed(iter/s)": 0.955237 }, { "epoch": 0.6684208816554592, "grad_norm": 0.38930413126945496, "learning_rate": 2.7166329081681935e-06, "loss": 0.019082237035036087, "memory(GiB)": 21.48, "step": 20576, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.955245 }, { "epoch": 0.6684533671182146, "grad_norm": 0.2463868409395218, "learning_rate": 2.716155051609901e-06, "loss": 0.014590627513825893, "memory(GiB)": 21.48, "step": 20577, "token_acc": 0.992, "train_speed(iter/s)": 0.955255 }, { "epoch": 0.66848585258097, "grad_norm": 0.2527099549770355, "learning_rate": 2.7156772214102222e-06, "loss": 0.011345310136675835, "memory(GiB)": 21.48, "step": 20578, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955265 }, { "epoch": 0.6685183380437254, "grad_norm": 0.2879604399204254, "learning_rate": 2.715199417574667e-06, "loss": 0.011086949147284031, "memory(GiB)": 21.48, "step": 20579, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.6685508235064809, "grad_norm": 0.49987906217575073, "learning_rate": 2.714721640108754e-06, "loss": 0.017698124051094055, "memory(GiB)": 21.48, "step": 20580, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955285 }, { "epoch": 0.6685833089692362, "grad_norm": 0.35176920890808105, "learning_rate": 2.7142438890179923e-06, "loss": 0.01905370131134987, "memory(GiB)": 21.48, "step": 20581, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.6686157944319917, "grad_norm": 0.36758193373680115, "learning_rate": 2.7137661643078984e-06, "loss": 0.012745706364512444, "memory(GiB)": 21.48, "step": 20582, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.6686482798947471, "grad_norm": 0.3310965299606323, "learning_rate": 2.7132884659839862e-06, "loss": 0.011702049523591995, "memory(GiB)": 21.48, "step": 20583, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955307 }, { "epoch": 0.6686807653575025, "grad_norm": 0.29177287220954895, "learning_rate": 2.712810794051771e-06, "loss": 0.011574998497962952, "memory(GiB)": 21.48, "step": 20584, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955315 }, { "epoch": 0.6687132508202579, "grad_norm": 0.41004836559295654, "learning_rate": 2.7123331485167615e-06, "loss": 0.01668929122388363, "memory(GiB)": 21.48, "step": 20585, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.6687457362830134, "grad_norm": 0.31029510498046875, "learning_rate": 2.711855529384472e-06, "loss": 0.011101829819381237, "memory(GiB)": 21.48, "step": 20586, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955332 }, { "epoch": 0.6687782217457687, "grad_norm": 0.6742859482765198, "learning_rate": 2.7113779366604155e-06, "loss": 0.01490497961640358, "memory(GiB)": 21.48, "step": 20587, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.6688107072085242, "grad_norm": 0.5128201246261597, "learning_rate": 2.7109003703501037e-06, "loss": 0.010926852002739906, "memory(GiB)": 21.48, "step": 20588, "token_acc": 1.0, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.6688431926712796, "grad_norm": 0.3757740259170532, "learning_rate": 2.7104228304590494e-06, "loss": 0.019210170954465866, "memory(GiB)": 21.48, "step": 20589, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.668875678134035, "grad_norm": 0.27694952487945557, "learning_rate": 2.7099453169927626e-06, "loss": 0.012251898646354675, "memory(GiB)": 21.48, "step": 20590, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.6689081635967904, "grad_norm": 0.30592313408851624, "learning_rate": 2.709467829956754e-06, "loss": 0.01155394222587347, "memory(GiB)": 21.48, "step": 20591, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.6689406490595459, "grad_norm": 0.26014474034309387, "learning_rate": 2.7089903693565356e-06, "loss": 0.010296606458723545, "memory(GiB)": 21.48, "step": 20592, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.6689731345223012, "grad_norm": 0.33860594034194946, "learning_rate": 2.7085129351976196e-06, "loss": 0.017632436007261276, "memory(GiB)": 21.48, "step": 20593, "token_acc": 1.0, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.6690056199850567, "grad_norm": 0.2169826328754425, "learning_rate": 2.7080355274855125e-06, "loss": 0.00953902117908001, "memory(GiB)": 21.48, "step": 20594, "token_acc": 1.0, "train_speed(iter/s)": 0.955392 }, { "epoch": 0.6690381054478121, "grad_norm": 0.48274457454681396, "learning_rate": 2.707558146225726e-06, "loss": 0.0151428934186697, "memory(GiB)": 21.48, "step": 20595, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.9554 }, { "epoch": 0.6690705909105675, "grad_norm": 0.36980965733528137, "learning_rate": 2.7070807914237695e-06, "loss": 0.015464942902326584, "memory(GiB)": 21.48, "step": 20596, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.6691030763733229, "grad_norm": 0.43332725763320923, "learning_rate": 2.7066034630851544e-06, "loss": 0.01652381755411625, "memory(GiB)": 21.48, "step": 20597, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955416 }, { "epoch": 0.6691355618360784, "grad_norm": 0.3328070640563965, "learning_rate": 2.7061261612153862e-06, "loss": 0.01813403144478798, "memory(GiB)": 21.48, "step": 20598, "token_acc": 0.9875, "train_speed(iter/s)": 0.955424 }, { "epoch": 0.6691680472988337, "grad_norm": 0.4234180748462677, "learning_rate": 2.7056488858199768e-06, "loss": 0.017051937058568, "memory(GiB)": 21.48, "step": 20599, "token_acc": 0.9896373056994818, "train_speed(iter/s)": 0.95543 }, { "epoch": 0.6692005327615892, "grad_norm": 0.41632381081581116, "learning_rate": 2.705171636904429e-06, "loss": 0.02045830525457859, "memory(GiB)": 21.48, "step": 20600, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955437 }, { "epoch": 0.6692330182243446, "grad_norm": 0.24418193101882935, "learning_rate": 2.7046944144742592e-06, "loss": 0.009904978796839714, "memory(GiB)": 21.48, "step": 20601, "token_acc": 1.0, "train_speed(iter/s)": 0.955445 }, { "epoch": 0.6692655036871, "grad_norm": 0.28024226427078247, "learning_rate": 2.704217218534969e-06, "loss": 0.011835183016955853, "memory(GiB)": 21.48, "step": 20602, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.6692979891498554, "grad_norm": 0.3689325451850891, "learning_rate": 2.7037400490920678e-06, "loss": 0.017587652429938316, "memory(GiB)": 21.48, "step": 20603, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955461 }, { "epoch": 0.6693304746126109, "grad_norm": 0.3031673729419708, "learning_rate": 2.7032629061510653e-06, "loss": 0.011612778529524803, "memory(GiB)": 21.48, "step": 20604, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955469 }, { "epoch": 0.6693629600753662, "grad_norm": 0.28482499718666077, "learning_rate": 2.7027857897174604e-06, "loss": 0.010146659798920155, "memory(GiB)": 21.48, "step": 20605, "token_acc": 1.0, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.6693954455381217, "grad_norm": 0.22599755227565765, "learning_rate": 2.7023086997967703e-06, "loss": 0.015778854489326477, "memory(GiB)": 21.48, "step": 20606, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.669427931000877, "grad_norm": 0.43180543184280396, "learning_rate": 2.701831636394493e-06, "loss": 0.02537008374929428, "memory(GiB)": 21.48, "step": 20607, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.6694604164636325, "grad_norm": 0.7388792634010315, "learning_rate": 2.7013545995161406e-06, "loss": 0.019743166863918304, "memory(GiB)": 21.48, "step": 20608, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955507 }, { "epoch": 0.669492901926388, "grad_norm": 0.2893194258213043, "learning_rate": 2.700877589167213e-06, "loss": 0.013414420187473297, "memory(GiB)": 21.48, "step": 20609, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.6695253873891434, "grad_norm": 0.2732851505279541, "learning_rate": 2.7004006053532182e-06, "loss": 0.009942376986145973, "memory(GiB)": 21.48, "step": 20610, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.6695578728518988, "grad_norm": 0.5383705496788025, "learning_rate": 2.699923648079661e-06, "loss": 0.01763562113046646, "memory(GiB)": 21.48, "step": 20611, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.6695903583146542, "grad_norm": 0.2837579846382141, "learning_rate": 2.6994467173520484e-06, "loss": 0.018056731671094894, "memory(GiB)": 21.48, "step": 20612, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.6696228437774097, "grad_norm": 0.4011702835559845, "learning_rate": 2.698969813175881e-06, "loss": 0.021330611780285835, "memory(GiB)": 21.48, "step": 20613, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955555 }, { "epoch": 0.669655329240165, "grad_norm": 0.5417569875717163, "learning_rate": 2.698492935556664e-06, "loss": 0.022473488003015518, "memory(GiB)": 21.48, "step": 20614, "token_acc": 1.0, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.6696878147029205, "grad_norm": 0.45071810483932495, "learning_rate": 2.6980160844999025e-06, "loss": 0.011693825013935566, "memory(GiB)": 21.48, "step": 20615, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.6697203001656759, "grad_norm": 0.5708826184272766, "learning_rate": 2.6975392600111005e-06, "loss": 0.020341746509075165, "memory(GiB)": 21.48, "step": 20616, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.6697527856284313, "grad_norm": 0.3420731723308563, "learning_rate": 2.6970624620957587e-06, "loss": 0.008675113320350647, "memory(GiB)": 21.48, "step": 20617, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.6697852710911867, "grad_norm": 0.26408058404922485, "learning_rate": 2.6965856907593813e-06, "loss": 0.014454122632741928, "memory(GiB)": 21.48, "step": 20618, "token_acc": 1.0, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.6698177565539422, "grad_norm": 0.4216456711292267, "learning_rate": 2.6961089460074708e-06, "loss": 0.016686707735061646, "memory(GiB)": 21.48, "step": 20619, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.6698502420166975, "grad_norm": 0.47479957342147827, "learning_rate": 2.6956322278455292e-06, "loss": 0.019271958619356155, "memory(GiB)": 21.48, "step": 20620, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.955617 }, { "epoch": 0.669882727479453, "grad_norm": 0.5086701512336731, "learning_rate": 2.695155536279061e-06, "loss": 0.01632159762084484, "memory(GiB)": 21.48, "step": 20621, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955626 }, { "epoch": 0.6699152129422083, "grad_norm": 0.367864727973938, "learning_rate": 2.694678871313564e-06, "loss": 0.01748201623558998, "memory(GiB)": 21.48, "step": 20622, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.6699476984049638, "grad_norm": 0.4712931513786316, "learning_rate": 2.6942022329545405e-06, "loss": 0.013885410502552986, "memory(GiB)": 21.48, "step": 20623, "token_acc": 1.0, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.6699801838677192, "grad_norm": 0.2595183849334717, "learning_rate": 2.6937256212074924e-06, "loss": 0.010720202699303627, "memory(GiB)": 21.48, "step": 20624, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.6700126693304747, "grad_norm": 0.3510094881057739, "learning_rate": 2.6932490360779227e-06, "loss": 0.018736476078629494, "memory(GiB)": 21.48, "step": 20625, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.67004515479323, "grad_norm": 0.4155368506908417, "learning_rate": 2.692772477571327e-06, "loss": 0.016923917457461357, "memory(GiB)": 21.48, "step": 20626, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.6700776402559855, "grad_norm": 0.3160543739795685, "learning_rate": 2.6922959456932105e-06, "loss": 0.013263623230159283, "memory(GiB)": 21.48, "step": 20627, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.6701101257187408, "grad_norm": 0.4728371500968933, "learning_rate": 2.691819440449066e-06, "loss": 0.01650879718363285, "memory(GiB)": 21.48, "step": 20628, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.6701426111814963, "grad_norm": 0.39794933795928955, "learning_rate": 2.691342961844401e-06, "loss": 0.020641375333070755, "memory(GiB)": 21.48, "step": 20629, "token_acc": 1.0, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.6701750966442517, "grad_norm": 0.39838311076164246, "learning_rate": 2.6908665098847093e-06, "loss": 0.012760516256093979, "memory(GiB)": 21.48, "step": 20630, "token_acc": 0.996, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.6702075821070071, "grad_norm": 0.461952269077301, "learning_rate": 2.6903900845754934e-06, "loss": 0.012025873176753521, "memory(GiB)": 21.48, "step": 20631, "token_acc": 1.0, "train_speed(iter/s)": 0.955702 }, { "epoch": 0.6702400675697625, "grad_norm": 0.41499632596969604, "learning_rate": 2.689913685922248e-06, "loss": 0.016476504504680634, "memory(GiB)": 21.48, "step": 20632, "token_acc": 1.0, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.670272553032518, "grad_norm": 0.26082873344421387, "learning_rate": 2.689437313930473e-06, "loss": 0.009086531586945057, "memory(GiB)": 21.48, "step": 20633, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955718 }, { "epoch": 0.6703050384952733, "grad_norm": 0.4691631495952606, "learning_rate": 2.6889609686056674e-06, "loss": 0.019491733983159065, "memory(GiB)": 21.48, "step": 20634, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.955726 }, { "epoch": 0.6703375239580288, "grad_norm": 0.5516489148139954, "learning_rate": 2.688484649953328e-06, "loss": 0.010589521378278732, "memory(GiB)": 21.48, "step": 20635, "token_acc": 1.0, "train_speed(iter/s)": 0.955733 }, { "epoch": 0.6703700094207842, "grad_norm": 0.35340768098831177, "learning_rate": 2.6880083579789546e-06, "loss": 0.012397553771734238, "memory(GiB)": 21.48, "step": 20636, "token_acc": 1.0, "train_speed(iter/s)": 0.955742 }, { "epoch": 0.6704024948835396, "grad_norm": 1.8395922183990479, "learning_rate": 2.68753209268804e-06, "loss": 0.018528444692492485, "memory(GiB)": 21.48, "step": 20637, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.670434980346295, "grad_norm": 0.530759334564209, "learning_rate": 2.6870558540860825e-06, "loss": 0.017712946981191635, "memory(GiB)": 21.48, "step": 20638, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.6704674658090505, "grad_norm": 0.4183880090713501, "learning_rate": 2.6865796421785794e-06, "loss": 0.020913654938340187, "memory(GiB)": 21.48, "step": 20639, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.6704999512718058, "grad_norm": 0.3264724910259247, "learning_rate": 2.6861034569710274e-06, "loss": 0.011504859663546085, "memory(GiB)": 21.48, "step": 20640, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.6705324367345613, "grad_norm": 0.30934464931488037, "learning_rate": 2.6856272984689203e-06, "loss": 0.013777265325188637, "memory(GiB)": 21.48, "step": 20641, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.6705649221973167, "grad_norm": 0.4722512662410736, "learning_rate": 2.685151166677754e-06, "loss": 0.023033663630485535, "memory(GiB)": 21.48, "step": 20642, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.6705974076600721, "grad_norm": 0.3631940484046936, "learning_rate": 2.6846750616030247e-06, "loss": 0.018674980849027634, "memory(GiB)": 21.48, "step": 20643, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955811 }, { "epoch": 0.6706298931228275, "grad_norm": 0.5177447199821472, "learning_rate": 2.684198983250229e-06, "loss": 0.020803391933441162, "memory(GiB)": 21.48, "step": 20644, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.670662378585583, "grad_norm": 0.308664470911026, "learning_rate": 2.6837229316248576e-06, "loss": 0.01593216136097908, "memory(GiB)": 21.48, "step": 20645, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.6706948640483383, "grad_norm": 0.35036998987197876, "learning_rate": 2.683246906732406e-06, "loss": 0.02069159224629402, "memory(GiB)": 21.48, "step": 20646, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955837 }, { "epoch": 0.6707273495110938, "grad_norm": 0.3836384415626526, "learning_rate": 2.682770908578369e-06, "loss": 0.01654934138059616, "memory(GiB)": 21.48, "step": 20647, "token_acc": 0.985, "train_speed(iter/s)": 0.955845 }, { "epoch": 0.6707598349738492, "grad_norm": 0.4938415586948395, "learning_rate": 2.682294937168242e-06, "loss": 0.01664038375020027, "memory(GiB)": 21.48, "step": 20648, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.6707923204366046, "grad_norm": 0.28004008531570435, "learning_rate": 2.6818189925075135e-06, "loss": 0.01114317774772644, "memory(GiB)": 21.48, "step": 20649, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955859 }, { "epoch": 0.67082480589936, "grad_norm": 0.35361185669898987, "learning_rate": 2.68134307460168e-06, "loss": 0.016483299434185028, "memory(GiB)": 21.48, "step": 20650, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.6708572913621155, "grad_norm": 0.3788427412509918, "learning_rate": 2.680867183456233e-06, "loss": 0.013950901105999947, "memory(GiB)": 21.48, "step": 20651, "token_acc": 1.0, "train_speed(iter/s)": 0.955874 }, { "epoch": 0.6708897768248708, "grad_norm": 0.3075314462184906, "learning_rate": 2.680391319076666e-06, "loss": 0.009585447609424591, "memory(GiB)": 21.48, "step": 20652, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.6709222622876263, "grad_norm": 0.39394107460975647, "learning_rate": 2.6799154814684724e-06, "loss": 0.017532482743263245, "memory(GiB)": 21.48, "step": 20653, "token_acc": 1.0, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.6709547477503817, "grad_norm": 0.29919442534446716, "learning_rate": 2.67943967063714e-06, "loss": 0.010621214285492897, "memory(GiB)": 21.48, "step": 20654, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955897 }, { "epoch": 0.6709872332131371, "grad_norm": 0.2897646427154541, "learning_rate": 2.678963886588165e-06, "loss": 0.013096409849822521, "memory(GiB)": 21.48, "step": 20655, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.6710197186758925, "grad_norm": 0.44871726632118225, "learning_rate": 2.6784881293270317e-06, "loss": 0.017047161236405373, "memory(GiB)": 21.48, "step": 20656, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.671052204138648, "grad_norm": 0.3270619511604309, "learning_rate": 2.678012398859239e-06, "loss": 0.01234990544617176, "memory(GiB)": 21.48, "step": 20657, "token_acc": 0.99, "train_speed(iter/s)": 0.955919 }, { "epoch": 0.6710846896014033, "grad_norm": 0.3889123201370239, "learning_rate": 2.677536695190272e-06, "loss": 0.02272513136267662, "memory(GiB)": 21.48, "step": 20658, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.6711171750641588, "grad_norm": 0.2636640667915344, "learning_rate": 2.6770610183256253e-06, "loss": 0.014835851266980171, "memory(GiB)": 21.48, "step": 20659, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955933 }, { "epoch": 0.6711496605269142, "grad_norm": 0.44888952374458313, "learning_rate": 2.676585368270783e-06, "loss": 0.015999360010027885, "memory(GiB)": 21.48, "step": 20660, "token_acc": 1.0, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.6711821459896696, "grad_norm": 0.2901522219181061, "learning_rate": 2.676109745031239e-06, "loss": 0.014716679230332375, "memory(GiB)": 21.48, "step": 20661, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955949 }, { "epoch": 0.671214631452425, "grad_norm": 0.3400627076625824, "learning_rate": 2.6756341486124815e-06, "loss": 0.011744746938347816, "memory(GiB)": 21.48, "step": 20662, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955956 }, { "epoch": 0.6712471169151805, "grad_norm": 0.43479520082473755, "learning_rate": 2.675158579020002e-06, "loss": 0.024766799062490463, "memory(GiB)": 21.48, "step": 20663, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.955963 }, { "epoch": 0.6712796023779358, "grad_norm": 0.698444664478302, "learning_rate": 2.674683036259284e-06, "loss": 0.014248422347009182, "memory(GiB)": 21.48, "step": 20664, "token_acc": 1.0, "train_speed(iter/s)": 0.955968 }, { "epoch": 0.6713120878406913, "grad_norm": 0.3732578754425049, "learning_rate": 2.6742075203358193e-06, "loss": 0.01853887364268303, "memory(GiB)": 21.48, "step": 20665, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.6713445733034467, "grad_norm": 0.353390634059906, "learning_rate": 2.673732031255095e-06, "loss": 0.016158215701580048, "memory(GiB)": 21.48, "step": 20666, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.6713770587662021, "grad_norm": 0.45090967416763306, "learning_rate": 2.6732565690226e-06, "loss": 0.014131477102637291, "memory(GiB)": 21.48, "step": 20667, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.6714095442289575, "grad_norm": 0.3195466697216034, "learning_rate": 2.6727811336438227e-06, "loss": 0.015992548316717148, "memory(GiB)": 21.48, "step": 20668, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.671442029691713, "grad_norm": 0.3653622269630432, "learning_rate": 2.6723057251242467e-06, "loss": 0.012316172942519188, "memory(GiB)": 21.48, "step": 20669, "token_acc": 1.0, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.6714745151544683, "grad_norm": 0.3466723561286926, "learning_rate": 2.6718303434693605e-06, "loss": 0.012418590486049652, "memory(GiB)": 21.48, "step": 20670, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956015 }, { "epoch": 0.6715070006172238, "grad_norm": 0.30210068821907043, "learning_rate": 2.6713549886846514e-06, "loss": 0.013114769011735916, "memory(GiB)": 21.48, "step": 20671, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956024 }, { "epoch": 0.6715394860799793, "grad_norm": 0.4715273678302765, "learning_rate": 2.6708796607756065e-06, "loss": 0.014270568266510963, "memory(GiB)": 21.48, "step": 20672, "token_acc": 0.988, "train_speed(iter/s)": 0.956034 }, { "epoch": 0.6715719715427346, "grad_norm": 0.40569332242012024, "learning_rate": 2.670404359747709e-06, "loss": 0.02053041197359562, "memory(GiB)": 21.48, "step": 20673, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.6716044570054901, "grad_norm": 0.9038597941398621, "learning_rate": 2.669929085606445e-06, "loss": 0.015565662644803524, "memory(GiB)": 21.48, "step": 20674, "token_acc": 1.0, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.6716369424682455, "grad_norm": 0.32261496782302856, "learning_rate": 2.6694538383573023e-06, "loss": 0.01894315518438816, "memory(GiB)": 21.48, "step": 20675, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.6716694279310009, "grad_norm": 0.7223455309867859, "learning_rate": 2.668978618005765e-06, "loss": 0.011695465072989464, "memory(GiB)": 21.48, "step": 20676, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956071 }, { "epoch": 0.6717019133937563, "grad_norm": 0.45526713132858276, "learning_rate": 2.6685034245573147e-06, "loss": 0.011574691161513329, "memory(GiB)": 21.48, "step": 20677, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.6717343988565118, "grad_norm": 0.32240617275238037, "learning_rate": 2.668028258017441e-06, "loss": 0.019958103075623512, "memory(GiB)": 21.48, "step": 20678, "token_acc": 1.0, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.6717668843192671, "grad_norm": 0.27303028106689453, "learning_rate": 2.667553118391621e-06, "loss": 0.012128520756959915, "memory(GiB)": 21.48, "step": 20679, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.6717993697820226, "grad_norm": 0.44677406549453735, "learning_rate": 2.667078005685346e-06, "loss": 0.02297823317348957, "memory(GiB)": 21.48, "step": 20680, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956101 }, { "epoch": 0.671831855244778, "grad_norm": 0.3413022756576538, "learning_rate": 2.6666029199040943e-06, "loss": 0.01697055995464325, "memory(GiB)": 21.48, "step": 20681, "token_acc": 1.0, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.6718643407075334, "grad_norm": 0.2924883961677551, "learning_rate": 2.6661278610533503e-06, "loss": 0.014388764277100563, "memory(GiB)": 21.48, "step": 20682, "token_acc": 1.0, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.6718968261702888, "grad_norm": 1.0755635499954224, "learning_rate": 2.6656528291385996e-06, "loss": 0.010303718969225883, "memory(GiB)": 21.48, "step": 20683, "token_acc": 0.995, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.6719293116330443, "grad_norm": 0.33532026410102844, "learning_rate": 2.6651778241653183e-06, "loss": 0.01651790365576744, "memory(GiB)": 21.48, "step": 20684, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.956129 }, { "epoch": 0.6719617970957996, "grad_norm": 0.45034366846084595, "learning_rate": 2.6647028461389956e-06, "loss": 0.018188849091529846, "memory(GiB)": 21.48, "step": 20685, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956136 }, { "epoch": 0.6719942825585551, "grad_norm": 0.49699151515960693, "learning_rate": 2.6642278950651092e-06, "loss": 0.022652745246887207, "memory(GiB)": 21.48, "step": 20686, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.6720267680213104, "grad_norm": 0.3906288743019104, "learning_rate": 2.6637529709491437e-06, "loss": 0.02101043611764908, "memory(GiB)": 21.48, "step": 20687, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956152 }, { "epoch": 0.6720592534840659, "grad_norm": 0.25587987899780273, "learning_rate": 2.6632780737965747e-06, "loss": 0.012660108506679535, "memory(GiB)": 21.48, "step": 20688, "token_acc": 1.0, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.6720917389468213, "grad_norm": 0.3127133250236511, "learning_rate": 2.66280320361289e-06, "loss": 0.015359126962721348, "memory(GiB)": 21.48, "step": 20689, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956167 }, { "epoch": 0.6721242244095768, "grad_norm": 0.35510921478271484, "learning_rate": 2.662328360403566e-06, "loss": 0.014942947775125504, "memory(GiB)": 21.48, "step": 20690, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.6721567098723321, "grad_norm": 0.3356916308403015, "learning_rate": 2.6618535441740855e-06, "loss": 0.017246128991246223, "memory(GiB)": 21.48, "step": 20691, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956182 }, { "epoch": 0.6721891953350876, "grad_norm": 0.41679757833480835, "learning_rate": 2.661378754929926e-06, "loss": 0.016782540827989578, "memory(GiB)": 21.48, "step": 20692, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.672221680797843, "grad_norm": 0.37924960255622864, "learning_rate": 2.660903992676568e-06, "loss": 0.023272782564163208, "memory(GiB)": 21.48, "step": 20693, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956198 }, { "epoch": 0.6722541662605984, "grad_norm": 0.38916322588920593, "learning_rate": 2.6604292574194906e-06, "loss": 0.014294108375906944, "memory(GiB)": 21.48, "step": 20694, "token_acc": 1.0, "train_speed(iter/s)": 0.956206 }, { "epoch": 0.6722866517233538, "grad_norm": 0.33365583419799805, "learning_rate": 2.6599545491641763e-06, "loss": 0.016154659911990166, "memory(GiB)": 21.48, "step": 20695, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.6723191371861093, "grad_norm": 1.339627981185913, "learning_rate": 2.6594798679160988e-06, "loss": 0.014823852106928825, "memory(GiB)": 21.48, "step": 20696, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956222 }, { "epoch": 0.6723516226488646, "grad_norm": 0.31942129135131836, "learning_rate": 2.659005213680739e-06, "loss": 0.010238349437713623, "memory(GiB)": 21.48, "step": 20697, "token_acc": 1.0, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.6723841081116201, "grad_norm": 0.5776221752166748, "learning_rate": 2.6585305864635746e-06, "loss": 0.012577036395668983, "memory(GiB)": 21.48, "step": 20698, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.6724165935743754, "grad_norm": 0.40594109892845154, "learning_rate": 2.658055986270084e-06, "loss": 0.015885595232248306, "memory(GiB)": 21.48, "step": 20699, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.6724490790371309, "grad_norm": 0.3542717397212982, "learning_rate": 2.6575814131057466e-06, "loss": 0.017284736037254333, "memory(GiB)": 21.48, "step": 20700, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.6724815644998863, "grad_norm": 0.32563087344169617, "learning_rate": 2.657106866976035e-06, "loss": 0.012222625315189362, "memory(GiB)": 21.48, "step": 20701, "token_acc": 1.0, "train_speed(iter/s)": 0.95627 }, { "epoch": 0.6725140499626417, "grad_norm": 0.37405434250831604, "learning_rate": 2.6566323478864287e-06, "loss": 0.01713191159069538, "memory(GiB)": 21.48, "step": 20702, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95628 }, { "epoch": 0.6725465354253971, "grad_norm": 0.30669209361076355, "learning_rate": 2.656157855842404e-06, "loss": 0.014445377513766289, "memory(GiB)": 21.48, "step": 20703, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.6725790208881526, "grad_norm": 0.32566970586776733, "learning_rate": 2.6556833908494393e-06, "loss": 0.015315132215619087, "memory(GiB)": 21.48, "step": 20704, "token_acc": 1.0, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.6726115063509079, "grad_norm": 3.5721163749694824, "learning_rate": 2.655208952913007e-06, "loss": 0.022157466039061546, "memory(GiB)": 21.48, "step": 20705, "token_acc": 0.9760956175298805, "train_speed(iter/s)": 0.956308 }, { "epoch": 0.6726439918136634, "grad_norm": 0.30773621797561646, "learning_rate": 2.654734542038584e-06, "loss": 0.0175103098154068, "memory(GiB)": 21.48, "step": 20706, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956318 }, { "epoch": 0.6726764772764188, "grad_norm": 0.4617891013622284, "learning_rate": 2.6542601582316464e-06, "loss": 0.01991902105510235, "memory(GiB)": 21.48, "step": 20707, "token_acc": 1.0, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.6727089627391742, "grad_norm": 0.36299794912338257, "learning_rate": 2.65378580149767e-06, "loss": 0.016466110944747925, "memory(GiB)": 21.48, "step": 20708, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.6727414482019296, "grad_norm": 0.2507851719856262, "learning_rate": 2.653311471842126e-06, "loss": 0.013066887855529785, "memory(GiB)": 21.48, "step": 20709, "token_acc": 1.0, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.6727739336646851, "grad_norm": 0.3367330729961395, "learning_rate": 2.6528371692704935e-06, "loss": 0.016931476071476936, "memory(GiB)": 21.48, "step": 20710, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956347 }, { "epoch": 0.6728064191274404, "grad_norm": 0.2833741009235382, "learning_rate": 2.6523628937882406e-06, "loss": 0.01443922333419323, "memory(GiB)": 21.48, "step": 20711, "token_acc": 1.0, "train_speed(iter/s)": 0.956354 }, { "epoch": 0.6728389045901959, "grad_norm": 0.41767457127571106, "learning_rate": 2.6518886454008462e-06, "loss": 0.015297921374440193, "memory(GiB)": 21.48, "step": 20712, "token_acc": 1.0, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.6728713900529513, "grad_norm": 0.37069156765937805, "learning_rate": 2.651414424113784e-06, "loss": 0.016531508415937424, "memory(GiB)": 21.48, "step": 20713, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.95637 }, { "epoch": 0.6729038755157067, "grad_norm": 0.30948689579963684, "learning_rate": 2.6509402299325236e-06, "loss": 0.010369010269641876, "memory(GiB)": 21.48, "step": 20714, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.6729363609784621, "grad_norm": 0.5231139063835144, "learning_rate": 2.650466062862541e-06, "loss": 0.022535640746355057, "memory(GiB)": 21.48, "step": 20715, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.6729688464412176, "grad_norm": 0.27603575587272644, "learning_rate": 2.649991922909304e-06, "loss": 0.013016363605856895, "memory(GiB)": 21.48, "step": 20716, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956392 }, { "epoch": 0.6730013319039729, "grad_norm": 0.2711528241634369, "learning_rate": 2.6495178100782915e-06, "loss": 0.016017716377973557, "memory(GiB)": 21.48, "step": 20717, "token_acc": 1.0, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.6730338173667284, "grad_norm": 0.2788124680519104, "learning_rate": 2.6490437243749707e-06, "loss": 0.013374077156186104, "memory(GiB)": 21.48, "step": 20718, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.6730663028294838, "grad_norm": 0.29773393273353577, "learning_rate": 2.6485696658048155e-06, "loss": 0.013284843415021896, "memory(GiB)": 21.48, "step": 20719, "token_acc": 0.9885057471264368, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.6730987882922392, "grad_norm": 0.3920992612838745, "learning_rate": 2.6480956343732943e-06, "loss": 0.01838601380586624, "memory(GiB)": 21.48, "step": 20720, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.6731312737549946, "grad_norm": 0.24813488125801086, "learning_rate": 2.6476216300858804e-06, "loss": 0.007897837087512016, "memory(GiB)": 21.48, "step": 20721, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956428 }, { "epoch": 0.6731637592177501, "grad_norm": 0.2008611410856247, "learning_rate": 2.647147652948044e-06, "loss": 0.009330904111266136, "memory(GiB)": 21.48, "step": 20722, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.6731962446805054, "grad_norm": 0.4073253571987152, "learning_rate": 2.646673702965257e-06, "loss": 0.02155262976884842, "memory(GiB)": 21.48, "step": 20723, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956443 }, { "epoch": 0.6732287301432609, "grad_norm": 0.3214324116706848, "learning_rate": 2.646199780142985e-06, "loss": 0.016343148425221443, "memory(GiB)": 21.48, "step": 20724, "token_acc": 1.0, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.6732612156060163, "grad_norm": 0.31559526920318604, "learning_rate": 2.645725884486702e-06, "loss": 0.012563996016979218, "memory(GiB)": 21.48, "step": 20725, "token_acc": 1.0, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.6732937010687717, "grad_norm": 0.2566957473754883, "learning_rate": 2.645252016001875e-06, "loss": 0.015252167358994484, "memory(GiB)": 21.48, "step": 20726, "token_acc": 1.0, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.6733261865315271, "grad_norm": 0.49463990330696106, "learning_rate": 2.644778174693973e-06, "loss": 0.015479238703846931, "memory(GiB)": 21.48, "step": 20727, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.95647 }, { "epoch": 0.6733586719942826, "grad_norm": 0.2684071958065033, "learning_rate": 2.6443043605684683e-06, "loss": 0.011953498236835003, "memory(GiB)": 21.48, "step": 20728, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.956477 }, { "epoch": 0.6733911574570379, "grad_norm": 0.46050024032592773, "learning_rate": 2.643830573630825e-06, "loss": 0.018216121941804886, "memory(GiB)": 21.48, "step": 20729, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.6734236429197934, "grad_norm": 0.7558307647705078, "learning_rate": 2.643356813886513e-06, "loss": 0.028278326615691185, "memory(GiB)": 21.48, "step": 20730, "token_acc": 0.9753521126760564, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.6734561283825488, "grad_norm": 0.33809444308280945, "learning_rate": 2.642883081341e-06, "loss": 0.017421696335077286, "memory(GiB)": 21.48, "step": 20731, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.6734886138453042, "grad_norm": 0.40226632356643677, "learning_rate": 2.642409375999755e-06, "loss": 0.011786559596657753, "memory(GiB)": 21.48, "step": 20732, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.6735210993080596, "grad_norm": 0.25007137656211853, "learning_rate": 2.641935697868242e-06, "loss": 0.014053896069526672, "memory(GiB)": 21.48, "step": 20733, "token_acc": 1.0, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.6735535847708151, "grad_norm": 0.2410707324743271, "learning_rate": 2.64146204695193e-06, "loss": 0.011437523178756237, "memory(GiB)": 21.48, "step": 20734, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.6735860702335704, "grad_norm": 0.5687830448150635, "learning_rate": 2.6409884232562853e-06, "loss": 0.015739476308226585, "memory(GiB)": 21.48, "step": 20735, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.6736185556963259, "grad_norm": 0.39951449632644653, "learning_rate": 2.640514826786775e-06, "loss": 0.011965347453951836, "memory(GiB)": 21.48, "step": 20736, "token_acc": 1.0, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.6736510411590814, "grad_norm": 0.34796807169914246, "learning_rate": 2.640041257548863e-06, "loss": 0.015936564654111862, "memory(GiB)": 21.48, "step": 20737, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.6736835266218367, "grad_norm": 0.37118294835090637, "learning_rate": 2.639567715548018e-06, "loss": 0.01206307765096426, "memory(GiB)": 21.48, "step": 20738, "token_acc": 1.0, "train_speed(iter/s)": 0.95656 }, { "epoch": 0.6737160120845922, "grad_norm": 0.36243805289268494, "learning_rate": 2.639094200789699e-06, "loss": 0.020196545869112015, "memory(GiB)": 21.48, "step": 20739, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.6737484975473476, "grad_norm": 0.231458380818367, "learning_rate": 2.6386207132793795e-06, "loss": 0.009839823469519615, "memory(GiB)": 21.48, "step": 20740, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956574 }, { "epoch": 0.673780983010103, "grad_norm": 0.5054216980934143, "learning_rate": 2.638147253022518e-06, "loss": 0.016590695828199387, "memory(GiB)": 21.48, "step": 20741, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.6738134684728584, "grad_norm": 0.5081846714019775, "learning_rate": 2.63767382002458e-06, "loss": 0.01589966006577015, "memory(GiB)": 21.48, "step": 20742, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956589 }, { "epoch": 0.6738459539356139, "grad_norm": 0.6531729102134705, "learning_rate": 2.6372004142910333e-06, "loss": 0.010418068617582321, "memory(GiB)": 21.48, "step": 20743, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.6738784393983692, "grad_norm": 0.5000182390213013, "learning_rate": 2.6367270358273344e-06, "loss": 0.019205406308174133, "memory(GiB)": 21.48, "step": 20744, "token_acc": 1.0, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.6739109248611247, "grad_norm": 0.4256865084171295, "learning_rate": 2.6362536846389553e-06, "loss": 0.014961479231715202, "memory(GiB)": 21.48, "step": 20745, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.67394341032388, "grad_norm": 0.1607123464345932, "learning_rate": 2.635780360731352e-06, "loss": 0.007857429794967175, "memory(GiB)": 21.48, "step": 20746, "token_acc": 1.0, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.6739758957866355, "grad_norm": 0.44639119505882263, "learning_rate": 2.6353070641099925e-06, "loss": 0.015507162548601627, "memory(GiB)": 21.48, "step": 20747, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956626 }, { "epoch": 0.6740083812493909, "grad_norm": 0.3328592777252197, "learning_rate": 2.6348337947803348e-06, "loss": 0.011562258005142212, "memory(GiB)": 21.48, "step": 20748, "token_acc": 1.0, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.6740408667121464, "grad_norm": 0.5376240611076355, "learning_rate": 2.6343605527478435e-06, "loss": 0.011619378812611103, "memory(GiB)": 21.48, "step": 20749, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956641 }, { "epoch": 0.6740733521749017, "grad_norm": 0.3894782066345215, "learning_rate": 2.6338873380179803e-06, "loss": 0.017521390691399574, "memory(GiB)": 21.48, "step": 20750, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956647 }, { "epoch": 0.6741058376376572, "grad_norm": 0.47644734382629395, "learning_rate": 2.633414150596208e-06, "loss": 0.01760098710656166, "memory(GiB)": 21.48, "step": 20751, "token_acc": 1.0, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.6741383231004126, "grad_norm": 0.6066579818725586, "learning_rate": 2.6329409904879842e-06, "loss": 0.011745210736989975, "memory(GiB)": 21.48, "step": 20752, "token_acc": 0.9966329966329966, "train_speed(iter/s)": 0.956661 }, { "epoch": 0.674170808563168, "grad_norm": 0.5914284586906433, "learning_rate": 2.632467857698772e-06, "loss": 0.02271762117743492, "memory(GiB)": 21.48, "step": 20753, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.6742032940259234, "grad_norm": 0.3607206344604492, "learning_rate": 2.6319947522340322e-06, "loss": 0.014890626072883606, "memory(GiB)": 21.48, "step": 20754, "token_acc": 1.0, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.6742357794886789, "grad_norm": 0.2405441850423813, "learning_rate": 2.6315216740992267e-06, "loss": 0.009755173698067665, "memory(GiB)": 21.48, "step": 20755, "token_acc": 1.0, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.6742682649514342, "grad_norm": 0.34337806701660156, "learning_rate": 2.6310486232998113e-06, "loss": 0.016403228044509888, "memory(GiB)": 21.48, "step": 20756, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956695 }, { "epoch": 0.6743007504141897, "grad_norm": 0.34172701835632324, "learning_rate": 2.6305755998412485e-06, "loss": 0.014679521322250366, "memory(GiB)": 21.48, "step": 20757, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956704 }, { "epoch": 0.674333235876945, "grad_norm": 0.26094067096710205, "learning_rate": 2.630102603728996e-06, "loss": 0.016010133549571037, "memory(GiB)": 21.48, "step": 20758, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956713 }, { "epoch": 0.6743657213397005, "grad_norm": 0.450186550617218, "learning_rate": 2.6296296349685147e-06, "loss": 0.014404268004000187, "memory(GiB)": 21.48, "step": 20759, "token_acc": 1.0, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.6743982068024559, "grad_norm": 0.364133358001709, "learning_rate": 2.6291566935652633e-06, "loss": 0.020175307989120483, "memory(GiB)": 21.48, "step": 20760, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.6744306922652114, "grad_norm": 0.2510707676410675, "learning_rate": 2.6286837795246976e-06, "loss": 0.0114285284653306, "memory(GiB)": 21.48, "step": 20761, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956743 }, { "epoch": 0.6744631777279667, "grad_norm": 0.3746556341648102, "learning_rate": 2.628210892852278e-06, "loss": 0.01726280152797699, "memory(GiB)": 21.48, "step": 20762, "token_acc": 1.0, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.6744956631907222, "grad_norm": 0.43914493918418884, "learning_rate": 2.6277380335534616e-06, "loss": 0.01944219321012497, "memory(GiB)": 21.48, "step": 20763, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.6745281486534775, "grad_norm": 0.3791266977787018, "learning_rate": 2.627265201633707e-06, "loss": 0.015545716509222984, "memory(GiB)": 21.48, "step": 20764, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.674560634116233, "grad_norm": 0.36630505323410034, "learning_rate": 2.6267923970984683e-06, "loss": 0.015484748408198357, "memory(GiB)": 21.48, "step": 20765, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956782 }, { "epoch": 0.6745931195789884, "grad_norm": 0.27637919783592224, "learning_rate": 2.626319619953206e-06, "loss": 0.012880575843155384, "memory(GiB)": 21.48, "step": 20766, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.6746256050417438, "grad_norm": 0.3081163763999939, "learning_rate": 2.625846870203371e-06, "loss": 0.012151747941970825, "memory(GiB)": 21.48, "step": 20767, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.6746580905044992, "grad_norm": 0.42705556750297546, "learning_rate": 2.625374147854427e-06, "loss": 0.019476184621453285, "memory(GiB)": 21.48, "step": 20768, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956809 }, { "epoch": 0.6746905759672547, "grad_norm": 0.3816728889942169, "learning_rate": 2.624901452911823e-06, "loss": 0.024397429078817368, "memory(GiB)": 21.48, "step": 20769, "token_acc": 1.0, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.67472306143001, "grad_norm": 0.4007536768913269, "learning_rate": 2.6244287853810208e-06, "loss": 0.014089943841099739, "memory(GiB)": 21.48, "step": 20770, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.6747555468927655, "grad_norm": 0.4886604845523834, "learning_rate": 2.62395614526747e-06, "loss": 0.028030436486005783, "memory(GiB)": 21.48, "step": 20771, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.956836 }, { "epoch": 0.6747880323555209, "grad_norm": 0.22283662855625153, "learning_rate": 2.6234835325766274e-06, "loss": 0.01245715282857418, "memory(GiB)": 21.48, "step": 20772, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956843 }, { "epoch": 0.6748205178182763, "grad_norm": 0.32485607266426086, "learning_rate": 2.6230109473139482e-06, "loss": 0.008940204977989197, "memory(GiB)": 21.48, "step": 20773, "token_acc": 0.9967213114754099, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.6748530032810317, "grad_norm": 0.9041621685028076, "learning_rate": 2.6225383894848867e-06, "loss": 0.01661229506134987, "memory(GiB)": 21.48, "step": 20774, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956858 }, { "epoch": 0.6748854887437872, "grad_norm": 0.3987383246421814, "learning_rate": 2.622065859094898e-06, "loss": 0.016311781480908394, "memory(GiB)": 21.48, "step": 20775, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956865 }, { "epoch": 0.6749179742065425, "grad_norm": 0.45342180132865906, "learning_rate": 2.6215933561494323e-06, "loss": 0.015763122588396072, "memory(GiB)": 21.48, "step": 20776, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.674950459669298, "grad_norm": 0.46810105443000793, "learning_rate": 2.6211208806539456e-06, "loss": 0.029299583286046982, "memory(GiB)": 21.48, "step": 20777, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.6749829451320534, "grad_norm": 0.3387974500656128, "learning_rate": 2.620648432613889e-06, "loss": 0.013652635738253593, "memory(GiB)": 21.48, "step": 20778, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.6750154305948088, "grad_norm": 0.3117384612560272, "learning_rate": 2.6201760120347196e-06, "loss": 0.01804419234395027, "memory(GiB)": 21.48, "step": 20779, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.6750479160575642, "grad_norm": 0.4591006636619568, "learning_rate": 2.6197036189218843e-06, "loss": 0.012339172884821892, "memory(GiB)": 21.48, "step": 20780, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956902 }, { "epoch": 0.6750804015203197, "grad_norm": 0.22415289282798767, "learning_rate": 2.619231253280837e-06, "loss": 0.009963266551494598, "memory(GiB)": 21.48, "step": 20781, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956909 }, { "epoch": 0.675112886983075, "grad_norm": 0.4061501920223236, "learning_rate": 2.6187589151170303e-06, "loss": 0.01548672653734684, "memory(GiB)": 21.48, "step": 20782, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.6751453724458305, "grad_norm": 0.36704206466674805, "learning_rate": 2.6182866044359167e-06, "loss": 0.0159821305423975, "memory(GiB)": 21.48, "step": 20783, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.6751778579085859, "grad_norm": 0.4293542206287384, "learning_rate": 2.617814321242944e-06, "loss": 0.02033372037112713, "memory(GiB)": 21.48, "step": 20784, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956931 }, { "epoch": 0.6752103433713413, "grad_norm": 0.30669882893562317, "learning_rate": 2.617342065543565e-06, "loss": 0.010703805834054947, "memory(GiB)": 21.48, "step": 20785, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956939 }, { "epoch": 0.6752428288340967, "grad_norm": 0.2689920961856842, "learning_rate": 2.61686983734323e-06, "loss": 0.01448647677898407, "memory(GiB)": 21.48, "step": 20786, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.6752753142968522, "grad_norm": 0.3855774402618408, "learning_rate": 2.61639763664739e-06, "loss": 0.020417913794517517, "memory(GiB)": 21.48, "step": 20787, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956954 }, { "epoch": 0.6753077997596075, "grad_norm": 0.3718307316303253, "learning_rate": 2.6159254634614927e-06, "loss": 0.017860744148492813, "memory(GiB)": 21.48, "step": 20788, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956962 }, { "epoch": 0.675340285222363, "grad_norm": 0.3365667462348938, "learning_rate": 2.6154533177909887e-06, "loss": 0.011915476992726326, "memory(GiB)": 21.48, "step": 20789, "token_acc": 1.0, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.6753727706851184, "grad_norm": 0.350236177444458, "learning_rate": 2.6149811996413264e-06, "loss": 0.01924414560198784, "memory(GiB)": 21.48, "step": 20790, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.6754052561478738, "grad_norm": 0.3305094242095947, "learning_rate": 2.6145091090179563e-06, "loss": 0.01853380724787712, "memory(GiB)": 21.48, "step": 20791, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.6754377416106292, "grad_norm": 0.3832521438598633, "learning_rate": 2.6140370459263283e-06, "loss": 0.018390946090221405, "memory(GiB)": 21.48, "step": 20792, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95699 }, { "epoch": 0.6754702270733847, "grad_norm": 0.36577752232551575, "learning_rate": 2.6135650103718858e-06, "loss": 0.019535895437002182, "memory(GiB)": 21.48, "step": 20793, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.67550271253614, "grad_norm": 0.3819851279258728, "learning_rate": 2.613093002360082e-06, "loss": 0.017933208495378494, "memory(GiB)": 21.48, "step": 20794, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.6755351979988955, "grad_norm": 0.5097732543945312, "learning_rate": 2.6126210218963575e-06, "loss": 0.01734495908021927, "memory(GiB)": 21.48, "step": 20795, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.6755676834616509, "grad_norm": 0.8131659030914307, "learning_rate": 2.6121490689861683e-06, "loss": 0.02225457690656185, "memory(GiB)": 21.48, "step": 20796, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.6756001689244063, "grad_norm": 0.30984029173851013, "learning_rate": 2.611677143634955e-06, "loss": 0.022753600031137466, "memory(GiB)": 21.48, "step": 20797, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.957028 }, { "epoch": 0.6756326543871617, "grad_norm": 0.30309292674064636, "learning_rate": 2.6112052458481683e-06, "loss": 0.01610434800386429, "memory(GiB)": 21.48, "step": 20798, "token_acc": 1.0, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.6756651398499172, "grad_norm": 0.3648844063282013, "learning_rate": 2.6107333756312508e-06, "loss": 0.015410691499710083, "memory(GiB)": 21.48, "step": 20799, "token_acc": 1.0, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.6756976253126726, "grad_norm": 0.36270052194595337, "learning_rate": 2.6102615329896496e-06, "loss": 0.012227775529026985, "memory(GiB)": 21.48, "step": 20800, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957049 }, { "epoch": 0.675730110775428, "grad_norm": 0.3797664940357208, "learning_rate": 2.6097897179288117e-06, "loss": 0.013973213732242584, "memory(GiB)": 21.48, "step": 20801, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.6757625962381835, "grad_norm": 0.28852471709251404, "learning_rate": 2.609317930454184e-06, "loss": 0.01702723652124405, "memory(GiB)": 21.48, "step": 20802, "token_acc": 1.0, "train_speed(iter/s)": 0.957061 }, { "epoch": 0.6757950817009388, "grad_norm": 0.3448348343372345, "learning_rate": 2.6088461705712063e-06, "loss": 0.01122099719941616, "memory(GiB)": 21.48, "step": 20803, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.6758275671636943, "grad_norm": 0.40635842084884644, "learning_rate": 2.608374438285327e-06, "loss": 0.015592963434755802, "memory(GiB)": 21.48, "step": 20804, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.6758600526264497, "grad_norm": 0.3451288938522339, "learning_rate": 2.6079027336019902e-06, "loss": 0.019030293449759483, "memory(GiB)": 21.48, "step": 20805, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.957082 }, { "epoch": 0.6758925380892051, "grad_norm": 0.33159396052360535, "learning_rate": 2.6074310565266396e-06, "loss": 0.019888373091816902, "memory(GiB)": 21.48, "step": 20806, "token_acc": 1.0, "train_speed(iter/s)": 0.957089 }, { "epoch": 0.6759250235519605, "grad_norm": 0.4335756301879883, "learning_rate": 2.606959407064721e-06, "loss": 0.013087553903460503, "memory(GiB)": 21.48, "step": 20807, "token_acc": 1.0, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.675957509014716, "grad_norm": 0.31755316257476807, "learning_rate": 2.606487785221674e-06, "loss": 0.01540398970246315, "memory(GiB)": 21.48, "step": 20808, "token_acc": 1.0, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.6759899944774713, "grad_norm": 0.24296146631240845, "learning_rate": 2.6060161910029435e-06, "loss": 0.011828135699033737, "memory(GiB)": 21.48, "step": 20809, "token_acc": 1.0, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.6760224799402268, "grad_norm": 0.30264538526535034, "learning_rate": 2.6055446244139733e-06, "loss": 0.015215534716844559, "memory(GiB)": 21.48, "step": 20810, "token_acc": 1.0, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.6760549654029822, "grad_norm": 0.24685831367969513, "learning_rate": 2.6050730854602065e-06, "loss": 0.011302735656499863, "memory(GiB)": 21.48, "step": 20811, "token_acc": 1.0, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.6760874508657376, "grad_norm": 0.27580296993255615, "learning_rate": 2.604601574147082e-06, "loss": 0.01703643426299095, "memory(GiB)": 21.48, "step": 20812, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.676119936328493, "grad_norm": 0.3746171295642853, "learning_rate": 2.604130090480044e-06, "loss": 0.010672936215996742, "memory(GiB)": 21.48, "step": 20813, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.6761524217912485, "grad_norm": 0.4413865804672241, "learning_rate": 2.603658634464533e-06, "loss": 0.020006947219371796, "memory(GiB)": 21.48, "step": 20814, "token_acc": 1.0, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.6761849072540038, "grad_norm": 0.2256595641374588, "learning_rate": 2.6031872061059925e-06, "loss": 0.011749068275094032, "memory(GiB)": 21.48, "step": 20815, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957158 }, { "epoch": 0.6762173927167593, "grad_norm": 0.2927994728088379, "learning_rate": 2.602715805409861e-06, "loss": 0.014616236090660095, "memory(GiB)": 21.48, "step": 20816, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.6762498781795147, "grad_norm": 0.44288307428359985, "learning_rate": 2.602244432381581e-06, "loss": 0.01791217550635338, "memory(GiB)": 21.48, "step": 20817, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.6762823636422701, "grad_norm": 0.4927232563495636, "learning_rate": 2.6017730870265885e-06, "loss": 0.014408580958843231, "memory(GiB)": 21.48, "step": 20818, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957187 }, { "epoch": 0.6763148491050255, "grad_norm": 0.5660274028778076, "learning_rate": 2.6013017693503294e-06, "loss": 0.017925161868333817, "memory(GiB)": 21.48, "step": 20819, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957197 }, { "epoch": 0.676347334567781, "grad_norm": 0.3808997571468353, "learning_rate": 2.6008304793582393e-06, "loss": 0.016727736219763756, "memory(GiB)": 21.48, "step": 20820, "token_acc": 1.0, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.6763798200305363, "grad_norm": 0.48676013946533203, "learning_rate": 2.600359217055759e-06, "loss": 0.02138594724237919, "memory(GiB)": 21.48, "step": 20821, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.6764123054932918, "grad_norm": 0.5219740271568298, "learning_rate": 2.599887982448329e-06, "loss": 0.016382113099098206, "memory(GiB)": 21.48, "step": 20822, "token_acc": 1.0, "train_speed(iter/s)": 0.957225 }, { "epoch": 0.6764447909560471, "grad_norm": 0.4944714307785034, "learning_rate": 2.5994167755413824e-06, "loss": 0.015017686411738396, "memory(GiB)": 21.48, "step": 20823, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.6764772764188026, "grad_norm": 0.6478748321533203, "learning_rate": 2.5989455963403654e-06, "loss": 0.018074829131364822, "memory(GiB)": 21.48, "step": 20824, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.676509761881558, "grad_norm": 0.43319132924079895, "learning_rate": 2.5984744448507093e-06, "loss": 0.02388271689414978, "memory(GiB)": 21.48, "step": 20825, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.957253 }, { "epoch": 0.6765422473443135, "grad_norm": 0.25078293681144714, "learning_rate": 2.598003321077857e-06, "loss": 0.01477348618209362, "memory(GiB)": 21.48, "step": 20826, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.6765747328070688, "grad_norm": 0.32058364152908325, "learning_rate": 2.597532225027242e-06, "loss": 0.012600354850292206, "memory(GiB)": 21.48, "step": 20827, "token_acc": 1.0, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.6766072182698243, "grad_norm": 0.47258448600769043, "learning_rate": 2.5970611567043013e-06, "loss": 0.020275764167308807, "memory(GiB)": 21.48, "step": 20828, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.6766397037325796, "grad_norm": 0.33078014850616455, "learning_rate": 2.596590116114475e-06, "loss": 0.012473054230213165, "memory(GiB)": 21.48, "step": 20829, "token_acc": 1.0, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.6766721891953351, "grad_norm": 0.3466663062572479, "learning_rate": 2.596119103263198e-06, "loss": 0.021757155656814575, "memory(GiB)": 21.48, "step": 20830, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.6767046746580905, "grad_norm": 0.34723663330078125, "learning_rate": 2.595648118155905e-06, "loss": 0.015240458771586418, "memory(GiB)": 21.48, "step": 20831, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.676737160120846, "grad_norm": 0.2902815639972687, "learning_rate": 2.5951771607980333e-06, "loss": 0.017000509425997734, "memory(GiB)": 21.48, "step": 20832, "token_acc": 1.0, "train_speed(iter/s)": 0.957321 }, { "epoch": 0.6767696455836013, "grad_norm": 0.39117538928985596, "learning_rate": 2.5947062311950172e-06, "loss": 0.01988997310400009, "memory(GiB)": 21.48, "step": 20833, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957329 }, { "epoch": 0.6768021310463568, "grad_norm": 0.26637372374534607, "learning_rate": 2.5942353293522948e-06, "loss": 0.011630856432020664, "memory(GiB)": 21.48, "step": 20834, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.6768346165091121, "grad_norm": 0.4640161991119385, "learning_rate": 2.593764455275297e-06, "loss": 0.018966950476169586, "memory(GiB)": 21.48, "step": 20835, "token_acc": 1.0, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.6768671019718676, "grad_norm": 0.36217668652534485, "learning_rate": 2.59329360896946e-06, "loss": 0.015022177249193192, "memory(GiB)": 21.48, "step": 20836, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957351 }, { "epoch": 0.676899587434623, "grad_norm": 0.37213367223739624, "learning_rate": 2.592822790440218e-06, "loss": 0.01239427924156189, "memory(GiB)": 21.48, "step": 20837, "token_acc": 1.0, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.6769320728973784, "grad_norm": 0.2925301492214203, "learning_rate": 2.5923519996930047e-06, "loss": 0.00905689038336277, "memory(GiB)": 21.48, "step": 20838, "token_acc": 1.0, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.6769645583601338, "grad_norm": 0.3341776132583618, "learning_rate": 2.5918812367332563e-06, "loss": 0.010980343446135521, "memory(GiB)": 21.48, "step": 20839, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.6769970438228893, "grad_norm": 0.41147491335868835, "learning_rate": 2.5914105015664014e-06, "loss": 0.017797226086258888, "memory(GiB)": 21.48, "step": 20840, "token_acc": 1.0, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.6770295292856446, "grad_norm": 0.3968105912208557, "learning_rate": 2.590939794197875e-06, "loss": 0.014347873628139496, "memory(GiB)": 21.48, "step": 20841, "token_acc": 1.0, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.6770620147484001, "grad_norm": 0.1856898069381714, "learning_rate": 2.59046911463311e-06, "loss": 0.010726682841777802, "memory(GiB)": 21.48, "step": 20842, "token_acc": 1.0, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.6770945002111555, "grad_norm": 0.3353378474712372, "learning_rate": 2.589998462877541e-06, "loss": 0.012306800112128258, "memory(GiB)": 21.48, "step": 20843, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.6771269856739109, "grad_norm": 0.3764866888523102, "learning_rate": 2.5895278389365947e-06, "loss": 0.01677466370165348, "memory(GiB)": 21.48, "step": 20844, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95741 }, { "epoch": 0.6771594711366663, "grad_norm": 0.30213436484336853, "learning_rate": 2.589057242815708e-06, "loss": 0.015634480863809586, "memory(GiB)": 21.48, "step": 20845, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.6771919565994218, "grad_norm": 0.39899659156799316, "learning_rate": 2.5885866745203055e-06, "loss": 0.017816776409745216, "memory(GiB)": 21.48, "step": 20846, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.6772244420621771, "grad_norm": 0.41936153173446655, "learning_rate": 2.588116134055826e-06, "loss": 0.011669142171740532, "memory(GiB)": 21.48, "step": 20847, "token_acc": 1.0, "train_speed(iter/s)": 0.957433 }, { "epoch": 0.6772569275249326, "grad_norm": 0.3086291551589966, "learning_rate": 2.5876456214276947e-06, "loss": 0.018628491088747978, "memory(GiB)": 21.48, "step": 20848, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.677289412987688, "grad_norm": 0.37133851647377014, "learning_rate": 2.5871751366413454e-06, "loss": 0.015880437567830086, "memory(GiB)": 21.48, "step": 20849, "token_acc": 1.0, "train_speed(iter/s)": 0.957448 }, { "epoch": 0.6773218984504434, "grad_norm": 0.3311917781829834, "learning_rate": 2.5867046797022054e-06, "loss": 0.015858473256230354, "memory(GiB)": 21.48, "step": 20850, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.6773543839131988, "grad_norm": 0.3080257177352905, "learning_rate": 2.5862342506157034e-06, "loss": 0.01737361028790474, "memory(GiB)": 21.48, "step": 20851, "token_acc": 1.0, "train_speed(iter/s)": 0.957462 }, { "epoch": 0.6773868693759543, "grad_norm": 0.3651290833950043, "learning_rate": 2.585763849387274e-06, "loss": 0.016469571739435196, "memory(GiB)": 21.48, "step": 20852, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.6774193548387096, "grad_norm": 0.41733554005622864, "learning_rate": 2.585293476022342e-06, "loss": 0.015065686777234077, "memory(GiB)": 21.48, "step": 20853, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957474 }, { "epoch": 0.6774518403014651, "grad_norm": 0.2740877866744995, "learning_rate": 2.5848231305263383e-06, "loss": 0.013822752982378006, "memory(GiB)": 21.48, "step": 20854, "token_acc": 1.0, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.6774843257642205, "grad_norm": 0.4008028507232666, "learning_rate": 2.584352812904689e-06, "loss": 0.01692618802189827, "memory(GiB)": 21.48, "step": 20855, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.6775168112269759, "grad_norm": 0.3287210762500763, "learning_rate": 2.583882523162824e-06, "loss": 0.014817764982581139, "memory(GiB)": 21.48, "step": 20856, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.6775492966897313, "grad_norm": 0.31669050455093384, "learning_rate": 2.58341226130617e-06, "loss": 0.01090292539447546, "memory(GiB)": 21.48, "step": 20857, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.6775817821524868, "grad_norm": 0.334440678358078, "learning_rate": 2.582942027340156e-06, "loss": 0.015105769038200378, "memory(GiB)": 21.48, "step": 20858, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957508 }, { "epoch": 0.6776142676152421, "grad_norm": 0.22701400518417358, "learning_rate": 2.5824718212702073e-06, "loss": 0.012586653232574463, "memory(GiB)": 21.48, "step": 20859, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.6776467530779976, "grad_norm": 0.30831223726272583, "learning_rate": 2.582001643101751e-06, "loss": 0.01237622182816267, "memory(GiB)": 21.48, "step": 20860, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.677679238540753, "grad_norm": 0.2588959336280823, "learning_rate": 2.581531492840214e-06, "loss": 0.01324586570262909, "memory(GiB)": 21.48, "step": 20861, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.6777117240035084, "grad_norm": 0.24965277314186096, "learning_rate": 2.581061370491025e-06, "loss": 0.014067800715565681, "memory(GiB)": 21.48, "step": 20862, "token_acc": 1.0, "train_speed(iter/s)": 0.957537 }, { "epoch": 0.6777442094662638, "grad_norm": 0.4591444134712219, "learning_rate": 2.5805912760596053e-06, "loss": 0.014474808238446712, "memory(GiB)": 21.48, "step": 20863, "token_acc": 0.996, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.6777766949290193, "grad_norm": 0.3019132912158966, "learning_rate": 2.580121209551382e-06, "loss": 0.01565469615161419, "memory(GiB)": 21.48, "step": 20864, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.6778091803917747, "grad_norm": 0.283448725938797, "learning_rate": 2.5796511709717807e-06, "loss": 0.01710100844502449, "memory(GiB)": 21.48, "step": 20865, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.6778416658545301, "grad_norm": 0.3666585683822632, "learning_rate": 2.579181160326227e-06, "loss": 0.016606343910098076, "memory(GiB)": 21.48, "step": 20866, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.6778741513172856, "grad_norm": 0.39981821179389954, "learning_rate": 2.578711177620147e-06, "loss": 0.018326520919799805, "memory(GiB)": 21.48, "step": 20867, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.6779066367800409, "grad_norm": 0.34267204999923706, "learning_rate": 2.5782412228589604e-06, "loss": 0.013339154422283173, "memory(GiB)": 21.48, "step": 20868, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957581 }, { "epoch": 0.6779391222427964, "grad_norm": 0.7188090085983276, "learning_rate": 2.5777712960480926e-06, "loss": 0.01989346742630005, "memory(GiB)": 21.48, "step": 20869, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.6779716077055518, "grad_norm": 0.24846529960632324, "learning_rate": 2.5773013971929695e-06, "loss": 0.008231748826801777, "memory(GiB)": 21.48, "step": 20870, "token_acc": 1.0, "train_speed(iter/s)": 0.957596 }, { "epoch": 0.6780040931683072, "grad_norm": 0.23752383887767792, "learning_rate": 2.576831526299014e-06, "loss": 0.011886757798492908, "memory(GiB)": 21.48, "step": 20871, "token_acc": 1.0, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.6780365786310626, "grad_norm": 0.36685287952423096, "learning_rate": 2.576361683371646e-06, "loss": 0.0116872638463974, "memory(GiB)": 21.48, "step": 20872, "token_acc": 1.0, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.6780690640938181, "grad_norm": 0.35963186621665955, "learning_rate": 2.575891868416292e-06, "loss": 0.017796283587813377, "memory(GiB)": 21.48, "step": 20873, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957612 }, { "epoch": 0.6781015495565734, "grad_norm": 0.40676167607307434, "learning_rate": 2.575422081438369e-06, "loss": 0.01375013217329979, "memory(GiB)": 21.48, "step": 20874, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.6781340350193289, "grad_norm": 0.27740415930747986, "learning_rate": 2.5749523224433052e-06, "loss": 0.010907603427767754, "memory(GiB)": 21.48, "step": 20875, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.6781665204820843, "grad_norm": 0.36977607011795044, "learning_rate": 2.5744825914365177e-06, "loss": 0.012064788490533829, "memory(GiB)": 21.48, "step": 20876, "token_acc": 1.0, "train_speed(iter/s)": 0.957641 }, { "epoch": 0.6781990059448397, "grad_norm": 0.36319583654403687, "learning_rate": 2.5740128884234318e-06, "loss": 0.011479459702968597, "memory(GiB)": 21.48, "step": 20877, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.6782314914075951, "grad_norm": 0.4075362980365753, "learning_rate": 2.573543213409464e-06, "loss": 0.018197502940893173, "memory(GiB)": 21.48, "step": 20878, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.957658 }, { "epoch": 0.6782639768703506, "grad_norm": 0.28638672828674316, "learning_rate": 2.573073566400037e-06, "loss": 0.011695762164890766, "memory(GiB)": 21.48, "step": 20879, "token_acc": 1.0, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.6782964623331059, "grad_norm": 0.3157041370868683, "learning_rate": 2.5726039474005714e-06, "loss": 0.013083446770906448, "memory(GiB)": 21.48, "step": 20880, "token_acc": 1.0, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.6783289477958614, "grad_norm": 0.389869749546051, "learning_rate": 2.572134356416487e-06, "loss": 0.017578978091478348, "memory(GiB)": 21.48, "step": 20881, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.957687 }, { "epoch": 0.6783614332586168, "grad_norm": 1.4903336763381958, "learning_rate": 2.5716647934532058e-06, "loss": 0.01497555524110794, "memory(GiB)": 21.48, "step": 20882, "token_acc": 1.0, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.6783939187213722, "grad_norm": 0.6761859059333801, "learning_rate": 2.5711952585161425e-06, "loss": 0.01851666532456875, "memory(GiB)": 21.48, "step": 20883, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.6784264041841276, "grad_norm": 0.3385950028896332, "learning_rate": 2.57072575161072e-06, "loss": 0.009135860949754715, "memory(GiB)": 21.48, "step": 20884, "token_acc": 1.0, "train_speed(iter/s)": 0.957715 }, { "epoch": 0.6784588896468831, "grad_norm": 0.3603150546550751, "learning_rate": 2.570256272742354e-06, "loss": 0.014026573859155178, "memory(GiB)": 21.48, "step": 20885, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.6784913751096384, "grad_norm": 0.3889702558517456, "learning_rate": 2.5697868219164675e-06, "loss": 0.011931119486689568, "memory(GiB)": 21.48, "step": 20886, "token_acc": 1.0, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.6785238605723939, "grad_norm": 0.44331368803977966, "learning_rate": 2.5693173991384736e-06, "loss": 0.021611498668789864, "memory(GiB)": 21.48, "step": 20887, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.6785563460351492, "grad_norm": 0.3151295483112335, "learning_rate": 2.568848004413792e-06, "loss": 0.009138775989413261, "memory(GiB)": 21.48, "step": 20888, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957753 }, { "epoch": 0.6785888314979047, "grad_norm": 0.30171653628349304, "learning_rate": 2.5683786377478405e-06, "loss": 0.017688866704702377, "memory(GiB)": 21.48, "step": 20889, "token_acc": 1.0, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.6786213169606601, "grad_norm": 0.4575181007385254, "learning_rate": 2.5679092991460373e-06, "loss": 0.017986543476581573, "memory(GiB)": 21.48, "step": 20890, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957771 }, { "epoch": 0.6786538024234156, "grad_norm": 0.3478568196296692, "learning_rate": 2.567439988613797e-06, "loss": 0.016455911099910736, "memory(GiB)": 21.48, "step": 20891, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.6786862878861709, "grad_norm": 0.3729816973209381, "learning_rate": 2.566970706156536e-06, "loss": 0.015263648703694344, "memory(GiB)": 21.48, "step": 20892, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.6787187733489264, "grad_norm": 0.39524149894714355, "learning_rate": 2.5665014517796716e-06, "loss": 0.017023678869009018, "memory(GiB)": 21.48, "step": 20893, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.9578 }, { "epoch": 0.6787512588116817, "grad_norm": 0.9145756959915161, "learning_rate": 2.5660322254886215e-06, "loss": 0.02978120557963848, "memory(GiB)": 21.48, "step": 20894, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95781 }, { "epoch": 0.6787837442744372, "grad_norm": 0.20185133814811707, "learning_rate": 2.565563027288798e-06, "loss": 0.010427610948681831, "memory(GiB)": 21.48, "step": 20895, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.6788162297371926, "grad_norm": 0.5813530683517456, "learning_rate": 2.565093857185618e-06, "loss": 0.019465569406747818, "memory(GiB)": 21.48, "step": 20896, "token_acc": 1.0, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.678848715199948, "grad_norm": 0.40232759714126587, "learning_rate": 2.564624715184492e-06, "loss": 0.01361842267215252, "memory(GiB)": 21.48, "step": 20897, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957835 }, { "epoch": 0.6788812006627034, "grad_norm": 0.32942962646484375, "learning_rate": 2.5641556012908408e-06, "loss": 0.01355232298374176, "memory(GiB)": 21.48, "step": 20898, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957842 }, { "epoch": 0.6789136861254589, "grad_norm": 0.432231605052948, "learning_rate": 2.5636865155100777e-06, "loss": 0.013960577547550201, "memory(GiB)": 21.48, "step": 20899, "token_acc": 0.982078853046595, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.6789461715882142, "grad_norm": 0.6042799353599548, "learning_rate": 2.563217457847613e-06, "loss": 0.02047840505838394, "memory(GiB)": 21.48, "step": 20900, "token_acc": 1.0, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.6789786570509697, "grad_norm": 0.4462091028690338, "learning_rate": 2.5627484283088646e-06, "loss": 0.02492474392056465, "memory(GiB)": 21.48, "step": 20901, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957865 }, { "epoch": 0.6790111425137251, "grad_norm": 0.29217663407325745, "learning_rate": 2.562279426899239e-06, "loss": 0.012817292474210262, "memory(GiB)": 21.48, "step": 20902, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.6790436279764805, "grad_norm": 0.3333451449871063, "learning_rate": 2.5618104536241573e-06, "loss": 0.0162824597209692, "memory(GiB)": 21.48, "step": 20903, "token_acc": 1.0, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.6790761134392359, "grad_norm": 0.2994524836540222, "learning_rate": 2.5613415084890266e-06, "loss": 0.011417388916015625, "memory(GiB)": 21.48, "step": 20904, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957887 }, { "epoch": 0.6791085989019914, "grad_norm": 0.37481677532196045, "learning_rate": 2.5608725914992625e-06, "loss": 0.013838821090757847, "memory(GiB)": 21.48, "step": 20905, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957894 }, { "epoch": 0.6791410843647467, "grad_norm": 0.30549556016921997, "learning_rate": 2.560403702660273e-06, "loss": 0.018147720023989677, "memory(GiB)": 21.48, "step": 20906, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.9579 }, { "epoch": 0.6791735698275022, "grad_norm": 0.5481878519058228, "learning_rate": 2.5599348419774717e-06, "loss": 0.01392629835754633, "memory(GiB)": 21.48, "step": 20907, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.6792060552902576, "grad_norm": 0.5026309490203857, "learning_rate": 2.5594660094562705e-06, "loss": 0.020368941128253937, "memory(GiB)": 21.48, "step": 20908, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.679238540753013, "grad_norm": 0.2934712767601013, "learning_rate": 2.5589972051020818e-06, "loss": 0.011323612183332443, "memory(GiB)": 21.48, "step": 20909, "token_acc": 1.0, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.6792710262157684, "grad_norm": 0.3895910680294037, "learning_rate": 2.5585284289203122e-06, "loss": 0.016636313870549202, "memory(GiB)": 21.48, "step": 20910, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.6793035116785239, "grad_norm": 0.5035631656646729, "learning_rate": 2.558059680916374e-06, "loss": 0.02232057973742485, "memory(GiB)": 21.48, "step": 20911, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957937 }, { "epoch": 0.6793359971412792, "grad_norm": 0.4094603657722473, "learning_rate": 2.5575909610956775e-06, "loss": 0.011518033221364021, "memory(GiB)": 21.48, "step": 20912, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.6793684826040347, "grad_norm": 0.3213754892349243, "learning_rate": 2.557122269463632e-06, "loss": 0.01961604505777359, "memory(GiB)": 21.48, "step": 20913, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957951 }, { "epoch": 0.6794009680667901, "grad_norm": 0.9123450517654419, "learning_rate": 2.55665360602565e-06, "loss": 0.015040314756333828, "memory(GiB)": 21.48, "step": 20914, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.6794334535295455, "grad_norm": 0.3535952568054199, "learning_rate": 2.556184970787134e-06, "loss": 0.016742637380957603, "memory(GiB)": 21.48, "step": 20915, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.6794659389923009, "grad_norm": 0.520068883895874, "learning_rate": 2.5557163637534965e-06, "loss": 0.014049161225557327, "memory(GiB)": 21.48, "step": 20916, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.6794984244550564, "grad_norm": 0.3657364547252655, "learning_rate": 2.555247784930146e-06, "loss": 0.017828425392508507, "memory(GiB)": 21.48, "step": 20917, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.6795309099178117, "grad_norm": 0.35180795192718506, "learning_rate": 2.554779234322491e-06, "loss": 0.012392774224281311, "memory(GiB)": 21.48, "step": 20918, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.6795633953805672, "grad_norm": 0.47419729828834534, "learning_rate": 2.5543107119359374e-06, "loss": 0.018656138330698013, "memory(GiB)": 21.48, "step": 20919, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.6795958808433226, "grad_norm": 0.3524180054664612, "learning_rate": 2.5538422177758925e-06, "loss": 0.015880361199378967, "memory(GiB)": 21.48, "step": 20920, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.679628366306078, "grad_norm": 0.3455435335636139, "learning_rate": 2.5533737518477654e-06, "loss": 0.015255022794008255, "memory(GiB)": 21.48, "step": 20921, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.958 }, { "epoch": 0.6796608517688334, "grad_norm": 0.3556746542453766, "learning_rate": 2.5529053141569626e-06, "loss": 0.014572888612747192, "memory(GiB)": 21.48, "step": 20922, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.6796933372315889, "grad_norm": 0.22209975123405457, "learning_rate": 2.552436904708888e-06, "loss": 0.010360131040215492, "memory(GiB)": 21.48, "step": 20923, "token_acc": 1.0, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.6797258226943442, "grad_norm": 0.32784271240234375, "learning_rate": 2.551968523508951e-06, "loss": 0.01738797500729561, "memory(GiB)": 21.48, "step": 20924, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.6797583081570997, "grad_norm": 0.47750869393348694, "learning_rate": 2.551500170562552e-06, "loss": 0.011422952637076378, "memory(GiB)": 21.48, "step": 20925, "token_acc": 1.0, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.6797907936198551, "grad_norm": 0.23558393120765686, "learning_rate": 2.551031845875103e-06, "loss": 0.009886065497994423, "memory(GiB)": 21.48, "step": 20926, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.958036 }, { "epoch": 0.6798232790826105, "grad_norm": 0.29793500900268555, "learning_rate": 2.5505635494520047e-06, "loss": 0.013540327548980713, "memory(GiB)": 21.48, "step": 20927, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958044 }, { "epoch": 0.679855764545366, "grad_norm": 0.3827648162841797, "learning_rate": 2.5500952812986635e-06, "loss": 0.015149095095694065, "memory(GiB)": 21.48, "step": 20928, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.6798882500081214, "grad_norm": 0.30222055315971375, "learning_rate": 2.549627041420485e-06, "loss": 0.013989560306072235, "memory(GiB)": 21.48, "step": 20929, "token_acc": 1.0, "train_speed(iter/s)": 0.958059 }, { "epoch": 0.6799207354708768, "grad_norm": 0.4365428388118744, "learning_rate": 2.5491588298228685e-06, "loss": 0.02019212208688259, "memory(GiB)": 21.48, "step": 20930, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.6799532209336322, "grad_norm": 0.43055856227874756, "learning_rate": 2.548690646511225e-06, "loss": 0.013641776517033577, "memory(GiB)": 21.48, "step": 20931, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.6799857063963877, "grad_norm": 0.4374156892299652, "learning_rate": 2.5482224914909513e-06, "loss": 0.017601480707526207, "memory(GiB)": 21.48, "step": 20932, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958082 }, { "epoch": 0.680018191859143, "grad_norm": 0.353362113237381, "learning_rate": 2.5477543647674564e-06, "loss": 0.013847526162862778, "memory(GiB)": 21.48, "step": 20933, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958092 }, { "epoch": 0.6800506773218985, "grad_norm": 0.4016411304473877, "learning_rate": 2.547286266346137e-06, "loss": 0.012469527311623096, "memory(GiB)": 21.48, "step": 20934, "token_acc": 1.0, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.6800831627846539, "grad_norm": 0.3377184271812439, "learning_rate": 2.5468181962324003e-06, "loss": 0.013101032003760338, "memory(GiB)": 21.48, "step": 20935, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.958111 }, { "epoch": 0.6801156482474093, "grad_norm": 0.44685661792755127, "learning_rate": 2.5463501544316456e-06, "loss": 0.018166057765483856, "memory(GiB)": 21.48, "step": 20936, "token_acc": 0.984375, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.6801481337101647, "grad_norm": 0.34406572580337524, "learning_rate": 2.545882140949278e-06, "loss": 0.010395125485956669, "memory(GiB)": 21.48, "step": 20937, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.6801806191729202, "grad_norm": 1.5706909894943237, "learning_rate": 2.545414155790695e-06, "loss": 0.014053957536816597, "memory(GiB)": 21.48, "step": 20938, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958141 }, { "epoch": 0.6802131046356755, "grad_norm": 0.32669466733932495, "learning_rate": 2.5449461989613e-06, "loss": 0.016953052952885628, "memory(GiB)": 21.48, "step": 20939, "token_acc": 1.0, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.680245590098431, "grad_norm": 0.3037722706794739, "learning_rate": 2.544478270466493e-06, "loss": 0.01190782431513071, "memory(GiB)": 21.48, "step": 20940, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95816 }, { "epoch": 0.6802780755611864, "grad_norm": 0.28702887892723083, "learning_rate": 2.544010370311677e-06, "loss": 0.016048183664679527, "memory(GiB)": 21.48, "step": 20941, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.958169 }, { "epoch": 0.6803105610239418, "grad_norm": 0.3753296732902527, "learning_rate": 2.5435424985022487e-06, "loss": 0.014081943780183792, "memory(GiB)": 21.48, "step": 20942, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.6803430464866972, "grad_norm": 0.3628610670566559, "learning_rate": 2.54307465504361e-06, "loss": 0.012010492384433746, "memory(GiB)": 21.48, "step": 20943, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.6803755319494527, "grad_norm": 0.2743305563926697, "learning_rate": 2.542606839941159e-06, "loss": 0.010402540676295757, "memory(GiB)": 21.48, "step": 20944, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958198 }, { "epoch": 0.680408017412208, "grad_norm": 0.3435138463973999, "learning_rate": 2.5421390532002965e-06, "loss": 0.013718357309699059, "memory(GiB)": 21.48, "step": 20945, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.6804405028749635, "grad_norm": 0.8683347105979919, "learning_rate": 2.541671294826422e-06, "loss": 0.02890239842236042, "memory(GiB)": 21.48, "step": 20946, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.958218 }, { "epoch": 0.6804729883377189, "grad_norm": 0.38532015681266785, "learning_rate": 2.541203564824931e-06, "loss": 0.019988011568784714, "memory(GiB)": 21.48, "step": 20947, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.6805054738004743, "grad_norm": 0.3097173869609833, "learning_rate": 2.5407358632012235e-06, "loss": 0.017347298562526703, "memory(GiB)": 21.48, "step": 20948, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958236 }, { "epoch": 0.6805379592632297, "grad_norm": 0.9403477311134338, "learning_rate": 2.5402681899606977e-06, "loss": 0.022883012890815735, "memory(GiB)": 21.48, "step": 20949, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.6805704447259852, "grad_norm": 0.43566253781318665, "learning_rate": 2.5398005451087527e-06, "loss": 0.01661103591322899, "memory(GiB)": 21.48, "step": 20950, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958255 }, { "epoch": 0.6806029301887405, "grad_norm": 0.32553088665008545, "learning_rate": 2.539332928650782e-06, "loss": 0.01129882875829935, "memory(GiB)": 21.48, "step": 20951, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958264 }, { "epoch": 0.680635415651496, "grad_norm": 0.27827370166778564, "learning_rate": 2.538865340592187e-06, "loss": 0.01048472709953785, "memory(GiB)": 21.48, "step": 20952, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.6806679011142514, "grad_norm": 0.34646525979042053, "learning_rate": 2.5383977809383568e-06, "loss": 0.016841541975736618, "memory(GiB)": 21.48, "step": 20953, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958284 }, { "epoch": 0.6807003865770068, "grad_norm": 0.31771722435951233, "learning_rate": 2.5379302496946973e-06, "loss": 0.01119659747928381, "memory(GiB)": 21.48, "step": 20954, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.958293 }, { "epoch": 0.6807328720397622, "grad_norm": 0.44608235359191895, "learning_rate": 2.537462746866597e-06, "loss": 0.02126314491033554, "memory(GiB)": 21.48, "step": 20955, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.6807653575025177, "grad_norm": 0.25578051805496216, "learning_rate": 2.536995272459457e-06, "loss": 0.01004730723798275, "memory(GiB)": 21.48, "step": 20956, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.680797842965273, "grad_norm": 0.44843974709510803, "learning_rate": 2.5365278264786673e-06, "loss": 0.016282716765999794, "memory(GiB)": 21.48, "step": 20957, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958322 }, { "epoch": 0.6808303284280285, "grad_norm": 0.31727346777915955, "learning_rate": 2.536060408929626e-06, "loss": 0.015568308532238007, "memory(GiB)": 21.48, "step": 20958, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.6808628138907838, "grad_norm": 0.34927308559417725, "learning_rate": 2.5355930198177257e-06, "loss": 0.019828535616397858, "memory(GiB)": 21.48, "step": 20959, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958337 }, { "epoch": 0.6808952993535393, "grad_norm": 0.38900062441825867, "learning_rate": 2.535125659148363e-06, "loss": 0.013423864729702473, "memory(GiB)": 21.48, "step": 20960, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958344 }, { "epoch": 0.6809277848162947, "grad_norm": 0.3052906394004822, "learning_rate": 2.534658326926933e-06, "loss": 0.012325972318649292, "memory(GiB)": 21.48, "step": 20961, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.6809602702790502, "grad_norm": 0.4197990894317627, "learning_rate": 2.534191023158825e-06, "loss": 0.02403916046023369, "memory(GiB)": 21.48, "step": 20962, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.958359 }, { "epoch": 0.6809927557418055, "grad_norm": 0.8689160943031311, "learning_rate": 2.533723747849435e-06, "loss": 0.021435653790831566, "memory(GiB)": 21.48, "step": 20963, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.958367 }, { "epoch": 0.681025241204561, "grad_norm": 0.4005703926086426, "learning_rate": 2.5332565010041556e-06, "loss": 0.01447855494916439, "memory(GiB)": 21.48, "step": 20964, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.6810577266673163, "grad_norm": 0.578376054763794, "learning_rate": 2.5327892826283807e-06, "loss": 0.02407994121313095, "memory(GiB)": 21.48, "step": 20965, "token_acc": 1.0, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.6810902121300718, "grad_norm": 0.27748754620552063, "learning_rate": 2.5323220927275e-06, "loss": 0.013352012261748314, "memory(GiB)": 21.48, "step": 20966, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.6811226975928272, "grad_norm": 0.3258957862854004, "learning_rate": 2.5318549313069062e-06, "loss": 0.01752900518476963, "memory(GiB)": 21.48, "step": 20967, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.958396 }, { "epoch": 0.6811551830555826, "grad_norm": 0.25545451045036316, "learning_rate": 2.5313877983719927e-06, "loss": 0.012334113009274006, "memory(GiB)": 21.48, "step": 20968, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.958403 }, { "epoch": 0.681187668518338, "grad_norm": 0.3051442503929138, "learning_rate": 2.5309206939281505e-06, "loss": 0.0138269467279315, "memory(GiB)": 21.48, "step": 20969, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95841 }, { "epoch": 0.6812201539810935, "grad_norm": 0.26064905524253845, "learning_rate": 2.530453617980768e-06, "loss": 0.01719408854842186, "memory(GiB)": 21.48, "step": 20970, "token_acc": 1.0, "train_speed(iter/s)": 0.958418 }, { "epoch": 0.6812526394438488, "grad_norm": 0.3183566927909851, "learning_rate": 2.529986570535238e-06, "loss": 0.013704630546271801, "memory(GiB)": 21.48, "step": 20971, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.958426 }, { "epoch": 0.6812851249066043, "grad_norm": 0.36681678891181946, "learning_rate": 2.5295195515969507e-06, "loss": 0.013549016788601875, "memory(GiB)": 21.48, "step": 20972, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958432 }, { "epoch": 0.6813176103693597, "grad_norm": 0.22505906224250793, "learning_rate": 2.529052561171298e-06, "loss": 0.008989734575152397, "memory(GiB)": 21.48, "step": 20973, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958438 }, { "epoch": 0.6813500958321151, "grad_norm": 0.2515318691730499, "learning_rate": 2.5285855992636655e-06, "loss": 0.007958618924021721, "memory(GiB)": 21.48, "step": 20974, "token_acc": 1.0, "train_speed(iter/s)": 0.958445 }, { "epoch": 0.6813825812948705, "grad_norm": 0.2408166527748108, "learning_rate": 2.5281186658794445e-06, "loss": 0.010624095797538757, "memory(GiB)": 21.48, "step": 20975, "token_acc": 1.0, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.681415066757626, "grad_norm": 0.3033265471458435, "learning_rate": 2.5276517610240237e-06, "loss": 0.012473026290535927, "memory(GiB)": 21.48, "step": 20976, "token_acc": 1.0, "train_speed(iter/s)": 0.958458 }, { "epoch": 0.6814475522203813, "grad_norm": 0.29244959354400635, "learning_rate": 2.527184884702793e-06, "loss": 0.013012378476560116, "memory(GiB)": 21.48, "step": 20977, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958463 }, { "epoch": 0.6814800376831368, "grad_norm": 0.2852563261985779, "learning_rate": 2.526718036921142e-06, "loss": 0.011484230868518353, "memory(GiB)": 21.48, "step": 20978, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958469 }, { "epoch": 0.6815125231458922, "grad_norm": 0.313131719827652, "learning_rate": 2.5262512176844545e-06, "loss": 0.0117667056620121, "memory(GiB)": 21.48, "step": 20979, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958475 }, { "epoch": 0.6815450086086476, "grad_norm": 0.4541464149951935, "learning_rate": 2.5257844269981225e-06, "loss": 0.013851013034582138, "memory(GiB)": 21.48, "step": 20980, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.958481 }, { "epoch": 0.681577494071403, "grad_norm": 0.6248283386230469, "learning_rate": 2.5253176648675277e-06, "loss": 0.02211587503552437, "memory(GiB)": 21.48, "step": 20981, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.958488 }, { "epoch": 0.6816099795341585, "grad_norm": 0.4183723032474518, "learning_rate": 2.524850931298064e-06, "loss": 0.011062707751989365, "memory(GiB)": 21.48, "step": 20982, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958495 }, { "epoch": 0.6816424649969138, "grad_norm": 0.4080789089202881, "learning_rate": 2.524384226295114e-06, "loss": 0.017332127317786217, "memory(GiB)": 21.48, "step": 20983, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958502 }, { "epoch": 0.6816749504596693, "grad_norm": 0.3820972442626953, "learning_rate": 2.5239175498640665e-06, "loss": 0.016541380435228348, "memory(GiB)": 21.48, "step": 20984, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.6817074359224247, "grad_norm": 0.3289481997489929, "learning_rate": 2.523450902010304e-06, "loss": 0.015684114769101143, "memory(GiB)": 21.48, "step": 20985, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.6817399213851801, "grad_norm": 0.3158315122127533, "learning_rate": 2.5229842827392147e-06, "loss": 0.018003258854150772, "memory(GiB)": 21.48, "step": 20986, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.958523 }, { "epoch": 0.6817724068479355, "grad_norm": 0.43941739201545715, "learning_rate": 2.522517692056184e-06, "loss": 0.024148479104042053, "memory(GiB)": 21.48, "step": 20987, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958531 }, { "epoch": 0.681804892310691, "grad_norm": 0.32443997263908386, "learning_rate": 2.5220511299665985e-06, "loss": 0.013470055535435677, "memory(GiB)": 21.48, "step": 20988, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.6818373777734463, "grad_norm": 0.4303312301635742, "learning_rate": 2.5215845964758383e-06, "loss": 0.015675285831093788, "memory(GiB)": 21.48, "step": 20989, "token_acc": 1.0, "train_speed(iter/s)": 0.958546 }, { "epoch": 0.6818698632362018, "grad_norm": 0.3525823652744293, "learning_rate": 2.5211180915892914e-06, "loss": 0.014991482719779015, "memory(GiB)": 21.48, "step": 20990, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958554 }, { "epoch": 0.6819023486989572, "grad_norm": 0.28158730268478394, "learning_rate": 2.5206516153123406e-06, "loss": 0.0110698863863945, "memory(GiB)": 21.48, "step": 20991, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958561 }, { "epoch": 0.6819348341617126, "grad_norm": 0.43886587023735046, "learning_rate": 2.5201851676503706e-06, "loss": 0.011651836335659027, "memory(GiB)": 21.48, "step": 20992, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.95857 }, { "epoch": 0.6819673196244681, "grad_norm": 0.3828698992729187, "learning_rate": 2.519718748608766e-06, "loss": 0.010526837781071663, "memory(GiB)": 21.48, "step": 20993, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95858 }, { "epoch": 0.6819998050872235, "grad_norm": 0.35954153537750244, "learning_rate": 2.5192523581929066e-06, "loss": 0.016208693385124207, "memory(GiB)": 21.48, "step": 20994, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958589 }, { "epoch": 0.6820322905499789, "grad_norm": 0.3854637145996094, "learning_rate": 2.5187859964081775e-06, "loss": 0.019830211997032166, "memory(GiB)": 21.48, "step": 20995, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958599 }, { "epoch": 0.6820647760127343, "grad_norm": 0.36610883474349976, "learning_rate": 2.5183196632599594e-06, "loss": 0.01652139611542225, "memory(GiB)": 21.48, "step": 20996, "token_acc": 1.0, "train_speed(iter/s)": 0.958609 }, { "epoch": 0.6820972614754898, "grad_norm": 0.4388728737831116, "learning_rate": 2.517853358753638e-06, "loss": 0.019034814089536667, "memory(GiB)": 21.48, "step": 20997, "token_acc": 1.0, "train_speed(iter/s)": 0.958618 }, { "epoch": 0.6821297469382451, "grad_norm": 0.285383939743042, "learning_rate": 2.5173870828945902e-06, "loss": 0.015892907977104187, "memory(GiB)": 21.48, "step": 20998, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958627 }, { "epoch": 0.6821622324010006, "grad_norm": 1.0436900854110718, "learning_rate": 2.5169208356882e-06, "loss": 0.01792597770690918, "memory(GiB)": 21.48, "step": 20999, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.958637 }, { "epoch": 0.682194717863756, "grad_norm": 0.5478867292404175, "learning_rate": 2.5164546171398485e-06, "loss": 0.022665316238999367, "memory(GiB)": 21.48, "step": 21000, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.958647 }, { "epoch": 0.682194717863756, "eval_loss": 0.01543568167835474, "eval_runtime": 79.2927, "eval_samples_per_second": 125.484, "eval_steps_per_second": 3.922, "eval_token_acc": 0.9937963024933201, "step": 21000 }, { "epoch": 0.6822272033265114, "grad_norm": 0.5069572329521179, "learning_rate": 2.5159884272549186e-06, "loss": 0.017146483063697815, "memory(GiB)": 21.48, "step": 21001, "token_acc": 0.9933825644960199, "train_speed(iter/s)": 0.9547 }, { "epoch": 0.6822596887892668, "grad_norm": 0.3739672303199768, "learning_rate": 2.5155222660387856e-06, "loss": 0.012521582655608654, "memory(GiB)": 21.48, "step": 21002, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954706 }, { "epoch": 0.6822921742520223, "grad_norm": 0.3029802739620209, "learning_rate": 2.5150561334968347e-06, "loss": 0.01672704517841339, "memory(GiB)": 21.48, "step": 21003, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954713 }, { "epoch": 0.6823246597147776, "grad_norm": 0.3982457220554352, "learning_rate": 2.5145900296344406e-06, "loss": 0.01457710936665535, "memory(GiB)": 21.48, "step": 21004, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954719 }, { "epoch": 0.6823571451775331, "grad_norm": 0.4051735997200012, "learning_rate": 2.514123954456986e-06, "loss": 0.021535787731409073, "memory(GiB)": 21.48, "step": 21005, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.954726 }, { "epoch": 0.6823896306402885, "grad_norm": 0.2301645576953888, "learning_rate": 2.513657907969853e-06, "loss": 0.011145642027258873, "memory(GiB)": 21.48, "step": 21006, "token_acc": 1.0, "train_speed(iter/s)": 0.954733 }, { "epoch": 0.6824221161030439, "grad_norm": 0.1975879818201065, "learning_rate": 2.5131918901784143e-06, "loss": 0.010303452610969543, "memory(GiB)": 21.48, "step": 21007, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.954739 }, { "epoch": 0.6824546015657993, "grad_norm": 0.4473963677883148, "learning_rate": 2.5127259010880532e-06, "loss": 0.015680426731705666, "memory(GiB)": 21.48, "step": 21008, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.954745 }, { "epoch": 0.6824870870285548, "grad_norm": 0.27964627742767334, "learning_rate": 2.5122599407041415e-06, "loss": 0.01114644855260849, "memory(GiB)": 21.48, "step": 21009, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.954751 }, { "epoch": 0.6825195724913101, "grad_norm": 0.7947244048118591, "learning_rate": 2.5117940090320657e-06, "loss": 0.02000725455582142, "memory(GiB)": 21.48, "step": 21010, "token_acc": 1.0, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.6825520579540656, "grad_norm": 0.6616250276565552, "learning_rate": 2.5113281060771966e-06, "loss": 0.015440264716744423, "memory(GiB)": 21.48, "step": 21011, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.954763 }, { "epoch": 0.682584543416821, "grad_norm": 0.4549501836299896, "learning_rate": 2.510862231844916e-06, "loss": 0.02132718451321125, "memory(GiB)": 21.48, "step": 21012, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.954769 }, { "epoch": 0.6826170288795764, "grad_norm": 0.2642369270324707, "learning_rate": 2.510396386340596e-06, "loss": 0.009860590100288391, "memory(GiB)": 21.48, "step": 21013, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954775 }, { "epoch": 0.6826495143423318, "grad_norm": 0.31689223647117615, "learning_rate": 2.5099305695696154e-06, "loss": 0.012723805382847786, "memory(GiB)": 21.48, "step": 21014, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954781 }, { "epoch": 0.6826819998050873, "grad_norm": 0.33128681778907776, "learning_rate": 2.50946478153735e-06, "loss": 0.019117314368486404, "memory(GiB)": 21.48, "step": 21015, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.954788 }, { "epoch": 0.6827144852678426, "grad_norm": 0.23892828822135925, "learning_rate": 2.5089990222491785e-06, "loss": 0.012252365238964558, "memory(GiB)": 21.48, "step": 21016, "token_acc": 1.0, "train_speed(iter/s)": 0.954794 }, { "epoch": 0.6827469707305981, "grad_norm": 0.42271891236305237, "learning_rate": 2.508533291710471e-06, "loss": 0.017915846779942513, "memory(GiB)": 21.48, "step": 21017, "token_acc": 1.0, "train_speed(iter/s)": 0.954799 }, { "epoch": 0.6827794561933535, "grad_norm": 0.42292460799217224, "learning_rate": 2.508067589926606e-06, "loss": 0.019913967698812485, "memory(GiB)": 21.48, "step": 21018, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.6828119416561089, "grad_norm": 0.36729946732521057, "learning_rate": 2.5076019169029576e-06, "loss": 0.01424820814281702, "memory(GiB)": 21.48, "step": 21019, "token_acc": 1.0, "train_speed(iter/s)": 0.954815 }, { "epoch": 0.6828444271188643, "grad_norm": 0.28039708733558655, "learning_rate": 2.507136272644901e-06, "loss": 0.011278449557721615, "memory(GiB)": 21.48, "step": 21020, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954822 }, { "epoch": 0.6828769125816198, "grad_norm": 0.3938691020011902, "learning_rate": 2.5066706571578114e-06, "loss": 0.01641671359539032, "memory(GiB)": 21.48, "step": 21021, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954829 }, { "epoch": 0.6829093980443751, "grad_norm": 0.3711869716644287, "learning_rate": 2.5062050704470587e-06, "loss": 0.016358887776732445, "memory(GiB)": 21.48, "step": 21022, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.954837 }, { "epoch": 0.6829418835071306, "grad_norm": 0.3737746775150299, "learning_rate": 2.505739512518019e-06, "loss": 0.018795067444443703, "memory(GiB)": 21.48, "step": 21023, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954846 }, { "epoch": 0.682974368969886, "grad_norm": 0.4505358338356018, "learning_rate": 2.5052739833760654e-06, "loss": 0.012548351660370827, "memory(GiB)": 21.48, "step": 21024, "token_acc": 1.0, "train_speed(iter/s)": 0.954855 }, { "epoch": 0.6830068544326414, "grad_norm": 0.38820692896842957, "learning_rate": 2.504808483026572e-06, "loss": 0.020978964865207672, "memory(GiB)": 21.48, "step": 21025, "token_acc": 1.0, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.6830393398953968, "grad_norm": 0.7171637415885925, "learning_rate": 2.504343011474908e-06, "loss": 0.010088524781167507, "memory(GiB)": 21.48, "step": 21026, "token_acc": 1.0, "train_speed(iter/s)": 0.954874 }, { "epoch": 0.6830718253581523, "grad_norm": 0.48057541251182556, "learning_rate": 2.503877568726447e-06, "loss": 0.022914625704288483, "memory(GiB)": 21.48, "step": 21027, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.6831043108209076, "grad_norm": 0.22901871800422668, "learning_rate": 2.503412154786562e-06, "loss": 0.014854948967695236, "memory(GiB)": 21.48, "step": 21028, "token_acc": 1.0, "train_speed(iter/s)": 0.954893 }, { "epoch": 0.6831367962836631, "grad_norm": 0.34749147295951843, "learning_rate": 2.502946769660626e-06, "loss": 0.013829944655299187, "memory(GiB)": 21.48, "step": 21029, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954902 }, { "epoch": 0.6831692817464184, "grad_norm": 0.39229342341423035, "learning_rate": 2.5024814133540043e-06, "loss": 0.017605602741241455, "memory(GiB)": 21.48, "step": 21030, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.954912 }, { "epoch": 0.6832017672091739, "grad_norm": 0.38989296555519104, "learning_rate": 2.5020160858720744e-06, "loss": 0.018239062279462814, "memory(GiB)": 21.48, "step": 21031, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.954921 }, { "epoch": 0.6832342526719293, "grad_norm": 0.4395124912261963, "learning_rate": 2.5015507872201996e-06, "loss": 0.018432224169373512, "memory(GiB)": 21.48, "step": 21032, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.6832667381346847, "grad_norm": 0.4592721462249756, "learning_rate": 2.5010855174037573e-06, "loss": 0.02025410532951355, "memory(GiB)": 21.48, "step": 21033, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95494 }, { "epoch": 0.6832992235974401, "grad_norm": 0.4119453728199005, "learning_rate": 2.500620276428113e-06, "loss": 0.019764121621847153, "memory(GiB)": 21.48, "step": 21034, "token_acc": 1.0, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.6833317090601956, "grad_norm": 0.2361549288034439, "learning_rate": 2.5001550642986395e-06, "loss": 0.01379485335201025, "memory(GiB)": 21.48, "step": 21035, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.954959 }, { "epoch": 0.6833641945229509, "grad_norm": 0.2587907612323761, "learning_rate": 2.4996898810207015e-06, "loss": 0.012755343690514565, "memory(GiB)": 21.48, "step": 21036, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.954969 }, { "epoch": 0.6833966799857064, "grad_norm": 0.3371930420398712, "learning_rate": 2.4992247265996683e-06, "loss": 0.013901288621127605, "memory(GiB)": 21.48, "step": 21037, "token_acc": 1.0, "train_speed(iter/s)": 0.954978 }, { "epoch": 0.6834291654484618, "grad_norm": 0.3682864010334015, "learning_rate": 2.4987596010409142e-06, "loss": 0.016269676387310028, "memory(GiB)": 21.48, "step": 21038, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.954988 }, { "epoch": 0.6834616509112172, "grad_norm": 0.33400270342826843, "learning_rate": 2.4982945043498016e-06, "loss": 0.008361700922250748, "memory(GiB)": 21.48, "step": 21039, "token_acc": 1.0, "train_speed(iter/s)": 0.954997 }, { "epoch": 0.6834941363739726, "grad_norm": 0.31459471583366394, "learning_rate": 2.4978294365317017e-06, "loss": 0.013235585764050484, "memory(GiB)": 21.48, "step": 21040, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.6835266218367281, "grad_norm": 0.2655138373374939, "learning_rate": 2.4973643975919785e-06, "loss": 0.012121539562940598, "memory(GiB)": 21.48, "step": 21041, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.6835591072994834, "grad_norm": 0.7308138012886047, "learning_rate": 2.496899387536002e-06, "loss": 0.020595742389559746, "memory(GiB)": 21.48, "step": 21042, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955026 }, { "epoch": 0.6835915927622389, "grad_norm": 0.36506152153015137, "learning_rate": 2.4964344063691374e-06, "loss": 0.017705649137496948, "memory(GiB)": 21.48, "step": 21043, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955035 }, { "epoch": 0.6836240782249943, "grad_norm": 0.3233792781829834, "learning_rate": 2.4959694540967543e-06, "loss": 0.010973777621984482, "memory(GiB)": 21.48, "step": 21044, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955045 }, { "epoch": 0.6836565636877497, "grad_norm": 0.4233807921409607, "learning_rate": 2.4955045307242142e-06, "loss": 0.0191788412630558, "memory(GiB)": 21.48, "step": 21045, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955054 }, { "epoch": 0.6836890491505051, "grad_norm": 0.4699368476867676, "learning_rate": 2.4950396362568856e-06, "loss": 0.01689247414469719, "memory(GiB)": 21.48, "step": 21046, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955064 }, { "epoch": 0.6837215346132606, "grad_norm": 0.44551414251327515, "learning_rate": 2.4945747707001337e-06, "loss": 0.01654132641851902, "memory(GiB)": 21.48, "step": 21047, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955074 }, { "epoch": 0.6837540200760159, "grad_norm": 0.4084862470626831, "learning_rate": 2.494109934059326e-06, "loss": 0.017794650048017502, "memory(GiB)": 21.48, "step": 21048, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955083 }, { "epoch": 0.6837865055387714, "grad_norm": 0.324984610080719, "learning_rate": 2.4936451263398226e-06, "loss": 0.014466569758951664, "memory(GiB)": 21.48, "step": 21049, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955092 }, { "epoch": 0.6838189910015268, "grad_norm": 0.3967464864253998, "learning_rate": 2.4931803475469907e-06, "loss": 0.014424894005060196, "memory(GiB)": 21.48, "step": 21050, "token_acc": 0.99, "train_speed(iter/s)": 0.955102 }, { "epoch": 0.6838514764642822, "grad_norm": 0.2933371961116791, "learning_rate": 2.4927155976861937e-06, "loss": 0.01292893011122942, "memory(GiB)": 21.48, "step": 21051, "token_acc": 1.0, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.6838839619270376, "grad_norm": 0.21462921798229218, "learning_rate": 2.4922508767627963e-06, "loss": 0.009754887782037258, "memory(GiB)": 22.01, "step": 21052, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955121 }, { "epoch": 0.6839164473897931, "grad_norm": 0.2964654266834259, "learning_rate": 2.491786184782163e-06, "loss": 0.012089803814888, "memory(GiB)": 22.01, "step": 21053, "token_acc": 1.0, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.6839489328525484, "grad_norm": 0.36217355728149414, "learning_rate": 2.491321521749654e-06, "loss": 0.008757419884204865, "memory(GiB)": 22.01, "step": 21054, "token_acc": 1.0, "train_speed(iter/s)": 0.95514 }, { "epoch": 0.6839814183153039, "grad_norm": 0.21237705647945404, "learning_rate": 2.4908568876706334e-06, "loss": 0.009669337421655655, "memory(GiB)": 22.01, "step": 21055, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.6840139037780594, "grad_norm": 0.3754284381866455, "learning_rate": 2.4903922825504643e-06, "loss": 0.014244120568037033, "memory(GiB)": 22.01, "step": 21056, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.6840463892408147, "grad_norm": 0.3750302195549011, "learning_rate": 2.4899277063945103e-06, "loss": 0.013268101029098034, "memory(GiB)": 22.01, "step": 21057, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.6840788747035702, "grad_norm": 0.32385894656181335, "learning_rate": 2.48946315920813e-06, "loss": 0.013818211853504181, "memory(GiB)": 22.01, "step": 21058, "token_acc": 1.0, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.6841113601663256, "grad_norm": 0.23646968603134155, "learning_rate": 2.4889986409966875e-06, "loss": 0.010931042954325676, "memory(GiB)": 22.01, "step": 21059, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.684143845629081, "grad_norm": 0.2389490306377411, "learning_rate": 2.4885341517655398e-06, "loss": 0.01284261979162693, "memory(GiB)": 22.01, "step": 21060, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955135 }, { "epoch": 0.6841763310918364, "grad_norm": 0.4296393096446991, "learning_rate": 2.4880696915200535e-06, "loss": 0.01765831932425499, "memory(GiB)": 22.01, "step": 21061, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955142 }, { "epoch": 0.6842088165545919, "grad_norm": 0.48999831080436707, "learning_rate": 2.487605260265585e-06, "loss": 0.01735633797943592, "memory(GiB)": 22.01, "step": 21062, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955148 }, { "epoch": 0.6842413020173472, "grad_norm": 0.4182080626487732, "learning_rate": 2.4871408580074984e-06, "loss": 0.01577712967991829, "memory(GiB)": 22.01, "step": 21063, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955155 }, { "epoch": 0.6842737874801027, "grad_norm": 0.41136303544044495, "learning_rate": 2.486676484751148e-06, "loss": 0.015058174729347229, "memory(GiB)": 22.01, "step": 21064, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.6843062729428581, "grad_norm": 0.23475538194179535, "learning_rate": 2.4862121405018967e-06, "loss": 0.008931965567171574, "memory(GiB)": 22.01, "step": 21065, "token_acc": 1.0, "train_speed(iter/s)": 0.955162 }, { "epoch": 0.6843387584056135, "grad_norm": 0.3110673129558563, "learning_rate": 2.485747825265103e-06, "loss": 0.014027204364538193, "memory(GiB)": 22.01, "step": 21066, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.6843712438683689, "grad_norm": 0.35071924328804016, "learning_rate": 2.485283539046126e-06, "loss": 0.013236215338110924, "memory(GiB)": 22.01, "step": 21067, "token_acc": 1.0, "train_speed(iter/s)": 0.955175 }, { "epoch": 0.6844037293311244, "grad_norm": 0.2659134268760681, "learning_rate": 2.4848192818503257e-06, "loss": 0.015270717442035675, "memory(GiB)": 22.01, "step": 21068, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.6844362147938797, "grad_norm": 0.4258865714073181, "learning_rate": 2.484355053683057e-06, "loss": 0.016051659360527992, "memory(GiB)": 22.01, "step": 21069, "token_acc": 1.0, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.6844687002566352, "grad_norm": 0.2584749162197113, "learning_rate": 2.4838908545496788e-06, "loss": 0.011012382805347443, "memory(GiB)": 22.01, "step": 21070, "token_acc": 1.0, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.6845011857193906, "grad_norm": 0.43095287680625916, "learning_rate": 2.483426684455549e-06, "loss": 0.020687241107225418, "memory(GiB)": 22.01, "step": 21071, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955196 }, { "epoch": 0.684533671182146, "grad_norm": 0.35889342427253723, "learning_rate": 2.4829625434060273e-06, "loss": 0.01704699546098709, "memory(GiB)": 22.01, "step": 21072, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955181 }, { "epoch": 0.6845661566449014, "grad_norm": 0.5333929061889648, "learning_rate": 2.482498431406466e-06, "loss": 0.022426793351769447, "memory(GiB)": 22.01, "step": 21073, "token_acc": 0.98, "train_speed(iter/s)": 0.955181 }, { "epoch": 0.6845986421076569, "grad_norm": 0.41341856122016907, "learning_rate": 2.4820343484622235e-06, "loss": 0.015546808019280434, "memory(GiB)": 22.01, "step": 21074, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.6846311275704122, "grad_norm": 0.5646557211875916, "learning_rate": 2.4815702945786558e-06, "loss": 0.013858298771083355, "memory(GiB)": 22.01, "step": 21075, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.6846636130331677, "grad_norm": 0.25427982211112976, "learning_rate": 2.481106269761121e-06, "loss": 0.012210379354655743, "memory(GiB)": 22.01, "step": 21076, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.6846960984959231, "grad_norm": 0.29738709330558777, "learning_rate": 2.4806422740149703e-06, "loss": 0.010927092283964157, "memory(GiB)": 22.01, "step": 21077, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.6847285839586785, "grad_norm": 0.4360242486000061, "learning_rate": 2.4801783073455617e-06, "loss": 0.016543962061405182, "memory(GiB)": 22.01, "step": 21078, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955212 }, { "epoch": 0.6847610694214339, "grad_norm": 0.3811483085155487, "learning_rate": 2.4797143697582487e-06, "loss": 0.015878960490226746, "memory(GiB)": 22.01, "step": 21079, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.6847935548841894, "grad_norm": 0.3506234288215637, "learning_rate": 2.479250461258389e-06, "loss": 0.011427086777985096, "memory(GiB)": 22.01, "step": 21080, "token_acc": 0.996, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.6848260403469447, "grad_norm": 0.38920852541923523, "learning_rate": 2.4787865818513314e-06, "loss": 0.01579946279525757, "memory(GiB)": 22.01, "step": 21081, "token_acc": 1.0, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.6848585258097002, "grad_norm": 0.4114377498626709, "learning_rate": 2.4783227315424323e-06, "loss": 0.015651363879442215, "memory(GiB)": 22.01, "step": 21082, "token_acc": 1.0, "train_speed(iter/s)": 0.955208 }, { "epoch": 0.6848910112724556, "grad_norm": 0.26374542713165283, "learning_rate": 2.477858910337046e-06, "loss": 0.012201138772070408, "memory(GiB)": 22.01, "step": 21083, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.684923496735211, "grad_norm": 0.3708498477935791, "learning_rate": 2.4773951182405244e-06, "loss": 0.016512524336576462, "memory(GiB)": 22.01, "step": 21084, "token_acc": 1.0, "train_speed(iter/s)": 0.955192 }, { "epoch": 0.6849559821979664, "grad_norm": 0.29367950558662415, "learning_rate": 2.4769313552582225e-06, "loss": 0.013010839000344276, "memory(GiB)": 22.01, "step": 21085, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955194 }, { "epoch": 0.6849884676607219, "grad_norm": 0.3825666904449463, "learning_rate": 2.476467621395489e-06, "loss": 0.015493560582399368, "memory(GiB)": 22.01, "step": 21086, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.6850209531234772, "grad_norm": 0.32964837551116943, "learning_rate": 2.4760039166576803e-06, "loss": 0.015013935044407845, "memory(GiB)": 22.01, "step": 21087, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955201 }, { "epoch": 0.6850534385862327, "grad_norm": 0.3298019766807556, "learning_rate": 2.475540241050142e-06, "loss": 0.016774475574493408, "memory(GiB)": 22.01, "step": 21088, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.685085924048988, "grad_norm": 0.4805060625076294, "learning_rate": 2.475076594578233e-06, "loss": 0.02404666692018509, "memory(GiB)": 22.01, "step": 21089, "token_acc": 0.992, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.6851184095117435, "grad_norm": 0.2689943015575409, "learning_rate": 2.474612977247299e-06, "loss": 0.010041528381407261, "memory(GiB)": 22.01, "step": 21090, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955229 }, { "epoch": 0.6851508949744989, "grad_norm": 0.3695340156555176, "learning_rate": 2.474149389062694e-06, "loss": 0.013219574466347694, "memory(GiB)": 22.01, "step": 21091, "token_acc": 1.0, "train_speed(iter/s)": 0.955239 }, { "epoch": 0.6851833804372544, "grad_norm": 0.3246932625770569, "learning_rate": 2.473685830029765e-06, "loss": 0.010143435560166836, "memory(GiB)": 22.01, "step": 21092, "token_acc": 1.0, "train_speed(iter/s)": 0.95524 }, { "epoch": 0.6852158659000097, "grad_norm": 0.2714131772518158, "learning_rate": 2.4732223001538643e-06, "loss": 0.009687881916761398, "memory(GiB)": 22.01, "step": 21093, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.6852483513627652, "grad_norm": 0.43371036648750305, "learning_rate": 2.472758799440341e-06, "loss": 0.013423875905573368, "memory(GiB)": 22.01, "step": 21094, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955253 }, { "epoch": 0.6852808368255205, "grad_norm": 0.2288479208946228, "learning_rate": 2.472295327894548e-06, "loss": 0.009383253753185272, "memory(GiB)": 22.01, "step": 21095, "token_acc": 1.0, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.685313322288276, "grad_norm": 0.2561967968940735, "learning_rate": 2.4718318855218283e-06, "loss": 0.016137752681970596, "memory(GiB)": 22.01, "step": 21096, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955266 }, { "epoch": 0.6853458077510314, "grad_norm": 0.3391749858856201, "learning_rate": 2.471368472327534e-06, "loss": 0.014196804724633694, "memory(GiB)": 22.01, "step": 21097, "token_acc": 1.0, "train_speed(iter/s)": 0.955276 }, { "epoch": 0.6853782932137868, "grad_norm": 0.4211673438549042, "learning_rate": 2.470905088317013e-06, "loss": 0.011422192677855492, "memory(GiB)": 22.01, "step": 21098, "token_acc": 1.0, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.6854107786765422, "grad_norm": 0.2560655176639557, "learning_rate": 2.470441733495613e-06, "loss": 0.009444925002753735, "memory(GiB)": 22.01, "step": 21099, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.6854432641392977, "grad_norm": 0.3472895622253418, "learning_rate": 2.469978407868685e-06, "loss": 0.020383428782224655, "memory(GiB)": 22.01, "step": 21100, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.685475749602053, "grad_norm": 0.4345930218696594, "learning_rate": 2.4695151114415715e-06, "loss": 0.019200757145881653, "memory(GiB)": 22.01, "step": 21101, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955303 }, { "epoch": 0.6855082350648085, "grad_norm": 0.4737541079521179, "learning_rate": 2.4690518442196217e-06, "loss": 0.02008834481239319, "memory(GiB)": 22.01, "step": 21102, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.6855407205275639, "grad_norm": 0.3467423915863037, "learning_rate": 2.468588606208182e-06, "loss": 0.012357628904283047, "memory(GiB)": 22.01, "step": 21103, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.6855732059903193, "grad_norm": 0.43076616525650024, "learning_rate": 2.4681253974126014e-06, "loss": 0.015438010916113853, "memory(GiB)": 22.01, "step": 21104, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.6856056914530747, "grad_norm": 0.3187377452850342, "learning_rate": 2.4676622178382214e-06, "loss": 0.01214916817843914, "memory(GiB)": 22.01, "step": 21105, "token_acc": 1.0, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.6856381769158302, "grad_norm": 0.27188876271247864, "learning_rate": 2.46719906749039e-06, "loss": 0.007256416603922844, "memory(GiB)": 22.01, "step": 21106, "token_acc": 1.0, "train_speed(iter/s)": 0.955352 }, { "epoch": 0.6856706623785855, "grad_norm": 0.29160189628601074, "learning_rate": 2.4667359463744527e-06, "loss": 0.007680639624595642, "memory(GiB)": 22.01, "step": 21107, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955361 }, { "epoch": 0.685703147841341, "grad_norm": 0.32036176323890686, "learning_rate": 2.466272854495756e-06, "loss": 0.010832901112735271, "memory(GiB)": 22.01, "step": 21108, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.6857356333040964, "grad_norm": 0.44182199239730835, "learning_rate": 2.4658097918596413e-06, "loss": 0.015366323292255402, "memory(GiB)": 22.01, "step": 21109, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.6857681187668518, "grad_norm": 0.427457332611084, "learning_rate": 2.4653467584714563e-06, "loss": 0.014628437347710133, "memory(GiB)": 22.01, "step": 21110, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955386 }, { "epoch": 0.6858006042296072, "grad_norm": 0.3491193652153015, "learning_rate": 2.4648837543365394e-06, "loss": 0.01270301267504692, "memory(GiB)": 22.01, "step": 21111, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.6858330896923627, "grad_norm": 0.4568595290184021, "learning_rate": 2.464420779460242e-06, "loss": 0.01674792729318142, "memory(GiB)": 22.01, "step": 21112, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955402 }, { "epoch": 0.685865575155118, "grad_norm": 0.2978363633155823, "learning_rate": 2.463957833847902e-06, "loss": 0.015831759199500084, "memory(GiB)": 22.01, "step": 21113, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.955409 }, { "epoch": 0.6858980606178735, "grad_norm": 0.6033238768577576, "learning_rate": 2.4634949175048637e-06, "loss": 0.018605168908834457, "memory(GiB)": 22.01, "step": 21114, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.955416 }, { "epoch": 0.6859305460806289, "grad_norm": 0.3198680877685547, "learning_rate": 2.463032030436471e-06, "loss": 0.009947111830115318, "memory(GiB)": 22.01, "step": 21115, "token_acc": 1.0, "train_speed(iter/s)": 0.955424 }, { "epoch": 0.6859630315433843, "grad_norm": 0.449571430683136, "learning_rate": 2.4625691726480644e-06, "loss": 0.019601870328187943, "memory(GiB)": 22.01, "step": 21116, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955431 }, { "epoch": 0.6859955170061397, "grad_norm": 0.7930470108985901, "learning_rate": 2.4621063441449894e-06, "loss": 0.02035735361278057, "memory(GiB)": 22.01, "step": 21117, "token_acc": 0.996, "train_speed(iter/s)": 0.955438 }, { "epoch": 0.6860280024688952, "grad_norm": 0.6950243711471558, "learning_rate": 2.461643544932583e-06, "loss": 0.02157481759786606, "memory(GiB)": 22.01, "step": 21118, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955445 }, { "epoch": 0.6860604879316505, "grad_norm": 0.2636444866657257, "learning_rate": 2.4611807750161903e-06, "loss": 0.009112132713198662, "memory(GiB)": 22.01, "step": 21119, "token_acc": 1.0, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.686092973394406, "grad_norm": 0.36030149459838867, "learning_rate": 2.460718034401147e-06, "loss": 0.011084500700235367, "memory(GiB)": 22.01, "step": 21120, "token_acc": 1.0, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.6861254588571615, "grad_norm": 0.4311074912548065, "learning_rate": 2.460255323092801e-06, "loss": 0.017322247847914696, "memory(GiB)": 22.01, "step": 21121, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955466 }, { "epoch": 0.6861579443199168, "grad_norm": 0.4385666847229004, "learning_rate": 2.4597926410964877e-06, "loss": 0.020021183416247368, "memory(GiB)": 22.01, "step": 21122, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.6861904297826723, "grad_norm": 0.4756886959075928, "learning_rate": 2.45932998841755e-06, "loss": 0.018626127392053604, "memory(GiB)": 22.01, "step": 21123, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.6862229152454277, "grad_norm": 0.2522909939289093, "learning_rate": 2.458867365061323e-06, "loss": 0.016453087329864502, "memory(GiB)": 22.01, "step": 21124, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.6862554007081831, "grad_norm": 0.43585893511772156, "learning_rate": 2.45840477103315e-06, "loss": 0.014045590534806252, "memory(GiB)": 22.01, "step": 21125, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.6862878861709385, "grad_norm": 0.28018346428871155, "learning_rate": 2.4579422063383683e-06, "loss": 0.014414824545383453, "memory(GiB)": 22.01, "step": 21126, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.686320371633694, "grad_norm": 0.34913748502731323, "learning_rate": 2.457479670982319e-06, "loss": 0.013906202279031277, "memory(GiB)": 22.01, "step": 21127, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955502 }, { "epoch": 0.6863528570964493, "grad_norm": 0.37621748447418213, "learning_rate": 2.4570171649703367e-06, "loss": 0.016027282923460007, "memory(GiB)": 22.01, "step": 21128, "token_acc": 0.995, "train_speed(iter/s)": 0.955509 }, { "epoch": 0.6863853425592048, "grad_norm": 0.27382540702819824, "learning_rate": 2.456554688307761e-06, "loss": 0.013555067591369152, "memory(GiB)": 22.01, "step": 21129, "token_acc": 1.0, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.6864178280219602, "grad_norm": 0.44542840123176575, "learning_rate": 2.4560922409999286e-06, "loss": 0.01761302724480629, "memory(GiB)": 22.01, "step": 21130, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.6864503134847156, "grad_norm": 0.387601375579834, "learning_rate": 2.4556298230521786e-06, "loss": 0.01122426986694336, "memory(GiB)": 22.01, "step": 21131, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955532 }, { "epoch": 0.686482798947471, "grad_norm": 0.3530815541744232, "learning_rate": 2.455167434469849e-06, "loss": 0.01472449116408825, "memory(GiB)": 22.01, "step": 21132, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.6865152844102265, "grad_norm": 0.28949838876724243, "learning_rate": 2.4547050752582716e-06, "loss": 0.008771855384111404, "memory(GiB)": 22.01, "step": 21133, "token_acc": 0.996, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.6865477698729818, "grad_norm": 0.31288379430770874, "learning_rate": 2.4542427454227864e-06, "loss": 0.014619776047766209, "memory(GiB)": 22.01, "step": 21134, "token_acc": 1.0, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.6865802553357373, "grad_norm": 0.23534977436065674, "learning_rate": 2.4537804449687274e-06, "loss": 0.010964554734528065, "memory(GiB)": 22.01, "step": 21135, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955559 }, { "epoch": 0.6866127407984927, "grad_norm": 0.33841314911842346, "learning_rate": 2.453318173901434e-06, "loss": 0.011529622599482536, "memory(GiB)": 22.01, "step": 21136, "token_acc": 1.0, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.6866452262612481, "grad_norm": 0.2655274271965027, "learning_rate": 2.452855932226236e-06, "loss": 0.009710153564810753, "memory(GiB)": 22.01, "step": 21137, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.6866777117240035, "grad_norm": 0.3998461663722992, "learning_rate": 2.4523937199484704e-06, "loss": 0.012078149244189262, "memory(GiB)": 22.01, "step": 21138, "token_acc": 1.0, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.686710197186759, "grad_norm": 0.5379254817962646, "learning_rate": 2.4519315370734726e-06, "loss": 0.014794252812862396, "memory(GiB)": 22.01, "step": 21139, "token_acc": 1.0, "train_speed(iter/s)": 0.955589 }, { "epoch": 0.6867426826495143, "grad_norm": 0.4364146292209625, "learning_rate": 2.4514693836065782e-06, "loss": 0.018349329009652138, "memory(GiB)": 22.01, "step": 21140, "token_acc": 1.0, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.6867751681122698, "grad_norm": 0.44953474402427673, "learning_rate": 2.4510072595531177e-06, "loss": 0.018325481563806534, "memory(GiB)": 22.01, "step": 21141, "token_acc": 1.0, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.6868076535750252, "grad_norm": 0.42509394884109497, "learning_rate": 2.4505451649184277e-06, "loss": 0.019667841494083405, "memory(GiB)": 22.01, "step": 21142, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.6868401390377806, "grad_norm": 0.42360490560531616, "learning_rate": 2.4500830997078356e-06, "loss": 0.017776595428586006, "memory(GiB)": 22.01, "step": 21143, "token_acc": 1.0, "train_speed(iter/s)": 0.955595 }, { "epoch": 0.686872624500536, "grad_norm": 0.4094998240470886, "learning_rate": 2.4496210639266804e-06, "loss": 0.0133729362860322, "memory(GiB)": 22.01, "step": 21144, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.6869051099632915, "grad_norm": 0.6899515986442566, "learning_rate": 2.4491590575802953e-06, "loss": 0.0204169861972332, "memory(GiB)": 22.01, "step": 21145, "token_acc": 1.0, "train_speed(iter/s)": 0.955611 }, { "epoch": 0.6869375954260468, "grad_norm": 0.3263567388057709, "learning_rate": 2.4486970806740074e-06, "loss": 0.012032419443130493, "memory(GiB)": 22.01, "step": 21146, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.6869700808888023, "grad_norm": 0.33694204688072205, "learning_rate": 2.4482351332131524e-06, "loss": 0.011970121413469315, "memory(GiB)": 22.01, "step": 21147, "token_acc": 0.9875, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.6870025663515577, "grad_norm": 0.5499969720840454, "learning_rate": 2.4477732152030575e-06, "loss": 0.01295987144112587, "memory(GiB)": 22.01, "step": 21148, "token_acc": 1.0, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.6870350518143131, "grad_norm": 0.27854597568511963, "learning_rate": 2.44731132664906e-06, "loss": 0.010355214588344097, "memory(GiB)": 22.01, "step": 21149, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.6870675372770685, "grad_norm": 0.32056161761283875, "learning_rate": 2.446849467556486e-06, "loss": 0.013339830562472343, "memory(GiB)": 22.01, "step": 21150, "token_acc": 1.0, "train_speed(iter/s)": 0.95565 }, { "epoch": 0.687100022739824, "grad_norm": 0.38948458433151245, "learning_rate": 2.446387637930669e-06, "loss": 0.01682024449110031, "memory(GiB)": 22.01, "step": 21151, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.6871325082025793, "grad_norm": 0.502407968044281, "learning_rate": 2.4459258377769352e-06, "loss": 0.01681075431406498, "memory(GiB)": 22.01, "step": 21152, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.6871649936653348, "grad_norm": 0.49358534812927246, "learning_rate": 2.445464067100617e-06, "loss": 0.016562532633543015, "memory(GiB)": 22.01, "step": 21153, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.6871974791280901, "grad_norm": 0.3050214350223541, "learning_rate": 2.4450023259070437e-06, "loss": 0.010696153156459332, "memory(GiB)": 22.01, "step": 21154, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.6872299645908456, "grad_norm": 0.3675459325313568, "learning_rate": 2.4445406142015458e-06, "loss": 0.012886477634310722, "memory(GiB)": 22.01, "step": 21155, "token_acc": 1.0, "train_speed(iter/s)": 0.955697 }, { "epoch": 0.687262450053601, "grad_norm": 0.5177408456802368, "learning_rate": 2.4440789319894488e-06, "loss": 0.01718619465827942, "memory(GiB)": 22.01, "step": 21156, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.6872949355163565, "grad_norm": 0.37735995650291443, "learning_rate": 2.443617279276083e-06, "loss": 0.01755630597472191, "memory(GiB)": 22.01, "step": 21157, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.6873274209791118, "grad_norm": 0.42280182242393494, "learning_rate": 2.443155656066776e-06, "loss": 0.01507372222840786, "memory(GiB)": 22.01, "step": 21158, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955726 }, { "epoch": 0.6873599064418673, "grad_norm": 0.4207873046398163, "learning_rate": 2.4426940623668565e-06, "loss": 0.013904403895139694, "memory(GiB)": 22.01, "step": 21159, "token_acc": 0.996, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.6873923919046226, "grad_norm": 0.414035826921463, "learning_rate": 2.4422324981816526e-06, "loss": 0.01817481964826584, "memory(GiB)": 22.01, "step": 21160, "token_acc": 1.0, "train_speed(iter/s)": 0.955745 }, { "epoch": 0.6874248773673781, "grad_norm": 0.4658140540122986, "learning_rate": 2.4417709635164884e-06, "loss": 0.020923111587762833, "memory(GiB)": 22.01, "step": 21161, "token_acc": 1.0, "train_speed(iter/s)": 0.955755 }, { "epoch": 0.6874573628301335, "grad_norm": 0.28571370244026184, "learning_rate": 2.441309458376692e-06, "loss": 0.013993697240948677, "memory(GiB)": 22.01, "step": 21162, "token_acc": 1.0, "train_speed(iter/s)": 0.955765 }, { "epoch": 0.687489848292889, "grad_norm": 0.3574966490268707, "learning_rate": 2.4408479827675903e-06, "loss": 0.015022783540189266, "memory(GiB)": 22.01, "step": 21163, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.6875223337556443, "grad_norm": 0.35044780373573303, "learning_rate": 2.4403865366945114e-06, "loss": 0.018205925822257996, "memory(GiB)": 22.01, "step": 21164, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.955784 }, { "epoch": 0.6875548192183998, "grad_norm": 0.41284215450286865, "learning_rate": 2.4399251201627762e-06, "loss": 0.01484876498579979, "memory(GiB)": 22.01, "step": 21165, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955794 }, { "epoch": 0.6875873046811551, "grad_norm": 0.4553508460521698, "learning_rate": 2.4394637331777128e-06, "loss": 0.019594812765717506, "memory(GiB)": 22.01, "step": 21166, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.6876197901439106, "grad_norm": 0.3577049970626831, "learning_rate": 2.439002375744646e-06, "loss": 0.012699955143034458, "memory(GiB)": 22.01, "step": 21167, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.687652275606666, "grad_norm": 0.6239630579948425, "learning_rate": 2.438541047868902e-06, "loss": 0.023189963772892952, "memory(GiB)": 22.01, "step": 21168, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.6876847610694214, "grad_norm": 0.2296387255191803, "learning_rate": 2.4380797495558015e-06, "loss": 0.012151923030614853, "memory(GiB)": 22.01, "step": 21169, "token_acc": 0.9803149606299213, "train_speed(iter/s)": 0.955829 }, { "epoch": 0.6877172465321768, "grad_norm": 0.30828315019607544, "learning_rate": 2.4376184808106735e-06, "loss": 0.011044014245271683, "memory(GiB)": 22.01, "step": 21170, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955836 }, { "epoch": 0.6877497319949323, "grad_norm": 0.3426668345928192, "learning_rate": 2.4371572416388343e-06, "loss": 0.019410032778978348, "memory(GiB)": 22.01, "step": 21171, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.6877822174576876, "grad_norm": 0.33842867612838745, "learning_rate": 2.4366960320456156e-06, "loss": 0.007833133451640606, "memory(GiB)": 22.01, "step": 21172, "token_acc": 1.0, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.6878147029204431, "grad_norm": 0.3453107178211212, "learning_rate": 2.436234852036335e-06, "loss": 0.012403277680277824, "memory(GiB)": 22.01, "step": 21173, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.6878471883831985, "grad_norm": 0.46670374274253845, "learning_rate": 2.4357737016163175e-06, "loss": 0.01662442833185196, "memory(GiB)": 22.01, "step": 21174, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955866 }, { "epoch": 0.6878796738459539, "grad_norm": 0.3507128357887268, "learning_rate": 2.4353125807908832e-06, "loss": 0.015163419768214226, "memory(GiB)": 22.01, "step": 21175, "token_acc": 0.996, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.6879121593087093, "grad_norm": 0.3324935734272003, "learning_rate": 2.4348514895653537e-06, "loss": 0.012546591460704803, "memory(GiB)": 22.01, "step": 21176, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.6879446447714648, "grad_norm": 0.28595951199531555, "learning_rate": 2.4343904279450555e-06, "loss": 0.015417749062180519, "memory(GiB)": 22.01, "step": 21177, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.6879771302342201, "grad_norm": 0.49037736654281616, "learning_rate": 2.4339293959353045e-06, "loss": 0.016068754717707634, "memory(GiB)": 22.01, "step": 21178, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.6880096156969756, "grad_norm": 0.3003654479980469, "learning_rate": 2.433468393541426e-06, "loss": 0.016829298809170723, "memory(GiB)": 22.01, "step": 21179, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955902 }, { "epoch": 0.688042101159731, "grad_norm": 0.2351391613483429, "learning_rate": 2.433007420768736e-06, "loss": 0.008271360769867897, "memory(GiB)": 22.01, "step": 21180, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.6880745866224864, "grad_norm": 0.5041069984436035, "learning_rate": 2.4325464776225577e-06, "loss": 0.020542573183774948, "memory(GiB)": 22.01, "step": 21181, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955916 }, { "epoch": 0.6881070720852418, "grad_norm": 0.2929277718067169, "learning_rate": 2.43208556410821e-06, "loss": 0.014337294735014439, "memory(GiB)": 22.01, "step": 21182, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.6881395575479973, "grad_norm": 0.23391976952552795, "learning_rate": 2.4316246802310146e-06, "loss": 0.00964379869401455, "memory(GiB)": 22.01, "step": 21183, "token_acc": 1.0, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.6881720430107527, "grad_norm": 0.29925259947776794, "learning_rate": 2.4311638259962866e-06, "loss": 0.012703245505690575, "memory(GiB)": 22.01, "step": 21184, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.6882045284735081, "grad_norm": 0.28131577372550964, "learning_rate": 2.4307030014093474e-06, "loss": 0.01192416064441204, "memory(GiB)": 22.01, "step": 21185, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955943 }, { "epoch": 0.6882370139362636, "grad_norm": 0.3528546690940857, "learning_rate": 2.4302422064755154e-06, "loss": 0.01220800168812275, "memory(GiB)": 22.01, "step": 21186, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.6882694993990189, "grad_norm": 0.2390679121017456, "learning_rate": 2.429781441200111e-06, "loss": 0.008021579124033451, "memory(GiB)": 22.01, "step": 21187, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.6883019848617744, "grad_norm": 0.26873159408569336, "learning_rate": 2.429320705588447e-06, "loss": 0.01219395361840725, "memory(GiB)": 22.01, "step": 21188, "token_acc": 1.0, "train_speed(iter/s)": 0.955963 }, { "epoch": 0.6883344703245298, "grad_norm": 0.41125741600990295, "learning_rate": 2.4288599996458444e-06, "loss": 0.02018028125166893, "memory(GiB)": 22.01, "step": 21189, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95597 }, { "epoch": 0.6883669557872852, "grad_norm": 0.29982703924179077, "learning_rate": 2.4283993233776192e-06, "loss": 0.016260569915175438, "memory(GiB)": 22.01, "step": 21190, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955978 }, { "epoch": 0.6883994412500406, "grad_norm": 0.31018099188804626, "learning_rate": 2.427938676789089e-06, "loss": 0.013929514214396477, "memory(GiB)": 22.01, "step": 21191, "token_acc": 1.0, "train_speed(iter/s)": 0.955985 }, { "epoch": 0.6884319267127961, "grad_norm": 0.5239633917808533, "learning_rate": 2.4274780598855714e-06, "loss": 0.021295838057994843, "memory(GiB)": 22.01, "step": 21192, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.6884644121755514, "grad_norm": 0.481009840965271, "learning_rate": 2.427017472672379e-06, "loss": 0.01807555928826332, "memory(GiB)": 22.01, "step": 21193, "token_acc": 0.9933554817275747, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.6884968976383069, "grad_norm": 0.3818719685077667, "learning_rate": 2.4265569151548294e-06, "loss": 0.016171161085367203, "memory(GiB)": 22.01, "step": 21194, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956006 }, { "epoch": 0.6885293831010623, "grad_norm": 0.3279416263103485, "learning_rate": 2.4260963873382385e-06, "loss": 0.011941695585846901, "memory(GiB)": 22.01, "step": 21195, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.6885618685638177, "grad_norm": 0.39843255281448364, "learning_rate": 2.4256358892279236e-06, "loss": 0.01986563391983509, "memory(GiB)": 22.01, "step": 21196, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.6885943540265731, "grad_norm": 0.4329036772251129, "learning_rate": 2.4251754208291943e-06, "loss": 0.019057944416999817, "memory(GiB)": 22.01, "step": 21197, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.6886268394893286, "grad_norm": 0.2803891897201538, "learning_rate": 2.4247149821473704e-06, "loss": 0.013056980445981026, "memory(GiB)": 22.01, "step": 21198, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.6886593249520839, "grad_norm": 0.41340869665145874, "learning_rate": 2.4242545731877585e-06, "loss": 0.02065102942287922, "memory(GiB)": 22.01, "step": 21199, "token_acc": 1.0, "train_speed(iter/s)": 0.956043 }, { "epoch": 0.6886918104148394, "grad_norm": 0.25832730531692505, "learning_rate": 2.423794193955682e-06, "loss": 0.009506400674581528, "memory(GiB)": 22.01, "step": 21200, "token_acc": 1.0, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.6887242958775948, "grad_norm": 0.4441209137439728, "learning_rate": 2.423333844456447e-06, "loss": 0.011996462941169739, "memory(GiB)": 22.01, "step": 21201, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956058 }, { "epoch": 0.6887567813403502, "grad_norm": 0.3791305124759674, "learning_rate": 2.4228735246953707e-06, "loss": 0.010735047981142998, "memory(GiB)": 22.01, "step": 21202, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.6887892668031056, "grad_norm": 0.4440285265445709, "learning_rate": 2.4224132346777625e-06, "loss": 0.01608922705054283, "memory(GiB)": 22.01, "step": 21203, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.6888217522658611, "grad_norm": 0.384798139333725, "learning_rate": 2.421952974408936e-06, "loss": 0.016618791967630386, "memory(GiB)": 22.01, "step": 21204, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956079 }, { "epoch": 0.6888542377286164, "grad_norm": 0.35346758365631104, "learning_rate": 2.421492743894204e-06, "loss": 0.014450387097895145, "memory(GiB)": 22.01, "step": 21205, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.6888867231913719, "grad_norm": 0.3110559582710266, "learning_rate": 2.421032543138877e-06, "loss": 0.01659899577498436, "memory(GiB)": 22.01, "step": 21206, "token_acc": 1.0, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.6889192086541273, "grad_norm": 0.4009944200515747, "learning_rate": 2.420572372148269e-06, "loss": 0.019672604277729988, "memory(GiB)": 22.01, "step": 21207, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.6889516941168827, "grad_norm": 0.3053091764450073, "learning_rate": 2.4201122309276883e-06, "loss": 0.01510363444685936, "memory(GiB)": 22.01, "step": 21208, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.6889841795796381, "grad_norm": 0.40683671832084656, "learning_rate": 2.4196521194824454e-06, "loss": 0.010572858154773712, "memory(GiB)": 22.01, "step": 21209, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.6890166650423936, "grad_norm": 0.2707826793193817, "learning_rate": 2.4191920378178513e-06, "loss": 0.014320655725896358, "memory(GiB)": 22.01, "step": 21210, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.6890491505051489, "grad_norm": 0.715276300907135, "learning_rate": 2.418731985939218e-06, "loss": 0.015169861726462841, "memory(GiB)": 22.01, "step": 21211, "token_acc": 0.9965635738831615, "train_speed(iter/s)": 0.95614 }, { "epoch": 0.6890816359679044, "grad_norm": 0.400714635848999, "learning_rate": 2.418271963851852e-06, "loss": 0.020162921398878098, "memory(GiB)": 22.01, "step": 21212, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.6891141214306598, "grad_norm": 0.403933048248291, "learning_rate": 2.417811971561064e-06, "loss": 0.013729043304920197, "memory(GiB)": 22.01, "step": 21213, "token_acc": 1.0, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.6891466068934152, "grad_norm": 0.2765653729438782, "learning_rate": 2.417352009072162e-06, "loss": 0.015608750283718109, "memory(GiB)": 22.01, "step": 21214, "token_acc": 0.9897260273972602, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.6891790923561706, "grad_norm": 0.28367137908935547, "learning_rate": 2.4168920763904574e-06, "loss": 0.012512503191828728, "memory(GiB)": 22.01, "step": 21215, "token_acc": 1.0, "train_speed(iter/s)": 0.956177 }, { "epoch": 0.6892115778189261, "grad_norm": 0.3741009533405304, "learning_rate": 2.4164321735212543e-06, "loss": 0.010910078883171082, "memory(GiB)": 22.01, "step": 21216, "token_acc": 1.0, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.6892440632816814, "grad_norm": 0.38215577602386475, "learning_rate": 2.4159723004698633e-06, "loss": 0.014984127134084702, "memory(GiB)": 22.01, "step": 21217, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.6892765487444369, "grad_norm": 0.4169594645500183, "learning_rate": 2.4155124572415905e-06, "loss": 0.015097713097929955, "memory(GiB)": 22.01, "step": 21218, "token_acc": 1.0, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.6893090342071923, "grad_norm": 0.3373962342739105, "learning_rate": 2.415052643841746e-06, "loss": 0.01512886956334114, "memory(GiB)": 22.01, "step": 21219, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.6893415196699477, "grad_norm": 0.3446841537952423, "learning_rate": 2.414592860275633e-06, "loss": 0.01010657288134098, "memory(GiB)": 22.01, "step": 21220, "token_acc": 1.0, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.6893740051327031, "grad_norm": 0.3560730218887329, "learning_rate": 2.414133106548559e-06, "loss": 0.014505947008728981, "memory(GiB)": 22.01, "step": 21221, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.6894064905954586, "grad_norm": 0.38828879594802856, "learning_rate": 2.413673382665831e-06, "loss": 0.015717310830950737, "memory(GiB)": 22.01, "step": 21222, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.6894389760582139, "grad_norm": 0.6062744855880737, "learning_rate": 2.4132136886327545e-06, "loss": 0.010761849582195282, "memory(GiB)": 22.01, "step": 21223, "token_acc": 1.0, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.6894714615209694, "grad_norm": 0.3695169687271118, "learning_rate": 2.412754024454637e-06, "loss": 0.011076277121901512, "memory(GiB)": 22.01, "step": 21224, "token_acc": 1.0, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.6895039469837247, "grad_norm": 0.4319465160369873, "learning_rate": 2.41229439013678e-06, "loss": 0.018797632306814194, "memory(GiB)": 22.01, "step": 21225, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.6895364324464802, "grad_norm": 0.4099912941455841, "learning_rate": 2.4118347856844914e-06, "loss": 0.020036226138472557, "memory(GiB)": 22.01, "step": 21226, "token_acc": 1.0, "train_speed(iter/s)": 0.95628 }, { "epoch": 0.6895689179092356, "grad_norm": 0.2602533996105194, "learning_rate": 2.4113752111030707e-06, "loss": 0.015129470266401768, "memory(GiB)": 22.01, "step": 21227, "token_acc": 1.0, "train_speed(iter/s)": 0.956288 }, { "epoch": 0.689601403371991, "grad_norm": 0.4806808829307556, "learning_rate": 2.4109156663978293e-06, "loss": 0.018564241006970406, "memory(GiB)": 22.01, "step": 21228, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.956294 }, { "epoch": 0.6896338888347464, "grad_norm": 0.3647449016571045, "learning_rate": 2.4104561515740655e-06, "loss": 0.010067220777273178, "memory(GiB)": 22.01, "step": 21229, "token_acc": 1.0, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.6896663742975019, "grad_norm": 0.27086901664733887, "learning_rate": 2.4099966666370863e-06, "loss": 0.009371964260935783, "memory(GiB)": 22.01, "step": 21230, "token_acc": 0.995, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.6896988597602572, "grad_norm": 0.3137074410915375, "learning_rate": 2.4095372115921908e-06, "loss": 0.01480066403746605, "memory(GiB)": 22.01, "step": 21231, "token_acc": 1.0, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.6897313452230127, "grad_norm": 0.34542521834373474, "learning_rate": 2.4090777864446834e-06, "loss": 0.01347622275352478, "memory(GiB)": 22.01, "step": 21232, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.6897638306857681, "grad_norm": 0.30103006958961487, "learning_rate": 2.408618391199867e-06, "loss": 0.011212898418307304, "memory(GiB)": 22.01, "step": 21233, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956331 }, { "epoch": 0.6897963161485235, "grad_norm": 0.2756247818470001, "learning_rate": 2.4081590258630457e-06, "loss": 0.015603073872625828, "memory(GiB)": 22.01, "step": 21234, "token_acc": 0.99, "train_speed(iter/s)": 0.956338 }, { "epoch": 0.6898288016112789, "grad_norm": 0.2689872086048126, "learning_rate": 2.4076996904395167e-06, "loss": 0.014005305245518684, "memory(GiB)": 22.01, "step": 21235, "token_acc": 1.0, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.6898612870740344, "grad_norm": 0.39075902104377747, "learning_rate": 2.407240384934584e-06, "loss": 0.022120293229818344, "memory(GiB)": 22.01, "step": 21236, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.6898937725367897, "grad_norm": 0.24401773512363434, "learning_rate": 2.406781109353548e-06, "loss": 0.014979172497987747, "memory(GiB)": 22.01, "step": 21237, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.6899262579995452, "grad_norm": 0.9356427788734436, "learning_rate": 2.4063218637017096e-06, "loss": 0.016089707612991333, "memory(GiB)": 22.01, "step": 21238, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.6899587434623006, "grad_norm": 0.41043055057525635, "learning_rate": 2.405862647984371e-06, "loss": 0.018817877396941185, "memory(GiB)": 22.01, "step": 21239, "token_acc": 0.992, "train_speed(iter/s)": 0.956375 }, { "epoch": 0.689991228925056, "grad_norm": 0.4689728319644928, "learning_rate": 2.4054034622068284e-06, "loss": 0.018408838659524918, "memory(GiB)": 22.01, "step": 21240, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.6900237143878114, "grad_norm": 0.3991207480430603, "learning_rate": 2.4049443063743827e-06, "loss": 0.016663340851664543, "memory(GiB)": 22.01, "step": 21241, "token_acc": 0.9875, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.6900561998505669, "grad_norm": 0.3312579393386841, "learning_rate": 2.404485180492334e-06, "loss": 0.021408602595329285, "memory(GiB)": 22.01, "step": 21242, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.6900886853133222, "grad_norm": 0.3367060720920563, "learning_rate": 2.4040260845659835e-06, "loss": 0.018307605758309364, "memory(GiB)": 22.01, "step": 21243, "token_acc": 1.0, "train_speed(iter/s)": 0.956404 }, { "epoch": 0.6901211707760777, "grad_norm": 0.45901182293891907, "learning_rate": 2.403567018600625e-06, "loss": 0.016327179968357086, "memory(GiB)": 22.01, "step": 21244, "token_acc": 1.0, "train_speed(iter/s)": 0.956411 }, { "epoch": 0.6901536562388331, "grad_norm": 0.2975400984287262, "learning_rate": 2.4031079826015584e-06, "loss": 0.011055169627070427, "memory(GiB)": 22.01, "step": 21245, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.6901861417015885, "grad_norm": 0.30494558811187744, "learning_rate": 2.4026489765740824e-06, "loss": 0.012541530653834343, "memory(GiB)": 22.01, "step": 21246, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.6902186271643439, "grad_norm": 0.3105805516242981, "learning_rate": 2.4021900005234966e-06, "loss": 0.015306998044252396, "memory(GiB)": 22.01, "step": 21247, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956432 }, { "epoch": 0.6902511126270994, "grad_norm": 0.3209732472896576, "learning_rate": 2.401731054455094e-06, "loss": 0.015966525301337242, "memory(GiB)": 22.01, "step": 21248, "token_acc": 1.0, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.6902835980898548, "grad_norm": 0.33683010935783386, "learning_rate": 2.4012721383741744e-06, "loss": 0.014917716383934021, "memory(GiB)": 22.01, "step": 21249, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.956447 }, { "epoch": 0.6903160835526102, "grad_norm": 0.4068770408630371, "learning_rate": 2.40081325228603e-06, "loss": 0.01676921173930168, "memory(GiB)": 22.01, "step": 21250, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956454 }, { "epoch": 0.6903485690153657, "grad_norm": 0.3290870189666748, "learning_rate": 2.4003543961959637e-06, "loss": 0.01637895405292511, "memory(GiB)": 22.01, "step": 21251, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.690381054478121, "grad_norm": 0.42244458198547363, "learning_rate": 2.399895570109266e-06, "loss": 0.01959840953350067, "memory(GiB)": 22.01, "step": 21252, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956468 }, { "epoch": 0.6904135399408765, "grad_norm": 0.26795294880867004, "learning_rate": 2.399436774031234e-06, "loss": 0.012120055966079235, "memory(GiB)": 22.01, "step": 21253, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956475 }, { "epoch": 0.6904460254036319, "grad_norm": 0.2143562138080597, "learning_rate": 2.3989780079671653e-06, "loss": 0.011781880632042885, "memory(GiB)": 22.01, "step": 21254, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.6904785108663873, "grad_norm": 0.30901145935058594, "learning_rate": 2.3985192719223483e-06, "loss": 0.016647644340991974, "memory(GiB)": 22.01, "step": 21255, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.6905109963291427, "grad_norm": 0.3073902130126953, "learning_rate": 2.398060565902085e-06, "loss": 0.011403854936361313, "memory(GiB)": 22.01, "step": 21256, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.6905434817918982, "grad_norm": 0.3592035174369812, "learning_rate": 2.397601889911665e-06, "loss": 0.015009989961981773, "memory(GiB)": 22.01, "step": 21257, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.6905759672546535, "grad_norm": 0.37758496403694153, "learning_rate": 2.397143243956384e-06, "loss": 0.012023583985865116, "memory(GiB)": 22.01, "step": 21258, "token_acc": 1.0, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.690608452717409, "grad_norm": 0.38643530011177063, "learning_rate": 2.396684628041532e-06, "loss": 0.012539325281977654, "memory(GiB)": 22.01, "step": 21259, "token_acc": 1.0, "train_speed(iter/s)": 0.956518 }, { "epoch": 0.6906409381801644, "grad_norm": 0.2866186797618866, "learning_rate": 2.3962260421724054e-06, "loss": 0.009782114997506142, "memory(GiB)": 22.01, "step": 21260, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956526 }, { "epoch": 0.6906734236429198, "grad_norm": 0.37577173113822937, "learning_rate": 2.3957674863542957e-06, "loss": 0.017613714560866356, "memory(GiB)": 22.01, "step": 21261, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.6907059091056752, "grad_norm": 0.24203982949256897, "learning_rate": 2.3953089605924975e-06, "loss": 0.008792063221335411, "memory(GiB)": 22.01, "step": 21262, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.6907383945684307, "grad_norm": 0.25522154569625854, "learning_rate": 2.3948504648922984e-06, "loss": 0.010972193442285061, "memory(GiB)": 22.01, "step": 21263, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.690770880031186, "grad_norm": 0.35798394680023193, "learning_rate": 2.394391999258992e-06, "loss": 0.011564554646611214, "memory(GiB)": 22.01, "step": 21264, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956555 }, { "epoch": 0.6908033654939415, "grad_norm": 0.33425840735435486, "learning_rate": 2.3939335636978704e-06, "loss": 0.013675606809556484, "memory(GiB)": 22.01, "step": 21265, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.6908358509566969, "grad_norm": 0.31388798356056213, "learning_rate": 2.3934751582142264e-06, "loss": 0.013460922986268997, "memory(GiB)": 22.01, "step": 21266, "token_acc": 1.0, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.6908683364194523, "grad_norm": 0.40246567130088806, "learning_rate": 2.393016782813346e-06, "loss": 0.02173835039138794, "memory(GiB)": 22.01, "step": 21267, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.6909008218822077, "grad_norm": 0.2693193256855011, "learning_rate": 2.3925584375005224e-06, "loss": 0.012210066430270672, "memory(GiB)": 22.01, "step": 21268, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.6909333073449632, "grad_norm": 0.24051283299922943, "learning_rate": 2.392100122281044e-06, "loss": 0.01136090513318777, "memory(GiB)": 22.01, "step": 21269, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.6909657928077185, "grad_norm": 0.5049495100975037, "learning_rate": 2.3916418371602025e-06, "loss": 0.02867993898689747, "memory(GiB)": 22.01, "step": 21270, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956601 }, { "epoch": 0.690998278270474, "grad_norm": 0.46638020873069763, "learning_rate": 2.3911835821432867e-06, "loss": 0.01974017545580864, "memory(GiB)": 22.01, "step": 21271, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.6910307637332294, "grad_norm": 0.41744640469551086, "learning_rate": 2.3907253572355834e-06, "loss": 0.018234312534332275, "memory(GiB)": 22.01, "step": 21272, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956619 }, { "epoch": 0.6910632491959848, "grad_norm": 0.44034072756767273, "learning_rate": 2.3902671624423823e-06, "loss": 0.016464874148368835, "memory(GiB)": 22.01, "step": 21273, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.956628 }, { "epoch": 0.6910957346587402, "grad_norm": 0.25290244817733765, "learning_rate": 2.3898089977689716e-06, "loss": 0.00990593247115612, "memory(GiB)": 22.01, "step": 21274, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.6911282201214957, "grad_norm": 0.30049142241477966, "learning_rate": 2.3893508632206408e-06, "loss": 0.015635281801223755, "memory(GiB)": 22.01, "step": 21275, "token_acc": 1.0, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.691160705584251, "grad_norm": 0.3249458968639374, "learning_rate": 2.3888927588026746e-06, "loss": 0.01640385016798973, "memory(GiB)": 22.01, "step": 21276, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.6911931910470065, "grad_norm": 0.25626716017723083, "learning_rate": 2.3884346845203627e-06, "loss": 0.011126531288027763, "memory(GiB)": 22.01, "step": 21277, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.6912256765097619, "grad_norm": 0.5884513258934021, "learning_rate": 2.3879766403789865e-06, "loss": 0.025676269084215164, "memory(GiB)": 22.01, "step": 21278, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.956675 }, { "epoch": 0.6912581619725173, "grad_norm": 0.3701426684856415, "learning_rate": 2.387518626383841e-06, "loss": 0.013426955789327621, "memory(GiB)": 22.01, "step": 21279, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.6912906474352727, "grad_norm": 0.6602005958557129, "learning_rate": 2.3870606425402056e-06, "loss": 0.019003212451934814, "memory(GiB)": 22.01, "step": 21280, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.6913231328980282, "grad_norm": 0.36318814754486084, "learning_rate": 2.38660268885337e-06, "loss": 0.018961947411298752, "memory(GiB)": 22.01, "step": 21281, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956704 }, { "epoch": 0.6913556183607835, "grad_norm": 0.5531190633773804, "learning_rate": 2.3861447653286157e-06, "loss": 0.025660578161478043, "memory(GiB)": 22.01, "step": 21282, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956714 }, { "epoch": 0.691388103823539, "grad_norm": 0.376004695892334, "learning_rate": 2.3856868719712285e-06, "loss": 0.015510305762290955, "memory(GiB)": 22.01, "step": 21283, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.6914205892862944, "grad_norm": 0.9127469062805176, "learning_rate": 2.385229008786498e-06, "loss": 0.02177191898226738, "memory(GiB)": 22.01, "step": 21284, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.6914530747490498, "grad_norm": 0.3845761716365814, "learning_rate": 2.3847711757797025e-06, "loss": 0.015100843273103237, "memory(GiB)": 22.01, "step": 21285, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.6914855602118052, "grad_norm": 0.3598080277442932, "learning_rate": 2.384313372956131e-06, "loss": 0.013775194063782692, "memory(GiB)": 22.01, "step": 21286, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956751 }, { "epoch": 0.6915180456745607, "grad_norm": 0.22396817803382874, "learning_rate": 2.383855600321063e-06, "loss": 0.010853134095668793, "memory(GiB)": 22.01, "step": 21287, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.691550531137316, "grad_norm": 1.1369034051895142, "learning_rate": 2.3833978578797827e-06, "loss": 0.011535540223121643, "memory(GiB)": 22.01, "step": 21288, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.6915830166000715, "grad_norm": 0.24536362290382385, "learning_rate": 2.3829401456375743e-06, "loss": 0.00720870029181242, "memory(GiB)": 22.01, "step": 21289, "token_acc": 1.0, "train_speed(iter/s)": 0.956773 }, { "epoch": 0.6916155020628268, "grad_norm": 0.3626214265823364, "learning_rate": 2.382482463599722e-06, "loss": 0.017994025722146034, "memory(GiB)": 22.01, "step": 21290, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.6916479875255823, "grad_norm": 0.34335950016975403, "learning_rate": 2.382024811771503e-06, "loss": 0.01774659752845764, "memory(GiB)": 22.01, "step": 21291, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956789 }, { "epoch": 0.6916804729883377, "grad_norm": 0.45268648862838745, "learning_rate": 2.3815671901582037e-06, "loss": 0.017068682238459587, "memory(GiB)": 22.01, "step": 21292, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956796 }, { "epoch": 0.6917129584510932, "grad_norm": 0.43005257844924927, "learning_rate": 2.3811095987651035e-06, "loss": 0.013216852210462093, "memory(GiB)": 22.01, "step": 21293, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.6917454439138485, "grad_norm": 0.3450887203216553, "learning_rate": 2.3806520375974855e-06, "loss": 0.01923496276140213, "memory(GiB)": 22.01, "step": 21294, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.691777929376604, "grad_norm": 0.33640339970588684, "learning_rate": 2.3801945066606284e-06, "loss": 0.015035943128168583, "memory(GiB)": 22.01, "step": 21295, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.6918104148393593, "grad_norm": 0.37780362367630005, "learning_rate": 2.3797370059598134e-06, "loss": 0.017869720235466957, "memory(GiB)": 22.01, "step": 21296, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956827 }, { "epoch": 0.6918429003021148, "grad_norm": 0.29746386408805847, "learning_rate": 2.3792795355003206e-06, "loss": 0.018692193552851677, "memory(GiB)": 22.01, "step": 21297, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956833 }, { "epoch": 0.6918753857648702, "grad_norm": 0.3510845899581909, "learning_rate": 2.3788220952874302e-06, "loss": 0.013410449028015137, "memory(GiB)": 22.01, "step": 21298, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956841 }, { "epoch": 0.6919078712276256, "grad_norm": 0.9512779712677002, "learning_rate": 2.3783646853264237e-06, "loss": 0.023180823773145676, "memory(GiB)": 22.01, "step": 21299, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.956848 }, { "epoch": 0.691940356690381, "grad_norm": 0.28598618507385254, "learning_rate": 2.3779073056225767e-06, "loss": 0.011570674367249012, "memory(GiB)": 22.01, "step": 21300, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956856 }, { "epoch": 0.6919728421531365, "grad_norm": 0.27573156356811523, "learning_rate": 2.3774499561811688e-06, "loss": 0.011036386713385582, "memory(GiB)": 22.01, "step": 21301, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956864 }, { "epoch": 0.6920053276158918, "grad_norm": 0.2805134356021881, "learning_rate": 2.3769926370074786e-06, "loss": 0.014238037168979645, "memory(GiB)": 22.01, "step": 21302, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.6920378130786473, "grad_norm": 0.3452502191066742, "learning_rate": 2.376535348106787e-06, "loss": 0.01452496089041233, "memory(GiB)": 22.01, "step": 21303, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.6920702985414027, "grad_norm": 0.3057633638381958, "learning_rate": 2.376078089484367e-06, "loss": 0.009711219929158688, "memory(GiB)": 22.01, "step": 21304, "token_acc": 1.0, "train_speed(iter/s)": 0.956886 }, { "epoch": 0.6921027840041581, "grad_norm": 0.23806865513324738, "learning_rate": 2.3756208611455007e-06, "loss": 0.006800966337323189, "memory(GiB)": 22.01, "step": 21305, "token_acc": 1.0, "train_speed(iter/s)": 0.956894 }, { "epoch": 0.6921352694669135, "grad_norm": 0.37556636333465576, "learning_rate": 2.375163663095459e-06, "loss": 0.013459905050694942, "memory(GiB)": 22.01, "step": 21306, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956902 }, { "epoch": 0.692167754929669, "grad_norm": 0.3138878643512726, "learning_rate": 2.3747064953395254e-06, "loss": 0.01386057585477829, "memory(GiB)": 22.01, "step": 21307, "token_acc": 1.0, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.6922002403924243, "grad_norm": 0.5866461992263794, "learning_rate": 2.3742493578829716e-06, "loss": 0.009736109524965286, "memory(GiB)": 22.01, "step": 21308, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.6922327258551798, "grad_norm": 0.5906240344047546, "learning_rate": 2.373792250731077e-06, "loss": 0.027893081307411194, "memory(GiB)": 22.01, "step": 21309, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.6922652113179352, "grad_norm": 0.312976598739624, "learning_rate": 2.3733351738891138e-06, "loss": 0.014455083757638931, "memory(GiB)": 22.01, "step": 21310, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.6922976967806906, "grad_norm": 0.3159993886947632, "learning_rate": 2.372878127362358e-06, "loss": 0.013505300506949425, "memory(GiB)": 22.01, "step": 21311, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956941 }, { "epoch": 0.6923301822434461, "grad_norm": 0.2806859016418457, "learning_rate": 2.3724211111560857e-06, "loss": 0.020703686401247978, "memory(GiB)": 22.01, "step": 21312, "token_acc": 1.0, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.6923626677062015, "grad_norm": 0.339107483625412, "learning_rate": 2.3719641252755734e-06, "loss": 0.01416061818599701, "memory(GiB)": 22.01, "step": 21313, "token_acc": 1.0, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.692395153168957, "grad_norm": 0.41131940484046936, "learning_rate": 2.3715071697260903e-06, "loss": 0.013733494095504284, "memory(GiB)": 22.01, "step": 21314, "token_acc": 1.0, "train_speed(iter/s)": 0.956963 }, { "epoch": 0.6924276386317123, "grad_norm": 0.3146767318248749, "learning_rate": 2.3710502445129136e-06, "loss": 0.014065805822610855, "memory(GiB)": 22.01, "step": 21315, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.6924601240944678, "grad_norm": 0.20888903737068176, "learning_rate": 2.370593349641316e-06, "loss": 0.008522998541593552, "memory(GiB)": 22.01, "step": 21316, "token_acc": 1.0, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.6924926095572231, "grad_norm": 0.3886549174785614, "learning_rate": 2.3701364851165704e-06, "loss": 0.017534306272864342, "memory(GiB)": 22.01, "step": 21317, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.6925250950199786, "grad_norm": 0.28115198016166687, "learning_rate": 2.369679650943952e-06, "loss": 0.00850784219801426, "memory(GiB)": 22.01, "step": 21318, "token_acc": 1.0, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.692557580482734, "grad_norm": 0.5047717094421387, "learning_rate": 2.3692228471287305e-06, "loss": 0.020297912880778313, "memory(GiB)": 22.01, "step": 21319, "token_acc": 0.99, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.6925900659454894, "grad_norm": 0.5050362348556519, "learning_rate": 2.368766073676178e-06, "loss": 0.02261161245405674, "memory(GiB)": 22.01, "step": 21320, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.6926225514082448, "grad_norm": 0.44258832931518555, "learning_rate": 2.3683093305915674e-06, "loss": 0.017499808222055435, "memory(GiB)": 22.01, "step": 21321, "token_acc": 1.0, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.6926550368710003, "grad_norm": 0.396733820438385, "learning_rate": 2.367852617880172e-06, "loss": 0.014270555227994919, "memory(GiB)": 22.01, "step": 21322, "token_acc": 1.0, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.6926875223337556, "grad_norm": 0.38265126943588257, "learning_rate": 2.367395935547259e-06, "loss": 0.01864761859178543, "memory(GiB)": 22.01, "step": 21323, "token_acc": 1.0, "train_speed(iter/s)": 0.957028 }, { "epoch": 0.6927200077965111, "grad_norm": 0.37046825885772705, "learning_rate": 2.3669392835981e-06, "loss": 0.01418701559305191, "memory(GiB)": 22.01, "step": 21324, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.6927524932592665, "grad_norm": 0.41078776121139526, "learning_rate": 2.366482662037967e-06, "loss": 0.019063802435994148, "memory(GiB)": 22.01, "step": 21325, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.957043 }, { "epoch": 0.6927849787220219, "grad_norm": 0.3612363934516907, "learning_rate": 2.366026070872131e-06, "loss": 0.015835624188184738, "memory(GiB)": 22.01, "step": 21326, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.95705 }, { "epoch": 0.6928174641847773, "grad_norm": 0.2702614963054657, "learning_rate": 2.3655695101058584e-06, "loss": 0.012529226951301098, "memory(GiB)": 22.01, "step": 21327, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.6928499496475328, "grad_norm": 0.3268148601055145, "learning_rate": 2.3651129797444215e-06, "loss": 0.01373017393052578, "memory(GiB)": 22.01, "step": 21328, "token_acc": 0.988, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.6928824351102881, "grad_norm": 0.26721179485321045, "learning_rate": 2.3646564797930843e-06, "loss": 0.009162846952676773, "memory(GiB)": 22.01, "step": 21329, "token_acc": 1.0, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.6929149205730436, "grad_norm": 0.2660757899284363, "learning_rate": 2.3642000102571207e-06, "loss": 0.010371379554271698, "memory(GiB)": 22.01, "step": 21330, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957078 }, { "epoch": 0.692947406035799, "grad_norm": 0.4719507396221161, "learning_rate": 2.3637435711417993e-06, "loss": 0.01872815005481243, "memory(GiB)": 22.01, "step": 21331, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.6929798914985544, "grad_norm": 0.26595577597618103, "learning_rate": 2.363287162452384e-06, "loss": 0.013577625155448914, "memory(GiB)": 22.01, "step": 21332, "token_acc": 1.0, "train_speed(iter/s)": 0.957095 }, { "epoch": 0.6930123769613098, "grad_norm": 0.31145691871643066, "learning_rate": 2.3628307841941462e-06, "loss": 0.01437923964112997, "memory(GiB)": 22.01, "step": 21333, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.6930448624240653, "grad_norm": 0.28576645255088806, "learning_rate": 2.3623744363723473e-06, "loss": 0.015678292140364647, "memory(GiB)": 22.01, "step": 21334, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957113 }, { "epoch": 0.6930773478868206, "grad_norm": 0.36229968070983887, "learning_rate": 2.361918118992262e-06, "loss": 0.023393210023641586, "memory(GiB)": 22.01, "step": 21335, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957121 }, { "epoch": 0.6931098333495761, "grad_norm": 0.37362000346183777, "learning_rate": 2.3614618320591502e-06, "loss": 0.01714988984167576, "memory(GiB)": 22.01, "step": 21336, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.6931423188123315, "grad_norm": 0.24943891167640686, "learning_rate": 2.361005575578284e-06, "loss": 0.009890232235193253, "memory(GiB)": 22.01, "step": 21337, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.6931748042750869, "grad_norm": 0.2937172055244446, "learning_rate": 2.360549349554923e-06, "loss": 0.020144494250416756, "memory(GiB)": 22.01, "step": 21338, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.6932072897378423, "grad_norm": 0.44190099835395813, "learning_rate": 2.3600931539943363e-06, "loss": 0.02186434157192707, "memory(GiB)": 22.01, "step": 21339, "token_acc": 1.0, "train_speed(iter/s)": 0.957158 }, { "epoch": 0.6932397752005978, "grad_norm": 0.28058525919914246, "learning_rate": 2.359636988901788e-06, "loss": 0.012562872841954231, "memory(GiB)": 22.01, "step": 21340, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.6932722606633531, "grad_norm": 0.4727572798728943, "learning_rate": 2.359180854282545e-06, "loss": 0.011406244710087776, "memory(GiB)": 22.01, "step": 21341, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.6933047461261086, "grad_norm": 0.33441677689552307, "learning_rate": 2.358724750141868e-06, "loss": 0.01574619486927986, "memory(GiB)": 22.01, "step": 21342, "token_acc": 1.0, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.693337231588864, "grad_norm": 0.32550951838493347, "learning_rate": 2.358268676485023e-06, "loss": 0.013517428189516068, "memory(GiB)": 22.01, "step": 21343, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957197 }, { "epoch": 0.6933697170516194, "grad_norm": 0.9038589596748352, "learning_rate": 2.3578126333172736e-06, "loss": 0.017825022339820862, "memory(GiB)": 22.01, "step": 21344, "token_acc": 1.0, "train_speed(iter/s)": 0.957206 }, { "epoch": 0.6934022025143748, "grad_norm": 0.25932416319847107, "learning_rate": 2.357356620643883e-06, "loss": 0.014340699650347233, "memory(GiB)": 22.01, "step": 21345, "token_acc": 1.0, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.6934346879771303, "grad_norm": 0.26525166630744934, "learning_rate": 2.3569006384701167e-06, "loss": 0.010579200461506844, "memory(GiB)": 22.01, "step": 21346, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.6934671734398856, "grad_norm": 0.32143422961235046, "learning_rate": 2.3564446868012327e-06, "loss": 0.011052705347537994, "memory(GiB)": 22.01, "step": 21347, "token_acc": 1.0, "train_speed(iter/s)": 0.95723 }, { "epoch": 0.6934996589026411, "grad_norm": 0.3637539744377136, "learning_rate": 2.355988765642496e-06, "loss": 0.012967310845851898, "memory(GiB)": 22.01, "step": 21348, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.6935321443653965, "grad_norm": 0.20523664355278015, "learning_rate": 2.3555328749991674e-06, "loss": 0.009815111756324768, "memory(GiB)": 22.01, "step": 21349, "token_acc": 1.0, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.6935646298281519, "grad_norm": 0.4092358350753784, "learning_rate": 2.3550770148765113e-06, "loss": 0.017287906259298325, "memory(GiB)": 22.01, "step": 21350, "token_acc": 1.0, "train_speed(iter/s)": 0.957252 }, { "epoch": 0.6935971152909073, "grad_norm": 0.4383668899536133, "learning_rate": 2.354621185279785e-06, "loss": 0.015045362524688244, "memory(GiB)": 22.01, "step": 21351, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957258 }, { "epoch": 0.6936296007536628, "grad_norm": 0.2985348701477051, "learning_rate": 2.3541653862142518e-06, "loss": 0.01410706713795662, "memory(GiB)": 22.01, "step": 21352, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.6936620862164181, "grad_norm": 0.2860439121723175, "learning_rate": 2.353709617685171e-06, "loss": 0.009095607325434685, "memory(GiB)": 22.01, "step": 21353, "token_acc": 1.0, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.6936945716791736, "grad_norm": 0.23811382055282593, "learning_rate": 2.3532538796978053e-06, "loss": 0.012772567570209503, "memory(GiB)": 22.01, "step": 21354, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957278 }, { "epoch": 0.693727057141929, "grad_norm": 0.3653735816478729, "learning_rate": 2.352798172257411e-06, "loss": 0.01475098542869091, "memory(GiB)": 22.01, "step": 21355, "token_acc": 1.0, "train_speed(iter/s)": 0.957285 }, { "epoch": 0.6937595426046844, "grad_norm": 0.35424894094467163, "learning_rate": 2.352342495369251e-06, "loss": 0.016592662781476974, "memory(GiB)": 22.01, "step": 21356, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.6937920280674398, "grad_norm": 0.2980526089668274, "learning_rate": 2.351886849038579e-06, "loss": 0.008743491023778915, "memory(GiB)": 22.01, "step": 21357, "token_acc": 1.0, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.6938245135301953, "grad_norm": 0.29681238532066345, "learning_rate": 2.3514312332706606e-06, "loss": 0.016138412058353424, "memory(GiB)": 22.01, "step": 21358, "token_acc": 1.0, "train_speed(iter/s)": 0.957305 }, { "epoch": 0.6938569989929506, "grad_norm": 0.48447278141975403, "learning_rate": 2.3509756480707495e-06, "loss": 0.02004065364599228, "memory(GiB)": 22.01, "step": 21359, "token_acc": 0.9834710743801653, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.6938894844557061, "grad_norm": 0.30361583828926086, "learning_rate": 2.350520093444106e-06, "loss": 0.014031754806637764, "memory(GiB)": 22.01, "step": 21360, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.6939219699184614, "grad_norm": 0.3753018081188202, "learning_rate": 2.3500645693959878e-06, "loss": 0.014820774085819721, "memory(GiB)": 22.01, "step": 21361, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957327 }, { "epoch": 0.6939544553812169, "grad_norm": 0.3474605977535248, "learning_rate": 2.349609075931648e-06, "loss": 0.018196437507867813, "memory(GiB)": 22.01, "step": 21362, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957334 }, { "epoch": 0.6939869408439723, "grad_norm": 0.40803682804107666, "learning_rate": 2.3491536130563515e-06, "loss": 0.017248962074518204, "memory(GiB)": 22.01, "step": 21363, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.6940194263067277, "grad_norm": 0.4073292315006256, "learning_rate": 2.3486981807753484e-06, "loss": 0.01658046618103981, "memory(GiB)": 22.01, "step": 21364, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.6940519117694831, "grad_norm": 0.24588438868522644, "learning_rate": 2.3482427790938988e-06, "loss": 0.010604824870824814, "memory(GiB)": 22.01, "step": 21365, "token_acc": 1.0, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.6940843972322386, "grad_norm": 0.5062944889068604, "learning_rate": 2.347787408017255e-06, "loss": 0.020956676453351974, "memory(GiB)": 22.01, "step": 21366, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957367 }, { "epoch": 0.6941168826949939, "grad_norm": 0.4033581018447876, "learning_rate": 2.347332067550675e-06, "loss": 0.01801561564207077, "memory(GiB)": 22.01, "step": 21367, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957376 }, { "epoch": 0.6941493681577494, "grad_norm": 0.25475913286209106, "learning_rate": 2.3468767576994135e-06, "loss": 0.011427418328821659, "memory(GiB)": 22.01, "step": 21368, "token_acc": 1.0, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.6941818536205048, "grad_norm": 0.27369973063468933, "learning_rate": 2.346421478468728e-06, "loss": 0.01192898117005825, "memory(GiB)": 22.01, "step": 21369, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957394 }, { "epoch": 0.6942143390832602, "grad_norm": 0.22425509989261627, "learning_rate": 2.3459662298638676e-06, "loss": 0.011448771692812443, "memory(GiB)": 22.01, "step": 21370, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.6942468245460156, "grad_norm": 0.42305323481559753, "learning_rate": 2.34551101189009e-06, "loss": 0.014624511823058128, "memory(GiB)": 22.01, "step": 21371, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.6942793100087711, "grad_norm": 0.9412809610366821, "learning_rate": 2.3450558245526484e-06, "loss": 0.020011747255921364, "memory(GiB)": 22.01, "step": 21372, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.6943117954715264, "grad_norm": 0.5162537693977356, "learning_rate": 2.344600667856798e-06, "loss": 0.01047227531671524, "memory(GiB)": 22.01, "step": 21373, "token_acc": 0.992, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.6943442809342819, "grad_norm": 0.4010072350502014, "learning_rate": 2.3441455418077887e-06, "loss": 0.011692451313138008, "memory(GiB)": 22.01, "step": 21374, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.6943767663970373, "grad_norm": 0.5026498436927795, "learning_rate": 2.3436904464108745e-06, "loss": 0.02188880555331707, "memory(GiB)": 22.01, "step": 21375, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.6944092518597927, "grad_norm": 0.29402175545692444, "learning_rate": 2.3432353816713077e-06, "loss": 0.012772247195243835, "memory(GiB)": 22.01, "step": 21376, "token_acc": 1.0, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.6944417373225482, "grad_norm": 0.4154277741909027, "learning_rate": 2.3427803475943412e-06, "loss": 0.012299716472625732, "memory(GiB)": 22.01, "step": 21377, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.6944742227853036, "grad_norm": 0.397152841091156, "learning_rate": 2.3423253441852284e-06, "loss": 0.012300604023039341, "memory(GiB)": 22.01, "step": 21378, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.694506708248059, "grad_norm": 0.24430610239505768, "learning_rate": 2.3418703714492165e-06, "loss": 0.00780548807233572, "memory(GiB)": 22.01, "step": 21379, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957469 }, { "epoch": 0.6945391937108144, "grad_norm": 0.44455498456954956, "learning_rate": 2.341415429391558e-06, "loss": 0.014214854687452316, "memory(GiB)": 22.01, "step": 21380, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.957476 }, { "epoch": 0.6945716791735699, "grad_norm": 0.3517105281352997, "learning_rate": 2.340960518017504e-06, "loss": 0.013814684934914112, "memory(GiB)": 22.01, "step": 21381, "token_acc": 0.995, "train_speed(iter/s)": 0.957484 }, { "epoch": 0.6946041646363252, "grad_norm": 0.24753521382808685, "learning_rate": 2.340505637332307e-06, "loss": 0.0076887840405106544, "memory(GiB)": 22.01, "step": 21382, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957491 }, { "epoch": 0.6946366500990807, "grad_norm": 0.394993394613266, "learning_rate": 2.340050787341213e-06, "loss": 0.017935415729880333, "memory(GiB)": 22.01, "step": 21383, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.957499 }, { "epoch": 0.6946691355618361, "grad_norm": 0.5347797870635986, "learning_rate": 2.339595968049475e-06, "loss": 0.023116635158658028, "memory(GiB)": 22.01, "step": 21384, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957507 }, { "epoch": 0.6947016210245915, "grad_norm": 0.41517412662506104, "learning_rate": 2.339141179462337e-06, "loss": 0.017301730811595917, "memory(GiB)": 22.01, "step": 21385, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.6947341064873469, "grad_norm": 0.36225032806396484, "learning_rate": 2.3386864215850545e-06, "loss": 0.013130191713571548, "memory(GiB)": 22.01, "step": 21386, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.957521 }, { "epoch": 0.6947665919501024, "grad_norm": 0.24961894750595093, "learning_rate": 2.3382316944228718e-06, "loss": 0.013825921341776848, "memory(GiB)": 22.01, "step": 21387, "token_acc": 1.0, "train_speed(iter/s)": 0.957528 }, { "epoch": 0.6947990774128577, "grad_norm": 0.4212905466556549, "learning_rate": 2.3377769979810392e-06, "loss": 0.024976208806037903, "memory(GiB)": 22.01, "step": 21388, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.6948315628756132, "grad_norm": 0.271595299243927, "learning_rate": 2.337322332264802e-06, "loss": 0.009010110050439835, "memory(GiB)": 22.01, "step": 21389, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.6948640483383686, "grad_norm": 0.32639288902282715, "learning_rate": 2.336867697279409e-06, "loss": 0.009205223992466927, "memory(GiB)": 22.01, "step": 21390, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.957549 }, { "epoch": 0.694896533801124, "grad_norm": 0.34579968452453613, "learning_rate": 2.3364130930301075e-06, "loss": 0.016010671854019165, "memory(GiB)": 22.01, "step": 21391, "token_acc": 1.0, "train_speed(iter/s)": 0.957556 }, { "epoch": 0.6949290192638794, "grad_norm": 0.33474820852279663, "learning_rate": 2.3359585195221447e-06, "loss": 0.013088645413517952, "memory(GiB)": 22.01, "step": 21392, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.6949615047266349, "grad_norm": 0.547114908695221, "learning_rate": 2.3355039767607672e-06, "loss": 0.02011142671108246, "memory(GiB)": 22.01, "step": 21393, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.6949939901893902, "grad_norm": 0.5868738889694214, "learning_rate": 2.335049464751219e-06, "loss": 0.012490125373005867, "memory(GiB)": 22.01, "step": 21394, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.6950264756521457, "grad_norm": 0.27649831771850586, "learning_rate": 2.3345949834987467e-06, "loss": 0.01275234017521143, "memory(GiB)": 22.01, "step": 21395, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957592 }, { "epoch": 0.6950589611149011, "grad_norm": 0.3229421079158783, "learning_rate": 2.3341405330085963e-06, "loss": 0.015002367086708546, "memory(GiB)": 22.01, "step": 21396, "token_acc": 1.0, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.6950914465776565, "grad_norm": 0.2991608679294586, "learning_rate": 2.3336861132860143e-06, "loss": 0.011081917211413383, "memory(GiB)": 22.01, "step": 21397, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957611 }, { "epoch": 0.6951239320404119, "grad_norm": 0.36473989486694336, "learning_rate": 2.3332317243362414e-06, "loss": 0.014451546594500542, "memory(GiB)": 22.01, "step": 21398, "token_acc": 1.0, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.6951564175031674, "grad_norm": 0.3044387996196747, "learning_rate": 2.3327773661645236e-06, "loss": 0.016461515799164772, "memory(GiB)": 22.01, "step": 21399, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.6951889029659227, "grad_norm": 0.5086476802825928, "learning_rate": 2.3323230387761054e-06, "loss": 0.017729977145791054, "memory(GiB)": 22.01, "step": 21400, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957639 }, { "epoch": 0.6952213884286782, "grad_norm": 0.26302576065063477, "learning_rate": 2.331868742176232e-06, "loss": 0.00823732279241085, "memory(GiB)": 22.01, "step": 21401, "token_acc": 1.0, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.6952538738914336, "grad_norm": 0.22147662937641144, "learning_rate": 2.331414476370142e-06, "loss": 0.008950978517532349, "memory(GiB)": 22.01, "step": 21402, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957658 }, { "epoch": 0.695286359354189, "grad_norm": 0.5640212297439575, "learning_rate": 2.330960241363082e-06, "loss": 0.021112900227308273, "memory(GiB)": 22.01, "step": 21403, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957668 }, { "epoch": 0.6953188448169444, "grad_norm": 0.4231567978858948, "learning_rate": 2.3305060371602922e-06, "loss": 0.019271116703748703, "memory(GiB)": 22.01, "step": 21404, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.6953513302796999, "grad_norm": 0.2827472984790802, "learning_rate": 2.3300518637670183e-06, "loss": 0.012459877878427505, "memory(GiB)": 22.01, "step": 21405, "token_acc": 1.0, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.6953838157424552, "grad_norm": 0.33110177516937256, "learning_rate": 2.3295977211884972e-06, "loss": 0.016956282779574394, "memory(GiB)": 22.01, "step": 21406, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95769 }, { "epoch": 0.6954163012052107, "grad_norm": 0.2591901421546936, "learning_rate": 2.329143609429973e-06, "loss": 0.012090945616364479, "memory(GiB)": 22.01, "step": 21407, "token_acc": 1.0, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.6954487866679661, "grad_norm": 0.30497655272483826, "learning_rate": 2.3286895284966865e-06, "loss": 0.016278982162475586, "memory(GiB)": 22.01, "step": 21408, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957705 }, { "epoch": 0.6954812721307215, "grad_norm": 0.3589244484901428, "learning_rate": 2.328235478393879e-06, "loss": 0.016165899112820625, "memory(GiB)": 22.01, "step": 21409, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.6955137575934769, "grad_norm": 0.41446805000305176, "learning_rate": 2.3277814591267913e-06, "loss": 0.015462111681699753, "memory(GiB)": 22.01, "step": 21410, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.95772 }, { "epoch": 0.6955462430562324, "grad_norm": 0.3608671724796295, "learning_rate": 2.3273274707006598e-06, "loss": 0.018501685932278633, "memory(GiB)": 22.01, "step": 21411, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957728 }, { "epoch": 0.6955787285189877, "grad_norm": 0.44990748167037964, "learning_rate": 2.3268735131207294e-06, "loss": 0.014875149354338646, "memory(GiB)": 22.01, "step": 21412, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.6956112139817432, "grad_norm": 0.28052911162376404, "learning_rate": 2.326419586392232e-06, "loss": 0.01108273770660162, "memory(GiB)": 22.01, "step": 21413, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.6956436994444986, "grad_norm": 0.30801594257354736, "learning_rate": 2.325965690520416e-06, "loss": 0.01307997852563858, "memory(GiB)": 22.01, "step": 21414, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957749 }, { "epoch": 0.695676184907254, "grad_norm": 0.26874762773513794, "learning_rate": 2.3255118255105123e-06, "loss": 0.013732600957155228, "memory(GiB)": 22.01, "step": 21415, "token_acc": 1.0, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.6957086703700094, "grad_norm": 0.31848323345184326, "learning_rate": 2.3250579913677645e-06, "loss": 0.013015071861445904, "memory(GiB)": 22.01, "step": 21416, "token_acc": 1.0, "train_speed(iter/s)": 0.957763 }, { "epoch": 0.6957411558327649, "grad_norm": 0.3921443521976471, "learning_rate": 2.324604188097405e-06, "loss": 0.01551809161901474, "memory(GiB)": 22.01, "step": 21417, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.95777 }, { "epoch": 0.6957736412955202, "grad_norm": 0.3744838535785675, "learning_rate": 2.324150415704675e-06, "loss": 0.018068306148052216, "memory(GiB)": 22.01, "step": 21418, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957778 }, { "epoch": 0.6958061267582757, "grad_norm": 0.19227799773216248, "learning_rate": 2.3236966741948103e-06, "loss": 0.008675949648022652, "memory(GiB)": 22.01, "step": 21419, "token_acc": 1.0, "train_speed(iter/s)": 0.957785 }, { "epoch": 0.695838612221031, "grad_norm": 0.39577457308769226, "learning_rate": 2.32324296357305e-06, "loss": 0.013594601303339005, "memory(GiB)": 22.01, "step": 21420, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.6958710976837865, "grad_norm": 0.2849308252334595, "learning_rate": 2.322789283844627e-06, "loss": 0.013975165784358978, "memory(GiB)": 22.01, "step": 21421, "token_acc": 0.9967105263157895, "train_speed(iter/s)": 0.9578 }, { "epoch": 0.6959035831465419, "grad_norm": 0.2409476786851883, "learning_rate": 2.3223356350147786e-06, "loss": 0.009689465165138245, "memory(GiB)": 22.01, "step": 21422, "token_acc": 1.0, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.6959360686092974, "grad_norm": 0.3817404806613922, "learning_rate": 2.321882017088741e-06, "loss": 0.02011418342590332, "memory(GiB)": 22.01, "step": 21423, "token_acc": 1.0, "train_speed(iter/s)": 0.957814 }, { "epoch": 0.6959685540720527, "grad_norm": 0.27213945984840393, "learning_rate": 2.32142843007175e-06, "loss": 0.010839986614882946, "memory(GiB)": 22.01, "step": 21424, "token_acc": 1.0, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.6960010395348082, "grad_norm": 0.2952413260936737, "learning_rate": 2.320974873969042e-06, "loss": 0.011466042138636112, "memory(GiB)": 22.01, "step": 21425, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.6960335249975635, "grad_norm": 0.41459399461746216, "learning_rate": 2.3205213487858475e-06, "loss": 0.018529094755649567, "memory(GiB)": 22.01, "step": 21426, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.696066010460319, "grad_norm": 0.27667635679244995, "learning_rate": 2.320067854527403e-06, "loss": 0.010401587001979351, "memory(GiB)": 22.01, "step": 21427, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957849 }, { "epoch": 0.6960984959230744, "grad_norm": 0.3692817687988281, "learning_rate": 2.319614391198942e-06, "loss": 0.015414793975651264, "memory(GiB)": 22.01, "step": 21428, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.6961309813858299, "grad_norm": 0.2545093297958374, "learning_rate": 2.3191609588057013e-06, "loss": 0.012349221855401993, "memory(GiB)": 22.01, "step": 21429, "token_acc": 1.0, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.6961634668485852, "grad_norm": 0.3084111511707306, "learning_rate": 2.3187075573529088e-06, "loss": 0.014465872198343277, "memory(GiB)": 22.01, "step": 21430, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957878 }, { "epoch": 0.6961959523113407, "grad_norm": 0.36099734902381897, "learning_rate": 2.3182541868458004e-06, "loss": 0.02032715454697609, "memory(GiB)": 22.01, "step": 21431, "token_acc": 1.0, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.696228437774096, "grad_norm": 0.3712664842605591, "learning_rate": 2.3178008472896076e-06, "loss": 0.012445257976651192, "memory(GiB)": 22.01, "step": 21432, "token_acc": 0.984, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.6962609232368515, "grad_norm": 0.233305424451828, "learning_rate": 2.317347538689565e-06, "loss": 0.014286795631051064, "memory(GiB)": 22.01, "step": 21433, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.6962934086996069, "grad_norm": 0.3078964948654175, "learning_rate": 2.3168942610509004e-06, "loss": 0.0074250707402825356, "memory(GiB)": 22.01, "step": 21434, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.6963258941623623, "grad_norm": 0.19786596298217773, "learning_rate": 2.3164410143788498e-06, "loss": 0.006730088032782078, "memory(GiB)": 22.01, "step": 21435, "token_acc": 1.0, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.6963583796251177, "grad_norm": 0.5070544481277466, "learning_rate": 2.3159877986786372e-06, "loss": 0.019942808896303177, "memory(GiB)": 22.01, "step": 21436, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.6963908650878732, "grad_norm": 0.4073243737220764, "learning_rate": 2.3155346139555004e-06, "loss": 0.020506776869297028, "memory(GiB)": 22.01, "step": 21437, "token_acc": 1.0, "train_speed(iter/s)": 0.957937 }, { "epoch": 0.6964233505506285, "grad_norm": 0.409601092338562, "learning_rate": 2.3150814602146684e-06, "loss": 0.020890671759843826, "memory(GiB)": 22.01, "step": 21438, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.696455836013384, "grad_norm": 0.24948708713054657, "learning_rate": 2.3146283374613683e-06, "loss": 0.01107766106724739, "memory(GiB)": 22.01, "step": 21439, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957952 }, { "epoch": 0.6964883214761395, "grad_norm": 0.3651096820831299, "learning_rate": 2.314175245700833e-06, "loss": 0.013004491105675697, "memory(GiB)": 22.01, "step": 21440, "token_acc": 1.0, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.6965208069388948, "grad_norm": 0.4479062855243683, "learning_rate": 2.3137221849382867e-06, "loss": 0.01948515884578228, "memory(GiB)": 22.01, "step": 21441, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.6965532924016503, "grad_norm": 0.31289201974868774, "learning_rate": 2.3132691551789653e-06, "loss": 0.012576311826705933, "memory(GiB)": 22.01, "step": 21442, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.6965857778644057, "grad_norm": 0.3449569046497345, "learning_rate": 2.312816156428093e-06, "loss": 0.01702546328306198, "memory(GiB)": 22.01, "step": 21443, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957982 }, { "epoch": 0.6966182633271611, "grad_norm": 0.37486305832862854, "learning_rate": 2.3123631886908997e-06, "loss": 0.0142223434522748, "memory(GiB)": 22.01, "step": 21444, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.6966507487899165, "grad_norm": 0.3345494568347931, "learning_rate": 2.3119102519726105e-06, "loss": 0.016793400049209595, "memory(GiB)": 22.01, "step": 21445, "token_acc": 1.0, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.696683234252672, "grad_norm": 0.29339438676834106, "learning_rate": 2.3114573462784556e-06, "loss": 0.015877846628427505, "memory(GiB)": 22.01, "step": 21446, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.6967157197154273, "grad_norm": 0.5434234142303467, "learning_rate": 2.3110044716136608e-06, "loss": 0.023242324590682983, "memory(GiB)": 22.01, "step": 21447, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.6967482051781828, "grad_norm": 0.29385170340538025, "learning_rate": 2.3105516279834556e-06, "loss": 0.017084110528230667, "memory(GiB)": 22.01, "step": 21448, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.958019 }, { "epoch": 0.6967806906409382, "grad_norm": 0.5272731781005859, "learning_rate": 2.3100988153930624e-06, "loss": 0.01695401594042778, "memory(GiB)": 22.01, "step": 21449, "token_acc": 0.992, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.6968131761036936, "grad_norm": 0.3996860086917877, "learning_rate": 2.3096460338477088e-06, "loss": 0.020283976569771767, "memory(GiB)": 22.01, "step": 21450, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.958034 }, { "epoch": 0.696845661566449, "grad_norm": 0.4223940670490265, "learning_rate": 2.309193283352621e-06, "loss": 0.01270303875207901, "memory(GiB)": 22.01, "step": 21451, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.6968781470292045, "grad_norm": 0.27090179920196533, "learning_rate": 2.308740563913026e-06, "loss": 0.011396961286664009, "memory(GiB)": 22.01, "step": 21452, "token_acc": 1.0, "train_speed(iter/s)": 0.958047 }, { "epoch": 0.6969106324919598, "grad_norm": 0.4139746427536011, "learning_rate": 2.3082878755341455e-06, "loss": 0.0187385156750679, "memory(GiB)": 22.01, "step": 21453, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.958053 }, { "epoch": 0.6969431179547153, "grad_norm": 0.6375681161880493, "learning_rate": 2.307835218221205e-06, "loss": 0.019856702536344528, "memory(GiB)": 22.01, "step": 21454, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.6969756034174707, "grad_norm": 0.7622684836387634, "learning_rate": 2.3073825919794295e-06, "loss": 0.01652638241648674, "memory(GiB)": 22.01, "step": 21455, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.6970080888802261, "grad_norm": 0.36084964871406555, "learning_rate": 2.306929996814043e-06, "loss": 0.013430170714855194, "memory(GiB)": 22.01, "step": 21456, "token_acc": 1.0, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.6970405743429815, "grad_norm": 0.3898746073246002, "learning_rate": 2.306477432730271e-06, "loss": 0.01669737510383129, "memory(GiB)": 22.01, "step": 21457, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.958082 }, { "epoch": 0.697073059805737, "grad_norm": 0.3920777142047882, "learning_rate": 2.3060248997333317e-06, "loss": 0.021961040794849396, "memory(GiB)": 22.01, "step": 21458, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.6971055452684923, "grad_norm": 0.7093819379806519, "learning_rate": 2.3055723978284513e-06, "loss": 0.015973705798387527, "memory(GiB)": 22.01, "step": 21459, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.6971380307312478, "grad_norm": 0.32258233428001404, "learning_rate": 2.305119927020851e-06, "loss": 0.01895364187657833, "memory(GiB)": 22.01, "step": 21460, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.958109 }, { "epoch": 0.6971705161940032, "grad_norm": 0.40256011486053467, "learning_rate": 2.3046674873157566e-06, "loss": 0.01573115400969982, "memory(GiB)": 22.01, "step": 21461, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.6972030016567586, "grad_norm": 0.26269903779029846, "learning_rate": 2.304215078718384e-06, "loss": 0.012314140796661377, "memory(GiB)": 22.01, "step": 21462, "token_acc": 1.0, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.697235487119514, "grad_norm": 0.34504154324531555, "learning_rate": 2.3037627012339602e-06, "loss": 0.015572680160403252, "memory(GiB)": 22.01, "step": 21463, "token_acc": 1.0, "train_speed(iter/s)": 0.958137 }, { "epoch": 0.6972679725822695, "grad_norm": 0.34562888741493225, "learning_rate": 2.3033103548676994e-06, "loss": 0.012853465974330902, "memory(GiB)": 22.01, "step": 21464, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958146 }, { "epoch": 0.6973004580450248, "grad_norm": 0.596497654914856, "learning_rate": 2.3028580396248303e-06, "loss": 0.025670746341347694, "memory(GiB)": 22.01, "step": 21465, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.958154 }, { "epoch": 0.6973329435077803, "grad_norm": 0.4710296392440796, "learning_rate": 2.3024057555105673e-06, "loss": 0.016959331929683685, "memory(GiB)": 22.01, "step": 21466, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.6973654289705357, "grad_norm": 1.1929537057876587, "learning_rate": 2.3019535025301336e-06, "loss": 0.016606919467449188, "memory(GiB)": 22.01, "step": 21467, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.6973979144332911, "grad_norm": 0.34338870644569397, "learning_rate": 2.3015012806887465e-06, "loss": 0.015984777361154556, "memory(GiB)": 22.01, "step": 21468, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.958175 }, { "epoch": 0.6974303998960465, "grad_norm": 0.23309969902038574, "learning_rate": 2.301049089991624e-06, "loss": 0.01004544086754322, "memory(GiB)": 22.01, "step": 21469, "token_acc": 1.0, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.697462885358802, "grad_norm": 0.34860849380493164, "learning_rate": 2.300596930443992e-06, "loss": 0.013383901678025723, "memory(GiB)": 22.01, "step": 21470, "token_acc": 1.0, "train_speed(iter/s)": 0.95819 }, { "epoch": 0.6974953708215573, "grad_norm": 0.3698155879974365, "learning_rate": 2.3001448020510616e-06, "loss": 0.012953372672200203, "memory(GiB)": 22.01, "step": 21471, "token_acc": 1.0, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.6975278562843128, "grad_norm": 0.3536299765110016, "learning_rate": 2.2996927048180555e-06, "loss": 0.01603202149271965, "memory(GiB)": 22.01, "step": 21472, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958204 }, { "epoch": 0.6975603417470682, "grad_norm": 0.37554827332496643, "learning_rate": 2.299240638750188e-06, "loss": 0.011597683653235435, "memory(GiB)": 22.01, "step": 21473, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.6975928272098236, "grad_norm": 0.2787095904350281, "learning_rate": 2.2987886038526786e-06, "loss": 0.010827092453837395, "memory(GiB)": 22.01, "step": 21474, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.697625312672579, "grad_norm": 0.31880053877830505, "learning_rate": 2.2983366001307432e-06, "loss": 0.015203073620796204, "memory(GiB)": 22.01, "step": 21475, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958223 }, { "epoch": 0.6976577981353345, "grad_norm": 0.2639370858669281, "learning_rate": 2.2978846275896017e-06, "loss": 0.013133417814970016, "memory(GiB)": 22.01, "step": 21476, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.6976902835980898, "grad_norm": 0.3408051133155823, "learning_rate": 2.2974326862344663e-06, "loss": 0.01770249381661415, "memory(GiB)": 22.01, "step": 21477, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958237 }, { "epoch": 0.6977227690608453, "grad_norm": 0.286703497171402, "learning_rate": 2.296980776070555e-06, "loss": 0.015040908008813858, "memory(GiB)": 22.01, "step": 21478, "token_acc": 0.992, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.6977552545236007, "grad_norm": 0.41756242513656616, "learning_rate": 2.2965288971030823e-06, "loss": 0.017292268574237823, "memory(GiB)": 22.01, "step": 21479, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95825 }, { "epoch": 0.6977877399863561, "grad_norm": 0.4322505295276642, "learning_rate": 2.2960770493372673e-06, "loss": 0.019332993775606155, "memory(GiB)": 22.01, "step": 21480, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958257 }, { "epoch": 0.6978202254491115, "grad_norm": 0.2746678590774536, "learning_rate": 2.2956252327783197e-06, "loss": 0.011918595060706139, "memory(GiB)": 22.01, "step": 21481, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.958264 }, { "epoch": 0.697852710911867, "grad_norm": 0.27658727765083313, "learning_rate": 2.295173447431457e-06, "loss": 0.009728454053401947, "memory(GiB)": 22.01, "step": 21482, "token_acc": 1.0, "train_speed(iter/s)": 0.958272 }, { "epoch": 0.6978851963746223, "grad_norm": 0.34748575091362, "learning_rate": 2.2947216933018923e-06, "loss": 0.014700263738632202, "memory(GiB)": 22.01, "step": 21483, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.958279 }, { "epoch": 0.6979176818373778, "grad_norm": 0.22981378436088562, "learning_rate": 2.29426997039484e-06, "loss": 0.008449108339846134, "memory(GiB)": 22.01, "step": 21484, "token_acc": 1.0, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.6979501673001332, "grad_norm": 0.3587570786476135, "learning_rate": 2.2938182787155154e-06, "loss": 0.0144155602902174, "memory(GiB)": 22.01, "step": 21485, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.958297 }, { "epoch": 0.6979826527628886, "grad_norm": 0.2815176248550415, "learning_rate": 2.2933666182691284e-06, "loss": 0.016203077509999275, "memory(GiB)": 22.01, "step": 21486, "token_acc": 1.0, "train_speed(iter/s)": 0.958307 }, { "epoch": 0.698015138225644, "grad_norm": 0.3227032423019409, "learning_rate": 2.2929149890608925e-06, "loss": 0.017435025423765182, "memory(GiB)": 22.01, "step": 21487, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.6980476236883995, "grad_norm": 0.38042399287223816, "learning_rate": 2.292463391096021e-06, "loss": 0.017689641565084457, "memory(GiB)": 22.01, "step": 21488, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958326 }, { "epoch": 0.6980801091511548, "grad_norm": 0.24087242782115936, "learning_rate": 2.2920118243797278e-06, "loss": 0.01351882703602314, "memory(GiB)": 22.01, "step": 21489, "token_acc": 1.0, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.6981125946139103, "grad_norm": 0.30634692311286926, "learning_rate": 2.29156028891722e-06, "loss": 0.01664102077484131, "memory(GiB)": 22.01, "step": 21490, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.6981450800766656, "grad_norm": 0.33243057131767273, "learning_rate": 2.2911087847137143e-06, "loss": 0.011981519870460033, "memory(GiB)": 22.01, "step": 21491, "token_acc": 0.992, "train_speed(iter/s)": 0.958355 }, { "epoch": 0.6981775655394211, "grad_norm": 0.2827881872653961, "learning_rate": 2.290657311774414e-06, "loss": 0.016006261110305786, "memory(GiB)": 22.01, "step": 21492, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958364 }, { "epoch": 0.6982100510021765, "grad_norm": 0.36283057928085327, "learning_rate": 2.2902058701045387e-06, "loss": 0.018678175285458565, "memory(GiB)": 22.01, "step": 21493, "token_acc": 1.0, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.698242536464932, "grad_norm": 0.30249035358428955, "learning_rate": 2.2897544597092924e-06, "loss": 0.012503577396273613, "memory(GiB)": 22.01, "step": 21494, "token_acc": 1.0, "train_speed(iter/s)": 0.958383 }, { "epoch": 0.6982750219276873, "grad_norm": 0.2533659338951111, "learning_rate": 2.2893030805938887e-06, "loss": 0.01688311994075775, "memory(GiB)": 22.01, "step": 21495, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958393 }, { "epoch": 0.6983075073904428, "grad_norm": 0.39040982723236084, "learning_rate": 2.2888517327635333e-06, "loss": 0.016005687415599823, "memory(GiB)": 22.01, "step": 21496, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.958402 }, { "epoch": 0.6983399928531981, "grad_norm": 0.29785725474357605, "learning_rate": 2.288400416223438e-06, "loss": 0.0100206034258008, "memory(GiB)": 22.01, "step": 21497, "token_acc": 1.0, "train_speed(iter/s)": 0.958411 }, { "epoch": 0.6983724783159536, "grad_norm": 0.3281145393848419, "learning_rate": 2.287949130978811e-06, "loss": 0.013142341747879982, "memory(GiB)": 22.01, "step": 21498, "token_acc": 1.0, "train_speed(iter/s)": 0.958419 }, { "epoch": 0.698404963778709, "grad_norm": 0.4638749957084656, "learning_rate": 2.2874978770348604e-06, "loss": 0.016367707401514053, "memory(GiB)": 22.01, "step": 21499, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958426 }, { "epoch": 0.6984374492414644, "grad_norm": 0.3693210184574127, "learning_rate": 2.2870466543967967e-06, "loss": 0.01536945253610611, "memory(GiB)": 22.01, "step": 21500, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958434 }, { "epoch": 0.6984374492414644, "eval_loss": 0.015384189784526825, "eval_runtime": 80.5781, "eval_samples_per_second": 123.483, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9937877219988434, "step": 21500 }, { "epoch": 0.6984699347042198, "grad_norm": 0.4232710003852844, "learning_rate": 2.2865954630698235e-06, "loss": 0.02021804265677929, "memory(GiB)": 22.66, "step": 21501, "token_acc": 0.9934679904142417, "train_speed(iter/s)": 0.954554 }, { "epoch": 0.6985024201669753, "grad_norm": 0.32802045345306396, "learning_rate": 2.2861443030591503e-06, "loss": 0.01826707273721695, "memory(GiB)": 22.66, "step": 21502, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.954561 }, { "epoch": 0.6985349056297306, "grad_norm": 0.31170204281806946, "learning_rate": 2.285693174369984e-06, "loss": 0.016922039911150932, "memory(GiB)": 22.66, "step": 21503, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.954568 }, { "epoch": 0.6985673910924861, "grad_norm": 0.29729124903678894, "learning_rate": 2.285242077007533e-06, "loss": 0.010343106463551521, "memory(GiB)": 22.66, "step": 21504, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.954578 }, { "epoch": 0.6985998765552416, "grad_norm": 3.7145488262176514, "learning_rate": 2.284791010977e-06, "loss": 0.01918942481279373, "memory(GiB)": 22.66, "step": 21505, "token_acc": 0.9813432835820896, "train_speed(iter/s)": 0.954587 }, { "epoch": 0.6986323620179969, "grad_norm": 0.2670016288757324, "learning_rate": 2.2843399762835926e-06, "loss": 0.013290958479046822, "memory(GiB)": 22.66, "step": 21506, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.954596 }, { "epoch": 0.6986648474807524, "grad_norm": 0.23733645677566528, "learning_rate": 2.2838889729325164e-06, "loss": 0.009490478783845901, "memory(GiB)": 22.66, "step": 21507, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.954605 }, { "epoch": 0.6986973329435078, "grad_norm": 0.35775312781333923, "learning_rate": 2.2834380009289785e-06, "loss": 0.012696046382188797, "memory(GiB)": 22.66, "step": 21508, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954613 }, { "epoch": 0.6987298184062632, "grad_norm": 0.39411044120788574, "learning_rate": 2.28298706027818e-06, "loss": 0.018801627680659294, "memory(GiB)": 22.66, "step": 21509, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.954623 }, { "epoch": 0.6987623038690186, "grad_norm": 0.2931782603263855, "learning_rate": 2.2825361509853274e-06, "loss": 0.010780845768749714, "memory(GiB)": 22.66, "step": 21510, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954632 }, { "epoch": 0.6987947893317741, "grad_norm": 0.7103550434112549, "learning_rate": 2.2820852730556238e-06, "loss": 0.017047878354787827, "memory(GiB)": 22.66, "step": 21511, "token_acc": 0.996742671009772, "train_speed(iter/s)": 0.954642 }, { "epoch": 0.6988272747945294, "grad_norm": 0.30719193816185, "learning_rate": 2.281634426494276e-06, "loss": 0.016448520123958588, "memory(GiB)": 22.66, "step": 21512, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.954651 }, { "epoch": 0.6988597602572849, "grad_norm": 0.3484828472137451, "learning_rate": 2.2811836113064834e-06, "loss": 0.011504705995321274, "memory(GiB)": 22.66, "step": 21513, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.6988922457200403, "grad_norm": 0.4354070723056793, "learning_rate": 2.280732827497452e-06, "loss": 0.02129734866321087, "memory(GiB)": 22.66, "step": 21514, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954658 }, { "epoch": 0.6989247311827957, "grad_norm": 0.5378680229187012, "learning_rate": 2.2802820750723797e-06, "loss": 0.01676240935921669, "memory(GiB)": 22.66, "step": 21515, "token_acc": 1.0, "train_speed(iter/s)": 0.954667 }, { "epoch": 0.6989572166455511, "grad_norm": 0.29668712615966797, "learning_rate": 2.2798313540364743e-06, "loss": 0.013233967125415802, "memory(GiB)": 22.66, "step": 21516, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954677 }, { "epoch": 0.6989897021083066, "grad_norm": 0.2742037773132324, "learning_rate": 2.2793806643949375e-06, "loss": 0.010931331664323807, "memory(GiB)": 22.66, "step": 21517, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.954686 }, { "epoch": 0.6990221875710619, "grad_norm": 0.3627053201198578, "learning_rate": 2.2789300061529662e-06, "loss": 0.020557556301355362, "memory(GiB)": 22.66, "step": 21518, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.954696 }, { "epoch": 0.6990546730338174, "grad_norm": 0.4800223112106323, "learning_rate": 2.2784793793157672e-06, "loss": 0.017021790146827698, "memory(GiB)": 22.66, "step": 21519, "token_acc": 1.0, "train_speed(iter/s)": 0.954705 }, { "epoch": 0.6990871584965728, "grad_norm": 0.3499114513397217, "learning_rate": 2.278028783888534e-06, "loss": 0.015439225360751152, "memory(GiB)": 22.66, "step": 21520, "token_acc": 1.0, "train_speed(iter/s)": 0.954715 }, { "epoch": 0.6991196439593282, "grad_norm": 0.367885947227478, "learning_rate": 2.277578219876476e-06, "loss": 0.012829703278839588, "memory(GiB)": 22.66, "step": 21521, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.954725 }, { "epoch": 0.6991521294220836, "grad_norm": 0.35424941778182983, "learning_rate": 2.2771276872847864e-06, "loss": 0.017833586782217026, "memory(GiB)": 22.66, "step": 21522, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.954735 }, { "epoch": 0.6991846148848391, "grad_norm": 0.3668130934238434, "learning_rate": 2.27667718611867e-06, "loss": 0.015660542994737625, "memory(GiB)": 22.66, "step": 21523, "token_acc": 1.0, "train_speed(iter/s)": 0.954743 }, { "epoch": 0.6992171003475944, "grad_norm": 0.39637669920921326, "learning_rate": 2.2762267163833195e-06, "loss": 0.011771559715270996, "memory(GiB)": 22.66, "step": 21524, "token_acc": 1.0, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.6992495858103499, "grad_norm": 0.29580697417259216, "learning_rate": 2.2757762780839414e-06, "loss": 0.011114818044006824, "memory(GiB)": 22.66, "step": 21525, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954757 }, { "epoch": 0.6992820712731053, "grad_norm": 0.338270366191864, "learning_rate": 2.2753258712257296e-06, "loss": 0.015750229358673096, "memory(GiB)": 22.66, "step": 21526, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.954765 }, { "epoch": 0.6993145567358607, "grad_norm": 0.9500767588615417, "learning_rate": 2.2748754958138858e-06, "loss": 0.025678850710392, "memory(GiB)": 22.66, "step": 21527, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.954772 }, { "epoch": 0.6993470421986161, "grad_norm": 0.3635411560535431, "learning_rate": 2.274425151853604e-06, "loss": 0.010235745459794998, "memory(GiB)": 22.66, "step": 21528, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.954779 }, { "epoch": 0.6993795276613716, "grad_norm": 0.36468052864074707, "learning_rate": 2.2739748393500834e-06, "loss": 0.014536203816533089, "memory(GiB)": 22.66, "step": 21529, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954787 }, { "epoch": 0.6994120131241269, "grad_norm": 0.32828405499458313, "learning_rate": 2.273524558308521e-06, "loss": 0.023419838398694992, "memory(GiB)": 22.66, "step": 21530, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954794 }, { "epoch": 0.6994444985868824, "grad_norm": 0.35234105587005615, "learning_rate": 2.2730743087341144e-06, "loss": 0.012110377661883831, "memory(GiB)": 22.66, "step": 21531, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.954801 }, { "epoch": 0.6994769840496378, "grad_norm": 0.41729894280433655, "learning_rate": 2.2726240906320623e-06, "loss": 0.01513428520411253, "memory(GiB)": 22.66, "step": 21532, "token_acc": 1.0, "train_speed(iter/s)": 0.954809 }, { "epoch": 0.6995094695123932, "grad_norm": 0.2824758291244507, "learning_rate": 2.2721739040075558e-06, "loss": 0.013453057035803795, "memory(GiB)": 22.66, "step": 21533, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.954817 }, { "epoch": 0.6995419549751486, "grad_norm": 0.3132080137729645, "learning_rate": 2.271723748865794e-06, "loss": 0.016678785905241966, "memory(GiB)": 22.66, "step": 21534, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954825 }, { "epoch": 0.6995744404379041, "grad_norm": 0.3613368272781372, "learning_rate": 2.2712736252119705e-06, "loss": 0.015302001498639584, "memory(GiB)": 22.66, "step": 21535, "token_acc": 1.0, "train_speed(iter/s)": 0.954832 }, { "epoch": 0.6996069259006594, "grad_norm": 0.2849629521369934, "learning_rate": 2.2708235330512832e-06, "loss": 0.010103285312652588, "memory(GiB)": 22.66, "step": 21536, "token_acc": 1.0, "train_speed(iter/s)": 0.95484 }, { "epoch": 0.6996394113634149, "grad_norm": 0.28364384174346924, "learning_rate": 2.270373472388924e-06, "loss": 0.012052111327648163, "memory(GiB)": 22.66, "step": 21537, "token_acc": 1.0, "train_speed(iter/s)": 0.954848 }, { "epoch": 0.6996718968261703, "grad_norm": 0.25840404629707336, "learning_rate": 2.2699234432300873e-06, "loss": 0.011955584399402142, "memory(GiB)": 22.66, "step": 21538, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954855 }, { "epoch": 0.6997043822889257, "grad_norm": 0.35475462675094604, "learning_rate": 2.269473445579968e-06, "loss": 0.014312325976788998, "memory(GiB)": 22.66, "step": 21539, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.6997368677516811, "grad_norm": 0.3326004445552826, "learning_rate": 2.269023479443761e-06, "loss": 0.015245813876390457, "memory(GiB)": 22.66, "step": 21540, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.6997693532144366, "grad_norm": 0.35286882519721985, "learning_rate": 2.2685735448266567e-06, "loss": 0.014718594029545784, "memory(GiB)": 22.66, "step": 21541, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.954878 }, { "epoch": 0.6998018386771919, "grad_norm": 0.3593999147415161, "learning_rate": 2.2681236417338486e-06, "loss": 0.014453600160777569, "memory(GiB)": 22.66, "step": 21542, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.6998343241399474, "grad_norm": 0.25792932510375977, "learning_rate": 2.26767377017053e-06, "loss": 0.010016795247793198, "memory(GiB)": 22.66, "step": 21543, "token_acc": 1.0, "train_speed(iter/s)": 0.95489 }, { "epoch": 0.6998668096027028, "grad_norm": 0.3366548717021942, "learning_rate": 2.267223930141895e-06, "loss": 0.01684233546257019, "memory(GiB)": 22.66, "step": 21544, "token_acc": 1.0, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.6998992950654582, "grad_norm": 0.334718257188797, "learning_rate": 2.2667741216531307e-06, "loss": 0.016239993274211884, "memory(GiB)": 22.66, "step": 21545, "token_acc": 1.0, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.6999317805282136, "grad_norm": 0.3856789767742157, "learning_rate": 2.266324344709431e-06, "loss": 0.016880188137292862, "memory(GiB)": 22.66, "step": 21546, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.954912 }, { "epoch": 0.6999642659909691, "grad_norm": 0.4301266074180603, "learning_rate": 2.2658745993159874e-06, "loss": 0.013264094479382038, "memory(GiB)": 22.66, "step": 21547, "token_acc": 1.0, "train_speed(iter/s)": 0.954919 }, { "epoch": 0.6999967514537244, "grad_norm": 0.39768391847610474, "learning_rate": 2.26542488547799e-06, "loss": 0.012916682288050652, "memory(GiB)": 22.66, "step": 21548, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954926 }, { "epoch": 0.7000292369164799, "grad_norm": 0.2556025981903076, "learning_rate": 2.2649752032006306e-06, "loss": 0.00959167443215847, "memory(GiB)": 22.66, "step": 21549, "token_acc": 1.0, "train_speed(iter/s)": 0.954934 }, { "epoch": 0.7000617223792353, "grad_norm": 0.2695518732070923, "learning_rate": 2.264525552489096e-06, "loss": 0.013446621596813202, "memory(GiB)": 22.66, "step": 21550, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.7000942078419907, "grad_norm": 0.2831198573112488, "learning_rate": 2.2640759333485794e-06, "loss": 0.015454668551683426, "memory(GiB)": 22.66, "step": 21551, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.7001266933047461, "grad_norm": 0.29821646213531494, "learning_rate": 2.263626345784264e-06, "loss": 0.012195132672786713, "memory(GiB)": 22.66, "step": 21552, "token_acc": 1.0, "train_speed(iter/s)": 0.954956 }, { "epoch": 0.7001591787675016, "grad_norm": 0.31157904863357544, "learning_rate": 2.2631767898013467e-06, "loss": 0.010716051794588566, "memory(GiB)": 22.66, "step": 21553, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.7001916642302569, "grad_norm": 0.42273563146591187, "learning_rate": 2.2627272654050097e-06, "loss": 0.019768014550209045, "memory(GiB)": 22.66, "step": 21554, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.7002241496930124, "grad_norm": 0.39784330129623413, "learning_rate": 2.2622777726004457e-06, "loss": 0.01415939349681139, "memory(GiB)": 22.66, "step": 21555, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.7002566351557677, "grad_norm": 0.23546679317951202, "learning_rate": 2.2618283113928378e-06, "loss": 0.00965905748307705, "memory(GiB)": 22.66, "step": 21556, "token_acc": 1.0, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.7002891206185232, "grad_norm": 0.32446128129959106, "learning_rate": 2.261378881787376e-06, "loss": 0.009217847138643265, "memory(GiB)": 22.66, "step": 21557, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.954992 }, { "epoch": 0.7003216060812786, "grad_norm": 0.29218339920043945, "learning_rate": 2.2609294837892472e-06, "loss": 0.014361575245857239, "memory(GiB)": 22.66, "step": 21558, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955 }, { "epoch": 0.700354091544034, "grad_norm": 0.2946075201034546, "learning_rate": 2.2604801174036396e-06, "loss": 0.01354123093187809, "memory(GiB)": 22.66, "step": 21559, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.7003865770067894, "grad_norm": 0.39247018098831177, "learning_rate": 2.2600307826357355e-06, "loss": 0.01579735055565834, "memory(GiB)": 22.66, "step": 21560, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955014 }, { "epoch": 0.7004190624695449, "grad_norm": 0.31894049048423767, "learning_rate": 2.2595814794907232e-06, "loss": 0.015337199904024601, "memory(GiB)": 22.66, "step": 21561, "token_acc": 1.0, "train_speed(iter/s)": 0.95502 }, { "epoch": 0.7004515479323002, "grad_norm": 0.3860680162906647, "learning_rate": 2.259132207973788e-06, "loss": 0.017063304781913757, "memory(GiB)": 22.66, "step": 21562, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955027 }, { "epoch": 0.7004840333950557, "grad_norm": 0.3507644534111023, "learning_rate": 2.2586829680901155e-06, "loss": 0.015453912317752838, "memory(GiB)": 22.66, "step": 21563, "token_acc": 1.0, "train_speed(iter/s)": 0.955034 }, { "epoch": 0.7005165188578111, "grad_norm": 0.5480866432189941, "learning_rate": 2.258233759844892e-06, "loss": 0.013357246294617653, "memory(GiB)": 22.66, "step": 21564, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955041 }, { "epoch": 0.7005490043205665, "grad_norm": 0.3468778431415558, "learning_rate": 2.2577845832432982e-06, "loss": 0.017034459859132767, "memory(GiB)": 22.66, "step": 21565, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.7005814897833219, "grad_norm": 0.26598840951919556, "learning_rate": 2.2573354382905204e-06, "loss": 0.017274193465709686, "memory(GiB)": 22.66, "step": 21566, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.7006139752460774, "grad_norm": 0.44523411989212036, "learning_rate": 2.256886324991742e-06, "loss": 0.016660429537296295, "memory(GiB)": 22.66, "step": 21567, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.955066 }, { "epoch": 0.7006464607088329, "grad_norm": 0.2649852931499481, "learning_rate": 2.2564372433521487e-06, "loss": 0.013238144107162952, "memory(GiB)": 22.66, "step": 21568, "token_acc": 1.0, "train_speed(iter/s)": 0.955075 }, { "epoch": 0.7006789461715882, "grad_norm": 0.37883836030960083, "learning_rate": 2.255988193376919e-06, "loss": 0.01780586689710617, "memory(GiB)": 22.66, "step": 21569, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.955085 }, { "epoch": 0.7007114316343437, "grad_norm": 0.4237478971481323, "learning_rate": 2.255539175071238e-06, "loss": 0.022464299574494362, "memory(GiB)": 22.66, "step": 21570, "token_acc": 1.0, "train_speed(iter/s)": 0.955094 }, { "epoch": 0.700743917097099, "grad_norm": 0.33561164140701294, "learning_rate": 2.2550901884402875e-06, "loss": 0.014202447608113289, "memory(GiB)": 22.66, "step": 21571, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.955103 }, { "epoch": 0.7007764025598545, "grad_norm": 0.3368756175041199, "learning_rate": 2.254641233489252e-06, "loss": 0.008732674643397331, "memory(GiB)": 22.66, "step": 21572, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955112 }, { "epoch": 0.7008088880226099, "grad_norm": 0.25184762477874756, "learning_rate": 2.254192310223308e-06, "loss": 0.008622199296951294, "memory(GiB)": 22.66, "step": 21573, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955122 }, { "epoch": 0.7008413734853653, "grad_norm": 0.2739909887313843, "learning_rate": 2.2537434186476425e-06, "loss": 0.01224813237786293, "memory(GiB)": 22.66, "step": 21574, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955131 }, { "epoch": 0.7008738589481207, "grad_norm": 0.28110161423683167, "learning_rate": 2.253294558767429e-06, "loss": 0.014048729091882706, "memory(GiB)": 22.66, "step": 21575, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955141 }, { "epoch": 0.7009063444108762, "grad_norm": 0.3809134364128113, "learning_rate": 2.252845730587855e-06, "loss": 0.014613638631999493, "memory(GiB)": 22.66, "step": 21576, "token_acc": 1.0, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.7009388298736315, "grad_norm": 0.3577321767807007, "learning_rate": 2.2523969341140974e-06, "loss": 0.013555767014622688, "memory(GiB)": 22.66, "step": 21577, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955158 }, { "epoch": 0.700971315336387, "grad_norm": 0.5984846353530884, "learning_rate": 2.251948169351335e-06, "loss": 0.01983446255326271, "memory(GiB)": 22.66, "step": 21578, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955167 }, { "epoch": 0.7010038007991424, "grad_norm": 0.299203097820282, "learning_rate": 2.251499436304751e-06, "loss": 0.012722347863018513, "memory(GiB)": 22.66, "step": 21579, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.7010362862618978, "grad_norm": 0.3726523518562317, "learning_rate": 2.2510507349795175e-06, "loss": 0.012678878381848335, "memory(GiB)": 22.66, "step": 21580, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.7010687717246532, "grad_norm": 0.37942323088645935, "learning_rate": 2.2506020653808212e-06, "loss": 0.01931132934987545, "memory(GiB)": 22.66, "step": 21581, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955196 }, { "epoch": 0.7011012571874087, "grad_norm": 0.3015458881855011, "learning_rate": 2.250153427513835e-06, "loss": 0.016104647889733315, "memory(GiB)": 22.66, "step": 21582, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.701133742650164, "grad_norm": 0.3638904392719269, "learning_rate": 2.2497048213837396e-06, "loss": 0.018999241292476654, "memory(GiB)": 22.66, "step": 21583, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955212 }, { "epoch": 0.7011662281129195, "grad_norm": 0.4934127628803253, "learning_rate": 2.2492562469957098e-06, "loss": 0.017161980271339417, "memory(GiB)": 22.66, "step": 21584, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.7011987135756749, "grad_norm": 0.3166806399822235, "learning_rate": 2.2488077043549237e-06, "loss": 0.01640978269279003, "memory(GiB)": 22.66, "step": 21585, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.7012311990384303, "grad_norm": 0.4840799570083618, "learning_rate": 2.248359193466559e-06, "loss": 0.012280724942684174, "memory(GiB)": 22.66, "step": 21586, "token_acc": 1.0, "train_speed(iter/s)": 0.955233 }, { "epoch": 0.7012636845011857, "grad_norm": 0.3984278738498688, "learning_rate": 2.2479107143357933e-06, "loss": 0.011508574709296227, "memory(GiB)": 22.66, "step": 21587, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.95524 }, { "epoch": 0.7012961699639412, "grad_norm": 0.4187089502811432, "learning_rate": 2.2474622669677993e-06, "loss": 0.014857493340969086, "memory(GiB)": 22.66, "step": 21588, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.7013286554266965, "grad_norm": 0.3407735228538513, "learning_rate": 2.2470138513677535e-06, "loss": 0.014767719432711601, "memory(GiB)": 22.66, "step": 21589, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955254 }, { "epoch": 0.701361140889452, "grad_norm": 0.42683467268943787, "learning_rate": 2.246565467540833e-06, "loss": 0.01643327996134758, "memory(GiB)": 22.66, "step": 21590, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955261 }, { "epoch": 0.7013936263522074, "grad_norm": 0.279541015625, "learning_rate": 2.2461171154922133e-06, "loss": 0.01452791690826416, "memory(GiB)": 22.66, "step": 21591, "token_acc": 1.0, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.7014261118149628, "grad_norm": 0.4340652525424957, "learning_rate": 2.245668795227066e-06, "loss": 0.017799843102693558, "memory(GiB)": 22.66, "step": 21592, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.7014585972777182, "grad_norm": 0.4138178825378418, "learning_rate": 2.2452205067505667e-06, "loss": 0.01771240122616291, "memory(GiB)": 22.66, "step": 21593, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.7014910827404737, "grad_norm": 0.25612199306488037, "learning_rate": 2.2447722500678893e-06, "loss": 0.011241959407925606, "memory(GiB)": 22.66, "step": 21594, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955289 }, { "epoch": 0.701523568203229, "grad_norm": 0.3296220302581787, "learning_rate": 2.244324025184207e-06, "loss": 0.015654224902391434, "memory(GiB)": 22.66, "step": 21595, "token_acc": 0.9774774774774775, "train_speed(iter/s)": 0.955295 }, { "epoch": 0.7015560536659845, "grad_norm": 0.3442123830318451, "learning_rate": 2.2438758321046956e-06, "loss": 0.013462556526064873, "memory(GiB)": 22.66, "step": 21596, "token_acc": 1.0, "train_speed(iter/s)": 0.955302 }, { "epoch": 0.7015885391287399, "grad_norm": 0.3826826810836792, "learning_rate": 2.2434276708345233e-06, "loss": 0.01421232521533966, "memory(GiB)": 22.66, "step": 21597, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.955309 }, { "epoch": 0.7016210245914953, "grad_norm": 0.3401035666465759, "learning_rate": 2.2429795413788645e-06, "loss": 0.01389341615140438, "memory(GiB)": 22.66, "step": 21598, "token_acc": 0.99, "train_speed(iter/s)": 0.955315 }, { "epoch": 0.7016535100542507, "grad_norm": 0.6004899144172668, "learning_rate": 2.2425314437428915e-06, "loss": 0.023595066741108894, "memory(GiB)": 22.66, "step": 21599, "token_acc": 1.0, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.7016859955170062, "grad_norm": 0.2813630700111389, "learning_rate": 2.242083377931778e-06, "loss": 0.014487551525235176, "memory(GiB)": 22.66, "step": 21600, "token_acc": 1.0, "train_speed(iter/s)": 0.955329 }, { "epoch": 0.7017184809797615, "grad_norm": 0.3246041238307953, "learning_rate": 2.241635343950691e-06, "loss": 0.014506996609270573, "memory(GiB)": 22.66, "step": 21601, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.701750966442517, "grad_norm": 0.4949183166027069, "learning_rate": 2.2411873418048053e-06, "loss": 0.0240069180727005, "memory(GiB)": 22.66, "step": 21602, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.7017834519052724, "grad_norm": 0.21393129229545593, "learning_rate": 2.240739371499286e-06, "loss": 0.005788849666714668, "memory(GiB)": 22.66, "step": 21603, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.7018159373680278, "grad_norm": 0.30435940623283386, "learning_rate": 2.2402914330393105e-06, "loss": 0.011415857821702957, "memory(GiB)": 22.66, "step": 21604, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955365 }, { "epoch": 0.7018484228307832, "grad_norm": 0.35973548889160156, "learning_rate": 2.239843526430043e-06, "loss": 0.014944862574338913, "memory(GiB)": 22.66, "step": 21605, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955374 }, { "epoch": 0.7018809082935387, "grad_norm": 0.36511266231536865, "learning_rate": 2.239395651676658e-06, "loss": 0.01544902939349413, "memory(GiB)": 22.66, "step": 21606, "token_acc": 1.0, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.701913393756294, "grad_norm": 0.3073461055755615, "learning_rate": 2.2389478087843187e-06, "loss": 0.009774763137102127, "memory(GiB)": 22.66, "step": 21607, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95539 }, { "epoch": 0.7019458792190495, "grad_norm": 0.4013853371143341, "learning_rate": 2.238499997758195e-06, "loss": 0.01566818170249462, "memory(GiB)": 22.66, "step": 21608, "token_acc": 1.0, "train_speed(iter/s)": 0.955398 }, { "epoch": 0.7019783646818049, "grad_norm": 0.3729366660118103, "learning_rate": 2.2380522186034605e-06, "loss": 0.01950118876993656, "memory(GiB)": 22.66, "step": 21609, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955406 }, { "epoch": 0.7020108501445603, "grad_norm": 0.2887170910835266, "learning_rate": 2.2376044713252774e-06, "loss": 0.009134184569120407, "memory(GiB)": 22.66, "step": 21610, "token_acc": 1.0, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.7020433356073157, "grad_norm": 0.4358437657356262, "learning_rate": 2.2371567559288177e-06, "loss": 0.012915967032313347, "memory(GiB)": 22.66, "step": 21611, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.7020758210700712, "grad_norm": 0.39009732007980347, "learning_rate": 2.236709072419244e-06, "loss": 0.015578004531562328, "memory(GiB)": 22.66, "step": 21612, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955428 }, { "epoch": 0.7021083065328265, "grad_norm": 0.5005989670753479, "learning_rate": 2.2362614208017263e-06, "loss": 0.021510997787117958, "memory(GiB)": 22.66, "step": 21613, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.955435 }, { "epoch": 0.702140791995582, "grad_norm": 0.5645914673805237, "learning_rate": 2.23581380108143e-06, "loss": 0.017982788383960724, "memory(GiB)": 22.66, "step": 21614, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.955443 }, { "epoch": 0.7021732774583374, "grad_norm": 0.6387452483177185, "learning_rate": 2.2353662132635233e-06, "loss": 0.011858202517032623, "memory(GiB)": 22.66, "step": 21615, "token_acc": 1.0, "train_speed(iter/s)": 0.955451 }, { "epoch": 0.7022057629210928, "grad_norm": 0.331893652677536, "learning_rate": 2.2349186573531685e-06, "loss": 0.01753368228673935, "memory(GiB)": 22.66, "step": 21616, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.7022382483838482, "grad_norm": 0.27165403962135315, "learning_rate": 2.2344711333555325e-06, "loss": 0.013990798965096474, "memory(GiB)": 22.66, "step": 21617, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.7022707338466037, "grad_norm": 0.44339677691459656, "learning_rate": 2.234023641275781e-06, "loss": 0.016786592081189156, "memory(GiB)": 22.66, "step": 21618, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.702303219309359, "grad_norm": 0.41080373525619507, "learning_rate": 2.2335761811190797e-06, "loss": 0.020683331415057182, "memory(GiB)": 22.66, "step": 21619, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955479 }, { "epoch": 0.7023357047721145, "grad_norm": 0.309057354927063, "learning_rate": 2.2331287528905895e-06, "loss": 0.015683120116591454, "memory(GiB)": 22.66, "step": 21620, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.7023681902348698, "grad_norm": 0.5732824206352234, "learning_rate": 2.2326813565954764e-06, "loss": 0.018861886113882065, "memory(GiB)": 22.66, "step": 21621, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.7024006756976253, "grad_norm": 0.3096782863140106, "learning_rate": 2.2322339922389036e-06, "loss": 0.016968650743365288, "memory(GiB)": 22.66, "step": 21622, "token_acc": 1.0, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.7024331611603807, "grad_norm": 1.2773839235305786, "learning_rate": 2.231786659826034e-06, "loss": 0.018931344151496887, "memory(GiB)": 22.66, "step": 21623, "token_acc": 0.996, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.7024656466231362, "grad_norm": 0.32087352871894836, "learning_rate": 2.231339359362033e-06, "loss": 0.016599509865045547, "memory(GiB)": 22.66, "step": 21624, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.7024981320858915, "grad_norm": 0.27786189317703247, "learning_rate": 2.2308920908520592e-06, "loss": 0.010077841579914093, "memory(GiB)": 22.66, "step": 21625, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.702530617548647, "grad_norm": 0.37970173358917236, "learning_rate": 2.230444854301276e-06, "loss": 0.014314265921711922, "memory(GiB)": 22.66, "step": 21626, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955532 }, { "epoch": 0.7025631030114023, "grad_norm": 0.27677059173583984, "learning_rate": 2.2299976497148463e-06, "loss": 0.013665949925780296, "memory(GiB)": 22.66, "step": 21627, "token_acc": 1.0, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.7025955884741578, "grad_norm": 0.44833904504776, "learning_rate": 2.2295504770979314e-06, "loss": 0.021902794018387794, "memory(GiB)": 22.66, "step": 21628, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955548 }, { "epoch": 0.7026280739369132, "grad_norm": 0.29956087470054626, "learning_rate": 2.2291033364556904e-06, "loss": 0.016452563926577568, "memory(GiB)": 22.66, "step": 21629, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955555 }, { "epoch": 0.7026605593996686, "grad_norm": 0.3752959668636322, "learning_rate": 2.2286562277932862e-06, "loss": 0.014934000559151173, "memory(GiB)": 22.66, "step": 21630, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.702693044862424, "grad_norm": 0.44434869289398193, "learning_rate": 2.2282091511158745e-06, "loss": 0.014280160889029503, "memory(GiB)": 22.66, "step": 21631, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.7027255303251795, "grad_norm": 0.2502506673336029, "learning_rate": 2.227762106428622e-06, "loss": 0.012411828152835369, "memory(GiB)": 22.66, "step": 21632, "token_acc": 1.0, "train_speed(iter/s)": 0.955583 }, { "epoch": 0.702758015787935, "grad_norm": 0.2892369329929352, "learning_rate": 2.2273150937366826e-06, "loss": 0.01131797768175602, "memory(GiB)": 22.66, "step": 21633, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.7027905012506903, "grad_norm": 0.22152914106845856, "learning_rate": 2.2268681130452198e-06, "loss": 0.010243097320199013, "memory(GiB)": 22.66, "step": 21634, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.7028229867134458, "grad_norm": 0.2645168900489807, "learning_rate": 2.2264211643593873e-06, "loss": 0.01175697986036539, "memory(GiB)": 22.66, "step": 21635, "token_acc": 1.0, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.7028554721762011, "grad_norm": 0.395590215921402, "learning_rate": 2.225974247684347e-06, "loss": 0.019070450216531754, "memory(GiB)": 22.66, "step": 21636, "token_acc": 1.0, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.7028879576389566, "grad_norm": 0.47990167140960693, "learning_rate": 2.2255273630252555e-06, "loss": 0.01598106510937214, "memory(GiB)": 22.66, "step": 21637, "token_acc": 1.0, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.702920443101712, "grad_norm": 0.3070598542690277, "learning_rate": 2.2250805103872714e-06, "loss": 0.013284094631671906, "memory(GiB)": 22.66, "step": 21638, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.7029529285644674, "grad_norm": 0.6176784634590149, "learning_rate": 2.224633689775553e-06, "loss": 0.018610719591379166, "memory(GiB)": 22.66, "step": 21639, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.7029854140272228, "grad_norm": 0.3438067138195038, "learning_rate": 2.224186901195255e-06, "loss": 0.013992596417665482, "memory(GiB)": 22.66, "step": 21640, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.7030178994899783, "grad_norm": 0.339200884103775, "learning_rate": 2.223740144651534e-06, "loss": 0.0150369293987751, "memory(GiB)": 22.66, "step": 21641, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955667 }, { "epoch": 0.7030503849527336, "grad_norm": 0.4359641373157501, "learning_rate": 2.2232934201495473e-06, "loss": 0.011209235526621342, "memory(GiB)": 22.66, "step": 21642, "token_acc": 1.0, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.7030828704154891, "grad_norm": 0.29869839549064636, "learning_rate": 2.2228467276944517e-06, "loss": 0.014101807959377766, "memory(GiB)": 22.66, "step": 21643, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.7031153558782445, "grad_norm": 0.4888204038143158, "learning_rate": 2.2224000672913997e-06, "loss": 0.019919665530323982, "memory(GiB)": 22.66, "step": 21644, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.703147841341, "grad_norm": 0.5241461992263794, "learning_rate": 2.221953438945548e-06, "loss": 0.02066429890692234, "memory(GiB)": 22.66, "step": 21645, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.7031803268037553, "grad_norm": 0.3387661576271057, "learning_rate": 2.2215068426620506e-06, "loss": 0.013629825785756111, "memory(GiB)": 22.66, "step": 21646, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.7032128122665108, "grad_norm": 0.3375336527824402, "learning_rate": 2.221060278446065e-06, "loss": 0.02311491221189499, "memory(GiB)": 22.66, "step": 21647, "token_acc": 0.992, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.7032452977292661, "grad_norm": 0.3031812608242035, "learning_rate": 2.22061374630274e-06, "loss": 0.01096924114972353, "memory(GiB)": 22.66, "step": 21648, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.7032777831920216, "grad_norm": 0.41079190373420715, "learning_rate": 2.2201672462372327e-06, "loss": 0.01432832796126604, "memory(GiB)": 22.66, "step": 21649, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.703310268654777, "grad_norm": 0.3748846650123596, "learning_rate": 2.2197207782546942e-06, "loss": 0.011936089023947716, "memory(GiB)": 22.66, "step": 21650, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.95573 }, { "epoch": 0.7033427541175324, "grad_norm": 0.5145910978317261, "learning_rate": 2.2192743423602808e-06, "loss": 0.016588035970926285, "memory(GiB)": 22.66, "step": 21651, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.955737 }, { "epoch": 0.7033752395802878, "grad_norm": 0.5192002654075623, "learning_rate": 2.218827938559141e-06, "loss": 0.0184982530772686, "memory(GiB)": 22.66, "step": 21652, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.955744 }, { "epoch": 0.7034077250430433, "grad_norm": 0.35079124569892883, "learning_rate": 2.21838156685643e-06, "loss": 0.020497146993875504, "memory(GiB)": 22.66, "step": 21653, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95575 }, { "epoch": 0.7034402105057986, "grad_norm": 0.441216379404068, "learning_rate": 2.2179352272572947e-06, "loss": 0.01478666439652443, "memory(GiB)": 22.66, "step": 21654, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.7034726959685541, "grad_norm": 0.31528061628341675, "learning_rate": 2.2174889197668914e-06, "loss": 0.014888647012412548, "memory(GiB)": 22.66, "step": 21655, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955763 }, { "epoch": 0.7035051814313095, "grad_norm": 0.3500031530857086, "learning_rate": 2.2170426443903715e-06, "loss": 0.010883238166570663, "memory(GiB)": 22.66, "step": 21656, "token_acc": 1.0, "train_speed(iter/s)": 0.955769 }, { "epoch": 0.7035376668940649, "grad_norm": 0.3263438940048218, "learning_rate": 2.2165964011328817e-06, "loss": 0.01128121092915535, "memory(GiB)": 22.66, "step": 21657, "token_acc": 1.0, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.7035701523568203, "grad_norm": 0.2606617510318756, "learning_rate": 2.2161501899995756e-06, "loss": 0.014228572137653828, "memory(GiB)": 22.66, "step": 21658, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.7036026378195758, "grad_norm": 0.6778396368026733, "learning_rate": 2.2157040109955985e-06, "loss": 0.02090469002723694, "memory(GiB)": 22.66, "step": 21659, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.7036351232823311, "grad_norm": 0.5654050707817078, "learning_rate": 2.2152578641261065e-06, "loss": 0.022593388333916664, "memory(GiB)": 22.66, "step": 21660, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.7036676087450866, "grad_norm": 0.5841339230537415, "learning_rate": 2.214811749396243e-06, "loss": 0.018173765391111374, "memory(GiB)": 22.66, "step": 21661, "token_acc": 1.0, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.703700094207842, "grad_norm": 0.29185977578163147, "learning_rate": 2.214365666811161e-06, "loss": 0.009197712875902653, "memory(GiB)": 22.66, "step": 21662, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955814 }, { "epoch": 0.7037325796705974, "grad_norm": 0.41569823026657104, "learning_rate": 2.2139196163760052e-06, "loss": 0.015022329986095428, "memory(GiB)": 22.66, "step": 21663, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955823 }, { "epoch": 0.7037650651333528, "grad_norm": 0.2972578704357147, "learning_rate": 2.2134735980959254e-06, "loss": 0.010046780109405518, "memory(GiB)": 22.66, "step": 21664, "token_acc": 1.0, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.7037975505961083, "grad_norm": 0.4408855438232422, "learning_rate": 2.2130276119760692e-06, "loss": 0.019646329805254936, "memory(GiB)": 22.66, "step": 21665, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.7038300360588636, "grad_norm": 0.5579485893249512, "learning_rate": 2.2125816580215853e-06, "loss": 0.021069619804620743, "memory(GiB)": 22.66, "step": 21666, "token_acc": 0.995, "train_speed(iter/s)": 0.95585 }, { "epoch": 0.7038625215216191, "grad_norm": 0.3577752113342285, "learning_rate": 2.2121357362376173e-06, "loss": 0.015166367404162884, "memory(GiB)": 22.66, "step": 21667, "token_acc": 1.0, "train_speed(iter/s)": 0.955859 }, { "epoch": 0.7038950069843745, "grad_norm": 0.3612682521343231, "learning_rate": 2.211689846629314e-06, "loss": 0.014573488384485245, "memory(GiB)": 22.66, "step": 21668, "token_acc": 1.0, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.7039274924471299, "grad_norm": 0.32276010513305664, "learning_rate": 2.2112439892018208e-06, "loss": 0.01757063716650009, "memory(GiB)": 22.66, "step": 21669, "token_acc": 0.9863481228668942, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.7039599779098853, "grad_norm": 0.3334294259548187, "learning_rate": 2.2107981639602837e-06, "loss": 0.014659471809864044, "memory(GiB)": 22.66, "step": 21670, "token_acc": 1.0, "train_speed(iter/s)": 0.955882 }, { "epoch": 0.7039924633726408, "grad_norm": 0.3282129764556885, "learning_rate": 2.21035237090985e-06, "loss": 0.0160360187292099, "memory(GiB)": 22.66, "step": 21671, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.7040249488353961, "grad_norm": 0.4038754105567932, "learning_rate": 2.209906610055661e-06, "loss": 0.022613830864429474, "memory(GiB)": 22.66, "step": 21672, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.7040574342981516, "grad_norm": 0.32360899448394775, "learning_rate": 2.209460881402864e-06, "loss": 0.01481928862631321, "memory(GiB)": 22.66, "step": 21673, "token_acc": 1.0, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.704089919760907, "grad_norm": 0.3743247985839844, "learning_rate": 2.209015184956602e-06, "loss": 0.014625494368374348, "memory(GiB)": 22.66, "step": 21674, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.7041224052236624, "grad_norm": 0.4572913348674774, "learning_rate": 2.2085695207220214e-06, "loss": 0.017454613000154495, "memory(GiB)": 22.66, "step": 21675, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.7041548906864178, "grad_norm": 0.2148449569940567, "learning_rate": 2.208123888704262e-06, "loss": 0.009263251908123493, "memory(GiB)": 22.66, "step": 21676, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.7041873761491733, "grad_norm": 0.36999720335006714, "learning_rate": 2.2076782889084696e-06, "loss": 0.015384642407298088, "memory(GiB)": 22.66, "step": 21677, "token_acc": 0.974025974025974, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.7042198616119286, "grad_norm": 0.23391515016555786, "learning_rate": 2.207232721339786e-06, "loss": 0.009275680407881737, "memory(GiB)": 22.66, "step": 21678, "token_acc": 0.9965635738831615, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.7042523470746841, "grad_norm": 0.3465847074985504, "learning_rate": 2.206787186003355e-06, "loss": 0.011793001554906368, "memory(GiB)": 22.66, "step": 21679, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.7042848325374395, "grad_norm": 0.3589053750038147, "learning_rate": 2.2063416829043167e-06, "loss": 0.015726283192634583, "memory(GiB)": 22.66, "step": 21680, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.7043173180001949, "grad_norm": 0.3833181858062744, "learning_rate": 2.2058962120478156e-06, "loss": 0.021943537518382072, "memory(GiB)": 22.66, "step": 21681, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955965 }, { "epoch": 0.7043498034629503, "grad_norm": 0.38305437564849854, "learning_rate": 2.205450773438987e-06, "loss": 0.017659276723861694, "memory(GiB)": 22.66, "step": 21682, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.7043822889257058, "grad_norm": 0.30633455514907837, "learning_rate": 2.20500536708298e-06, "loss": 0.012823511846363544, "memory(GiB)": 22.66, "step": 21683, "token_acc": 1.0, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.7044147743884611, "grad_norm": 0.38481462001800537, "learning_rate": 2.2045599929849292e-06, "loss": 0.014686617068946362, "memory(GiB)": 22.66, "step": 21684, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.7044472598512166, "grad_norm": 0.34084630012512207, "learning_rate": 2.2041146511499766e-06, "loss": 0.020893992856144905, "memory(GiB)": 22.66, "step": 21685, "token_acc": 1.0, "train_speed(iter/s)": 0.955996 }, { "epoch": 0.704479745313972, "grad_norm": 0.294281005859375, "learning_rate": 2.2036693415832646e-06, "loss": 0.011460140347480774, "memory(GiB)": 22.66, "step": 21686, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956003 }, { "epoch": 0.7045122307767274, "grad_norm": 0.25103822350502014, "learning_rate": 2.2032240642899266e-06, "loss": 0.011867654509842396, "memory(GiB)": 22.66, "step": 21687, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.95601 }, { "epoch": 0.7045447162394828, "grad_norm": 0.32523101568222046, "learning_rate": 2.202778819275109e-06, "loss": 0.009117748588323593, "memory(GiB)": 22.66, "step": 21688, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956017 }, { "epoch": 0.7045772017022383, "grad_norm": 0.3490983545780182, "learning_rate": 2.2023336065439444e-06, "loss": 0.019740285351872444, "memory(GiB)": 22.66, "step": 21689, "token_acc": 1.0, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.7046096871649936, "grad_norm": 0.4124792218208313, "learning_rate": 2.201888426101576e-06, "loss": 0.013713886961340904, "memory(GiB)": 22.66, "step": 21690, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956031 }, { "epoch": 0.7046421726277491, "grad_norm": 0.3616993725299835, "learning_rate": 2.201443277953138e-06, "loss": 0.011939054355025291, "memory(GiB)": 22.66, "step": 21691, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.7046746580905044, "grad_norm": 0.3048955798149109, "learning_rate": 2.2009981621037692e-06, "loss": 0.012698313221335411, "memory(GiB)": 22.66, "step": 21692, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.7047071435532599, "grad_norm": 0.48159047961235046, "learning_rate": 2.2005530785586064e-06, "loss": 0.014227062463760376, "memory(GiB)": 22.66, "step": 21693, "token_acc": 0.9963768115942029, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.7047396290160153, "grad_norm": 0.31586992740631104, "learning_rate": 2.2001080273227898e-06, "loss": 0.008695963770151138, "memory(GiB)": 22.66, "step": 21694, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.956067 }, { "epoch": 0.7047721144787708, "grad_norm": 0.3200075626373291, "learning_rate": 2.1996630084014508e-06, "loss": 0.008856179192662239, "memory(GiB)": 22.66, "step": 21695, "token_acc": 1.0, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.7048045999415262, "grad_norm": 0.37889236211776733, "learning_rate": 2.1992180217997284e-06, "loss": 0.01918932795524597, "memory(GiB)": 22.66, "step": 21696, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956086 }, { "epoch": 0.7048370854042816, "grad_norm": 0.21538306772708893, "learning_rate": 2.1987730675227577e-06, "loss": 0.009381292387843132, "memory(GiB)": 22.66, "step": 21697, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.704869570867037, "grad_norm": 0.2719719707965851, "learning_rate": 2.198328145575676e-06, "loss": 0.012276115827262402, "memory(GiB)": 22.66, "step": 21698, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956105 }, { "epoch": 0.7049020563297924, "grad_norm": 0.21607764065265656, "learning_rate": 2.197883255963615e-06, "loss": 0.010529499500989914, "memory(GiB)": 22.66, "step": 21699, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.7049345417925479, "grad_norm": 0.39972129464149475, "learning_rate": 2.1974383986917105e-06, "loss": 0.014904597774147987, "memory(GiB)": 22.66, "step": 21700, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.7049670272553032, "grad_norm": 0.3370606005191803, "learning_rate": 2.196993573765097e-06, "loss": 0.011927453801035881, "memory(GiB)": 22.66, "step": 21701, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.7049995127180587, "grad_norm": 0.39921045303344727, "learning_rate": 2.1965487811889086e-06, "loss": 0.018870292231440544, "memory(GiB)": 22.66, "step": 21702, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.7050319981808141, "grad_norm": 0.3723849952220917, "learning_rate": 2.1961040209682804e-06, "loss": 0.01915963739156723, "memory(GiB)": 22.66, "step": 21703, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.7050644836435696, "grad_norm": 0.37365901470184326, "learning_rate": 2.195659293108342e-06, "loss": 0.014374446123838425, "memory(GiB)": 22.66, "step": 21704, "token_acc": 1.0, "train_speed(iter/s)": 0.95615 }, { "epoch": 0.7050969691063249, "grad_norm": 0.5052819848060608, "learning_rate": 2.195214597614228e-06, "loss": 0.01756911352276802, "memory(GiB)": 22.66, "step": 21705, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.7051294545690804, "grad_norm": 0.24908243119716644, "learning_rate": 2.1947699344910705e-06, "loss": 0.010061528533697128, "memory(GiB)": 22.66, "step": 21706, "token_acc": 1.0, "train_speed(iter/s)": 0.956164 }, { "epoch": 0.7051619400318357, "grad_norm": 0.2691831588745117, "learning_rate": 2.1943253037440036e-06, "loss": 0.011408133432269096, "memory(GiB)": 22.66, "step": 21707, "token_acc": 1.0, "train_speed(iter/s)": 0.956171 }, { "epoch": 0.7051944254945912, "grad_norm": 0.2930394113063812, "learning_rate": 2.1938807053781554e-06, "loss": 0.010565335862338543, "memory(GiB)": 22.66, "step": 21708, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.7052269109573466, "grad_norm": 0.2710498571395874, "learning_rate": 2.1934361393986602e-06, "loss": 0.012343459762632847, "memory(GiB)": 22.66, "step": 21709, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.705259396420102, "grad_norm": 0.2676374614238739, "learning_rate": 2.1929916058106444e-06, "loss": 0.013549482449889183, "memory(GiB)": 22.66, "step": 21710, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.7052918818828574, "grad_norm": 0.32404398918151855, "learning_rate": 2.192547104619245e-06, "loss": 0.013197844848036766, "memory(GiB)": 22.66, "step": 21711, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.7053243673456129, "grad_norm": 0.3570045530796051, "learning_rate": 2.1921026358295867e-06, "loss": 0.01051271241158247, "memory(GiB)": 22.66, "step": 21712, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.7053568528083682, "grad_norm": 0.37123873829841614, "learning_rate": 2.1916581994468033e-06, "loss": 0.017427396029233932, "memory(GiB)": 22.66, "step": 21713, "token_acc": 1.0, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.7053893382711237, "grad_norm": 0.2450798898935318, "learning_rate": 2.1912137954760203e-06, "loss": 0.010261078365147114, "memory(GiB)": 22.66, "step": 21714, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.7054218237338791, "grad_norm": 0.38845112919807434, "learning_rate": 2.190769423922369e-06, "loss": 0.015535159036517143, "memory(GiB)": 22.66, "step": 21715, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956226 }, { "epoch": 0.7054543091966345, "grad_norm": 0.3811933398246765, "learning_rate": 2.1903250847909767e-06, "loss": 0.02088218927383423, "memory(GiB)": 22.66, "step": 21716, "token_acc": 0.995, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.7054867946593899, "grad_norm": 0.28229039907455444, "learning_rate": 2.1898807780869737e-06, "loss": 0.013571787625551224, "memory(GiB)": 22.66, "step": 21717, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956237 }, { "epoch": 0.7055192801221454, "grad_norm": 0.28318315744400024, "learning_rate": 2.189436503815488e-06, "loss": 0.013458682224154472, "memory(GiB)": 22.66, "step": 21718, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.7055517655849007, "grad_norm": 1.0967965126037598, "learning_rate": 2.1889922619816438e-06, "loss": 0.018073610961437225, "memory(GiB)": 22.66, "step": 21719, "token_acc": 1.0, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.7055842510476562, "grad_norm": 0.28320878744125366, "learning_rate": 2.1885480525905706e-06, "loss": 0.01399749331176281, "memory(GiB)": 22.66, "step": 21720, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.7056167365104116, "grad_norm": 0.22401881217956543, "learning_rate": 2.188103875647395e-06, "loss": 0.009336345829069614, "memory(GiB)": 22.66, "step": 21721, "token_acc": 1.0, "train_speed(iter/s)": 0.956268 }, { "epoch": 0.705649221973167, "grad_norm": 0.31745487451553345, "learning_rate": 2.187659731157245e-06, "loss": 0.01227949745953083, "memory(GiB)": 22.66, "step": 21722, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956277 }, { "epoch": 0.7056817074359224, "grad_norm": 0.29310330748558044, "learning_rate": 2.1872156191252436e-06, "loss": 0.012886430136859417, "memory(GiB)": 22.66, "step": 21723, "token_acc": 1.0, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.7057141928986779, "grad_norm": 0.3951042592525482, "learning_rate": 2.186771539556517e-06, "loss": 0.0164963211864233, "memory(GiB)": 22.66, "step": 21724, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.7057466783614332, "grad_norm": 0.34754329919815063, "learning_rate": 2.1863274924561915e-06, "loss": 0.013567640446126461, "memory(GiB)": 22.66, "step": 21725, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.7057791638241887, "grad_norm": 0.47698676586151123, "learning_rate": 2.1858834778293937e-06, "loss": 0.02242964133620262, "memory(GiB)": 22.66, "step": 21726, "token_acc": 1.0, "train_speed(iter/s)": 0.956314 }, { "epoch": 0.7058116492869441, "grad_norm": 0.3323379456996918, "learning_rate": 2.185439495681244e-06, "loss": 0.013687596656382084, "memory(GiB)": 22.66, "step": 21727, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.7058441347496995, "grad_norm": 0.2928377091884613, "learning_rate": 2.184995546016869e-06, "loss": 0.010955305770039558, "memory(GiB)": 22.66, "step": 21728, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.7058766202124549, "grad_norm": 0.3526705503463745, "learning_rate": 2.1845516288413926e-06, "loss": 0.013464486226439476, "memory(GiB)": 22.66, "step": 21729, "token_acc": 1.0, "train_speed(iter/s)": 0.95634 }, { "epoch": 0.7059091056752104, "grad_norm": 0.2959333658218384, "learning_rate": 2.184107744159939e-06, "loss": 0.011176031082868576, "memory(GiB)": 22.66, "step": 21730, "token_acc": 1.0, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.7059415911379657, "grad_norm": 0.28022149205207825, "learning_rate": 2.183663891977628e-06, "loss": 0.009228020906448364, "memory(GiB)": 22.66, "step": 21731, "token_acc": 1.0, "train_speed(iter/s)": 0.956356 }, { "epoch": 0.7059740766007212, "grad_norm": 0.40612125396728516, "learning_rate": 2.1832200722995846e-06, "loss": 0.02725556120276451, "memory(GiB)": 22.66, "step": 21732, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.7060065620634766, "grad_norm": 0.31066858768463135, "learning_rate": 2.1827762851309305e-06, "loss": 0.01636592298746109, "memory(GiB)": 22.66, "step": 21733, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95637 }, { "epoch": 0.706039047526232, "grad_norm": 0.3709861934185028, "learning_rate": 2.182332530476788e-06, "loss": 0.014631671831011772, "memory(GiB)": 22.66, "step": 21734, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.7060715329889874, "grad_norm": 0.3781376779079437, "learning_rate": 2.1818888083422795e-06, "loss": 0.014103705994784832, "memory(GiB)": 22.66, "step": 21735, "token_acc": 1.0, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.7061040184517429, "grad_norm": 0.2322242558002472, "learning_rate": 2.1814451187325242e-06, "loss": 0.009995941072702408, "memory(GiB)": 22.66, "step": 21736, "token_acc": 1.0, "train_speed(iter/s)": 0.956393 }, { "epoch": 0.7061365039144982, "grad_norm": 0.46332433819770813, "learning_rate": 2.181001461652645e-06, "loss": 0.022390684112906456, "memory(GiB)": 22.66, "step": 21737, "token_acc": 1.0, "train_speed(iter/s)": 0.9564 }, { "epoch": 0.7061689893772537, "grad_norm": 0.5349341630935669, "learning_rate": 2.180557837107757e-06, "loss": 0.0232350155711174, "memory(GiB)": 22.66, "step": 21738, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.7062014748400091, "grad_norm": 0.3970934748649597, "learning_rate": 2.180114245102989e-06, "loss": 0.016126209869980812, "memory(GiB)": 22.66, "step": 21739, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.7062339603027645, "grad_norm": 0.45995983481407166, "learning_rate": 2.1796706856434536e-06, "loss": 0.01863153651356697, "memory(GiB)": 22.66, "step": 21740, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.7062664457655199, "grad_norm": 0.4543512761592865, "learning_rate": 2.179227158734274e-06, "loss": 0.01693686842918396, "memory(GiB)": 22.66, "step": 21741, "token_acc": 0.992, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.7062989312282754, "grad_norm": 0.31500136852264404, "learning_rate": 2.1787836643805654e-06, "loss": 0.015987131744623184, "memory(GiB)": 22.66, "step": 21742, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956437 }, { "epoch": 0.7063314166910307, "grad_norm": 0.415760338306427, "learning_rate": 2.1783402025874486e-06, "loss": 0.018288280814886093, "memory(GiB)": 22.66, "step": 21743, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956444 }, { "epoch": 0.7063639021537862, "grad_norm": 1.422029733657837, "learning_rate": 2.1778967733600415e-06, "loss": 0.01850036345422268, "memory(GiB)": 22.66, "step": 21744, "token_acc": 0.99, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.7063963876165416, "grad_norm": 0.2229968160390854, "learning_rate": 2.177453376703463e-06, "loss": 0.011338959448039532, "memory(GiB)": 22.66, "step": 21745, "token_acc": 1.0, "train_speed(iter/s)": 0.956459 }, { "epoch": 0.706428873079297, "grad_norm": 0.43895280361175537, "learning_rate": 2.1770100126228284e-06, "loss": 0.01088119950145483, "memory(GiB)": 22.66, "step": 21746, "token_acc": 1.0, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.7064613585420524, "grad_norm": 0.4200276732444763, "learning_rate": 2.1765666811232554e-06, "loss": 0.018008291721343994, "memory(GiB)": 22.66, "step": 21747, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.7064938440048079, "grad_norm": 0.36152538657188416, "learning_rate": 2.176123382209861e-06, "loss": 0.01321972906589508, "memory(GiB)": 22.66, "step": 21748, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.7065263294675632, "grad_norm": 0.35237300395965576, "learning_rate": 2.175680115887761e-06, "loss": 0.021358748897910118, "memory(GiB)": 22.66, "step": 21749, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.7065588149303187, "grad_norm": 0.4562121033668518, "learning_rate": 2.1752368821620735e-06, "loss": 0.027566466480493546, "memory(GiB)": 22.66, "step": 21750, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.706591300393074, "grad_norm": 0.4450477063655853, "learning_rate": 2.1747936810379104e-06, "loss": 0.015908189117908478, "memory(GiB)": 22.66, "step": 21751, "token_acc": 1.0, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.7066237858558295, "grad_norm": 0.33831772208213806, "learning_rate": 2.1743505125203886e-06, "loss": 0.015100451186299324, "memory(GiB)": 22.66, "step": 21752, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.7066562713185849, "grad_norm": 0.3592005968093872, "learning_rate": 2.173907376614623e-06, "loss": 0.02193206362426281, "memory(GiB)": 22.66, "step": 21753, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956518 }, { "epoch": 0.7066887567813404, "grad_norm": 0.397163987159729, "learning_rate": 2.173464273325729e-06, "loss": 0.012091157957911491, "memory(GiB)": 22.66, "step": 21754, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.7067212422440957, "grad_norm": 0.4417581856250763, "learning_rate": 2.1730212026588182e-06, "loss": 0.01671714335680008, "memory(GiB)": 22.66, "step": 21755, "token_acc": 1.0, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.7067537277068512, "grad_norm": 0.22051112353801727, "learning_rate": 2.1725781646190055e-06, "loss": 0.009195057675242424, "memory(GiB)": 22.66, "step": 21756, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.7067862131696065, "grad_norm": 0.8371103405952454, "learning_rate": 2.172135159211403e-06, "loss": 0.015250232070684433, "memory(GiB)": 22.66, "step": 21757, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.706818698632362, "grad_norm": 0.45303478837013245, "learning_rate": 2.1716921864411277e-06, "loss": 0.01818072609603405, "memory(GiB)": 22.66, "step": 21758, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956565 }, { "epoch": 0.7068511840951174, "grad_norm": 0.46020299196243286, "learning_rate": 2.1712492463132868e-06, "loss": 0.02201683446764946, "memory(GiB)": 22.66, "step": 21759, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956574 }, { "epoch": 0.7068836695578729, "grad_norm": 0.21770836412906647, "learning_rate": 2.1708063388329963e-06, "loss": 0.010843414813280106, "memory(GiB)": 22.66, "step": 21760, "token_acc": 1.0, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.7069161550206283, "grad_norm": 0.5604685544967651, "learning_rate": 2.1703634640053626e-06, "loss": 0.01937215030193329, "memory(GiB)": 22.66, "step": 21761, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.7069486404833837, "grad_norm": 0.3850455582141876, "learning_rate": 2.169920621835503e-06, "loss": 0.0068975575268268585, "memory(GiB)": 22.66, "step": 21762, "token_acc": 1.0, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.7069811259461392, "grad_norm": 0.3068646192550659, "learning_rate": 2.169477812328528e-06, "loss": 0.012752082198858261, "memory(GiB)": 22.66, "step": 21763, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.7070136114088945, "grad_norm": 0.40346023440361023, "learning_rate": 2.169035035489544e-06, "loss": 0.013123439624905586, "memory(GiB)": 22.66, "step": 21764, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.70704609687165, "grad_norm": 0.296336829662323, "learning_rate": 2.168592291323666e-06, "loss": 0.01266210712492466, "memory(GiB)": 22.66, "step": 21765, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956619 }, { "epoch": 0.7070785823344053, "grad_norm": 0.3746175169944763, "learning_rate": 2.1681495798359982e-06, "loss": 0.015381423756480217, "memory(GiB)": 22.66, "step": 21766, "token_acc": 1.0, "train_speed(iter/s)": 0.956625 }, { "epoch": 0.7071110677971608, "grad_norm": 0.2514720857143402, "learning_rate": 2.1677069010316565e-06, "loss": 0.012095658108592033, "memory(GiB)": 22.66, "step": 21767, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.7071435532599162, "grad_norm": 0.29454299807548523, "learning_rate": 2.1672642549157457e-06, "loss": 0.012579469941556454, "memory(GiB)": 22.66, "step": 21768, "token_acc": 1.0, "train_speed(iter/s)": 0.95664 }, { "epoch": 0.7071760387226717, "grad_norm": 0.5016444325447083, "learning_rate": 2.1668216414933774e-06, "loss": 0.01690344698727131, "memory(GiB)": 22.66, "step": 21769, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.707208524185427, "grad_norm": 0.266034871339798, "learning_rate": 2.166379060769656e-06, "loss": 0.013994591310620308, "memory(GiB)": 22.66, "step": 21770, "token_acc": 1.0, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.7072410096481825, "grad_norm": 0.3453209102153778, "learning_rate": 2.165936512749692e-06, "loss": 0.017912529408931732, "memory(GiB)": 22.66, "step": 21771, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95666 }, { "epoch": 0.7072734951109378, "grad_norm": 0.6269774436950684, "learning_rate": 2.165493997438593e-06, "loss": 0.022189853712916374, "memory(GiB)": 22.66, "step": 21772, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.7073059805736933, "grad_norm": 0.3221205174922943, "learning_rate": 2.165051514841468e-06, "loss": 0.016141971573233604, "memory(GiB)": 22.66, "step": 21773, "token_acc": 1.0, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.7073384660364487, "grad_norm": 0.45060619711875916, "learning_rate": 2.1646090649634198e-06, "loss": 0.012619979679584503, "memory(GiB)": 22.66, "step": 21774, "token_acc": 1.0, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.7073709514992041, "grad_norm": 0.3250899910926819, "learning_rate": 2.164166647809557e-06, "loss": 0.01619432494044304, "memory(GiB)": 22.66, "step": 21775, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956687 }, { "epoch": 0.7074034369619595, "grad_norm": 0.42143434286117554, "learning_rate": 2.163724263384985e-06, "loss": 0.013598017394542694, "memory(GiB)": 22.66, "step": 21776, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956695 }, { "epoch": 0.707435922424715, "grad_norm": 0.3298039436340332, "learning_rate": 2.1632819116948108e-06, "loss": 0.015089351683855057, "memory(GiB)": 22.66, "step": 21777, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.7074684078874703, "grad_norm": 0.39005348086357117, "learning_rate": 2.1628395927441404e-06, "loss": 0.021108217537403107, "memory(GiB)": 22.66, "step": 21778, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.7075008933502258, "grad_norm": 0.3789916932582855, "learning_rate": 2.1623973065380756e-06, "loss": 0.01734795793890953, "memory(GiB)": 22.66, "step": 21779, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.7075333788129812, "grad_norm": 0.3193885087966919, "learning_rate": 2.161955053081723e-06, "loss": 0.01206393726170063, "memory(GiB)": 22.66, "step": 21780, "token_acc": 1.0, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.7075658642757366, "grad_norm": 0.28820306062698364, "learning_rate": 2.161512832380186e-06, "loss": 0.014544399455189705, "memory(GiB)": 22.66, "step": 21781, "token_acc": 1.0, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.707598349738492, "grad_norm": 0.3181130886077881, "learning_rate": 2.161070644438571e-06, "loss": 0.017799513414502144, "memory(GiB)": 22.66, "step": 21782, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956741 }, { "epoch": 0.7076308352012475, "grad_norm": 0.25314047932624817, "learning_rate": 2.1606284892619776e-06, "loss": 0.011633617803454399, "memory(GiB)": 22.66, "step": 21783, "token_acc": 1.0, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.7076633206640028, "grad_norm": 0.3996543884277344, "learning_rate": 2.1601863668555104e-06, "loss": 0.018410352990031242, "memory(GiB)": 22.66, "step": 21784, "token_acc": 1.0, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.7076958061267583, "grad_norm": 0.3128318786621094, "learning_rate": 2.1597442772242726e-06, "loss": 0.013469724915921688, "memory(GiB)": 22.66, "step": 21785, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956768 }, { "epoch": 0.7077282915895137, "grad_norm": 0.39332762360572815, "learning_rate": 2.1593022203733677e-06, "loss": 0.016043953597545624, "memory(GiB)": 22.66, "step": 21786, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956777 }, { "epoch": 0.7077607770522691, "grad_norm": 0.27043282985687256, "learning_rate": 2.1588601963078946e-06, "loss": 0.009998781606554985, "memory(GiB)": 22.66, "step": 21787, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956786 }, { "epoch": 0.7077932625150245, "grad_norm": 0.6063588857650757, "learning_rate": 2.1584182050329583e-06, "loss": 0.014985864982008934, "memory(GiB)": 22.66, "step": 21788, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.70782574797778, "grad_norm": 0.4455999732017517, "learning_rate": 2.157976246553654e-06, "loss": 0.020668085664510727, "memory(GiB)": 22.66, "step": 21789, "token_acc": 1.0, "train_speed(iter/s)": 0.956804 }, { "epoch": 0.7078582334405353, "grad_norm": 0.36493080854415894, "learning_rate": 2.157534320875091e-06, "loss": 0.011956503614783287, "memory(GiB)": 22.66, "step": 21790, "token_acc": 1.0, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.7078907189032908, "grad_norm": 0.31008201837539673, "learning_rate": 2.1570924280023625e-06, "loss": 0.01573152467608452, "memory(GiB)": 22.66, "step": 21791, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956823 }, { "epoch": 0.7079232043660462, "grad_norm": 0.3624643087387085, "learning_rate": 2.1566505679405737e-06, "loss": 0.014129461720585823, "memory(GiB)": 22.66, "step": 21792, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.7079556898288016, "grad_norm": 0.3466951847076416, "learning_rate": 2.1562087406948196e-06, "loss": 0.01427207700908184, "memory(GiB)": 22.66, "step": 21793, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956838 }, { "epoch": 0.707988175291557, "grad_norm": 0.3741753101348877, "learning_rate": 2.1557669462702e-06, "loss": 0.016377165913581848, "memory(GiB)": 22.66, "step": 21794, "token_acc": 1.0, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.7080206607543125, "grad_norm": 0.40058469772338867, "learning_rate": 2.1553251846718193e-06, "loss": 0.017256595194339752, "memory(GiB)": 22.66, "step": 21795, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956853 }, { "epoch": 0.7080531462170678, "grad_norm": 0.4461671710014343, "learning_rate": 2.1548834559047702e-06, "loss": 0.015734704211354256, "memory(GiB)": 22.66, "step": 21796, "token_acc": 1.0, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.7080856316798233, "grad_norm": 0.49228471517562866, "learning_rate": 2.1544417599741553e-06, "loss": 0.01972864754498005, "memory(GiB)": 22.66, "step": 21797, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.7081181171425787, "grad_norm": 0.22144417464733124, "learning_rate": 2.1540000968850672e-06, "loss": 0.008957315236330032, "memory(GiB)": 22.66, "step": 21798, "token_acc": 1.0, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.7081506026053341, "grad_norm": 0.3998209238052368, "learning_rate": 2.153558466642607e-06, "loss": 0.013951210305094719, "memory(GiB)": 22.66, "step": 21799, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956883 }, { "epoch": 0.7081830880680895, "grad_norm": 0.46814271807670593, "learning_rate": 2.1531168692518693e-06, "loss": 0.021532656624913216, "memory(GiB)": 22.66, "step": 21800, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956891 }, { "epoch": 0.708215573530845, "grad_norm": 0.4599902033805847, "learning_rate": 2.152675304717955e-06, "loss": 0.01718970015645027, "memory(GiB)": 22.66, "step": 21801, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.7082480589936003, "grad_norm": 0.43333548307418823, "learning_rate": 2.1522337730459557e-06, "loss": 0.018636900931596756, "memory(GiB)": 22.66, "step": 21802, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.7082805444563558, "grad_norm": 0.3700862228870392, "learning_rate": 2.1517922742409686e-06, "loss": 0.013040410354733467, "memory(GiB)": 22.66, "step": 21803, "token_acc": 1.0, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.7083130299191112, "grad_norm": 0.4313906729221344, "learning_rate": 2.1513508083080897e-06, "loss": 0.012133141979575157, "memory(GiB)": 22.66, "step": 21804, "token_acc": 1.0, "train_speed(iter/s)": 0.956921 }, { "epoch": 0.7083455153818666, "grad_norm": 0.35684558749198914, "learning_rate": 2.1509093752524157e-06, "loss": 0.013011246919631958, "memory(GiB)": 22.66, "step": 21805, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956927 }, { "epoch": 0.708378000844622, "grad_norm": 0.35702353715896606, "learning_rate": 2.150467975079038e-06, "loss": 0.012381056323647499, "memory(GiB)": 22.66, "step": 21806, "token_acc": 0.996, "train_speed(iter/s)": 0.956935 }, { "epoch": 0.7084104863073775, "grad_norm": 0.3989967703819275, "learning_rate": 2.1500266077930526e-06, "loss": 0.015879765152931213, "memory(GiB)": 22.66, "step": 21807, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956943 }, { "epoch": 0.7084429717701328, "grad_norm": 0.22093529999256134, "learning_rate": 2.149585273399554e-06, "loss": 0.009344667196273804, "memory(GiB)": 22.66, "step": 21808, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956952 }, { "epoch": 0.7084754572328883, "grad_norm": 0.32355138659477234, "learning_rate": 2.1491439719036345e-06, "loss": 0.013039054349064827, "memory(GiB)": 22.66, "step": 21809, "token_acc": 1.0, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.7085079426956437, "grad_norm": 0.35695868730545044, "learning_rate": 2.1487027033103897e-06, "loss": 0.016979146748781204, "memory(GiB)": 22.66, "step": 21810, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.7085404281583991, "grad_norm": 0.5126247406005859, "learning_rate": 2.148261467624909e-06, "loss": 0.021424509584903717, "memory(GiB)": 22.66, "step": 21811, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956959 }, { "epoch": 0.7085729136211545, "grad_norm": 0.42427217960357666, "learning_rate": 2.1478202648522873e-06, "loss": 0.019171088933944702, "memory(GiB)": 22.66, "step": 21812, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.70860539908391, "grad_norm": 0.5283916592597961, "learning_rate": 2.1473790949976152e-06, "loss": 0.01583540439605713, "memory(GiB)": 22.66, "step": 21813, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956974 }, { "epoch": 0.7086378845466653, "grad_norm": 0.3921842575073242, "learning_rate": 2.1469379580659877e-06, "loss": 0.010906057432293892, "memory(GiB)": 22.66, "step": 21814, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956982 }, { "epoch": 0.7086703700094208, "grad_norm": 0.3569951057434082, "learning_rate": 2.1464968540624915e-06, "loss": 0.014859490096569061, "memory(GiB)": 22.66, "step": 21815, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.7087028554721762, "grad_norm": 0.4027796983718872, "learning_rate": 2.146055782992221e-06, "loss": 0.012258024886250496, "memory(GiB)": 22.66, "step": 21816, "token_acc": 1.0, "train_speed(iter/s)": 0.957 }, { "epoch": 0.7087353409349316, "grad_norm": 0.3541249632835388, "learning_rate": 2.1456147448602627e-06, "loss": 0.013780981302261353, "memory(GiB)": 22.66, "step": 21817, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.95701 }, { "epoch": 0.708767826397687, "grad_norm": 0.35676419734954834, "learning_rate": 2.145173739671712e-06, "loss": 0.017136547714471817, "memory(GiB)": 22.66, "step": 21818, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.7088003118604425, "grad_norm": 0.3851923644542694, "learning_rate": 2.1447327674316547e-06, "loss": 0.008970905095338821, "memory(GiB)": 22.66, "step": 21819, "token_acc": 1.0, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.7088327973231978, "grad_norm": 0.21842500567436218, "learning_rate": 2.1442918281451835e-06, "loss": 0.008602917194366455, "memory(GiB)": 22.66, "step": 21820, "token_acc": 1.0, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.7088652827859533, "grad_norm": 0.4668092727661133, "learning_rate": 2.1438509218173843e-06, "loss": 0.023646119982004166, "memory(GiB)": 22.66, "step": 21821, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.7088977682487086, "grad_norm": 0.29864251613616943, "learning_rate": 2.143410048453346e-06, "loss": 0.014978300780057907, "memory(GiB)": 22.66, "step": 21822, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957049 }, { "epoch": 0.7089302537114641, "grad_norm": 0.5230826735496521, "learning_rate": 2.1429692080581583e-06, "loss": 0.026371831074357033, "memory(GiB)": 22.66, "step": 21823, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.7089627391742196, "grad_norm": 0.2861512005329132, "learning_rate": 2.1425284006369086e-06, "loss": 0.009804586879909039, "memory(GiB)": 22.66, "step": 21824, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.708995224636975, "grad_norm": 0.3056514859199524, "learning_rate": 2.142087626194686e-06, "loss": 0.01354294829070568, "memory(GiB)": 22.66, "step": 21825, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.7090277100997304, "grad_norm": 1.246583104133606, "learning_rate": 2.1416468847365745e-06, "loss": 0.01825403980910778, "memory(GiB)": 22.66, "step": 21826, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.7090601955624858, "grad_norm": 0.3714275062084198, "learning_rate": 2.1412061762676627e-06, "loss": 0.014791037887334824, "memory(GiB)": 22.66, "step": 21827, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.7090926810252413, "grad_norm": 1.3439977169036865, "learning_rate": 2.140765500793036e-06, "loss": 0.01024119183421135, "memory(GiB)": 22.66, "step": 21828, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957093 }, { "epoch": 0.7091251664879966, "grad_norm": 0.35719117522239685, "learning_rate": 2.1403248583177833e-06, "loss": 0.01508799847215414, "memory(GiB)": 22.66, "step": 21829, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.7091576519507521, "grad_norm": 0.22387133538722992, "learning_rate": 2.139884248846986e-06, "loss": 0.010758797638118267, "memory(GiB)": 22.66, "step": 21830, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957108 }, { "epoch": 0.7091901374135074, "grad_norm": 0.3139567971229553, "learning_rate": 2.1394436723857314e-06, "loss": 0.013724720105528831, "memory(GiB)": 22.66, "step": 21831, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957115 }, { "epoch": 0.7092226228762629, "grad_norm": 0.5251248478889465, "learning_rate": 2.1390031289391043e-06, "loss": 0.014831816777586937, "memory(GiB)": 22.66, "step": 21832, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.7092551083390183, "grad_norm": 0.28949809074401855, "learning_rate": 2.138562618512191e-06, "loss": 0.013674666173756123, "memory(GiB)": 22.66, "step": 21833, "token_acc": 1.0, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.7092875938017738, "grad_norm": 0.4652392566204071, "learning_rate": 2.1381221411100716e-06, "loss": 0.0181626807898283, "memory(GiB)": 22.66, "step": 21834, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.7093200792645291, "grad_norm": 0.28147193789482117, "learning_rate": 2.1376816967378317e-06, "loss": 0.017225291579961777, "memory(GiB)": 22.66, "step": 21835, "token_acc": 1.0, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.7093525647272846, "grad_norm": 0.39045751094818115, "learning_rate": 2.137241285400555e-06, "loss": 0.015580616891384125, "memory(GiB)": 22.66, "step": 21836, "token_acc": 0.9827586206896551, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.70938505019004, "grad_norm": 0.4840974509716034, "learning_rate": 2.1368009071033258e-06, "loss": 0.015708077698946, "memory(GiB)": 22.66, "step": 21837, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957157 }, { "epoch": 0.7094175356527954, "grad_norm": 0.277692049741745, "learning_rate": 2.1363605618512236e-06, "loss": 0.012331032194197178, "memory(GiB)": 22.66, "step": 21838, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.7094500211155508, "grad_norm": 0.33119654655456543, "learning_rate": 2.1359202496493315e-06, "loss": 0.013668367639183998, "memory(GiB)": 22.66, "step": 21839, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.7094825065783062, "grad_norm": 0.34976160526275635, "learning_rate": 2.135479970502732e-06, "loss": 0.01694418489933014, "memory(GiB)": 22.66, "step": 21840, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.7095149920410616, "grad_norm": 0.6194717884063721, "learning_rate": 2.135039724416506e-06, "loss": 0.01783410832285881, "memory(GiB)": 22.66, "step": 21841, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.7095474775038171, "grad_norm": 0.28309816122055054, "learning_rate": 2.1345995113957375e-06, "loss": 0.008727900683879852, "memory(GiB)": 22.66, "step": 21842, "token_acc": 1.0, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.7095799629665724, "grad_norm": 0.3602317273616791, "learning_rate": 2.134159331445502e-06, "loss": 0.010421942919492722, "memory(GiB)": 22.66, "step": 21843, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.7096124484293279, "grad_norm": 0.36883544921875, "learning_rate": 2.1337191845708842e-06, "loss": 0.013900847174227238, "memory(GiB)": 22.66, "step": 21844, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.7096449338920833, "grad_norm": 0.34507718682289124, "learning_rate": 2.1332790707769584e-06, "loss": 0.013615133240818977, "memory(GiB)": 22.66, "step": 21845, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.7096774193548387, "grad_norm": 0.3566182255744934, "learning_rate": 2.132838990068811e-06, "loss": 0.017100201919674873, "memory(GiB)": 22.66, "step": 21846, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.7097099048175941, "grad_norm": 0.6380085349082947, "learning_rate": 2.1323989424515167e-06, "loss": 0.024127259850502014, "memory(GiB)": 22.66, "step": 21847, "token_acc": 0.9885714285714285, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.7097423902803496, "grad_norm": 0.3150296211242676, "learning_rate": 2.131958927930157e-06, "loss": 0.013548365794122219, "memory(GiB)": 22.66, "step": 21848, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957255 }, { "epoch": 0.7097748757431049, "grad_norm": 0.37350165843963623, "learning_rate": 2.1315189465098064e-06, "loss": 0.012839091941714287, "memory(GiB)": 22.66, "step": 21849, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.7098073612058604, "grad_norm": 0.34335410594940186, "learning_rate": 2.1310789981955452e-06, "loss": 0.014840896241366863, "memory(GiB)": 22.66, "step": 21850, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957274 }, { "epoch": 0.7098398466686158, "grad_norm": 0.40301910042762756, "learning_rate": 2.1306390829924512e-06, "loss": 0.011011790484189987, "memory(GiB)": 22.66, "step": 21851, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957283 }, { "epoch": 0.7098723321313712, "grad_norm": 0.2933133840560913, "learning_rate": 2.1301992009056027e-06, "loss": 0.013598412275314331, "memory(GiB)": 22.66, "step": 21852, "token_acc": 1.0, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.7099048175941266, "grad_norm": 0.5214527249336243, "learning_rate": 2.129759351940074e-06, "loss": 0.012967882677912712, "memory(GiB)": 22.66, "step": 21853, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.7099373030568821, "grad_norm": 0.35709211230278015, "learning_rate": 2.1293195361009424e-06, "loss": 0.014079049229621887, "memory(GiB)": 22.66, "step": 21854, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.7099697885196374, "grad_norm": 0.33269476890563965, "learning_rate": 2.1288797533932835e-06, "loss": 0.012208999134600163, "memory(GiB)": 22.66, "step": 21855, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957316 }, { "epoch": 0.7100022739823929, "grad_norm": 0.3597840666770935, "learning_rate": 2.128440003822174e-06, "loss": 0.014028936624526978, "memory(GiB)": 22.66, "step": 21856, "token_acc": 1.0, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.7100347594451483, "grad_norm": 0.2331002950668335, "learning_rate": 2.1280002873926913e-06, "loss": 0.013542726635932922, "memory(GiB)": 22.66, "step": 21857, "token_acc": 1.0, "train_speed(iter/s)": 0.95733 }, { "epoch": 0.7100672449079037, "grad_norm": 0.44593527913093567, "learning_rate": 2.1275606041099055e-06, "loss": 0.01766950450837612, "memory(GiB)": 22.66, "step": 21858, "token_acc": 1.0, "train_speed(iter/s)": 0.957338 }, { "epoch": 0.7100997303706591, "grad_norm": 0.46226760745048523, "learning_rate": 2.127120953978894e-06, "loss": 0.017267577350139618, "memory(GiB)": 22.66, "step": 21859, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957345 }, { "epoch": 0.7101322158334146, "grad_norm": 0.3451279103755951, "learning_rate": 2.12668133700473e-06, "loss": 0.010601747781038284, "memory(GiB)": 22.66, "step": 21860, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.7101647012961699, "grad_norm": 0.3111780285835266, "learning_rate": 2.12624175319249e-06, "loss": 0.011499764397740364, "memory(GiB)": 22.66, "step": 21861, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.7101971867589254, "grad_norm": 0.26715847849845886, "learning_rate": 2.125802202547243e-06, "loss": 0.013823538087308407, "memory(GiB)": 22.66, "step": 21862, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.7102296722216808, "grad_norm": 0.28598788380622864, "learning_rate": 2.125362685074064e-06, "loss": 0.013116540387272835, "memory(GiB)": 22.66, "step": 21863, "token_acc": 1.0, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.7102621576844362, "grad_norm": 0.3556181490421295, "learning_rate": 2.124923200778026e-06, "loss": 0.009523221291601658, "memory(GiB)": 22.66, "step": 21864, "token_acc": 1.0, "train_speed(iter/s)": 0.957383 }, { "epoch": 0.7102946431471916, "grad_norm": 0.3724251091480255, "learning_rate": 2.1244837496642034e-06, "loss": 0.011533575132489204, "memory(GiB)": 22.66, "step": 21865, "token_acc": 1.0, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.7103271286099471, "grad_norm": 0.3166157305240631, "learning_rate": 2.124044331737663e-06, "loss": 0.01028447411954403, "memory(GiB)": 22.66, "step": 21866, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957397 }, { "epoch": 0.7103596140727024, "grad_norm": 0.3899938464164734, "learning_rate": 2.123604947003481e-06, "loss": 0.011345336213707924, "memory(GiB)": 22.66, "step": 21867, "token_acc": 1.0, "train_speed(iter/s)": 0.957405 }, { "epoch": 0.7103920995354579, "grad_norm": 0.24724331498146057, "learning_rate": 2.123165595466723e-06, "loss": 0.008317621424794197, "memory(GiB)": 22.66, "step": 21868, "token_acc": 1.0, "train_speed(iter/s)": 0.957413 }, { "epoch": 0.7104245849982133, "grad_norm": 1.328690767288208, "learning_rate": 2.1227262771324666e-06, "loss": 0.022959498688578606, "memory(GiB)": 22.66, "step": 21869, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.7104570704609687, "grad_norm": 0.3291945457458496, "learning_rate": 2.1222869920057766e-06, "loss": 0.015575307421386242, "memory(GiB)": 22.66, "step": 21870, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.7104895559237241, "grad_norm": 0.33088594675064087, "learning_rate": 2.121847740091725e-06, "loss": 0.016656745225191116, "memory(GiB)": 22.66, "step": 21871, "token_acc": 1.0, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.7105220413864796, "grad_norm": 0.48134610056877136, "learning_rate": 2.1214085213953827e-06, "loss": 0.017210060730576515, "memory(GiB)": 22.66, "step": 21872, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.7105545268492349, "grad_norm": 0.44715937972068787, "learning_rate": 2.120969335921814e-06, "loss": 0.013259855099022388, "memory(GiB)": 22.66, "step": 21873, "token_acc": 1.0, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.7105870123119904, "grad_norm": 0.31176814436912537, "learning_rate": 2.120530183676095e-06, "loss": 0.01347656361758709, "memory(GiB)": 22.66, "step": 21874, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957457 }, { "epoch": 0.7106194977747458, "grad_norm": 0.3733086884021759, "learning_rate": 2.1200910646632882e-06, "loss": 0.013011196628212929, "memory(GiB)": 22.66, "step": 21875, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.7106519832375012, "grad_norm": 0.3932304382324219, "learning_rate": 2.1196519788884663e-06, "loss": 0.014368187636137009, "memory(GiB)": 22.66, "step": 21876, "token_acc": 1.0, "train_speed(iter/s)": 0.957472 }, { "epoch": 0.7106844687002566, "grad_norm": 0.30133989453315735, "learning_rate": 2.119212926356691e-06, "loss": 0.013374848291277885, "memory(GiB)": 22.66, "step": 21877, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.7107169541630121, "grad_norm": 0.32483774423599243, "learning_rate": 2.118773907073034e-06, "loss": 0.018748287111520767, "memory(GiB)": 22.66, "step": 21878, "token_acc": 1.0, "train_speed(iter/s)": 0.957489 }, { "epoch": 0.7107494396257674, "grad_norm": 0.3324747681617737, "learning_rate": 2.118334921042561e-06, "loss": 0.014424825087189674, "memory(GiB)": 22.66, "step": 21879, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.7107819250885229, "grad_norm": 0.3523785173892975, "learning_rate": 2.1178959682703397e-06, "loss": 0.01504797674715519, "memory(GiB)": 22.66, "step": 21880, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.7108144105512783, "grad_norm": 0.2608224153518677, "learning_rate": 2.1174570487614336e-06, "loss": 0.011368838138878345, "memory(GiB)": 22.66, "step": 21881, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.7108468960140337, "grad_norm": 0.7763302326202393, "learning_rate": 2.11701816252091e-06, "loss": 0.01163279078900814, "memory(GiB)": 22.66, "step": 21882, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.7108793814767891, "grad_norm": 0.25223657488822937, "learning_rate": 2.1165793095538333e-06, "loss": 0.015184896066784859, "memory(GiB)": 22.66, "step": 21883, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.957527 }, { "epoch": 0.7109118669395446, "grad_norm": 0.2967599034309387, "learning_rate": 2.1161404898652714e-06, "loss": 0.010673973709344864, "memory(GiB)": 22.66, "step": 21884, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957534 }, { "epoch": 0.7109443524022999, "grad_norm": 0.3265593945980072, "learning_rate": 2.1157017034602856e-06, "loss": 0.013953201472759247, "memory(GiB)": 22.66, "step": 21885, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957541 }, { "epoch": 0.7109768378650554, "grad_norm": 0.5641876459121704, "learning_rate": 2.1152629503439402e-06, "loss": 0.014867004007101059, "memory(GiB)": 22.66, "step": 21886, "token_acc": 1.0, "train_speed(iter/s)": 0.957546 }, { "epoch": 0.7110093233278107, "grad_norm": 0.20570245385169983, "learning_rate": 2.1148242305213e-06, "loss": 0.009116029366850853, "memory(GiB)": 22.66, "step": 21887, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957553 }, { "epoch": 0.7110418087905662, "grad_norm": 0.2777094542980194, "learning_rate": 2.1143855439974285e-06, "loss": 0.01405191421508789, "memory(GiB)": 22.66, "step": 21888, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.95756 }, { "epoch": 0.7110742942533217, "grad_norm": 0.3526402711868286, "learning_rate": 2.1139468907773906e-06, "loss": 0.012842860072851181, "memory(GiB)": 22.66, "step": 21889, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.711106779716077, "grad_norm": 0.5848612189292908, "learning_rate": 2.113508270866245e-06, "loss": 0.012897566892206669, "memory(GiB)": 22.66, "step": 21890, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.7111392651788325, "grad_norm": 0.33381959795951843, "learning_rate": 2.1130696842690556e-06, "loss": 0.010673342272639275, "memory(GiB)": 22.66, "step": 21891, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957581 }, { "epoch": 0.7111717506415879, "grad_norm": 0.3756794333457947, "learning_rate": 2.112631130990885e-06, "loss": 0.00966699980199337, "memory(GiB)": 22.66, "step": 21892, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957588 }, { "epoch": 0.7112042361043434, "grad_norm": 0.32111600041389465, "learning_rate": 2.112192611036796e-06, "loss": 0.014499323442578316, "memory(GiB)": 22.66, "step": 21893, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.7112367215670987, "grad_norm": 0.36216166615486145, "learning_rate": 2.1117541244118455e-06, "loss": 0.01586442068219185, "memory(GiB)": 22.66, "step": 21894, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.7112692070298542, "grad_norm": 0.4772041141986847, "learning_rate": 2.1113156711210984e-06, "loss": 0.016034014523029327, "memory(GiB)": 22.66, "step": 21895, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.7113016924926095, "grad_norm": 0.3013162910938263, "learning_rate": 2.11087725116961e-06, "loss": 0.01701120100915432, "memory(GiB)": 22.66, "step": 21896, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957616 }, { "epoch": 0.711334177955365, "grad_norm": 0.4060671925544739, "learning_rate": 2.1104388645624473e-06, "loss": 0.015372811816632748, "memory(GiB)": 22.66, "step": 21897, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957623 }, { "epoch": 0.7113666634181204, "grad_norm": 0.3311435580253601, "learning_rate": 2.1100005113046646e-06, "loss": 0.016596945002675056, "memory(GiB)": 22.66, "step": 21898, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.7113991488808759, "grad_norm": 0.32236814498901367, "learning_rate": 2.1095621914013236e-06, "loss": 0.01322435587644577, "memory(GiB)": 22.66, "step": 21899, "token_acc": 1.0, "train_speed(iter/s)": 0.957639 }, { "epoch": 0.7114316343436312, "grad_norm": 0.27074405550956726, "learning_rate": 2.109123904857481e-06, "loss": 0.009092177264392376, "memory(GiB)": 22.66, "step": 21900, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.7114641198063867, "grad_norm": 0.3702954351902008, "learning_rate": 2.108685651678194e-06, "loss": 0.016952481120824814, "memory(GiB)": 22.66, "step": 21901, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957658 }, { "epoch": 0.711496605269142, "grad_norm": 0.4662787914276123, "learning_rate": 2.108247431868527e-06, "loss": 0.015072403475642204, "memory(GiB)": 22.66, "step": 21902, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.7115290907318975, "grad_norm": 0.3369406461715698, "learning_rate": 2.1078092454335313e-06, "loss": 0.013864005915820599, "memory(GiB)": 22.66, "step": 21903, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957676 }, { "epoch": 0.7115615761946529, "grad_norm": 0.501250147819519, "learning_rate": 2.107371092378268e-06, "loss": 0.01687949150800705, "memory(GiB)": 22.66, "step": 21904, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.7115940616574083, "grad_norm": 0.4167908728122711, "learning_rate": 2.1069329727077916e-06, "loss": 0.017708364874124527, "memory(GiB)": 22.66, "step": 21905, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.7116265471201637, "grad_norm": 0.3117929697036743, "learning_rate": 2.1064948864271582e-06, "loss": 0.008056703954935074, "memory(GiB)": 22.66, "step": 21906, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957703 }, { "epoch": 0.7116590325829192, "grad_norm": 0.3323694169521332, "learning_rate": 2.1060568335414256e-06, "loss": 0.012161081656813622, "memory(GiB)": 22.66, "step": 21907, "token_acc": 1.0, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.7116915180456745, "grad_norm": 0.2936576306819916, "learning_rate": 2.1056188140556506e-06, "loss": 0.010414021089673042, "memory(GiB)": 22.66, "step": 21908, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.71172400350843, "grad_norm": 0.295341432094574, "learning_rate": 2.105180827974885e-06, "loss": 0.011127127334475517, "memory(GiB)": 22.66, "step": 21909, "token_acc": 1.0, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.7117564889711854, "grad_norm": 0.24265682697296143, "learning_rate": 2.1047428753041858e-06, "loss": 0.009546604938805103, "memory(GiB)": 22.66, "step": 21910, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.7117889744339408, "grad_norm": 0.35511916875839233, "learning_rate": 2.104304956048608e-06, "loss": 0.017819354310631752, "memory(GiB)": 22.66, "step": 21911, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.7118214598966962, "grad_norm": 0.4113369882106781, "learning_rate": 2.1038670702132063e-06, "loss": 0.011798831634223461, "memory(GiB)": 22.66, "step": 21912, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.957757 }, { "epoch": 0.7118539453594517, "grad_norm": 0.3688516914844513, "learning_rate": 2.1034292178030318e-06, "loss": 0.013146212324500084, "memory(GiB)": 22.66, "step": 21913, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.711886430822207, "grad_norm": 0.39536988735198975, "learning_rate": 2.1029913988231394e-06, "loss": 0.012403029948472977, "memory(GiB)": 22.66, "step": 21914, "token_acc": 1.0, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.7119189162849625, "grad_norm": 0.4259062111377716, "learning_rate": 2.1025536132785817e-06, "loss": 0.01543421857059002, "memory(GiB)": 22.66, "step": 21915, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.7119514017477179, "grad_norm": 0.5722470283508301, "learning_rate": 2.1021158611744126e-06, "loss": 0.01626323163509369, "memory(GiB)": 22.66, "step": 21916, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.7119838872104733, "grad_norm": 0.21292752027511597, "learning_rate": 2.1016781425156845e-06, "loss": 0.00832931138575077, "memory(GiB)": 22.66, "step": 21917, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957798 }, { "epoch": 0.7120163726732287, "grad_norm": 0.3236730396747589, "learning_rate": 2.1012404573074467e-06, "loss": 0.010693350806832314, "memory(GiB)": 22.66, "step": 21918, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957805 }, { "epoch": 0.7120488581359842, "grad_norm": 0.3039131164550781, "learning_rate": 2.100802805554753e-06, "loss": 0.013328754343092442, "memory(GiB)": 22.66, "step": 21919, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.7120813435987395, "grad_norm": 0.2637757956981659, "learning_rate": 2.100365187262653e-06, "loss": 0.008950920775532722, "memory(GiB)": 22.66, "step": 21920, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.712113829061495, "grad_norm": 0.3000001907348633, "learning_rate": 2.0999276024362003e-06, "loss": 0.012475337833166122, "memory(GiB)": 22.66, "step": 21921, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.7121463145242504, "grad_norm": 0.3369758427143097, "learning_rate": 2.099490051080441e-06, "loss": 0.0164474043995142, "memory(GiB)": 22.66, "step": 21922, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957833 }, { "epoch": 0.7121787999870058, "grad_norm": 0.4487290680408478, "learning_rate": 2.099052533200429e-06, "loss": 0.012372354045510292, "memory(GiB)": 22.66, "step": 21923, "token_acc": 1.0, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.7122112854497612, "grad_norm": 0.3740277588367462, "learning_rate": 2.098615048801208e-06, "loss": 0.009810974821448326, "memory(GiB)": 22.66, "step": 21924, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.7122437709125167, "grad_norm": 0.4177292287349701, "learning_rate": 2.0981775978878354e-06, "loss": 0.016430236399173737, "memory(GiB)": 22.66, "step": 21925, "token_acc": 0.984, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.712276256375272, "grad_norm": 0.31448042392730713, "learning_rate": 2.0977401804653534e-06, "loss": 0.015848422423005104, "memory(GiB)": 22.66, "step": 21926, "token_acc": 1.0, "train_speed(iter/s)": 0.957861 }, { "epoch": 0.7123087418380275, "grad_norm": 0.3169380724430084, "learning_rate": 2.0973027965388144e-06, "loss": 0.01281420886516571, "memory(GiB)": 22.66, "step": 21927, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.7123412273007829, "grad_norm": 0.5407891869544983, "learning_rate": 2.096865446113261e-06, "loss": 0.01894019916653633, "memory(GiB)": 22.66, "step": 21928, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.7123737127635383, "grad_norm": 0.337384968996048, "learning_rate": 2.0964281291937484e-06, "loss": 0.01124422624707222, "memory(GiB)": 22.66, "step": 21929, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.7124061982262937, "grad_norm": 0.35556739568710327, "learning_rate": 2.0959908457853173e-06, "loss": 0.013763762079179287, "memory(GiB)": 22.66, "step": 21930, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.7124386836890492, "grad_norm": 0.3293165862560272, "learning_rate": 2.0955535958930194e-06, "loss": 0.018552429974079132, "memory(GiB)": 22.66, "step": 21931, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.7124711691518045, "grad_norm": 0.3708433508872986, "learning_rate": 2.095116379521897e-06, "loss": 0.015117641538381577, "memory(GiB)": 22.66, "step": 21932, "token_acc": 0.9802955665024631, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.71250365461456, "grad_norm": 1.0627750158309937, "learning_rate": 2.0946791966769957e-06, "loss": 0.020683595910668373, "memory(GiB)": 22.66, "step": 21933, "token_acc": 1.0, "train_speed(iter/s)": 0.957912 }, { "epoch": 0.7125361400773154, "grad_norm": 0.31980806589126587, "learning_rate": 2.0942420473633673e-06, "loss": 0.014170996844768524, "memory(GiB)": 22.66, "step": 21934, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.7125686255400708, "grad_norm": 0.2845165431499481, "learning_rate": 2.093804931586052e-06, "loss": 0.01237020269036293, "memory(GiB)": 22.66, "step": 21935, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.7126011110028262, "grad_norm": 0.34692060947418213, "learning_rate": 2.093367849350097e-06, "loss": 0.014757277444005013, "memory(GiB)": 22.66, "step": 21936, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.957932 }, { "epoch": 0.7126335964655817, "grad_norm": 0.2886897325515747, "learning_rate": 2.092930800660544e-06, "loss": 0.012779192999005318, "memory(GiB)": 22.66, "step": 21937, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.712666081928337, "grad_norm": 0.466383159160614, "learning_rate": 2.092493785522439e-06, "loss": 0.019241075962781906, "memory(GiB)": 22.66, "step": 21938, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.7126985673910925, "grad_norm": 0.2672981917858124, "learning_rate": 2.0920568039408257e-06, "loss": 0.008470125496387482, "memory(GiB)": 22.66, "step": 21939, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.7127310528538479, "grad_norm": 1.3588836193084717, "learning_rate": 2.0916198559207495e-06, "loss": 0.0152437100186944, "memory(GiB)": 22.66, "step": 21940, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957959 }, { "epoch": 0.7127635383166033, "grad_norm": 0.35633552074432373, "learning_rate": 2.0911829414672486e-06, "loss": 0.014496374875307083, "memory(GiB)": 22.66, "step": 21941, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.7127960237793587, "grad_norm": 0.45461955666542053, "learning_rate": 2.0907460605853688e-06, "loss": 0.017867188900709152, "memory(GiB)": 22.66, "step": 21942, "token_acc": 0.9868995633187773, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.7128285092421142, "grad_norm": 0.18145114183425903, "learning_rate": 2.0903092132801515e-06, "loss": 0.010058612562716007, "memory(GiB)": 22.66, "step": 21943, "token_acc": 0.9921875, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.7128609947048695, "grad_norm": 0.48198407888412476, "learning_rate": 2.0898723995566404e-06, "loss": 0.01234235055744648, "memory(GiB)": 22.66, "step": 21944, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.712893480167625, "grad_norm": 1.0773849487304688, "learning_rate": 2.0894356194198734e-06, "loss": 0.01070246659219265, "memory(GiB)": 22.66, "step": 21945, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.7129259656303804, "grad_norm": 0.2836056053638458, "learning_rate": 2.0889988728748935e-06, "loss": 0.012443368323147297, "memory(GiB)": 22.66, "step": 21946, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958001 }, { "epoch": 0.7129584510931358, "grad_norm": 0.4238221049308777, "learning_rate": 2.0885621599267407e-06, "loss": 0.011847035959362984, "memory(GiB)": 22.66, "step": 21947, "token_acc": 1.0, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.7129909365558912, "grad_norm": 0.34714236855506897, "learning_rate": 2.0881254805804557e-06, "loss": 0.01479429192841053, "memory(GiB)": 22.66, "step": 21948, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.7130234220186467, "grad_norm": 0.24764834344387054, "learning_rate": 2.0876888348410807e-06, "loss": 0.012260600924491882, "memory(GiB)": 22.66, "step": 21949, "token_acc": 1.0, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.713055907481402, "grad_norm": 0.4201008975505829, "learning_rate": 2.0872522227136503e-06, "loss": 0.018081437796354294, "memory(GiB)": 22.66, "step": 21950, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.7130883929441575, "grad_norm": 0.3334748148918152, "learning_rate": 2.0868156442032067e-06, "loss": 0.014418642967939377, "memory(GiB)": 22.66, "step": 21951, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.713120878406913, "grad_norm": 0.5390240550041199, "learning_rate": 2.086379099314788e-06, "loss": 0.020057406276464462, "memory(GiB)": 22.66, "step": 21952, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.7131533638696683, "grad_norm": 0.3919064402580261, "learning_rate": 2.085942588053434e-06, "loss": 0.014205796644091606, "memory(GiB)": 22.66, "step": 21953, "token_acc": 1.0, "train_speed(iter/s)": 0.958049 }, { "epoch": 0.7131858493324238, "grad_norm": 0.33533281087875366, "learning_rate": 2.08550611042418e-06, "loss": 0.008744542486965656, "memory(GiB)": 22.66, "step": 21954, "token_acc": 1.0, "train_speed(iter/s)": 0.958056 }, { "epoch": 0.7132183347951792, "grad_norm": 0.2843635380268097, "learning_rate": 2.0850696664320656e-06, "loss": 0.013243676163256168, "memory(GiB)": 22.66, "step": 21955, "token_acc": 1.0, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.7132508202579346, "grad_norm": 0.31726232171058655, "learning_rate": 2.0846332560821235e-06, "loss": 0.014023639261722565, "memory(GiB)": 22.66, "step": 21956, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95807 }, { "epoch": 0.71328330572069, "grad_norm": 0.49760758876800537, "learning_rate": 2.084196879379398e-06, "loss": 0.021380193531513214, "memory(GiB)": 22.66, "step": 21957, "token_acc": 0.9790794979079498, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.7133157911834455, "grad_norm": 0.29380306601524353, "learning_rate": 2.0837605363289193e-06, "loss": 0.010923326015472412, "memory(GiB)": 22.66, "step": 21958, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.7133482766462008, "grad_norm": 0.3369797468185425, "learning_rate": 2.083324226935728e-06, "loss": 0.013329663313925266, "memory(GiB)": 22.66, "step": 21959, "token_acc": 0.9932432432432432, "train_speed(iter/s)": 0.958094 }, { "epoch": 0.7133807621089563, "grad_norm": 0.3104037940502167, "learning_rate": 2.0828879512048545e-06, "loss": 0.0186055488884449, "memory(GiB)": 22.66, "step": 21960, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958103 }, { "epoch": 0.7134132475717117, "grad_norm": 0.3866998553276062, "learning_rate": 2.082451709141337e-06, "loss": 0.018553875386714935, "memory(GiB)": 22.66, "step": 21961, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.7134457330344671, "grad_norm": 0.29786279797554016, "learning_rate": 2.0820155007502103e-06, "loss": 0.014445037581026554, "memory(GiB)": 22.66, "step": 21962, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.7134782184972225, "grad_norm": 0.31876856088638306, "learning_rate": 2.081579326036508e-06, "loss": 0.01156574860215187, "memory(GiB)": 22.66, "step": 21963, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.713510703959978, "grad_norm": 0.38728681206703186, "learning_rate": 2.081143185005267e-06, "loss": 0.015651624649763107, "memory(GiB)": 22.66, "step": 21964, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958141 }, { "epoch": 0.7135431894227333, "grad_norm": 0.36653146147727966, "learning_rate": 2.0807070776615166e-06, "loss": 0.017452603206038475, "memory(GiB)": 22.66, "step": 21965, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.7135756748854888, "grad_norm": 0.23755337297916412, "learning_rate": 2.0802710040102924e-06, "loss": 0.011726034805178642, "memory(GiB)": 22.66, "step": 21966, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.7136081603482441, "grad_norm": 0.4087904691696167, "learning_rate": 2.079834964056627e-06, "loss": 0.013861631974577904, "memory(GiB)": 22.66, "step": 21967, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958167 }, { "epoch": 0.7136406458109996, "grad_norm": 0.25912249088287354, "learning_rate": 2.079398957805554e-06, "loss": 0.015744872391223907, "memory(GiB)": 22.66, "step": 21968, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.713673131273755, "grad_norm": 0.3362036943435669, "learning_rate": 2.078962985262103e-06, "loss": 0.014437064528465271, "memory(GiB)": 22.66, "step": 21969, "token_acc": 1.0, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.7137056167365105, "grad_norm": 0.2692241966724396, "learning_rate": 2.0785270464313072e-06, "loss": 0.009817386046051979, "memory(GiB)": 22.66, "step": 21970, "token_acc": 1.0, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.7137381021992658, "grad_norm": 0.3773621916770935, "learning_rate": 2.0780911413181977e-06, "loss": 0.01280677318572998, "memory(GiB)": 22.66, "step": 21971, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958205 }, { "epoch": 0.7137705876620213, "grad_norm": 0.19352790713310242, "learning_rate": 2.0776552699278067e-06, "loss": 0.006535204593092203, "memory(GiB)": 22.66, "step": 21972, "token_acc": 1.0, "train_speed(iter/s)": 0.958214 }, { "epoch": 0.7138030731247766, "grad_norm": 0.35878702998161316, "learning_rate": 2.0772194322651624e-06, "loss": 0.01731235161423683, "memory(GiB)": 22.66, "step": 21973, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958223 }, { "epoch": 0.7138355585875321, "grad_norm": 0.3398098051548004, "learning_rate": 2.0767836283352955e-06, "loss": 0.014552763663232327, "memory(GiB)": 22.66, "step": 21974, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.7138680440502875, "grad_norm": 0.27540016174316406, "learning_rate": 2.0763478581432368e-06, "loss": 0.011362257413566113, "memory(GiB)": 22.66, "step": 21975, "token_acc": 1.0, "train_speed(iter/s)": 0.958242 }, { "epoch": 0.713900529513043, "grad_norm": 0.31886711716651917, "learning_rate": 2.0759121216940166e-06, "loss": 0.011696144938468933, "memory(GiB)": 22.66, "step": 21976, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958251 }, { "epoch": 0.7139330149757983, "grad_norm": 0.3618101179599762, "learning_rate": 2.075476418992661e-06, "loss": 0.019137345254421234, "memory(GiB)": 22.66, "step": 21977, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.7139655004385538, "grad_norm": 0.35627225041389465, "learning_rate": 2.0750407500441997e-06, "loss": 0.01396653987467289, "memory(GiB)": 22.66, "step": 21978, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.7139979859013091, "grad_norm": 0.38480448722839355, "learning_rate": 2.0746051148536615e-06, "loss": 0.01745683141052723, "memory(GiB)": 22.66, "step": 21979, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.7140304713640646, "grad_norm": 0.3097022771835327, "learning_rate": 2.0741695134260743e-06, "loss": 0.017159931361675262, "memory(GiB)": 22.66, "step": 21980, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.71406295682682, "grad_norm": 0.23396459221839905, "learning_rate": 2.073733945766466e-06, "loss": 0.011333414353430271, "memory(GiB)": 22.66, "step": 21981, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.7140954422895754, "grad_norm": 0.27853450179100037, "learning_rate": 2.0732984118798622e-06, "loss": 0.011160585097968578, "memory(GiB)": 22.66, "step": 21982, "token_acc": 1.0, "train_speed(iter/s)": 0.958295 }, { "epoch": 0.7141279277523308, "grad_norm": 0.2580883800983429, "learning_rate": 2.072862911771291e-06, "loss": 0.011811566539108753, "memory(GiB)": 22.66, "step": 21983, "token_acc": 1.0, "train_speed(iter/s)": 0.958302 }, { "epoch": 0.7141604132150863, "grad_norm": 0.2143075317144394, "learning_rate": 2.072427445445775e-06, "loss": 0.011477796360850334, "memory(GiB)": 22.66, "step": 21984, "token_acc": 1.0, "train_speed(iter/s)": 0.95831 }, { "epoch": 0.7141928986778416, "grad_norm": 0.3390044569969177, "learning_rate": 2.071992012908346e-06, "loss": 0.012610546313226223, "memory(GiB)": 22.66, "step": 21985, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.958317 }, { "epoch": 0.7142253841405971, "grad_norm": 0.3443383574485779, "learning_rate": 2.0715566141640247e-06, "loss": 0.008260712958872318, "memory(GiB)": 22.66, "step": 21986, "token_acc": 1.0, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.7142578696033525, "grad_norm": 0.409162312746048, "learning_rate": 2.0711212492178394e-06, "loss": 0.020017165690660477, "memory(GiB)": 22.66, "step": 21987, "token_acc": 1.0, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.7142903550661079, "grad_norm": 0.3341871500015259, "learning_rate": 2.070685918074812e-06, "loss": 0.012131690047681332, "memory(GiB)": 22.66, "step": 21988, "token_acc": 1.0, "train_speed(iter/s)": 0.958337 }, { "epoch": 0.7143228405288633, "grad_norm": 0.37523433566093445, "learning_rate": 2.0702506207399675e-06, "loss": 0.012685807421803474, "memory(GiB)": 22.66, "step": 21989, "token_acc": 0.996, "train_speed(iter/s)": 0.958345 }, { "epoch": 0.7143553259916188, "grad_norm": 0.4575952887535095, "learning_rate": 2.06981535721833e-06, "loss": 0.014005688950419426, "memory(GiB)": 22.66, "step": 21990, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.7143878114543741, "grad_norm": 0.2100730985403061, "learning_rate": 2.0693801275149254e-06, "loss": 0.012875022366642952, "memory(GiB)": 22.66, "step": 21991, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.958359 }, { "epoch": 0.7144202969171296, "grad_norm": 0.26571500301361084, "learning_rate": 2.0689449316347735e-06, "loss": 0.015251899138092995, "memory(GiB)": 22.66, "step": 21992, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.958367 }, { "epoch": 0.714452782379885, "grad_norm": 0.26460716128349304, "learning_rate": 2.0685097695828984e-06, "loss": 0.01174158975481987, "memory(GiB)": 22.66, "step": 21993, "token_acc": 1.0, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.7144852678426404, "grad_norm": 0.3396786153316498, "learning_rate": 2.0680746413643215e-06, "loss": 0.016335655003786087, "memory(GiB)": 22.66, "step": 21994, "token_acc": 0.996, "train_speed(iter/s)": 0.958381 }, { "epoch": 0.7145177533053958, "grad_norm": 0.37067633867263794, "learning_rate": 2.067639546984066e-06, "loss": 0.01892862841486931, "memory(GiB)": 22.66, "step": 21995, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958388 }, { "epoch": 0.7145502387681513, "grad_norm": 0.4021056294441223, "learning_rate": 2.0672044864471545e-06, "loss": 0.013710328377783298, "memory(GiB)": 22.66, "step": 21996, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.958393 }, { "epoch": 0.7145827242309066, "grad_norm": 0.29465270042419434, "learning_rate": 2.066769459758606e-06, "loss": 0.00797085277736187, "memory(GiB)": 22.66, "step": 21997, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958401 }, { "epoch": 0.7146152096936621, "grad_norm": 0.3189520239830017, "learning_rate": 2.0663344669234416e-06, "loss": 0.012210816144943237, "memory(GiB)": 22.66, "step": 21998, "token_acc": 1.0, "train_speed(iter/s)": 0.958408 }, { "epoch": 0.7146476951564175, "grad_norm": 0.3866201341152191, "learning_rate": 2.0658995079466813e-06, "loss": 0.01917724870145321, "memory(GiB)": 22.66, "step": 21999, "token_acc": 0.984, "train_speed(iter/s)": 0.958414 }, { "epoch": 0.7146801806191729, "grad_norm": 0.3501315414905548, "learning_rate": 2.0654645828333487e-06, "loss": 0.011676153168082237, "memory(GiB)": 22.66, "step": 22000, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95842 }, { "epoch": 0.7146801806191729, "eval_loss": 0.015376176685094833, "eval_runtime": 81.2588, "eval_samples_per_second": 122.448, "eval_steps_per_second": 3.827, "eval_token_acc": 0.9938563659546572, "step": 22000 }, { "epoch": 0.7147126660819283, "grad_norm": 0.47863510251045227, "learning_rate": 2.0650296915884587e-06, "loss": 0.013076169416308403, "memory(GiB)": 22.66, "step": 22001, "token_acc": 0.9933193697208647, "train_speed(iter/s)": 0.954597 }, { "epoch": 0.7147451515446838, "grad_norm": 0.4001603424549103, "learning_rate": 2.0645948342170325e-06, "loss": 0.017337854951620102, "memory(GiB)": 22.66, "step": 22002, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.954603 }, { "epoch": 0.7147776370074391, "grad_norm": 0.28702259063720703, "learning_rate": 2.0641600107240887e-06, "loss": 0.013545487076044083, "memory(GiB)": 22.66, "step": 22003, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954611 }, { "epoch": 0.7148101224701946, "grad_norm": 0.42182961106300354, "learning_rate": 2.063725221114648e-06, "loss": 0.013177342712879181, "memory(GiB)": 22.66, "step": 22004, "token_acc": 1.0, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.71484260793295, "grad_norm": 0.32121574878692627, "learning_rate": 2.063290465393725e-06, "loss": 0.014092089608311653, "memory(GiB)": 22.66, "step": 22005, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.954627 }, { "epoch": 0.7148750933957054, "grad_norm": 0.4884273111820221, "learning_rate": 2.062855743566339e-06, "loss": 0.021741095930337906, "memory(GiB)": 22.66, "step": 22006, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954634 }, { "epoch": 0.7149075788584608, "grad_norm": 0.39518046379089355, "learning_rate": 2.062421055637505e-06, "loss": 0.016059644520282745, "memory(GiB)": 22.66, "step": 22007, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954641 }, { "epoch": 0.7149400643212163, "grad_norm": 0.43712154030799866, "learning_rate": 2.0619864016122456e-06, "loss": 0.01701001264154911, "memory(GiB)": 22.66, "step": 22008, "token_acc": 1.0, "train_speed(iter/s)": 0.954649 }, { "epoch": 0.7149725497839716, "grad_norm": 0.49690550565719604, "learning_rate": 2.061551781495571e-06, "loss": 0.013185761868953705, "memory(GiB)": 22.66, "step": 22009, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.954655 }, { "epoch": 0.7150050352467271, "grad_norm": 0.558724582195282, "learning_rate": 2.0611171952925002e-06, "loss": 0.028685014694929123, "memory(GiB)": 22.66, "step": 22010, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.954662 }, { "epoch": 0.7150375207094825, "grad_norm": 0.3343999981880188, "learning_rate": 2.06068264300805e-06, "loss": 0.016391022130846977, "memory(GiB)": 22.66, "step": 22011, "token_acc": 1.0, "train_speed(iter/s)": 0.954669 }, { "epoch": 0.7150700061722379, "grad_norm": 0.3658812940120697, "learning_rate": 2.0602481246472317e-06, "loss": 0.018672211095690727, "memory(GiB)": 22.66, "step": 22012, "token_acc": 1.0, "train_speed(iter/s)": 0.954676 }, { "epoch": 0.7151024916349933, "grad_norm": 0.3573090136051178, "learning_rate": 2.059813640215066e-06, "loss": 0.01853140816092491, "memory(GiB)": 22.66, "step": 22013, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954681 }, { "epoch": 0.7151349770977488, "grad_norm": 0.40776166319847107, "learning_rate": 2.0593791897165623e-06, "loss": 0.014721179381012917, "memory(GiB)": 22.66, "step": 22014, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.954687 }, { "epoch": 0.7151674625605041, "grad_norm": 0.37924978137016296, "learning_rate": 2.058944773156738e-06, "loss": 0.018542975187301636, "memory(GiB)": 22.66, "step": 22015, "token_acc": 1.0, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.7151999480232596, "grad_norm": 0.23624145984649658, "learning_rate": 2.0585103905406044e-06, "loss": 0.009421401657164097, "memory(GiB)": 22.66, "step": 22016, "token_acc": 1.0, "train_speed(iter/s)": 0.954699 }, { "epoch": 0.7152324334860151, "grad_norm": 0.2758645713329315, "learning_rate": 2.058076041873176e-06, "loss": 0.012345651164650917, "memory(GiB)": 22.66, "step": 22017, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954705 }, { "epoch": 0.7152649189487704, "grad_norm": 0.2591049075126648, "learning_rate": 2.0576417271594657e-06, "loss": 0.014897298067808151, "memory(GiB)": 22.66, "step": 22018, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954712 }, { "epoch": 0.7152974044115259, "grad_norm": 0.23197001218795776, "learning_rate": 2.0572074464044884e-06, "loss": 0.008531915955245495, "memory(GiB)": 22.66, "step": 22019, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954718 }, { "epoch": 0.7153298898742813, "grad_norm": 0.38075366616249084, "learning_rate": 2.0567731996132512e-06, "loss": 0.016575029119849205, "memory(GiB)": 22.66, "step": 22020, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.954724 }, { "epoch": 0.7153623753370367, "grad_norm": 0.35308897495269775, "learning_rate": 2.0563389867907695e-06, "loss": 0.019382011145353317, "memory(GiB)": 22.66, "step": 22021, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95473 }, { "epoch": 0.7153948607997921, "grad_norm": 0.47632524371147156, "learning_rate": 2.0559048079420534e-06, "loss": 0.02248290181159973, "memory(GiB)": 22.66, "step": 22022, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.954737 }, { "epoch": 0.7154273462625476, "grad_norm": 0.3657286763191223, "learning_rate": 2.0554706630721164e-06, "loss": 0.01638062298297882, "memory(GiB)": 22.66, "step": 22023, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.954743 }, { "epoch": 0.7154598317253029, "grad_norm": 0.38228490948677063, "learning_rate": 2.0550365521859655e-06, "loss": 0.01950278878211975, "memory(GiB)": 22.66, "step": 22024, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.7154923171880584, "grad_norm": 0.3271074891090393, "learning_rate": 2.0546024752886125e-06, "loss": 0.010996868833899498, "memory(GiB)": 22.66, "step": 22025, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.954756 }, { "epoch": 0.7155248026508138, "grad_norm": 0.2690447270870209, "learning_rate": 2.054168432385067e-06, "loss": 0.010829497128725052, "memory(GiB)": 22.66, "step": 22026, "token_acc": 1.0, "train_speed(iter/s)": 0.954764 }, { "epoch": 0.7155572881135692, "grad_norm": 0.3366273045539856, "learning_rate": 2.0537344234803392e-06, "loss": 0.017575936391949654, "memory(GiB)": 22.66, "step": 22027, "token_acc": 0.9825174825174825, "train_speed(iter/s)": 0.95477 }, { "epoch": 0.7155897735763246, "grad_norm": 0.3574080467224121, "learning_rate": 2.053300448579439e-06, "loss": 0.01185732800513506, "memory(GiB)": 22.66, "step": 22028, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.954777 }, { "epoch": 0.71562225903908, "grad_norm": 0.44551733136177063, "learning_rate": 2.0528665076873715e-06, "loss": 0.01504102349281311, "memory(GiB)": 22.66, "step": 22029, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.954785 }, { "epoch": 0.7156547445018354, "grad_norm": 0.27272096276283264, "learning_rate": 2.052432600809148e-06, "loss": 0.009662507101893425, "memory(GiB)": 22.66, "step": 22030, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.954793 }, { "epoch": 0.7156872299645909, "grad_norm": 0.24230843782424927, "learning_rate": 2.051998727949775e-06, "loss": 0.009241544641554356, "memory(GiB)": 22.66, "step": 22031, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.9548 }, { "epoch": 0.7157197154273462, "grad_norm": 0.3370808959007263, "learning_rate": 2.051564889114262e-06, "loss": 0.01603575237095356, "memory(GiB)": 22.66, "step": 22032, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.7157522008901017, "grad_norm": 0.23934410512447357, "learning_rate": 2.0511310843076134e-06, "loss": 0.01176919974386692, "memory(GiB)": 22.66, "step": 22033, "token_acc": 1.0, "train_speed(iter/s)": 0.954803 }, { "epoch": 0.7157846863528571, "grad_norm": 0.6893704533576965, "learning_rate": 2.0506973135348378e-06, "loss": 0.009668519720435143, "memory(GiB)": 22.66, "step": 22034, "token_acc": 1.0, "train_speed(iter/s)": 0.954811 }, { "epoch": 0.7158171718156126, "grad_norm": 0.3530474007129669, "learning_rate": 2.0502635768009384e-06, "loss": 0.012812277302145958, "memory(GiB)": 22.66, "step": 22035, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.954818 }, { "epoch": 0.7158496572783679, "grad_norm": 0.40897953510284424, "learning_rate": 2.049829874110926e-06, "loss": 0.015033422969281673, "memory(GiB)": 22.66, "step": 22036, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954825 }, { "epoch": 0.7158821427411234, "grad_norm": 0.35508307814598083, "learning_rate": 2.049396205469802e-06, "loss": 0.015368543565273285, "memory(GiB)": 22.66, "step": 22037, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.954834 }, { "epoch": 0.7159146282038787, "grad_norm": 0.3842199742794037, "learning_rate": 2.048962570882575e-06, "loss": 0.016509156674146652, "memory(GiB)": 22.66, "step": 22038, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954842 }, { "epoch": 0.7159471136666342, "grad_norm": 0.34961915016174316, "learning_rate": 2.048528970354246e-06, "loss": 0.01014617271721363, "memory(GiB)": 22.66, "step": 22039, "token_acc": 1.0, "train_speed(iter/s)": 0.954852 }, { "epoch": 0.7159795991293896, "grad_norm": 0.4648381769657135, "learning_rate": 2.0480954038898194e-06, "loss": 0.015745775774121284, "memory(GiB)": 22.66, "step": 22040, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954861 }, { "epoch": 0.716012084592145, "grad_norm": 0.37986549735069275, "learning_rate": 2.047661871494304e-06, "loss": 0.017984040081501007, "memory(GiB)": 22.66, "step": 22041, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.7160445700549004, "grad_norm": 0.39556455612182617, "learning_rate": 2.047228373172698e-06, "loss": 0.02004946395754814, "memory(GiB)": 22.66, "step": 22042, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954879 }, { "epoch": 0.7160770555176559, "grad_norm": 0.3029129207134247, "learning_rate": 2.0467949089300093e-06, "loss": 0.011580565944314003, "memory(GiB)": 22.66, "step": 22043, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.954888 }, { "epoch": 0.7161095409804112, "grad_norm": 0.2755492031574249, "learning_rate": 2.046361478771236e-06, "loss": 0.013306032866239548, "memory(GiB)": 22.66, "step": 22044, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.954898 }, { "epoch": 0.7161420264431667, "grad_norm": 0.2809845209121704, "learning_rate": 2.045928082701383e-06, "loss": 0.01274680346250534, "memory(GiB)": 22.66, "step": 22045, "token_acc": 1.0, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.7161745119059221, "grad_norm": 0.421963632106781, "learning_rate": 2.045494720725451e-06, "loss": 0.0210771132260561, "memory(GiB)": 22.66, "step": 22046, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.954916 }, { "epoch": 0.7162069973686775, "grad_norm": 0.5999838709831238, "learning_rate": 2.0450613928484448e-06, "loss": 0.01949041709303856, "memory(GiB)": 22.66, "step": 22047, "token_acc": 0.975, "train_speed(iter/s)": 0.954924 }, { "epoch": 0.7162394828314329, "grad_norm": 0.5432167053222656, "learning_rate": 2.0446280990753608e-06, "loss": 0.014702018350362778, "memory(GiB)": 22.66, "step": 22048, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954933 }, { "epoch": 0.7162719682941884, "grad_norm": 0.4386739730834961, "learning_rate": 2.044194839411202e-06, "loss": 0.01846415363252163, "memory(GiB)": 22.66, "step": 22049, "token_acc": 1.0, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.7163044537569437, "grad_norm": 0.30748143792152405, "learning_rate": 2.0437616138609694e-06, "loss": 0.011986907571554184, "memory(GiB)": 22.66, "step": 22050, "token_acc": 1.0, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.7163369392196992, "grad_norm": 0.31290921568870544, "learning_rate": 2.043328422429664e-06, "loss": 0.014777895994484425, "memory(GiB)": 22.66, "step": 22051, "token_acc": 1.0, "train_speed(iter/s)": 0.95496 }, { "epoch": 0.7163694246824546, "grad_norm": 0.24769292771816254, "learning_rate": 2.042895265122283e-06, "loss": 0.01267310231924057, "memory(GiB)": 22.66, "step": 22052, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.71640191014521, "grad_norm": 0.2755332291126251, "learning_rate": 2.0424621419438256e-06, "loss": 0.01543358899652958, "memory(GiB)": 22.66, "step": 22053, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.954979 }, { "epoch": 0.7164343956079654, "grad_norm": 1.6730445623397827, "learning_rate": 2.0420290528992915e-06, "loss": 0.018535912036895752, "memory(GiB)": 22.66, "step": 22054, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954989 }, { "epoch": 0.7164668810707209, "grad_norm": 0.36382749676704407, "learning_rate": 2.0415959979936796e-06, "loss": 0.02024845965206623, "memory(GiB)": 22.66, "step": 22055, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.954998 }, { "epoch": 0.7164993665334762, "grad_norm": 0.41966110467910767, "learning_rate": 2.041162977231989e-06, "loss": 0.01710897870361805, "memory(GiB)": 22.66, "step": 22056, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955007 }, { "epoch": 0.7165318519962317, "grad_norm": 0.23648491501808167, "learning_rate": 2.0407299906192145e-06, "loss": 0.008639472536742687, "memory(GiB)": 22.66, "step": 22057, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.7165643374589871, "grad_norm": 0.3609040677547455, "learning_rate": 2.040297038160354e-06, "loss": 0.017117206007242203, "memory(GiB)": 22.66, "step": 22058, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955026 }, { "epoch": 0.7165968229217425, "grad_norm": 0.2371077835559845, "learning_rate": 2.0398641198604056e-06, "loss": 0.009099800139665604, "memory(GiB)": 22.66, "step": 22059, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.955035 }, { "epoch": 0.7166293083844979, "grad_norm": 0.24116170406341553, "learning_rate": 2.0394312357243667e-06, "loss": 0.01048331893980503, "memory(GiB)": 22.66, "step": 22060, "token_acc": 1.0, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.7166617938472534, "grad_norm": 0.2318975329399109, "learning_rate": 2.0389983857572304e-06, "loss": 0.007993156090378761, "memory(GiB)": 22.66, "step": 22061, "token_acc": 1.0, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.7166942793100087, "grad_norm": 0.20000898838043213, "learning_rate": 2.0385655699639957e-06, "loss": 0.008613109588623047, "memory(GiB)": 22.66, "step": 22062, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955062 }, { "epoch": 0.7167267647727642, "grad_norm": 0.2396182268857956, "learning_rate": 2.038132788349652e-06, "loss": 0.010778119787573814, "memory(GiB)": 22.66, "step": 22063, "token_acc": 1.0, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.7167592502355196, "grad_norm": 0.30885928869247437, "learning_rate": 2.0377000409192016e-06, "loss": 0.012743913568556309, "memory(GiB)": 22.66, "step": 22064, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955075 }, { "epoch": 0.716791735698275, "grad_norm": 0.26642388105392456, "learning_rate": 2.037267327677634e-06, "loss": 0.015292691066861153, "memory(GiB)": 22.66, "step": 22065, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955082 }, { "epoch": 0.7168242211610304, "grad_norm": 0.6549772620201111, "learning_rate": 2.036834648629947e-06, "loss": 0.012734887190163136, "memory(GiB)": 22.66, "step": 22066, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.7168567066237859, "grad_norm": 0.3695884346961975, "learning_rate": 2.0364020037811294e-06, "loss": 0.01812884770333767, "memory(GiB)": 22.66, "step": 22067, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.7168891920865412, "grad_norm": 0.2519327700138092, "learning_rate": 2.035969393136178e-06, "loss": 0.009580383077263832, "memory(GiB)": 22.66, "step": 22068, "token_acc": 1.0, "train_speed(iter/s)": 0.955103 }, { "epoch": 0.7169216775492967, "grad_norm": 0.26064369082450867, "learning_rate": 2.0355368167000846e-06, "loss": 0.007340621668845415, "memory(GiB)": 22.66, "step": 22069, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95511 }, { "epoch": 0.7169541630120521, "grad_norm": 0.38792556524276733, "learning_rate": 2.0351042744778437e-06, "loss": 0.01653018221259117, "memory(GiB)": 22.66, "step": 22070, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.955117 }, { "epoch": 0.7169866484748075, "grad_norm": 0.2837969958782196, "learning_rate": 2.0346717664744444e-06, "loss": 0.00967338215559721, "memory(GiB)": 22.66, "step": 22071, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955123 }, { "epoch": 0.7170191339375629, "grad_norm": 0.275443434715271, "learning_rate": 2.0342392926948793e-06, "loss": 0.013830777257680893, "memory(GiB)": 22.66, "step": 22072, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.7170516194003184, "grad_norm": 1.201866865158081, "learning_rate": 2.0338068531441405e-06, "loss": 0.01614363305270672, "memory(GiB)": 22.66, "step": 22073, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955137 }, { "epoch": 0.7170841048630737, "grad_norm": 0.3147123456001282, "learning_rate": 2.0333744478272183e-06, "loss": 0.010346082970499992, "memory(GiB)": 22.66, "step": 22074, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.7171165903258292, "grad_norm": 0.4151757061481476, "learning_rate": 2.032942076749106e-06, "loss": 0.021892961114645004, "memory(GiB)": 22.66, "step": 22075, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.955151 }, { "epoch": 0.7171490757885846, "grad_norm": 0.3477500379085541, "learning_rate": 2.0325097399147887e-06, "loss": 0.01563083752989769, "memory(GiB)": 22.66, "step": 22076, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955157 }, { "epoch": 0.71718156125134, "grad_norm": 0.2915763854980469, "learning_rate": 2.032077437329259e-06, "loss": 0.014521322213113308, "memory(GiB)": 22.66, "step": 22077, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.7172140467140954, "grad_norm": 0.3457898795604706, "learning_rate": 2.031645168997506e-06, "loss": 0.0163542740046978, "memory(GiB)": 22.66, "step": 22078, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.95517 }, { "epoch": 0.7172465321768509, "grad_norm": 0.2682937681674957, "learning_rate": 2.03121293492452e-06, "loss": 0.009451048448681831, "memory(GiB)": 22.66, "step": 22079, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955176 }, { "epoch": 0.7172790176396063, "grad_norm": 0.40899401903152466, "learning_rate": 2.0307807351152873e-06, "loss": 0.020539402961730957, "memory(GiB)": 22.66, "step": 22080, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955182 }, { "epoch": 0.7173115031023617, "grad_norm": 0.49824148416519165, "learning_rate": 2.0303485695747967e-06, "loss": 0.020553622394800186, "memory(GiB)": 22.66, "step": 22081, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.7173439885651172, "grad_norm": 0.44658362865448, "learning_rate": 2.0299164383080353e-06, "loss": 0.01805988699197769, "memory(GiB)": 22.66, "step": 22082, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.955196 }, { "epoch": 0.7173764740278725, "grad_norm": 0.3947966396808624, "learning_rate": 2.0294843413199943e-06, "loss": 0.017283327877521515, "memory(GiB)": 22.66, "step": 22083, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.955203 }, { "epoch": 0.717408959490628, "grad_norm": 0.3617572486400604, "learning_rate": 2.0290522786156554e-06, "loss": 0.013388087041676044, "memory(GiB)": 22.66, "step": 22084, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.7174414449533834, "grad_norm": 0.27672865986824036, "learning_rate": 2.028620250200009e-06, "loss": 0.00860653631389141, "memory(GiB)": 22.66, "step": 22085, "token_acc": 1.0, "train_speed(iter/s)": 0.955217 }, { "epoch": 0.7174739304161388, "grad_norm": 0.3056899309158325, "learning_rate": 2.028188256078037e-06, "loss": 0.01908034086227417, "memory(GiB)": 22.66, "step": 22086, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.955224 }, { "epoch": 0.7175064158788942, "grad_norm": 0.42459988594055176, "learning_rate": 2.0277562962547292e-06, "loss": 0.01642424985766411, "memory(GiB)": 22.66, "step": 22087, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955231 }, { "epoch": 0.7175389013416497, "grad_norm": 0.3023853898048401, "learning_rate": 2.027324370735072e-06, "loss": 0.013653363101184368, "memory(GiB)": 22.66, "step": 22088, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.717571386804405, "grad_norm": 0.4025173485279083, "learning_rate": 2.0268924795240457e-06, "loss": 0.014086868613958359, "memory(GiB)": 22.66, "step": 22089, "token_acc": 1.0, "train_speed(iter/s)": 0.955246 }, { "epoch": 0.7176038722671605, "grad_norm": 0.37417715787887573, "learning_rate": 2.0264606226266397e-06, "loss": 0.01329244114458561, "memory(GiB)": 22.66, "step": 22090, "token_acc": 1.0, "train_speed(iter/s)": 0.955253 }, { "epoch": 0.7176363577299159, "grad_norm": 0.3955870568752289, "learning_rate": 2.0260288000478324e-06, "loss": 0.01374928466975689, "memory(GiB)": 22.66, "step": 22091, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.7176688431926713, "grad_norm": 0.32494500279426575, "learning_rate": 2.0255970117926145e-06, "loss": 0.011759744957089424, "memory(GiB)": 22.66, "step": 22092, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.7177013286554267, "grad_norm": 0.39515042304992676, "learning_rate": 2.0251652578659643e-06, "loss": 0.02033758908510208, "memory(GiB)": 22.66, "step": 22093, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955275 }, { "epoch": 0.7177338141181822, "grad_norm": 0.3366834819316864, "learning_rate": 2.0247335382728685e-06, "loss": 0.017266612499952316, "memory(GiB)": 22.66, "step": 22094, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.7177662995809375, "grad_norm": 0.3073942959308624, "learning_rate": 2.0243018530183057e-06, "loss": 0.007295048795640469, "memory(GiB)": 22.66, "step": 22095, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955288 }, { "epoch": 0.717798785043693, "grad_norm": 0.4333004057407379, "learning_rate": 2.0238702021072597e-06, "loss": 0.01124822162091732, "memory(GiB)": 22.66, "step": 22096, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955295 }, { "epoch": 0.7178312705064483, "grad_norm": 0.41935276985168457, "learning_rate": 2.0234385855447134e-06, "loss": 0.014516428112983704, "memory(GiB)": 22.66, "step": 22097, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.7178637559692038, "grad_norm": 0.3626314699649811, "learning_rate": 2.023007003335649e-06, "loss": 0.012640384957194328, "memory(GiB)": 22.66, "step": 22098, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955308 }, { "epoch": 0.7178962414319592, "grad_norm": 0.27114301919937134, "learning_rate": 2.0225754554850453e-06, "loss": 0.012050664983689785, "memory(GiB)": 22.66, "step": 22099, "token_acc": 1.0, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.7179287268947147, "grad_norm": 0.26507219672203064, "learning_rate": 2.022143941997883e-06, "loss": 0.01195517834275961, "memory(GiB)": 22.66, "step": 22100, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.71796121235747, "grad_norm": 0.32143455743789673, "learning_rate": 2.021712462879143e-06, "loss": 0.01124256569892168, "memory(GiB)": 22.66, "step": 22101, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.95533 }, { "epoch": 0.7179936978202255, "grad_norm": 0.34904399514198303, "learning_rate": 2.0212810181338054e-06, "loss": 0.012956538237631321, "memory(GiB)": 22.66, "step": 22102, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.7180261832829808, "grad_norm": 0.3764921724796295, "learning_rate": 2.0208496077668514e-06, "loss": 0.015337709337472916, "memory(GiB)": 22.66, "step": 22103, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.7180586687457363, "grad_norm": 0.3870362341403961, "learning_rate": 2.0204182317832564e-06, "loss": 0.014384620822966099, "memory(GiB)": 22.66, "step": 22104, "token_acc": 1.0, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.7180911542084917, "grad_norm": 0.38227981328964233, "learning_rate": 2.019986890188001e-06, "loss": 0.01597467251121998, "memory(GiB)": 22.66, "step": 22105, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955366 }, { "epoch": 0.7181236396712471, "grad_norm": 0.5558002591133118, "learning_rate": 2.019555582986063e-06, "loss": 0.018670575693249702, "memory(GiB)": 22.66, "step": 22106, "token_acc": 1.0, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.7181561251340025, "grad_norm": 0.3249114751815796, "learning_rate": 2.0191243101824227e-06, "loss": 0.010189883410930634, "memory(GiB)": 22.66, "step": 22107, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.718188610596758, "grad_norm": 0.25122082233428955, "learning_rate": 2.018693071782053e-06, "loss": 0.011376926675438881, "memory(GiB)": 22.66, "step": 22108, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.7182210960595133, "grad_norm": 0.28818827867507935, "learning_rate": 2.0182618677899337e-06, "loss": 0.015193035826086998, "memory(GiB)": 22.66, "step": 22109, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955403 }, { "epoch": 0.7182535815222688, "grad_norm": 0.2717618942260742, "learning_rate": 2.0178306982110414e-06, "loss": 0.01504253689199686, "memory(GiB)": 22.66, "step": 22110, "token_acc": 1.0, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.7182860669850242, "grad_norm": 0.23639757931232452, "learning_rate": 2.0173995630503533e-06, "loss": 0.010676412843167782, "memory(GiB)": 22.66, "step": 22111, "token_acc": 1.0, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.7183185524477796, "grad_norm": 0.3770218789577484, "learning_rate": 2.016968462312843e-06, "loss": 0.013169005513191223, "memory(GiB)": 22.66, "step": 22112, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95543 }, { "epoch": 0.718351037910535, "grad_norm": 0.35232579708099365, "learning_rate": 2.016537396003488e-06, "loss": 0.01644960418343544, "memory(GiB)": 22.66, "step": 22113, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955439 }, { "epoch": 0.7183835233732905, "grad_norm": 0.33409667015075684, "learning_rate": 2.0161063641272594e-06, "loss": 0.017129652202129364, "memory(GiB)": 22.66, "step": 22114, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.955448 }, { "epoch": 0.7184160088360458, "grad_norm": 0.37480437755584717, "learning_rate": 2.015675366689138e-06, "loss": 0.016527745872735977, "memory(GiB)": 22.66, "step": 22115, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.955456 }, { "epoch": 0.7184484942988013, "grad_norm": 0.29117533564567566, "learning_rate": 2.015244403694094e-06, "loss": 0.012805011123418808, "memory(GiB)": 22.66, "step": 22116, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.7184809797615567, "grad_norm": 0.4928424060344696, "learning_rate": 2.0148134751471017e-06, "loss": 0.015651168301701546, "memory(GiB)": 22.66, "step": 22117, "token_acc": 0.9863481228668942, "train_speed(iter/s)": 0.955474 }, { "epoch": 0.7185134652243121, "grad_norm": 0.2831549644470215, "learning_rate": 2.014382581053137e-06, "loss": 0.017090238630771637, "memory(GiB)": 22.66, "step": 22118, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955483 }, { "epoch": 0.7185459506870675, "grad_norm": 0.38716477155685425, "learning_rate": 2.0139517214171678e-06, "loss": 0.010766217485070229, "memory(GiB)": 22.66, "step": 22119, "token_acc": 1.0, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.718578436149823, "grad_norm": 0.4028584361076355, "learning_rate": 2.013520896244173e-06, "loss": 0.021347330883145332, "memory(GiB)": 22.66, "step": 22120, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.9555 }, { "epoch": 0.7186109216125783, "grad_norm": 0.39226117730140686, "learning_rate": 2.013090105539121e-06, "loss": 0.01431223377585411, "memory(GiB)": 22.66, "step": 22121, "token_acc": 1.0, "train_speed(iter/s)": 0.955509 }, { "epoch": 0.7186434070753338, "grad_norm": 0.3245033919811249, "learning_rate": 2.012659349306987e-06, "loss": 0.010573316365480423, "memory(GiB)": 22.66, "step": 22122, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.7186758925380892, "grad_norm": 0.24156266450881958, "learning_rate": 2.0122286275527374e-06, "loss": 0.011348412372171879, "memory(GiB)": 22.66, "step": 22123, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.7187083780008446, "grad_norm": 0.32918843626976013, "learning_rate": 2.011797940281347e-06, "loss": 0.014315187931060791, "memory(GiB)": 22.66, "step": 22124, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95553 }, { "epoch": 0.7187408634636, "grad_norm": 0.3528546988964081, "learning_rate": 2.011367287497785e-06, "loss": 0.014845849014818668, "memory(GiB)": 22.66, "step": 22125, "token_acc": 1.0, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.7187733489263555, "grad_norm": 0.29261359572410583, "learning_rate": 2.0109366692070246e-06, "loss": 0.012495080009102821, "memory(GiB)": 22.66, "step": 22126, "token_acc": 1.0, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.7188058343891108, "grad_norm": 0.5378719568252563, "learning_rate": 2.010506085414032e-06, "loss": 0.013291377574205399, "memory(GiB)": 22.66, "step": 22127, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.7188383198518663, "grad_norm": 0.3551001250743866, "learning_rate": 2.010075536123777e-06, "loss": 0.011323230341076851, "memory(GiB)": 22.66, "step": 22128, "token_acc": 1.0, "train_speed(iter/s)": 0.955558 }, { "epoch": 0.7188708053146217, "grad_norm": 0.2686583697795868, "learning_rate": 2.009645021341231e-06, "loss": 0.01139046810567379, "memory(GiB)": 22.66, "step": 22129, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.7189032907773771, "grad_norm": 0.3194139301776886, "learning_rate": 2.0092145410713633e-06, "loss": 0.011655773967504501, "memory(GiB)": 22.66, "step": 22130, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.7189357762401325, "grad_norm": 0.3174408972263336, "learning_rate": 2.008784095319138e-06, "loss": 0.011221567168831825, "memory(GiB)": 22.66, "step": 22131, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955578 }, { "epoch": 0.718968261702888, "grad_norm": 0.40885764360427856, "learning_rate": 2.008353684089526e-06, "loss": 0.019412465393543243, "memory(GiB)": 22.66, "step": 22132, "token_acc": 1.0, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.7190007471656433, "grad_norm": 0.3317736089229584, "learning_rate": 2.007923307387494e-06, "loss": 0.014593657106161118, "memory(GiB)": 22.66, "step": 22133, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.7190332326283988, "grad_norm": 0.3516864776611328, "learning_rate": 2.0074929652180102e-06, "loss": 0.014274581335484982, "memory(GiB)": 22.66, "step": 22134, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955599 }, { "epoch": 0.7190657180911542, "grad_norm": 0.27400749921798706, "learning_rate": 2.007062657586042e-06, "loss": 0.008979933336377144, "memory(GiB)": 22.66, "step": 22135, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955605 }, { "epoch": 0.7190982035539096, "grad_norm": 0.2654437720775604, "learning_rate": 2.0066323844965525e-06, "loss": 0.009171558544039726, "memory(GiB)": 22.66, "step": 22136, "token_acc": 1.0, "train_speed(iter/s)": 0.955613 }, { "epoch": 0.719130689016665, "grad_norm": 0.2963777184486389, "learning_rate": 2.0062021459545096e-06, "loss": 0.013028345070779324, "memory(GiB)": 22.66, "step": 22137, "token_acc": 1.0, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.7191631744794205, "grad_norm": 0.3510165810585022, "learning_rate": 2.0057719419648786e-06, "loss": 0.013760160654783249, "memory(GiB)": 22.66, "step": 22138, "token_acc": 1.0, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.7191956599421758, "grad_norm": 0.4501519799232483, "learning_rate": 2.005341772532626e-06, "loss": 0.018142081797122955, "memory(GiB)": 22.66, "step": 22139, "token_acc": 1.0, "train_speed(iter/s)": 0.955633 }, { "epoch": 0.7192281454049313, "grad_norm": 0.3080527186393738, "learning_rate": 2.004911637662714e-06, "loss": 0.012339831329882145, "memory(GiB)": 22.66, "step": 22140, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.7192606308676867, "grad_norm": 0.49370840191841125, "learning_rate": 2.00448153736011e-06, "loss": 0.014369671232998371, "memory(GiB)": 22.66, "step": 22141, "token_acc": 1.0, "train_speed(iter/s)": 0.955647 }, { "epoch": 0.7192931163304421, "grad_norm": 0.39278846979141235, "learning_rate": 2.0040514716297722e-06, "loss": 0.019805073738098145, "memory(GiB)": 22.66, "step": 22142, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.7193256017931975, "grad_norm": 0.27652454376220703, "learning_rate": 2.003621440476672e-06, "loss": 0.012395218014717102, "memory(GiB)": 22.66, "step": 22143, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955662 }, { "epoch": 0.719358087255953, "grad_norm": 0.31444358825683594, "learning_rate": 2.0031914439057663e-06, "loss": 0.015780534595251083, "memory(GiB)": 22.66, "step": 22144, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.7193905727187084, "grad_norm": 0.47326260805130005, "learning_rate": 2.002761481922022e-06, "loss": 0.028991002589464188, "memory(GiB)": 22.66, "step": 22145, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.7194230581814638, "grad_norm": 0.3014073371887207, "learning_rate": 2.002331554530398e-06, "loss": 0.011015275493264198, "memory(GiB)": 22.66, "step": 22146, "token_acc": 1.0, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.7194555436442193, "grad_norm": 0.2639494836330414, "learning_rate": 2.001901661735857e-06, "loss": 0.013704207725822926, "memory(GiB)": 22.66, "step": 22147, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.7194880291069746, "grad_norm": 0.403489351272583, "learning_rate": 2.0014718035433624e-06, "loss": 0.017447972670197487, "memory(GiB)": 22.66, "step": 22148, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.7195205145697301, "grad_norm": 0.47492462396621704, "learning_rate": 2.0010419799578735e-06, "loss": 0.015064063481986523, "memory(GiB)": 22.66, "step": 22149, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955702 }, { "epoch": 0.7195530000324855, "grad_norm": 0.2840753495693207, "learning_rate": 2.0006121909843534e-06, "loss": 0.010287581942975521, "memory(GiB)": 22.66, "step": 22150, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955709 }, { "epoch": 0.7195854854952409, "grad_norm": 0.202610582113266, "learning_rate": 2.0001824366277596e-06, "loss": 0.00926200207322836, "memory(GiB)": 22.66, "step": 22151, "token_acc": 1.0, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.7196179709579963, "grad_norm": 0.5000075697898865, "learning_rate": 1.9997527168930526e-06, "loss": 0.015189086087048054, "memory(GiB)": 22.66, "step": 22152, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.7196504564207518, "grad_norm": 0.37667617201805115, "learning_rate": 1.999323031785193e-06, "loss": 0.013710783794522285, "memory(GiB)": 22.66, "step": 22153, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.95573 }, { "epoch": 0.7196829418835071, "grad_norm": 0.2025696188211441, "learning_rate": 1.998893381309141e-06, "loss": 0.005367442034184933, "memory(GiB)": 22.66, "step": 22154, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955737 }, { "epoch": 0.7197154273462626, "grad_norm": 0.33845439553260803, "learning_rate": 1.9984637654698523e-06, "loss": 0.013162019662559032, "memory(GiB)": 22.66, "step": 22155, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955744 }, { "epoch": 0.719747912809018, "grad_norm": 0.5490901470184326, "learning_rate": 1.998034184272287e-06, "loss": 0.02005467563867569, "memory(GiB)": 22.66, "step": 22156, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.7197803982717734, "grad_norm": 0.3011051118373871, "learning_rate": 1.9976046377214026e-06, "loss": 0.01514984667301178, "memory(GiB)": 22.66, "step": 22157, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.7198128837345288, "grad_norm": 0.3919152021408081, "learning_rate": 1.997175125822158e-06, "loss": 0.010117486119270325, "memory(GiB)": 22.66, "step": 22158, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.955766 }, { "epoch": 0.7198453691972843, "grad_norm": 0.38756340742111206, "learning_rate": 1.9967456485795085e-06, "loss": 0.017884861677885056, "memory(GiB)": 22.66, "step": 22159, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955773 }, { "epoch": 0.7198778546600396, "grad_norm": 0.3882614076137543, "learning_rate": 1.996316205998411e-06, "loss": 0.015556433238089085, "memory(GiB)": 22.66, "step": 22160, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955779 }, { "epoch": 0.7199103401227951, "grad_norm": 0.39801862835884094, "learning_rate": 1.995886798083822e-06, "loss": 0.017123784869909286, "memory(GiB)": 22.66, "step": 22161, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955786 }, { "epoch": 0.7199428255855504, "grad_norm": 0.39226704835891724, "learning_rate": 1.9954574248407e-06, "loss": 0.015460923314094543, "memory(GiB)": 22.66, "step": 22162, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.7199753110483059, "grad_norm": 0.8850575089454651, "learning_rate": 1.995028086273997e-06, "loss": 0.017834896221756935, "memory(GiB)": 22.66, "step": 22163, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.7200077965110613, "grad_norm": 0.4963058531284332, "learning_rate": 1.9945987823886686e-06, "loss": 0.017977021634578705, "memory(GiB)": 22.66, "step": 22164, "token_acc": 1.0, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.7200402819738168, "grad_norm": 0.33529412746429443, "learning_rate": 1.994169513189671e-06, "loss": 0.011672765016555786, "memory(GiB)": 22.66, "step": 22165, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955817 }, { "epoch": 0.7200727674365721, "grad_norm": 0.2951173484325409, "learning_rate": 1.9937402786819584e-06, "loss": 0.013256927952170372, "memory(GiB)": 22.66, "step": 22166, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.7201052528993276, "grad_norm": 0.2508716583251953, "learning_rate": 1.993311078870485e-06, "loss": 0.011812879703938961, "memory(GiB)": 22.66, "step": 22167, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955834 }, { "epoch": 0.720137738362083, "grad_norm": 0.25332382321357727, "learning_rate": 1.9928819137602023e-06, "loss": 0.009126376360654831, "memory(GiB)": 22.66, "step": 22168, "token_acc": 1.0, "train_speed(iter/s)": 0.955843 }, { "epoch": 0.7201702238248384, "grad_norm": 0.3786347806453705, "learning_rate": 1.992452783356067e-06, "loss": 0.01550906989723444, "memory(GiB)": 22.66, "step": 22169, "token_acc": 1.0, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.7202027092875938, "grad_norm": 0.3375574052333832, "learning_rate": 1.9920236876630254e-06, "loss": 0.014154577627778053, "memory(GiB)": 22.66, "step": 22170, "token_acc": 1.0, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.7202351947503493, "grad_norm": 0.4115927815437317, "learning_rate": 1.991594626686038e-06, "loss": 0.015635661780834198, "memory(GiB)": 22.66, "step": 22171, "token_acc": 1.0, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.7202676802131046, "grad_norm": 0.4384131133556366, "learning_rate": 1.991165600430051e-06, "loss": 0.009191470220685005, "memory(GiB)": 22.66, "step": 22172, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.7203001656758601, "grad_norm": 0.3686206042766571, "learning_rate": 1.9907366089000196e-06, "loss": 0.015455870889127254, "memory(GiB)": 22.66, "step": 22173, "token_acc": 1.0, "train_speed(iter/s)": 0.955889 }, { "epoch": 0.7203326511386154, "grad_norm": 0.3526075780391693, "learning_rate": 1.9903076521008914e-06, "loss": 0.014724661596119404, "memory(GiB)": 22.66, "step": 22174, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955899 }, { "epoch": 0.7203651366013709, "grad_norm": 0.46468019485473633, "learning_rate": 1.9898787300376183e-06, "loss": 0.019787650555372238, "memory(GiB)": 22.66, "step": 22175, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.7203976220641263, "grad_norm": 0.34437650442123413, "learning_rate": 1.989449842715151e-06, "loss": 0.016997240483760834, "memory(GiB)": 22.66, "step": 22176, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.955917 }, { "epoch": 0.7204301075268817, "grad_norm": 0.4641103446483612, "learning_rate": 1.9890209901384417e-06, "loss": 0.01464551966637373, "memory(GiB)": 22.66, "step": 22177, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955925 }, { "epoch": 0.7204625929896371, "grad_norm": 0.2747313976287842, "learning_rate": 1.9885921723124365e-06, "loss": 0.01057938951998949, "memory(GiB)": 22.66, "step": 22178, "token_acc": 1.0, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.7204950784523926, "grad_norm": 0.49136823415756226, "learning_rate": 1.988163389242085e-06, "loss": 0.01989293284714222, "memory(GiB)": 22.66, "step": 22179, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.7205275639151479, "grad_norm": 0.4289034605026245, "learning_rate": 1.9877346409323367e-06, "loss": 0.014243332669138908, "memory(GiB)": 22.66, "step": 22180, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.7205600493779034, "grad_norm": 0.318337619304657, "learning_rate": 1.9873059273881406e-06, "loss": 0.012430699542164803, "memory(GiB)": 22.66, "step": 22181, "token_acc": 1.0, "train_speed(iter/s)": 0.95596 }, { "epoch": 0.7205925348406588, "grad_norm": 0.3642803430557251, "learning_rate": 1.986877248614445e-06, "loss": 0.011186089366674423, "memory(GiB)": 22.66, "step": 22182, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.7206250203034142, "grad_norm": 0.3183276653289795, "learning_rate": 1.986448604616195e-06, "loss": 0.01354594063013792, "memory(GiB)": 22.66, "step": 22183, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955974 }, { "epoch": 0.7206575057661696, "grad_norm": 0.3863021433353424, "learning_rate": 1.9860199953983396e-06, "loss": 0.017408154904842377, "memory(GiB)": 22.66, "step": 22184, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.7206899912289251, "grad_norm": 0.25441810488700867, "learning_rate": 1.9855914209658246e-06, "loss": 0.01052049919962883, "memory(GiB)": 22.66, "step": 22185, "token_acc": 1.0, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.7207224766916804, "grad_norm": 0.429188996553421, "learning_rate": 1.9851628813235983e-06, "loss": 0.015882277861237526, "memory(GiB)": 22.66, "step": 22186, "token_acc": 1.0, "train_speed(iter/s)": 0.955995 }, { "epoch": 0.7207549621544359, "grad_norm": 0.3729124367237091, "learning_rate": 1.9847343764766034e-06, "loss": 0.01822587102651596, "memory(GiB)": 22.66, "step": 22187, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.7207874476171913, "grad_norm": 0.29449698328971863, "learning_rate": 1.9843059064297877e-06, "loss": 0.0100696487352252, "memory(GiB)": 22.66, "step": 22188, "token_acc": 1.0, "train_speed(iter/s)": 0.956008 }, { "epoch": 0.7208199330799467, "grad_norm": 0.44860243797302246, "learning_rate": 1.9838774711880954e-06, "loss": 0.019681688398122787, "memory(GiB)": 22.66, "step": 22189, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.7208524185427021, "grad_norm": 0.36581453680992126, "learning_rate": 1.983449070756473e-06, "loss": 0.015548871830105782, "memory(GiB)": 22.66, "step": 22190, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.7208849040054576, "grad_norm": 0.3173091411590576, "learning_rate": 1.983020705139862e-06, "loss": 0.01356610469520092, "memory(GiB)": 22.66, "step": 22191, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.7209173894682129, "grad_norm": 0.6833610534667969, "learning_rate": 1.9825923743432092e-06, "loss": 0.011598795652389526, "memory(GiB)": 22.66, "step": 22192, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956034 }, { "epoch": 0.7209498749309684, "grad_norm": 0.24605613946914673, "learning_rate": 1.982164078371453e-06, "loss": 0.006538392975926399, "memory(GiB)": 22.66, "step": 22193, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956041 }, { "epoch": 0.7209823603937238, "grad_norm": 0.40447527170181274, "learning_rate": 1.981735817229542e-06, "loss": 0.01329873874783516, "memory(GiB)": 22.66, "step": 22194, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.7210148458564792, "grad_norm": 0.318386048078537, "learning_rate": 1.9813075909224182e-06, "loss": 0.010302698239684105, "memory(GiB)": 22.66, "step": 22195, "token_acc": 1.0, "train_speed(iter/s)": 0.956055 }, { "epoch": 0.7210473313192346, "grad_norm": 0.4546926021575928, "learning_rate": 1.9808793994550213e-06, "loss": 0.011732960119843483, "memory(GiB)": 22.66, "step": 22196, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956062 }, { "epoch": 0.7210798167819901, "grad_norm": 0.2797258496284485, "learning_rate": 1.9804512428322966e-06, "loss": 0.010505862534046173, "memory(GiB)": 22.66, "step": 22197, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.7211123022447454, "grad_norm": 0.2802923023700714, "learning_rate": 1.9800231210591797e-06, "loss": 0.014808396808803082, "memory(GiB)": 22.66, "step": 22198, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956075 }, { "epoch": 0.7211447877075009, "grad_norm": 0.3283328711986542, "learning_rate": 1.979595034140619e-06, "loss": 0.00874311476945877, "memory(GiB)": 22.66, "step": 22199, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.7211772731702563, "grad_norm": 0.21430321037769318, "learning_rate": 1.97916698208155e-06, "loss": 0.008332695811986923, "memory(GiB)": 22.66, "step": 22200, "token_acc": 1.0, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.7212097586330117, "grad_norm": 0.19606980681419373, "learning_rate": 1.9787389648869173e-06, "loss": 0.007786272093653679, "memory(GiB)": 22.66, "step": 22201, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.7212422440957671, "grad_norm": 0.2951720356941223, "learning_rate": 1.9783109825616553e-06, "loss": 0.0178836677223444, "memory(GiB)": 22.66, "step": 22202, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.7212747295585226, "grad_norm": 0.4001706540584564, "learning_rate": 1.977883035110707e-06, "loss": 0.012936495244503021, "memory(GiB)": 22.66, "step": 22203, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.7213072150212779, "grad_norm": 0.5050044655799866, "learning_rate": 1.977455122539011e-06, "loss": 0.017769141122698784, "memory(GiB)": 22.66, "step": 22204, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.7213397004840334, "grad_norm": 0.40209129452705383, "learning_rate": 1.977027244851508e-06, "loss": 0.011938950046896935, "memory(GiB)": 22.66, "step": 22205, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.7213721859467888, "grad_norm": 0.4061623513698578, "learning_rate": 1.9765994020531314e-06, "loss": 0.011315176263451576, "memory(GiB)": 22.66, "step": 22206, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.7214046714095442, "grad_norm": 0.42922738194465637, "learning_rate": 1.9761715941488223e-06, "loss": 0.02014080062508583, "memory(GiB)": 22.66, "step": 22207, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.7214371568722997, "grad_norm": 0.3032561242580414, "learning_rate": 1.9757438211435183e-06, "loss": 0.01656632497906685, "memory(GiB)": 22.66, "step": 22208, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.95615 }, { "epoch": 0.7214696423350551, "grad_norm": 0.3804715573787689, "learning_rate": 1.975316083042157e-06, "loss": 0.017163822427392006, "memory(GiB)": 22.66, "step": 22209, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.7215021277978105, "grad_norm": 0.44190478324890137, "learning_rate": 1.974888379849672e-06, "loss": 0.010898074135184288, "memory(GiB)": 22.66, "step": 22210, "token_acc": 1.0, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.7215346132605659, "grad_norm": 0.32851383090019226, "learning_rate": 1.9744607115710016e-06, "loss": 0.013780927285552025, "memory(GiB)": 22.66, "step": 22211, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.7215670987233214, "grad_norm": 0.38780686259269714, "learning_rate": 1.9740330782110823e-06, "loss": 0.01716477796435356, "memory(GiB)": 22.66, "step": 22212, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.7215995841860767, "grad_norm": 0.29835957288742065, "learning_rate": 1.9736054797748485e-06, "loss": 0.015659313648939133, "memory(GiB)": 22.66, "step": 22213, "token_acc": 1.0, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.7216320696488322, "grad_norm": 0.2702289819717407, "learning_rate": 1.9731779162672367e-06, "loss": 0.007488982751965523, "memory(GiB)": 22.66, "step": 22214, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.7216645551115876, "grad_norm": 1.7338756322860718, "learning_rate": 1.9727503876931793e-06, "loss": 0.021494343876838684, "memory(GiB)": 22.66, "step": 22215, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.721697040574343, "grad_norm": 0.4245189428329468, "learning_rate": 1.972322894057611e-06, "loss": 0.015331676229834557, "memory(GiB)": 22.66, "step": 22216, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956206 }, { "epoch": 0.7217295260370984, "grad_norm": 0.3654559850692749, "learning_rate": 1.9718954353654674e-06, "loss": 0.01689109578728676, "memory(GiB)": 22.66, "step": 22217, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.7217620114998539, "grad_norm": 0.5140244364738464, "learning_rate": 1.9714680116216817e-06, "loss": 0.01452699676156044, "memory(GiB)": 22.66, "step": 22218, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95622 }, { "epoch": 0.7217944969626092, "grad_norm": 0.3696804940700531, "learning_rate": 1.971040622831185e-06, "loss": 0.011777945794165134, "memory(GiB)": 22.66, "step": 22219, "token_acc": 1.0, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.7218269824253647, "grad_norm": 0.5310850143432617, "learning_rate": 1.970613268998913e-06, "loss": 0.01593117043375969, "memory(GiB)": 22.66, "step": 22220, "token_acc": 1.0, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.72185946788812, "grad_norm": 0.34425100684165955, "learning_rate": 1.9701859501297916e-06, "loss": 0.011289263144135475, "memory(GiB)": 22.66, "step": 22221, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.956239 }, { "epoch": 0.7218919533508755, "grad_norm": 0.3967081904411316, "learning_rate": 1.969758666228761e-06, "loss": 0.01384318619966507, "memory(GiB)": 22.66, "step": 22222, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.956245 }, { "epoch": 0.7219244388136309, "grad_norm": 0.5519124865531921, "learning_rate": 1.9693314173007465e-06, "loss": 0.021672450006008148, "memory(GiB)": 22.66, "step": 22223, "token_acc": 0.996, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.7219569242763864, "grad_norm": 0.34287047386169434, "learning_rate": 1.9689042033506835e-06, "loss": 0.012900525704026222, "memory(GiB)": 22.66, "step": 22224, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956259 }, { "epoch": 0.7219894097391417, "grad_norm": 0.47442612051963806, "learning_rate": 1.9684770243834984e-06, "loss": 0.01244364120066166, "memory(GiB)": 22.66, "step": 22225, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.7220218952018972, "grad_norm": 0.22107921540737152, "learning_rate": 1.9680498804041217e-06, "loss": 0.011631257832050323, "memory(GiB)": 22.66, "step": 22226, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.956275 }, { "epoch": 0.7220543806646526, "grad_norm": 0.28350329399108887, "learning_rate": 1.9676227714174883e-06, "loss": 0.013440930284559727, "memory(GiB)": 22.66, "step": 22227, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.722086866127408, "grad_norm": 0.3055601418018341, "learning_rate": 1.967195697428523e-06, "loss": 0.01344209909439087, "memory(GiB)": 22.66, "step": 22228, "token_acc": 1.0, "train_speed(iter/s)": 0.956293 }, { "epoch": 0.7221193515901634, "grad_norm": 0.31251829862594604, "learning_rate": 1.966768658442157e-06, "loss": 0.019339950755238533, "memory(GiB)": 22.66, "step": 22229, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956301 }, { "epoch": 0.7221518370529189, "grad_norm": 0.29339930415153503, "learning_rate": 1.9663416544633163e-06, "loss": 0.011213542893528938, "memory(GiB)": 22.66, "step": 22230, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.7221843225156742, "grad_norm": 0.4277442991733551, "learning_rate": 1.965914685496931e-06, "loss": 0.015565150417387486, "memory(GiB)": 22.66, "step": 22231, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.7222168079784297, "grad_norm": 0.33742305636405945, "learning_rate": 1.965487751547928e-06, "loss": 0.01629728265106678, "memory(GiB)": 22.66, "step": 22232, "token_acc": 1.0, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.722249293441185, "grad_norm": 0.34091827273368835, "learning_rate": 1.965060852621238e-06, "loss": 0.010211450047791004, "memory(GiB)": 22.66, "step": 22233, "token_acc": 1.0, "train_speed(iter/s)": 0.956337 }, { "epoch": 0.7222817789039405, "grad_norm": 0.3545563519001007, "learning_rate": 1.9646339887217824e-06, "loss": 0.01132508460432291, "memory(GiB)": 22.66, "step": 22234, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.7223142643666959, "grad_norm": 0.38677680492401123, "learning_rate": 1.9642071598544915e-06, "loss": 0.008956479839980602, "memory(GiB)": 22.66, "step": 22235, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956354 }, { "epoch": 0.7223467498294514, "grad_norm": 0.2890215516090393, "learning_rate": 1.96378036602429e-06, "loss": 0.010951749980449677, "memory(GiB)": 22.66, "step": 22236, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.7223792352922067, "grad_norm": 0.21892571449279785, "learning_rate": 1.9633536072361063e-06, "loss": 0.00881291925907135, "memory(GiB)": 22.66, "step": 22237, "token_acc": 0.996, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.7224117207549622, "grad_norm": 0.2715242803096771, "learning_rate": 1.962926883494862e-06, "loss": 0.009689165279269218, "memory(GiB)": 22.66, "step": 22238, "token_acc": 1.0, "train_speed(iter/s)": 0.956381 }, { "epoch": 0.7224442062177175, "grad_norm": 0.3115813434123993, "learning_rate": 1.9625001948054838e-06, "loss": 0.007614540867507458, "memory(GiB)": 22.66, "step": 22239, "token_acc": 1.0, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.722476691680473, "grad_norm": 0.29786768555641174, "learning_rate": 1.9620735411728962e-06, "loss": 0.0073565272614359856, "memory(GiB)": 22.66, "step": 22240, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.7225091771432284, "grad_norm": 0.48780322074890137, "learning_rate": 1.9616469226020233e-06, "loss": 0.015229200944304466, "memory(GiB)": 22.66, "step": 22241, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.7225416626059838, "grad_norm": 0.33872362971305847, "learning_rate": 1.9612203390977914e-06, "loss": 0.012084275484085083, "memory(GiB)": 22.66, "step": 22242, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.7225741480687392, "grad_norm": 0.2700614929199219, "learning_rate": 1.960793790665119e-06, "loss": 0.015539832413196564, "memory(GiB)": 22.66, "step": 22243, "token_acc": 1.0, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.7226066335314947, "grad_norm": 0.42265430092811584, "learning_rate": 1.9603672773089316e-06, "loss": 0.018146593123674393, "memory(GiB)": 22.66, "step": 22244, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.956425 }, { "epoch": 0.72263911899425, "grad_norm": 0.3041546940803528, "learning_rate": 1.959940799034151e-06, "loss": 0.008621443063020706, "memory(GiB)": 22.66, "step": 22245, "token_acc": 1.0, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.7226716044570055, "grad_norm": 0.32550153136253357, "learning_rate": 1.9595143558457024e-06, "loss": 0.01437524426728487, "memory(GiB)": 22.66, "step": 22246, "token_acc": 1.0, "train_speed(iter/s)": 0.956438 }, { "epoch": 0.7227040899197609, "grad_norm": 0.45945966243743896, "learning_rate": 1.959087947748503e-06, "loss": 0.016722915694117546, "memory(GiB)": 22.66, "step": 22247, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956445 }, { "epoch": 0.7227365753825163, "grad_norm": 0.45001283288002014, "learning_rate": 1.958661574747478e-06, "loss": 0.019352983683347702, "memory(GiB)": 22.66, "step": 22248, "token_acc": 1.0, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.7227690608452717, "grad_norm": 0.44193220138549805, "learning_rate": 1.9582352368475425e-06, "loss": 0.017824187874794006, "memory(GiB)": 22.66, "step": 22249, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956459 }, { "epoch": 0.7228015463080272, "grad_norm": 0.4057106673717499, "learning_rate": 1.957808934053625e-06, "loss": 0.011128140613436699, "memory(GiB)": 22.66, "step": 22250, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.7228340317707825, "grad_norm": 0.4250495433807373, "learning_rate": 1.9573826663706397e-06, "loss": 0.010910875163972378, "memory(GiB)": 22.66, "step": 22251, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.722866517233538, "grad_norm": 0.26222968101501465, "learning_rate": 1.9569564338035097e-06, "loss": 0.011321227997541428, "memory(GiB)": 22.66, "step": 22252, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.7228990026962934, "grad_norm": 0.25062665343284607, "learning_rate": 1.956530236357151e-06, "loss": 0.008148848079144955, "memory(GiB)": 22.66, "step": 22253, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956486 }, { "epoch": 0.7229314881590488, "grad_norm": 0.30789715051651, "learning_rate": 1.9561040740364846e-06, "loss": 0.01711656153202057, "memory(GiB)": 22.66, "step": 22254, "token_acc": 1.0, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.7229639736218042, "grad_norm": 0.9189385175704956, "learning_rate": 1.955677946846428e-06, "loss": 0.02577369287610054, "memory(GiB)": 22.66, "step": 22255, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956499 }, { "epoch": 0.7229964590845597, "grad_norm": 0.4008757174015045, "learning_rate": 1.9552518547919e-06, "loss": 0.0107872374355793, "memory(GiB)": 22.66, "step": 22256, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.723028944547315, "grad_norm": 0.31306588649749756, "learning_rate": 1.9548257978778203e-06, "loss": 0.011258797720074654, "memory(GiB)": 22.66, "step": 22257, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956512 }, { "epoch": 0.7230614300100705, "grad_norm": 0.2644082307815552, "learning_rate": 1.9543997761091016e-06, "loss": 0.015151230618357658, "memory(GiB)": 22.66, "step": 22258, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.956519 }, { "epoch": 0.7230939154728259, "grad_norm": 0.44332805275917053, "learning_rate": 1.9539737894906637e-06, "loss": 0.020571667701005936, "memory(GiB)": 22.66, "step": 22259, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.956527 }, { "epoch": 0.7231264009355813, "grad_norm": 0.23746730387210846, "learning_rate": 1.9535478380274224e-06, "loss": 0.008689181879162788, "memory(GiB)": 22.66, "step": 22260, "token_acc": 1.0, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.7231588863983367, "grad_norm": 0.3263922333717346, "learning_rate": 1.953121921724296e-06, "loss": 0.01610838994383812, "memory(GiB)": 22.66, "step": 22261, "token_acc": 0.9823008849557522, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.7231913718610922, "grad_norm": 0.3924493193626404, "learning_rate": 1.9526960405861955e-06, "loss": 0.013532276265323162, "memory(GiB)": 22.66, "step": 22262, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.7232238573238475, "grad_norm": 0.37386688590049744, "learning_rate": 1.952270194618039e-06, "loss": 0.011576691642403603, "memory(GiB)": 22.66, "step": 22263, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.723256342786603, "grad_norm": 0.2882978916168213, "learning_rate": 1.951844383824741e-06, "loss": 0.009748304262757301, "memory(GiB)": 22.66, "step": 22264, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956568 }, { "epoch": 0.7232888282493584, "grad_norm": 0.3017612099647522, "learning_rate": 1.9514186082112172e-06, "loss": 0.012620396912097931, "memory(GiB)": 22.66, "step": 22265, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.7233213137121138, "grad_norm": 0.40059348940849304, "learning_rate": 1.95099286778238e-06, "loss": 0.01744648814201355, "memory(GiB)": 22.66, "step": 22266, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.7233537991748692, "grad_norm": 0.21709319949150085, "learning_rate": 1.9505671625431418e-06, "loss": 0.010458119213581085, "memory(GiB)": 22.66, "step": 22267, "token_acc": 1.0, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.7233862846376247, "grad_norm": 0.3849412500858307, "learning_rate": 1.950141492498418e-06, "loss": 0.013992154970765114, "memory(GiB)": 22.66, "step": 22268, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.72341877010038, "grad_norm": 0.26850658655166626, "learning_rate": 1.949715857653122e-06, "loss": 0.01431370060890913, "memory(GiB)": 22.66, "step": 22269, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.7234512555631355, "grad_norm": 0.47125867009162903, "learning_rate": 1.9492902580121635e-06, "loss": 0.015770751982927322, "memory(GiB)": 22.66, "step": 22270, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.7234837410258909, "grad_norm": 0.4641605317592621, "learning_rate": 1.948864693580458e-06, "loss": 0.01539126317948103, "memory(GiB)": 22.66, "step": 22271, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956619 }, { "epoch": 0.7235162264886463, "grad_norm": 0.2714475691318512, "learning_rate": 1.948439164362911e-06, "loss": 0.01136347558349371, "memory(GiB)": 22.66, "step": 22272, "token_acc": 1.0, "train_speed(iter/s)": 0.956626 }, { "epoch": 0.7235487119514018, "grad_norm": 0.5376740097999573, "learning_rate": 1.9480136703644394e-06, "loss": 0.01155221275985241, "memory(GiB)": 22.66, "step": 22273, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.7235811974141572, "grad_norm": 0.29156625270843506, "learning_rate": 1.947588211589954e-06, "loss": 0.013283949345350266, "memory(GiB)": 22.66, "step": 22274, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956639 }, { "epoch": 0.7236136828769126, "grad_norm": 0.3400672674179077, "learning_rate": 1.9471627880443624e-06, "loss": 0.01100659929215908, "memory(GiB)": 22.66, "step": 22275, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.723646168339668, "grad_norm": 0.3572857677936554, "learning_rate": 1.946737399732576e-06, "loss": 0.01483749970793724, "memory(GiB)": 22.66, "step": 22276, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.7236786538024235, "grad_norm": 0.39183446764945984, "learning_rate": 1.9463120466595013e-06, "loss": 0.019913500174880028, "memory(GiB)": 22.66, "step": 22277, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.95666 }, { "epoch": 0.7237111392651788, "grad_norm": 0.45316410064697266, "learning_rate": 1.9458867288300533e-06, "loss": 0.010538922622799873, "memory(GiB)": 22.66, "step": 22278, "token_acc": 1.0, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.7237436247279343, "grad_norm": 0.3470277190208435, "learning_rate": 1.9454614462491356e-06, "loss": 0.01909259893000126, "memory(GiB)": 22.66, "step": 22279, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956675 }, { "epoch": 0.7237761101906897, "grad_norm": 0.2555634379386902, "learning_rate": 1.9450361989216606e-06, "loss": 0.015394177287817001, "memory(GiB)": 22.66, "step": 22280, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.7238085956534451, "grad_norm": 0.3028589189052582, "learning_rate": 1.9446109868525313e-06, "loss": 0.01325017400085926, "memory(GiB)": 22.66, "step": 22281, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.7238410811162005, "grad_norm": 0.33513644337654114, "learning_rate": 1.9441858100466587e-06, "loss": 0.012974347919225693, "memory(GiB)": 22.66, "step": 22282, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.723873566578956, "grad_norm": 0.2951166033744812, "learning_rate": 1.9437606685089495e-06, "loss": 0.012581740505993366, "memory(GiB)": 22.66, "step": 22283, "token_acc": 1.0, "train_speed(iter/s)": 0.956701 }, { "epoch": 0.7239060520417113, "grad_norm": 0.4437733292579651, "learning_rate": 1.943335562244311e-06, "loss": 0.01628362573683262, "memory(GiB)": 22.66, "step": 22284, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.7239385375044668, "grad_norm": 0.32248812913894653, "learning_rate": 1.9429104912576468e-06, "loss": 0.019534554332494736, "memory(GiB)": 22.66, "step": 22285, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956715 }, { "epoch": 0.7239710229672222, "grad_norm": 0.2653089165687561, "learning_rate": 1.9424854555538646e-06, "loss": 0.007881957106292248, "memory(GiB)": 22.66, "step": 22286, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956722 }, { "epoch": 0.7240035084299776, "grad_norm": 0.30101531744003296, "learning_rate": 1.94206045513787e-06, "loss": 0.012222141958773136, "memory(GiB)": 22.66, "step": 22287, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956729 }, { "epoch": 0.724035993892733, "grad_norm": 0.26289239525794983, "learning_rate": 1.9416354900145677e-06, "loss": 0.014645608142018318, "memory(GiB)": 22.66, "step": 22288, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.7240684793554885, "grad_norm": 0.38170602917671204, "learning_rate": 1.941210560188865e-06, "loss": 0.011537816375494003, "memory(GiB)": 22.66, "step": 22289, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956746 }, { "epoch": 0.7241009648182438, "grad_norm": 0.43107709288597107, "learning_rate": 1.940785665665661e-06, "loss": 0.013582482002675533, "memory(GiB)": 22.66, "step": 22290, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.7241334502809993, "grad_norm": 0.23911191523075104, "learning_rate": 1.9403608064498626e-06, "loss": 0.01235822681337595, "memory(GiB)": 22.66, "step": 22291, "token_acc": 1.0, "train_speed(iter/s)": 0.956764 }, { "epoch": 0.7241659357437547, "grad_norm": 0.4156973361968994, "learning_rate": 1.939935982546373e-06, "loss": 0.01366075687110424, "memory(GiB)": 22.66, "step": 22292, "token_acc": 1.0, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.7241984212065101, "grad_norm": 0.36347296833992004, "learning_rate": 1.939511193960096e-06, "loss": 0.011713853105902672, "memory(GiB)": 22.66, "step": 22293, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956781 }, { "epoch": 0.7242309066692655, "grad_norm": 0.2539578080177307, "learning_rate": 1.939086440695932e-06, "loss": 0.009883682243525982, "memory(GiB)": 22.66, "step": 22294, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.724263392132021, "grad_norm": 0.3882378041744232, "learning_rate": 1.9386617227587853e-06, "loss": 0.010921983048319817, "memory(GiB)": 22.66, "step": 22295, "token_acc": 1.0, "train_speed(iter/s)": 0.9568 }, { "epoch": 0.7242958775947763, "grad_norm": 0.3899695575237274, "learning_rate": 1.938237040153556e-06, "loss": 0.011378767900168896, "memory(GiB)": 22.66, "step": 22296, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.7243283630575318, "grad_norm": 0.4290228486061096, "learning_rate": 1.9378123928851485e-06, "loss": 0.018692515790462494, "memory(GiB)": 22.66, "step": 22297, "token_acc": 0.9883040935672515, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.7243608485202871, "grad_norm": 0.2693641781806946, "learning_rate": 1.93738778095846e-06, "loss": 0.012400036677718163, "memory(GiB)": 22.66, "step": 22298, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956825 }, { "epoch": 0.7243933339830426, "grad_norm": 0.19682617485523224, "learning_rate": 1.936963204378395e-06, "loss": 0.013668771833181381, "memory(GiB)": 22.66, "step": 22299, "token_acc": 1.0, "train_speed(iter/s)": 0.956833 }, { "epoch": 0.724425819445798, "grad_norm": 0.3009585738182068, "learning_rate": 1.936538663149847e-06, "loss": 0.01461123675107956, "memory(GiB)": 22.66, "step": 22300, "token_acc": 1.0, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.7244583049085535, "grad_norm": 0.3504922688007355, "learning_rate": 1.9361141572777247e-06, "loss": 0.014710902236402035, "memory(GiB)": 22.66, "step": 22301, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.7244907903713088, "grad_norm": 0.3169780969619751, "learning_rate": 1.935689686766921e-06, "loss": 0.00958048366010189, "memory(GiB)": 22.66, "step": 22302, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.7245232758340643, "grad_norm": 0.4410291910171509, "learning_rate": 1.935265251622337e-06, "loss": 0.013342434540390968, "memory(GiB)": 22.66, "step": 22303, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.95686 }, { "epoch": 0.7245557612968196, "grad_norm": 0.40960028767585754, "learning_rate": 1.9348408518488733e-06, "loss": 0.014482252299785614, "memory(GiB)": 22.66, "step": 22304, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956867 }, { "epoch": 0.7245882467595751, "grad_norm": 0.37656736373901367, "learning_rate": 1.9344164874514214e-06, "loss": 0.016520295292139053, "memory(GiB)": 22.66, "step": 22305, "token_acc": 1.0, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.7246207322223305, "grad_norm": 0.4226779043674469, "learning_rate": 1.933992158434888e-06, "loss": 0.013708262704312801, "memory(GiB)": 22.66, "step": 22306, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.724653217685086, "grad_norm": 0.33922243118286133, "learning_rate": 1.9335678648041632e-06, "loss": 0.016128920018672943, "memory(GiB)": 22.66, "step": 22307, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956886 }, { "epoch": 0.7246857031478413, "grad_norm": 0.4964914619922638, "learning_rate": 1.9331436065641495e-06, "loss": 0.017247876152396202, "memory(GiB)": 22.66, "step": 22308, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.7247181886105968, "grad_norm": 0.401891827583313, "learning_rate": 1.932719383719738e-06, "loss": 0.01248011365532875, "memory(GiB)": 22.66, "step": 22309, "token_acc": 1.0, "train_speed(iter/s)": 0.956899 }, { "epoch": 0.7247506740733521, "grad_norm": 0.3532806634902954, "learning_rate": 1.9322951962758274e-06, "loss": 0.01627199724316597, "memory(GiB)": 22.66, "step": 22310, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.7247831595361076, "grad_norm": 0.21908678114414215, "learning_rate": 1.931871044237314e-06, "loss": 0.0075413864105939865, "memory(GiB)": 22.66, "step": 22311, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.724815644998863, "grad_norm": 0.37668389081954956, "learning_rate": 1.9314469276090937e-06, "loss": 0.017571628093719482, "memory(GiB)": 22.66, "step": 22312, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.7248481304616184, "grad_norm": 0.3669433891773224, "learning_rate": 1.9310228463960585e-06, "loss": 0.014369145035743713, "memory(GiB)": 22.66, "step": 22313, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956926 }, { "epoch": 0.7248806159243738, "grad_norm": 0.356176495552063, "learning_rate": 1.930598800603104e-06, "loss": 0.011222686618566513, "memory(GiB)": 22.66, "step": 22314, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.7249131013871293, "grad_norm": 0.3740132451057434, "learning_rate": 1.930174790235125e-06, "loss": 0.013681855984032154, "memory(GiB)": 22.66, "step": 22315, "token_acc": 1.0, "train_speed(iter/s)": 0.956939 }, { "epoch": 0.7249455868498846, "grad_norm": 0.31859248876571655, "learning_rate": 1.929750815297017e-06, "loss": 0.014989486895501614, "memory(GiB)": 22.66, "step": 22316, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.7249780723126401, "grad_norm": 0.407831609249115, "learning_rate": 1.9293268757936683e-06, "loss": 0.015367587096989155, "memory(GiB)": 22.66, "step": 22317, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.7250105577753955, "grad_norm": 0.5986472964286804, "learning_rate": 1.9289029717299757e-06, "loss": 0.010593779385089874, "memory(GiB)": 22.66, "step": 22318, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.7250430432381509, "grad_norm": 0.3632572889328003, "learning_rate": 1.9284791031108295e-06, "loss": 0.018258480355143547, "memory(GiB)": 22.66, "step": 22319, "token_acc": 1.0, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.7250755287009063, "grad_norm": 0.4029232859611511, "learning_rate": 1.928055269941123e-06, "loss": 0.01949997991323471, "memory(GiB)": 22.66, "step": 22320, "token_acc": 1.0, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.7251080141636618, "grad_norm": 0.38229164481163025, "learning_rate": 1.9276314722257493e-06, "loss": 0.01519252173602581, "memory(GiB)": 22.66, "step": 22321, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956986 }, { "epoch": 0.7251404996264171, "grad_norm": 0.41029420495033264, "learning_rate": 1.927207709969596e-06, "loss": 0.013176200911402702, "memory(GiB)": 22.66, "step": 22322, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.7251729850891726, "grad_norm": 0.37974968552589417, "learning_rate": 1.9267839831775554e-06, "loss": 0.016589535400271416, "memory(GiB)": 22.66, "step": 22323, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957004 }, { "epoch": 0.725205470551928, "grad_norm": 0.40420639514923096, "learning_rate": 1.926360291854518e-06, "loss": 0.01526489108800888, "memory(GiB)": 22.66, "step": 22324, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.7252379560146834, "grad_norm": 0.3121475577354431, "learning_rate": 1.925936636005376e-06, "loss": 0.01452932134270668, "memory(GiB)": 22.66, "step": 22325, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.7252704414774388, "grad_norm": 0.34672224521636963, "learning_rate": 1.925513015635015e-06, "loss": 0.012869976460933685, "memory(GiB)": 22.66, "step": 22326, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.7253029269401943, "grad_norm": 0.2912900149822235, "learning_rate": 1.925089430748327e-06, "loss": 0.01633533462882042, "memory(GiB)": 22.66, "step": 22327, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95704 }, { "epoch": 0.7253354124029496, "grad_norm": 0.3291061520576477, "learning_rate": 1.9246658813501966e-06, "loss": 0.010331602767109871, "memory(GiB)": 22.66, "step": 22328, "token_acc": 1.0, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.7253678978657051, "grad_norm": 0.3892437815666199, "learning_rate": 1.924242367445519e-06, "loss": 0.017051469534635544, "memory(GiB)": 22.66, "step": 22329, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957054 }, { "epoch": 0.7254003833284605, "grad_norm": 0.23968017101287842, "learning_rate": 1.923818889039176e-06, "loss": 0.009448395110666752, "memory(GiB)": 22.66, "step": 22330, "token_acc": 1.0, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.7254328687912159, "grad_norm": 0.3402230441570282, "learning_rate": 1.923395446136059e-06, "loss": 0.018288711085915565, "memory(GiB)": 22.66, "step": 22331, "token_acc": 1.0, "train_speed(iter/s)": 0.957069 }, { "epoch": 0.7254653542539713, "grad_norm": 0.25873416662216187, "learning_rate": 1.9229720387410527e-06, "loss": 0.008077330887317657, "memory(GiB)": 22.66, "step": 22332, "token_acc": 1.0, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.7254978397167268, "grad_norm": 0.4447462856769562, "learning_rate": 1.9225486668590425e-06, "loss": 0.01452704332768917, "memory(GiB)": 22.66, "step": 22333, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.7255303251794821, "grad_norm": 0.7146628499031067, "learning_rate": 1.92212533049492e-06, "loss": 0.015893932431936264, "memory(GiB)": 22.66, "step": 22334, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.7255628106422376, "grad_norm": 0.34517645835876465, "learning_rate": 1.921702029653566e-06, "loss": 0.01577017828822136, "memory(GiB)": 22.66, "step": 22335, "token_acc": 1.0, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.7255952961049931, "grad_norm": 0.28374865651130676, "learning_rate": 1.92127876433987e-06, "loss": 0.011528108268976212, "memory(GiB)": 22.66, "step": 22336, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.7256277815677484, "grad_norm": 0.28090161085128784, "learning_rate": 1.920855534558711e-06, "loss": 0.011319087818264961, "memory(GiB)": 22.66, "step": 22337, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.7256602670305039, "grad_norm": 0.2989136874675751, "learning_rate": 1.920432340314981e-06, "loss": 0.015101470984518528, "memory(GiB)": 22.66, "step": 22338, "token_acc": 1.0, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.7256927524932593, "grad_norm": 0.2250036597251892, "learning_rate": 1.9200091816135585e-06, "loss": 0.0108976811170578, "memory(GiB)": 22.66, "step": 22339, "token_acc": 1.0, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.7257252379560147, "grad_norm": 0.36791548132896423, "learning_rate": 1.919586058459332e-06, "loss": 0.02034250646829605, "memory(GiB)": 22.66, "step": 22340, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.7257577234187701, "grad_norm": 0.3853355944156647, "learning_rate": 1.9191629708571803e-06, "loss": 0.01931971125304699, "memory(GiB)": 22.66, "step": 22341, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.7257902088815256, "grad_norm": 0.2854967415332794, "learning_rate": 1.9187399188119888e-06, "loss": 0.011744780465960503, "memory(GiB)": 22.66, "step": 22342, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.7258226943442809, "grad_norm": 0.47673892974853516, "learning_rate": 1.918316902328639e-06, "loss": 0.014934616163372993, "memory(GiB)": 22.66, "step": 22343, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.7258551798070364, "grad_norm": 0.4363131523132324, "learning_rate": 1.917893921412016e-06, "loss": 0.015473428182303905, "memory(GiB)": 22.66, "step": 22344, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.7258876652697918, "grad_norm": 0.44633781909942627, "learning_rate": 1.917470976066998e-06, "loss": 0.021564055234193802, "memory(GiB)": 22.66, "step": 22345, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.7259201507325472, "grad_norm": 0.42033684253692627, "learning_rate": 1.9170480662984674e-06, "loss": 0.016822094097733498, "memory(GiB)": 22.66, "step": 22346, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957174 }, { "epoch": 0.7259526361953026, "grad_norm": 0.5214666128158569, "learning_rate": 1.916625192111306e-06, "loss": 0.01513977162539959, "memory(GiB)": 22.66, "step": 22347, "token_acc": 0.99, "train_speed(iter/s)": 0.95718 }, { "epoch": 0.7259851216580581, "grad_norm": 0.3194623589515686, "learning_rate": 1.9162023535103953e-06, "loss": 0.01798618584871292, "memory(GiB)": 22.66, "step": 22348, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.7260176071208134, "grad_norm": 0.3056322932243347, "learning_rate": 1.9157795505006122e-06, "loss": 0.015315367840230465, "memory(GiB)": 22.66, "step": 22349, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.957196 }, { "epoch": 0.7260500925835689, "grad_norm": 0.4117472469806671, "learning_rate": 1.915356783086838e-06, "loss": 0.01591958850622177, "memory(GiB)": 22.66, "step": 22350, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957203 }, { "epoch": 0.7260825780463243, "grad_norm": 0.47876864671707153, "learning_rate": 1.914934051273952e-06, "loss": 0.017093198373913765, "memory(GiB)": 22.66, "step": 22351, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.7261150635090797, "grad_norm": 0.351998895406723, "learning_rate": 1.914511355066834e-06, "loss": 0.01497281901538372, "memory(GiB)": 22.66, "step": 22352, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.7261475489718351, "grad_norm": 0.35424143075942993, "learning_rate": 1.9140886944703636e-06, "loss": 0.014496224001049995, "memory(GiB)": 22.66, "step": 22353, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.7261800344345906, "grad_norm": 0.3185105323791504, "learning_rate": 1.9136660694894144e-06, "loss": 0.019286103546619415, "memory(GiB)": 22.66, "step": 22354, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.7262125198973459, "grad_norm": 0.4819456934928894, "learning_rate": 1.9132434801288667e-06, "loss": 0.016205189749598503, "memory(GiB)": 22.66, "step": 22355, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.7262450053601014, "grad_norm": 0.32185935974121094, "learning_rate": 1.912820926393598e-06, "loss": 0.015797723084688187, "memory(GiB)": 22.66, "step": 22356, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957255 }, { "epoch": 0.7262774908228568, "grad_norm": 0.34412550926208496, "learning_rate": 1.9123984082884866e-06, "loss": 0.01572597399353981, "memory(GiB)": 22.66, "step": 22357, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.7263099762856122, "grad_norm": 0.33844465017318726, "learning_rate": 1.9119759258184057e-06, "loss": 0.013573282398283482, "memory(GiB)": 22.66, "step": 22358, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957273 }, { "epoch": 0.7263424617483676, "grad_norm": 0.2966729402542114, "learning_rate": 1.9115534789882347e-06, "loss": 0.014644294045865536, "memory(GiB)": 22.66, "step": 22359, "token_acc": 1.0, "train_speed(iter/s)": 0.95728 }, { "epoch": 0.7263749472111231, "grad_norm": 0.33875471353530884, "learning_rate": 1.9111310678028428e-06, "loss": 0.01618078164756298, "memory(GiB)": 22.66, "step": 22360, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.7264074326738784, "grad_norm": 0.39456045627593994, "learning_rate": 1.9107086922671135e-06, "loss": 0.011225776746869087, "memory(GiB)": 22.66, "step": 22361, "token_acc": 1.0, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.7264399181366339, "grad_norm": 0.37221208214759827, "learning_rate": 1.910286352385916e-06, "loss": 0.015634479001164436, "memory(GiB)": 22.66, "step": 22362, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.9573 }, { "epoch": 0.7264724035993892, "grad_norm": 0.35615575313568115, "learning_rate": 1.9098640481641285e-06, "loss": 0.016302525997161865, "memory(GiB)": 22.66, "step": 22363, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.7265048890621447, "grad_norm": 0.28590071201324463, "learning_rate": 1.909441779606621e-06, "loss": 0.012822076678276062, "memory(GiB)": 22.66, "step": 22364, "token_acc": 1.0, "train_speed(iter/s)": 0.957314 }, { "epoch": 0.7265373745249001, "grad_norm": 0.3207026720046997, "learning_rate": 1.9090195467182672e-06, "loss": 0.01461893878877163, "memory(GiB)": 22.66, "step": 22365, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.7265698599876556, "grad_norm": 0.6788788437843323, "learning_rate": 1.908597349503945e-06, "loss": 0.023178651928901672, "memory(GiB)": 22.66, "step": 22366, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957327 }, { "epoch": 0.7266023454504109, "grad_norm": 0.3451412320137024, "learning_rate": 1.908175187968522e-06, "loss": 0.014985380694270134, "memory(GiB)": 22.66, "step": 22367, "token_acc": 1.0, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.7266348309131664, "grad_norm": 0.6805321574211121, "learning_rate": 1.907753062116875e-06, "loss": 0.015712285414338112, "memory(GiB)": 22.66, "step": 22368, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.7266673163759217, "grad_norm": 0.19686681032180786, "learning_rate": 1.907330971953872e-06, "loss": 0.005918894428759813, "memory(GiB)": 22.66, "step": 22369, "token_acc": 1.0, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.7266998018386772, "grad_norm": 0.3537687361240387, "learning_rate": 1.9069089174843853e-06, "loss": 0.014722906984388828, "memory(GiB)": 22.66, "step": 22370, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.7267322873014326, "grad_norm": 0.297372430562973, "learning_rate": 1.9064868987132866e-06, "loss": 0.015357568860054016, "memory(GiB)": 22.66, "step": 22371, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.726764772764188, "grad_norm": 0.5102635025978088, "learning_rate": 1.906064915645448e-06, "loss": 0.0186423659324646, "memory(GiB)": 22.66, "step": 22372, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.7267972582269434, "grad_norm": 0.3550076186656952, "learning_rate": 1.9056429682857369e-06, "loss": 0.01832728087902069, "memory(GiB)": 22.66, "step": 22373, "token_acc": 1.0, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.7268297436896989, "grad_norm": 0.23245848715305328, "learning_rate": 1.905221056639025e-06, "loss": 0.007980679161846638, "memory(GiB)": 22.66, "step": 22374, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.7268622291524542, "grad_norm": 0.4707263112068176, "learning_rate": 1.9047991807101807e-06, "loss": 0.01632937230169773, "memory(GiB)": 22.66, "step": 22375, "token_acc": 1.0, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.7268947146152097, "grad_norm": 0.48379045724868774, "learning_rate": 1.9043773405040755e-06, "loss": 0.017857540398836136, "memory(GiB)": 22.66, "step": 22376, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957393 }, { "epoch": 0.7269272000779651, "grad_norm": 0.32873445749282837, "learning_rate": 1.903955536025574e-06, "loss": 0.012999849393963814, "memory(GiB)": 22.66, "step": 22377, "token_acc": 1.0, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.7269596855407205, "grad_norm": 0.3333403468132019, "learning_rate": 1.9035337672795468e-06, "loss": 0.012303543277084827, "memory(GiB)": 22.66, "step": 22378, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.7269921710034759, "grad_norm": 0.4618256390094757, "learning_rate": 1.9031120342708609e-06, "loss": 0.02079465053975582, "memory(GiB)": 22.66, "step": 22379, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957417 }, { "epoch": 0.7270246564662314, "grad_norm": 0.2739619314670563, "learning_rate": 1.9026903370043842e-06, "loss": 0.009800741448998451, "memory(GiB)": 22.66, "step": 22380, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957426 }, { "epoch": 0.7270571419289867, "grad_norm": 0.22824274003505707, "learning_rate": 1.9022686754849856e-06, "loss": 0.00782643910497427, "memory(GiB)": 22.66, "step": 22381, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.7270896273917422, "grad_norm": 0.37458205223083496, "learning_rate": 1.9018470497175278e-06, "loss": 0.014827752485871315, "memory(GiB)": 22.66, "step": 22382, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.7271221128544976, "grad_norm": 0.17233826220035553, "learning_rate": 1.9014254597068783e-06, "loss": 0.008030615746974945, "memory(GiB)": 22.66, "step": 22383, "token_acc": 1.0, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.727154598317253, "grad_norm": 0.41862720251083374, "learning_rate": 1.9010039054579032e-06, "loss": 0.01755889505147934, "memory(GiB)": 22.66, "step": 22384, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.7271870837800084, "grad_norm": 0.27536386251449585, "learning_rate": 1.9005823869754697e-06, "loss": 0.013427888974547386, "memory(GiB)": 22.66, "step": 22385, "token_acc": 0.9823943661971831, "train_speed(iter/s)": 0.957473 }, { "epoch": 0.7272195692427639, "grad_norm": 0.4344610273838043, "learning_rate": 1.9001609042644391e-06, "loss": 0.01713407412171364, "memory(GiB)": 22.66, "step": 22386, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.7272520547055192, "grad_norm": 0.4160974323749542, "learning_rate": 1.8997394573296795e-06, "loss": 0.016597341746091843, "memory(GiB)": 22.66, "step": 22387, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957492 }, { "epoch": 0.7272845401682747, "grad_norm": 0.48409223556518555, "learning_rate": 1.899318046176049e-06, "loss": 0.01485845074057579, "memory(GiB)": 22.66, "step": 22388, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.7273170256310301, "grad_norm": 0.4647623300552368, "learning_rate": 1.898896670808419e-06, "loss": 0.011259564198553562, "memory(GiB)": 22.66, "step": 22389, "token_acc": 1.0, "train_speed(iter/s)": 0.957509 }, { "epoch": 0.7273495110937855, "grad_norm": 0.352164626121521, "learning_rate": 1.8984753312316473e-06, "loss": 0.019557490944862366, "memory(GiB)": 22.66, "step": 22390, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957516 }, { "epoch": 0.7273819965565409, "grad_norm": 0.41232818365097046, "learning_rate": 1.8980540274505998e-06, "loss": 0.012609526515007019, "memory(GiB)": 22.66, "step": 22391, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.7274144820192964, "grad_norm": 0.3466830551624298, "learning_rate": 1.897632759470136e-06, "loss": 0.0168756116181612, "memory(GiB)": 22.66, "step": 22392, "token_acc": 1.0, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.7274469674820517, "grad_norm": 0.31002646684646606, "learning_rate": 1.8972115272951196e-06, "loss": 0.014018462970852852, "memory(GiB)": 22.66, "step": 22393, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957537 }, { "epoch": 0.7274794529448072, "grad_norm": 0.4165314733982086, "learning_rate": 1.8967903309304113e-06, "loss": 0.011893351562321186, "memory(GiB)": 22.66, "step": 22394, "token_acc": 1.0, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.7275119384075626, "grad_norm": 0.43905994296073914, "learning_rate": 1.896369170380874e-06, "loss": 0.019619591534137726, "memory(GiB)": 22.66, "step": 22395, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957551 }, { "epoch": 0.727544423870318, "grad_norm": 0.41168633103370667, "learning_rate": 1.8959480456513684e-06, "loss": 0.016982970759272575, "memory(GiB)": 22.66, "step": 22396, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.7275769093330734, "grad_norm": 0.32257816195487976, "learning_rate": 1.8955269567467521e-06, "loss": 0.013838819228112698, "memory(GiB)": 22.66, "step": 22397, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.7276093947958289, "grad_norm": 0.3291398882865906, "learning_rate": 1.8951059036718873e-06, "loss": 0.012358981184661388, "memory(GiB)": 22.66, "step": 22398, "token_acc": 1.0, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.7276418802585842, "grad_norm": 0.4062636196613312, "learning_rate": 1.8946848864316326e-06, "loss": 0.017630983144044876, "memory(GiB)": 22.66, "step": 22399, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.7276743657213397, "grad_norm": 0.28692954778671265, "learning_rate": 1.8942639050308486e-06, "loss": 0.01458472479134798, "memory(GiB)": 22.66, "step": 22400, "token_acc": 1.0, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.7277068511840952, "grad_norm": 0.3465676009654999, "learning_rate": 1.8938429594743919e-06, "loss": 0.01709078624844551, "memory(GiB)": 22.66, "step": 22401, "token_acc": 1.0, "train_speed(iter/s)": 0.957593 }, { "epoch": 0.7277393366468505, "grad_norm": 0.37000972032546997, "learning_rate": 1.893422049767122e-06, "loss": 0.0164509117603302, "memory(GiB)": 22.66, "step": 22402, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.727771822109606, "grad_norm": 0.4008941948413849, "learning_rate": 1.8930011759138954e-06, "loss": 0.014662785455584526, "memory(GiB)": 22.66, "step": 22403, "token_acc": 1.0, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.7278043075723614, "grad_norm": 0.49725520610809326, "learning_rate": 1.8925803379195735e-06, "loss": 0.01815769076347351, "memory(GiB)": 22.66, "step": 22404, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.7278367930351168, "grad_norm": 0.37825828790664673, "learning_rate": 1.8921595357890082e-06, "loss": 0.017631735652685165, "memory(GiB)": 22.66, "step": 22405, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.7278692784978722, "grad_norm": 0.3389580547809601, "learning_rate": 1.8917387695270584e-06, "loss": 0.01700674183666706, "memory(GiB)": 22.66, "step": 22406, "token_acc": 0.9887005649717514, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.7279017639606277, "grad_norm": 0.29454660415649414, "learning_rate": 1.8913180391385804e-06, "loss": 0.014608044177293777, "memory(GiB)": 22.66, "step": 22407, "token_acc": 1.0, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.727934249423383, "grad_norm": 0.28336668014526367, "learning_rate": 1.8908973446284312e-06, "loss": 0.015023437328636646, "memory(GiB)": 22.66, "step": 22408, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.7279667348861385, "grad_norm": 0.3171997666358948, "learning_rate": 1.8904766860014633e-06, "loss": 0.014974003657698631, "memory(GiB)": 22.66, "step": 22409, "token_acc": 1.0, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.7279992203488939, "grad_norm": 0.3125534951686859, "learning_rate": 1.8900560632625354e-06, "loss": 0.012777533382177353, "memory(GiB)": 22.66, "step": 22410, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.7280317058116493, "grad_norm": 0.30143365263938904, "learning_rate": 1.8896354764164959e-06, "loss": 0.012153621762990952, "memory(GiB)": 22.66, "step": 22411, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.7280641912744047, "grad_norm": 0.32222041487693787, "learning_rate": 1.8892149254682052e-06, "loss": 0.016228437423706055, "memory(GiB)": 22.66, "step": 22412, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957673 }, { "epoch": 0.7280966767371602, "grad_norm": 0.21708400547504425, "learning_rate": 1.8887944104225158e-06, "loss": 0.008696531876921654, "memory(GiB)": 22.66, "step": 22413, "token_acc": 1.0, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.7281291621999155, "grad_norm": 0.31174349784851074, "learning_rate": 1.8883739312842786e-06, "loss": 0.01065149623900652, "memory(GiB)": 22.66, "step": 22414, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.728161647662671, "grad_norm": 0.2564637064933777, "learning_rate": 1.8879534880583494e-06, "loss": 0.011328385211527348, "memory(GiB)": 22.66, "step": 22415, "token_acc": 1.0, "train_speed(iter/s)": 0.957698 }, { "epoch": 0.7281941331254264, "grad_norm": 0.3905208706855774, "learning_rate": 1.887533080749575e-06, "loss": 0.01690763421356678, "memory(GiB)": 22.66, "step": 22416, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957707 }, { "epoch": 0.7282266185881818, "grad_norm": 0.25469812750816345, "learning_rate": 1.8871127093628161e-06, "loss": 0.007894555106759071, "memory(GiB)": 22.66, "step": 22417, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957716 }, { "epoch": 0.7282591040509372, "grad_norm": 0.34492331743240356, "learning_rate": 1.8866923739029175e-06, "loss": 0.010230648331344128, "memory(GiB)": 22.66, "step": 22418, "token_acc": 1.0, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.7282915895136927, "grad_norm": 0.29098182916641235, "learning_rate": 1.8862720743747343e-06, "loss": 0.009348440915346146, "memory(GiB)": 22.66, "step": 22419, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.728324074976448, "grad_norm": 0.3877931535243988, "learning_rate": 1.8858518107831143e-06, "loss": 0.018664568662643433, "memory(GiB)": 22.66, "step": 22420, "token_acc": 0.9816849816849816, "train_speed(iter/s)": 0.957738 }, { "epoch": 0.7283565604392035, "grad_norm": 0.43329378962516785, "learning_rate": 1.8854315831329089e-06, "loss": 0.01924401894211769, "memory(GiB)": 22.66, "step": 22421, "token_acc": 0.9856459330143541, "train_speed(iter/s)": 0.957746 }, { "epoch": 0.7283890459019589, "grad_norm": 0.1916089802980423, "learning_rate": 1.8850113914289687e-06, "loss": 0.010425524786114693, "memory(GiB)": 22.66, "step": 22422, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.7284215313647143, "grad_norm": 0.3249233067035675, "learning_rate": 1.8845912356761447e-06, "loss": 0.010473228059709072, "memory(GiB)": 22.66, "step": 22423, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.7284540168274697, "grad_norm": 0.3472576439380646, "learning_rate": 1.8841711158792831e-06, "loss": 0.012063923291862011, "memory(GiB)": 22.66, "step": 22424, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.7284865022902252, "grad_norm": 0.3622877895832062, "learning_rate": 1.883751032043234e-06, "loss": 0.012249917723238468, "memory(GiB)": 22.66, "step": 22425, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.7285189877529805, "grad_norm": 0.3192276656627655, "learning_rate": 1.8833309841728453e-06, "loss": 0.015334472060203552, "memory(GiB)": 22.66, "step": 22426, "token_acc": 1.0, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.728551473215736, "grad_norm": 0.34497368335723877, "learning_rate": 1.8829109722729655e-06, "loss": 0.009441947564482689, "memory(GiB)": 22.66, "step": 22427, "token_acc": 1.0, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.7285839586784914, "grad_norm": 0.2569868564605713, "learning_rate": 1.8824909963484439e-06, "loss": 0.007207528688013554, "memory(GiB)": 22.66, "step": 22428, "token_acc": 1.0, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.7286164441412468, "grad_norm": 0.3400365114212036, "learning_rate": 1.8820710564041234e-06, "loss": 0.013970605097711086, "memory(GiB)": 22.66, "step": 22429, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.9578 }, { "epoch": 0.7286489296040022, "grad_norm": 0.30656883120536804, "learning_rate": 1.8816511524448534e-06, "loss": 0.008502984419465065, "memory(GiB)": 22.66, "step": 22430, "token_acc": 1.0, "train_speed(iter/s)": 0.957806 }, { "epoch": 0.7286814150667577, "grad_norm": 0.3623788356781006, "learning_rate": 1.8812312844754788e-06, "loss": 0.013754520565271378, "memory(GiB)": 22.66, "step": 22431, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.728713900529513, "grad_norm": 0.3200955092906952, "learning_rate": 1.8808114525008486e-06, "loss": 0.013736603781580925, "memory(GiB)": 22.66, "step": 22432, "token_acc": 1.0, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.7287463859922685, "grad_norm": 0.367311030626297, "learning_rate": 1.880391656525804e-06, "loss": 0.011389380320906639, "memory(GiB)": 22.66, "step": 22433, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.957826 }, { "epoch": 0.7287788714550238, "grad_norm": 0.3313463628292084, "learning_rate": 1.8799718965551917e-06, "loss": 0.01437309104949236, "memory(GiB)": 22.66, "step": 22434, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957833 }, { "epoch": 0.7288113569177793, "grad_norm": 0.31113073229789734, "learning_rate": 1.879552172593857e-06, "loss": 0.008468741551041603, "memory(GiB)": 22.66, "step": 22435, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.7288438423805347, "grad_norm": 0.23900339007377625, "learning_rate": 1.8791324846466447e-06, "loss": 0.008001454174518585, "memory(GiB)": 22.66, "step": 22436, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.7288763278432902, "grad_norm": 0.36730408668518066, "learning_rate": 1.8787128327183956e-06, "loss": 0.020899347960948944, "memory(GiB)": 22.66, "step": 22437, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.7289088133060455, "grad_norm": 0.4411119222640991, "learning_rate": 1.8782932168139572e-06, "loss": 0.015228471718728542, "memory(GiB)": 22.66, "step": 22438, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.728941298768801, "grad_norm": 0.3430548310279846, "learning_rate": 1.8778736369381662e-06, "loss": 0.013341818004846573, "memory(GiB)": 22.66, "step": 22439, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957871 }, { "epoch": 0.7289737842315563, "grad_norm": 0.39497655630111694, "learning_rate": 1.8774540930958723e-06, "loss": 0.01520375907421112, "memory(GiB)": 22.66, "step": 22440, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.7290062696943118, "grad_norm": 0.39046719670295715, "learning_rate": 1.8770345852919132e-06, "loss": 0.013746703043580055, "memory(GiB)": 22.66, "step": 22441, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.7290387551570672, "grad_norm": 0.3781217038631439, "learning_rate": 1.8766151135311306e-06, "loss": 0.014929704368114471, "memory(GiB)": 22.66, "step": 22442, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.7290712406198226, "grad_norm": 0.4003811776638031, "learning_rate": 1.8761956778183694e-06, "loss": 0.01718943938612938, "memory(GiB)": 22.66, "step": 22443, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.729103726082578, "grad_norm": 0.29460862278938293, "learning_rate": 1.8757762781584638e-06, "loss": 0.006246535107493401, "memory(GiB)": 22.66, "step": 22444, "token_acc": 1.0, "train_speed(iter/s)": 0.957915 }, { "epoch": 0.7291362115453335, "grad_norm": 0.44245848059654236, "learning_rate": 1.8753569145562623e-06, "loss": 0.01472351886332035, "memory(GiB)": 22.66, "step": 22445, "token_acc": 1.0, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.7291686970080888, "grad_norm": 0.2820797860622406, "learning_rate": 1.8749375870165986e-06, "loss": 0.008123288862407207, "memory(GiB)": 22.66, "step": 22446, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957932 }, { "epoch": 0.7292011824708443, "grad_norm": 0.370760440826416, "learning_rate": 1.874518295544317e-06, "loss": 0.012474839575588703, "memory(GiB)": 22.66, "step": 22447, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.7292336679335997, "grad_norm": 0.30765900015830994, "learning_rate": 1.8740990401442521e-06, "loss": 0.013609867542982101, "memory(GiB)": 22.66, "step": 22448, "token_acc": 1.0, "train_speed(iter/s)": 0.95795 }, { "epoch": 0.7292661533963551, "grad_norm": 0.37684038281440735, "learning_rate": 1.8736798208212448e-06, "loss": 0.009595854207873344, "memory(GiB)": 22.66, "step": 22449, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957959 }, { "epoch": 0.7292986388591105, "grad_norm": 0.4511280655860901, "learning_rate": 1.8732606375801333e-06, "loss": 0.019123826175928116, "memory(GiB)": 22.66, "step": 22450, "token_acc": 1.0, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.729331124321866, "grad_norm": 0.4020354151725769, "learning_rate": 1.872841490425758e-06, "loss": 0.0151209132745862, "memory(GiB)": 22.66, "step": 22451, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.7293636097846213, "grad_norm": 0.3162277638912201, "learning_rate": 1.872422379362952e-06, "loss": 0.010791880078613758, "memory(GiB)": 22.66, "step": 22452, "token_acc": 1.0, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.7293960952473768, "grad_norm": 0.3698658347129822, "learning_rate": 1.8720033043965547e-06, "loss": 0.013744944706559181, "memory(GiB)": 22.66, "step": 22453, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95799 }, { "epoch": 0.7294285807101322, "grad_norm": 0.3962929844856262, "learning_rate": 1.8715842655314026e-06, "loss": 0.01674865186214447, "memory(GiB)": 22.66, "step": 22454, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.7294610661728876, "grad_norm": 0.27227845788002014, "learning_rate": 1.8711652627723337e-06, "loss": 0.009047282859683037, "memory(GiB)": 22.66, "step": 22455, "token_acc": 1.0, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.729493551635643, "grad_norm": 0.29452067613601685, "learning_rate": 1.8707462961241807e-06, "loss": 0.011465862393379211, "memory(GiB)": 22.66, "step": 22456, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.7295260370983985, "grad_norm": 0.5106567144393921, "learning_rate": 1.8703273655917797e-06, "loss": 0.018571652472019196, "memory(GiB)": 22.66, "step": 22457, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958017 }, { "epoch": 0.7295585225611538, "grad_norm": 0.4070405960083008, "learning_rate": 1.869908471179967e-06, "loss": 0.013431837782263756, "memory(GiB)": 22.66, "step": 22458, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.7295910080239093, "grad_norm": 0.4836445152759552, "learning_rate": 1.8694896128935768e-06, "loss": 0.01721348986029625, "memory(GiB)": 22.66, "step": 22459, "token_acc": 0.9846938775510204, "train_speed(iter/s)": 0.958031 }, { "epoch": 0.7296234934866647, "grad_norm": 0.3623684048652649, "learning_rate": 1.869070790737445e-06, "loss": 0.009944295510649681, "memory(GiB)": 22.66, "step": 22460, "token_acc": 1.0, "train_speed(iter/s)": 0.958038 }, { "epoch": 0.7296559789494201, "grad_norm": 0.4086513817310333, "learning_rate": 1.8686520047164014e-06, "loss": 0.020157981663942337, "memory(GiB)": 22.66, "step": 22461, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.7296884644121755, "grad_norm": 0.36247557401657104, "learning_rate": 1.8682332548352815e-06, "loss": 0.017865188419818878, "memory(GiB)": 22.66, "step": 22462, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.729720949874931, "grad_norm": 0.46208134293556213, "learning_rate": 1.8678145410989185e-06, "loss": 0.017193200066685677, "memory(GiB)": 22.66, "step": 22463, "token_acc": 1.0, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.7297534353376864, "grad_norm": 0.5290588140487671, "learning_rate": 1.8673958635121464e-06, "loss": 0.016471099108457565, "memory(GiB)": 22.66, "step": 22464, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.958065 }, { "epoch": 0.7297859208004418, "grad_norm": 0.43445804715156555, "learning_rate": 1.8669772220797933e-06, "loss": 0.017053324729204178, "memory(GiB)": 22.66, "step": 22465, "token_acc": 0.995, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.7298184062631973, "grad_norm": 0.23130862414836884, "learning_rate": 1.8665586168066952e-06, "loss": 0.007982338778674603, "memory(GiB)": 22.66, "step": 22466, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958079 }, { "epoch": 0.7298508917259526, "grad_norm": 0.360645592212677, "learning_rate": 1.8661400476976777e-06, "loss": 0.012458992190659046, "memory(GiB)": 22.66, "step": 22467, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.7298833771887081, "grad_norm": 0.3913688361644745, "learning_rate": 1.8657215147575792e-06, "loss": 0.014098968356847763, "memory(GiB)": 22.66, "step": 22468, "token_acc": 1.0, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.7299158626514635, "grad_norm": 0.41249680519104004, "learning_rate": 1.8653030179912236e-06, "loss": 0.013672905042767525, "memory(GiB)": 22.66, "step": 22469, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.7299483481142189, "grad_norm": 0.27820533514022827, "learning_rate": 1.8648845574034453e-06, "loss": 0.012576148845255375, "memory(GiB)": 22.66, "step": 22470, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.7299808335769743, "grad_norm": 0.31811317801475525, "learning_rate": 1.8644661329990703e-06, "loss": 0.012084484100341797, "memory(GiB)": 22.66, "step": 22471, "token_acc": 1.0, "train_speed(iter/s)": 0.958114 }, { "epoch": 0.7300133190397298, "grad_norm": 0.43653005361557007, "learning_rate": 1.8640477447829287e-06, "loss": 0.01667226105928421, "memory(GiB)": 22.66, "step": 22472, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.7300458045024851, "grad_norm": 0.5219536423683167, "learning_rate": 1.8636293927598526e-06, "loss": 0.017955802381038666, "memory(GiB)": 22.66, "step": 22473, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.7300782899652406, "grad_norm": 0.22611388564109802, "learning_rate": 1.863211076934666e-06, "loss": 0.008709273301064968, "memory(GiB)": 22.66, "step": 22474, "token_acc": 1.0, "train_speed(iter/s)": 0.958133 }, { "epoch": 0.730110775427996, "grad_norm": 0.42127516865730286, "learning_rate": 1.8627927973122007e-06, "loss": 0.015145020559430122, "memory(GiB)": 22.66, "step": 22475, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.958141 }, { "epoch": 0.7301432608907514, "grad_norm": 0.30417782068252563, "learning_rate": 1.8623745538972805e-06, "loss": 0.00634932704269886, "memory(GiB)": 22.66, "step": 22476, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.7301757463535068, "grad_norm": 0.3902429938316345, "learning_rate": 1.8619563466947338e-06, "loss": 0.019963381811976433, "memory(GiB)": 22.66, "step": 22477, "token_acc": 1.0, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.7302082318162623, "grad_norm": 0.3202410340309143, "learning_rate": 1.8615381757093882e-06, "loss": 0.02023273892700672, "memory(GiB)": 22.66, "step": 22478, "token_acc": 1.0, "train_speed(iter/s)": 0.958165 }, { "epoch": 0.7302407172790176, "grad_norm": 0.46850723028182983, "learning_rate": 1.86112004094607e-06, "loss": 0.017004016786813736, "memory(GiB)": 22.66, "step": 22479, "token_acc": 1.0, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.7302732027417731, "grad_norm": 0.248263418674469, "learning_rate": 1.8607019424096034e-06, "loss": 0.007351522333920002, "memory(GiB)": 22.66, "step": 22480, "token_acc": 1.0, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.7303056882045285, "grad_norm": 0.3686411678791046, "learning_rate": 1.8602838801048146e-06, "loss": 0.010252969339489937, "memory(GiB)": 22.66, "step": 22481, "token_acc": 1.0, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.7303381736672839, "grad_norm": 0.2868179380893707, "learning_rate": 1.859865854036529e-06, "loss": 0.016715310513973236, "memory(GiB)": 22.66, "step": 22482, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958193 }, { "epoch": 0.7303706591300393, "grad_norm": 0.28923866152763367, "learning_rate": 1.8594478642095726e-06, "loss": 0.011923868209123611, "memory(GiB)": 22.66, "step": 22483, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.958199 }, { "epoch": 0.7304031445927948, "grad_norm": 0.5572801232337952, "learning_rate": 1.8590299106287657e-06, "loss": 0.013795780017971992, "memory(GiB)": 22.66, "step": 22484, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958207 }, { "epoch": 0.7304356300555501, "grad_norm": 0.3807332217693329, "learning_rate": 1.858611993298935e-06, "loss": 0.014130039140582085, "memory(GiB)": 22.66, "step": 22485, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958213 }, { "epoch": 0.7304681155183056, "grad_norm": 1.260798454284668, "learning_rate": 1.8581941122249026e-06, "loss": 0.020386075600981712, "memory(GiB)": 22.66, "step": 22486, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.730500600981061, "grad_norm": 0.3062320947647095, "learning_rate": 1.8577762674114935e-06, "loss": 0.014349738135933876, "memory(GiB)": 22.66, "step": 22487, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.7305330864438164, "grad_norm": 0.2654894292354584, "learning_rate": 1.8573584588635273e-06, "loss": 0.016758210957050323, "memory(GiB)": 22.66, "step": 22488, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958233 }, { "epoch": 0.7305655719065718, "grad_norm": 0.33854591846466064, "learning_rate": 1.856940686585827e-06, "loss": 0.015670079737901688, "memory(GiB)": 22.66, "step": 22489, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.7305980573693273, "grad_norm": 0.36053377389907837, "learning_rate": 1.8565229505832143e-06, "loss": 0.013210833072662354, "memory(GiB)": 22.66, "step": 22490, "token_acc": 1.0, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.7306305428320826, "grad_norm": 0.35546404123306274, "learning_rate": 1.8561052508605115e-06, "loss": 0.013691667467355728, "memory(GiB)": 22.66, "step": 22491, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958253 }, { "epoch": 0.7306630282948381, "grad_norm": 0.3314691483974457, "learning_rate": 1.85568758742254e-06, "loss": 0.017275718972086906, "memory(GiB)": 22.66, "step": 22492, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.95826 }, { "epoch": 0.7306955137575935, "grad_norm": 0.31628379225730896, "learning_rate": 1.855269960274117e-06, "loss": 0.01433262787759304, "memory(GiB)": 22.66, "step": 22493, "token_acc": 1.0, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.7307279992203489, "grad_norm": 0.3460177481174469, "learning_rate": 1.854852369420066e-06, "loss": 0.009676117449998856, "memory(GiB)": 22.66, "step": 22494, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.7307604846831043, "grad_norm": 0.43234097957611084, "learning_rate": 1.8544348148652014e-06, "loss": 0.014822030439972878, "memory(GiB)": 22.66, "step": 22495, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.7307929701458598, "grad_norm": 0.29236742854118347, "learning_rate": 1.8540172966143488e-06, "loss": 0.015169303864240646, "memory(GiB)": 22.66, "step": 22496, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.7308254556086151, "grad_norm": 0.33981072902679443, "learning_rate": 1.8535998146723228e-06, "loss": 0.013319293968379498, "memory(GiB)": 22.66, "step": 22497, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.958296 }, { "epoch": 0.7308579410713706, "grad_norm": 0.2999691665172577, "learning_rate": 1.8531823690439443e-06, "loss": 0.010479532182216644, "memory(GiB)": 22.66, "step": 22498, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958304 }, { "epoch": 0.730890426534126, "grad_norm": 0.5892742276191711, "learning_rate": 1.852764959734028e-06, "loss": 0.01431536115705967, "memory(GiB)": 22.66, "step": 22499, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958313 }, { "epoch": 0.7309229119968814, "grad_norm": 0.27310219407081604, "learning_rate": 1.8523475867473927e-06, "loss": 0.015688594430685043, "memory(GiB)": 22.66, "step": 22500, "token_acc": 1.0, "train_speed(iter/s)": 0.958322 }, { "epoch": 0.7309229119968814, "eval_loss": 0.014423232525587082, "eval_runtime": 80.0174, "eval_samples_per_second": 124.348, "eval_steps_per_second": 3.887, "eval_token_acc": 0.9941378061734941, "step": 22500 }, { "epoch": 0.7309553974596368, "grad_norm": 0.3596377968788147, "learning_rate": 1.8519302500888563e-06, "loss": 0.013073714450001717, "memory(GiB)": 22.66, "step": 22501, "token_acc": 0.993921638419626, "train_speed(iter/s)": 0.954614 }, { "epoch": 0.7309878829223923, "grad_norm": 0.46488118171691895, "learning_rate": 1.8515129497632362e-06, "loss": 0.0151005107909441, "memory(GiB)": 22.66, "step": 22502, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.954619 }, { "epoch": 0.7310203683851476, "grad_norm": 0.30381983518600464, "learning_rate": 1.8510956857753459e-06, "loss": 0.018997523933649063, "memory(GiB)": 22.66, "step": 22503, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.954625 }, { "epoch": 0.7310528538479031, "grad_norm": 0.3998885750770569, "learning_rate": 1.8506784581300025e-06, "loss": 0.016888033598661423, "memory(GiB)": 22.66, "step": 22504, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.954631 }, { "epoch": 0.7310853393106584, "grad_norm": 0.3084995150566101, "learning_rate": 1.8502612668320214e-06, "loss": 0.010516706854104996, "memory(GiB)": 22.66, "step": 22505, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954637 }, { "epoch": 0.7311178247734139, "grad_norm": 0.34060803055763245, "learning_rate": 1.849844111886217e-06, "loss": 0.014051631093025208, "memory(GiB)": 22.66, "step": 22506, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.954643 }, { "epoch": 0.7311503102361693, "grad_norm": 0.37820005416870117, "learning_rate": 1.8494269932974068e-06, "loss": 0.017582625150680542, "memory(GiB)": 22.66, "step": 22507, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.954648 }, { "epoch": 0.7311827956989247, "grad_norm": 0.4171188771724701, "learning_rate": 1.8490099110704008e-06, "loss": 0.017121758311986923, "memory(GiB)": 22.66, "step": 22508, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.954654 }, { "epoch": 0.7312152811616801, "grad_norm": 0.36920854449272156, "learning_rate": 1.8485928652100138e-06, "loss": 0.013906835578382015, "memory(GiB)": 22.66, "step": 22509, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.954661 }, { "epoch": 0.7312477666244356, "grad_norm": 0.30276572704315186, "learning_rate": 1.84817585572106e-06, "loss": 0.013668527826666832, "memory(GiB)": 22.66, "step": 22510, "token_acc": 1.0, "train_speed(iter/s)": 0.954668 }, { "epoch": 0.7312802520871909, "grad_norm": 0.3165479302406311, "learning_rate": 1.8477588826083537e-06, "loss": 0.014811256900429726, "memory(GiB)": 22.66, "step": 22511, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.954673 }, { "epoch": 0.7313127375499464, "grad_norm": 0.2762053310871124, "learning_rate": 1.847341945876704e-06, "loss": 0.010833565145730972, "memory(GiB)": 22.66, "step": 22512, "token_acc": 1.0, "train_speed(iter/s)": 0.95468 }, { "epoch": 0.7313452230127018, "grad_norm": 0.3538450598716736, "learning_rate": 1.846925045530924e-06, "loss": 0.019999388605356216, "memory(GiB)": 22.66, "step": 22513, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.954687 }, { "epoch": 0.7313777084754572, "grad_norm": 0.320587158203125, "learning_rate": 1.8465081815758257e-06, "loss": 0.013880450278520584, "memory(GiB)": 22.66, "step": 22514, "token_acc": 1.0, "train_speed(iter/s)": 0.954693 }, { "epoch": 0.7314101939382126, "grad_norm": 0.21114328503608704, "learning_rate": 1.8460913540162223e-06, "loss": 0.008914629928767681, "memory(GiB)": 22.66, "step": 22515, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.954701 }, { "epoch": 0.7314426794009681, "grad_norm": 0.44072225689888, "learning_rate": 1.8456745628569206e-06, "loss": 0.015108419582247734, "memory(GiB)": 22.66, "step": 22516, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.954708 }, { "epoch": 0.7314751648637234, "grad_norm": 0.4733559191226959, "learning_rate": 1.8452578081027345e-06, "loss": 0.012094051577150822, "memory(GiB)": 22.66, "step": 22517, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954716 }, { "epoch": 0.7315076503264789, "grad_norm": 0.30058544874191284, "learning_rate": 1.844841089758469e-06, "loss": 0.010151251219213009, "memory(GiB)": 22.66, "step": 22518, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.954726 }, { "epoch": 0.7315401357892343, "grad_norm": 0.3697907030582428, "learning_rate": 1.8444244078289386e-06, "loss": 0.0181635282933712, "memory(GiB)": 22.66, "step": 22519, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.954735 }, { "epoch": 0.7315726212519897, "grad_norm": 0.3555176556110382, "learning_rate": 1.8440077623189518e-06, "loss": 0.014449735172092915, "memory(GiB)": 22.66, "step": 22520, "token_acc": 1.0, "train_speed(iter/s)": 0.954744 }, { "epoch": 0.7316051067147451, "grad_norm": 0.37157970666885376, "learning_rate": 1.843591153233315e-06, "loss": 0.01375991478562355, "memory(GiB)": 22.66, "step": 22521, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954753 }, { "epoch": 0.7316375921775006, "grad_norm": 0.4031110405921936, "learning_rate": 1.8431745805768386e-06, "loss": 0.013088244944810867, "memory(GiB)": 22.66, "step": 22522, "token_acc": 1.0, "train_speed(iter/s)": 0.954762 }, { "epoch": 0.7316700776402559, "grad_norm": 0.3607591390609741, "learning_rate": 1.8427580443543264e-06, "loss": 0.01050875149667263, "memory(GiB)": 22.66, "step": 22523, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.954771 }, { "epoch": 0.7317025631030114, "grad_norm": 0.38548654317855835, "learning_rate": 1.8423415445705917e-06, "loss": 0.01263769343495369, "memory(GiB)": 22.66, "step": 22524, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95478 }, { "epoch": 0.7317350485657668, "grad_norm": 0.5018153190612793, "learning_rate": 1.8419250812304367e-06, "loss": 0.02098437026143074, "memory(GiB)": 22.66, "step": 22525, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.954789 }, { "epoch": 0.7317675340285222, "grad_norm": 0.31083011627197266, "learning_rate": 1.8415086543386712e-06, "loss": 0.008989672176539898, "memory(GiB)": 22.66, "step": 22526, "token_acc": 1.0, "train_speed(iter/s)": 0.954798 }, { "epoch": 0.7318000194912776, "grad_norm": 0.4333680272102356, "learning_rate": 1.8410922639000983e-06, "loss": 0.01708657667040825, "memory(GiB)": 22.66, "step": 22527, "token_acc": 1.0, "train_speed(iter/s)": 0.954807 }, { "epoch": 0.7318325049540331, "grad_norm": 0.24064545333385468, "learning_rate": 1.8406759099195254e-06, "loss": 0.009439435787498951, "memory(GiB)": 22.66, "step": 22528, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.954816 }, { "epoch": 0.7318649904167885, "grad_norm": 0.5202349424362183, "learning_rate": 1.8402595924017574e-06, "loss": 0.01870090514421463, "memory(GiB)": 22.66, "step": 22529, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954826 }, { "epoch": 0.7318974758795439, "grad_norm": 0.31835493445396423, "learning_rate": 1.8398433113516007e-06, "loss": 0.01757056452333927, "memory(GiB)": 22.66, "step": 22530, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.954834 }, { "epoch": 0.7319299613422994, "grad_norm": 0.20599333941936493, "learning_rate": 1.8394270667738574e-06, "loss": 0.008300317451357841, "memory(GiB)": 22.66, "step": 22531, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954843 }, { "epoch": 0.7319624468050547, "grad_norm": 0.3769054114818573, "learning_rate": 1.8390108586733318e-06, "loss": 0.016587698832154274, "memory(GiB)": 22.66, "step": 22532, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954852 }, { "epoch": 0.7319949322678102, "grad_norm": 0.27971702814102173, "learning_rate": 1.8385946870548282e-06, "loss": 0.012660583481192589, "memory(GiB)": 22.66, "step": 22533, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95486 }, { "epoch": 0.7320274177305656, "grad_norm": 0.2140021026134491, "learning_rate": 1.83817855192315e-06, "loss": 0.009930968284606934, "memory(GiB)": 22.66, "step": 22534, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954869 }, { "epoch": 0.732059903193321, "grad_norm": 0.4118978679180145, "learning_rate": 1.8377624532831011e-06, "loss": 0.014462918043136597, "memory(GiB)": 22.66, "step": 22535, "token_acc": 0.9870967741935484, "train_speed(iter/s)": 0.954878 }, { "epoch": 0.7320923886560764, "grad_norm": 0.4436643123626709, "learning_rate": 1.8373463911394807e-06, "loss": 0.013368623331189156, "memory(GiB)": 22.66, "step": 22536, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.954888 }, { "epoch": 0.7321248741188319, "grad_norm": 0.4637855589389801, "learning_rate": 1.8369303654970928e-06, "loss": 0.019425414502620697, "memory(GiB)": 22.66, "step": 22537, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.954897 }, { "epoch": 0.7321573595815872, "grad_norm": 0.48324424028396606, "learning_rate": 1.836514376360738e-06, "loss": 0.01747666299343109, "memory(GiB)": 22.66, "step": 22538, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.954905 }, { "epoch": 0.7321898450443427, "grad_norm": 0.4182042181491852, "learning_rate": 1.8360984237352197e-06, "loss": 0.01993056945502758, "memory(GiB)": 22.66, "step": 22539, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.954914 }, { "epoch": 0.7322223305070981, "grad_norm": 0.23979270458221436, "learning_rate": 1.8356825076253354e-06, "loss": 0.01149340532720089, "memory(GiB)": 22.66, "step": 22540, "token_acc": 1.0, "train_speed(iter/s)": 0.954922 }, { "epoch": 0.7322548159698535, "grad_norm": 0.23411062359809875, "learning_rate": 1.8352666280358866e-06, "loss": 0.007050956133753061, "memory(GiB)": 22.66, "step": 22541, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.954931 }, { "epoch": 0.7322873014326089, "grad_norm": 0.4645249545574188, "learning_rate": 1.834850784971673e-06, "loss": 0.012336540967226028, "memory(GiB)": 22.66, "step": 22542, "token_acc": 0.992, "train_speed(iter/s)": 0.95494 }, { "epoch": 0.7323197868953644, "grad_norm": 0.394538015127182, "learning_rate": 1.834434978437496e-06, "loss": 0.017337124794721603, "memory(GiB)": 22.66, "step": 22543, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.7323522723581197, "grad_norm": 0.22114086151123047, "learning_rate": 1.8340192084381508e-06, "loss": 0.011247765272855759, "memory(GiB)": 22.66, "step": 22544, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.7323847578208752, "grad_norm": 0.39466699957847595, "learning_rate": 1.8336034749784393e-06, "loss": 0.014490284025669098, "memory(GiB)": 22.66, "step": 22545, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.954967 }, { "epoch": 0.7324172432836306, "grad_norm": 0.3132993280887604, "learning_rate": 1.8331877780631552e-06, "loss": 0.016497552394866943, "memory(GiB)": 22.66, "step": 22546, "token_acc": 1.0, "train_speed(iter/s)": 0.954976 }, { "epoch": 0.732449728746386, "grad_norm": 0.3091214597225189, "learning_rate": 1.8327721176971024e-06, "loss": 0.014565542340278625, "memory(GiB)": 22.66, "step": 22547, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.954985 }, { "epoch": 0.7324822142091414, "grad_norm": 0.5808889865875244, "learning_rate": 1.8323564938850735e-06, "loss": 0.019201330840587616, "memory(GiB)": 22.66, "step": 22548, "token_acc": 1.0, "train_speed(iter/s)": 0.954993 }, { "epoch": 0.7325146996718969, "grad_norm": 0.2751762568950653, "learning_rate": 1.8319409066318683e-06, "loss": 0.01009036973118782, "memory(GiB)": 22.66, "step": 22549, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955002 }, { "epoch": 0.7325471851346522, "grad_norm": 0.20414458215236664, "learning_rate": 1.8315253559422796e-06, "loss": 0.0064684017561376095, "memory(GiB)": 22.66, "step": 22550, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.7325796705974077, "grad_norm": 0.26407143473625183, "learning_rate": 1.8311098418211044e-06, "loss": 0.010990004986524582, "memory(GiB)": 22.66, "step": 22551, "token_acc": 0.9963768115942029, "train_speed(iter/s)": 0.955016 }, { "epoch": 0.7326121560601631, "grad_norm": 0.2803924083709717, "learning_rate": 1.830694364273143e-06, "loss": 0.0075255706906318665, "memory(GiB)": 22.66, "step": 22552, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955022 }, { "epoch": 0.7326446415229185, "grad_norm": 0.3264077603816986, "learning_rate": 1.8302789233031848e-06, "loss": 0.017291387543082237, "memory(GiB)": 22.66, "step": 22553, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.955018 }, { "epoch": 0.7326771269856739, "grad_norm": 0.41675081849098206, "learning_rate": 1.829863518916028e-06, "loss": 0.01675696112215519, "memory(GiB)": 22.66, "step": 22554, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955025 }, { "epoch": 0.7327096124484294, "grad_norm": 0.33335956931114197, "learning_rate": 1.8294481511164646e-06, "loss": 0.013593191280961037, "memory(GiB)": 22.66, "step": 22555, "token_acc": 1.0, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.7327420979111847, "grad_norm": 0.2841649651527405, "learning_rate": 1.8290328199092894e-06, "loss": 0.00836154818534851, "memory(GiB)": 22.66, "step": 22556, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955038 }, { "epoch": 0.7327745833739402, "grad_norm": 0.4498193562030792, "learning_rate": 1.828617525299296e-06, "loss": 0.018429365009069443, "memory(GiB)": 22.66, "step": 22557, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955044 }, { "epoch": 0.7328070688366956, "grad_norm": 0.2735081911087036, "learning_rate": 1.8282022672912792e-06, "loss": 0.013743532821536064, "memory(GiB)": 22.66, "step": 22558, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95505 }, { "epoch": 0.732839554299451, "grad_norm": 0.22204624116420746, "learning_rate": 1.8277870458900281e-06, "loss": 0.007295413874089718, "memory(GiB)": 22.66, "step": 22559, "token_acc": 1.0, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.7328720397622064, "grad_norm": 0.3624192774295807, "learning_rate": 1.8273718611003365e-06, "loss": 0.01867317408323288, "memory(GiB)": 22.66, "step": 22560, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.955063 }, { "epoch": 0.7329045252249619, "grad_norm": 0.2291160374879837, "learning_rate": 1.8269567129269966e-06, "loss": 0.010497735813260078, "memory(GiB)": 22.66, "step": 22561, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955068 }, { "epoch": 0.7329370106877172, "grad_norm": 0.7514859437942505, "learning_rate": 1.8265416013748017e-06, "loss": 0.020717259496450424, "memory(GiB)": 22.66, "step": 22562, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955074 }, { "epoch": 0.7329694961504727, "grad_norm": 0.2413025200366974, "learning_rate": 1.8261265264485379e-06, "loss": 0.010783180594444275, "memory(GiB)": 22.66, "step": 22563, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95508 }, { "epoch": 0.733001981613228, "grad_norm": 0.375857949256897, "learning_rate": 1.8257114881529992e-06, "loss": 0.019746579229831696, "memory(GiB)": 22.66, "step": 22564, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955086 }, { "epoch": 0.7330344670759835, "grad_norm": 0.38036754727363586, "learning_rate": 1.8252964864929745e-06, "loss": 0.014436446130275726, "memory(GiB)": 22.66, "step": 22565, "token_acc": 1.0, "train_speed(iter/s)": 0.955092 }, { "epoch": 0.7330669525387389, "grad_norm": 0.24117545783519745, "learning_rate": 1.8248815214732541e-06, "loss": 0.011152667924761772, "memory(GiB)": 22.66, "step": 22566, "token_acc": 1.0, "train_speed(iter/s)": 0.955098 }, { "epoch": 0.7330994380014944, "grad_norm": 0.46190449595451355, "learning_rate": 1.8244665930986282e-06, "loss": 0.014858195558190346, "memory(GiB)": 22.66, "step": 22567, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.7331319234642497, "grad_norm": 0.43713611364364624, "learning_rate": 1.8240517013738834e-06, "loss": 0.011640602722764015, "memory(GiB)": 22.66, "step": 22568, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95511 }, { "epoch": 0.7331644089270052, "grad_norm": 0.2455446571111679, "learning_rate": 1.823636846303809e-06, "loss": 0.013996977359056473, "memory(GiB)": 22.66, "step": 22569, "token_acc": 0.992, "train_speed(iter/s)": 0.955116 }, { "epoch": 0.7331968943897605, "grad_norm": 0.3853552043437958, "learning_rate": 1.8232220278931928e-06, "loss": 0.019026631489396095, "memory(GiB)": 22.66, "step": 22570, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955123 }, { "epoch": 0.733229379852516, "grad_norm": 0.312310129404068, "learning_rate": 1.8228072461468249e-06, "loss": 0.0098393764346838, "memory(GiB)": 22.66, "step": 22571, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.7332618653152714, "grad_norm": 0.3453346788883209, "learning_rate": 1.8223925010694882e-06, "loss": 0.01244160532951355, "memory(GiB)": 22.66, "step": 22572, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.955137 }, { "epoch": 0.7332943507780268, "grad_norm": 0.3603891432285309, "learning_rate": 1.821977792665973e-06, "loss": 0.012206979095935822, "memory(GiB)": 22.66, "step": 22573, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955145 }, { "epoch": 0.7333268362407822, "grad_norm": 0.30829137563705444, "learning_rate": 1.8215631209410605e-06, "loss": 0.015463139861822128, "memory(GiB)": 22.66, "step": 22574, "token_acc": 1.0, "train_speed(iter/s)": 0.955153 }, { "epoch": 0.7333593217035377, "grad_norm": 0.45241788029670715, "learning_rate": 1.8211484858995437e-06, "loss": 0.012465830892324448, "memory(GiB)": 22.66, "step": 22575, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955158 }, { "epoch": 0.733391807166293, "grad_norm": 0.38614514470100403, "learning_rate": 1.8207338875462026e-06, "loss": 0.01805703155696392, "memory(GiB)": 22.66, "step": 22576, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.955165 }, { "epoch": 0.7334242926290485, "grad_norm": 0.2962944507598877, "learning_rate": 1.8203193258858259e-06, "loss": 0.015763696283102036, "memory(GiB)": 22.66, "step": 22577, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.955172 }, { "epoch": 0.7334567780918039, "grad_norm": 0.27427083253860474, "learning_rate": 1.8199048009231952e-06, "loss": 0.011442638002336025, "memory(GiB)": 22.66, "step": 22578, "token_acc": 1.0, "train_speed(iter/s)": 0.955179 }, { "epoch": 0.7334892635545593, "grad_norm": 0.23683398962020874, "learning_rate": 1.8194903126630948e-06, "loss": 0.011882063001394272, "memory(GiB)": 22.66, "step": 22579, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955187 }, { "epoch": 0.7335217490173147, "grad_norm": 0.30423876643180847, "learning_rate": 1.8190758611103098e-06, "loss": 0.010218159295618534, "memory(GiB)": 22.66, "step": 22580, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.7335542344800702, "grad_norm": 0.44992226362228394, "learning_rate": 1.8186614462696235e-06, "loss": 0.014545863494277, "memory(GiB)": 22.66, "step": 22581, "token_acc": 1.0, "train_speed(iter/s)": 0.955202 }, { "epoch": 0.7335867199428255, "grad_norm": 0.25893905758857727, "learning_rate": 1.8182470681458198e-06, "loss": 0.012334335595369339, "memory(GiB)": 22.66, "step": 22582, "token_acc": 1.0, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.733619205405581, "grad_norm": 0.31231316924095154, "learning_rate": 1.8178327267436779e-06, "loss": 0.011924856342375278, "memory(GiB)": 22.66, "step": 22583, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.95522 }, { "epoch": 0.7336516908683364, "grad_norm": 0.5030552744865417, "learning_rate": 1.8174184220679819e-06, "loss": 0.015906769782304764, "memory(GiB)": 22.66, "step": 22584, "token_acc": 0.99, "train_speed(iter/s)": 0.955229 }, { "epoch": 0.7336841763310918, "grad_norm": 0.34112298488616943, "learning_rate": 1.8170041541235139e-06, "loss": 0.017714954912662506, "memory(GiB)": 22.66, "step": 22585, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.7337166617938472, "grad_norm": 0.285183846950531, "learning_rate": 1.8165899229150558e-06, "loss": 0.010460561141371727, "memory(GiB)": 22.66, "step": 22586, "token_acc": 1.0, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.7337491472566027, "grad_norm": 0.26101255416870117, "learning_rate": 1.8161757284473851e-06, "loss": 0.010950937867164612, "memory(GiB)": 22.66, "step": 22587, "token_acc": 1.0, "train_speed(iter/s)": 0.955256 }, { "epoch": 0.733781632719358, "grad_norm": 0.23810501396656036, "learning_rate": 1.8157615707252846e-06, "loss": 0.01213882490992546, "memory(GiB)": 22.66, "step": 22588, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955265 }, { "epoch": 0.7338141181821135, "grad_norm": 0.35046321153640747, "learning_rate": 1.8153474497535333e-06, "loss": 0.01904572919011116, "memory(GiB)": 22.66, "step": 22589, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.955274 }, { "epoch": 0.7338466036448689, "grad_norm": 0.4192061722278595, "learning_rate": 1.8149333655369134e-06, "loss": 0.02276928350329399, "memory(GiB)": 22.66, "step": 22590, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955283 }, { "epoch": 0.7338790891076243, "grad_norm": 0.3680265247821808, "learning_rate": 1.8145193180802002e-06, "loss": 0.015790976583957672, "memory(GiB)": 22.66, "step": 22591, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955292 }, { "epoch": 0.7339115745703798, "grad_norm": 0.4052661955356598, "learning_rate": 1.8141053073881736e-06, "loss": 0.017825119197368622, "memory(GiB)": 22.66, "step": 22592, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.7339440600331352, "grad_norm": 0.5606715083122253, "learning_rate": 1.813691333465612e-06, "loss": 0.020208191126585007, "memory(GiB)": 22.66, "step": 22593, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.95531 }, { "epoch": 0.7339765454958906, "grad_norm": 0.2979733347892761, "learning_rate": 1.8132773963172956e-06, "loss": 0.013752304017543793, "memory(GiB)": 22.66, "step": 22594, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955319 }, { "epoch": 0.734009030958646, "grad_norm": 0.18555861711502075, "learning_rate": 1.8128634959479974e-06, "loss": 0.006808380596339703, "memory(GiB)": 22.66, "step": 22595, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955328 }, { "epoch": 0.7340415164214015, "grad_norm": 0.36187392473220825, "learning_rate": 1.812449632362497e-06, "loss": 0.0125101488083601, "memory(GiB)": 22.66, "step": 22596, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.7340740018841568, "grad_norm": 0.339464396238327, "learning_rate": 1.8120358055655696e-06, "loss": 0.01859544962644577, "memory(GiB)": 22.66, "step": 22597, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.7341064873469123, "grad_norm": 0.3252469599246979, "learning_rate": 1.8116220155619929e-06, "loss": 0.01202928926795721, "memory(GiB)": 22.66, "step": 22598, "token_acc": 1.0, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.7341389728096677, "grad_norm": 0.4513590931892395, "learning_rate": 1.8112082623565435e-06, "loss": 0.015372715890407562, "memory(GiB)": 22.66, "step": 22599, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.7341714582724231, "grad_norm": 0.3757665753364563, "learning_rate": 1.8107945459539932e-06, "loss": 0.011652318760752678, "memory(GiB)": 22.66, "step": 22600, "token_acc": 1.0, "train_speed(iter/s)": 0.955373 }, { "epoch": 0.7342039437351785, "grad_norm": 0.3366343379020691, "learning_rate": 1.8103808663591205e-06, "loss": 0.009682023897767067, "memory(GiB)": 22.66, "step": 22601, "token_acc": 1.0, "train_speed(iter/s)": 0.955381 }, { "epoch": 0.734236429197934, "grad_norm": 0.39505040645599365, "learning_rate": 1.8099672235766942e-06, "loss": 0.012138532474637032, "memory(GiB)": 22.66, "step": 22602, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.7342689146606893, "grad_norm": 0.4043775498867035, "learning_rate": 1.8095536176114957e-06, "loss": 0.012013150379061699, "memory(GiB)": 22.66, "step": 22603, "token_acc": 1.0, "train_speed(iter/s)": 0.9554 }, { "epoch": 0.7343014001234448, "grad_norm": 0.37858814001083374, "learning_rate": 1.8091400484682932e-06, "loss": 0.013780409470200539, "memory(GiB)": 22.66, "step": 22604, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955409 }, { "epoch": 0.7343338855862002, "grad_norm": 0.31618624925613403, "learning_rate": 1.8087265161518635e-06, "loss": 0.009699560701847076, "memory(GiB)": 22.66, "step": 22605, "token_acc": 0.996551724137931, "train_speed(iter/s)": 0.955418 }, { "epoch": 0.7343663710489556, "grad_norm": 0.2576833665370941, "learning_rate": 1.8083130206669753e-06, "loss": 0.011296683922410011, "memory(GiB)": 22.66, "step": 22606, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.734398856511711, "grad_norm": 0.28803083300590515, "learning_rate": 1.807899562018403e-06, "loss": 0.011548811569809914, "memory(GiB)": 22.66, "step": 22607, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.7344313419744665, "grad_norm": 0.42903339862823486, "learning_rate": 1.807486140210919e-06, "loss": 0.013607967644929886, "memory(GiB)": 22.66, "step": 22608, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.955445 }, { "epoch": 0.7344638274372218, "grad_norm": 0.31649690866470337, "learning_rate": 1.8070727552492951e-06, "loss": 0.011227864772081375, "memory(GiB)": 22.66, "step": 22609, "token_acc": 1.0, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.7344963128999773, "grad_norm": 0.3435647487640381, "learning_rate": 1.8066594071383004e-06, "loss": 0.013790547847747803, "memory(GiB)": 22.66, "step": 22610, "token_acc": 1.0, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.7345287983627327, "grad_norm": 0.23941712081432343, "learning_rate": 1.8062460958827062e-06, "loss": 0.0089158546179533, "memory(GiB)": 22.66, "step": 22611, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.7345612838254881, "grad_norm": 0.41291144490242004, "learning_rate": 1.8058328214872823e-06, "loss": 0.013073750771582127, "memory(GiB)": 22.66, "step": 22612, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955474 }, { "epoch": 0.7345937692882435, "grad_norm": 0.40449681878089905, "learning_rate": 1.8054195839567994e-06, "loss": 0.014718938618898392, "memory(GiB)": 22.66, "step": 22613, "token_acc": 1.0, "train_speed(iter/s)": 0.95548 }, { "epoch": 0.734626254750999, "grad_norm": 0.5106955170631409, "learning_rate": 1.8050063832960285e-06, "loss": 0.01839020848274231, "memory(GiB)": 22.66, "step": 22614, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955487 }, { "epoch": 0.7346587402137543, "grad_norm": 0.31675148010253906, "learning_rate": 1.8045932195097348e-06, "loss": 0.008177945390343666, "memory(GiB)": 22.66, "step": 22615, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955494 }, { "epoch": 0.7346912256765098, "grad_norm": 0.38510361313819885, "learning_rate": 1.804180092602688e-06, "loss": 0.015506982803344727, "memory(GiB)": 22.66, "step": 22616, "token_acc": 0.9869281045751634, "train_speed(iter/s)": 0.955501 }, { "epoch": 0.7347237111392652, "grad_norm": 0.3763982057571411, "learning_rate": 1.8037670025796572e-06, "loss": 0.01670064777135849, "memory(GiB)": 22.66, "step": 22617, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.955507 }, { "epoch": 0.7347561966020206, "grad_norm": 1.9029687643051147, "learning_rate": 1.8033539494454105e-06, "loss": 0.020116368308663368, "memory(GiB)": 22.66, "step": 22618, "token_acc": 1.0, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.734788682064776, "grad_norm": 1.0015796422958374, "learning_rate": 1.8029409332047127e-06, "loss": 0.012806536629796028, "memory(GiB)": 22.66, "step": 22619, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.7348211675275315, "grad_norm": 0.3829190135002136, "learning_rate": 1.8025279538623314e-06, "loss": 0.013983181677758694, "memory(GiB)": 22.66, "step": 22620, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.7348536529902868, "grad_norm": 0.4393423795700073, "learning_rate": 1.8021150114230335e-06, "loss": 0.01682830974459648, "memory(GiB)": 22.66, "step": 22621, "token_acc": 1.0, "train_speed(iter/s)": 0.955532 }, { "epoch": 0.7348861384530423, "grad_norm": 0.4187788963317871, "learning_rate": 1.8017021058915862e-06, "loss": 0.014181090518832207, "memory(GiB)": 22.66, "step": 22622, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955538 }, { "epoch": 0.7349186239157977, "grad_norm": 0.3811560869216919, "learning_rate": 1.8012892372727519e-06, "loss": 0.009110542945563793, "memory(GiB)": 22.66, "step": 22623, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.7349511093785531, "grad_norm": 0.35285794734954834, "learning_rate": 1.8008764055712996e-06, "loss": 0.015125352889299393, "memory(GiB)": 22.66, "step": 22624, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95555 }, { "epoch": 0.7349835948413085, "grad_norm": 0.20181964337825775, "learning_rate": 1.8004636107919875e-06, "loss": 0.007006413768976927, "memory(GiB)": 22.66, "step": 22625, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.735016080304064, "grad_norm": 0.30917564034461975, "learning_rate": 1.8000508529395878e-06, "loss": 0.01362575963139534, "memory(GiB)": 22.66, "step": 22626, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955562 }, { "epoch": 0.7350485657668193, "grad_norm": 0.3660734295845032, "learning_rate": 1.7996381320188583e-06, "loss": 0.01697104051709175, "memory(GiB)": 22.66, "step": 22627, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.7350810512295748, "grad_norm": 0.3018905520439148, "learning_rate": 1.7992254480345645e-06, "loss": 0.01068086177110672, "memory(GiB)": 22.66, "step": 22628, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.7351135366923301, "grad_norm": 0.4314136207103729, "learning_rate": 1.7988128009914708e-06, "loss": 0.012556029483675957, "memory(GiB)": 22.66, "step": 22629, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.7351460221550856, "grad_norm": 0.32727575302124023, "learning_rate": 1.798400190894335e-06, "loss": 0.013863738626241684, "memory(GiB)": 22.66, "step": 22630, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.735178507617841, "grad_norm": 0.3792210817337036, "learning_rate": 1.7979876177479255e-06, "loss": 0.015679337084293365, "memory(GiB)": 22.66, "step": 22631, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955595 }, { "epoch": 0.7352109930805965, "grad_norm": 0.3039546608924866, "learning_rate": 1.7975750815569992e-06, "loss": 0.012890340760350227, "memory(GiB)": 22.66, "step": 22632, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955602 }, { "epoch": 0.7352434785433518, "grad_norm": 0.4057888984680176, "learning_rate": 1.7971625823263206e-06, "loss": 0.015013651922345161, "memory(GiB)": 22.66, "step": 22633, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.7352759640061073, "grad_norm": 0.5217815041542053, "learning_rate": 1.7967501200606474e-06, "loss": 0.023485928773880005, "memory(GiB)": 22.66, "step": 22634, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.7353084494688626, "grad_norm": 0.31045717000961304, "learning_rate": 1.796337694764741e-06, "loss": 0.015638593584299088, "memory(GiB)": 22.66, "step": 22635, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.7353409349316181, "grad_norm": 0.35724323987960815, "learning_rate": 1.7959253064433613e-06, "loss": 0.013395529240369797, "memory(GiB)": 22.66, "step": 22636, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.7353734203943735, "grad_norm": 0.24724388122558594, "learning_rate": 1.7955129551012707e-06, "loss": 0.007811680436134338, "memory(GiB)": 22.66, "step": 22637, "token_acc": 1.0, "train_speed(iter/s)": 0.955637 }, { "epoch": 0.735405905857129, "grad_norm": 0.4245024025440216, "learning_rate": 1.795100640743223e-06, "loss": 0.01188914105296135, "memory(GiB)": 22.66, "step": 22638, "token_acc": 1.0, "train_speed(iter/s)": 0.955644 }, { "epoch": 0.7354383913198843, "grad_norm": 0.3528226315975189, "learning_rate": 1.7946883633739803e-06, "loss": 0.020098909735679626, "memory(GiB)": 22.66, "step": 22639, "token_acc": 1.0, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.7354708767826398, "grad_norm": 0.6211584210395813, "learning_rate": 1.7942761229983002e-06, "loss": 0.020486129447817802, "memory(GiB)": 22.66, "step": 22640, "token_acc": 1.0, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.7355033622453951, "grad_norm": 0.4093378782272339, "learning_rate": 1.7938639196209423e-06, "loss": 0.016613822430372238, "memory(GiB)": 22.66, "step": 22641, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955662 }, { "epoch": 0.7355358477081506, "grad_norm": 0.2989320158958435, "learning_rate": 1.79345175324666e-06, "loss": 0.010243716649711132, "memory(GiB)": 22.66, "step": 22642, "token_acc": 1.0, "train_speed(iter/s)": 0.95567 }, { "epoch": 0.735568333170906, "grad_norm": 0.4253973066806793, "learning_rate": 1.7930396238802122e-06, "loss": 0.01710059493780136, "memory(GiB)": 22.66, "step": 22643, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.7356008186336614, "grad_norm": 0.47808966040611267, "learning_rate": 1.7926275315263558e-06, "loss": 0.016918165609240532, "memory(GiB)": 22.66, "step": 22644, "token_acc": 1.0, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.7356333040964168, "grad_norm": 0.38043013215065, "learning_rate": 1.792215476189847e-06, "loss": 0.01838323101401329, "memory(GiB)": 22.66, "step": 22645, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.7356657895591723, "grad_norm": 0.5592929720878601, "learning_rate": 1.791803457875443e-06, "loss": 0.015708109363913536, "memory(GiB)": 22.66, "step": 22646, "token_acc": 0.992, "train_speed(iter/s)": 0.955702 }, { "epoch": 0.7356982750219276, "grad_norm": 0.2943045496940613, "learning_rate": 1.7913914765878948e-06, "loss": 0.013935399241745472, "memory(GiB)": 22.66, "step": 22647, "token_acc": 1.0, "train_speed(iter/s)": 0.955711 }, { "epoch": 0.7357307604846831, "grad_norm": 0.3987962305545807, "learning_rate": 1.79097953233196e-06, "loss": 0.01930711790919304, "memory(GiB)": 22.66, "step": 22648, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.7357632459474385, "grad_norm": 0.41356924176216125, "learning_rate": 1.7905676251123926e-06, "loss": 0.013824707828462124, "memory(GiB)": 22.66, "step": 22649, "token_acc": 1.0, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.7357957314101939, "grad_norm": 0.41735437512397766, "learning_rate": 1.7901557549339483e-06, "loss": 0.017224757000803947, "memory(GiB)": 22.66, "step": 22650, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955738 }, { "epoch": 0.7358282168729493, "grad_norm": 0.4339565336704254, "learning_rate": 1.7897439218013768e-06, "loss": 0.022395547479391098, "memory(GiB)": 22.66, "step": 22651, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.7358607023357048, "grad_norm": 0.43087923526763916, "learning_rate": 1.7893321257194352e-06, "loss": 0.017670858651399612, "memory(GiB)": 22.66, "step": 22652, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.7358931877984601, "grad_norm": 0.33579298853874207, "learning_rate": 1.7889203666928712e-06, "loss": 0.00882706604897976, "memory(GiB)": 22.66, "step": 22653, "token_acc": 1.0, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.7359256732612156, "grad_norm": 0.4596349000930786, "learning_rate": 1.7885086447264433e-06, "loss": 0.016203254461288452, "memory(GiB)": 22.66, "step": 22654, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.955773 }, { "epoch": 0.735958158723971, "grad_norm": 0.3479527533054352, "learning_rate": 1.7880969598248982e-06, "loss": 0.019120551645755768, "memory(GiB)": 22.66, "step": 22655, "token_acc": 1.0, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.7359906441867264, "grad_norm": 0.4889332056045532, "learning_rate": 1.787685311992991e-06, "loss": 0.018607746809720993, "memory(GiB)": 22.66, "step": 22656, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.7360231296494819, "grad_norm": 0.3653312921524048, "learning_rate": 1.787273701235469e-06, "loss": 0.014589881524443626, "memory(GiB)": 22.66, "step": 22657, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.7360556151122373, "grad_norm": 0.4517180025577545, "learning_rate": 1.7868621275570836e-06, "loss": 0.014233469031751156, "memory(GiB)": 22.66, "step": 22658, "token_acc": 1.0, "train_speed(iter/s)": 0.955808 }, { "epoch": 0.7360881005749927, "grad_norm": 0.5440109372138977, "learning_rate": 1.7864505909625884e-06, "loss": 0.011844418942928314, "memory(GiB)": 22.66, "step": 22659, "token_acc": 1.0, "train_speed(iter/s)": 0.955817 }, { "epoch": 0.7361205860377481, "grad_norm": 0.4527926445007324, "learning_rate": 1.7860390914567299e-06, "loss": 0.016868717968463898, "memory(GiB)": 22.66, "step": 22660, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.7361530715005036, "grad_norm": 0.4207506477832794, "learning_rate": 1.7856276290442592e-06, "loss": 0.015887971967458725, "memory(GiB)": 22.66, "step": 22661, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.7361855569632589, "grad_norm": 0.4177030026912689, "learning_rate": 1.785216203729922e-06, "loss": 0.019045092165470123, "memory(GiB)": 22.66, "step": 22662, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.7362180424260144, "grad_norm": 0.2658521831035614, "learning_rate": 1.7848048155184693e-06, "loss": 0.007613185793161392, "memory(GiB)": 22.66, "step": 22663, "token_acc": 1.0, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.7362505278887698, "grad_norm": 0.3299392759799957, "learning_rate": 1.7843934644146483e-06, "loss": 0.011051909998059273, "memory(GiB)": 22.66, "step": 22664, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.7362830133515252, "grad_norm": 0.4717092216014862, "learning_rate": 1.783982150423208e-06, "loss": 0.019914422184228897, "memory(GiB)": 22.66, "step": 22665, "token_acc": 1.0, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.7363154988142806, "grad_norm": 0.5063430666923523, "learning_rate": 1.7835708735488927e-06, "loss": 0.021877840161323547, "memory(GiB)": 22.66, "step": 22666, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955879 }, { "epoch": 0.7363479842770361, "grad_norm": 0.36408355832099915, "learning_rate": 1.7831596337964508e-06, "loss": 0.013535896316170692, "memory(GiB)": 22.66, "step": 22667, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955888 }, { "epoch": 0.7363804697397914, "grad_norm": 0.3466266989707947, "learning_rate": 1.7827484311706277e-06, "loss": 0.012602920643985271, "memory(GiB)": 22.66, "step": 22668, "token_acc": 1.0, "train_speed(iter/s)": 0.955897 }, { "epoch": 0.7364129552025469, "grad_norm": 0.2426101416349411, "learning_rate": 1.7823372656761723e-06, "loss": 0.0077171847224235535, "memory(GiB)": 22.66, "step": 22669, "token_acc": 1.0, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.7364454406653023, "grad_norm": 0.34647589921951294, "learning_rate": 1.7819261373178253e-06, "loss": 0.015359839424490929, "memory(GiB)": 22.66, "step": 22670, "token_acc": 1.0, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.7364779261280577, "grad_norm": 0.42615360021591187, "learning_rate": 1.7815150461003344e-06, "loss": 0.017289524897933006, "memory(GiB)": 22.66, "step": 22671, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.7365104115908131, "grad_norm": 0.3596995770931244, "learning_rate": 1.7811039920284434e-06, "loss": 0.014598149806261063, "memory(GiB)": 22.66, "step": 22672, "token_acc": 1.0, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.7365428970535686, "grad_norm": 0.4914941191673279, "learning_rate": 1.7806929751068968e-06, "loss": 0.014518704265356064, "memory(GiB)": 22.66, "step": 22673, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955931 }, { "epoch": 0.7365753825163239, "grad_norm": 0.4968380331993103, "learning_rate": 1.7802819953404394e-06, "loss": 0.02166985347867012, "memory(GiB)": 22.66, "step": 22674, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.7366078679790794, "grad_norm": 0.3155771791934967, "learning_rate": 1.779871052733812e-06, "loss": 0.010589148849248886, "memory(GiB)": 22.66, "step": 22675, "token_acc": 1.0, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.7366403534418348, "grad_norm": 0.23171834647655487, "learning_rate": 1.7794601472917584e-06, "loss": 0.009459069930016994, "memory(GiB)": 22.66, "step": 22676, "token_acc": 1.0, "train_speed(iter/s)": 0.955951 }, { "epoch": 0.7366728389045902, "grad_norm": 0.348491907119751, "learning_rate": 1.7790492790190218e-06, "loss": 0.013018813915550709, "memory(GiB)": 22.66, "step": 22677, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.7367053243673456, "grad_norm": 0.32840463519096375, "learning_rate": 1.7786384479203445e-06, "loss": 0.010060617700219154, "memory(GiB)": 22.66, "step": 22678, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.7367378098301011, "grad_norm": 0.5454059839248657, "learning_rate": 1.7782276540004656e-06, "loss": 0.017945654690265656, "memory(GiB)": 22.66, "step": 22679, "token_acc": 1.0, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.7367702952928564, "grad_norm": 0.423113077878952, "learning_rate": 1.7778168972641296e-06, "loss": 0.018093079328536987, "memory(GiB)": 22.66, "step": 22680, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955978 }, { "epoch": 0.7368027807556119, "grad_norm": 0.2614256739616394, "learning_rate": 1.7774061777160723e-06, "loss": 0.009030135348439217, "memory(GiB)": 22.66, "step": 22681, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.955983 }, { "epoch": 0.7368352662183673, "grad_norm": 0.3134094178676605, "learning_rate": 1.7769954953610397e-06, "loss": 0.01908630132675171, "memory(GiB)": 22.66, "step": 22682, "token_acc": 0.9842105263157894, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.7368677516811227, "grad_norm": 0.2063082605600357, "learning_rate": 1.7765848502037675e-06, "loss": 0.007001140620559454, "memory(GiB)": 22.66, "step": 22683, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955995 }, { "epoch": 0.7369002371438781, "grad_norm": 0.3974290192127228, "learning_rate": 1.776174242248998e-06, "loss": 0.014261703938245773, "memory(GiB)": 22.66, "step": 22684, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.956001 }, { "epoch": 0.7369327226066336, "grad_norm": 6.24434232711792, "learning_rate": 1.7757636715014675e-06, "loss": 0.020592600107192993, "memory(GiB)": 22.66, "step": 22685, "token_acc": 1.0, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.7369652080693889, "grad_norm": 0.3293308913707733, "learning_rate": 1.775353137965915e-06, "loss": 0.012638280168175697, "memory(GiB)": 22.66, "step": 22686, "token_acc": 0.9967845659163987, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.7369976935321444, "grad_norm": 0.27524200081825256, "learning_rate": 1.77494264164708e-06, "loss": 0.013250863179564476, "memory(GiB)": 22.66, "step": 22687, "token_acc": 1.0, "train_speed(iter/s)": 0.956018 }, { "epoch": 0.7370301789948998, "grad_norm": 0.42046499252319336, "learning_rate": 1.7745321825497002e-06, "loss": 0.018853789195418358, "memory(GiB)": 22.66, "step": 22688, "token_acc": 0.9728260869565217, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.7370626644576552, "grad_norm": 0.3884243071079254, "learning_rate": 1.7741217606785111e-06, "loss": 0.014768596738576889, "memory(GiB)": 22.66, "step": 22689, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956032 }, { "epoch": 0.7370951499204106, "grad_norm": 0.2119312584400177, "learning_rate": 1.7737113760382508e-06, "loss": 0.008018536493182182, "memory(GiB)": 22.66, "step": 22690, "token_acc": 1.0, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.7371276353831661, "grad_norm": 0.31288766860961914, "learning_rate": 1.7733010286336543e-06, "loss": 0.008596532046794891, "memory(GiB)": 22.66, "step": 22691, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.956045 }, { "epoch": 0.7371601208459214, "grad_norm": 0.3544699251651764, "learning_rate": 1.7728907184694593e-06, "loss": 0.011336165480315685, "memory(GiB)": 22.66, "step": 22692, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956053 }, { "epoch": 0.7371926063086769, "grad_norm": 0.37455257773399353, "learning_rate": 1.7724804455504025e-06, "loss": 0.012656690552830696, "memory(GiB)": 22.66, "step": 22693, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956059 }, { "epoch": 0.7372250917714323, "grad_norm": 0.40419942140579224, "learning_rate": 1.7720702098812154e-06, "loss": 0.017000149935483932, "memory(GiB)": 22.66, "step": 22694, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.7372575772341877, "grad_norm": 0.37353530526161194, "learning_rate": 1.7716600114666343e-06, "loss": 0.01812662184238434, "memory(GiB)": 22.66, "step": 22695, "token_acc": 0.9965870307167235, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.7372900626969431, "grad_norm": 0.41280773282051086, "learning_rate": 1.771249850311394e-06, "loss": 0.015646008774638176, "memory(GiB)": 22.66, "step": 22696, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.95608 }, { "epoch": 0.7373225481596986, "grad_norm": 0.3744158446788788, "learning_rate": 1.7708397264202293e-06, "loss": 0.012578779831528664, "memory(GiB)": 22.66, "step": 22697, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.7373550336224539, "grad_norm": 0.44558101892471313, "learning_rate": 1.7704296397978704e-06, "loss": 0.019422397017478943, "memory(GiB)": 22.66, "step": 22698, "token_acc": 1.0, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.7373875190852094, "grad_norm": 0.27176186442375183, "learning_rate": 1.7700195904490519e-06, "loss": 0.013186413794755936, "memory(GiB)": 22.66, "step": 22699, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956101 }, { "epoch": 0.7374200045479647, "grad_norm": 0.33617061376571655, "learning_rate": 1.7696095783785066e-06, "loss": 0.016803303733468056, "memory(GiB)": 22.66, "step": 22700, "token_acc": 1.0, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.7374524900107202, "grad_norm": 0.3930145502090454, "learning_rate": 1.7691996035909682e-06, "loss": 0.016725748777389526, "memory(GiB)": 22.66, "step": 22701, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.7374849754734756, "grad_norm": 0.465628981590271, "learning_rate": 1.7687896660911646e-06, "loss": 0.01976395584642887, "memory(GiB)": 22.66, "step": 22702, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95612 }, { "epoch": 0.737517460936231, "grad_norm": 0.29313594102859497, "learning_rate": 1.7683797658838309e-06, "loss": 0.013448662124574184, "memory(GiB)": 22.66, "step": 22703, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956127 }, { "epoch": 0.7375499463989864, "grad_norm": 1.9338937997817993, "learning_rate": 1.7679699029736925e-06, "loss": 0.016598636284470558, "memory(GiB)": 22.66, "step": 22704, "token_acc": 1.0, "train_speed(iter/s)": 0.956134 }, { "epoch": 0.7375824318617419, "grad_norm": 0.33248257637023926, "learning_rate": 1.767560077365485e-06, "loss": 0.012820061296224594, "memory(GiB)": 22.66, "step": 22705, "token_acc": 1.0, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.7376149173244972, "grad_norm": 0.2791685461997986, "learning_rate": 1.767150289063938e-06, "loss": 0.011223752051591873, "memory(GiB)": 22.66, "step": 22706, "token_acc": 1.0, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.7376474027872527, "grad_norm": 0.2685267925262451, "learning_rate": 1.7667405380737785e-06, "loss": 0.011919433251023293, "memory(GiB)": 22.66, "step": 22707, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.7376798882500081, "grad_norm": 0.3947487473487854, "learning_rate": 1.7663308243997379e-06, "loss": 0.010770121589303017, "memory(GiB)": 22.66, "step": 22708, "token_acc": 1.0, "train_speed(iter/s)": 0.956166 }, { "epoch": 0.7377123737127635, "grad_norm": 0.18639734387397766, "learning_rate": 1.76592114804654e-06, "loss": 0.00780277606099844, "memory(GiB)": 22.66, "step": 22709, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.7377448591755189, "grad_norm": 0.18852700293064117, "learning_rate": 1.7655115090189202e-06, "loss": 0.008584555238485336, "memory(GiB)": 22.66, "step": 22710, "token_acc": 1.0, "train_speed(iter/s)": 0.956184 }, { "epoch": 0.7377773446382744, "grad_norm": 0.3549773693084717, "learning_rate": 1.7651019073216007e-06, "loss": 0.00883512757718563, "memory(GiB)": 22.66, "step": 22711, "token_acc": 1.0, "train_speed(iter/s)": 0.956192 }, { "epoch": 0.7378098301010297, "grad_norm": 0.26293355226516724, "learning_rate": 1.7646923429593133e-06, "loss": 0.010832589119672775, "memory(GiB)": 22.66, "step": 22712, "token_acc": 0.992, "train_speed(iter/s)": 0.956201 }, { "epoch": 0.7378423155637852, "grad_norm": 0.2552224099636078, "learning_rate": 1.7642828159367803e-06, "loss": 0.009396695531904697, "memory(GiB)": 22.66, "step": 22713, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.7378748010265406, "grad_norm": 0.2670845091342926, "learning_rate": 1.7638733262587298e-06, "loss": 0.012504007667303085, "memory(GiB)": 22.66, "step": 22714, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.737907286489296, "grad_norm": 0.5020556449890137, "learning_rate": 1.7634638739298893e-06, "loss": 0.01784422993659973, "memory(GiB)": 22.66, "step": 22715, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.7379397719520514, "grad_norm": 0.263723760843277, "learning_rate": 1.7630544589549846e-06, "loss": 0.010911591351032257, "memory(GiB)": 22.66, "step": 22716, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.7379722574148069, "grad_norm": 0.34581711888313293, "learning_rate": 1.7626450813387387e-06, "loss": 0.011280396953225136, "memory(GiB)": 22.66, "step": 22717, "token_acc": 1.0, "train_speed(iter/s)": 0.956246 }, { "epoch": 0.7380047428775622, "grad_norm": 0.34537652134895325, "learning_rate": 1.762235741085877e-06, "loss": 0.012027943506836891, "memory(GiB)": 22.66, "step": 22718, "token_acc": 0.9865771812080537, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.7380372283403177, "grad_norm": 0.38952067494392395, "learning_rate": 1.7618264382011242e-06, "loss": 0.010390120558440685, "memory(GiB)": 22.66, "step": 22719, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.7380697138030732, "grad_norm": 0.3011835813522339, "learning_rate": 1.7614171726892043e-06, "loss": 0.010858241468667984, "memory(GiB)": 22.66, "step": 22720, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.7381021992658285, "grad_norm": 0.43170830607414246, "learning_rate": 1.7610079445548428e-06, "loss": 0.012496902607381344, "memory(GiB)": 22.66, "step": 22721, "token_acc": 0.996, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.738134684728584, "grad_norm": 0.3569030165672302, "learning_rate": 1.7605987538027591e-06, "loss": 0.013059061020612717, "memory(GiB)": 22.66, "step": 22722, "token_acc": 1.0, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.7381671701913394, "grad_norm": 0.5020532011985779, "learning_rate": 1.7601896004376772e-06, "loss": 0.022128067910671234, "memory(GiB)": 22.66, "step": 22723, "token_acc": 1.0, "train_speed(iter/s)": 0.956298 }, { "epoch": 0.7381996556540948, "grad_norm": 0.2594224810600281, "learning_rate": 1.7597804844643195e-06, "loss": 0.012525907717645168, "memory(GiB)": 22.66, "step": 22724, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956306 }, { "epoch": 0.7382321411168502, "grad_norm": 0.3345361053943634, "learning_rate": 1.7593714058874095e-06, "loss": 0.013670634478330612, "memory(GiB)": 22.66, "step": 22725, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.7382646265796057, "grad_norm": 0.33956801891326904, "learning_rate": 1.7589623647116649e-06, "loss": 0.011122506111860275, "memory(GiB)": 22.66, "step": 22726, "token_acc": 1.0, "train_speed(iter/s)": 0.956324 }, { "epoch": 0.738297112042361, "grad_norm": 0.6169209480285645, "learning_rate": 1.758553360941808e-06, "loss": 0.011653535068035126, "memory(GiB)": 22.66, "step": 22727, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.7383295975051165, "grad_norm": 0.22998012602329254, "learning_rate": 1.7581443945825605e-06, "loss": 0.010400986298918724, "memory(GiB)": 22.66, "step": 22728, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.7383620829678719, "grad_norm": 0.3007046580314636, "learning_rate": 1.757735465638643e-06, "loss": 0.014384747482836246, "memory(GiB)": 22.66, "step": 22729, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.7383945684306273, "grad_norm": 0.4951635003089905, "learning_rate": 1.757326574114772e-06, "loss": 0.01527705229818821, "memory(GiB)": 22.66, "step": 22730, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956355 }, { "epoch": 0.7384270538933827, "grad_norm": 0.3496987819671631, "learning_rate": 1.75691772001567e-06, "loss": 0.01332116313278675, "memory(GiB)": 22.66, "step": 22731, "token_acc": 1.0, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.7384595393561382, "grad_norm": 0.4376389980316162, "learning_rate": 1.7565089033460503e-06, "loss": 0.016576509922742844, "memory(GiB)": 22.66, "step": 22732, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.7384920248188935, "grad_norm": 0.34950748085975647, "learning_rate": 1.7561001241106384e-06, "loss": 0.014200558885931969, "memory(GiB)": 22.66, "step": 22733, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.738524510281649, "grad_norm": 0.4764019250869751, "learning_rate": 1.7556913823141474e-06, "loss": 0.017431531101465225, "memory(GiB)": 22.66, "step": 22734, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956383 }, { "epoch": 0.7385569957444044, "grad_norm": 0.33475983142852783, "learning_rate": 1.7552826779612959e-06, "loss": 0.013026763685047626, "memory(GiB)": 22.66, "step": 22735, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.7385894812071598, "grad_norm": 0.8467816710472107, "learning_rate": 1.7548740110568024e-06, "loss": 0.015899699181318283, "memory(GiB)": 22.66, "step": 22736, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.7386219666699152, "grad_norm": 0.4471604526042938, "learning_rate": 1.754465381605379e-06, "loss": 0.015468265861272812, "memory(GiB)": 22.66, "step": 22737, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.7386544521326707, "grad_norm": 0.4290892481803894, "learning_rate": 1.7540567896117483e-06, "loss": 0.02195584401488304, "memory(GiB)": 22.66, "step": 22738, "token_acc": 0.995, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.738686937595426, "grad_norm": 0.39564505219459534, "learning_rate": 1.7536482350806206e-06, "loss": 0.015537519007921219, "memory(GiB)": 22.66, "step": 22739, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.7387194230581815, "grad_norm": 0.34604397416114807, "learning_rate": 1.7532397180167154e-06, "loss": 0.011869463138282299, "memory(GiB)": 22.66, "step": 22740, "token_acc": 1.0, "train_speed(iter/s)": 0.956423 }, { "epoch": 0.7387519085209369, "grad_norm": 0.5097078680992126, "learning_rate": 1.7528312384247432e-06, "loss": 0.01579441875219345, "memory(GiB)": 22.66, "step": 22741, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.7387843939836923, "grad_norm": 0.3966982960700989, "learning_rate": 1.7524227963094203e-06, "loss": 0.019249578937888145, "memory(GiB)": 22.66, "step": 22742, "token_acc": 1.0, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.7388168794464477, "grad_norm": 0.42297908663749695, "learning_rate": 1.7520143916754613e-06, "loss": 0.01637955754995346, "memory(GiB)": 22.66, "step": 22743, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.7388493649092032, "grad_norm": 0.3764784634113312, "learning_rate": 1.7516060245275801e-06, "loss": 0.018068615347146988, "memory(GiB)": 22.66, "step": 22744, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.7388818503719585, "grad_norm": 0.5617127418518066, "learning_rate": 1.7511976948704883e-06, "loss": 0.017911149188876152, "memory(GiB)": 22.66, "step": 22745, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.738914335834714, "grad_norm": 0.42648422718048096, "learning_rate": 1.750789402708899e-06, "loss": 0.016056159511208534, "memory(GiB)": 22.66, "step": 22746, "token_acc": 0.9789915966386554, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.7389468212974694, "grad_norm": 0.30573949217796326, "learning_rate": 1.750381148047524e-06, "loss": 0.010640013962984085, "memory(GiB)": 22.66, "step": 22747, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956468 }, { "epoch": 0.7389793067602248, "grad_norm": 0.47814062237739563, "learning_rate": 1.7499729308910785e-06, "loss": 0.015087475068867207, "memory(GiB)": 22.66, "step": 22748, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.7390117922229802, "grad_norm": 0.38187533617019653, "learning_rate": 1.7495647512442687e-06, "loss": 0.01666383445262909, "memory(GiB)": 22.66, "step": 22749, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.7390442776857357, "grad_norm": 0.38078540563583374, "learning_rate": 1.7491566091118083e-06, "loss": 0.011712716892361641, "memory(GiB)": 22.66, "step": 22750, "token_acc": 1.0, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.739076763148491, "grad_norm": 0.5152083039283752, "learning_rate": 1.7487485044984076e-06, "loss": 0.020135097205638885, "memory(GiB)": 22.66, "step": 22751, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.7391092486112465, "grad_norm": 0.2933247983455658, "learning_rate": 1.7483404374087759e-06, "loss": 0.010727087035775185, "memory(GiB)": 22.66, "step": 22752, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.7391417340740019, "grad_norm": 0.1718449890613556, "learning_rate": 1.7479324078476262e-06, "loss": 0.008078649640083313, "memory(GiB)": 22.66, "step": 22753, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.7391742195367573, "grad_norm": 0.2847881615161896, "learning_rate": 1.7475244158196625e-06, "loss": 0.013891929760575294, "memory(GiB)": 22.66, "step": 22754, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.7392067049995127, "grad_norm": 0.22864966094493866, "learning_rate": 1.7471164613295965e-06, "loss": 0.008586307056248188, "memory(GiB)": 22.66, "step": 22755, "token_acc": 1.0, "train_speed(iter/s)": 0.956522 }, { "epoch": 0.7392391904622682, "grad_norm": 0.2715555429458618, "learning_rate": 1.7467085443821358e-06, "loss": 0.011315379291772842, "memory(GiB)": 22.66, "step": 22756, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.7392716759250235, "grad_norm": 0.5141837000846863, "learning_rate": 1.746300664981991e-06, "loss": 0.0157985407859087, "memory(GiB)": 22.66, "step": 22757, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.739304161387779, "grad_norm": 0.3358077108860016, "learning_rate": 1.7458928231338644e-06, "loss": 0.015022188425064087, "memory(GiB)": 22.66, "step": 22758, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.7393366468505344, "grad_norm": 1.1341485977172852, "learning_rate": 1.7454850188424682e-06, "loss": 0.016318872570991516, "memory(GiB)": 22.66, "step": 22759, "token_acc": 0.992, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.7393691323132898, "grad_norm": 0.3525022566318512, "learning_rate": 1.745077252112503e-06, "loss": 0.015009990893304348, "memory(GiB)": 22.66, "step": 22760, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.7394016177760452, "grad_norm": 0.34965524077415466, "learning_rate": 1.7446695229486826e-06, "loss": 0.012274988926947117, "memory(GiB)": 22.66, "step": 22761, "token_acc": 1.0, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.7394341032388007, "grad_norm": 0.4603964686393738, "learning_rate": 1.7442618313557063e-06, "loss": 0.021247629076242447, "memory(GiB)": 22.66, "step": 22762, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.739466588701556, "grad_norm": 0.25239846110343933, "learning_rate": 1.7438541773382838e-06, "loss": 0.012464722618460655, "memory(GiB)": 22.66, "step": 22763, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.7394990741643115, "grad_norm": 0.2999391555786133, "learning_rate": 1.7434465609011148e-06, "loss": 0.011825564317405224, "memory(GiB)": 22.66, "step": 22764, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.7395315596270668, "grad_norm": 0.33326008915901184, "learning_rate": 1.7430389820489103e-06, "loss": 0.012234979309141636, "memory(GiB)": 22.66, "step": 22765, "token_acc": 1.0, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.7395640450898223, "grad_norm": 0.33157360553741455, "learning_rate": 1.7426314407863693e-06, "loss": 0.012172429822385311, "memory(GiB)": 22.66, "step": 22766, "token_acc": 1.0, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.7395965305525777, "grad_norm": 0.41250041127204895, "learning_rate": 1.742223937118197e-06, "loss": 0.012967301532626152, "memory(GiB)": 22.66, "step": 22767, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.7396290160153332, "grad_norm": 0.25491395592689514, "learning_rate": 1.7418164710490982e-06, "loss": 0.010853535495698452, "memory(GiB)": 22.66, "step": 22768, "token_acc": 1.0, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.7396615014780885, "grad_norm": 0.33544033765792847, "learning_rate": 1.7414090425837704e-06, "loss": 0.01208093948662281, "memory(GiB)": 22.66, "step": 22769, "token_acc": 0.99, "train_speed(iter/s)": 0.956619 }, { "epoch": 0.739693986940844, "grad_norm": 0.3600054681301117, "learning_rate": 1.741001651726923e-06, "loss": 0.01812300831079483, "memory(GiB)": 22.66, "step": 22770, "token_acc": 0.9776785714285714, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.7397264724035993, "grad_norm": 0.4166337847709656, "learning_rate": 1.7405942984832524e-06, "loss": 0.01471031829714775, "memory(GiB)": 22.66, "step": 22771, "token_acc": 1.0, "train_speed(iter/s)": 0.956636 }, { "epoch": 0.7397589578663548, "grad_norm": 0.22295130789279938, "learning_rate": 1.7401869828574636e-06, "loss": 0.008715683594346046, "memory(GiB)": 22.66, "step": 22772, "token_acc": 1.0, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.7397914433291102, "grad_norm": 0.39128440618515015, "learning_rate": 1.7397797048542542e-06, "loss": 0.01634475775063038, "memory(GiB)": 22.66, "step": 22773, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956654 }, { "epoch": 0.7398239287918656, "grad_norm": 0.2902112305164337, "learning_rate": 1.7393724644783262e-06, "loss": 0.016433794051408768, "memory(GiB)": 22.66, "step": 22774, "token_acc": 1.0, "train_speed(iter/s)": 0.956663 }, { "epoch": 0.739856414254621, "grad_norm": 0.38233640789985657, "learning_rate": 1.73896526173438e-06, "loss": 0.021663248538970947, "memory(GiB)": 22.66, "step": 22775, "token_acc": 1.0, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.7398888997173765, "grad_norm": 0.41788461804389954, "learning_rate": 1.7385580966271165e-06, "loss": 0.012073086574673653, "memory(GiB)": 22.66, "step": 22776, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.7399213851801318, "grad_norm": 0.31799888610839844, "learning_rate": 1.7381509691612326e-06, "loss": 0.01642187498509884, "memory(GiB)": 22.66, "step": 22777, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.7399538706428873, "grad_norm": 0.3930410146713257, "learning_rate": 1.737743879341427e-06, "loss": 0.01883666217327118, "memory(GiB)": 22.66, "step": 22778, "token_acc": 1.0, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.7399863561056427, "grad_norm": 0.2281368225812912, "learning_rate": 1.7373368271724e-06, "loss": 0.009307995438575745, "memory(GiB)": 22.66, "step": 22779, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.7400188415683981, "grad_norm": 0.2836903929710388, "learning_rate": 1.7369298126588497e-06, "loss": 0.009803831577301025, "memory(GiB)": 22.66, "step": 22780, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.7400513270311535, "grad_norm": 0.34598201513290405, "learning_rate": 1.736522835805471e-06, "loss": 0.015957828611135483, "memory(GiB)": 22.66, "step": 22781, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956726 }, { "epoch": 0.740083812493909, "grad_norm": 0.30051764845848083, "learning_rate": 1.7361158966169628e-06, "loss": 0.01048277784138918, "memory(GiB)": 22.66, "step": 22782, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.7401162979566643, "grad_norm": 0.2998785376548767, "learning_rate": 1.735708995098021e-06, "loss": 0.009807148948311806, "memory(GiB)": 22.66, "step": 22783, "token_acc": 0.996, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.7401487834194198, "grad_norm": 0.4569750726222992, "learning_rate": 1.735302131253342e-06, "loss": 0.014915184117853642, "memory(GiB)": 22.66, "step": 22784, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.7401812688821753, "grad_norm": 0.2129707932472229, "learning_rate": 1.7348953050876238e-06, "loss": 0.011275485157966614, "memory(GiB)": 22.66, "step": 22785, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.7402137543449306, "grad_norm": 0.43122386932373047, "learning_rate": 1.7344885166055575e-06, "loss": 0.026278439909219742, "memory(GiB)": 22.66, "step": 22786, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.7402462398076861, "grad_norm": 0.3317125737667084, "learning_rate": 1.7340817658118403e-06, "loss": 0.016646791249513626, "memory(GiB)": 22.66, "step": 22787, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.7402787252704415, "grad_norm": 0.3848678469657898, "learning_rate": 1.7336750527111667e-06, "loss": 0.01962696574628353, "memory(GiB)": 22.66, "step": 22788, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.740311210733197, "grad_norm": 0.7663529515266418, "learning_rate": 1.7332683773082314e-06, "loss": 0.011750208213925362, "memory(GiB)": 22.66, "step": 22789, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.7403436961959523, "grad_norm": 0.3867231607437134, "learning_rate": 1.7328617396077262e-06, "loss": 0.01649429276585579, "memory(GiB)": 22.66, "step": 22790, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.9568 }, { "epoch": 0.7403761816587078, "grad_norm": 0.447811096906662, "learning_rate": 1.7324551396143464e-06, "loss": 0.01592763140797615, "memory(GiB)": 22.66, "step": 22791, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.7404086671214631, "grad_norm": 0.27500244975090027, "learning_rate": 1.73204857733278e-06, "loss": 0.009705903008580208, "memory(GiB)": 22.66, "step": 22792, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956815 }, { "epoch": 0.7404411525842186, "grad_norm": 0.32869216799736023, "learning_rate": 1.7316420527677263e-06, "loss": 0.01197165995836258, "memory(GiB)": 22.66, "step": 22793, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956821 }, { "epoch": 0.740473638046974, "grad_norm": 0.33055055141448975, "learning_rate": 1.7312355659238717e-06, "loss": 0.014898051507771015, "memory(GiB)": 22.66, "step": 22794, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956827 }, { "epoch": 0.7405061235097294, "grad_norm": 0.22348035871982574, "learning_rate": 1.730829116805911e-06, "loss": 0.009465353563427925, "memory(GiB)": 22.66, "step": 22795, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956833 }, { "epoch": 0.7405386089724848, "grad_norm": 0.2819519639015198, "learning_rate": 1.7304227054185313e-06, "loss": 0.010145274922251701, "memory(GiB)": 22.66, "step": 22796, "token_acc": 1.0, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.7405710944352403, "grad_norm": 0.4200437068939209, "learning_rate": 1.7300163317664242e-06, "loss": 0.01463289838284254, "memory(GiB)": 22.66, "step": 22797, "token_acc": 0.992, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.7406035798979956, "grad_norm": 0.26352784037590027, "learning_rate": 1.7296099958542845e-06, "loss": 0.010114019736647606, "memory(GiB)": 22.66, "step": 22798, "token_acc": 1.0, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.7406360653607511, "grad_norm": 0.3358786702156067, "learning_rate": 1.7292036976867966e-06, "loss": 0.015422872267663479, "memory(GiB)": 22.66, "step": 22799, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.7406685508235065, "grad_norm": 0.5366037487983704, "learning_rate": 1.7287974372686528e-06, "loss": 0.018672239035367966, "memory(GiB)": 22.66, "step": 22800, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.7407010362862619, "grad_norm": 0.33369261026382446, "learning_rate": 1.7283912146045394e-06, "loss": 0.012120652012526989, "memory(GiB)": 22.66, "step": 22801, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956875 }, { "epoch": 0.7407335217490173, "grad_norm": 0.24826636910438538, "learning_rate": 1.7279850296991457e-06, "loss": 0.009252489544451237, "memory(GiB)": 22.66, "step": 22802, "token_acc": 1.0, "train_speed(iter/s)": 0.956882 }, { "epoch": 0.7407660072117728, "grad_norm": 0.4491424560546875, "learning_rate": 1.7275788825571593e-06, "loss": 0.013458909466862679, "memory(GiB)": 22.66, "step": 22803, "token_acc": 1.0, "train_speed(iter/s)": 0.956889 }, { "epoch": 0.7407984926745281, "grad_norm": 0.29138287901878357, "learning_rate": 1.7271727731832704e-06, "loss": 0.008441624231636524, "memory(GiB)": 22.66, "step": 22804, "token_acc": 1.0, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.7408309781372836, "grad_norm": 0.5622960329055786, "learning_rate": 1.726766701582162e-06, "loss": 0.02636653557419777, "memory(GiB)": 22.66, "step": 22805, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.740863463600039, "grad_norm": 0.3040927052497864, "learning_rate": 1.7263606677585227e-06, "loss": 0.014453530311584473, "memory(GiB)": 22.66, "step": 22806, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95691 }, { "epoch": 0.7408959490627944, "grad_norm": 0.2924739718437195, "learning_rate": 1.7259546717170378e-06, "loss": 0.016269803047180176, "memory(GiB)": 22.66, "step": 22807, "token_acc": 0.9788135593220338, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.7409284345255498, "grad_norm": 0.38660967350006104, "learning_rate": 1.725548713462396e-06, "loss": 0.020092817023396492, "memory(GiB)": 22.66, "step": 22808, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.7409609199883053, "grad_norm": 0.3689339756965637, "learning_rate": 1.7251427929992787e-06, "loss": 0.014560741372406483, "memory(GiB)": 22.66, "step": 22809, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956929 }, { "epoch": 0.7409934054510606, "grad_norm": 0.3450850248336792, "learning_rate": 1.724736910332372e-06, "loss": 0.01355062983930111, "memory(GiB)": 22.66, "step": 22810, "token_acc": 1.0, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.7410258909138161, "grad_norm": 0.31253373622894287, "learning_rate": 1.724331065466361e-06, "loss": 0.015375496819615364, "memory(GiB)": 22.66, "step": 22811, "token_acc": 0.988, "train_speed(iter/s)": 0.956942 }, { "epoch": 0.7410583763765715, "grad_norm": 0.17345374822616577, "learning_rate": 1.7239252584059296e-06, "loss": 0.005781693384051323, "memory(GiB)": 22.66, "step": 22812, "token_acc": 1.0, "train_speed(iter/s)": 0.956949 }, { "epoch": 0.7410908618393269, "grad_norm": 0.3373293876647949, "learning_rate": 1.7235194891557628e-06, "loss": 0.011077407747507095, "memory(GiB)": 22.66, "step": 22813, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956956 }, { "epoch": 0.7411233473020823, "grad_norm": 0.4709508717060089, "learning_rate": 1.7231137577205397e-06, "loss": 0.012936591170728207, "memory(GiB)": 22.66, "step": 22814, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956963 }, { "epoch": 0.7411558327648378, "grad_norm": 0.29625046253204346, "learning_rate": 1.722708064104946e-06, "loss": 0.015980085358023643, "memory(GiB)": 22.66, "step": 22815, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.7411883182275931, "grad_norm": 0.32521313428878784, "learning_rate": 1.722302408313663e-06, "loss": 0.013418380171060562, "memory(GiB)": 22.66, "step": 22816, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.7412208036903486, "grad_norm": 0.2832775413990021, "learning_rate": 1.721896790351375e-06, "loss": 0.014222737401723862, "memory(GiB)": 22.66, "step": 22817, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.741253289153104, "grad_norm": 0.38696253299713135, "learning_rate": 1.7214912102227593e-06, "loss": 0.015176254324615002, "memory(GiB)": 22.66, "step": 22818, "token_acc": 1.0, "train_speed(iter/s)": 0.95699 }, { "epoch": 0.7412857746158594, "grad_norm": 0.44361862540245056, "learning_rate": 1.7210856679325e-06, "loss": 0.018864668905735016, "memory(GiB)": 22.66, "step": 22819, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956996 }, { "epoch": 0.7413182600786148, "grad_norm": 0.40195250511169434, "learning_rate": 1.7206801634852733e-06, "loss": 0.019414667040109634, "memory(GiB)": 22.66, "step": 22820, "token_acc": 0.995, "train_speed(iter/s)": 0.957002 }, { "epoch": 0.7413507455413703, "grad_norm": 0.3275868594646454, "learning_rate": 1.7202746968857654e-06, "loss": 0.014310968108475208, "memory(GiB)": 22.66, "step": 22821, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.95701 }, { "epoch": 0.7413832310041256, "grad_norm": 0.43924692273139954, "learning_rate": 1.7198692681386514e-06, "loss": 0.010616296902298927, "memory(GiB)": 22.66, "step": 22822, "token_acc": 1.0, "train_speed(iter/s)": 0.957017 }, { "epoch": 0.7414157164668811, "grad_norm": 0.34939199686050415, "learning_rate": 1.7194638772486132e-06, "loss": 0.017066646367311478, "memory(GiB)": 22.66, "step": 22823, "token_acc": 1.0, "train_speed(iter/s)": 0.957023 }, { "epoch": 0.7414482019296365, "grad_norm": 0.4870835542678833, "learning_rate": 1.7190585242203273e-06, "loss": 0.014299226924777031, "memory(GiB)": 22.66, "step": 22824, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.7414806873923919, "grad_norm": 0.26901185512542725, "learning_rate": 1.7186532090584723e-06, "loss": 0.011007515713572502, "memory(GiB)": 22.66, "step": 22825, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.7415131728551473, "grad_norm": 0.3526599109172821, "learning_rate": 1.718247931767727e-06, "loss": 0.014868287369608879, "memory(GiB)": 22.66, "step": 22826, "token_acc": 0.9875, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.7415456583179028, "grad_norm": 0.5143789649009705, "learning_rate": 1.7178426923527703e-06, "loss": 0.017275353893637657, "memory(GiB)": 22.66, "step": 22827, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957048 }, { "epoch": 0.7415781437806581, "grad_norm": 0.3837250769138336, "learning_rate": 1.7174374908182757e-06, "loss": 0.015623838640749454, "memory(GiB)": 22.66, "step": 22828, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.7416106292434136, "grad_norm": 0.3249630928039551, "learning_rate": 1.717032327168921e-06, "loss": 0.01531227771192789, "memory(GiB)": 22.66, "step": 22829, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.741643114706169, "grad_norm": 0.28780269622802734, "learning_rate": 1.7166272014093838e-06, "loss": 0.009936490096151829, "memory(GiB)": 22.66, "step": 22830, "token_acc": 1.0, "train_speed(iter/s)": 0.957069 }, { "epoch": 0.7416756001689244, "grad_norm": 0.5175557732582092, "learning_rate": 1.716222113544338e-06, "loss": 0.013599976897239685, "memory(GiB)": 22.66, "step": 22831, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957078 }, { "epoch": 0.7417080856316798, "grad_norm": 0.3734109401702881, "learning_rate": 1.7158170635784622e-06, "loss": 0.013943672180175781, "memory(GiB)": 22.66, "step": 22832, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.7417405710944353, "grad_norm": 0.23800241947174072, "learning_rate": 1.7154120515164264e-06, "loss": 0.010186266154050827, "memory(GiB)": 22.66, "step": 22833, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.7417730565571906, "grad_norm": 0.28903278708457947, "learning_rate": 1.7150070773629075e-06, "loss": 0.01337926834821701, "memory(GiB)": 22.66, "step": 22834, "token_acc": 1.0, "train_speed(iter/s)": 0.957105 }, { "epoch": 0.7418055420199461, "grad_norm": 0.3320038318634033, "learning_rate": 1.7146021411225793e-06, "loss": 0.01589900813996792, "memory(GiB)": 22.66, "step": 22835, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957113 }, { "epoch": 0.7418380274827014, "grad_norm": 0.22875402867794037, "learning_rate": 1.7141972428001168e-06, "loss": 0.010280868038535118, "memory(GiB)": 22.66, "step": 22836, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.7418705129454569, "grad_norm": 0.5575075745582581, "learning_rate": 1.7137923824001901e-06, "loss": 0.013970245607197285, "memory(GiB)": 22.66, "step": 22837, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.957131 }, { "epoch": 0.7419029984082123, "grad_norm": 0.29579195380210876, "learning_rate": 1.713387559927473e-06, "loss": 0.015214720740914345, "memory(GiB)": 22.66, "step": 22838, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.7419354838709677, "grad_norm": 0.31532490253448486, "learning_rate": 1.712982775386638e-06, "loss": 0.012941824272274971, "memory(GiB)": 22.66, "step": 22839, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.7419679693337231, "grad_norm": 0.31376612186431885, "learning_rate": 1.7125780287823584e-06, "loss": 0.01285582035779953, "memory(GiB)": 22.66, "step": 22840, "token_acc": 1.0, "train_speed(iter/s)": 0.957158 }, { "epoch": 0.7420004547964786, "grad_norm": 0.28163400292396545, "learning_rate": 1.7121733201193026e-06, "loss": 0.0084261205047369, "memory(GiB)": 22.66, "step": 22841, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.7420329402592339, "grad_norm": 0.29978787899017334, "learning_rate": 1.711768649402144e-06, "loss": 0.01190979965031147, "memory(GiB)": 22.66, "step": 22842, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.7420654257219894, "grad_norm": 0.3828684687614441, "learning_rate": 1.7113640166355489e-06, "loss": 0.0182945653796196, "memory(GiB)": 22.66, "step": 22843, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957185 }, { "epoch": 0.7420979111847448, "grad_norm": 0.527431845664978, "learning_rate": 1.7109594218241915e-06, "loss": 0.012674483470618725, "memory(GiB)": 22.66, "step": 22844, "token_acc": 0.995, "train_speed(iter/s)": 0.957194 }, { "epoch": 0.7421303966475002, "grad_norm": 0.4667283892631531, "learning_rate": 1.7105548649727417e-06, "loss": 0.010221434757113457, "memory(GiB)": 22.66, "step": 22845, "token_acc": 1.0, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.7421628821102556, "grad_norm": 0.5509449243545532, "learning_rate": 1.7101503460858655e-06, "loss": 0.015257236547768116, "memory(GiB)": 22.66, "step": 22846, "token_acc": 1.0, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.7421953675730111, "grad_norm": 0.25763243436813354, "learning_rate": 1.7097458651682347e-06, "loss": 0.011976639740169048, "memory(GiB)": 22.66, "step": 22847, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.7422278530357665, "grad_norm": 0.4567384719848633, "learning_rate": 1.7093414222245124e-06, "loss": 0.011546056717634201, "memory(GiB)": 22.66, "step": 22848, "token_acc": 1.0, "train_speed(iter/s)": 0.957227 }, { "epoch": 0.7422603384985219, "grad_norm": 0.31682589650154114, "learning_rate": 1.708937017259374e-06, "loss": 0.014467556029558182, "memory(GiB)": 22.66, "step": 22849, "token_acc": 1.0, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.7422928239612774, "grad_norm": 0.29735925793647766, "learning_rate": 1.7085326502774803e-06, "loss": 0.009931904263794422, "memory(GiB)": 22.66, "step": 22850, "token_acc": 1.0, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.7423253094240327, "grad_norm": 0.39057478308677673, "learning_rate": 1.7081283212835025e-06, "loss": 0.024016020819544792, "memory(GiB)": 22.66, "step": 22851, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957248 }, { "epoch": 0.7423577948867882, "grad_norm": 0.4462376534938812, "learning_rate": 1.707724030282104e-06, "loss": 0.0194550734013319, "memory(GiB)": 22.66, "step": 22852, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957254 }, { "epoch": 0.7423902803495436, "grad_norm": 0.39367517828941345, "learning_rate": 1.707319777277952e-06, "loss": 0.017495373263955116, "memory(GiB)": 22.66, "step": 22853, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.742422765812299, "grad_norm": 0.3964258134365082, "learning_rate": 1.7069155622757129e-06, "loss": 0.010574588552117348, "memory(GiB)": 22.66, "step": 22854, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957268 }, { "epoch": 0.7424552512750544, "grad_norm": 0.2695033848285675, "learning_rate": 1.7065113852800524e-06, "loss": 0.008542343974113464, "memory(GiB)": 22.66, "step": 22855, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957274 }, { "epoch": 0.7424877367378099, "grad_norm": 0.42438098788261414, "learning_rate": 1.7061072462956324e-06, "loss": 0.012261476367712021, "memory(GiB)": 22.66, "step": 22856, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.95728 }, { "epoch": 0.7425202222005652, "grad_norm": 0.4237326383590698, "learning_rate": 1.7057031453271194e-06, "loss": 0.015979064628481865, "memory(GiB)": 22.66, "step": 22857, "token_acc": 1.0, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.7425527076633207, "grad_norm": 0.40134936571121216, "learning_rate": 1.7052990823791765e-06, "loss": 0.014761020429432392, "memory(GiB)": 22.66, "step": 22858, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957294 }, { "epoch": 0.7425851931260761, "grad_norm": 0.3811657130718231, "learning_rate": 1.7048950574564677e-06, "loss": 0.01657402329146862, "memory(GiB)": 22.66, "step": 22859, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.7426176785888315, "grad_norm": 0.2308446615934372, "learning_rate": 1.704491070563657e-06, "loss": 0.009344980120658875, "memory(GiB)": 22.66, "step": 22860, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.7426501640515869, "grad_norm": 0.35687175393104553, "learning_rate": 1.7040871217054038e-06, "loss": 0.012974606826901436, "memory(GiB)": 22.66, "step": 22861, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.7426826495143424, "grad_norm": 0.5060767531394958, "learning_rate": 1.7036832108863727e-06, "loss": 0.023748699575662613, "memory(GiB)": 22.66, "step": 22862, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.7427151349770977, "grad_norm": 0.2653427720069885, "learning_rate": 1.7032793381112244e-06, "loss": 0.01113892812281847, "memory(GiB)": 22.66, "step": 22863, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.7427476204398532, "grad_norm": 0.25889652967453003, "learning_rate": 1.7028755033846222e-06, "loss": 0.008407194167375565, "memory(GiB)": 22.66, "step": 22864, "token_acc": 1.0, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.7427801059026086, "grad_norm": 0.4148740768432617, "learning_rate": 1.7024717067112234e-06, "loss": 0.012442120350897312, "memory(GiB)": 22.66, "step": 22865, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.742812591365364, "grad_norm": 0.33870866894721985, "learning_rate": 1.7020679480956903e-06, "loss": 0.017410773783922195, "memory(GiB)": 22.66, "step": 22866, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.7428450768281194, "grad_norm": 0.2316918671131134, "learning_rate": 1.701664227542682e-06, "loss": 0.009452358819544315, "memory(GiB)": 22.66, "step": 22867, "token_acc": 1.0, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.7428775622908749, "grad_norm": 0.4450097382068634, "learning_rate": 1.701260545056861e-06, "loss": 0.011106612160801888, "memory(GiB)": 22.66, "step": 22868, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957362 }, { "epoch": 0.7429100477536302, "grad_norm": 0.5240861177444458, "learning_rate": 1.7008569006428822e-06, "loss": 0.02279995009303093, "memory(GiB)": 22.66, "step": 22869, "token_acc": 1.0, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.7429425332163857, "grad_norm": 0.37046751379966736, "learning_rate": 1.7004532943054069e-06, "loss": 0.013594105839729309, "memory(GiB)": 22.66, "step": 22870, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.7429750186791411, "grad_norm": 0.35230952501296997, "learning_rate": 1.7000497260490894e-06, "loss": 0.01151222176849842, "memory(GiB)": 22.66, "step": 22871, "token_acc": 1.0, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.7430075041418965, "grad_norm": 0.387494832277298, "learning_rate": 1.6996461958785938e-06, "loss": 0.013359827920794487, "memory(GiB)": 22.66, "step": 22872, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957385 }, { "epoch": 0.7430399896046519, "grad_norm": 0.5025038719177246, "learning_rate": 1.699242703798572e-06, "loss": 0.012873008847236633, "memory(GiB)": 22.66, "step": 22873, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.7430724750674074, "grad_norm": 0.29305872321128845, "learning_rate": 1.698839249813683e-06, "loss": 0.012828033417463303, "memory(GiB)": 22.66, "step": 22874, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.7431049605301627, "grad_norm": 0.2941485047340393, "learning_rate": 1.698435833928584e-06, "loss": 0.016504734754562378, "memory(GiB)": 22.66, "step": 22875, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957401 }, { "epoch": 0.7431374459929182, "grad_norm": 0.4143083393573761, "learning_rate": 1.6980324561479268e-06, "loss": 0.015436920337378979, "memory(GiB)": 22.66, "step": 22876, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.957407 }, { "epoch": 0.7431699314556736, "grad_norm": 0.31145042181015015, "learning_rate": 1.697629116476373e-06, "loss": 0.007488410919904709, "memory(GiB)": 22.66, "step": 22877, "token_acc": 1.0, "train_speed(iter/s)": 0.957414 }, { "epoch": 0.743202416918429, "grad_norm": 0.3459987938404083, "learning_rate": 1.6972258149185732e-06, "loss": 0.009243706241250038, "memory(GiB)": 22.66, "step": 22878, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.7432349023811844, "grad_norm": 0.3784579038619995, "learning_rate": 1.6968225514791853e-06, "loss": 0.0144883394241333, "memory(GiB)": 22.66, "step": 22879, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.7432673878439399, "grad_norm": 0.2585245370864868, "learning_rate": 1.6964193261628603e-06, "loss": 0.00959341786801815, "memory(GiB)": 22.66, "step": 22880, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95743 }, { "epoch": 0.7432998733066952, "grad_norm": 0.3382071852684021, "learning_rate": 1.6960161389742529e-06, "loss": 0.011583395302295685, "memory(GiB)": 22.66, "step": 22881, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.7433323587694507, "grad_norm": 0.3388972580432892, "learning_rate": 1.6956129899180168e-06, "loss": 0.01284021232277155, "memory(GiB)": 22.66, "step": 22882, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957441 }, { "epoch": 0.7433648442322061, "grad_norm": 0.541229248046875, "learning_rate": 1.6952098789988064e-06, "loss": 0.015507373958826065, "memory(GiB)": 22.66, "step": 22883, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.7433973296949615, "grad_norm": 0.31202152371406555, "learning_rate": 1.6948068062212715e-06, "loss": 0.009852714836597443, "memory(GiB)": 22.66, "step": 22884, "token_acc": 1.0, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.7434298151577169, "grad_norm": 0.35294562578201294, "learning_rate": 1.6944037715900652e-06, "loss": 0.012833120301365852, "memory(GiB)": 22.66, "step": 22885, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.7434623006204724, "grad_norm": 0.5119821429252625, "learning_rate": 1.6940007751098385e-06, "loss": 0.01580752618610859, "memory(GiB)": 22.66, "step": 22886, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.7434947860832277, "grad_norm": 0.41060128808021545, "learning_rate": 1.6935978167852458e-06, "loss": 0.014911236241459846, "memory(GiB)": 22.66, "step": 22887, "token_acc": 1.0, "train_speed(iter/s)": 0.957469 }, { "epoch": 0.7435272715459832, "grad_norm": 0.5063726305961609, "learning_rate": 1.6931948966209328e-06, "loss": 0.01109281461685896, "memory(GiB)": 22.66, "step": 22888, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957476 }, { "epoch": 0.7435597570087386, "grad_norm": 0.35837316513061523, "learning_rate": 1.692792014621552e-06, "loss": 0.012474315240979195, "memory(GiB)": 22.66, "step": 22889, "token_acc": 1.0, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.743592242471494, "grad_norm": 0.35171210765838623, "learning_rate": 1.6923891707917534e-06, "loss": 0.014727119356393814, "memory(GiB)": 22.66, "step": 22890, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.95749 }, { "epoch": 0.7436247279342494, "grad_norm": 0.3302074074745178, "learning_rate": 1.6919863651361862e-06, "loss": 0.009071104228496552, "memory(GiB)": 22.66, "step": 22891, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957497 }, { "epoch": 0.7436572133970049, "grad_norm": 0.4108847379684448, "learning_rate": 1.6915835976595008e-06, "loss": 0.014002575539052486, "memory(GiB)": 22.66, "step": 22892, "token_acc": 1.0, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.7436896988597602, "grad_norm": 0.26144179701805115, "learning_rate": 1.6911808683663429e-06, "loss": 0.011053334921598434, "memory(GiB)": 22.66, "step": 22893, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.7437221843225157, "grad_norm": 0.3783019781112671, "learning_rate": 1.6907781772613612e-06, "loss": 0.014032847248017788, "memory(GiB)": 22.66, "step": 22894, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.743754669785271, "grad_norm": 0.31056317687034607, "learning_rate": 1.6903755243492043e-06, "loss": 0.016861479729413986, "memory(GiB)": 22.66, "step": 22895, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957531 }, { "epoch": 0.7437871552480265, "grad_norm": 0.37973055243492126, "learning_rate": 1.6899729096345209e-06, "loss": 0.014303037896752357, "memory(GiB)": 22.66, "step": 22896, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.95754 }, { "epoch": 0.7438196407107819, "grad_norm": 0.40581583976745605, "learning_rate": 1.6895703331219532e-06, "loss": 0.01503018755465746, "memory(GiB)": 22.66, "step": 22897, "token_acc": 1.0, "train_speed(iter/s)": 0.957549 }, { "epoch": 0.7438521261735374, "grad_norm": 0.5192025303840637, "learning_rate": 1.6891677948161522e-06, "loss": 0.022819023579359055, "memory(GiB)": 22.66, "step": 22898, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.7438846116362927, "grad_norm": 0.31944695115089417, "learning_rate": 1.6887652947217586e-06, "loss": 0.014566013589501381, "memory(GiB)": 22.66, "step": 22899, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.7439170970990482, "grad_norm": 0.3321515917778015, "learning_rate": 1.6883628328434238e-06, "loss": 0.018499158322811127, "memory(GiB)": 22.66, "step": 22900, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.7439495825618035, "grad_norm": 0.42366549372673035, "learning_rate": 1.6879604091857876e-06, "loss": 0.024338381364941597, "memory(GiB)": 22.66, "step": 22901, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.743982068024559, "grad_norm": 0.4071483910083771, "learning_rate": 1.687558023753499e-06, "loss": 0.009801136329770088, "memory(GiB)": 22.66, "step": 22902, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957593 }, { "epoch": 0.7440145534873144, "grad_norm": 0.2906196713447571, "learning_rate": 1.6871556765511977e-06, "loss": 0.011184709146618843, "memory(GiB)": 22.66, "step": 22903, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.957602 }, { "epoch": 0.7440470389500699, "grad_norm": 0.39163997769355774, "learning_rate": 1.686753367583529e-06, "loss": 0.019273582845926285, "memory(GiB)": 22.66, "step": 22904, "token_acc": 1.0, "train_speed(iter/s)": 0.957611 }, { "epoch": 0.7440795244128252, "grad_norm": 0.2713695764541626, "learning_rate": 1.6863510968551366e-06, "loss": 0.010724494233727455, "memory(GiB)": 22.66, "step": 22905, "token_acc": 1.0, "train_speed(iter/s)": 0.957619 }, { "epoch": 0.7441120098755807, "grad_norm": 0.409247487783432, "learning_rate": 1.685948864370663e-06, "loss": 0.013829000294208527, "memory(GiB)": 22.66, "step": 22906, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957629 }, { "epoch": 0.744144495338336, "grad_norm": 0.42701616883277893, "learning_rate": 1.6855466701347516e-06, "loss": 0.012155980803072453, "memory(GiB)": 22.66, "step": 22907, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957638 }, { "epoch": 0.7441769808010915, "grad_norm": 0.38501375913619995, "learning_rate": 1.6851445141520423e-06, "loss": 0.017659112811088562, "memory(GiB)": 22.66, "step": 22908, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957647 }, { "epoch": 0.7442094662638469, "grad_norm": 0.27845388650894165, "learning_rate": 1.684742396427177e-06, "loss": 0.012593381106853485, "memory(GiB)": 22.66, "step": 22909, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957656 }, { "epoch": 0.7442419517266023, "grad_norm": 0.3044084906578064, "learning_rate": 1.684340316964797e-06, "loss": 0.01531875878572464, "memory(GiB)": 22.66, "step": 22910, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.7442744371893577, "grad_norm": 0.3323107361793518, "learning_rate": 1.683938275769545e-06, "loss": 0.01300548855215311, "memory(GiB)": 22.66, "step": 22911, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.7443069226521132, "grad_norm": 0.2453305870294571, "learning_rate": 1.6835362728460569e-06, "loss": 0.00859898142516613, "memory(GiB)": 22.66, "step": 22912, "token_acc": 1.0, "train_speed(iter/s)": 0.957683 }, { "epoch": 0.7443394081148687, "grad_norm": 0.31197619438171387, "learning_rate": 1.6831343081989744e-06, "loss": 0.01070580817759037, "memory(GiB)": 22.66, "step": 22913, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957691 }, { "epoch": 0.744371893577624, "grad_norm": 0.2694796323776245, "learning_rate": 1.6827323818329367e-06, "loss": 0.01512275729328394, "memory(GiB)": 22.66, "step": 22914, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.7444043790403795, "grad_norm": 0.9335800409317017, "learning_rate": 1.6823304937525843e-06, "loss": 0.00863335095345974, "memory(GiB)": 22.66, "step": 22915, "token_acc": 1.0, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.7444368645031348, "grad_norm": 0.6045476198196411, "learning_rate": 1.6819286439625521e-06, "loss": 0.015180254355072975, "memory(GiB)": 22.66, "step": 22916, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.7444693499658903, "grad_norm": 0.47342967987060547, "learning_rate": 1.6815268324674799e-06, "loss": 0.013918951153755188, "memory(GiB)": 22.66, "step": 22917, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.7445018354286457, "grad_norm": 0.45067474246025085, "learning_rate": 1.6811250592720046e-06, "loss": 0.016087597236037254, "memory(GiB)": 22.66, "step": 22918, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957736 }, { "epoch": 0.7445343208914011, "grad_norm": 0.32437723875045776, "learning_rate": 1.6807233243807652e-06, "loss": 0.008225366473197937, "memory(GiB)": 22.66, "step": 22919, "token_acc": 1.0, "train_speed(iter/s)": 0.957745 }, { "epoch": 0.7445668063541565, "grad_norm": 0.35224777460098267, "learning_rate": 1.6803216277983953e-06, "loss": 0.011749054305255413, "memory(GiB)": 22.66, "step": 22920, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957754 }, { "epoch": 0.744599291816912, "grad_norm": 0.4112039804458618, "learning_rate": 1.679919969529532e-06, "loss": 0.018472041934728622, "memory(GiB)": 22.66, "step": 22921, "token_acc": 1.0, "train_speed(iter/s)": 0.957763 }, { "epoch": 0.7446317772796673, "grad_norm": 0.354560524225235, "learning_rate": 1.6795183495788114e-06, "loss": 0.014021338894963264, "memory(GiB)": 22.66, "step": 22922, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957771 }, { "epoch": 0.7446642627424228, "grad_norm": 0.2704821527004242, "learning_rate": 1.6791167679508691e-06, "loss": 0.01036009006202221, "memory(GiB)": 22.66, "step": 22923, "token_acc": 1.0, "train_speed(iter/s)": 0.95778 }, { "epoch": 0.7446967482051782, "grad_norm": 0.2809317708015442, "learning_rate": 1.6787152246503401e-06, "loss": 0.01195618323981762, "memory(GiB)": 22.66, "step": 22924, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.7447292336679336, "grad_norm": 0.33548852801322937, "learning_rate": 1.6783137196818573e-06, "loss": 0.014302403666079044, "memory(GiB)": 22.66, "step": 22925, "token_acc": 1.0, "train_speed(iter/s)": 0.957798 }, { "epoch": 0.744761719130689, "grad_norm": 0.2776772379875183, "learning_rate": 1.6779122530500564e-06, "loss": 0.009393176063895226, "memory(GiB)": 22.66, "step": 22926, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957805 }, { "epoch": 0.7447942045934445, "grad_norm": 0.3302324712276459, "learning_rate": 1.6775108247595661e-06, "loss": 0.017536837607622147, "memory(GiB)": 22.66, "step": 22927, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.7448266900561998, "grad_norm": 0.21729570627212524, "learning_rate": 1.6771094348150269e-06, "loss": 0.009609246626496315, "memory(GiB)": 22.66, "step": 22928, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.7448591755189553, "grad_norm": 0.3000301420688629, "learning_rate": 1.6767080832210658e-06, "loss": 0.011839423328638077, "memory(GiB)": 22.66, "step": 22929, "token_acc": 1.0, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.7448916609817107, "grad_norm": 0.44876569509506226, "learning_rate": 1.6763067699823188e-06, "loss": 0.014189999550580978, "memory(GiB)": 22.66, "step": 22930, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.7449241464444661, "grad_norm": 0.34492161870002747, "learning_rate": 1.6759054951034132e-06, "loss": 0.01232725940644741, "memory(GiB)": 22.66, "step": 22931, "token_acc": 1.0, "train_speed(iter/s)": 0.957839 }, { "epoch": 0.7449566319072215, "grad_norm": 0.30087214708328247, "learning_rate": 1.675504258588983e-06, "loss": 0.014806010760366917, "memory(GiB)": 22.66, "step": 22932, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.744989117369977, "grad_norm": 0.39871713519096375, "learning_rate": 1.6751030604436586e-06, "loss": 0.0130440229550004, "memory(GiB)": 22.66, "step": 22933, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.7450216028327323, "grad_norm": 0.3985501825809479, "learning_rate": 1.6747019006720717e-06, "loss": 0.019432788714766502, "memory(GiB)": 22.66, "step": 22934, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.7450540882954878, "grad_norm": 0.29634350538253784, "learning_rate": 1.6743007792788491e-06, "loss": 0.011234737932682037, "memory(GiB)": 22.66, "step": 22935, "token_acc": 1.0, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.7450865737582432, "grad_norm": 0.2869662344455719, "learning_rate": 1.6738996962686222e-06, "loss": 0.012139596045017242, "memory(GiB)": 22.66, "step": 22936, "token_acc": 1.0, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.7451190592209986, "grad_norm": 0.33467042446136475, "learning_rate": 1.67349865164602e-06, "loss": 0.01352810300886631, "memory(GiB)": 22.66, "step": 22937, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.745151544683754, "grad_norm": 0.32694560289382935, "learning_rate": 1.6730976454156706e-06, "loss": 0.014728846028447151, "memory(GiB)": 22.66, "step": 22938, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.7451840301465095, "grad_norm": 0.4456573724746704, "learning_rate": 1.672696677582204e-06, "loss": 0.015377036295831203, "memory(GiB)": 22.66, "step": 22939, "token_acc": 1.0, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.7452165156092648, "grad_norm": 0.42694467306137085, "learning_rate": 1.672295748150245e-06, "loss": 0.023773103952407837, "memory(GiB)": 22.66, "step": 22940, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.7452490010720203, "grad_norm": 0.39078155159950256, "learning_rate": 1.671894857124422e-06, "loss": 0.014223767444491386, "memory(GiB)": 22.66, "step": 22941, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957899 }, { "epoch": 0.7452814865347757, "grad_norm": 0.37205520272254944, "learning_rate": 1.6714940045093625e-06, "loss": 0.01606692001223564, "memory(GiB)": 22.66, "step": 22942, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.7453139719975311, "grad_norm": 0.322815477848053, "learning_rate": 1.671093190309694e-06, "loss": 0.01847199536859989, "memory(GiB)": 22.66, "step": 22943, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.7453464574602865, "grad_norm": 0.2950230836868286, "learning_rate": 1.6706924145300384e-06, "loss": 0.012814268469810486, "memory(GiB)": 22.66, "step": 22944, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.745378942923042, "grad_norm": 0.3392554521560669, "learning_rate": 1.6702916771750244e-06, "loss": 0.01040542684495449, "memory(GiB)": 22.66, "step": 22945, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.7454114283857973, "grad_norm": 0.23049570620059967, "learning_rate": 1.669890978249276e-06, "loss": 0.012722628191113472, "memory(GiB)": 22.66, "step": 22946, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.7454439138485528, "grad_norm": 0.3833383023738861, "learning_rate": 1.66949031775742e-06, "loss": 0.009307578206062317, "memory(GiB)": 22.66, "step": 22947, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.7454763993113082, "grad_norm": 0.34919142723083496, "learning_rate": 1.669089695704077e-06, "loss": 0.018611256033182144, "memory(GiB)": 22.66, "step": 22948, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957943 }, { "epoch": 0.7455088847740636, "grad_norm": 0.4650726616382599, "learning_rate": 1.6686891120938736e-06, "loss": 0.022555036470294, "memory(GiB)": 22.66, "step": 22949, "token_acc": 1.0, "train_speed(iter/s)": 0.95795 }, { "epoch": 0.745541370236819, "grad_norm": 0.2661016285419464, "learning_rate": 1.6682885669314286e-06, "loss": 0.01012385729700327, "memory(GiB)": 22.66, "step": 22950, "token_acc": 1.0, "train_speed(iter/s)": 0.957957 }, { "epoch": 0.7455738556995745, "grad_norm": 0.2701253592967987, "learning_rate": 1.6678880602213698e-06, "loss": 0.009546859189867973, "memory(GiB)": 22.66, "step": 22951, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.7456063411623298, "grad_norm": 0.2783190608024597, "learning_rate": 1.6674875919683197e-06, "loss": 0.01011599600315094, "memory(GiB)": 22.66, "step": 22952, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.7456388266250853, "grad_norm": 0.33880990743637085, "learning_rate": 1.6670871621768964e-06, "loss": 0.013789532706141472, "memory(GiB)": 22.66, "step": 22953, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.7456713120878407, "grad_norm": 0.6785428524017334, "learning_rate": 1.6666867708517248e-06, "loss": 0.017120204865932465, "memory(GiB)": 22.66, "step": 22954, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.7457037975505961, "grad_norm": 0.32076606154441833, "learning_rate": 1.666286417997422e-06, "loss": 0.013608605600893497, "memory(GiB)": 22.66, "step": 22955, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95799 }, { "epoch": 0.7457362830133515, "grad_norm": 0.22555984556674957, "learning_rate": 1.6658861036186137e-06, "loss": 0.008207942359149456, "memory(GiB)": 22.66, "step": 22956, "token_acc": 1.0, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.745768768476107, "grad_norm": 0.2725374102592468, "learning_rate": 1.6654858277199159e-06, "loss": 0.011351970955729485, "memory(GiB)": 22.66, "step": 22957, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.958006 }, { "epoch": 0.7458012539388623, "grad_norm": 0.29171961545944214, "learning_rate": 1.6650855903059515e-06, "loss": 0.009149749763309956, "memory(GiB)": 22.66, "step": 22958, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.7458337394016178, "grad_norm": 0.42398199439048767, "learning_rate": 1.6646853913813365e-06, "loss": 0.013690939173102379, "memory(GiB)": 22.66, "step": 22959, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.7458662248643732, "grad_norm": 0.4951210618019104, "learning_rate": 1.6642852309506919e-06, "loss": 0.012906273826956749, "memory(GiB)": 22.66, "step": 22960, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.7458987103271286, "grad_norm": 0.3976770043373108, "learning_rate": 1.6638851090186347e-06, "loss": 0.013164933770895004, "memory(GiB)": 22.66, "step": 22961, "token_acc": 1.0, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.745931195789884, "grad_norm": 0.3276057243347168, "learning_rate": 1.6634850255897861e-06, "loss": 0.012677494436502457, "memory(GiB)": 22.66, "step": 22962, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95805 }, { "epoch": 0.7459636812526395, "grad_norm": 0.45461001992225647, "learning_rate": 1.6630849806687587e-06, "loss": 0.01772363856434822, "memory(GiB)": 22.66, "step": 22963, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958059 }, { "epoch": 0.7459961667153948, "grad_norm": 0.3191376328468323, "learning_rate": 1.662684974260173e-06, "loss": 0.015054069459438324, "memory(GiB)": 22.66, "step": 22964, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.7460286521781503, "grad_norm": 0.3542211353778839, "learning_rate": 1.6622850063686442e-06, "loss": 0.014900192618370056, "memory(GiB)": 22.66, "step": 22965, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.958076 }, { "epoch": 0.7460611376409056, "grad_norm": 0.36474016308784485, "learning_rate": 1.6618850769987903e-06, "loss": 0.01757560670375824, "memory(GiB)": 22.66, "step": 22966, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.958084 }, { "epoch": 0.7460936231036611, "grad_norm": 0.2204463928937912, "learning_rate": 1.6614851861552244e-06, "loss": 0.0076418365351855755, "memory(GiB)": 22.66, "step": 22967, "token_acc": 1.0, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.7461261085664165, "grad_norm": 0.28987202048301697, "learning_rate": 1.661085333842563e-06, "loss": 0.00988706387579441, "memory(GiB)": 22.66, "step": 22968, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.746158594029172, "grad_norm": 0.346340537071228, "learning_rate": 1.660685520065421e-06, "loss": 0.013120539486408234, "memory(GiB)": 22.66, "step": 22969, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958111 }, { "epoch": 0.7461910794919273, "grad_norm": 0.2507654130458832, "learning_rate": 1.6602857448284122e-06, "loss": 0.008292488753795624, "memory(GiB)": 22.66, "step": 22970, "token_acc": 1.0, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.7462235649546828, "grad_norm": 0.3931240439414978, "learning_rate": 1.6598860081361528e-06, "loss": 0.015517841093242168, "memory(GiB)": 22.66, "step": 22971, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958129 }, { "epoch": 0.7462560504174381, "grad_norm": 0.3922064006328583, "learning_rate": 1.6594863099932534e-06, "loss": 0.015473570674657822, "memory(GiB)": 22.66, "step": 22972, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.7462885358801936, "grad_norm": 0.3337693512439728, "learning_rate": 1.659086650404328e-06, "loss": 0.01304132305085659, "memory(GiB)": 22.66, "step": 22973, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.958146 }, { "epoch": 0.746321021342949, "grad_norm": 0.3069998025894165, "learning_rate": 1.6586870293739894e-06, "loss": 0.015225052833557129, "memory(GiB)": 22.66, "step": 22974, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.958155 }, { "epoch": 0.7463535068057044, "grad_norm": 0.24485686421394348, "learning_rate": 1.6582874469068511e-06, "loss": 0.014383815228939056, "memory(GiB)": 22.66, "step": 22975, "token_acc": 1.0, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.7463859922684599, "grad_norm": 0.3643893003463745, "learning_rate": 1.6578879030075224e-06, "loss": 0.010641837492585182, "memory(GiB)": 22.66, "step": 22976, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.7464184777312153, "grad_norm": 0.41677987575531006, "learning_rate": 1.6574883976806166e-06, "loss": 0.011958053335547447, "memory(GiB)": 22.66, "step": 22977, "token_acc": 1.0, "train_speed(iter/s)": 0.958182 }, { "epoch": 0.7464509631939708, "grad_norm": 0.6181662082672119, "learning_rate": 1.6570889309307408e-06, "loss": 0.013988613151013851, "memory(GiB)": 22.66, "step": 22978, "token_acc": 1.0, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.7464834486567261, "grad_norm": 0.30302000045776367, "learning_rate": 1.6566895027625113e-06, "loss": 0.011131816543638706, "memory(GiB)": 22.66, "step": 22979, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958199 }, { "epoch": 0.7465159341194816, "grad_norm": 0.3727877736091614, "learning_rate": 1.6562901131805332e-06, "loss": 0.01375328004360199, "memory(GiB)": 22.66, "step": 22980, "token_acc": 1.0, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.7465484195822369, "grad_norm": 0.23492684960365295, "learning_rate": 1.6558907621894193e-06, "loss": 0.00986824743449688, "memory(GiB)": 22.66, "step": 22981, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.7465809050449924, "grad_norm": 0.2510485351085663, "learning_rate": 1.6554914497937752e-06, "loss": 0.008060507476329803, "memory(GiB)": 22.66, "step": 22982, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.7466133905077478, "grad_norm": 0.2647853195667267, "learning_rate": 1.6550921759982096e-06, "loss": 0.010716571472585201, "memory(GiB)": 22.66, "step": 22983, "token_acc": 1.0, "train_speed(iter/s)": 0.958234 }, { "epoch": 0.7466458759705032, "grad_norm": 0.26747968792915344, "learning_rate": 1.654692940807336e-06, "loss": 0.011541520245373249, "memory(GiB)": 22.66, "step": 22984, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958243 }, { "epoch": 0.7466783614332586, "grad_norm": 0.22259129583835602, "learning_rate": 1.6542937442257567e-06, "loss": 0.008747141808271408, "memory(GiB)": 22.66, "step": 22985, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.958251 }, { "epoch": 0.7467108468960141, "grad_norm": 0.48094865679740906, "learning_rate": 1.6538945862580825e-06, "loss": 0.022785138338804245, "memory(GiB)": 22.66, "step": 22986, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.7467433323587694, "grad_norm": 0.26789215207099915, "learning_rate": 1.6534954669089165e-06, "loss": 0.009611627086997032, "memory(GiB)": 22.66, "step": 22987, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.958265 }, { "epoch": 0.7467758178215249, "grad_norm": 0.46207159757614136, "learning_rate": 1.653096386182867e-06, "loss": 0.013891584239900112, "memory(GiB)": 22.66, "step": 22988, "token_acc": 0.996, "train_speed(iter/s)": 0.958272 }, { "epoch": 0.7468083032842803, "grad_norm": 0.4760141968727112, "learning_rate": 1.65269734408454e-06, "loss": 0.015022575855255127, "memory(GiB)": 22.66, "step": 22989, "token_acc": 1.0, "train_speed(iter/s)": 0.958279 }, { "epoch": 0.7468407887470357, "grad_norm": 0.5243973731994629, "learning_rate": 1.6522983406185428e-06, "loss": 0.015654318034648895, "memory(GiB)": 22.66, "step": 22990, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.958285 }, { "epoch": 0.7468732742097911, "grad_norm": 0.3940514326095581, "learning_rate": 1.6518993757894764e-06, "loss": 0.019827809184789658, "memory(GiB)": 22.66, "step": 22991, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.958292 }, { "epoch": 0.7469057596725466, "grad_norm": 0.23177194595336914, "learning_rate": 1.6515004496019476e-06, "loss": 0.008557094261050224, "memory(GiB)": 22.66, "step": 22992, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958299 }, { "epoch": 0.7469382451353019, "grad_norm": 0.4330257773399353, "learning_rate": 1.6511015620605609e-06, "loss": 0.012923387810587883, "memory(GiB)": 22.66, "step": 22993, "token_acc": 1.0, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.7469707305980574, "grad_norm": 1.2995585203170776, "learning_rate": 1.650702713169921e-06, "loss": 0.01396816037595272, "memory(GiB)": 22.66, "step": 22994, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.7470032160608128, "grad_norm": 0.4537564218044281, "learning_rate": 1.6503039029346279e-06, "loss": 0.016972046345472336, "memory(GiB)": 22.66, "step": 22995, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.7470357015235682, "grad_norm": 0.19820450246334076, "learning_rate": 1.6499051313592861e-06, "loss": 0.00734248710796237, "memory(GiB)": 22.66, "step": 22996, "token_acc": 1.0, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.7470681869863236, "grad_norm": 0.336382120847702, "learning_rate": 1.6495063984484982e-06, "loss": 0.007632916793227196, "memory(GiB)": 22.66, "step": 22997, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.7471006724490791, "grad_norm": 0.31287315487861633, "learning_rate": 1.649107704206866e-06, "loss": 0.010380050167441368, "memory(GiB)": 22.66, "step": 22998, "token_acc": 1.0, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.7471331579118344, "grad_norm": 0.3097700774669647, "learning_rate": 1.6487090486389923e-06, "loss": 0.016648873686790466, "memory(GiB)": 22.66, "step": 22999, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.958341 }, { "epoch": 0.7471656433745899, "grad_norm": 0.31968262791633606, "learning_rate": 1.6483104317494757e-06, "loss": 0.012985105626285076, "memory(GiB)": 22.66, "step": 23000, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958347 }, { "epoch": 0.7471656433745899, "eval_loss": 0.01382831297814846, "eval_runtime": 79.816, "eval_samples_per_second": 124.662, "eval_steps_per_second": 3.896, "eval_token_acc": 0.994453568370238, "step": 23000 }, { "epoch": 0.7471981288373453, "grad_norm": 0.6806787252426147, "learning_rate": 1.6479118535429172e-06, "loss": 0.017827514559030533, "memory(GiB)": 22.66, "step": 23001, "token_acc": 0.993984076302194, "train_speed(iter/s)": 0.954724 }, { "epoch": 0.7472306143001007, "grad_norm": 0.3433862328529358, "learning_rate": 1.647513314023918e-06, "loss": 0.01050529070198536, "memory(GiB)": 22.66, "step": 23002, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.954732 }, { "epoch": 0.7472630997628561, "grad_norm": 0.381254106760025, "learning_rate": 1.6471148131970789e-06, "loss": 0.01481475867331028, "memory(GiB)": 22.66, "step": 23003, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.954741 }, { "epoch": 0.7472955852256116, "grad_norm": 0.3347730338573456, "learning_rate": 1.6467163510669954e-06, "loss": 0.010882810689508915, "memory(GiB)": 22.66, "step": 23004, "token_acc": 1.0, "train_speed(iter/s)": 0.954749 }, { "epoch": 0.7473280706883669, "grad_norm": 0.22618722915649414, "learning_rate": 1.6463179276382702e-06, "loss": 0.009100284427404404, "memory(GiB)": 22.66, "step": 23005, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.954758 }, { "epoch": 0.7473605561511224, "grad_norm": 0.5266817808151245, "learning_rate": 1.6459195429154967e-06, "loss": 0.0165412537753582, "memory(GiB)": 22.66, "step": 23006, "token_acc": 0.9765258215962441, "train_speed(iter/s)": 0.954767 }, { "epoch": 0.7473930416138778, "grad_norm": 0.34131261706352234, "learning_rate": 1.645521196903279e-06, "loss": 0.015173690393567085, "memory(GiB)": 22.66, "step": 23007, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954776 }, { "epoch": 0.7474255270766332, "grad_norm": 0.24920561909675598, "learning_rate": 1.6451228896062105e-06, "loss": 0.00692211976274848, "memory(GiB)": 22.66, "step": 23008, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.954784 }, { "epoch": 0.7474580125393886, "grad_norm": 0.40883710980415344, "learning_rate": 1.6447246210288903e-06, "loss": 0.019240040332078934, "memory(GiB)": 22.66, "step": 23009, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.954793 }, { "epoch": 0.7474904980021441, "grad_norm": 0.5055431127548218, "learning_rate": 1.6443263911759129e-06, "loss": 0.023382682353258133, "memory(GiB)": 22.66, "step": 23010, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.954802 }, { "epoch": 0.7475229834648994, "grad_norm": 0.30902907252311707, "learning_rate": 1.643928200051874e-06, "loss": 0.015461737290024757, "memory(GiB)": 22.66, "step": 23011, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.954811 }, { "epoch": 0.7475554689276549, "grad_norm": 0.38503730297088623, "learning_rate": 1.6435300476613719e-06, "loss": 0.013689277693629265, "memory(GiB)": 22.66, "step": 23012, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.95482 }, { "epoch": 0.7475879543904103, "grad_norm": 0.4561730921268463, "learning_rate": 1.6431319340089997e-06, "loss": 0.014540765434503555, "memory(GiB)": 22.66, "step": 23013, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.954828 }, { "epoch": 0.7476204398531657, "grad_norm": 0.2989432215690613, "learning_rate": 1.6427338590993547e-06, "loss": 0.010755921714007854, "memory(GiB)": 22.66, "step": 23014, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954835 }, { "epoch": 0.7476529253159211, "grad_norm": 0.4032968580722809, "learning_rate": 1.6423358229370284e-06, "loss": 0.016181154176592827, "memory(GiB)": 22.66, "step": 23015, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.954842 }, { "epoch": 0.7476854107786766, "grad_norm": 0.4698425531387329, "learning_rate": 1.641937825526615e-06, "loss": 0.015829268842935562, "memory(GiB)": 22.66, "step": 23016, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95485 }, { "epoch": 0.7477178962414319, "grad_norm": 0.42931070923805237, "learning_rate": 1.641539866872709e-06, "loss": 0.013423997908830643, "memory(GiB)": 22.66, "step": 23017, "token_acc": 1.0, "train_speed(iter/s)": 0.954856 }, { "epoch": 0.7477503817041874, "grad_norm": 0.3259110450744629, "learning_rate": 1.641141946979905e-06, "loss": 0.014916591346263885, "memory(GiB)": 22.66, "step": 23018, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.954863 }, { "epoch": 0.7477828671669428, "grad_norm": 0.20475098490715027, "learning_rate": 1.6407440658527918e-06, "loss": 0.007329214829951525, "memory(GiB)": 22.66, "step": 23019, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.95487 }, { "epoch": 0.7478153526296982, "grad_norm": 0.26028507947921753, "learning_rate": 1.6403462234959628e-06, "loss": 0.008515884168446064, "memory(GiB)": 22.66, "step": 23020, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.954877 }, { "epoch": 0.7478478380924536, "grad_norm": 0.2993941903114319, "learning_rate": 1.6399484199140103e-06, "loss": 0.010046396404504776, "memory(GiB)": 22.66, "step": 23021, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.954884 }, { "epoch": 0.7478803235552091, "grad_norm": 0.3391258120536804, "learning_rate": 1.639550655111527e-06, "loss": 0.012322915717959404, "memory(GiB)": 22.66, "step": 23022, "token_acc": 1.0, "train_speed(iter/s)": 0.95489 }, { "epoch": 0.7479128090179644, "grad_norm": 0.2715355157852173, "learning_rate": 1.6391529290931003e-06, "loss": 0.010423962958157063, "memory(GiB)": 22.66, "step": 23023, "token_acc": 0.99609375, "train_speed(iter/s)": 0.954897 }, { "epoch": 0.7479452944807199, "grad_norm": 0.3941609263420105, "learning_rate": 1.6387552418633219e-06, "loss": 0.01092975027859211, "memory(GiB)": 22.66, "step": 23024, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.954902 }, { "epoch": 0.7479777799434753, "grad_norm": 0.3143942356109619, "learning_rate": 1.6383575934267814e-06, "loss": 0.010266466066241264, "memory(GiB)": 22.66, "step": 23025, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.954909 }, { "epoch": 0.7480102654062307, "grad_norm": 0.2692505717277527, "learning_rate": 1.637959983788071e-06, "loss": 0.010266978293657303, "memory(GiB)": 22.66, "step": 23026, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.954917 }, { "epoch": 0.7480427508689861, "grad_norm": 0.2975219786167145, "learning_rate": 1.6375624129517754e-06, "loss": 0.010249313898384571, "memory(GiB)": 22.66, "step": 23027, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.954924 }, { "epoch": 0.7480752363317416, "grad_norm": 0.2134399712085724, "learning_rate": 1.6371648809224867e-06, "loss": 0.009377293288707733, "memory(GiB)": 22.66, "step": 23028, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95493 }, { "epoch": 0.7481077217944969, "grad_norm": 0.3426155149936676, "learning_rate": 1.6367673877047874e-06, "loss": 0.014216996729373932, "memory(GiB)": 22.66, "step": 23029, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.954937 }, { "epoch": 0.7481402072572524, "grad_norm": 0.3646411895751953, "learning_rate": 1.6363699333032706e-06, "loss": 0.018571637570858, "memory(GiB)": 22.66, "step": 23030, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.954944 }, { "epoch": 0.7481726927200077, "grad_norm": 0.29144755005836487, "learning_rate": 1.6359725177225238e-06, "loss": 0.008544037118554115, "memory(GiB)": 22.66, "step": 23031, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954951 }, { "epoch": 0.7482051781827632, "grad_norm": 0.5072951912879944, "learning_rate": 1.63557514096713e-06, "loss": 0.01594887673854828, "memory(GiB)": 22.66, "step": 23032, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.7482376636455186, "grad_norm": 0.35987389087677, "learning_rate": 1.6351778030416788e-06, "loss": 0.014675183221697807, "memory(GiB)": 22.66, "step": 23033, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.954965 }, { "epoch": 0.748270149108274, "grad_norm": 0.2975616753101349, "learning_rate": 1.6347805039507508e-06, "loss": 0.009626142680644989, "memory(GiB)": 22.66, "step": 23034, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.954974 }, { "epoch": 0.7483026345710294, "grad_norm": 0.31273964047431946, "learning_rate": 1.6343832436989382e-06, "loss": 0.01020210050046444, "memory(GiB)": 22.66, "step": 23035, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.954983 }, { "epoch": 0.7483351200337849, "grad_norm": 0.4493311643600464, "learning_rate": 1.633986022290821e-06, "loss": 0.016897985711693764, "memory(GiB)": 22.66, "step": 23036, "token_acc": 0.9757575757575757, "train_speed(iter/s)": 0.954991 }, { "epoch": 0.7483676054965402, "grad_norm": 0.39807888865470886, "learning_rate": 1.6335888397309869e-06, "loss": 0.015048783272504807, "memory(GiB)": 22.66, "step": 23037, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955 }, { "epoch": 0.7484000909592957, "grad_norm": 0.3826020061969757, "learning_rate": 1.6331916960240162e-06, "loss": 0.018112748861312866, "memory(GiB)": 22.66, "step": 23038, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955009 }, { "epoch": 0.7484325764220511, "grad_norm": 0.13675914704799652, "learning_rate": 1.6327945911744952e-06, "loss": 0.00431768037378788, "memory(GiB)": 22.66, "step": 23039, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.7484650618848065, "grad_norm": 0.28540483117103577, "learning_rate": 1.632397525187006e-06, "loss": 0.010521681979298592, "memory(GiB)": 22.66, "step": 23040, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955024 }, { "epoch": 0.748497547347562, "grad_norm": 0.2770507037639618, "learning_rate": 1.6320004980661335e-06, "loss": 0.01118549332022667, "memory(GiB)": 22.66, "step": 23041, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955032 }, { "epoch": 0.7485300328103174, "grad_norm": 0.3418422341346741, "learning_rate": 1.6316035098164562e-06, "loss": 0.0161525160074234, "memory(GiB)": 22.66, "step": 23042, "token_acc": 1.0, "train_speed(iter/s)": 0.955039 }, { "epoch": 0.7485625182730729, "grad_norm": 0.2527576982975006, "learning_rate": 1.6312065604425576e-06, "loss": 0.005748423747718334, "memory(GiB)": 22.66, "step": 23043, "token_acc": 1.0, "train_speed(iter/s)": 0.955046 }, { "epoch": 0.7485950037358282, "grad_norm": 0.44777077436447144, "learning_rate": 1.6308096499490194e-06, "loss": 0.014678718522191048, "memory(GiB)": 22.66, "step": 23044, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.7486274891985837, "grad_norm": 0.35250455141067505, "learning_rate": 1.6304127783404222e-06, "loss": 0.01367998868227005, "memory(GiB)": 22.66, "step": 23045, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95506 }, { "epoch": 0.748659974661339, "grad_norm": 0.32770052552223206, "learning_rate": 1.6300159456213482e-06, "loss": 0.010769069194793701, "memory(GiB)": 22.66, "step": 23046, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955067 }, { "epoch": 0.7486924601240945, "grad_norm": 0.28471067547798157, "learning_rate": 1.629619151796374e-06, "loss": 0.01154616754502058, "memory(GiB)": 22.66, "step": 23047, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955074 }, { "epoch": 0.7487249455868499, "grad_norm": 0.3564535975456238, "learning_rate": 1.6292223968700804e-06, "loss": 0.013082648627460003, "memory(GiB)": 22.66, "step": 23048, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955081 }, { "epoch": 0.7487574310496053, "grad_norm": 0.4396907091140747, "learning_rate": 1.628825680847047e-06, "loss": 0.016875142231583595, "memory(GiB)": 22.66, "step": 23049, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955089 }, { "epoch": 0.7487899165123607, "grad_norm": 0.2635287642478943, "learning_rate": 1.628429003731854e-06, "loss": 0.01041331421583891, "memory(GiB)": 22.66, "step": 23050, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.7488224019751162, "grad_norm": 0.3291928470134735, "learning_rate": 1.628032365529076e-06, "loss": 0.01313343457877636, "memory(GiB)": 22.66, "step": 23051, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955102 }, { "epoch": 0.7488548874378715, "grad_norm": 0.46041300892829895, "learning_rate": 1.6276357662432929e-06, "loss": 0.015232647769153118, "memory(GiB)": 22.66, "step": 23052, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.955108 }, { "epoch": 0.748887372900627, "grad_norm": 0.25441446900367737, "learning_rate": 1.6272392058790814e-06, "loss": 0.006360879633575678, "memory(GiB)": 22.66, "step": 23053, "token_acc": 1.0, "train_speed(iter/s)": 0.955114 }, { "epoch": 0.7489198583633824, "grad_norm": 0.330115407705307, "learning_rate": 1.62684268444102e-06, "loss": 0.01592414081096649, "memory(GiB)": 22.66, "step": 23054, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955121 }, { "epoch": 0.7489523438261378, "grad_norm": 0.21412597596645355, "learning_rate": 1.626446201933683e-06, "loss": 0.007190554868429899, "memory(GiB)": 22.66, "step": 23055, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955129 }, { "epoch": 0.7489848292888932, "grad_norm": 0.3111476004123688, "learning_rate": 1.6260497583616475e-06, "loss": 0.016469130292534828, "memory(GiB)": 22.66, "step": 23056, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955136 }, { "epoch": 0.7490173147516487, "grad_norm": 0.3748389482498169, "learning_rate": 1.625653353729486e-06, "loss": 0.014485042542219162, "memory(GiB)": 22.66, "step": 23057, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.749049800214404, "grad_norm": 0.283991277217865, "learning_rate": 1.625256988041779e-06, "loss": 0.015774205327033997, "memory(GiB)": 22.66, "step": 23058, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95515 }, { "epoch": 0.7490822856771595, "grad_norm": 0.44200044870376587, "learning_rate": 1.6248606613030966e-06, "loss": 0.01865559257566929, "memory(GiB)": 22.66, "step": 23059, "token_acc": 1.0, "train_speed(iter/s)": 0.955157 }, { "epoch": 0.7491147711399149, "grad_norm": 0.41623884439468384, "learning_rate": 1.6244643735180143e-06, "loss": 0.01830448769032955, "memory(GiB)": 22.66, "step": 23060, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.7491472566026703, "grad_norm": 0.5134594440460205, "learning_rate": 1.6240681246911078e-06, "loss": 0.015002107247710228, "memory(GiB)": 22.66, "step": 23061, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.95517 }, { "epoch": 0.7491797420654257, "grad_norm": 0.34552714228630066, "learning_rate": 1.6236719148269453e-06, "loss": 0.01886899769306183, "memory(GiB)": 22.66, "step": 23062, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.7492122275281812, "grad_norm": 0.2658841013908386, "learning_rate": 1.623275743930106e-06, "loss": 0.008062783628702164, "memory(GiB)": 22.66, "step": 23063, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955173 }, { "epoch": 0.7492447129909365, "grad_norm": 0.4168838858604431, "learning_rate": 1.622879612005157e-06, "loss": 0.015720022842288017, "memory(GiB)": 22.66, "step": 23064, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.749277198453692, "grad_norm": 0.3718692362308502, "learning_rate": 1.6224835190566745e-06, "loss": 0.01571407914161682, "memory(GiB)": 22.66, "step": 23065, "token_acc": 1.0, "train_speed(iter/s)": 0.955189 }, { "epoch": 0.7493096839164474, "grad_norm": 0.299050897359848, "learning_rate": 1.6220874650892254e-06, "loss": 0.0129950987175107, "memory(GiB)": 22.66, "step": 23066, "token_acc": 1.0, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.7493421693792028, "grad_norm": 0.30233827233314514, "learning_rate": 1.6216914501073832e-06, "loss": 0.014371879398822784, "memory(GiB)": 22.66, "step": 23067, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955207 }, { "epoch": 0.7493746548419582, "grad_norm": 0.3326681852340698, "learning_rate": 1.6212954741157184e-06, "loss": 0.012634972110390663, "memory(GiB)": 22.66, "step": 23068, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.7494071403047137, "grad_norm": 0.417309045791626, "learning_rate": 1.6208995371188024e-06, "loss": 0.015449739061295986, "memory(GiB)": 22.66, "step": 23069, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.955224 }, { "epoch": 0.749439625767469, "grad_norm": 0.30858132243156433, "learning_rate": 1.6205036391212015e-06, "loss": 0.01649566926062107, "memory(GiB)": 22.66, "step": 23070, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955232 }, { "epoch": 0.7494721112302245, "grad_norm": 0.26217612624168396, "learning_rate": 1.6201077801274867e-06, "loss": 0.010643905960023403, "memory(GiB)": 22.66, "step": 23071, "token_acc": 1.0, "train_speed(iter/s)": 0.955241 }, { "epoch": 0.7495045966929799, "grad_norm": 0.3816656470298767, "learning_rate": 1.6197119601422267e-06, "loss": 0.013791196048259735, "memory(GiB)": 22.66, "step": 23072, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.7495370821557353, "grad_norm": 0.3155320882797241, "learning_rate": 1.619316179169992e-06, "loss": 0.012967469170689583, "memory(GiB)": 22.66, "step": 23073, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955256 }, { "epoch": 0.7495695676184907, "grad_norm": 0.23208750784397125, "learning_rate": 1.618920437215346e-06, "loss": 0.012670825235545635, "memory(GiB)": 22.66, "step": 23074, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955263 }, { "epoch": 0.7496020530812462, "grad_norm": 0.49349188804626465, "learning_rate": 1.6185247342828586e-06, "loss": 0.010815910063683987, "memory(GiB)": 22.66, "step": 23075, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.7496345385440015, "grad_norm": 0.2338760495185852, "learning_rate": 1.6181290703770963e-06, "loss": 0.010776977986097336, "memory(GiB)": 22.66, "step": 23076, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.749667024006757, "grad_norm": 0.32503876090049744, "learning_rate": 1.617733445502626e-06, "loss": 0.013176921755075455, "memory(GiB)": 22.66, "step": 23077, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955285 }, { "epoch": 0.7496995094695124, "grad_norm": 0.37495192885398865, "learning_rate": 1.6173378596640156e-06, "loss": 0.01307770051062107, "memory(GiB)": 22.66, "step": 23078, "token_acc": 1.0, "train_speed(iter/s)": 0.955292 }, { "epoch": 0.7497319949322678, "grad_norm": 0.283477783203125, "learning_rate": 1.6169423128658262e-06, "loss": 0.010146989487111568, "memory(GiB)": 22.66, "step": 23079, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.955299 }, { "epoch": 0.7497644803950232, "grad_norm": 0.3474315404891968, "learning_rate": 1.616546805112626e-06, "loss": 0.015093627385795116, "memory(GiB)": 22.66, "step": 23080, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955306 }, { "epoch": 0.7497969658577787, "grad_norm": 0.346914678812027, "learning_rate": 1.6161513364089788e-06, "loss": 0.01070394366979599, "memory(GiB)": 22.66, "step": 23081, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955312 }, { "epoch": 0.749829451320534, "grad_norm": 0.3428790867328644, "learning_rate": 1.6157559067594504e-06, "loss": 0.012017348781228065, "memory(GiB)": 22.66, "step": 23082, "token_acc": 1.0, "train_speed(iter/s)": 0.955319 }, { "epoch": 0.7498619367832895, "grad_norm": 0.29407429695129395, "learning_rate": 1.615360516168602e-06, "loss": 0.010620223358273506, "memory(GiB)": 22.66, "step": 23083, "token_acc": 1.0, "train_speed(iter/s)": 0.955325 }, { "epoch": 0.7498944222460449, "grad_norm": 0.25989678502082825, "learning_rate": 1.6149651646409996e-06, "loss": 0.010623618960380554, "memory(GiB)": 22.66, "step": 23084, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.955332 }, { "epoch": 0.7499269077088003, "grad_norm": 0.44632989168167114, "learning_rate": 1.6145698521812014e-06, "loss": 0.014813174493610859, "memory(GiB)": 22.66, "step": 23085, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.7499593931715557, "grad_norm": 0.23573915660381317, "learning_rate": 1.6141745787937762e-06, "loss": 0.010468857362866402, "memory(GiB)": 22.66, "step": 23086, "token_acc": 1.0, "train_speed(iter/s)": 0.955346 }, { "epoch": 0.7499918786343112, "grad_norm": 0.18816515803337097, "learning_rate": 1.6137793444832816e-06, "loss": 0.006648770533502102, "memory(GiB)": 22.66, "step": 23087, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955352 }, { "epoch": 0.7500243640970665, "grad_norm": 0.33178240060806274, "learning_rate": 1.6133841492542817e-06, "loss": 0.017000442370772362, "memory(GiB)": 22.66, "step": 23088, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955359 }, { "epoch": 0.750056849559822, "grad_norm": 0.3984185457229614, "learning_rate": 1.6129889931113351e-06, "loss": 0.01560979150235653, "memory(GiB)": 22.66, "step": 23089, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955366 }, { "epoch": 0.7500893350225774, "grad_norm": 0.3671002686023712, "learning_rate": 1.612593876059002e-06, "loss": 0.01924806833267212, "memory(GiB)": 22.66, "step": 23090, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.7501218204853328, "grad_norm": 0.28019896149635315, "learning_rate": 1.6121987981018471e-06, "loss": 0.013649463653564453, "memory(GiB)": 22.66, "step": 23091, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.7501543059480882, "grad_norm": 0.3978487551212311, "learning_rate": 1.6118037592444257e-06, "loss": 0.01147559192031622, "memory(GiB)": 22.66, "step": 23092, "token_acc": 1.0, "train_speed(iter/s)": 0.955387 }, { "epoch": 0.7501867914108437, "grad_norm": 0.6648048758506775, "learning_rate": 1.6114087594913003e-06, "loss": 0.01871047355234623, "memory(GiB)": 22.66, "step": 23093, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955396 }, { "epoch": 0.750219276873599, "grad_norm": 0.19457361102104187, "learning_rate": 1.6110137988470264e-06, "loss": 0.006888914853334427, "memory(GiB)": 22.66, "step": 23094, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955405 }, { "epoch": 0.7502517623363545, "grad_norm": 0.39314037561416626, "learning_rate": 1.610618877316164e-06, "loss": 0.014608243480324745, "memory(GiB)": 22.66, "step": 23095, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.7502842477991098, "grad_norm": 0.30276256799697876, "learning_rate": 1.6102239949032711e-06, "loss": 0.01349900383502245, "memory(GiB)": 22.66, "step": 23096, "token_acc": 1.0, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.7503167332618653, "grad_norm": 0.411432147026062, "learning_rate": 1.6098291516129067e-06, "loss": 0.02001834474503994, "memory(GiB)": 22.66, "step": 23097, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955432 }, { "epoch": 0.7503492187246207, "grad_norm": 2.210296392440796, "learning_rate": 1.6094343474496243e-06, "loss": 0.012171823531389236, "memory(GiB)": 22.66, "step": 23098, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.7503817041873762, "grad_norm": 0.7535315155982971, "learning_rate": 1.6090395824179822e-06, "loss": 0.01914471760392189, "memory(GiB)": 22.66, "step": 23099, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.95545 }, { "epoch": 0.7504141896501315, "grad_norm": 0.3368990123271942, "learning_rate": 1.6086448565225372e-06, "loss": 0.009622767567634583, "memory(GiB)": 22.66, "step": 23100, "token_acc": 1.0, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.750446675112887, "grad_norm": 0.3717997670173645, "learning_rate": 1.608250169767846e-06, "loss": 0.011792514473199844, "memory(GiB)": 22.66, "step": 23101, "token_acc": 1.0, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.7504791605756423, "grad_norm": 0.4739968776702881, "learning_rate": 1.6078555221584603e-06, "loss": 0.021767940372228622, "memory(GiB)": 22.66, "step": 23102, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.7505116460383978, "grad_norm": 0.37939271330833435, "learning_rate": 1.6074609136989367e-06, "loss": 0.015885408967733383, "memory(GiB)": 22.66, "step": 23103, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.7505441315011533, "grad_norm": 0.28660205006599426, "learning_rate": 1.60706634439383e-06, "loss": 0.012455921620130539, "memory(GiB)": 22.66, "step": 23104, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.7505766169639086, "grad_norm": 0.7230997085571289, "learning_rate": 1.606671814247695e-06, "loss": 0.01931009255349636, "memory(GiB)": 22.66, "step": 23105, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.7506091024266641, "grad_norm": 0.48433274030685425, "learning_rate": 1.6062773232650825e-06, "loss": 0.022786883637309074, "memory(GiB)": 22.66, "step": 23106, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.7506415878894195, "grad_norm": 0.35839158296585083, "learning_rate": 1.6058828714505465e-06, "loss": 0.011340856552124023, "memory(GiB)": 22.66, "step": 23107, "token_acc": 1.0, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.750674073352175, "grad_norm": 0.4871678948402405, "learning_rate": 1.60548845880864e-06, "loss": 0.02036294713616371, "memory(GiB)": 22.66, "step": 23108, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.7507065588149303, "grad_norm": 0.3311605751514435, "learning_rate": 1.6050940853439146e-06, "loss": 0.015036597847938538, "memory(GiB)": 22.66, "step": 23109, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.7507390442776858, "grad_norm": 0.4255521595478058, "learning_rate": 1.6046997510609236e-06, "loss": 0.013058004900813103, "memory(GiB)": 22.66, "step": 23110, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.7507715297404411, "grad_norm": 0.3419068157672882, "learning_rate": 1.6043054559642151e-06, "loss": 0.01227081473916769, "memory(GiB)": 22.66, "step": 23111, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.7508040152031966, "grad_norm": 0.4492015540599823, "learning_rate": 1.603911200058343e-06, "loss": 0.013444500975310802, "memory(GiB)": 22.66, "step": 23112, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.750836500665952, "grad_norm": 0.4150531589984894, "learning_rate": 1.603516983347852e-06, "loss": 0.0158504880964756, "memory(GiB)": 22.66, "step": 23113, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.7508689861287074, "grad_norm": 0.2335374653339386, "learning_rate": 1.6031228058372995e-06, "loss": 0.014987168833613396, "memory(GiB)": 22.66, "step": 23114, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.7509014715914628, "grad_norm": 0.4529155492782593, "learning_rate": 1.6027286675312292e-06, "loss": 0.011770792305469513, "memory(GiB)": 22.66, "step": 23115, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955571 }, { "epoch": 0.7509339570542183, "grad_norm": 0.45132049918174744, "learning_rate": 1.602334568434194e-06, "loss": 0.012692637741565704, "memory(GiB)": 22.66, "step": 23116, "token_acc": 1.0, "train_speed(iter/s)": 0.955578 }, { "epoch": 0.7509664425169736, "grad_norm": 0.33634376525878906, "learning_rate": 1.6019405085507377e-06, "loss": 0.00923627708107233, "memory(GiB)": 22.66, "step": 23117, "token_acc": 1.0, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.7509989279797291, "grad_norm": 0.25815314054489136, "learning_rate": 1.6015464878854114e-06, "loss": 0.010162416845560074, "memory(GiB)": 22.66, "step": 23118, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.7510314134424845, "grad_norm": 0.23844534158706665, "learning_rate": 1.6011525064427618e-06, "loss": 0.00946742482483387, "memory(GiB)": 22.66, "step": 23119, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.75106389890524, "grad_norm": 0.5414338707923889, "learning_rate": 1.6007585642273377e-06, "loss": 0.023003775626420975, "memory(GiB)": 22.66, "step": 23120, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.7510963843679953, "grad_norm": 0.35583382844924927, "learning_rate": 1.6003646612436825e-06, "loss": 0.012878572568297386, "memory(GiB)": 22.66, "step": 23121, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.7511288698307508, "grad_norm": 0.3607172966003418, "learning_rate": 1.5999707974963441e-06, "loss": 0.017230670899152756, "memory(GiB)": 22.66, "step": 23122, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955621 }, { "epoch": 0.7511613552935061, "grad_norm": 0.3794328272342682, "learning_rate": 1.5995769729898681e-06, "loss": 0.01249491423368454, "memory(GiB)": 22.66, "step": 23123, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.7511938407562616, "grad_norm": 2.0614235401153564, "learning_rate": 1.5991831877287994e-06, "loss": 0.0237122792750597, "memory(GiB)": 22.66, "step": 23124, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.751226326219017, "grad_norm": 0.2955199182033539, "learning_rate": 1.5987894417176853e-06, "loss": 0.011080286465585232, "memory(GiB)": 22.66, "step": 23125, "token_acc": 1.0, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.7512588116817724, "grad_norm": 0.3291357457637787, "learning_rate": 1.5983957349610662e-06, "loss": 0.013500068336725235, "memory(GiB)": 22.66, "step": 23126, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.7512912971445278, "grad_norm": 0.6252169609069824, "learning_rate": 1.5980020674634877e-06, "loss": 0.014580197632312775, "memory(GiB)": 22.66, "step": 23127, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.7513237826072833, "grad_norm": 0.49140220880508423, "learning_rate": 1.5976084392294938e-06, "loss": 0.021542863920331, "memory(GiB)": 22.66, "step": 23128, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955661 }, { "epoch": 0.7513562680700386, "grad_norm": 0.338619589805603, "learning_rate": 1.5972148502636286e-06, "loss": 0.014668848365545273, "memory(GiB)": 22.66, "step": 23129, "token_acc": 0.992, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.7513887535327941, "grad_norm": 0.37333208322525024, "learning_rate": 1.5968213005704315e-06, "loss": 0.015918347984552383, "memory(GiB)": 22.66, "step": 23130, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.7514212389955495, "grad_norm": 0.4058817923069, "learning_rate": 1.5964277901544462e-06, "loss": 0.019091475754976273, "memory(GiB)": 22.66, "step": 23131, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.7514537244583049, "grad_norm": 0.3918095827102661, "learning_rate": 1.5960343190202144e-06, "loss": 0.013685317710042, "memory(GiB)": 22.66, "step": 23132, "token_acc": 1.0, "train_speed(iter/s)": 0.955695 }, { "epoch": 0.7514862099210603, "grad_norm": 0.2695009410381317, "learning_rate": 1.5956408871722785e-06, "loss": 0.013233641162514687, "memory(GiB)": 22.66, "step": 23133, "token_acc": 1.0, "train_speed(iter/s)": 0.9557 }, { "epoch": 0.7515186953838158, "grad_norm": 0.33149516582489014, "learning_rate": 1.5952474946151764e-06, "loss": 0.009632079862058163, "memory(GiB)": 22.66, "step": 23134, "token_acc": 1.0, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.7515511808465711, "grad_norm": 0.344237744808197, "learning_rate": 1.5948541413534518e-06, "loss": 0.013406574726104736, "memory(GiB)": 22.66, "step": 23135, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955715 }, { "epoch": 0.7515836663093266, "grad_norm": 0.5954771041870117, "learning_rate": 1.594460827391639e-06, "loss": 0.01428611483424902, "memory(GiB)": 22.66, "step": 23136, "token_acc": 1.0, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.751616151772082, "grad_norm": 0.3434789776802063, "learning_rate": 1.594067552734283e-06, "loss": 0.01197686418890953, "memory(GiB)": 22.66, "step": 23137, "token_acc": 1.0, "train_speed(iter/s)": 0.955728 }, { "epoch": 0.7516486372348374, "grad_norm": 0.2807975709438324, "learning_rate": 1.5936743173859226e-06, "loss": 0.010604627430438995, "memory(GiB)": 22.66, "step": 23138, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.7516811226975928, "grad_norm": 0.3943845331668854, "learning_rate": 1.5932811213510923e-06, "loss": 0.015314353629946709, "memory(GiB)": 22.66, "step": 23139, "token_acc": 1.0, "train_speed(iter/s)": 0.955742 }, { "epoch": 0.7517136081603483, "grad_norm": 0.261808842420578, "learning_rate": 1.5928879646343343e-06, "loss": 0.007775237318128347, "memory(GiB)": 22.66, "step": 23140, "token_acc": 1.0, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.7517460936231036, "grad_norm": 0.3745211362838745, "learning_rate": 1.5924948472401803e-06, "loss": 0.019769925624132156, "memory(GiB)": 22.66, "step": 23141, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.7517785790858591, "grad_norm": 0.36171457171440125, "learning_rate": 1.5921017691731743e-06, "loss": 0.012599808163940907, "memory(GiB)": 22.66, "step": 23142, "token_acc": 1.0, "train_speed(iter/s)": 0.955763 }, { "epoch": 0.7518110645486145, "grad_norm": 0.35723453760147095, "learning_rate": 1.5917087304378487e-06, "loss": 0.014232534915208817, "memory(GiB)": 22.66, "step": 23143, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.955769 }, { "epoch": 0.7518435500113699, "grad_norm": 0.4719529151916504, "learning_rate": 1.5913157310387417e-06, "loss": 0.017922433093190193, "memory(GiB)": 22.66, "step": 23144, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.7518760354741253, "grad_norm": 0.4586489200592041, "learning_rate": 1.590922770980387e-06, "loss": 0.016345791518688202, "memory(GiB)": 22.66, "step": 23145, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.7519085209368808, "grad_norm": 0.4011389911174774, "learning_rate": 1.5905298502673212e-06, "loss": 0.0168167632073164, "memory(GiB)": 22.66, "step": 23146, "token_acc": 1.0, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.7519410063996361, "grad_norm": 0.26329541206359863, "learning_rate": 1.5901369689040786e-06, "loss": 0.008693215437233448, "memory(GiB)": 22.66, "step": 23147, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.7519734918623916, "grad_norm": 0.4464895725250244, "learning_rate": 1.5897441268951952e-06, "loss": 0.012453142553567886, "memory(GiB)": 22.66, "step": 23148, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955803 }, { "epoch": 0.752005977325147, "grad_norm": 0.4620530903339386, "learning_rate": 1.5893513242452025e-06, "loss": 0.01783374324440956, "memory(GiB)": 22.66, "step": 23149, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.95581 }, { "epoch": 0.7520384627879024, "grad_norm": 0.33565381169319153, "learning_rate": 1.588958560958635e-06, "loss": 0.012187776155769825, "memory(GiB)": 22.66, "step": 23150, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955818 }, { "epoch": 0.7520709482506578, "grad_norm": 0.33248674869537354, "learning_rate": 1.5885658370400259e-06, "loss": 0.009568225592374802, "memory(GiB)": 22.66, "step": 23151, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.955824 }, { "epoch": 0.7521034337134133, "grad_norm": 0.36624303460121155, "learning_rate": 1.588173152493907e-06, "loss": 0.011871591210365295, "memory(GiB)": 22.66, "step": 23152, "token_acc": 1.0, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.7521359191761686, "grad_norm": 0.2709369957447052, "learning_rate": 1.5877805073248132e-06, "loss": 0.013063758611679077, "memory(GiB)": 22.66, "step": 23153, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.7521684046389241, "grad_norm": 0.5130171775817871, "learning_rate": 1.5873879015372722e-06, "loss": 0.017077907919883728, "memory(GiB)": 22.66, "step": 23154, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.7522008901016795, "grad_norm": 0.38172200322151184, "learning_rate": 1.5869953351358174e-06, "loss": 0.016217045485973358, "memory(GiB)": 22.66, "step": 23155, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95586 }, { "epoch": 0.7522333755644349, "grad_norm": 0.35373666882514954, "learning_rate": 1.586602808124979e-06, "loss": 0.01107180304825306, "memory(GiB)": 22.66, "step": 23156, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.7522658610271903, "grad_norm": 0.30454742908477783, "learning_rate": 1.5862103205092888e-06, "loss": 0.014477703720331192, "memory(GiB)": 22.66, "step": 23157, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.955877 }, { "epoch": 0.7522983464899458, "grad_norm": 0.2569853961467743, "learning_rate": 1.5858178722932737e-06, "loss": 0.015446737408638, "memory(GiB)": 22.66, "step": 23158, "token_acc": 0.982532751091703, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.7523308319527011, "grad_norm": 0.3426738977432251, "learning_rate": 1.5854254634814648e-06, "loss": 0.016643516719341278, "memory(GiB)": 22.66, "step": 23159, "token_acc": 1.0, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.7523633174154566, "grad_norm": 0.6058038473129272, "learning_rate": 1.5850330940783904e-06, "loss": 0.01679028756916523, "memory(GiB)": 22.66, "step": 23160, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955904 }, { "epoch": 0.752395802878212, "grad_norm": 0.22714672982692719, "learning_rate": 1.5846407640885813e-06, "loss": 0.008569303900003433, "memory(GiB)": 22.66, "step": 23161, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.7524282883409674, "grad_norm": 0.3417477011680603, "learning_rate": 1.584248473516562e-06, "loss": 0.01878322847187519, "memory(GiB)": 22.66, "step": 23162, "token_acc": 0.9836065573770492, "train_speed(iter/s)": 0.955922 }, { "epoch": 0.7524607738037228, "grad_norm": 0.356021523475647, "learning_rate": 1.5838562223668629e-06, "loss": 0.009011263027787209, "memory(GiB)": 22.66, "step": 23163, "token_acc": 1.0, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.7524932592664783, "grad_norm": 0.34121692180633545, "learning_rate": 1.5834640106440075e-06, "loss": 0.010071687400341034, "memory(GiB)": 22.66, "step": 23164, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955939 }, { "epoch": 0.7525257447292336, "grad_norm": 0.38915902376174927, "learning_rate": 1.5830718383525273e-06, "loss": 0.011145555414259434, "memory(GiB)": 22.66, "step": 23165, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955948 }, { "epoch": 0.7525582301919891, "grad_norm": 0.33301666378974915, "learning_rate": 1.5826797054969446e-06, "loss": 0.010037893429398537, "memory(GiB)": 22.66, "step": 23166, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.7525907156547444, "grad_norm": 0.33473461866378784, "learning_rate": 1.582287612081788e-06, "loss": 0.012300115078687668, "memory(GiB)": 22.66, "step": 23167, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955965 }, { "epoch": 0.7526232011174999, "grad_norm": 0.3481545150279999, "learning_rate": 1.5818955581115786e-06, "loss": 0.0127926766872406, "memory(GiB)": 22.66, "step": 23168, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955972 }, { "epoch": 0.7526556865802554, "grad_norm": 0.3345375955104828, "learning_rate": 1.5815035435908456e-06, "loss": 0.012318902648985386, "memory(GiB)": 22.66, "step": 23169, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955979 }, { "epoch": 0.7526881720430108, "grad_norm": 0.3823809325695038, "learning_rate": 1.5811115685241125e-06, "loss": 0.014295750297605991, "memory(GiB)": 22.66, "step": 23170, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.955986 }, { "epoch": 0.7527206575057662, "grad_norm": 0.32757389545440674, "learning_rate": 1.5807196329159014e-06, "loss": 0.014182735234498978, "memory(GiB)": 22.66, "step": 23171, "token_acc": 1.0, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.7527531429685216, "grad_norm": 0.2548815608024597, "learning_rate": 1.580327736770738e-06, "loss": 0.008775703608989716, "memory(GiB)": 22.66, "step": 23172, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.752785628431277, "grad_norm": 0.44234809279441833, "learning_rate": 1.5799358800931413e-06, "loss": 0.018391840159893036, "memory(GiB)": 22.66, "step": 23173, "token_acc": 1.0, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.7528181138940324, "grad_norm": 0.3037176728248596, "learning_rate": 1.5795440628876395e-06, "loss": 0.014834238216280937, "memory(GiB)": 22.66, "step": 23174, "token_acc": 0.990506329113924, "train_speed(iter/s)": 0.956012 }, { "epoch": 0.7528505993567879, "grad_norm": 0.28743109107017517, "learning_rate": 1.579152285158751e-06, "loss": 0.015839483588933945, "memory(GiB)": 22.66, "step": 23175, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956019 }, { "epoch": 0.7528830848195432, "grad_norm": 0.3097652792930603, "learning_rate": 1.578760546910999e-06, "loss": 0.017856312915682793, "memory(GiB)": 22.66, "step": 23176, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956026 }, { "epoch": 0.7529155702822987, "grad_norm": 0.3806959092617035, "learning_rate": 1.578368848148903e-06, "loss": 0.015613866969943047, "memory(GiB)": 22.66, "step": 23177, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956033 }, { "epoch": 0.7529480557450541, "grad_norm": 0.618363618850708, "learning_rate": 1.577977188876984e-06, "loss": 0.016101600602269173, "memory(GiB)": 22.66, "step": 23178, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.7529805412078096, "grad_norm": 0.4968125522136688, "learning_rate": 1.5775855690997633e-06, "loss": 0.016917778179049492, "memory(GiB)": 22.66, "step": 23179, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.956046 }, { "epoch": 0.7530130266705649, "grad_norm": 0.5006206631660461, "learning_rate": 1.5771939888217618e-06, "loss": 0.014935840852558613, "memory(GiB)": 22.66, "step": 23180, "token_acc": 1.0, "train_speed(iter/s)": 0.956053 }, { "epoch": 0.7530455121333204, "grad_norm": 0.4221866726875305, "learning_rate": 1.5768024480474964e-06, "loss": 0.016339540481567383, "memory(GiB)": 22.66, "step": 23181, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.7530779975960757, "grad_norm": 0.3042960464954376, "learning_rate": 1.5764109467814865e-06, "loss": 0.015279938466846943, "memory(GiB)": 22.66, "step": 23182, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956067 }, { "epoch": 0.7531104830588312, "grad_norm": 0.3316490948200226, "learning_rate": 1.5760194850282512e-06, "loss": 0.014421502128243446, "memory(GiB)": 22.66, "step": 23183, "token_acc": 1.0, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.7531429685215866, "grad_norm": 0.3129172921180725, "learning_rate": 1.5756280627923082e-06, "loss": 0.009275879710912704, "memory(GiB)": 22.66, "step": 23184, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.753175453984342, "grad_norm": 0.3127424418926239, "learning_rate": 1.5752366800781765e-06, "loss": 0.013227391988039017, "memory(GiB)": 22.66, "step": 23185, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.7532079394470974, "grad_norm": 0.2691092789173126, "learning_rate": 1.574845336890371e-06, "loss": 0.009978831745684147, "memory(GiB)": 22.66, "step": 23186, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.7532404249098529, "grad_norm": 0.2808888256549835, "learning_rate": 1.5744540332334085e-06, "loss": 0.0109268669039011, "memory(GiB)": 22.66, "step": 23187, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.7532729103726082, "grad_norm": 0.2196660190820694, "learning_rate": 1.574062769111806e-06, "loss": 0.009175579063594341, "memory(GiB)": 22.66, "step": 23188, "token_acc": 1.0, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.7533053958353637, "grad_norm": 0.4246436059474945, "learning_rate": 1.573671544530081e-06, "loss": 0.009253058582544327, "memory(GiB)": 22.66, "step": 23189, "token_acc": 1.0, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.7533378812981191, "grad_norm": 0.373273640871048, "learning_rate": 1.5732803594927447e-06, "loss": 0.016973601654171944, "memory(GiB)": 22.66, "step": 23190, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.7533703667608745, "grad_norm": 0.2901858985424042, "learning_rate": 1.5728892140043144e-06, "loss": 0.012894655577838421, "memory(GiB)": 22.66, "step": 23191, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.7534028522236299, "grad_norm": 0.2624848186969757, "learning_rate": 1.5724981080693041e-06, "loss": 0.008607856929302216, "memory(GiB)": 22.66, "step": 23192, "token_acc": 1.0, "train_speed(iter/s)": 0.956134 }, { "epoch": 0.7534353376863854, "grad_norm": 0.28949329257011414, "learning_rate": 1.572107041692229e-06, "loss": 0.010850188322365284, "memory(GiB)": 22.66, "step": 23193, "token_acc": 0.992, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.7534678231491407, "grad_norm": 0.3201703131198883, "learning_rate": 1.5717160148776e-06, "loss": 0.013816852122545242, "memory(GiB)": 22.66, "step": 23194, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.7535003086118962, "grad_norm": 0.3353284001350403, "learning_rate": 1.5713250276299324e-06, "loss": 0.020146213471889496, "memory(GiB)": 22.66, "step": 23195, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.7535327940746516, "grad_norm": 0.34221121668815613, "learning_rate": 1.5709340799537342e-06, "loss": 0.015296143479645252, "memory(GiB)": 22.66, "step": 23196, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956162 }, { "epoch": 0.753565279537407, "grad_norm": 0.3146476745605469, "learning_rate": 1.5705431718535247e-06, "loss": 0.01555665209889412, "memory(GiB)": 22.66, "step": 23197, "token_acc": 1.0, "train_speed(iter/s)": 0.956169 }, { "epoch": 0.7535977650001624, "grad_norm": 0.34697481989860535, "learning_rate": 1.570152303333809e-06, "loss": 0.01563202776014805, "memory(GiB)": 22.66, "step": 23198, "token_acc": 0.996, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.7536302504629179, "grad_norm": 0.5225797891616821, "learning_rate": 1.5697614743991014e-06, "loss": 0.014348387718200684, "memory(GiB)": 22.66, "step": 23199, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956182 }, { "epoch": 0.7536627359256732, "grad_norm": 0.4105452597141266, "learning_rate": 1.5693706850539137e-06, "loss": 0.00808672048151493, "memory(GiB)": 22.66, "step": 23200, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.7536952213884287, "grad_norm": 0.3874455988407135, "learning_rate": 1.5689799353027513e-06, "loss": 0.01296832412481308, "memory(GiB)": 22.66, "step": 23201, "token_acc": 1.0, "train_speed(iter/s)": 0.956197 }, { "epoch": 0.7537277068511841, "grad_norm": 0.47880634665489197, "learning_rate": 1.5685892251501301e-06, "loss": 0.013152528554201126, "memory(GiB)": 22.66, "step": 23202, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.7537601923139395, "grad_norm": 0.19984976947307587, "learning_rate": 1.5681985546005545e-06, "loss": 0.008508743718266487, "memory(GiB)": 22.66, "step": 23203, "token_acc": 1.0, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.7537926777766949, "grad_norm": 0.3904845416545868, "learning_rate": 1.5678079236585375e-06, "loss": 0.015979178249835968, "memory(GiB)": 22.66, "step": 23204, "token_acc": 0.984, "train_speed(iter/s)": 0.956218 }, { "epoch": 0.7538251632394504, "grad_norm": 0.26000121235847473, "learning_rate": 1.5674173323285836e-06, "loss": 0.011619826778769493, "memory(GiB)": 22.66, "step": 23205, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.7538576487022057, "grad_norm": 0.36470967531204224, "learning_rate": 1.5670267806152023e-06, "loss": 0.01884220913052559, "memory(GiB)": 22.66, "step": 23206, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.7538901341649612, "grad_norm": 0.44795769453048706, "learning_rate": 1.5666362685229008e-06, "loss": 0.01728067174553871, "memory(GiB)": 22.66, "step": 23207, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956237 }, { "epoch": 0.7539226196277166, "grad_norm": 0.3705505132675171, "learning_rate": 1.5662457960561888e-06, "loss": 0.010464496910572052, "memory(GiB)": 22.66, "step": 23208, "token_acc": 1.0, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.753955105090472, "grad_norm": 0.304383248090744, "learning_rate": 1.5658553632195688e-06, "loss": 0.01154671236872673, "memory(GiB)": 22.66, "step": 23209, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956249 }, { "epoch": 0.7539875905532274, "grad_norm": 0.2605122923851013, "learning_rate": 1.565464970017549e-06, "loss": 0.012493164278566837, "memory(GiB)": 22.66, "step": 23210, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956256 }, { "epoch": 0.7540200760159829, "grad_norm": 0.612450361251831, "learning_rate": 1.5650746164546342e-06, "loss": 0.013087622821331024, "memory(GiB)": 22.66, "step": 23211, "token_acc": 0.9942857142857143, "train_speed(iter/s)": 0.956264 }, { "epoch": 0.7540525614787382, "grad_norm": 0.35269054770469666, "learning_rate": 1.5646843025353325e-06, "loss": 0.012822609394788742, "memory(GiB)": 22.66, "step": 23212, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956272 }, { "epoch": 0.7540850469414937, "grad_norm": 0.33179932832717896, "learning_rate": 1.5642940282641444e-06, "loss": 0.009087181650102139, "memory(GiB)": 22.66, "step": 23213, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.7541175324042491, "grad_norm": 0.44479620456695557, "learning_rate": 1.563903793645576e-06, "loss": 0.019733551889657974, "memory(GiB)": 22.66, "step": 23214, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.7541500178670045, "grad_norm": 0.29489508271217346, "learning_rate": 1.5635135986841315e-06, "loss": 0.009857486933469772, "memory(GiB)": 22.66, "step": 23215, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956298 }, { "epoch": 0.7541825033297599, "grad_norm": 0.3062979280948639, "learning_rate": 1.563123443384314e-06, "loss": 0.012861049734055996, "memory(GiB)": 22.66, "step": 23216, "token_acc": 1.0, "train_speed(iter/s)": 0.956307 }, { "epoch": 0.7542149887925154, "grad_norm": 0.336561381816864, "learning_rate": 1.5627333277506279e-06, "loss": 0.017650648951530457, "memory(GiB)": 22.66, "step": 23217, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.7542474742552707, "grad_norm": 0.5049644708633423, "learning_rate": 1.5623432517875724e-06, "loss": 0.015406278893351555, "memory(GiB)": 22.66, "step": 23218, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956324 }, { "epoch": 0.7542799597180262, "grad_norm": 0.3188551366329193, "learning_rate": 1.5619532154996519e-06, "loss": 0.011252032592892647, "memory(GiB)": 22.66, "step": 23219, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.7543124451807816, "grad_norm": 0.4085906744003296, "learning_rate": 1.5615632188913664e-06, "loss": 0.023134715855121613, "memory(GiB)": 22.66, "step": 23220, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.754344930643537, "grad_norm": 0.27633729577064514, "learning_rate": 1.5611732619672204e-06, "loss": 0.012881604954600334, "memory(GiB)": 22.66, "step": 23221, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.7543774161062924, "grad_norm": 0.2619949281215668, "learning_rate": 1.56078334473171e-06, "loss": 0.008446735329926014, "memory(GiB)": 22.66, "step": 23222, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.7544099015690479, "grad_norm": 0.3398311138153076, "learning_rate": 1.5603934671893394e-06, "loss": 0.014145059511065483, "memory(GiB)": 22.66, "step": 23223, "token_acc": 1.0, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.7544423870318032, "grad_norm": 0.22453810274600983, "learning_rate": 1.5600036293446025e-06, "loss": 0.012949954718351364, "memory(GiB)": 22.66, "step": 23224, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956377 }, { "epoch": 0.7544748724945587, "grad_norm": 0.48940402269363403, "learning_rate": 1.5596138312020059e-06, "loss": 0.023555463179945946, "memory(GiB)": 22.66, "step": 23225, "token_acc": 1.0, "train_speed(iter/s)": 0.956386 }, { "epoch": 0.754507357957314, "grad_norm": 0.7855355143547058, "learning_rate": 1.559224072766043e-06, "loss": 0.01654113084077835, "memory(GiB)": 22.66, "step": 23226, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956395 }, { "epoch": 0.7545398434200695, "grad_norm": 0.2229907512664795, "learning_rate": 1.5588343540412159e-06, "loss": 0.007655123248696327, "memory(GiB)": 22.66, "step": 23227, "token_acc": 1.0, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.7545723288828249, "grad_norm": 0.3120976686477661, "learning_rate": 1.5584446750320186e-06, "loss": 0.014830859377980232, "memory(GiB)": 22.66, "step": 23228, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956412 }, { "epoch": 0.7546048143455804, "grad_norm": 0.319100946187973, "learning_rate": 1.5580550357429491e-06, "loss": 0.011498373933136463, "memory(GiB)": 22.66, "step": 23229, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.7546372998083357, "grad_norm": 0.24660542607307434, "learning_rate": 1.5576654361785087e-06, "loss": 0.01002160832285881, "memory(GiB)": 22.66, "step": 23230, "token_acc": 1.0, "train_speed(iter/s)": 0.956428 }, { "epoch": 0.7546697852710912, "grad_norm": 0.2908685803413391, "learning_rate": 1.5572758763431894e-06, "loss": 0.011587287299335003, "memory(GiB)": 22.66, "step": 23231, "token_acc": 1.0, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.7547022707338467, "grad_norm": 0.3250199854373932, "learning_rate": 1.5568863562414905e-06, "loss": 0.016180168837308884, "memory(GiB)": 22.66, "step": 23232, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.754734756196602, "grad_norm": 0.3427971303462982, "learning_rate": 1.5564968758779042e-06, "loss": 0.014815909788012505, "memory(GiB)": 22.66, "step": 23233, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.7547672416593575, "grad_norm": 0.3706273138523102, "learning_rate": 1.5561074352569267e-06, "loss": 0.01399870216846466, "memory(GiB)": 22.66, "step": 23234, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.7547997271221129, "grad_norm": 0.48979607224464417, "learning_rate": 1.5557180343830537e-06, "loss": 0.018494870513677597, "memory(GiB)": 22.66, "step": 23235, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956464 }, { "epoch": 0.7548322125848683, "grad_norm": 0.22224245965480804, "learning_rate": 1.5553286732607804e-06, "loss": 0.012848066166043282, "memory(GiB)": 22.66, "step": 23236, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.7548646980476237, "grad_norm": 0.3689036965370178, "learning_rate": 1.5549393518945977e-06, "loss": 0.019742172211408615, "memory(GiB)": 22.66, "step": 23237, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.7548971835103792, "grad_norm": 0.3645837604999542, "learning_rate": 1.5545500702889999e-06, "loss": 0.01456694770604372, "memory(GiB)": 22.66, "step": 23238, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.7549296689731345, "grad_norm": 0.39176058769226074, "learning_rate": 1.5541608284484805e-06, "loss": 0.016633084043860435, "memory(GiB)": 22.66, "step": 23239, "token_acc": 1.0, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.75496215443589, "grad_norm": 0.26172950863838196, "learning_rate": 1.5537716263775326e-06, "loss": 0.01025811955332756, "memory(GiB)": 22.66, "step": 23240, "token_acc": 1.0, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.7549946398986453, "grad_norm": 0.2783203721046448, "learning_rate": 1.5533824640806462e-06, "loss": 0.01069414708763361, "memory(GiB)": 22.66, "step": 23241, "token_acc": 0.9875, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.7550271253614008, "grad_norm": 0.39839014410972595, "learning_rate": 1.5529933415623127e-06, "loss": 0.014468792825937271, "memory(GiB)": 22.66, "step": 23242, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956512 }, { "epoch": 0.7550596108241562, "grad_norm": 0.3505537211894989, "learning_rate": 1.5526042588270245e-06, "loss": 0.014858601614832878, "memory(GiB)": 22.66, "step": 23243, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956518 }, { "epoch": 0.7550920962869117, "grad_norm": 0.2717156410217285, "learning_rate": 1.5522152158792735e-06, "loss": 0.012254900299012661, "memory(GiB)": 22.66, "step": 23244, "token_acc": 1.0, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.755124581749667, "grad_norm": 0.40640226006507874, "learning_rate": 1.5518262127235463e-06, "loss": 0.018908988684415817, "memory(GiB)": 22.66, "step": 23245, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956533 }, { "epoch": 0.7551570672124225, "grad_norm": 0.39651060104370117, "learning_rate": 1.5514372493643336e-06, "loss": 0.015885572880506516, "memory(GiB)": 22.66, "step": 23246, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.7551895526751778, "grad_norm": 0.328874409198761, "learning_rate": 1.5510483258061254e-06, "loss": 0.011561993509531021, "memory(GiB)": 22.66, "step": 23247, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956546 }, { "epoch": 0.7552220381379333, "grad_norm": 0.28875574469566345, "learning_rate": 1.55065944205341e-06, "loss": 0.013280627317726612, "memory(GiB)": 22.66, "step": 23248, "token_acc": 1.0, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.7552545236006887, "grad_norm": 0.39767494797706604, "learning_rate": 1.550270598110677e-06, "loss": 0.021078191697597504, "memory(GiB)": 22.66, "step": 23249, "token_acc": 0.9789029535864979, "train_speed(iter/s)": 0.956558 }, { "epoch": 0.7552870090634441, "grad_norm": 0.41531428694725037, "learning_rate": 1.5498817939824118e-06, "loss": 0.013068929314613342, "memory(GiB)": 22.66, "step": 23250, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956565 }, { "epoch": 0.7553194945261995, "grad_norm": 0.32862406969070435, "learning_rate": 1.5494930296731042e-06, "loss": 0.012252148240804672, "memory(GiB)": 22.66, "step": 23251, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956571 }, { "epoch": 0.755351979988955, "grad_norm": 0.34935495257377625, "learning_rate": 1.5491043051872362e-06, "loss": 0.01545969769358635, "memory(GiB)": 22.66, "step": 23252, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.7553844654517103, "grad_norm": 0.30973777174949646, "learning_rate": 1.5487156205293007e-06, "loss": 0.014141938649117947, "memory(GiB)": 22.66, "step": 23253, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.7554169509144658, "grad_norm": 0.7221532464027405, "learning_rate": 1.5483269757037795e-06, "loss": 0.023107651621103287, "memory(GiB)": 22.66, "step": 23254, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.7554494363772212, "grad_norm": 0.41425999999046326, "learning_rate": 1.5479383707151608e-06, "loss": 0.013389200903475285, "memory(GiB)": 22.66, "step": 23255, "token_acc": 1.0, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.7554819218399766, "grad_norm": 0.26794320344924927, "learning_rate": 1.5475498055679262e-06, "loss": 0.01054447889328003, "memory(GiB)": 22.66, "step": 23256, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.755514407302732, "grad_norm": 0.4717787802219391, "learning_rate": 1.547161280266562e-06, "loss": 0.017861124128103256, "memory(GiB)": 22.66, "step": 23257, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956612 }, { "epoch": 0.7555468927654875, "grad_norm": 0.4144224524497986, "learning_rate": 1.5467727948155525e-06, "loss": 0.01704862155020237, "memory(GiB)": 22.66, "step": 23258, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.7555793782282428, "grad_norm": 0.28805822134017944, "learning_rate": 1.5463843492193826e-06, "loss": 0.010583983734250069, "memory(GiB)": 22.66, "step": 23259, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956624 }, { "epoch": 0.7556118636909983, "grad_norm": 0.3609321415424347, "learning_rate": 1.5459959434825323e-06, "loss": 0.012781010940670967, "memory(GiB)": 22.66, "step": 23260, "token_acc": 1.0, "train_speed(iter/s)": 0.956631 }, { "epoch": 0.7556443491537537, "grad_norm": 0.2612769901752472, "learning_rate": 1.5456075776094865e-06, "loss": 0.013557586818933487, "memory(GiB)": 22.66, "step": 23261, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.7556768346165091, "grad_norm": 0.43883630633354187, "learning_rate": 1.5452192516047266e-06, "loss": 0.01683592051267624, "memory(GiB)": 22.66, "step": 23262, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956644 }, { "epoch": 0.7557093200792645, "grad_norm": 0.3195837736129761, "learning_rate": 1.544830965472735e-06, "loss": 0.012687904760241508, "memory(GiB)": 22.66, "step": 23263, "token_acc": 1.0, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.75574180554202, "grad_norm": 0.2676365077495575, "learning_rate": 1.5444427192179946e-06, "loss": 0.007335471920669079, "memory(GiB)": 22.66, "step": 23264, "token_acc": 1.0, "train_speed(iter/s)": 0.956658 }, { "epoch": 0.7557742910047753, "grad_norm": 0.28658920526504517, "learning_rate": 1.544054512844983e-06, "loss": 0.01054244115948677, "memory(GiB)": 22.66, "step": 23265, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956665 }, { "epoch": 0.7558067764675308, "grad_norm": 0.32456377148628235, "learning_rate": 1.543666346358182e-06, "loss": 0.009698167443275452, "memory(GiB)": 22.66, "step": 23266, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.7558392619302862, "grad_norm": 0.34803664684295654, "learning_rate": 1.5432782197620721e-06, "loss": 0.012233561836183071, "memory(GiB)": 22.66, "step": 23267, "token_acc": 0.996, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.7558717473930416, "grad_norm": 0.31493881344795227, "learning_rate": 1.5428901330611335e-06, "loss": 0.013668704777956009, "memory(GiB)": 22.66, "step": 23268, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.755904232855797, "grad_norm": 0.5791460871696472, "learning_rate": 1.5425020862598428e-06, "loss": 0.02050943113863468, "memory(GiB)": 22.66, "step": 23269, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956692 }, { "epoch": 0.7559367183185525, "grad_norm": 0.6038385033607483, "learning_rate": 1.5421140793626798e-06, "loss": 0.01484762318432331, "memory(GiB)": 22.66, "step": 23270, "token_acc": 1.0, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.7559692037813078, "grad_norm": 0.348219633102417, "learning_rate": 1.5417261123741235e-06, "loss": 0.009307246655225754, "memory(GiB)": 22.66, "step": 23271, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.7560016892440633, "grad_norm": 0.2831163704395294, "learning_rate": 1.5413381852986514e-06, "loss": 0.013482624664902687, "memory(GiB)": 22.66, "step": 23272, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.7560341747068187, "grad_norm": 0.4115019738674164, "learning_rate": 1.5409502981407392e-06, "loss": 0.015877176076173782, "memory(GiB)": 22.66, "step": 23273, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956724 }, { "epoch": 0.7560666601695741, "grad_norm": 0.21785250306129456, "learning_rate": 1.5405624509048655e-06, "loss": 0.010538291186094284, "memory(GiB)": 22.66, "step": 23274, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.7560991456323295, "grad_norm": 0.49825820326805115, "learning_rate": 1.5401746435955028e-06, "loss": 0.014557222835719585, "memory(GiB)": 22.66, "step": 23275, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.756131631095085, "grad_norm": 0.2713328003883362, "learning_rate": 1.5397868762171315e-06, "loss": 0.010490437969565392, "memory(GiB)": 22.66, "step": 23276, "token_acc": 1.0, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.7561641165578403, "grad_norm": 0.3312273919582367, "learning_rate": 1.5393991487742266e-06, "loss": 0.011757571250200272, "memory(GiB)": 22.66, "step": 23277, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.7561966020205958, "grad_norm": 0.33377742767333984, "learning_rate": 1.5390114612712604e-06, "loss": 0.010706920176744461, "memory(GiB)": 22.66, "step": 23278, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.7562290874833512, "grad_norm": 0.35878291726112366, "learning_rate": 1.5386238137127097e-06, "loss": 0.014273698441684246, "memory(GiB)": 22.66, "step": 23279, "token_acc": 1.0, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.7562615729461066, "grad_norm": 0.36836639046669006, "learning_rate": 1.5382362061030437e-06, "loss": 0.010336056351661682, "memory(GiB)": 22.66, "step": 23280, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956784 }, { "epoch": 0.756294058408862, "grad_norm": 0.540215015411377, "learning_rate": 1.5378486384467433e-06, "loss": 0.024529414251446724, "memory(GiB)": 22.66, "step": 23281, "token_acc": 0.9932885906040269, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.7563265438716175, "grad_norm": 0.26886630058288574, "learning_rate": 1.5374611107482757e-06, "loss": 0.011281521990895271, "memory(GiB)": 22.66, "step": 23282, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.7563590293343728, "grad_norm": 0.34624388813972473, "learning_rate": 1.5370736230121174e-06, "loss": 0.013454754836857319, "memory(GiB)": 22.66, "step": 23283, "token_acc": 1.0, "train_speed(iter/s)": 0.956809 }, { "epoch": 0.7563915147971283, "grad_norm": 0.433301717042923, "learning_rate": 1.536686175242737e-06, "loss": 0.013191024772822857, "memory(GiB)": 22.66, "step": 23284, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956818 }, { "epoch": 0.7564240002598837, "grad_norm": 0.36553138494491577, "learning_rate": 1.5362987674446073e-06, "loss": 0.010139699093997478, "memory(GiB)": 22.66, "step": 23285, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.7564564857226391, "grad_norm": 0.42060157656669617, "learning_rate": 1.535911399622201e-06, "loss": 0.02002486027777195, "memory(GiB)": 22.66, "step": 23286, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.7564889711853945, "grad_norm": 0.4512818455696106, "learning_rate": 1.5355240717799885e-06, "loss": 0.015770036727190018, "memory(GiB)": 22.66, "step": 23287, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956844 }, { "epoch": 0.75652145664815, "grad_norm": 0.7972001433372498, "learning_rate": 1.535136783922438e-06, "loss": 0.01981852576136589, "memory(GiB)": 22.66, "step": 23288, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.956853 }, { "epoch": 0.7565539421109053, "grad_norm": 0.4923538267612457, "learning_rate": 1.534749536054021e-06, "loss": 0.018724745139479637, "memory(GiB)": 22.66, "step": 23289, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.7565864275736608, "grad_norm": 0.28506872057914734, "learning_rate": 1.5343623281792063e-06, "loss": 0.010071463882923126, "memory(GiB)": 22.66, "step": 23290, "token_acc": 1.0, "train_speed(iter/s)": 0.956869 }, { "epoch": 0.7566189130364162, "grad_norm": 0.2809751033782959, "learning_rate": 1.5339751603024633e-06, "loss": 0.015067746862769127, "memory(GiB)": 22.66, "step": 23291, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.7566513984991716, "grad_norm": 0.4013276994228363, "learning_rate": 1.533588032428262e-06, "loss": 0.012048080563545227, "memory(GiB)": 22.66, "step": 23292, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956886 }, { "epoch": 0.756683883961927, "grad_norm": 0.32588914036750793, "learning_rate": 1.5332009445610668e-06, "loss": 0.011458384804427624, "memory(GiB)": 22.66, "step": 23293, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.7567163694246825, "grad_norm": 0.2319413721561432, "learning_rate": 1.532813896705347e-06, "loss": 0.006333936471492052, "memory(GiB)": 22.66, "step": 23294, "token_acc": 1.0, "train_speed(iter/s)": 0.9569 }, { "epoch": 0.7567488548874378, "grad_norm": 0.449687659740448, "learning_rate": 1.5324268888655697e-06, "loss": 0.019442245364189148, "memory(GiB)": 22.66, "step": 23295, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.7567813403501933, "grad_norm": 0.19812244176864624, "learning_rate": 1.5320399210462029e-06, "loss": 0.007681426126509905, "memory(GiB)": 22.66, "step": 23296, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.7568138258129488, "grad_norm": 0.4622877240180969, "learning_rate": 1.53165299325171e-06, "loss": 0.009352745488286018, "memory(GiB)": 22.66, "step": 23297, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.7568463112757041, "grad_norm": 0.35094696283340454, "learning_rate": 1.5312661054865574e-06, "loss": 0.016374699771404266, "memory(GiB)": 22.66, "step": 23298, "token_acc": 1.0, "train_speed(iter/s)": 0.956927 }, { "epoch": 0.7568787967384596, "grad_norm": 0.35092824697494507, "learning_rate": 1.5308792577552113e-06, "loss": 0.014236625283956528, "memory(GiB)": 22.66, "step": 23299, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.756911282201215, "grad_norm": 0.627970278263092, "learning_rate": 1.530492450062137e-06, "loss": 0.02048930898308754, "memory(GiB)": 22.66, "step": 23300, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.7569437676639704, "grad_norm": 0.5524571537971497, "learning_rate": 1.5301056824117965e-06, "loss": 0.014964225701987743, "memory(GiB)": 22.66, "step": 23301, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.7569762531267258, "grad_norm": 0.30818408727645874, "learning_rate": 1.5297189548086567e-06, "loss": 0.01040713768452406, "memory(GiB)": 22.66, "step": 23302, "token_acc": 1.0, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.7570087385894813, "grad_norm": 0.4495730996131897, "learning_rate": 1.5293322672571753e-06, "loss": 0.019720861688256264, "memory(GiB)": 22.66, "step": 23303, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.7570412240522366, "grad_norm": 0.3316798508167267, "learning_rate": 1.5289456197618224e-06, "loss": 0.013638874515891075, "memory(GiB)": 22.66, "step": 23304, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.7570737095149921, "grad_norm": 0.257771372795105, "learning_rate": 1.5285590123270556e-06, "loss": 0.011087595485150814, "memory(GiB)": 22.66, "step": 23305, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956973 }, { "epoch": 0.7571061949777474, "grad_norm": 0.20914405584335327, "learning_rate": 1.5281724449573397e-06, "loss": 0.010765564627945423, "memory(GiB)": 22.66, "step": 23306, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.7571386804405029, "grad_norm": 0.45079657435417175, "learning_rate": 1.527785917657133e-06, "loss": 0.012059834785759449, "memory(GiB)": 22.66, "step": 23307, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.7571711659032583, "grad_norm": 0.3359242081642151, "learning_rate": 1.527399430430897e-06, "loss": 0.016588937491178513, "memory(GiB)": 22.66, "step": 23308, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.7572036513660138, "grad_norm": 0.4732559621334076, "learning_rate": 1.5270129832830972e-06, "loss": 0.01916131004691124, "memory(GiB)": 22.66, "step": 23309, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.7572361368287691, "grad_norm": 1.0620368719100952, "learning_rate": 1.5266265762181876e-06, "loss": 0.026913940906524658, "memory(GiB)": 22.66, "step": 23310, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.7572686222915246, "grad_norm": 0.4376266896724701, "learning_rate": 1.5262402092406325e-06, "loss": 0.021418055519461632, "memory(GiB)": 22.66, "step": 23311, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.75730110775428, "grad_norm": 0.3261433243751526, "learning_rate": 1.5258538823548874e-06, "loss": 0.01395975612103939, "memory(GiB)": 22.66, "step": 23312, "token_acc": 1.0, "train_speed(iter/s)": 0.957018 }, { "epoch": 0.7573335932170354, "grad_norm": 0.3207269012928009, "learning_rate": 1.5254675955654124e-06, "loss": 0.01401712466031313, "memory(GiB)": 22.66, "step": 23313, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.7573660786797908, "grad_norm": 0.337582528591156, "learning_rate": 1.5250813488766659e-06, "loss": 0.006498898845165968, "memory(GiB)": 22.66, "step": 23314, "token_acc": 0.995, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.7573985641425462, "grad_norm": 0.46888646483421326, "learning_rate": 1.5246951422931072e-06, "loss": 0.01295142900198698, "memory(GiB)": 22.66, "step": 23315, "token_acc": 0.992, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.7574310496053016, "grad_norm": 0.22752715647220612, "learning_rate": 1.524308975819191e-06, "loss": 0.00868312455713749, "memory(GiB)": 22.66, "step": 23316, "token_acc": 1.0, "train_speed(iter/s)": 0.957043 }, { "epoch": 0.7574635350680571, "grad_norm": 0.34115052223205566, "learning_rate": 1.5239228494593749e-06, "loss": 0.011926614679396152, "memory(GiB)": 22.66, "step": 23317, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95705 }, { "epoch": 0.7574960205308124, "grad_norm": 0.2976636588573456, "learning_rate": 1.5235367632181164e-06, "loss": 0.014633064158260822, "memory(GiB)": 22.66, "step": 23318, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.7575285059935679, "grad_norm": 0.38365525007247925, "learning_rate": 1.5231507170998717e-06, "loss": 0.014864401891827583, "memory(GiB)": 22.66, "step": 23319, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.7575609914563233, "grad_norm": 0.3539014160633087, "learning_rate": 1.5227647111090943e-06, "loss": 0.013911550864577293, "memory(GiB)": 22.66, "step": 23320, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95707 }, { "epoch": 0.7575934769190787, "grad_norm": 0.2975068986415863, "learning_rate": 1.5223787452502403e-06, "loss": 0.013038571923971176, "memory(GiB)": 22.66, "step": 23321, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.7576259623818341, "grad_norm": 0.29371166229248047, "learning_rate": 1.5219928195277639e-06, "loss": 0.014440802857279778, "memory(GiB)": 22.66, "step": 23322, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.7576584478445896, "grad_norm": 0.3839326500892639, "learning_rate": 1.5216069339461197e-06, "loss": 0.015334399417042732, "memory(GiB)": 22.66, "step": 23323, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957089 }, { "epoch": 0.7576909333073449, "grad_norm": 0.7567896842956543, "learning_rate": 1.5212210885097633e-06, "loss": 0.018389670178294182, "memory(GiB)": 22.66, "step": 23324, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.7577234187701004, "grad_norm": 0.2580970227718353, "learning_rate": 1.5208352832231437e-06, "loss": 0.007736172992736101, "memory(GiB)": 22.66, "step": 23325, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957103 }, { "epoch": 0.7577559042328558, "grad_norm": 0.40776917338371277, "learning_rate": 1.5204495180907159e-06, "loss": 0.01667761616408825, "memory(GiB)": 22.66, "step": 23326, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.7577883896956112, "grad_norm": 0.24021172523498535, "learning_rate": 1.5200637931169315e-06, "loss": 0.00573162455111742, "memory(GiB)": 22.66, "step": 23327, "token_acc": 1.0, "train_speed(iter/s)": 0.957116 }, { "epoch": 0.7578208751583666, "grad_norm": 0.289976567029953, "learning_rate": 1.519678108306245e-06, "loss": 0.012673887424170971, "memory(GiB)": 22.66, "step": 23328, "token_acc": 1.0, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.7578533606211221, "grad_norm": 0.3412294089794159, "learning_rate": 1.5192924636631034e-06, "loss": 0.017512869089841843, "memory(GiB)": 22.66, "step": 23329, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.7578858460838774, "grad_norm": 0.4088689684867859, "learning_rate": 1.518906859191961e-06, "loss": 0.012694856151938438, "memory(GiB)": 22.66, "step": 23330, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.7579183315466329, "grad_norm": 0.4086949825286865, "learning_rate": 1.5185212948972638e-06, "loss": 0.00965177919715643, "memory(GiB)": 22.66, "step": 23331, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.7579508170093883, "grad_norm": 0.24656732380390167, "learning_rate": 1.5181357707834683e-06, "loss": 0.011788949370384216, "memory(GiB)": 22.66, "step": 23332, "token_acc": 1.0, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.7579833024721437, "grad_norm": 0.38924089074134827, "learning_rate": 1.5177502868550181e-06, "loss": 0.007971236482262611, "memory(GiB)": 22.66, "step": 23333, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.7580157879348991, "grad_norm": 0.3064572215080261, "learning_rate": 1.5173648431163661e-06, "loss": 0.010448262095451355, "memory(GiB)": 22.66, "step": 23334, "token_acc": 1.0, "train_speed(iter/s)": 0.95717 }, { "epoch": 0.7580482733976546, "grad_norm": 0.20047399401664734, "learning_rate": 1.5169794395719578e-06, "loss": 0.007440477609634399, "memory(GiB)": 22.66, "step": 23335, "token_acc": 1.0, "train_speed(iter/s)": 0.957179 }, { "epoch": 0.7580807588604099, "grad_norm": 0.24565458297729492, "learning_rate": 1.5165940762262433e-06, "loss": 0.009179215878248215, "memory(GiB)": 22.66, "step": 23336, "token_acc": 1.0, "train_speed(iter/s)": 0.957187 }, { "epoch": 0.7581132443231654, "grad_norm": 0.20483021438121796, "learning_rate": 1.5162087530836683e-06, "loss": 0.009431000798940659, "memory(GiB)": 22.66, "step": 23337, "token_acc": 1.0, "train_speed(iter/s)": 0.957196 }, { "epoch": 0.7581457297859208, "grad_norm": 0.3772237300872803, "learning_rate": 1.5158234701486824e-06, "loss": 0.012736989185214043, "memory(GiB)": 22.66, "step": 23338, "token_acc": 1.0, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.7581782152486762, "grad_norm": 0.28831571340560913, "learning_rate": 1.5154382274257317e-06, "loss": 0.010989300906658173, "memory(GiB)": 22.66, "step": 23339, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.7582107007114316, "grad_norm": 0.41551458835601807, "learning_rate": 1.5150530249192607e-06, "loss": 0.026749014854431152, "memory(GiB)": 22.66, "step": 23340, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.7582431861741871, "grad_norm": 0.34296056628227234, "learning_rate": 1.5146678626337164e-06, "loss": 0.01669682189822197, "memory(GiB)": 22.66, "step": 23341, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957232 }, { "epoch": 0.7582756716369424, "grad_norm": 0.4000032842159271, "learning_rate": 1.5142827405735433e-06, "loss": 0.013676169328391552, "memory(GiB)": 22.66, "step": 23342, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.95724 }, { "epoch": 0.7583081570996979, "grad_norm": 3.860369920730591, "learning_rate": 1.5138976587431892e-06, "loss": 0.015973227098584175, "memory(GiB)": 22.66, "step": 23343, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.7583406425624533, "grad_norm": 0.3859117329120636, "learning_rate": 1.5135126171470938e-06, "loss": 0.009081770665943623, "memory(GiB)": 22.66, "step": 23344, "token_acc": 1.0, "train_speed(iter/s)": 0.957258 }, { "epoch": 0.7583731280252087, "grad_norm": 0.46019911766052246, "learning_rate": 1.513127615789704e-06, "loss": 0.01843670755624771, "memory(GiB)": 22.66, "step": 23345, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.7584056134879641, "grad_norm": 0.27147552371025085, "learning_rate": 1.512742654675462e-06, "loss": 0.009881668724119663, "memory(GiB)": 22.66, "step": 23346, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.7584380989507196, "grad_norm": 0.2079589068889618, "learning_rate": 1.512357733808813e-06, "loss": 0.010230205953121185, "memory(GiB)": 22.66, "step": 23347, "token_acc": 1.0, "train_speed(iter/s)": 0.957283 }, { "epoch": 0.7584705844134749, "grad_norm": 0.34380149841308594, "learning_rate": 1.5119728531941963e-06, "loss": 0.012200267985463142, "memory(GiB)": 22.66, "step": 23348, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.7585030698762304, "grad_norm": 0.2803938090801239, "learning_rate": 1.5115880128360556e-06, "loss": 0.009863912127912045, "memory(GiB)": 22.66, "step": 23349, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.9573 }, { "epoch": 0.7585355553389858, "grad_norm": 0.3837586045265198, "learning_rate": 1.5112032127388321e-06, "loss": 0.01664770022034645, "memory(GiB)": 22.66, "step": 23350, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.7585680408017412, "grad_norm": 0.3253253698348999, "learning_rate": 1.5108184529069686e-06, "loss": 0.01355900801718235, "memory(GiB)": 22.66, "step": 23351, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.957317 }, { "epoch": 0.7586005262644966, "grad_norm": 0.31277987360954285, "learning_rate": 1.5104337333449025e-06, "loss": 0.00961914099752903, "memory(GiB)": 22.66, "step": 23352, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.7586330117272521, "grad_norm": 0.4629778265953064, "learning_rate": 1.510049054057076e-06, "loss": 0.011094924062490463, "memory(GiB)": 22.66, "step": 23353, "token_acc": 1.0, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.7586654971900074, "grad_norm": 0.32162174582481384, "learning_rate": 1.5096644150479288e-06, "loss": 0.012479949742555618, "memory(GiB)": 22.66, "step": 23354, "token_acc": 1.0, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.7586979826527629, "grad_norm": 0.47148823738098145, "learning_rate": 1.5092798163218998e-06, "loss": 0.013814980164170265, "memory(GiB)": 22.66, "step": 23355, "token_acc": 1.0, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.7587304681155183, "grad_norm": 0.35908472537994385, "learning_rate": 1.5088952578834288e-06, "loss": 0.01586238667368889, "memory(GiB)": 22.66, "step": 23356, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.7587629535782737, "grad_norm": 0.23539850115776062, "learning_rate": 1.508510739736952e-06, "loss": 0.007914232090115547, "memory(GiB)": 22.66, "step": 23357, "token_acc": 1.0, "train_speed(iter/s)": 0.957363 }, { "epoch": 0.7587954390410291, "grad_norm": 0.2850237190723419, "learning_rate": 1.5081262618869103e-06, "loss": 0.008003685623407364, "memory(GiB)": 22.66, "step": 23358, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957369 }, { "epoch": 0.7588279245037846, "grad_norm": 0.28921055793762207, "learning_rate": 1.5077418243377356e-06, "loss": 0.011008759960532188, "memory(GiB)": 22.66, "step": 23359, "token_acc": 1.0, "train_speed(iter/s)": 0.957376 }, { "epoch": 0.75886040996654, "grad_norm": 0.4425200819969177, "learning_rate": 1.5073574270938718e-06, "loss": 0.0191134475171566, "memory(GiB)": 22.66, "step": 23360, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957382 }, { "epoch": 0.7588928954292954, "grad_norm": 0.35040563344955444, "learning_rate": 1.5069730701597507e-06, "loss": 0.014509983360767365, "memory(GiB)": 22.66, "step": 23361, "token_acc": 1.0, "train_speed(iter/s)": 0.957389 }, { "epoch": 0.7589253808920509, "grad_norm": 0.3643760681152344, "learning_rate": 1.5065887535398106e-06, "loss": 0.01967913657426834, "memory(GiB)": 22.66, "step": 23362, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.7589578663548062, "grad_norm": 0.3516072630882263, "learning_rate": 1.5062044772384848e-06, "loss": 0.01621362939476967, "memory(GiB)": 22.66, "step": 23363, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.7589903518175617, "grad_norm": 0.43103134632110596, "learning_rate": 1.505820241260209e-06, "loss": 0.01306544616818428, "memory(GiB)": 22.66, "step": 23364, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.759022837280317, "grad_norm": 0.39283236861228943, "learning_rate": 1.505436045609419e-06, "loss": 0.020014241337776184, "memory(GiB)": 22.66, "step": 23365, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.7590553227430725, "grad_norm": 0.4188099503517151, "learning_rate": 1.5050518902905498e-06, "loss": 0.013825824484229088, "memory(GiB)": 22.66, "step": 23366, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.7590878082058279, "grad_norm": 0.18129578232765198, "learning_rate": 1.5046677753080319e-06, "loss": 0.007437887601554394, "memory(GiB)": 22.66, "step": 23367, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.7591202936685834, "grad_norm": 0.2869243323802948, "learning_rate": 1.5042837006662997e-06, "loss": 0.011551951989531517, "memory(GiB)": 22.66, "step": 23368, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957435 }, { "epoch": 0.7591527791313387, "grad_norm": 0.4824924170970917, "learning_rate": 1.5038996663697868e-06, "loss": 0.019759472459554672, "memory(GiB)": 22.66, "step": 23369, "token_acc": 1.0, "train_speed(iter/s)": 0.957442 }, { "epoch": 0.7591852645940942, "grad_norm": 0.2555679976940155, "learning_rate": 1.503515672422925e-06, "loss": 0.016092121601104736, "memory(GiB)": 22.66, "step": 23370, "token_acc": 0.9781420765027322, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.7592177500568495, "grad_norm": 0.3854980170726776, "learning_rate": 1.5031317188301475e-06, "loss": 0.006484583951532841, "memory(GiB)": 22.66, "step": 23371, "token_acc": 1.0, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.759250235519605, "grad_norm": 0.3991769850254059, "learning_rate": 1.5027478055958833e-06, "loss": 0.01706753671169281, "memory(GiB)": 22.66, "step": 23372, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.7592827209823604, "grad_norm": 0.3618876039981842, "learning_rate": 1.5023639327245637e-06, "loss": 0.014590241014957428, "memory(GiB)": 22.66, "step": 23373, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957465 }, { "epoch": 0.7593152064451159, "grad_norm": 0.2435118854045868, "learning_rate": 1.5019801002206197e-06, "loss": 0.007747964933514595, "memory(GiB)": 22.66, "step": 23374, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.7593476919078712, "grad_norm": 0.3509805500507355, "learning_rate": 1.5015963080884833e-06, "loss": 0.012403560802340508, "memory(GiB)": 22.66, "step": 23375, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.7593801773706267, "grad_norm": 0.32560697197914124, "learning_rate": 1.5012125563325796e-06, "loss": 0.017033282667398453, "memory(GiB)": 22.66, "step": 23376, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.759412662833382, "grad_norm": 0.21886157989501953, "learning_rate": 1.5008288449573405e-06, "loss": 0.0074175582267344, "memory(GiB)": 22.66, "step": 23377, "token_acc": 1.0, "train_speed(iter/s)": 0.957489 }, { "epoch": 0.7594451482961375, "grad_norm": 0.3000437021255493, "learning_rate": 1.5004451739671932e-06, "loss": 0.017210515215992928, "memory(GiB)": 22.66, "step": 23378, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.7594776337588929, "grad_norm": 0.26632192730903625, "learning_rate": 1.5000615433665687e-06, "loss": 0.01124332845211029, "memory(GiB)": 22.66, "step": 23379, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957502 }, { "epoch": 0.7595101192216483, "grad_norm": 0.3571249544620514, "learning_rate": 1.4996779531598904e-06, "loss": 0.010241830721497536, "memory(GiB)": 22.66, "step": 23380, "token_acc": 1.0, "train_speed(iter/s)": 0.957508 }, { "epoch": 0.7595426046844037, "grad_norm": 0.3966715335845947, "learning_rate": 1.4992944033515893e-06, "loss": 0.02012878656387329, "memory(GiB)": 22.66, "step": 23381, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.7595750901471592, "grad_norm": 0.3297712802886963, "learning_rate": 1.4989108939460873e-06, "loss": 0.015938416123390198, "memory(GiB)": 22.66, "step": 23382, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.7596075756099145, "grad_norm": 0.4843708872795105, "learning_rate": 1.498527424947816e-06, "loss": 0.009536182507872581, "memory(GiB)": 22.66, "step": 23383, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957528 }, { "epoch": 0.75964006107267, "grad_norm": 0.36408841609954834, "learning_rate": 1.4981439963611978e-06, "loss": 0.016553647816181183, "memory(GiB)": 22.66, "step": 23384, "token_acc": 0.978448275862069, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.7596725465354254, "grad_norm": 0.4617048501968384, "learning_rate": 1.4977606081906581e-06, "loss": 0.01911747455596924, "memory(GiB)": 22.66, "step": 23385, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.7597050319981808, "grad_norm": 0.3407365381717682, "learning_rate": 1.4973772604406246e-06, "loss": 0.018498055636882782, "memory(GiB)": 22.66, "step": 23386, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.7597375174609362, "grad_norm": 0.38867974281311035, "learning_rate": 1.4969939531155158e-06, "loss": 0.010514270514249802, "memory(GiB)": 22.66, "step": 23387, "token_acc": 1.0, "train_speed(iter/s)": 0.957555 }, { "epoch": 0.7597700029236917, "grad_norm": 0.2554541826248169, "learning_rate": 1.4966106862197633e-06, "loss": 0.01445281133055687, "memory(GiB)": 22.66, "step": 23388, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957562 }, { "epoch": 0.759802488386447, "grad_norm": 0.30459344387054443, "learning_rate": 1.4962274597577842e-06, "loss": 0.006989555433392525, "memory(GiB)": 22.66, "step": 23389, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957569 }, { "epoch": 0.7598349738492025, "grad_norm": 0.36108314990997314, "learning_rate": 1.4958442737340056e-06, "loss": 0.012850742787122726, "memory(GiB)": 22.66, "step": 23390, "token_acc": 1.0, "train_speed(iter/s)": 0.957577 }, { "epoch": 0.7598674593119579, "grad_norm": 0.4348766803741455, "learning_rate": 1.4954611281528464e-06, "loss": 0.016278523951768875, "memory(GiB)": 22.66, "step": 23391, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957586 }, { "epoch": 0.7598999447747133, "grad_norm": 0.3549823760986328, "learning_rate": 1.4950780230187301e-06, "loss": 0.014510704204440117, "memory(GiB)": 22.66, "step": 23392, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.7599324302374687, "grad_norm": 0.27453550696372986, "learning_rate": 1.4946949583360787e-06, "loss": 0.01446556393057108, "memory(GiB)": 22.66, "step": 23393, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.7599649157002242, "grad_norm": 0.31903374195098877, "learning_rate": 1.4943119341093148e-06, "loss": 0.012544073164463043, "memory(GiB)": 22.66, "step": 23394, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957612 }, { "epoch": 0.7599974011629795, "grad_norm": 0.25773581862449646, "learning_rate": 1.493928950342855e-06, "loss": 0.011399403214454651, "memory(GiB)": 22.66, "step": 23395, "token_acc": 1.0, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.760029886625735, "grad_norm": 0.28135406970977783, "learning_rate": 1.4935460070411223e-06, "loss": 0.011576415970921516, "memory(GiB)": 22.66, "step": 23396, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957629 }, { "epoch": 0.7600623720884904, "grad_norm": 0.41931673884391785, "learning_rate": 1.4931631042085359e-06, "loss": 0.012827960774302483, "memory(GiB)": 22.66, "step": 23397, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.7600948575512458, "grad_norm": 0.24509452283382416, "learning_rate": 1.4927802418495157e-06, "loss": 0.01333116739988327, "memory(GiB)": 22.66, "step": 23398, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.957646 }, { "epoch": 0.7601273430140012, "grad_norm": 0.4550441801548004, "learning_rate": 1.4923974199684788e-06, "loss": 0.017909258604049683, "memory(GiB)": 22.66, "step": 23399, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957655 }, { "epoch": 0.7601598284767567, "grad_norm": 0.3619217276573181, "learning_rate": 1.4920146385698436e-06, "loss": 0.010491602122783661, "memory(GiB)": 22.66, "step": 23400, "token_acc": 1.0, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.760192313939512, "grad_norm": 0.41962406039237976, "learning_rate": 1.491631897658029e-06, "loss": 0.015940750017762184, "memory(GiB)": 22.66, "step": 23401, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.7602247994022675, "grad_norm": 0.41741546988487244, "learning_rate": 1.491249197237452e-06, "loss": 0.011670975014567375, "memory(GiB)": 22.66, "step": 23402, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.7602572848650229, "grad_norm": 0.4904303252696991, "learning_rate": 1.4908665373125309e-06, "loss": 0.0191909521818161, "memory(GiB)": 22.66, "step": 23403, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.7602897703277783, "grad_norm": 0.27930423617362976, "learning_rate": 1.490483917887679e-06, "loss": 0.007337925955653191, "memory(GiB)": 22.66, "step": 23404, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.7603222557905337, "grad_norm": 0.43674540519714355, "learning_rate": 1.4901013389673137e-06, "loss": 0.012237047776579857, "memory(GiB)": 22.66, "step": 23405, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.7603547412532892, "grad_norm": 0.4410370886325836, "learning_rate": 1.4897188005558505e-06, "loss": 0.01671599969267845, "memory(GiB)": 22.66, "step": 23406, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957715 }, { "epoch": 0.7603872267160445, "grad_norm": 0.2896430790424347, "learning_rate": 1.4893363026577067e-06, "loss": 0.012942858040332794, "memory(GiB)": 22.66, "step": 23407, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.957723 }, { "epoch": 0.7604197121788, "grad_norm": 0.48155006766319275, "learning_rate": 1.4889538452772924e-06, "loss": 0.012046486139297485, "memory(GiB)": 22.66, "step": 23408, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.7604521976415554, "grad_norm": 0.3881872594356537, "learning_rate": 1.488571428419026e-06, "loss": 0.012864449992775917, "memory(GiB)": 22.66, "step": 23409, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.7604846831043108, "grad_norm": 0.5342739820480347, "learning_rate": 1.4881890520873154e-06, "loss": 0.025513075292110443, "memory(GiB)": 22.66, "step": 23410, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.7605171685670662, "grad_norm": 0.5071596503257751, "learning_rate": 1.487806716286581e-06, "loss": 0.013027534820139408, "memory(GiB)": 22.66, "step": 23411, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.7605496540298217, "grad_norm": 0.3116369843482971, "learning_rate": 1.4874244210212297e-06, "loss": 0.012137580662965775, "memory(GiB)": 22.66, "step": 23412, "token_acc": 1.0, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.760582139492577, "grad_norm": 0.28974369168281555, "learning_rate": 1.487042166295678e-06, "loss": 0.011440558359026909, "memory(GiB)": 22.66, "step": 23413, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957776 }, { "epoch": 0.7606146249553325, "grad_norm": 0.5504674315452576, "learning_rate": 1.4866599521143332e-06, "loss": 0.015791691839694977, "memory(GiB)": 22.66, "step": 23414, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957785 }, { "epoch": 0.7606471104180879, "grad_norm": 0.5695169568061829, "learning_rate": 1.4862777784816078e-06, "loss": 0.0290218498557806, "memory(GiB)": 22.66, "step": 23415, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.7606795958808433, "grad_norm": 0.5136918425559998, "learning_rate": 1.4858956454019163e-06, "loss": 0.014927748590707779, "memory(GiB)": 22.66, "step": 23416, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.7607120813435987, "grad_norm": 0.3723312318325043, "learning_rate": 1.485513552879665e-06, "loss": 0.015154337510466576, "memory(GiB)": 22.66, "step": 23417, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957808 }, { "epoch": 0.7607445668063542, "grad_norm": 0.4269273281097412, "learning_rate": 1.4851315009192668e-06, "loss": 0.016423482447862625, "memory(GiB)": 22.66, "step": 23418, "token_acc": 0.980544747081712, "train_speed(iter/s)": 0.957815 }, { "epoch": 0.7607770522691095, "grad_norm": 0.4565664231777191, "learning_rate": 1.4847494895251274e-06, "loss": 0.017163675278425217, "memory(GiB)": 22.66, "step": 23419, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.760809537731865, "grad_norm": 0.3790656328201294, "learning_rate": 1.4843675187016576e-06, "loss": 0.014839457347989082, "memory(GiB)": 22.66, "step": 23420, "token_acc": 1.0, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.7608420231946204, "grad_norm": 0.378659725189209, "learning_rate": 1.483985588453266e-06, "loss": 0.016588501632213593, "memory(GiB)": 22.66, "step": 23421, "token_acc": 1.0, "train_speed(iter/s)": 0.957834 }, { "epoch": 0.7608745086573758, "grad_norm": 0.24530814588069916, "learning_rate": 1.4836036987843628e-06, "loss": 0.009601086378097534, "memory(GiB)": 22.66, "step": 23422, "token_acc": 1.0, "train_speed(iter/s)": 0.957842 }, { "epoch": 0.7609069941201312, "grad_norm": 0.381289005279541, "learning_rate": 1.483221849699351e-06, "loss": 0.018537523224949837, "memory(GiB)": 22.66, "step": 23423, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957849 }, { "epoch": 0.7609394795828867, "grad_norm": 0.21986138820648193, "learning_rate": 1.48284004120264e-06, "loss": 0.006777230650186539, "memory(GiB)": 22.66, "step": 23424, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.7609719650456421, "grad_norm": 0.296244740486145, "learning_rate": 1.4824582732986358e-06, "loss": 0.009420417249202728, "memory(GiB)": 22.66, "step": 23425, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.7610044505083975, "grad_norm": 0.2854650318622589, "learning_rate": 1.4820765459917474e-06, "loss": 0.012641312554478645, "memory(GiB)": 22.66, "step": 23426, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.761036935971153, "grad_norm": 0.29602208733558655, "learning_rate": 1.4816948592863761e-06, "loss": 0.012518828734755516, "memory(GiB)": 22.66, "step": 23427, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.7610694214339083, "grad_norm": 0.27346259355545044, "learning_rate": 1.4813132131869284e-06, "loss": 0.008139455690979958, "memory(GiB)": 22.66, "step": 23428, "token_acc": 1.0, "train_speed(iter/s)": 0.957883 }, { "epoch": 0.7611019068966638, "grad_norm": 0.3844848871231079, "learning_rate": 1.4809316076978104e-06, "loss": 0.012464674189686775, "memory(GiB)": 22.66, "step": 23429, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95789 }, { "epoch": 0.7611343923594192, "grad_norm": 0.6090598702430725, "learning_rate": 1.4805500428234254e-06, "loss": 0.02113993465900421, "memory(GiB)": 22.66, "step": 23430, "token_acc": 1.0, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.7611668778221746, "grad_norm": 0.40928685665130615, "learning_rate": 1.4801685185681787e-06, "loss": 0.014184683561325073, "memory(GiB)": 22.66, "step": 23431, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.76119936328493, "grad_norm": 0.3129728436470032, "learning_rate": 1.47978703493647e-06, "loss": 0.010153228417038918, "memory(GiB)": 22.66, "step": 23432, "token_acc": 1.0, "train_speed(iter/s)": 0.957908 }, { "epoch": 0.7612318487476855, "grad_norm": 0.2750972509384155, "learning_rate": 1.4794055919327049e-06, "loss": 0.013415677472949028, "memory(GiB)": 22.66, "step": 23433, "token_acc": 1.0, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.7612643342104408, "grad_norm": 0.24916087090969086, "learning_rate": 1.4790241895612857e-06, "loss": 0.011151023209095001, "memory(GiB)": 22.66, "step": 23434, "token_acc": 1.0, "train_speed(iter/s)": 0.957919 }, { "epoch": 0.7612968196731963, "grad_norm": 0.4366631507873535, "learning_rate": 1.4786428278266145e-06, "loss": 0.01261810027062893, "memory(GiB)": 22.66, "step": 23435, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957925 }, { "epoch": 0.7613293051359517, "grad_norm": 0.34901461005210876, "learning_rate": 1.4782615067330907e-06, "loss": 0.0139749925583601, "memory(GiB)": 22.66, "step": 23436, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.7613617905987071, "grad_norm": 0.2960200309753418, "learning_rate": 1.4778802262851178e-06, "loss": 0.013129610568284988, "memory(GiB)": 22.66, "step": 23437, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.7613942760614625, "grad_norm": 0.24781090021133423, "learning_rate": 1.477498986487092e-06, "loss": 0.008669326081871986, "memory(GiB)": 22.66, "step": 23438, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957942 }, { "epoch": 0.761426761524218, "grad_norm": 0.3344006836414337, "learning_rate": 1.47711778734342e-06, "loss": 0.013683468103408813, "memory(GiB)": 22.66, "step": 23439, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.7614592469869733, "grad_norm": 0.2421451061964035, "learning_rate": 1.4767366288584955e-06, "loss": 0.009680017828941345, "memory(GiB)": 22.66, "step": 23440, "token_acc": 1.0, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.7614917324497288, "grad_norm": 0.34921279549598694, "learning_rate": 1.4763555110367217e-06, "loss": 0.01552769634872675, "memory(GiB)": 22.66, "step": 23441, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957961 }, { "epoch": 0.7615242179124841, "grad_norm": 0.4071975350379944, "learning_rate": 1.4759744338824938e-06, "loss": 0.014628803357481956, "memory(GiB)": 22.66, "step": 23442, "token_acc": 0.99, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.7615567033752396, "grad_norm": 0.3706279695034027, "learning_rate": 1.4755933974002108e-06, "loss": 0.015530385076999664, "memory(GiB)": 22.66, "step": 23443, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.761589188837995, "grad_norm": 0.34424006938934326, "learning_rate": 1.4752124015942714e-06, "loss": 0.01528701838105917, "memory(GiB)": 22.66, "step": 23444, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957982 }, { "epoch": 0.7616216743007505, "grad_norm": 0.2344922572374344, "learning_rate": 1.474831446469074e-06, "loss": 0.00804205797612667, "memory(GiB)": 22.66, "step": 23445, "token_acc": 1.0, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.7616541597635058, "grad_norm": 0.344799667596817, "learning_rate": 1.474450532029012e-06, "loss": 0.011707894504070282, "memory(GiB)": 22.66, "step": 23446, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957995 }, { "epoch": 0.7616866452262613, "grad_norm": 0.44798535108566284, "learning_rate": 1.474069658278483e-06, "loss": 0.010611982084810734, "memory(GiB)": 22.66, "step": 23447, "token_acc": 1.0, "train_speed(iter/s)": 0.958002 }, { "epoch": 0.7617191306890166, "grad_norm": 0.35909709334373474, "learning_rate": 1.4736888252218839e-06, "loss": 0.01299001183360815, "memory(GiB)": 22.66, "step": 23448, "token_acc": 1.0, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.7617516161517721, "grad_norm": 0.44008657336235046, "learning_rate": 1.4733080328636084e-06, "loss": 0.0121631920337677, "memory(GiB)": 22.66, "step": 23449, "token_acc": 1.0, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.7617841016145275, "grad_norm": 0.3048669397830963, "learning_rate": 1.4729272812080547e-06, "loss": 0.009941162541508675, "memory(GiB)": 22.66, "step": 23450, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.761816587077283, "grad_norm": 0.3053102493286133, "learning_rate": 1.4725465702596125e-06, "loss": 0.015464892610907555, "memory(GiB)": 22.66, "step": 23451, "token_acc": 1.0, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.7618490725400383, "grad_norm": 0.45588117837905884, "learning_rate": 1.4721659000226779e-06, "loss": 0.015696225687861443, "memory(GiB)": 22.66, "step": 23452, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.7618815580027938, "grad_norm": 0.35222476720809937, "learning_rate": 1.471785270501645e-06, "loss": 0.01221809908747673, "memory(GiB)": 22.66, "step": 23453, "token_acc": 1.0, "train_speed(iter/s)": 0.958049 }, { "epoch": 0.7619140434655491, "grad_norm": 0.2868054509162903, "learning_rate": 1.471404681700907e-06, "loss": 0.01130430493503809, "memory(GiB)": 22.66, "step": 23454, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.7619465289283046, "grad_norm": 0.19834037125110626, "learning_rate": 1.4710241336248543e-06, "loss": 0.008560387417674065, "memory(GiB)": 22.66, "step": 23455, "token_acc": 0.9942196531791907, "train_speed(iter/s)": 0.958066 }, { "epoch": 0.76197901439106, "grad_norm": 0.34839820861816406, "learning_rate": 1.47064362627788e-06, "loss": 0.014058006927371025, "memory(GiB)": 22.66, "step": 23456, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.7620114998538154, "grad_norm": 0.3239066004753113, "learning_rate": 1.4702631596643758e-06, "loss": 0.009523781016469002, "memory(GiB)": 22.66, "step": 23457, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958084 }, { "epoch": 0.7620439853165708, "grad_norm": 0.21577759087085724, "learning_rate": 1.4698827337887345e-06, "loss": 0.008404243737459183, "memory(GiB)": 22.66, "step": 23458, "token_acc": 1.0, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.7620764707793263, "grad_norm": 0.40079209208488464, "learning_rate": 1.4695023486553434e-06, "loss": 0.012070158496499062, "memory(GiB)": 22.66, "step": 23459, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.7621089562420816, "grad_norm": 0.5963897705078125, "learning_rate": 1.4691220042685955e-06, "loss": 0.017795512452721596, "memory(GiB)": 22.66, "step": 23460, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95811 }, { "epoch": 0.7621414417048371, "grad_norm": 0.4217776358127594, "learning_rate": 1.4687417006328768e-06, "loss": 0.016391925513744354, "memory(GiB)": 22.66, "step": 23461, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.958119 }, { "epoch": 0.7621739271675925, "grad_norm": 0.43137261271476746, "learning_rate": 1.46836143775258e-06, "loss": 0.01698892191052437, "memory(GiB)": 22.66, "step": 23462, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.7622064126303479, "grad_norm": 0.3187200725078583, "learning_rate": 1.4679812156320944e-06, "loss": 0.011356419883668423, "memory(GiB)": 22.66, "step": 23463, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.7622388980931033, "grad_norm": 0.2930602729320526, "learning_rate": 1.4676010342758057e-06, "loss": 0.00983049999922514, "memory(GiB)": 22.66, "step": 23464, "token_acc": 1.0, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.7622713835558588, "grad_norm": 0.42708829045295715, "learning_rate": 1.467220893688104e-06, "loss": 0.012968692928552628, "memory(GiB)": 22.66, "step": 23465, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.958154 }, { "epoch": 0.7623038690186141, "grad_norm": 0.3410536050796509, "learning_rate": 1.4668407938733714e-06, "loss": 0.010743427090346813, "memory(GiB)": 22.66, "step": 23466, "token_acc": 1.0, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.7623363544813696, "grad_norm": 0.5410532355308533, "learning_rate": 1.4664607348360028e-06, "loss": 0.01574522629380226, "memory(GiB)": 22.66, "step": 23467, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.762368839944125, "grad_norm": 0.2470136284828186, "learning_rate": 1.466080716580378e-06, "loss": 0.010226806625723839, "memory(GiB)": 22.66, "step": 23468, "token_acc": 1.0, "train_speed(iter/s)": 0.95818 }, { "epoch": 0.7624013254068804, "grad_norm": 0.2715297341346741, "learning_rate": 1.4657007391108874e-06, "loss": 0.006871944293379784, "memory(GiB)": 22.66, "step": 23469, "token_acc": 1.0, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.7624338108696358, "grad_norm": 0.44748178124427795, "learning_rate": 1.4653208024319132e-06, "loss": 0.010220920667052269, "memory(GiB)": 22.66, "step": 23470, "token_acc": 1.0, "train_speed(iter/s)": 0.958199 }, { "epoch": 0.7624662963323913, "grad_norm": 0.22869355976581573, "learning_rate": 1.4649409065478409e-06, "loss": 0.006702266167849302, "memory(GiB)": 22.66, "step": 23471, "token_acc": 1.0, "train_speed(iter/s)": 0.958207 }, { "epoch": 0.7624987817951466, "grad_norm": 0.32902824878692627, "learning_rate": 1.4645610514630566e-06, "loss": 0.011149044148623943, "memory(GiB)": 22.66, "step": 23472, "token_acc": 1.0, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.7625312672579021, "grad_norm": 0.34505921602249146, "learning_rate": 1.464181237181944e-06, "loss": 0.007947590202093124, "memory(GiB)": 22.66, "step": 23473, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.7625637527206575, "grad_norm": 0.2981601655483246, "learning_rate": 1.4638014637088848e-06, "loss": 0.01303538866341114, "memory(GiB)": 22.66, "step": 23474, "token_acc": 1.0, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.7625962381834129, "grad_norm": 0.4467943012714386, "learning_rate": 1.463421731048264e-06, "loss": 0.01663580909371376, "memory(GiB)": 22.66, "step": 23475, "token_acc": 1.0, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.7626287236461683, "grad_norm": 0.4848681390285492, "learning_rate": 1.463042039204463e-06, "loss": 0.018494170159101486, "memory(GiB)": 22.66, "step": 23476, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958249 }, { "epoch": 0.7626612091089238, "grad_norm": 0.582247257232666, "learning_rate": 1.4626623881818647e-06, "loss": 0.019525937736034393, "memory(GiB)": 22.66, "step": 23477, "token_acc": 1.0, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.7626936945716791, "grad_norm": 0.49215224385261536, "learning_rate": 1.462282777984852e-06, "loss": 0.017958056181669235, "memory(GiB)": 22.66, "step": 23478, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.7627261800344346, "grad_norm": 0.42457857728004456, "learning_rate": 1.461903208617803e-06, "loss": 0.016272950917482376, "memory(GiB)": 22.66, "step": 23479, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.76275866549719, "grad_norm": 0.2396271824836731, "learning_rate": 1.4615236800851008e-06, "loss": 0.013347753323614597, "memory(GiB)": 22.66, "step": 23480, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.7627911509599454, "grad_norm": 0.33329057693481445, "learning_rate": 1.4611441923911245e-06, "loss": 0.012954728677868843, "memory(GiB)": 22.66, "step": 23481, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.7628236364227008, "grad_norm": 0.33862683176994324, "learning_rate": 1.4607647455402562e-06, "loss": 0.013605345971882343, "memory(GiB)": 22.66, "step": 23482, "token_acc": 1.0, "train_speed(iter/s)": 0.958294 }, { "epoch": 0.7628561218854563, "grad_norm": 0.3581652045249939, "learning_rate": 1.4603853395368722e-06, "loss": 0.010578770190477371, "memory(GiB)": 22.66, "step": 23483, "token_acc": 1.0, "train_speed(iter/s)": 0.958301 }, { "epoch": 0.7628886073482116, "grad_norm": 0.3629402220249176, "learning_rate": 1.4600059743853522e-06, "loss": 0.013292117044329643, "memory(GiB)": 22.66, "step": 23484, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.958308 }, { "epoch": 0.7629210928109671, "grad_norm": 0.31378573179244995, "learning_rate": 1.4596266500900753e-06, "loss": 0.015096761286258698, "memory(GiB)": 22.66, "step": 23485, "token_acc": 1.0, "train_speed(iter/s)": 0.958315 }, { "epoch": 0.7629535782737225, "grad_norm": 0.2954799234867096, "learning_rate": 1.4592473666554201e-06, "loss": 0.015138611197471619, "memory(GiB)": 22.66, "step": 23486, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.958322 }, { "epoch": 0.7629860637364779, "grad_norm": 0.3650681674480438, "learning_rate": 1.4588681240857622e-06, "loss": 0.01705765910446644, "memory(GiB)": 22.66, "step": 23487, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958328 }, { "epoch": 0.7630185491992334, "grad_norm": 0.31316375732421875, "learning_rate": 1.4584889223854804e-06, "loss": 0.015284737572073936, "memory(GiB)": 22.66, "step": 23488, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.7630510346619888, "grad_norm": 0.25490403175354004, "learning_rate": 1.4581097615589469e-06, "loss": 0.007803469896316528, "memory(GiB)": 22.66, "step": 23489, "token_acc": 1.0, "train_speed(iter/s)": 0.95834 }, { "epoch": 0.7630835201247442, "grad_norm": 0.4790729582309723, "learning_rate": 1.4577306416105446e-06, "loss": 0.01675713248550892, "memory(GiB)": 22.66, "step": 23490, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.958346 }, { "epoch": 0.7631160055874996, "grad_norm": 0.2546038031578064, "learning_rate": 1.4573515625446437e-06, "loss": 0.010350849479436874, "memory(GiB)": 22.66, "step": 23491, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.7631484910502551, "grad_norm": 0.3476005494594574, "learning_rate": 1.4569725243656208e-06, "loss": 0.011304182931780815, "memory(GiB)": 22.66, "step": 23492, "token_acc": 1.0, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.7631809765130104, "grad_norm": 0.561681866645813, "learning_rate": 1.4565935270778526e-06, "loss": 0.01644184999167919, "memory(GiB)": 22.66, "step": 23493, "token_acc": 1.0, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.7632134619757659, "grad_norm": 0.29244476556777954, "learning_rate": 1.4562145706857084e-06, "loss": 0.00863093975931406, "memory(GiB)": 22.66, "step": 23494, "token_acc": 1.0, "train_speed(iter/s)": 0.958368 }, { "epoch": 0.7632459474385213, "grad_norm": 0.23730884492397308, "learning_rate": 1.4558356551935675e-06, "loss": 0.008722004480659962, "memory(GiB)": 22.66, "step": 23495, "token_acc": 1.0, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.7632784329012767, "grad_norm": 0.443030446767807, "learning_rate": 1.455456780605799e-06, "loss": 0.02202395349740982, "memory(GiB)": 22.66, "step": 23496, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.95838 }, { "epoch": 0.7633109183640321, "grad_norm": 0.40678468346595764, "learning_rate": 1.4550779469267784e-06, "loss": 0.016361091285943985, "memory(GiB)": 22.66, "step": 23497, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.958386 }, { "epoch": 0.7633434038267876, "grad_norm": 0.39136871695518494, "learning_rate": 1.4546991541608756e-06, "loss": 0.010420947335660458, "memory(GiB)": 22.66, "step": 23498, "token_acc": 1.0, "train_speed(iter/s)": 0.958391 }, { "epoch": 0.7633758892895429, "grad_norm": 0.3593440353870392, "learning_rate": 1.454320402312463e-06, "loss": 0.017459489405155182, "memory(GiB)": 22.66, "step": 23499, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.958398 }, { "epoch": 0.7634083747522984, "grad_norm": 0.30270999670028687, "learning_rate": 1.4539416913859121e-06, "loss": 0.012945239432156086, "memory(GiB)": 22.66, "step": 23500, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.958402 }, { "epoch": 0.7634083747522984, "eval_loss": 0.013836659491062164, "eval_runtime": 80.6009, "eval_samples_per_second": 123.448, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9943832083155288, "step": 23500 }, { "epoch": 0.7634408602150538, "grad_norm": 0.2557752728462219, "learning_rate": 1.4535630213855956e-06, "loss": 0.008824107237160206, "memory(GiB)": 22.66, "step": 23501, "token_acc": 0.9938803170735047, "train_speed(iter/s)": 0.95483 }, { "epoch": 0.7634733456778092, "grad_norm": 0.2622112035751343, "learning_rate": 1.4531843923158807e-06, "loss": 0.01193197164684534, "memory(GiB)": 22.66, "step": 23502, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.954839 }, { "epoch": 0.7635058311405646, "grad_norm": 0.27076518535614014, "learning_rate": 1.4528058041811382e-06, "loss": 0.013280337676405907, "memory(GiB)": 22.66, "step": 23503, "token_acc": 1.0, "train_speed(iter/s)": 0.954848 }, { "epoch": 0.76353831660332, "grad_norm": 0.4370799660682678, "learning_rate": 1.4524272569857384e-06, "loss": 0.02124543860554695, "memory(GiB)": 22.66, "step": 23504, "token_acc": 0.9863481228668942, "train_speed(iter/s)": 0.954857 }, { "epoch": 0.7635708020660754, "grad_norm": 0.2941330671310425, "learning_rate": 1.4520487507340513e-06, "loss": 0.012125386856496334, "memory(GiB)": 22.66, "step": 23505, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954865 }, { "epoch": 0.7636032875288309, "grad_norm": 0.27200090885162354, "learning_rate": 1.4516702854304426e-06, "loss": 0.00826981570571661, "memory(GiB)": 22.66, "step": 23506, "token_acc": 1.0, "train_speed(iter/s)": 0.954874 }, { "epoch": 0.7636357729915862, "grad_norm": 0.3213288187980652, "learning_rate": 1.4512918610792814e-06, "loss": 0.012169791385531425, "memory(GiB)": 22.66, "step": 23507, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.954883 }, { "epoch": 0.7636682584543417, "grad_norm": 0.2876436114311218, "learning_rate": 1.4509134776849354e-06, "loss": 0.011012367904186249, "memory(GiB)": 22.66, "step": 23508, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.954891 }, { "epoch": 0.7637007439170971, "grad_norm": 0.3883163630962372, "learning_rate": 1.450535135251772e-06, "loss": 0.01785954087972641, "memory(GiB)": 22.66, "step": 23509, "token_acc": 1.0, "train_speed(iter/s)": 0.9549 }, { "epoch": 0.7637332293798526, "grad_norm": 0.39893919229507446, "learning_rate": 1.4501568337841587e-06, "loss": 0.00926217995584011, "memory(GiB)": 22.66, "step": 23510, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.954908 }, { "epoch": 0.7637657148426079, "grad_norm": 0.42082008719444275, "learning_rate": 1.4497785732864589e-06, "loss": 0.012690919451415539, "memory(GiB)": 22.66, "step": 23511, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.954917 }, { "epoch": 0.7637982003053634, "grad_norm": 0.2726864218711853, "learning_rate": 1.4494003537630403e-06, "loss": 0.009074138477444649, "memory(GiB)": 22.66, "step": 23512, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.954925 }, { "epoch": 0.7638306857681187, "grad_norm": 0.37299633026123047, "learning_rate": 1.4490221752182665e-06, "loss": 0.013194257393479347, "memory(GiB)": 22.66, "step": 23513, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.954934 }, { "epoch": 0.7638631712308742, "grad_norm": 0.3473152220249176, "learning_rate": 1.4486440376565053e-06, "loss": 0.013309311121702194, "memory(GiB)": 22.66, "step": 23514, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.954942 }, { "epoch": 0.7638956566936296, "grad_norm": 0.5897576212882996, "learning_rate": 1.448265941082117e-06, "loss": 0.02084658294916153, "memory(GiB)": 22.66, "step": 23515, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.954949 }, { "epoch": 0.763928142156385, "grad_norm": 0.5700567960739136, "learning_rate": 1.4478878854994682e-06, "loss": 0.018577581271529198, "memory(GiB)": 22.66, "step": 23516, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.954956 }, { "epoch": 0.7639606276191404, "grad_norm": 0.25331562757492065, "learning_rate": 1.4475098709129187e-06, "loss": 0.009164777584373951, "memory(GiB)": 22.66, "step": 23517, "token_acc": 1.0, "train_speed(iter/s)": 0.954963 }, { "epoch": 0.7639931130818959, "grad_norm": 0.3303510844707489, "learning_rate": 1.4471318973268362e-06, "loss": 0.015429206192493439, "memory(GiB)": 22.66, "step": 23518, "token_acc": 0.992, "train_speed(iter/s)": 0.95497 }, { "epoch": 0.7640255985446512, "grad_norm": 0.38821521401405334, "learning_rate": 1.446753964745579e-06, "loss": 0.015833020210266113, "memory(GiB)": 22.66, "step": 23519, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.954977 }, { "epoch": 0.7640580840074067, "grad_norm": 0.2667061686515808, "learning_rate": 1.4463760731735127e-06, "loss": 0.01150168664753437, "memory(GiB)": 22.66, "step": 23520, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.954983 }, { "epoch": 0.7640905694701621, "grad_norm": 0.4044424295425415, "learning_rate": 1.4459982226149937e-06, "loss": 0.011124750599265099, "memory(GiB)": 22.66, "step": 23521, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.954991 }, { "epoch": 0.7641230549329175, "grad_norm": 0.41128355264663696, "learning_rate": 1.4456204130743862e-06, "loss": 0.017658408731222153, "memory(GiB)": 22.66, "step": 23522, "token_acc": 1.0, "train_speed(iter/s)": 0.954997 }, { "epoch": 0.7641555403956729, "grad_norm": 0.3579014241695404, "learning_rate": 1.4452426445560503e-06, "loss": 0.01533426158130169, "memory(GiB)": 22.66, "step": 23523, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955003 }, { "epoch": 0.7641880258584284, "grad_norm": 0.6342993378639221, "learning_rate": 1.4448649170643447e-06, "loss": 0.020074373111128807, "memory(GiB)": 22.66, "step": 23524, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95501 }, { "epoch": 0.7642205113211837, "grad_norm": 0.34746596217155457, "learning_rate": 1.4444872306036323e-06, "loss": 0.011188406497240067, "memory(GiB)": 22.66, "step": 23525, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.7642529967839392, "grad_norm": 0.38838422298431396, "learning_rate": 1.4441095851782676e-06, "loss": 0.009856238029897213, "memory(GiB)": 22.66, "step": 23526, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.955023 }, { "epoch": 0.7642854822466946, "grad_norm": 0.2575770914554596, "learning_rate": 1.4437319807926115e-06, "loss": 0.008688585832715034, "memory(GiB)": 22.66, "step": 23527, "token_acc": 1.0, "train_speed(iter/s)": 0.955025 }, { "epoch": 0.76431796770945, "grad_norm": 0.41054296493530273, "learning_rate": 1.443354417451021e-06, "loss": 0.012647097930312157, "memory(GiB)": 22.66, "step": 23528, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955032 }, { "epoch": 0.7643504531722054, "grad_norm": 0.3060814142227173, "learning_rate": 1.442976895157857e-06, "loss": 0.014353320002555847, "memory(GiB)": 22.66, "step": 23529, "token_acc": 1.0, "train_speed(iter/s)": 0.955039 }, { "epoch": 0.7643829386349609, "grad_norm": 0.367602676153183, "learning_rate": 1.4425994139174714e-06, "loss": 0.014551084488630295, "memory(GiB)": 22.66, "step": 23530, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955046 }, { "epoch": 0.7644154240977162, "grad_norm": 0.3522779941558838, "learning_rate": 1.4422219737342242e-06, "loss": 0.011426451615989208, "memory(GiB)": 22.66, "step": 23531, "token_acc": 1.0, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.7644479095604717, "grad_norm": 0.45847874879837036, "learning_rate": 1.441844574612471e-06, "loss": 0.013338975608348846, "memory(GiB)": 22.66, "step": 23532, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955059 }, { "epoch": 0.7644803950232271, "grad_norm": 0.3109813630580902, "learning_rate": 1.441467216556568e-06, "loss": 0.015968676656484604, "memory(GiB)": 22.66, "step": 23533, "token_acc": 1.0, "train_speed(iter/s)": 0.955065 }, { "epoch": 0.7645128804859825, "grad_norm": 0.351834237575531, "learning_rate": 1.4410898995708688e-06, "loss": 0.01218196377158165, "memory(GiB)": 22.66, "step": 23534, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955072 }, { "epoch": 0.7645453659487379, "grad_norm": 0.2862611711025238, "learning_rate": 1.4407126236597292e-06, "loss": 0.010456360876560211, "memory(GiB)": 22.66, "step": 23535, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.7645778514114934, "grad_norm": 0.27125394344329834, "learning_rate": 1.440335388827503e-06, "loss": 0.010641878470778465, "memory(GiB)": 22.66, "step": 23536, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.7646103368742487, "grad_norm": 0.3888133466243744, "learning_rate": 1.4399581950785456e-06, "loss": 0.016743870452046394, "memory(GiB)": 22.66, "step": 23537, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95509 }, { "epoch": 0.7646428223370042, "grad_norm": 0.49548062682151794, "learning_rate": 1.439581042417208e-06, "loss": 0.009795136749744415, "memory(GiB)": 22.66, "step": 23538, "token_acc": 0.9900332225913622, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.7646753077997596, "grad_norm": 0.9774986505508423, "learning_rate": 1.4392039308478433e-06, "loss": 0.014375923201441765, "memory(GiB)": 22.66, "step": 23539, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955101 }, { "epoch": 0.764707793262515, "grad_norm": 0.32113903760910034, "learning_rate": 1.4388268603748056e-06, "loss": 0.012704784981906414, "memory(GiB)": 22.66, "step": 23540, "token_acc": 1.0, "train_speed(iter/s)": 0.955107 }, { "epoch": 0.7647402787252704, "grad_norm": 0.2901633381843567, "learning_rate": 1.4384498310024453e-06, "loss": 0.011412971653044224, "memory(GiB)": 22.66, "step": 23541, "token_acc": 1.0, "train_speed(iter/s)": 0.955112 }, { "epoch": 0.7647727641880259, "grad_norm": 0.3373354375362396, "learning_rate": 1.438072842735116e-06, "loss": 0.013022959232330322, "memory(GiB)": 22.66, "step": 23542, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.7648052496507812, "grad_norm": 0.7420746684074402, "learning_rate": 1.4376958955771653e-06, "loss": 0.012971358373761177, "memory(GiB)": 22.66, "step": 23543, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955126 }, { "epoch": 0.7648377351135367, "grad_norm": 0.5209763050079346, "learning_rate": 1.4373189895329476e-06, "loss": 0.018046336248517036, "memory(GiB)": 22.66, "step": 23544, "token_acc": 0.9816176470588235, "train_speed(iter/s)": 0.955132 }, { "epoch": 0.7648702205762921, "grad_norm": 0.3317909836769104, "learning_rate": 1.4369421246068067e-06, "loss": 0.016272395849227905, "memory(GiB)": 22.66, "step": 23545, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.955139 }, { "epoch": 0.7649027060390475, "grad_norm": 0.4207014739513397, "learning_rate": 1.4365653008031e-06, "loss": 0.010197215713560581, "memory(GiB)": 22.66, "step": 23546, "token_acc": 1.0, "train_speed(iter/s)": 0.955146 }, { "epoch": 0.7649351915018029, "grad_norm": 0.2815950810909271, "learning_rate": 1.4361885181261704e-06, "loss": 0.007631643209606409, "memory(GiB)": 22.66, "step": 23547, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955153 }, { "epoch": 0.7649676769645584, "grad_norm": 0.33162567019462585, "learning_rate": 1.435811776580371e-06, "loss": 0.010822661221027374, "memory(GiB)": 22.66, "step": 23548, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95516 }, { "epoch": 0.7650001624273137, "grad_norm": 0.32952091097831726, "learning_rate": 1.4354350761700453e-06, "loss": 0.013175839558243752, "memory(GiB)": 22.66, "step": 23549, "token_acc": 1.0, "train_speed(iter/s)": 0.955166 }, { "epoch": 0.7650326478900692, "grad_norm": 0.39491206407546997, "learning_rate": 1.4350584168995434e-06, "loss": 0.013518699444830418, "memory(GiB)": 22.66, "step": 23550, "token_acc": 1.0, "train_speed(iter/s)": 0.955173 }, { "epoch": 0.7650651333528246, "grad_norm": 0.658561110496521, "learning_rate": 1.4346817987732126e-06, "loss": 0.01266377605497837, "memory(GiB)": 22.66, "step": 23551, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.95518 }, { "epoch": 0.76509761881558, "grad_norm": 0.3342609405517578, "learning_rate": 1.4343052217954007e-06, "loss": 0.014645593240857124, "memory(GiB)": 22.66, "step": 23552, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955187 }, { "epoch": 0.7651301042783355, "grad_norm": 0.2887733280658722, "learning_rate": 1.433928685970451e-06, "loss": 0.010747101157903671, "memory(GiB)": 22.66, "step": 23553, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955193 }, { "epoch": 0.7651625897410909, "grad_norm": 0.4503382742404938, "learning_rate": 1.4335521913027105e-06, "loss": 0.01393907330930233, "memory(GiB)": 22.66, "step": 23554, "token_acc": 1.0, "train_speed(iter/s)": 0.9552 }, { "epoch": 0.7651950752038463, "grad_norm": 0.3552508056163788, "learning_rate": 1.4331757377965249e-06, "loss": 0.011410726234316826, "memory(GiB)": 22.66, "step": 23555, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.955207 }, { "epoch": 0.7652275606666017, "grad_norm": 0.590370237827301, "learning_rate": 1.4327993254562383e-06, "loss": 0.01565214991569519, "memory(GiB)": 22.66, "step": 23556, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.7652600461293572, "grad_norm": 0.373972088098526, "learning_rate": 1.4324229542861973e-06, "loss": 0.014213092625141144, "memory(GiB)": 22.66, "step": 23557, "token_acc": 1.0, "train_speed(iter/s)": 0.955225 }, { "epoch": 0.7652925315921125, "grad_norm": 0.32096919417381287, "learning_rate": 1.4320466242907427e-06, "loss": 0.01250760443508625, "memory(GiB)": 22.66, "step": 23558, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955233 }, { "epoch": 0.765325017054868, "grad_norm": 0.4039490222930908, "learning_rate": 1.4316703354742184e-06, "loss": 0.012390479445457458, "memory(GiB)": 22.66, "step": 23559, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955242 }, { "epoch": 0.7653575025176234, "grad_norm": 0.26419976353645325, "learning_rate": 1.4312940878409682e-06, "loss": 0.0068692914210259914, "memory(GiB)": 22.66, "step": 23560, "token_acc": 1.0, "train_speed(iter/s)": 0.955251 }, { "epoch": 0.7653899879803788, "grad_norm": 0.4090750217437744, "learning_rate": 1.4309178813953352e-06, "loss": 0.012671618722379208, "memory(GiB)": 22.66, "step": 23561, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.7654224734431342, "grad_norm": 0.27315548062324524, "learning_rate": 1.430541716141659e-06, "loss": 0.01009861659258604, "memory(GiB)": 22.66, "step": 23562, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955268 }, { "epoch": 0.7654549589058897, "grad_norm": 0.2444400191307068, "learning_rate": 1.4301655920842821e-06, "loss": 0.0071997856721282005, "memory(GiB)": 22.66, "step": 23563, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955277 }, { "epoch": 0.765487444368645, "grad_norm": 0.2509825527667999, "learning_rate": 1.4297895092275465e-06, "loss": 0.007236488629132509, "memory(GiB)": 22.66, "step": 23564, "token_acc": 1.0, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.7655199298314005, "grad_norm": 0.49738752841949463, "learning_rate": 1.4294134675757925e-06, "loss": 0.01021988969296217, "memory(GiB)": 22.66, "step": 23565, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955294 }, { "epoch": 0.7655524152941559, "grad_norm": 0.373159795999527, "learning_rate": 1.4290374671333584e-06, "loss": 0.01281808502972126, "memory(GiB)": 22.66, "step": 23566, "token_acc": 1.0, "train_speed(iter/s)": 0.955303 }, { "epoch": 0.7655849007569113, "grad_norm": 0.40265166759490967, "learning_rate": 1.4286615079045867e-06, "loss": 0.018779588863253593, "memory(GiB)": 22.66, "step": 23567, "token_acc": 0.992, "train_speed(iter/s)": 0.955312 }, { "epoch": 0.7656173862196667, "grad_norm": 1.4032467603683472, "learning_rate": 1.4282855898938115e-06, "loss": 0.016835281625390053, "memory(GiB)": 22.66, "step": 23568, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.7656498716824222, "grad_norm": 0.2637975215911865, "learning_rate": 1.4279097131053754e-06, "loss": 0.010482583194971085, "memory(GiB)": 22.66, "step": 23569, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955329 }, { "epoch": 0.7656823571451775, "grad_norm": 0.3786930441856384, "learning_rate": 1.4275338775436182e-06, "loss": 0.017272362485527992, "memory(GiB)": 22.66, "step": 23570, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.765714842607933, "grad_norm": 0.3341703712940216, "learning_rate": 1.4271580832128729e-06, "loss": 0.014607034623622894, "memory(GiB)": 22.66, "step": 23571, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955335 }, { "epoch": 0.7657473280706883, "grad_norm": 0.38622990250587463, "learning_rate": 1.42678233011748e-06, "loss": 0.010402705520391464, "memory(GiB)": 22.66, "step": 23572, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955344 }, { "epoch": 0.7657798135334438, "grad_norm": 0.37243911623954773, "learning_rate": 1.426406618261772e-06, "loss": 0.01455397717654705, "memory(GiB)": 22.66, "step": 23573, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955353 }, { "epoch": 0.7658122989961992, "grad_norm": 0.307140976190567, "learning_rate": 1.4260309476500916e-06, "loss": 0.01364324614405632, "memory(GiB)": 22.66, "step": 23574, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955361 }, { "epoch": 0.7658447844589547, "grad_norm": 0.35341590642929077, "learning_rate": 1.4256553182867694e-06, "loss": 0.013224955648183823, "memory(GiB)": 22.66, "step": 23575, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955369 }, { "epoch": 0.76587726992171, "grad_norm": 0.4061407446861267, "learning_rate": 1.4252797301761433e-06, "loss": 0.016380194574594498, "memory(GiB)": 22.66, "step": 23576, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.7659097553844655, "grad_norm": 0.28907111287117004, "learning_rate": 1.4249041833225442e-06, "loss": 0.012252327054738998, "memory(GiB)": 22.66, "step": 23577, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955387 }, { "epoch": 0.7659422408472208, "grad_norm": 0.38957035541534424, "learning_rate": 1.4245286777303125e-06, "loss": 0.015091724693775177, "memory(GiB)": 22.66, "step": 23578, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955395 }, { "epoch": 0.7659747263099763, "grad_norm": 0.35414496064186096, "learning_rate": 1.4241532134037773e-06, "loss": 0.020393647253513336, "memory(GiB)": 22.66, "step": 23579, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.7660072117727317, "grad_norm": 1.0767463445663452, "learning_rate": 1.423777790347275e-06, "loss": 0.019616054370999336, "memory(GiB)": 22.66, "step": 23580, "token_acc": 1.0, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.7660396972354871, "grad_norm": 0.6015012264251709, "learning_rate": 1.423402408565135e-06, "loss": 0.015744363889098167, "memory(GiB)": 22.66, "step": 23581, "token_acc": 1.0, "train_speed(iter/s)": 0.95542 }, { "epoch": 0.7660721826982425, "grad_norm": 0.3528178334236145, "learning_rate": 1.423027068061692e-06, "loss": 0.014022717252373695, "memory(GiB)": 22.66, "step": 23582, "token_acc": 1.0, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.766104668160998, "grad_norm": 0.24936838448047638, "learning_rate": 1.4226517688412777e-06, "loss": 0.009870689362287521, "memory(GiB)": 22.66, "step": 23583, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.7661371536237533, "grad_norm": 0.4371728003025055, "learning_rate": 1.4222765109082248e-06, "loss": 0.021584589034318924, "memory(GiB)": 22.66, "step": 23584, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.7661696390865088, "grad_norm": 0.25211721658706665, "learning_rate": 1.4219012942668615e-06, "loss": 0.008497203700244427, "memory(GiB)": 22.66, "step": 23585, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.7662021245492642, "grad_norm": 0.31735578179359436, "learning_rate": 1.421526118921519e-06, "loss": 0.012364963069558144, "memory(GiB)": 22.66, "step": 23586, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.7662346100120196, "grad_norm": 0.3750666379928589, "learning_rate": 1.4211509848765287e-06, "loss": 0.013421649113297462, "memory(GiB)": 22.66, "step": 23587, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95546 }, { "epoch": 0.766267095474775, "grad_norm": 0.40048301219940186, "learning_rate": 1.4207758921362196e-06, "loss": 0.014687606133520603, "memory(GiB)": 22.66, "step": 23588, "token_acc": 1.0, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.7662995809375305, "grad_norm": 0.6852939128875732, "learning_rate": 1.4204008407049219e-06, "loss": 0.013715951703488827, "memory(GiB)": 22.66, "step": 23589, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955473 }, { "epoch": 0.7663320664002858, "grad_norm": 0.38763782382011414, "learning_rate": 1.4200258305869613e-06, "loss": 0.015339348465204239, "memory(GiB)": 22.66, "step": 23590, "token_acc": 1.0, "train_speed(iter/s)": 0.95548 }, { "epoch": 0.7663645518630413, "grad_norm": 0.3644319474697113, "learning_rate": 1.4196508617866678e-06, "loss": 0.013029330410063267, "memory(GiB)": 22.66, "step": 23591, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955486 }, { "epoch": 0.7663970373257967, "grad_norm": 0.3025904595851898, "learning_rate": 1.4192759343083683e-06, "loss": 0.00709077063947916, "memory(GiB)": 22.66, "step": 23592, "token_acc": 1.0, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.7664295227885521, "grad_norm": 0.31876063346862793, "learning_rate": 1.4189010481563924e-06, "loss": 0.011982200667262077, "memory(GiB)": 22.66, "step": 23593, "token_acc": 0.9965986394557823, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.7664620082513075, "grad_norm": 0.2777763605117798, "learning_rate": 1.4185262033350633e-06, "loss": 0.011233069933950901, "memory(GiB)": 22.66, "step": 23594, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.766494493714063, "grad_norm": 0.23746417462825775, "learning_rate": 1.418151399848709e-06, "loss": 0.011526491492986679, "memory(GiB)": 22.66, "step": 23595, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.7665269791768183, "grad_norm": 0.31558212637901306, "learning_rate": 1.4177766377016544e-06, "loss": 0.010564976371824741, "memory(GiB)": 22.66, "step": 23596, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.7665594646395738, "grad_norm": 0.38964346051216125, "learning_rate": 1.4174019168982273e-06, "loss": 0.013855833560228348, "memory(GiB)": 22.66, "step": 23597, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.7665919501023292, "grad_norm": 0.4424388110637665, "learning_rate": 1.4170272374427485e-06, "loss": 0.010454333387315273, "memory(GiB)": 22.66, "step": 23598, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.7666244355650846, "grad_norm": 0.261554479598999, "learning_rate": 1.4166525993395463e-06, "loss": 0.012425167486071587, "memory(GiB)": 22.66, "step": 23599, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.76665692102784, "grad_norm": 0.2959284484386444, "learning_rate": 1.4162780025929395e-06, "loss": 0.015831470489501953, "memory(GiB)": 22.66, "step": 23600, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955539 }, { "epoch": 0.7666894064905955, "grad_norm": 0.3955017030239105, "learning_rate": 1.4159034472072558e-06, "loss": 0.012787524610757828, "memory(GiB)": 22.66, "step": 23601, "token_acc": 0.9943502824858758, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.7667218919533508, "grad_norm": 0.2733304798603058, "learning_rate": 1.4155289331868183e-06, "loss": 0.008618159219622612, "memory(GiB)": 22.66, "step": 23602, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95555 }, { "epoch": 0.7667543774161063, "grad_norm": 0.3570561707019806, "learning_rate": 1.4151544605359463e-06, "loss": 0.01352125033736229, "memory(GiB)": 22.66, "step": 23603, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955557 }, { "epoch": 0.7667868628788617, "grad_norm": 0.4135774075984955, "learning_rate": 1.4147800292589647e-06, "loss": 0.01311349868774414, "memory(GiB)": 22.66, "step": 23604, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955562 }, { "epoch": 0.7668193483416171, "grad_norm": 0.34844890236854553, "learning_rate": 1.4144056393601902e-06, "loss": 0.013841470703482628, "memory(GiB)": 22.66, "step": 23605, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955567 }, { "epoch": 0.7668518338043725, "grad_norm": 0.2581745386123657, "learning_rate": 1.41403129084395e-06, "loss": 0.009460801258683205, "memory(GiB)": 22.66, "step": 23606, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.766884319267128, "grad_norm": 0.40341782569885254, "learning_rate": 1.4136569837145608e-06, "loss": 0.01785089448094368, "memory(GiB)": 22.66, "step": 23607, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.7669168047298833, "grad_norm": 1.049601674079895, "learning_rate": 1.4132827179763448e-06, "loss": 0.014048857614398003, "memory(GiB)": 22.66, "step": 23608, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.7669492901926388, "grad_norm": 0.30156034231185913, "learning_rate": 1.4129084936336184e-06, "loss": 0.009742667898535728, "memory(GiB)": 22.66, "step": 23609, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.7669817756553942, "grad_norm": 0.3482949435710907, "learning_rate": 1.4125343106907024e-06, "loss": 0.01833289861679077, "memory(GiB)": 22.66, "step": 23610, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.7670142611181496, "grad_norm": 0.3479171395301819, "learning_rate": 1.4121601691519154e-06, "loss": 0.010913792997598648, "memory(GiB)": 22.66, "step": 23611, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.767046746580905, "grad_norm": 0.3541903793811798, "learning_rate": 1.4117860690215768e-06, "loss": 0.010225445032119751, "memory(GiB)": 22.66, "step": 23612, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955613 }, { "epoch": 0.7670792320436605, "grad_norm": 0.4542444050312042, "learning_rate": 1.4114120103040013e-06, "loss": 0.014610765501856804, "memory(GiB)": 22.66, "step": 23613, "token_acc": 0.996, "train_speed(iter/s)": 0.955621 }, { "epoch": 0.7671117175064158, "grad_norm": 0.24873410165309906, "learning_rate": 1.4110379930035078e-06, "loss": 0.009431191720068455, "memory(GiB)": 22.66, "step": 23614, "token_acc": 1.0, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.7671442029691713, "grad_norm": 0.40494003891944885, "learning_rate": 1.410664017124413e-06, "loss": 0.012972640804946423, "memory(GiB)": 22.66, "step": 23615, "token_acc": 1.0, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.7671766884319268, "grad_norm": 0.4170151352882385, "learning_rate": 1.4102900826710325e-06, "loss": 0.013424125500023365, "memory(GiB)": 22.66, "step": 23616, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955644 }, { "epoch": 0.7672091738946821, "grad_norm": 0.39260098338127136, "learning_rate": 1.4099161896476844e-06, "loss": 0.012269064784049988, "memory(GiB)": 22.66, "step": 23617, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.7672416593574376, "grad_norm": 0.4340621829032898, "learning_rate": 1.4095423380586798e-06, "loss": 0.01720432937145233, "memory(GiB)": 22.66, "step": 23618, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.767274144820193, "grad_norm": 0.2795720398426056, "learning_rate": 1.4091685279083355e-06, "loss": 0.00964006595313549, "memory(GiB)": 22.66, "step": 23619, "token_acc": 1.0, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.7673066302829484, "grad_norm": 0.27108028531074524, "learning_rate": 1.4087947592009659e-06, "loss": 0.015363499522209167, "memory(GiB)": 22.66, "step": 23620, "token_acc": 1.0, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.7673391157457038, "grad_norm": 3.949930429458618, "learning_rate": 1.4084210319408858e-06, "loss": 0.01165841892361641, "memory(GiB)": 22.66, "step": 23621, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.7673716012084593, "grad_norm": 0.45165079832077026, "learning_rate": 1.4080473461324062e-06, "loss": 0.012572316452860832, "memory(GiB)": 22.66, "step": 23622, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.7674040866712146, "grad_norm": 0.5907531976699829, "learning_rate": 1.4076737017798415e-06, "loss": 0.023833155632019043, "memory(GiB)": 22.66, "step": 23623, "token_acc": 0.975, "train_speed(iter/s)": 0.955692 }, { "epoch": 0.7674365721339701, "grad_norm": 0.48415762186050415, "learning_rate": 1.407300098887503e-06, "loss": 0.01746094413101673, "memory(GiB)": 22.66, "step": 23624, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955698 }, { "epoch": 0.7674690575967255, "grad_norm": 0.5822321772575378, "learning_rate": 1.4069265374597052e-06, "loss": 0.023991137742996216, "memory(GiB)": 22.66, "step": 23625, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.7675015430594809, "grad_norm": 0.34522396326065063, "learning_rate": 1.4065530175007559e-06, "loss": 0.017002122476696968, "memory(GiB)": 22.66, "step": 23626, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.7675340285222363, "grad_norm": 0.5577316284179688, "learning_rate": 1.4061795390149692e-06, "loss": 0.010334653779864311, "memory(GiB)": 22.66, "step": 23627, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.7675665139849918, "grad_norm": 0.2930067181587219, "learning_rate": 1.4058061020066505e-06, "loss": 0.017853207886219025, "memory(GiB)": 22.66, "step": 23628, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.955727 }, { "epoch": 0.7675989994477471, "grad_norm": 0.30545902252197266, "learning_rate": 1.4054327064801165e-06, "loss": 0.011661345139145851, "memory(GiB)": 22.66, "step": 23629, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955733 }, { "epoch": 0.7676314849105026, "grad_norm": 0.5626862645149231, "learning_rate": 1.4050593524396722e-06, "loss": 0.02092703804373741, "memory(GiB)": 22.66, "step": 23630, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955739 }, { "epoch": 0.767663970373258, "grad_norm": 0.31545835733413696, "learning_rate": 1.4046860398896278e-06, "loss": 0.0108824223279953, "memory(GiB)": 22.66, "step": 23631, "token_acc": 1.0, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.7676964558360134, "grad_norm": 0.34683549404144287, "learning_rate": 1.4043127688342927e-06, "loss": 0.011009417474269867, "memory(GiB)": 22.66, "step": 23632, "token_acc": 1.0, "train_speed(iter/s)": 0.955753 }, { "epoch": 0.7677289412987688, "grad_norm": 0.4246792197227478, "learning_rate": 1.403939539277972e-06, "loss": 0.015776976943016052, "memory(GiB)": 22.66, "step": 23633, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.7677614267615243, "grad_norm": 0.25884684920310974, "learning_rate": 1.403566351224978e-06, "loss": 0.011424411088228226, "memory(GiB)": 22.66, "step": 23634, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955766 }, { "epoch": 0.7677939122242796, "grad_norm": 0.3948104977607727, "learning_rate": 1.4031932046796138e-06, "loss": 0.009090437553822994, "memory(GiB)": 22.66, "step": 23635, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955773 }, { "epoch": 0.7678263976870351, "grad_norm": 0.348227322101593, "learning_rate": 1.4028200996461882e-06, "loss": 0.010293074883520603, "memory(GiB)": 22.66, "step": 23636, "token_acc": 1.0, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.7678588831497904, "grad_norm": 0.3634352684020996, "learning_rate": 1.4024470361290055e-06, "loss": 0.012908004224300385, "memory(GiB)": 22.66, "step": 23637, "token_acc": 1.0, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.7678913686125459, "grad_norm": 0.5499260425567627, "learning_rate": 1.4020740141323725e-06, "loss": 0.012608830817043781, "memory(GiB)": 22.66, "step": 23638, "token_acc": 1.0, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.7679238540753013, "grad_norm": 0.2682031989097595, "learning_rate": 1.4017010336605941e-06, "loss": 0.01448950543999672, "memory(GiB)": 22.66, "step": 23639, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.7679563395380568, "grad_norm": 0.39411184191703796, "learning_rate": 1.4013280947179763e-06, "loss": 0.018580026924610138, "memory(GiB)": 22.66, "step": 23640, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.7679888250008121, "grad_norm": 0.5102733373641968, "learning_rate": 1.4009551973088208e-06, "loss": 0.018086981028318405, "memory(GiB)": 22.66, "step": 23641, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.955825 }, { "epoch": 0.7680213104635676, "grad_norm": 0.23052437603473663, "learning_rate": 1.4005823414374326e-06, "loss": 0.010306824930012226, "memory(GiB)": 22.66, "step": 23642, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.768053795926323, "grad_norm": 0.44436952471733093, "learning_rate": 1.4002095271081152e-06, "loss": 0.014828401617705822, "memory(GiB)": 22.66, "step": 23643, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.7680862813890784, "grad_norm": 0.2956813871860504, "learning_rate": 1.3998367543251724e-06, "loss": 0.012705370783805847, "memory(GiB)": 22.66, "step": 23644, "token_acc": 1.0, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.7681187668518338, "grad_norm": 0.26948514580726624, "learning_rate": 1.399464023092904e-06, "loss": 0.008552245795726776, "memory(GiB)": 22.66, "step": 23645, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.955857 }, { "epoch": 0.7681512523145893, "grad_norm": 0.2861621081829071, "learning_rate": 1.3990913334156131e-06, "loss": 0.009709913283586502, "memory(GiB)": 22.66, "step": 23646, "token_acc": 1.0, "train_speed(iter/s)": 0.955864 }, { "epoch": 0.7681837377773446, "grad_norm": 0.416480153799057, "learning_rate": 1.398718685297601e-06, "loss": 0.01682550460100174, "memory(GiB)": 22.66, "step": 23647, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.7682162232401001, "grad_norm": 0.4925321638584137, "learning_rate": 1.398346078743168e-06, "loss": 0.016891641542315483, "memory(GiB)": 22.66, "step": 23648, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.7682487087028554, "grad_norm": 0.27326634526252747, "learning_rate": 1.397973513756617e-06, "loss": 0.011355763301253319, "memory(GiB)": 22.66, "step": 23649, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.7682811941656109, "grad_norm": 0.4177614450454712, "learning_rate": 1.3976009903422445e-06, "loss": 0.017411015927791595, "memory(GiB)": 22.66, "step": 23650, "token_acc": 0.9935897435897436, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.7683136796283663, "grad_norm": 0.4720742106437683, "learning_rate": 1.3972285085043514e-06, "loss": 0.017962675541639328, "memory(GiB)": 22.66, "step": 23651, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.955896 }, { "epoch": 0.7683461650911217, "grad_norm": 0.3789626955986023, "learning_rate": 1.396856068247236e-06, "loss": 0.013852144591510296, "memory(GiB)": 22.66, "step": 23652, "token_acc": 1.0, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.7683786505538771, "grad_norm": 0.3632068634033203, "learning_rate": 1.3964836695751994e-06, "loss": 0.01445675641298294, "memory(GiB)": 22.66, "step": 23653, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.7684111360166326, "grad_norm": 0.3934095501899719, "learning_rate": 1.3961113124925363e-06, "loss": 0.01589469239115715, "memory(GiB)": 22.66, "step": 23654, "token_acc": 1.0, "train_speed(iter/s)": 0.955916 }, { "epoch": 0.7684436214793879, "grad_norm": 0.4949711561203003, "learning_rate": 1.3957389970035463e-06, "loss": 0.01155007816851139, "memory(GiB)": 22.66, "step": 23655, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.7684761069421434, "grad_norm": 0.3688145577907562, "learning_rate": 1.395366723112523e-06, "loss": 0.018212202936410904, "memory(GiB)": 22.66, "step": 23656, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.7685085924048988, "grad_norm": 0.3264252841472626, "learning_rate": 1.3949944908237678e-06, "loss": 0.012928483076393604, "memory(GiB)": 22.66, "step": 23657, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.7685410778676542, "grad_norm": 0.47606968879699707, "learning_rate": 1.3946223001415737e-06, "loss": 0.013418471440672874, "memory(GiB)": 22.66, "step": 23658, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.7685735633304096, "grad_norm": 0.2768813669681549, "learning_rate": 1.3942501510702384e-06, "loss": 0.008724350482225418, "memory(GiB)": 22.66, "step": 23659, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955948 }, { "epoch": 0.7686060487931651, "grad_norm": 0.44037216901779175, "learning_rate": 1.3938780436140536e-06, "loss": 0.012951664626598358, "memory(GiB)": 22.66, "step": 23660, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.7686385342559204, "grad_norm": 0.420782208442688, "learning_rate": 1.3935059777773163e-06, "loss": 0.018688540905714035, "memory(GiB)": 22.66, "step": 23661, "token_acc": 1.0, "train_speed(iter/s)": 0.955959 }, { "epoch": 0.7686710197186759, "grad_norm": 0.3938728868961334, "learning_rate": 1.3931339535643201e-06, "loss": 0.01767272874712944, "memory(GiB)": 22.66, "step": 23662, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.7687035051814313, "grad_norm": 0.4216359853744507, "learning_rate": 1.392761970979359e-06, "loss": 0.010394741781055927, "memory(GiB)": 22.66, "step": 23663, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.955972 }, { "epoch": 0.7687359906441867, "grad_norm": 0.3068157136440277, "learning_rate": 1.3923900300267267e-06, "loss": 0.008778339251875877, "memory(GiB)": 22.66, "step": 23664, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955977 }, { "epoch": 0.7687684761069421, "grad_norm": 0.2897043824195862, "learning_rate": 1.3920181307107145e-06, "loss": 0.007560985628515482, "memory(GiB)": 22.66, "step": 23665, "token_acc": 0.995, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.7688009615696976, "grad_norm": 0.261307954788208, "learning_rate": 1.391646273035615e-06, "loss": 0.013003123924136162, "memory(GiB)": 22.66, "step": 23666, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.95599 }, { "epoch": 0.7688334470324529, "grad_norm": 0.3492877781391144, "learning_rate": 1.3912744570057201e-06, "loss": 0.0157576072961092, "memory(GiB)": 22.66, "step": 23667, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.7688659324952084, "grad_norm": 0.42095720767974854, "learning_rate": 1.390902682625323e-06, "loss": 0.018124457448720932, "memory(GiB)": 22.66, "step": 23668, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956004 }, { "epoch": 0.7688984179579638, "grad_norm": 0.3335510492324829, "learning_rate": 1.3905309498987113e-06, "loss": 0.010075113736093044, "memory(GiB)": 22.66, "step": 23669, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.7689309034207192, "grad_norm": 0.29220640659332275, "learning_rate": 1.390159258830176e-06, "loss": 0.012176262214779854, "memory(GiB)": 22.66, "step": 23670, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.7689633888834746, "grad_norm": 0.3777877986431122, "learning_rate": 1.3897876094240082e-06, "loss": 0.011854960583150387, "memory(GiB)": 22.66, "step": 23671, "token_acc": 1.0, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.7689958743462301, "grad_norm": 0.317929744720459, "learning_rate": 1.3894160016844976e-06, "loss": 0.010685073211789131, "memory(GiB)": 22.66, "step": 23672, "token_acc": 1.0, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.7690283598089854, "grad_norm": 0.3452739715576172, "learning_rate": 1.3890444356159305e-06, "loss": 0.009834084659814835, "memory(GiB)": 22.66, "step": 23673, "token_acc": 0.996, "train_speed(iter/s)": 0.956046 }, { "epoch": 0.7690608452717409, "grad_norm": 0.30418163537979126, "learning_rate": 1.3886729112225971e-06, "loss": 0.011326650157570839, "memory(GiB)": 22.66, "step": 23674, "token_acc": 1.0, "train_speed(iter/s)": 0.956052 }, { "epoch": 0.7690933307344963, "grad_norm": 0.4773592948913574, "learning_rate": 1.3883014285087847e-06, "loss": 0.017032355070114136, "memory(GiB)": 22.66, "step": 23675, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956059 }, { "epoch": 0.7691258161972517, "grad_norm": 0.30915021896362305, "learning_rate": 1.3879299874787827e-06, "loss": 0.009743577800691128, "memory(GiB)": 22.66, "step": 23676, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956066 }, { "epoch": 0.7691583016600071, "grad_norm": 2.36922287940979, "learning_rate": 1.3875585881368748e-06, "loss": 0.011217024177312851, "memory(GiB)": 22.66, "step": 23677, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.7691907871227626, "grad_norm": 0.3160251975059509, "learning_rate": 1.3871872304873485e-06, "loss": 0.01244665402919054, "memory(GiB)": 22.66, "step": 23678, "token_acc": 1.0, "train_speed(iter/s)": 0.956079 }, { "epoch": 0.7692232725855179, "grad_norm": 0.4087126851081848, "learning_rate": 1.3868159145344906e-06, "loss": 0.016423625871539116, "memory(GiB)": 22.66, "step": 23679, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956085 }, { "epoch": 0.7692557580482734, "grad_norm": 0.3775404989719391, "learning_rate": 1.386444640282586e-06, "loss": 0.019475294277071953, "memory(GiB)": 22.66, "step": 23680, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.7692882435110289, "grad_norm": 0.34057772159576416, "learning_rate": 1.3860734077359216e-06, "loss": 0.009247285313904285, "memory(GiB)": 22.66, "step": 23681, "token_acc": 1.0, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.7693207289737842, "grad_norm": 0.33659401535987854, "learning_rate": 1.3857022168987787e-06, "loss": 0.01068494189530611, "memory(GiB)": 22.66, "step": 23682, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956104 }, { "epoch": 0.7693532144365397, "grad_norm": 0.3295861482620239, "learning_rate": 1.3853310677754445e-06, "loss": 0.013352451846003532, "memory(GiB)": 22.66, "step": 23683, "token_acc": 1.0, "train_speed(iter/s)": 0.956111 }, { "epoch": 0.7693856998992951, "grad_norm": 0.27586832642555237, "learning_rate": 1.3849599603701973e-06, "loss": 0.008980990387499332, "memory(GiB)": 22.66, "step": 23684, "token_acc": 1.0, "train_speed(iter/s)": 0.956118 }, { "epoch": 0.7694181853620505, "grad_norm": 0.28594934940338135, "learning_rate": 1.384588894687327e-06, "loss": 0.011984637938439846, "memory(GiB)": 22.66, "step": 23685, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956124 }, { "epoch": 0.7694506708248059, "grad_norm": 0.2842726409435272, "learning_rate": 1.3842178707311117e-06, "loss": 0.010651040822267532, "memory(GiB)": 22.66, "step": 23686, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.7694831562875614, "grad_norm": 0.45394450426101685, "learning_rate": 1.3838468885058358e-06, "loss": 0.018063291907310486, "memory(GiB)": 22.66, "step": 23687, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.7695156417503167, "grad_norm": 0.29509204626083374, "learning_rate": 1.383475948015779e-06, "loss": 0.013978655450046062, "memory(GiB)": 22.66, "step": 23688, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956145 }, { "epoch": 0.7695481272130722, "grad_norm": 0.35145044326782227, "learning_rate": 1.383105049265223e-06, "loss": 0.012391660362482071, "memory(GiB)": 22.66, "step": 23689, "token_acc": 1.0, "train_speed(iter/s)": 0.956151 }, { "epoch": 0.7695806126758276, "grad_norm": 0.4603491723537445, "learning_rate": 1.382734192258449e-06, "loss": 0.019292039796710014, "memory(GiB)": 22.66, "step": 23690, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956158 }, { "epoch": 0.769613098138583, "grad_norm": 0.3049671947956085, "learning_rate": 1.3823633769997386e-06, "loss": 0.014277258887887001, "memory(GiB)": 22.66, "step": 23691, "token_acc": 1.0, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.7696455836013384, "grad_norm": 0.35435742139816284, "learning_rate": 1.381992603493368e-06, "loss": 0.012061692774295807, "memory(GiB)": 22.66, "step": 23692, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.7696780690640939, "grad_norm": 0.2957220673561096, "learning_rate": 1.3816218717436181e-06, "loss": 0.010864002630114555, "memory(GiB)": 22.66, "step": 23693, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.7697105545268492, "grad_norm": 0.2983892261981964, "learning_rate": 1.3812511817547687e-06, "loss": 0.010601719841361046, "memory(GiB)": 22.66, "step": 23694, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.7697430399896047, "grad_norm": 0.2899557054042816, "learning_rate": 1.3808805335310964e-06, "loss": 0.008950539864599705, "memory(GiB)": 22.66, "step": 23695, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.76977552545236, "grad_norm": 0.4185953438282013, "learning_rate": 1.3805099270768819e-06, "loss": 0.011436897329986095, "memory(GiB)": 22.66, "step": 23696, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.7698080109151155, "grad_norm": 0.5667737126350403, "learning_rate": 1.3801393623963983e-06, "loss": 0.009589760564267635, "memory(GiB)": 22.66, "step": 23697, "token_acc": 1.0, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.7698404963778709, "grad_norm": 0.40558767318725586, "learning_rate": 1.3797688394939247e-06, "loss": 0.01237214170396328, "memory(GiB)": 22.66, "step": 23698, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.7698729818406264, "grad_norm": 0.25592803955078125, "learning_rate": 1.3793983583737375e-06, "loss": 0.013308059424161911, "memory(GiB)": 22.66, "step": 23699, "token_acc": 0.9966555183946488, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.7699054673033817, "grad_norm": 0.45753195881843567, "learning_rate": 1.3790279190401134e-06, "loss": 0.021260347217321396, "memory(GiB)": 22.66, "step": 23700, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.7699379527661372, "grad_norm": 0.24045367538928986, "learning_rate": 1.3786575214973252e-06, "loss": 0.004573125392198563, "memory(GiB)": 22.66, "step": 23701, "token_acc": 1.0, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.7699704382288926, "grad_norm": 0.34782058000564575, "learning_rate": 1.3782871657496489e-06, "loss": 0.013896798714995384, "memory(GiB)": 22.66, "step": 23702, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956258 }, { "epoch": 0.770002923691648, "grad_norm": 0.278693825006485, "learning_rate": 1.3779168518013597e-06, "loss": 0.013846831396222115, "memory(GiB)": 22.66, "step": 23703, "token_acc": 1.0, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.7700354091544034, "grad_norm": 0.3236675560474396, "learning_rate": 1.3775465796567322e-06, "loss": 0.012942474335432053, "memory(GiB)": 22.66, "step": 23704, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.7700678946171589, "grad_norm": 0.29077252745628357, "learning_rate": 1.3771763493200374e-06, "loss": 0.006127958185970783, "memory(GiB)": 22.66, "step": 23705, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956285 }, { "epoch": 0.7701003800799142, "grad_norm": 0.34827640652656555, "learning_rate": 1.3768061607955507e-06, "loss": 0.014740804210305214, "memory(GiB)": 22.66, "step": 23706, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956293 }, { "epoch": 0.7701328655426697, "grad_norm": 0.3491721451282501, "learning_rate": 1.3764360140875405e-06, "loss": 0.00954153947532177, "memory(GiB)": 22.66, "step": 23707, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.9563 }, { "epoch": 0.770165351005425, "grad_norm": 0.3573135733604431, "learning_rate": 1.3760659092002832e-06, "loss": 0.01391730085015297, "memory(GiB)": 22.66, "step": 23708, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956306 }, { "epoch": 0.7701978364681805, "grad_norm": 0.3337629735469818, "learning_rate": 1.3756958461380505e-06, "loss": 0.010915247723460197, "memory(GiB)": 22.66, "step": 23709, "token_acc": 1.0, "train_speed(iter/s)": 0.956313 }, { "epoch": 0.7702303219309359, "grad_norm": 0.28115150332450867, "learning_rate": 1.3753258249051098e-06, "loss": 0.012276269495487213, "memory(GiB)": 22.66, "step": 23710, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.7702628073936914, "grad_norm": 0.2525339126586914, "learning_rate": 1.374955845505735e-06, "loss": 0.0071137151680886745, "memory(GiB)": 22.66, "step": 23711, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956327 }, { "epoch": 0.7702952928564467, "grad_norm": 0.3301832675933838, "learning_rate": 1.374585907944192e-06, "loss": 0.013502880930900574, "memory(GiB)": 22.66, "step": 23712, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.7703277783192022, "grad_norm": 0.2607365846633911, "learning_rate": 1.3742160122247555e-06, "loss": 0.00994569156318903, "memory(GiB)": 22.66, "step": 23713, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.7703602637819575, "grad_norm": 0.42831799387931824, "learning_rate": 1.373846158351691e-06, "loss": 0.014485036954283714, "memory(GiB)": 22.66, "step": 23714, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.770392749244713, "grad_norm": 0.3626972436904907, "learning_rate": 1.3734763463292699e-06, "loss": 0.016524698585271835, "memory(GiB)": 22.66, "step": 23715, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.7704252347074684, "grad_norm": 0.37940844893455505, "learning_rate": 1.3731065761617568e-06, "loss": 0.009963985532522202, "memory(GiB)": 22.66, "step": 23716, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956357 }, { "epoch": 0.7704577201702238, "grad_norm": 0.44274985790252686, "learning_rate": 1.3727368478534214e-06, "loss": 0.012604895979166031, "memory(GiB)": 22.66, "step": 23717, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.7704902056329792, "grad_norm": 0.29276371002197266, "learning_rate": 1.3723671614085305e-06, "loss": 0.01072509866207838, "memory(GiB)": 22.66, "step": 23718, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95637 }, { "epoch": 0.7705226910957347, "grad_norm": 0.6287811398506165, "learning_rate": 1.3719975168313527e-06, "loss": 0.01953764818608761, "memory(GiB)": 22.66, "step": 23719, "token_acc": 1.0, "train_speed(iter/s)": 0.956376 }, { "epoch": 0.77055517655849, "grad_norm": 0.4049071669578552, "learning_rate": 1.3716279141261507e-06, "loss": 0.013367971405386925, "memory(GiB)": 22.66, "step": 23720, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.7705876620212455, "grad_norm": 0.4433845281600952, "learning_rate": 1.3712583532971918e-06, "loss": 0.019663866609334946, "memory(GiB)": 22.66, "step": 23721, "token_acc": 0.992, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.7706201474840009, "grad_norm": 0.19710123538970947, "learning_rate": 1.3708888343487415e-06, "loss": 0.009748633950948715, "memory(GiB)": 22.66, "step": 23722, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956397 }, { "epoch": 0.7706526329467563, "grad_norm": 0.32915905117988586, "learning_rate": 1.370519357285066e-06, "loss": 0.01258958037942648, "memory(GiB)": 22.66, "step": 23723, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.7706851184095117, "grad_norm": 0.22442783415317535, "learning_rate": 1.3701499221104265e-06, "loss": 0.007508809678256512, "memory(GiB)": 22.66, "step": 23724, "token_acc": 1.0, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.7707176038722672, "grad_norm": 0.3123237192630768, "learning_rate": 1.3697805288290878e-06, "loss": 0.012956414371728897, "memory(GiB)": 22.66, "step": 23725, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.7707500893350225, "grad_norm": 0.3534587323665619, "learning_rate": 1.369411177445314e-06, "loss": 0.013246007263660431, "memory(GiB)": 22.66, "step": 23726, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.770782574797778, "grad_norm": 0.3512677848339081, "learning_rate": 1.369041867963367e-06, "loss": 0.017887331545352936, "memory(GiB)": 22.66, "step": 23727, "token_acc": 1.0, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.7708150602605334, "grad_norm": 0.27967652678489685, "learning_rate": 1.3686726003875117e-06, "loss": 0.010104876011610031, "memory(GiB)": 22.66, "step": 23728, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.7708475457232888, "grad_norm": 0.30668312311172485, "learning_rate": 1.3683033747220065e-06, "loss": 0.008774707093834877, "memory(GiB)": 22.66, "step": 23729, "token_acc": 1.0, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.7708800311860442, "grad_norm": 0.4972173273563385, "learning_rate": 1.3679341909711135e-06, "loss": 0.018283173441886902, "memory(GiB)": 22.66, "step": 23730, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956448 }, { "epoch": 0.7709125166487997, "grad_norm": 0.2872184216976166, "learning_rate": 1.3675650491390947e-06, "loss": 0.010025132447481155, "memory(GiB)": 22.66, "step": 23731, "token_acc": 1.0, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.770945002111555, "grad_norm": 0.2574779987335205, "learning_rate": 1.367195949230211e-06, "loss": 0.007641115225851536, "memory(GiB)": 22.66, "step": 23732, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956464 }, { "epoch": 0.7709774875743105, "grad_norm": 0.3067675828933716, "learning_rate": 1.3668268912487204e-06, "loss": 0.00929119624197483, "memory(GiB)": 22.66, "step": 23733, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.7710099730370659, "grad_norm": 0.3760277330875397, "learning_rate": 1.3664578751988843e-06, "loss": 0.013355014845728874, "memory(GiB)": 22.66, "step": 23734, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.7710424584998213, "grad_norm": 0.4105103313922882, "learning_rate": 1.3660889010849583e-06, "loss": 0.016648896038532257, "memory(GiB)": 22.66, "step": 23735, "token_acc": 1.0, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.7710749439625767, "grad_norm": 0.3259742856025696, "learning_rate": 1.365719968911205e-06, "loss": 0.009265311062335968, "memory(GiB)": 22.66, "step": 23736, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.7711074294253322, "grad_norm": 0.31652677059173584, "learning_rate": 1.3653510786818796e-06, "loss": 0.010881787165999413, "memory(GiB)": 22.66, "step": 23737, "token_acc": 1.0, "train_speed(iter/s)": 0.956497 }, { "epoch": 0.7711399148880875, "grad_norm": 0.3610530197620392, "learning_rate": 1.3649822304012416e-06, "loss": 0.009016701951622963, "memory(GiB)": 22.66, "step": 23738, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.771172400350843, "grad_norm": 0.40559840202331543, "learning_rate": 1.364613424073546e-06, "loss": 0.013661406002938747, "memory(GiB)": 22.66, "step": 23739, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.7712048858135984, "grad_norm": 0.3769261837005615, "learning_rate": 1.3642446597030478e-06, "loss": 0.01447516679763794, "memory(GiB)": 22.66, "step": 23740, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.7712373712763538, "grad_norm": 0.35436296463012695, "learning_rate": 1.3638759372940092e-06, "loss": 0.01867295242846012, "memory(GiB)": 22.66, "step": 23741, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.7712698567391092, "grad_norm": 0.4032127261161804, "learning_rate": 1.3635072568506803e-06, "loss": 0.017265547066926956, "memory(GiB)": 22.66, "step": 23742, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.7713023422018647, "grad_norm": 0.49249792098999023, "learning_rate": 1.3631386183773187e-06, "loss": 0.016355637460947037, "memory(GiB)": 22.66, "step": 23743, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.7713348276646201, "grad_norm": 0.49910882115364075, "learning_rate": 1.362770021878177e-06, "loss": 0.01418374665081501, "memory(GiB)": 22.66, "step": 23744, "token_acc": 1.0, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.7713673131273755, "grad_norm": 1.1258065700531006, "learning_rate": 1.3624014673575108e-06, "loss": 0.0088319331407547, "memory(GiB)": 22.66, "step": 23745, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.771399798590131, "grad_norm": 0.386441707611084, "learning_rate": 1.3620329548195727e-06, "loss": 0.015474253334105015, "memory(GiB)": 22.66, "step": 23746, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.7714322840528863, "grad_norm": 0.4016939401626587, "learning_rate": 1.3616644842686182e-06, "loss": 0.016596024855971336, "memory(GiB)": 22.66, "step": 23747, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.7714647695156418, "grad_norm": 0.4344954490661621, "learning_rate": 1.3612960557088967e-06, "loss": 0.019786346703767776, "memory(GiB)": 22.66, "step": 23748, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.7714972549783972, "grad_norm": 0.6464493870735168, "learning_rate": 1.3609276691446616e-06, "loss": 0.01560977939516306, "memory(GiB)": 22.66, "step": 23749, "token_acc": 1.0, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.7715297404411526, "grad_norm": 0.29154300689697266, "learning_rate": 1.3605593245801652e-06, "loss": 0.01063825935125351, "memory(GiB)": 22.66, "step": 23750, "token_acc": 1.0, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.771562225903908, "grad_norm": 0.4400003254413605, "learning_rate": 1.3601910220196596e-06, "loss": 0.01467614620923996, "memory(GiB)": 22.66, "step": 23751, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.7715947113666635, "grad_norm": 0.277672678232193, "learning_rate": 1.3598227614673925e-06, "loss": 0.013041799888014793, "memory(GiB)": 22.66, "step": 23752, "token_acc": 1.0, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.7716271968294188, "grad_norm": 0.3364144563674927, "learning_rate": 1.3594545429276158e-06, "loss": 0.010943403467535973, "memory(GiB)": 22.66, "step": 23753, "token_acc": 1.0, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.7716596822921743, "grad_norm": 0.29727914929389954, "learning_rate": 1.3590863664045795e-06, "loss": 0.008404852822422981, "memory(GiB)": 22.66, "step": 23754, "token_acc": 1.0, "train_speed(iter/s)": 0.956615 }, { "epoch": 0.7716921677549297, "grad_norm": 0.3800469934940338, "learning_rate": 1.3587182319025322e-06, "loss": 0.0129520483314991, "memory(GiB)": 22.66, "step": 23755, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956624 }, { "epoch": 0.7717246532176851, "grad_norm": 0.45905351638793945, "learning_rate": 1.3583501394257248e-06, "loss": 0.018971577286720276, "memory(GiB)": 22.66, "step": 23756, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.7717571386804405, "grad_norm": 0.27920055389404297, "learning_rate": 1.3579820889784022e-06, "loss": 0.010628458112478256, "memory(GiB)": 22.66, "step": 23757, "token_acc": 1.0, "train_speed(iter/s)": 0.956642 }, { "epoch": 0.771789624143196, "grad_norm": 0.5094689130783081, "learning_rate": 1.3576140805648136e-06, "loss": 0.023651503026485443, "memory(GiB)": 22.66, "step": 23758, "token_acc": 0.9656652360515021, "train_speed(iter/s)": 0.95665 }, { "epoch": 0.7718221096059513, "grad_norm": 0.3775678277015686, "learning_rate": 1.3572461141892067e-06, "loss": 0.02017858996987343, "memory(GiB)": 22.66, "step": 23759, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.7718545950687068, "grad_norm": 0.3033871650695801, "learning_rate": 1.3568781898558297e-06, "loss": 0.008878350257873535, "memory(GiB)": 22.66, "step": 23760, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.7718870805314622, "grad_norm": 0.39736923575401306, "learning_rate": 1.3565103075689263e-06, "loss": 0.020410005003213882, "memory(GiB)": 22.66, "step": 23761, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.7719195659942176, "grad_norm": 0.3619440197944641, "learning_rate": 1.3561424673327444e-06, "loss": 0.011586809530854225, "memory(GiB)": 22.66, "step": 23762, "token_acc": 0.983957219251337, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.771952051456973, "grad_norm": 0.42362597584724426, "learning_rate": 1.3557746691515256e-06, "loss": 0.01733573153614998, "memory(GiB)": 22.66, "step": 23763, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.7719845369197285, "grad_norm": 0.3755542039871216, "learning_rate": 1.35540691302952e-06, "loss": 0.01349673792719841, "memory(GiB)": 22.66, "step": 23764, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.7720170223824838, "grad_norm": 0.4464368224143982, "learning_rate": 1.3550391989709687e-06, "loss": 0.017628546804189682, "memory(GiB)": 22.66, "step": 23765, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.7720495078452393, "grad_norm": 0.37957605719566345, "learning_rate": 1.354671526980118e-06, "loss": 0.01049230806529522, "memory(GiB)": 22.66, "step": 23766, "token_acc": 1.0, "train_speed(iter/s)": 0.956719 }, { "epoch": 0.7720819933079947, "grad_norm": 0.22791780531406403, "learning_rate": 1.3543038970612082e-06, "loss": 0.009398601949214935, "memory(GiB)": 22.66, "step": 23767, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.7721144787707501, "grad_norm": 0.2821832597255707, "learning_rate": 1.3539363092184836e-06, "loss": 0.008201603777706623, "memory(GiB)": 22.66, "step": 23768, "token_acc": 0.9876543209876543, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.7721469642335055, "grad_norm": 0.3249114453792572, "learning_rate": 1.3535687634561867e-06, "loss": 0.011978712864220142, "memory(GiB)": 22.66, "step": 23769, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.772179449696261, "grad_norm": 0.22799205780029297, "learning_rate": 1.3532012597785605e-06, "loss": 0.012275079265236855, "memory(GiB)": 22.66, "step": 23770, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.7722119351590163, "grad_norm": 0.38545331358909607, "learning_rate": 1.3528337981898465e-06, "loss": 0.013851677067577839, "memory(GiB)": 22.66, "step": 23771, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.956756 }, { "epoch": 0.7722444206217718, "grad_norm": 0.45529794692993164, "learning_rate": 1.3524663786942832e-06, "loss": 0.009462486952543259, "memory(GiB)": 22.66, "step": 23772, "token_acc": 1.0, "train_speed(iter/s)": 0.956763 }, { "epoch": 0.7722769060845271, "grad_norm": 0.3877444565296173, "learning_rate": 1.3520990012961132e-06, "loss": 0.01631646603345871, "memory(GiB)": 22.66, "step": 23773, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.7723093915472826, "grad_norm": 0.42217275500297546, "learning_rate": 1.351731665999576e-06, "loss": 0.015154628083109856, "memory(GiB)": 22.66, "step": 23774, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956777 }, { "epoch": 0.772341877010038, "grad_norm": 0.42012518644332886, "learning_rate": 1.3513643728089127e-06, "loss": 0.015131767839193344, "memory(GiB)": 22.66, "step": 23775, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956784 }, { "epoch": 0.7723743624727935, "grad_norm": 0.3147921562194824, "learning_rate": 1.350997121728359e-06, "loss": 0.011248292401432991, "memory(GiB)": 22.66, "step": 23776, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.7724068479355488, "grad_norm": 0.4543517827987671, "learning_rate": 1.350629912762156e-06, "loss": 0.018729455769062042, "memory(GiB)": 22.66, "step": 23777, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.7724393333983043, "grad_norm": 0.35703787207603455, "learning_rate": 1.35026274591454e-06, "loss": 0.014629073441028595, "memory(GiB)": 22.66, "step": 23778, "token_acc": 1.0, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.7724718188610596, "grad_norm": 0.3663410544395447, "learning_rate": 1.3498956211897524e-06, "loss": 0.014295604079961777, "memory(GiB)": 22.66, "step": 23779, "token_acc": 1.0, "train_speed(iter/s)": 0.95681 }, { "epoch": 0.7725043043238151, "grad_norm": 0.4275028705596924, "learning_rate": 1.3495285385920253e-06, "loss": 0.016195280477404594, "memory(GiB)": 22.66, "step": 23780, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956816 }, { "epoch": 0.7725367897865705, "grad_norm": 0.415030837059021, "learning_rate": 1.3491614981255979e-06, "loss": 0.013705601915717125, "memory(GiB)": 22.66, "step": 23781, "token_acc": 1.0, "train_speed(iter/s)": 0.956823 }, { "epoch": 0.772569275249326, "grad_norm": 0.38941097259521484, "learning_rate": 1.3487944997947062e-06, "loss": 0.01654159277677536, "memory(GiB)": 22.66, "step": 23782, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956829 }, { "epoch": 0.7726017607120813, "grad_norm": 0.654521644115448, "learning_rate": 1.3484275436035871e-06, "loss": 0.013375673443078995, "memory(GiB)": 22.66, "step": 23783, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956836 }, { "epoch": 0.7726342461748368, "grad_norm": 0.4261185824871063, "learning_rate": 1.348060629556473e-06, "loss": 0.018133442848920822, "memory(GiB)": 22.66, "step": 23784, "token_acc": 1.0, "train_speed(iter/s)": 0.956842 }, { "epoch": 0.7726667316375921, "grad_norm": 0.313371479511261, "learning_rate": 1.3476937576576016e-06, "loss": 0.014910376630723476, "memory(GiB)": 22.66, "step": 23785, "token_acc": 1.0, "train_speed(iter/s)": 0.956849 }, { "epoch": 0.7726992171003476, "grad_norm": 0.3610641062259674, "learning_rate": 1.3473269279112016e-06, "loss": 0.019225478172302246, "memory(GiB)": 22.66, "step": 23786, "token_acc": 1.0, "train_speed(iter/s)": 0.956856 }, { "epoch": 0.772731702563103, "grad_norm": 0.32375988364219666, "learning_rate": 1.3469601403215132e-06, "loss": 0.01283053494989872, "memory(GiB)": 22.66, "step": 23787, "token_acc": 1.0, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.7727641880258584, "grad_norm": 0.30390989780426025, "learning_rate": 1.3465933948927678e-06, "loss": 0.015231937170028687, "memory(GiB)": 22.66, "step": 23788, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.7727966734886138, "grad_norm": 0.3284567892551422, "learning_rate": 1.346226691629196e-06, "loss": 0.01329583115875721, "memory(GiB)": 22.66, "step": 23789, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.956874 }, { "epoch": 0.7728291589513693, "grad_norm": 0.2947373688220978, "learning_rate": 1.3458600305350328e-06, "loss": 0.014313933439552784, "memory(GiB)": 22.66, "step": 23790, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.7728616444141246, "grad_norm": 0.3745361268520355, "learning_rate": 1.3454934116145052e-06, "loss": 0.014116566628217697, "memory(GiB)": 22.66, "step": 23791, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.7728941298768801, "grad_norm": 0.2614845633506775, "learning_rate": 1.345126834871851e-06, "loss": 0.0087199117988348, "memory(GiB)": 22.66, "step": 23792, "token_acc": 1.0, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.7729266153396355, "grad_norm": 0.3813294470310211, "learning_rate": 1.3447603003112963e-06, "loss": 0.013171123340725899, "memory(GiB)": 22.66, "step": 23793, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.9569 }, { "epoch": 0.7729591008023909, "grad_norm": 0.36277976632118225, "learning_rate": 1.3443938079370743e-06, "loss": 0.010535488836467266, "memory(GiB)": 22.66, "step": 23794, "token_acc": 1.0, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.7729915862651463, "grad_norm": 0.19603155553340912, "learning_rate": 1.3440273577534119e-06, "loss": 0.006506461650133133, "memory(GiB)": 22.66, "step": 23795, "token_acc": 1.0, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.7730240717279018, "grad_norm": 0.4720934331417084, "learning_rate": 1.3436609497645398e-06, "loss": 0.016179881989955902, "memory(GiB)": 22.66, "step": 23796, "token_acc": 1.0, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.7730565571906571, "grad_norm": 0.41142746806144714, "learning_rate": 1.343294583974687e-06, "loss": 0.018551494926214218, "memory(GiB)": 22.66, "step": 23797, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.7730890426534126, "grad_norm": 1.736862063407898, "learning_rate": 1.3429282603880833e-06, "loss": 0.01857023313641548, "memory(GiB)": 22.66, "step": 23798, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956931 }, { "epoch": 0.773121528116168, "grad_norm": 0.3281451165676117, "learning_rate": 1.3425619790089532e-06, "loss": 0.012335522100329399, "memory(GiB)": 22.66, "step": 23799, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.7731540135789234, "grad_norm": 0.34253889322280884, "learning_rate": 1.3421957398415263e-06, "loss": 0.011275144293904305, "memory(GiB)": 22.66, "step": 23800, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956943 }, { "epoch": 0.7731864990416788, "grad_norm": 0.3343246877193451, "learning_rate": 1.341829542890029e-06, "loss": 0.014232280664145947, "memory(GiB)": 22.66, "step": 23801, "token_acc": 1.0, "train_speed(iter/s)": 0.95695 }, { "epoch": 0.7732189845044343, "grad_norm": 0.4873185455799103, "learning_rate": 1.3414633881586876e-06, "loss": 0.013994462788105011, "memory(GiB)": 22.66, "step": 23802, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.7732514699671896, "grad_norm": 0.2375529259443283, "learning_rate": 1.34109727565173e-06, "loss": 0.007019923999905586, "memory(GiB)": 22.66, "step": 23803, "token_acc": 1.0, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.7732839554299451, "grad_norm": 0.3975568413734436, "learning_rate": 1.3407312053733778e-06, "loss": 0.015245998278260231, "memory(GiB)": 22.66, "step": 23804, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.7733164408927005, "grad_norm": 0.25415509939193726, "learning_rate": 1.3403651773278586e-06, "loss": 0.013169145211577415, "memory(GiB)": 22.66, "step": 23805, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.7733489263554559, "grad_norm": 0.3579312562942505, "learning_rate": 1.339999191519396e-06, "loss": 0.011301510035991669, "memory(GiB)": 22.66, "step": 23806, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.7733814118182113, "grad_norm": 0.4000924527645111, "learning_rate": 1.3396332479522157e-06, "loss": 0.012357449159026146, "memory(GiB)": 22.66, "step": 23807, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.7734138972809668, "grad_norm": 0.4966510832309723, "learning_rate": 1.3392673466305378e-06, "loss": 0.013291404582560062, "memory(GiB)": 22.66, "step": 23808, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.7734463827437222, "grad_norm": 0.22610941529273987, "learning_rate": 1.3389014875585875e-06, "loss": 0.007946161553263664, "memory(GiB)": 22.66, "step": 23809, "token_acc": 1.0, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.7734788682064776, "grad_norm": 0.2759111225605011, "learning_rate": 1.3385356707405867e-06, "loss": 0.011967767030000687, "memory(GiB)": 22.66, "step": 23810, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.7735113536692331, "grad_norm": 0.2587408423423767, "learning_rate": 1.3381698961807594e-06, "loss": 0.007357368245720863, "memory(GiB)": 22.66, "step": 23811, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.7735438391319884, "grad_norm": 0.2738454043865204, "learning_rate": 1.337804163883324e-06, "loss": 0.009492926299571991, "memory(GiB)": 22.66, "step": 23812, "token_acc": 1.0, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.7735763245947439, "grad_norm": 0.35101786255836487, "learning_rate": 1.3374384738525042e-06, "loss": 0.017348727211356163, "memory(GiB)": 22.66, "step": 23813, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.7736088100574993, "grad_norm": 0.4832804799079895, "learning_rate": 1.3370728260925165e-06, "loss": 0.015265941619873047, "memory(GiB)": 22.66, "step": 23814, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.7736412955202547, "grad_norm": 0.25813448429107666, "learning_rate": 1.3367072206075865e-06, "loss": 0.011676388792693615, "memory(GiB)": 22.66, "step": 23815, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.957052 }, { "epoch": 0.7736737809830101, "grad_norm": 0.20062048733234406, "learning_rate": 1.3363416574019295e-06, "loss": 0.009789954870939255, "memory(GiB)": 22.66, "step": 23816, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957061 }, { "epoch": 0.7737062664457656, "grad_norm": 0.3352705240249634, "learning_rate": 1.3359761364797659e-06, "loss": 0.011939695104956627, "memory(GiB)": 22.66, "step": 23817, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957069 }, { "epoch": 0.7737387519085209, "grad_norm": 0.34919703006744385, "learning_rate": 1.3356106578453164e-06, "loss": 0.012241438031196594, "memory(GiB)": 22.66, "step": 23818, "token_acc": 1.0, "train_speed(iter/s)": 0.957077 }, { "epoch": 0.7737712373712764, "grad_norm": 0.25735193490982056, "learning_rate": 1.3352452215027939e-06, "loss": 0.012292612344026566, "memory(GiB)": 22.66, "step": 23819, "token_acc": 1.0, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.7738037228340318, "grad_norm": 0.2800939381122589, "learning_rate": 1.3348798274564223e-06, "loss": 0.008011656813323498, "memory(GiB)": 22.66, "step": 23820, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.7738362082967872, "grad_norm": 0.3540492653846741, "learning_rate": 1.3345144757104146e-06, "loss": 0.010175962932407856, "memory(GiB)": 22.66, "step": 23821, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.957103 }, { "epoch": 0.7738686937595426, "grad_norm": 0.35332342982292175, "learning_rate": 1.3341491662689898e-06, "loss": 0.013830470852553844, "memory(GiB)": 22.66, "step": 23822, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.7739011792222981, "grad_norm": 1.005380630493164, "learning_rate": 1.3337838991363617e-06, "loss": 0.022722452878952026, "memory(GiB)": 22.66, "step": 23823, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.957119 }, { "epoch": 0.7739336646850534, "grad_norm": 0.39112335443496704, "learning_rate": 1.3334186743167465e-06, "loss": 0.011090284213423729, "memory(GiB)": 22.66, "step": 23824, "token_acc": 0.992, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.7739661501478089, "grad_norm": 0.47164714336395264, "learning_rate": 1.3330534918143607e-06, "loss": 0.02098235860466957, "memory(GiB)": 22.66, "step": 23825, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.7739986356105643, "grad_norm": 0.45369070768356323, "learning_rate": 1.3326883516334194e-06, "loss": 0.014661919325590134, "memory(GiB)": 22.66, "step": 23826, "token_acc": 1.0, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.7740311210733197, "grad_norm": 0.33355841040611267, "learning_rate": 1.3323232537781343e-06, "loss": 0.009938588365912437, "memory(GiB)": 22.66, "step": 23827, "token_acc": 1.0, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.7740636065360751, "grad_norm": 0.4238181710243225, "learning_rate": 1.3319581982527209e-06, "loss": 0.017139850184321404, "memory(GiB)": 22.66, "step": 23828, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957162 }, { "epoch": 0.7740960919988306, "grad_norm": 0.2762168049812317, "learning_rate": 1.331593185061391e-06, "loss": 0.011214099824428558, "memory(GiB)": 22.66, "step": 23829, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95717 }, { "epoch": 0.7741285774615859, "grad_norm": 0.3553289771080017, "learning_rate": 1.331228214208361e-06, "loss": 0.010837554931640625, "memory(GiB)": 22.66, "step": 23830, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.7741610629243414, "grad_norm": 0.4570712447166443, "learning_rate": 1.3308632856978382e-06, "loss": 0.014441316947340965, "memory(GiB)": 22.66, "step": 23831, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957185 }, { "epoch": 0.7741935483870968, "grad_norm": 0.3197297751903534, "learning_rate": 1.330498399534037e-06, "loss": 0.01490425318479538, "memory(GiB)": 22.66, "step": 23832, "token_acc": 1.0, "train_speed(iter/s)": 0.957192 }, { "epoch": 0.7742260338498522, "grad_norm": 0.3154655396938324, "learning_rate": 1.3301335557211686e-06, "loss": 0.013148708269000053, "memory(GiB)": 22.66, "step": 23833, "token_acc": 1.0, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.7742585193126076, "grad_norm": 0.26349785923957825, "learning_rate": 1.3297687542634435e-06, "loss": 0.005775664933025837, "memory(GiB)": 22.66, "step": 23834, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.7742910047753631, "grad_norm": 0.4335389733314514, "learning_rate": 1.3294039951650733e-06, "loss": 0.018549740314483643, "memory(GiB)": 22.66, "step": 23835, "token_acc": 1.0, "train_speed(iter/s)": 0.957212 }, { "epoch": 0.7743234902381184, "grad_norm": 0.8068276047706604, "learning_rate": 1.3290392784302653e-06, "loss": 0.016755543649196625, "memory(GiB)": 22.66, "step": 23836, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.7743559757008739, "grad_norm": 0.5104459524154663, "learning_rate": 1.3286746040632298e-06, "loss": 0.015058322809636593, "memory(GiB)": 22.66, "step": 23837, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957226 }, { "epoch": 0.7743884611636292, "grad_norm": 0.36810198426246643, "learning_rate": 1.3283099720681758e-06, "loss": 0.0139852873980999, "memory(GiB)": 22.66, "step": 23838, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957232 }, { "epoch": 0.7744209466263847, "grad_norm": 0.4916931688785553, "learning_rate": 1.3279453824493138e-06, "loss": 0.012446423061192036, "memory(GiB)": 22.66, "step": 23839, "token_acc": 1.0, "train_speed(iter/s)": 0.957239 }, { "epoch": 0.7744534320891401, "grad_norm": 0.3440159857273102, "learning_rate": 1.3275808352108477e-06, "loss": 0.010675197467207909, "memory(GiB)": 22.66, "step": 23840, "token_acc": 1.0, "train_speed(iter/s)": 0.957245 }, { "epoch": 0.7744859175518956, "grad_norm": 0.3434845209121704, "learning_rate": 1.3272163303569879e-06, "loss": 0.013489630073308945, "memory(GiB)": 22.66, "step": 23841, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957251 }, { "epoch": 0.7745184030146509, "grad_norm": 0.39107558131217957, "learning_rate": 1.326851867891938e-06, "loss": 0.02130686119198799, "memory(GiB)": 22.66, "step": 23842, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.957258 }, { "epoch": 0.7745508884774064, "grad_norm": 0.22214022278785706, "learning_rate": 1.3264874478199086e-06, "loss": 0.008668884634971619, "memory(GiB)": 22.66, "step": 23843, "token_acc": 1.0, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.7745833739401617, "grad_norm": 0.36455440521240234, "learning_rate": 1.3261230701451028e-06, "loss": 0.013410413637757301, "memory(GiB)": 22.66, "step": 23844, "token_acc": 1.0, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.7746158594029172, "grad_norm": 0.36527156829833984, "learning_rate": 1.3257587348717277e-06, "loss": 0.01380913332104683, "memory(GiB)": 22.66, "step": 23845, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957278 }, { "epoch": 0.7746483448656726, "grad_norm": 0.29690343141555786, "learning_rate": 1.3253944420039855e-06, "loss": 0.009787680581212044, "memory(GiB)": 22.66, "step": 23846, "token_acc": 1.0, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.774680830328428, "grad_norm": 0.38148486614227295, "learning_rate": 1.3250301915460812e-06, "loss": 0.0106922946870327, "memory(GiB)": 22.66, "step": 23847, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95729 }, { "epoch": 0.7747133157911834, "grad_norm": 0.2907322645187378, "learning_rate": 1.3246659835022225e-06, "loss": 0.010764628648757935, "memory(GiB)": 22.66, "step": 23848, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.7747458012539389, "grad_norm": 0.32520419359207153, "learning_rate": 1.3243018178766093e-06, "loss": 0.012577529065310955, "memory(GiB)": 22.66, "step": 23849, "token_acc": 1.0, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.7747782867166942, "grad_norm": 0.2735075056552887, "learning_rate": 1.3239376946734468e-06, "loss": 0.0099081601947546, "memory(GiB)": 22.66, "step": 23850, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.7748107721794497, "grad_norm": 0.3836458921432495, "learning_rate": 1.323573613896934e-06, "loss": 0.01907801255583763, "memory(GiB)": 22.66, "step": 23851, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.7748432576422051, "grad_norm": 0.2660740911960602, "learning_rate": 1.323209575551276e-06, "loss": 0.014797426760196686, "memory(GiB)": 22.66, "step": 23852, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.7748757431049605, "grad_norm": 0.37660059332847595, "learning_rate": 1.3228455796406726e-06, "loss": 0.009019446559250355, "memory(GiB)": 22.66, "step": 23853, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957325 }, { "epoch": 0.7749082285677159, "grad_norm": 0.35543060302734375, "learning_rate": 1.3224816261693269e-06, "loss": 0.01301608420908451, "memory(GiB)": 22.66, "step": 23854, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957332 }, { "epoch": 0.7749407140304714, "grad_norm": 0.4581502676010132, "learning_rate": 1.3221177151414365e-06, "loss": 0.012146169319748878, "memory(GiB)": 22.66, "step": 23855, "token_acc": 1.0, "train_speed(iter/s)": 0.957339 }, { "epoch": 0.7749731994932267, "grad_norm": 0.21336044371128082, "learning_rate": 1.3217538465612034e-06, "loss": 0.008364556357264519, "memory(GiB)": 22.66, "step": 23856, "token_acc": 1.0, "train_speed(iter/s)": 0.957345 }, { "epoch": 0.7750056849559822, "grad_norm": 0.2910759747028351, "learning_rate": 1.3213900204328262e-06, "loss": 0.011721665039658546, "memory(GiB)": 22.66, "step": 23857, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.7750381704187376, "grad_norm": 0.2896900177001953, "learning_rate": 1.3210262367605064e-06, "loss": 0.011012467555701733, "memory(GiB)": 22.66, "step": 23858, "token_acc": 1.0, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.775070655881493, "grad_norm": 0.3842952251434326, "learning_rate": 1.3206624955484387e-06, "loss": 0.012532735243439674, "memory(GiB)": 22.66, "step": 23859, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.7751031413442484, "grad_norm": 0.5097101330757141, "learning_rate": 1.320298796800823e-06, "loss": 0.024635011330246925, "memory(GiB)": 22.66, "step": 23860, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.7751356268070039, "grad_norm": 0.3300434648990631, "learning_rate": 1.3199351405218568e-06, "loss": 0.011939676478505135, "memory(GiB)": 22.66, "step": 23861, "token_acc": 0.9966555183946488, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.7751681122697592, "grad_norm": 0.2904595732688904, "learning_rate": 1.319571526715739e-06, "loss": 0.009453472681343555, "memory(GiB)": 22.66, "step": 23862, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.7752005977325147, "grad_norm": 0.33813533186912537, "learning_rate": 1.3192079553866627e-06, "loss": 0.011572827585041523, "memory(GiB)": 22.66, "step": 23863, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957393 }, { "epoch": 0.7752330831952701, "grad_norm": 0.36594659090042114, "learning_rate": 1.3188444265388256e-06, "loss": 0.023904770612716675, "memory(GiB)": 22.66, "step": 23864, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957399 }, { "epoch": 0.7752655686580255, "grad_norm": 0.24660086631774902, "learning_rate": 1.3184809401764237e-06, "loss": 0.010593647137284279, "memory(GiB)": 22.66, "step": 23865, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957406 }, { "epoch": 0.7752980541207809, "grad_norm": 0.3056897819042206, "learning_rate": 1.3181174963036514e-06, "loss": 0.008554846048355103, "memory(GiB)": 22.66, "step": 23866, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957412 }, { "epoch": 0.7753305395835364, "grad_norm": 0.40950116515159607, "learning_rate": 1.317754094924706e-06, "loss": 0.019407927989959717, "memory(GiB)": 22.66, "step": 23867, "token_acc": 0.9936507936507937, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.7753630250462917, "grad_norm": 0.4014989733695984, "learning_rate": 1.3173907360437771e-06, "loss": 0.009728674776852131, "memory(GiB)": 22.66, "step": 23868, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.7753955105090472, "grad_norm": 0.317213237285614, "learning_rate": 1.3170274196650628e-06, "loss": 0.012233491986989975, "memory(GiB)": 22.66, "step": 23869, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.7754279959718026, "grad_norm": 0.4828876554965973, "learning_rate": 1.316664145792751e-06, "loss": 0.0172068253159523, "memory(GiB)": 22.66, "step": 23870, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.957438 }, { "epoch": 0.775460481434558, "grad_norm": 0.2763768434524536, "learning_rate": 1.3163009144310402e-06, "loss": 0.010619113221764565, "memory(GiB)": 22.66, "step": 23871, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.7754929668973135, "grad_norm": 0.2886180877685547, "learning_rate": 1.3159377255841183e-06, "loss": 0.009362132288515568, "memory(GiB)": 22.66, "step": 23872, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.7755254523600689, "grad_norm": 0.2722863554954529, "learning_rate": 1.3155745792561802e-06, "loss": 0.006326260045170784, "memory(GiB)": 22.66, "step": 23873, "token_acc": 1.0, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.7755579378228243, "grad_norm": 0.40923750400543213, "learning_rate": 1.3152114754514139e-06, "loss": 0.020125536248087883, "memory(GiB)": 22.66, "step": 23874, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.7755904232855797, "grad_norm": 0.2933204472064972, "learning_rate": 1.3148484141740114e-06, "loss": 0.011024540290236473, "memory(GiB)": 22.66, "step": 23875, "token_acc": 1.0, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.7756229087483352, "grad_norm": 0.3245788514614105, "learning_rate": 1.3144853954281634e-06, "loss": 0.012034084647893906, "memory(GiB)": 22.66, "step": 23876, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.7756553942110905, "grad_norm": 0.2987709939479828, "learning_rate": 1.3141224192180607e-06, "loss": 0.01166872214525938, "memory(GiB)": 22.66, "step": 23877, "token_acc": 1.0, "train_speed(iter/s)": 0.957497 }, { "epoch": 0.775687879673846, "grad_norm": 0.3368816375732422, "learning_rate": 1.3137594855478896e-06, "loss": 0.016958683729171753, "memory(GiB)": 22.66, "step": 23878, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.7757203651366014, "grad_norm": 0.2923904359340668, "learning_rate": 1.3133965944218408e-06, "loss": 0.012762246653437614, "memory(GiB)": 22.66, "step": 23879, "token_acc": 1.0, "train_speed(iter/s)": 0.957514 }, { "epoch": 0.7757528505993568, "grad_norm": 0.3983921408653259, "learning_rate": 1.3130337458441017e-06, "loss": 0.013044741004705429, "memory(GiB)": 22.66, "step": 23880, "token_acc": 0.9897435897435898, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.7757853360621122, "grad_norm": 0.45133671164512634, "learning_rate": 1.312670939818861e-06, "loss": 0.016130249947309494, "memory(GiB)": 22.66, "step": 23881, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957531 }, { "epoch": 0.7758178215248677, "grad_norm": 0.41286545991897583, "learning_rate": 1.3123081763503065e-06, "loss": 0.012482577003538609, "memory(GiB)": 22.66, "step": 23882, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957539 }, { "epoch": 0.775850306987623, "grad_norm": 0.25910019874572754, "learning_rate": 1.3119454554426225e-06, "loss": 0.007532668765634298, "memory(GiB)": 22.66, "step": 23883, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.7758827924503785, "grad_norm": 0.3148263096809387, "learning_rate": 1.3115827770999973e-06, "loss": 0.00734589621424675, "memory(GiB)": 22.66, "step": 23884, "token_acc": 1.0, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.7759152779131339, "grad_norm": 0.32259485125541687, "learning_rate": 1.3112201413266156e-06, "loss": 0.011786982417106628, "memory(GiB)": 22.66, "step": 23885, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957565 }, { "epoch": 0.7759477633758893, "grad_norm": 0.4124419093132019, "learning_rate": 1.310857548126665e-06, "loss": 0.01609889231622219, "memory(GiB)": 22.66, "step": 23886, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.7759802488386447, "grad_norm": 0.24176611006259918, "learning_rate": 1.3104949975043268e-06, "loss": 0.008465399965643883, "memory(GiB)": 22.66, "step": 23887, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.7760127343014002, "grad_norm": 0.49836239218711853, "learning_rate": 1.3101324894637867e-06, "loss": 0.01961055025458336, "memory(GiB)": 22.66, "step": 23888, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.7760452197641555, "grad_norm": 0.294644296169281, "learning_rate": 1.3097700240092293e-06, "loss": 0.012950284406542778, "memory(GiB)": 22.66, "step": 23889, "token_acc": 0.996, "train_speed(iter/s)": 0.957599 }, { "epoch": 0.776077705226911, "grad_norm": 0.26521429419517517, "learning_rate": 1.3094076011448393e-06, "loss": 0.007913241162896156, "memory(GiB)": 22.66, "step": 23890, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.7761101906896664, "grad_norm": 0.32267946004867554, "learning_rate": 1.3090452208747955e-06, "loss": 0.011170506477355957, "memory(GiB)": 22.66, "step": 23891, "token_acc": 1.0, "train_speed(iter/s)": 0.957616 }, { "epoch": 0.7761426761524218, "grad_norm": 0.28674113750457764, "learning_rate": 1.3086828832032849e-06, "loss": 0.01252415869385004, "memory(GiB)": 22.66, "step": 23892, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.957624 }, { "epoch": 0.7761751616151772, "grad_norm": 0.3404502272605896, "learning_rate": 1.3083205881344834e-06, "loss": 0.017048722133040428, "memory(GiB)": 22.66, "step": 23893, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.7762076470779327, "grad_norm": 0.40505290031433105, "learning_rate": 1.3079583356725778e-06, "loss": 0.013947696425020695, "memory(GiB)": 22.66, "step": 23894, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.776240132540688, "grad_norm": 0.31112435460090637, "learning_rate": 1.307596125821748e-06, "loss": 0.009527429938316345, "memory(GiB)": 22.66, "step": 23895, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957644 }, { "epoch": 0.7762726180034435, "grad_norm": 0.4441365599632263, "learning_rate": 1.3072339585861727e-06, "loss": 0.011115651577711105, "memory(GiB)": 22.66, "step": 23896, "token_acc": 1.0, "train_speed(iter/s)": 0.957651 }, { "epoch": 0.7763051034661989, "grad_norm": 0.25138458609580994, "learning_rate": 1.306871833970033e-06, "loss": 0.00923990085721016, "memory(GiB)": 22.66, "step": 23897, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.7763375889289543, "grad_norm": 0.3378007411956787, "learning_rate": 1.306509751977506e-06, "loss": 0.013492649421095848, "memory(GiB)": 22.66, "step": 23898, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.7763700743917097, "grad_norm": 0.3927701711654663, "learning_rate": 1.3061477126127753e-06, "loss": 0.012543347664177418, "memory(GiB)": 22.66, "step": 23899, "token_acc": 0.9806763285024155, "train_speed(iter/s)": 0.957671 }, { "epoch": 0.7764025598544652, "grad_norm": 0.33789655566215515, "learning_rate": 1.3057857158800147e-06, "loss": 0.007928352802991867, "memory(GiB)": 22.66, "step": 23900, "token_acc": 1.0, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.7764350453172205, "grad_norm": 0.44641759991645813, "learning_rate": 1.3054237617834055e-06, "loss": 0.01316146831959486, "memory(GiB)": 22.66, "step": 23901, "token_acc": 0.996, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.776467530779976, "grad_norm": 0.3220674991607666, "learning_rate": 1.3050618503271217e-06, "loss": 0.014021788723766804, "memory(GiB)": 22.66, "step": 23902, "token_acc": 1.0, "train_speed(iter/s)": 0.957691 }, { "epoch": 0.7765000162427314, "grad_norm": 0.44621771574020386, "learning_rate": 1.3046999815153427e-06, "loss": 0.01574617065489292, "memory(GiB)": 22.66, "step": 23903, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.7765325017054868, "grad_norm": 0.6430235505104065, "learning_rate": 1.304338155352244e-06, "loss": 0.022019565105438232, "memory(GiB)": 22.66, "step": 23904, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.7765649871682422, "grad_norm": 0.4126751720905304, "learning_rate": 1.3039763718420033e-06, "loss": 0.014612707309424877, "memory(GiB)": 22.66, "step": 23905, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.95771 }, { "epoch": 0.7765974726309977, "grad_norm": 0.4010677933692932, "learning_rate": 1.3036146309887937e-06, "loss": 0.01859932765364647, "memory(GiB)": 22.66, "step": 23906, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.776629958093753, "grad_norm": 0.276059627532959, "learning_rate": 1.3032529327967908e-06, "loss": 0.00961139053106308, "memory(GiB)": 22.66, "step": 23907, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.7766624435565085, "grad_norm": 0.37056881189346313, "learning_rate": 1.302891277270169e-06, "loss": 0.00955161266028881, "memory(GiB)": 22.66, "step": 23908, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.7766949290192638, "grad_norm": 0.3448004424571991, "learning_rate": 1.3025296644131035e-06, "loss": 0.014193445444107056, "memory(GiB)": 22.66, "step": 23909, "token_acc": 1.0, "train_speed(iter/s)": 0.957738 }, { "epoch": 0.7767274144820193, "grad_norm": 0.28312844038009644, "learning_rate": 1.3021680942297682e-06, "loss": 0.009869803674519062, "memory(GiB)": 22.66, "step": 23910, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957745 }, { "epoch": 0.7767598999447747, "grad_norm": 0.37900906801223755, "learning_rate": 1.3018065667243329e-06, "loss": 0.014673753641545773, "memory(GiB)": 22.66, "step": 23911, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.7767923854075302, "grad_norm": 0.4263213276863098, "learning_rate": 1.3014450819009727e-06, "loss": 0.015113096684217453, "memory(GiB)": 22.66, "step": 23912, "token_acc": 1.0, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.7768248708702855, "grad_norm": 0.3808249235153198, "learning_rate": 1.3010836397638587e-06, "loss": 0.011290637776255608, "memory(GiB)": 22.66, "step": 23913, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.776857356333041, "grad_norm": 0.29866939783096313, "learning_rate": 1.3007222403171644e-06, "loss": 0.012099795043468475, "memory(GiB)": 22.66, "step": 23914, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.7768898417957963, "grad_norm": 0.2873542308807373, "learning_rate": 1.3003608835650577e-06, "loss": 0.009997391141951084, "memory(GiB)": 22.66, "step": 23915, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957773 }, { "epoch": 0.7769223272585518, "grad_norm": 0.34394127130508423, "learning_rate": 1.2999995695117107e-06, "loss": 0.014030816033482552, "memory(GiB)": 22.66, "step": 23916, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.7769548127213072, "grad_norm": 0.39079293608665466, "learning_rate": 1.299638298161293e-06, "loss": 0.019252043217420578, "memory(GiB)": 22.66, "step": 23917, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.7769872981840626, "grad_norm": 0.34353208541870117, "learning_rate": 1.2992770695179762e-06, "loss": 0.01222233660519123, "memory(GiB)": 22.66, "step": 23918, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957792 }, { "epoch": 0.777019783646818, "grad_norm": 0.3475226163864136, "learning_rate": 1.2989158835859267e-06, "loss": 0.011601557955145836, "memory(GiB)": 22.66, "step": 23919, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.7770522691095735, "grad_norm": 0.34448692202568054, "learning_rate": 1.298554740369315e-06, "loss": 0.011673318222165108, "memory(GiB)": 22.66, "step": 23920, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957806 }, { "epoch": 0.7770847545723288, "grad_norm": 0.3237465023994446, "learning_rate": 1.298193639872305e-06, "loss": 0.012415310367941856, "memory(GiB)": 22.66, "step": 23921, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957813 }, { "epoch": 0.7771172400350843, "grad_norm": 0.3417164981365204, "learning_rate": 1.2978325820990706e-06, "loss": 0.016525644809007645, "memory(GiB)": 22.66, "step": 23922, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.7771497254978397, "grad_norm": 0.3229386806488037, "learning_rate": 1.2974715670537747e-06, "loss": 0.010044189170002937, "memory(GiB)": 22.66, "step": 23923, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957826 }, { "epoch": 0.7771822109605951, "grad_norm": 0.570282518863678, "learning_rate": 1.297110594740586e-06, "loss": 0.012149663642048836, "memory(GiB)": 22.66, "step": 23924, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.7772146964233505, "grad_norm": 0.3608863055706024, "learning_rate": 1.2967496651636684e-06, "loss": 0.009871200658380985, "memory(GiB)": 22.66, "step": 23925, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957839 }, { "epoch": 0.777247181886106, "grad_norm": 0.40805038809776306, "learning_rate": 1.2963887783271867e-06, "loss": 0.016184929758310318, "memory(GiB)": 22.66, "step": 23926, "token_acc": 1.0, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.7772796673488613, "grad_norm": 0.6446544528007507, "learning_rate": 1.2960279342353115e-06, "loss": 0.015277241356670856, "memory(GiB)": 22.66, "step": 23927, "token_acc": 0.9886792452830189, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.7773121528116168, "grad_norm": 0.2985512316226959, "learning_rate": 1.2956671328922022e-06, "loss": 0.011701308190822601, "memory(GiB)": 22.66, "step": 23928, "token_acc": 1.0, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.7773446382743722, "grad_norm": 0.3216041624546051, "learning_rate": 1.295306374302026e-06, "loss": 0.010693390853703022, "memory(GiB)": 22.66, "step": 23929, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957866 }, { "epoch": 0.7773771237371276, "grad_norm": 0.35389217734336853, "learning_rate": 1.294945658468944e-06, "loss": 0.014273766428232193, "memory(GiB)": 22.66, "step": 23930, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.777409609199883, "grad_norm": 0.37077292799949646, "learning_rate": 1.2945849853971205e-06, "loss": 0.013932440429925919, "memory(GiB)": 22.66, "step": 23931, "token_acc": 1.0, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.7774420946626385, "grad_norm": 0.37234002351760864, "learning_rate": 1.294224355090718e-06, "loss": 0.012230182066559792, "memory(GiB)": 22.66, "step": 23932, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.7774745801253938, "grad_norm": 0.5361704230308533, "learning_rate": 1.2938637675539007e-06, "loss": 0.01385934092104435, "memory(GiB)": 22.66, "step": 23933, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.7775070655881493, "grad_norm": 0.5860753655433655, "learning_rate": 1.2935032227908262e-06, "loss": 0.01980123668909073, "memory(GiB)": 22.66, "step": 23934, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957905 }, { "epoch": 0.7775395510509047, "grad_norm": 0.29725101590156555, "learning_rate": 1.293142720805658e-06, "loss": 0.009246233850717545, "memory(GiB)": 22.66, "step": 23935, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.7775720365136601, "grad_norm": 0.3211905062198639, "learning_rate": 1.292782261602557e-06, "loss": 0.010264377109706402, "memory(GiB)": 22.66, "step": 23936, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.7776045219764156, "grad_norm": 0.2712010443210602, "learning_rate": 1.292421845185684e-06, "loss": 0.009444662369787693, "memory(GiB)": 22.66, "step": 23937, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.777637007439171, "grad_norm": 0.3708738684654236, "learning_rate": 1.2920614715591967e-06, "loss": 0.012747322209179401, "memory(GiB)": 22.66, "step": 23938, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.7776694929019264, "grad_norm": 0.43480822443962097, "learning_rate": 1.291701140727255e-06, "loss": 0.011185301467776299, "memory(GiB)": 22.66, "step": 23939, "token_acc": 1.0, "train_speed(iter/s)": 0.957947 }, { "epoch": 0.7777019783646818, "grad_norm": 0.3969971239566803, "learning_rate": 1.291340852694018e-06, "loss": 0.01824246719479561, "memory(GiB)": 22.66, "step": 23940, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.7777344638274373, "grad_norm": 0.3931281864643097, "learning_rate": 1.2909806074636439e-06, "loss": 0.010489949956536293, "memory(GiB)": 22.66, "step": 23941, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.7777669492901926, "grad_norm": 0.27933570742607117, "learning_rate": 1.2906204050402914e-06, "loss": 0.00853393878787756, "memory(GiB)": 22.66, "step": 23942, "token_acc": 1.0, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.7777994347529481, "grad_norm": 0.38289859890937805, "learning_rate": 1.2902602454281155e-06, "loss": 0.011230668053030968, "memory(GiB)": 22.66, "step": 23943, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957978 }, { "epoch": 0.7778319202157035, "grad_norm": 0.38365113735198975, "learning_rate": 1.2899001286312735e-06, "loss": 0.013292575255036354, "memory(GiB)": 22.66, "step": 23944, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.7778644056784589, "grad_norm": 0.24820305407047272, "learning_rate": 1.2895400546539227e-06, "loss": 0.007411108817905188, "memory(GiB)": 22.66, "step": 23945, "token_acc": 1.0, "train_speed(iter/s)": 0.957995 }, { "epoch": 0.7778968911412143, "grad_norm": 0.377114474773407, "learning_rate": 1.2891800235002195e-06, "loss": 0.01023431308567524, "memory(GiB)": 22.66, "step": 23946, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.7779293766039698, "grad_norm": 0.4715174436569214, "learning_rate": 1.2888200351743164e-06, "loss": 0.022353045642375946, "memory(GiB)": 22.66, "step": 23947, "token_acc": 1.0, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.7779618620667251, "grad_norm": 0.3575628995895386, "learning_rate": 1.2884600896803712e-06, "loss": 0.011732839047908783, "memory(GiB)": 22.66, "step": 23948, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.95802 }, { "epoch": 0.7779943475294806, "grad_norm": 0.37399306893348694, "learning_rate": 1.2881001870225336e-06, "loss": 0.01464197225868702, "memory(GiB)": 22.66, "step": 23949, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.778026832992236, "grad_norm": 0.31523773074150085, "learning_rate": 1.2877403272049632e-06, "loss": 0.014709343202412128, "memory(GiB)": 22.66, "step": 23950, "token_acc": 1.0, "train_speed(iter/s)": 0.958037 }, { "epoch": 0.7780593184549914, "grad_norm": 0.39542654156684875, "learning_rate": 1.2873805102318094e-06, "loss": 0.013380076736211777, "memory(GiB)": 22.66, "step": 23951, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.7780918039177468, "grad_norm": 0.3885122537612915, "learning_rate": 1.2870207361072273e-06, "loss": 0.014341025613248348, "memory(GiB)": 22.66, "step": 23952, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.958054 }, { "epoch": 0.7781242893805023, "grad_norm": 0.22132526338100433, "learning_rate": 1.2866610048353662e-06, "loss": 0.006448104977607727, "memory(GiB)": 22.66, "step": 23953, "token_acc": 1.0, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.7781567748432576, "grad_norm": 0.28394144773483276, "learning_rate": 1.2863013164203797e-06, "loss": 0.012592908926308155, "memory(GiB)": 22.66, "step": 23954, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.7781892603060131, "grad_norm": 0.4388715922832489, "learning_rate": 1.2859416708664186e-06, "loss": 0.017060086131095886, "memory(GiB)": 22.66, "step": 23955, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.958078 }, { "epoch": 0.7782217457687685, "grad_norm": 0.3870369493961334, "learning_rate": 1.2855820681776342e-06, "loss": 0.012142818421125412, "memory(GiB)": 22.66, "step": 23956, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.7782542312315239, "grad_norm": 0.5716797709465027, "learning_rate": 1.2852225083581782e-06, "loss": 0.019041206687688828, "memory(GiB)": 22.66, "step": 23957, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.7782867166942793, "grad_norm": 0.2624063789844513, "learning_rate": 1.2848629914121974e-06, "loss": 0.006693248637020588, "memory(GiB)": 22.66, "step": 23958, "token_acc": 1.0, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.7783192021570348, "grad_norm": 0.3994187116622925, "learning_rate": 1.2845035173438425e-06, "loss": 0.009467262774705887, "memory(GiB)": 22.66, "step": 23959, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958103 }, { "epoch": 0.7783516876197901, "grad_norm": 0.5124155282974243, "learning_rate": 1.2841440861572624e-06, "loss": 0.01864255592226982, "memory(GiB)": 22.66, "step": 23960, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.958109 }, { "epoch": 0.7783841730825456, "grad_norm": 0.9161720871925354, "learning_rate": 1.2837846978566065e-06, "loss": 0.016812369227409363, "memory(GiB)": 22.66, "step": 23961, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.778416658545301, "grad_norm": 0.24795037508010864, "learning_rate": 1.28342535244602e-06, "loss": 0.007859707809984684, "memory(GiB)": 22.66, "step": 23962, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.7784491440080564, "grad_norm": 0.28078585863113403, "learning_rate": 1.2830660499296521e-06, "loss": 0.010209543630480766, "memory(GiB)": 22.66, "step": 23963, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.958126 }, { "epoch": 0.7784816294708118, "grad_norm": 0.27694258093833923, "learning_rate": 1.2827067903116485e-06, "loss": 0.009219867177307606, "memory(GiB)": 22.66, "step": 23964, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.958132 }, { "epoch": 0.7785141149335673, "grad_norm": 0.33622145652770996, "learning_rate": 1.2823475735961582e-06, "loss": 0.01105952262878418, "memory(GiB)": 22.66, "step": 23965, "token_acc": 1.0, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.7785466003963226, "grad_norm": 0.3794378340244293, "learning_rate": 1.2819883997873238e-06, "loss": 0.016350893303751945, "memory(GiB)": 22.66, "step": 23966, "token_acc": 1.0, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.7785790858590781, "grad_norm": 0.3706114888191223, "learning_rate": 1.2816292688892916e-06, "loss": 0.013883383944630623, "memory(GiB)": 22.66, "step": 23967, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958151 }, { "epoch": 0.7786115713218335, "grad_norm": 0.2994513511657715, "learning_rate": 1.2812701809062073e-06, "loss": 0.010538761503994465, "memory(GiB)": 22.66, "step": 23968, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.7786440567845889, "grad_norm": 0.3907572329044342, "learning_rate": 1.2809111358422156e-06, "loss": 0.014059348963201046, "memory(GiB)": 22.66, "step": 23969, "token_acc": 1.0, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.7786765422473443, "grad_norm": 0.3266039192676544, "learning_rate": 1.2805521337014582e-06, "loss": 0.010437903925776482, "memory(GiB)": 22.66, "step": 23970, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.958171 }, { "epoch": 0.7787090277100998, "grad_norm": 0.31202125549316406, "learning_rate": 1.2801931744880797e-06, "loss": 0.013239777646958828, "memory(GiB)": 22.66, "step": 23971, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.7787415131728551, "grad_norm": 0.4480076730251312, "learning_rate": 1.2798342582062229e-06, "loss": 0.01186671108007431, "memory(GiB)": 22.66, "step": 23972, "token_acc": 0.9968652037617555, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.7787739986356106, "grad_norm": 0.3566930592060089, "learning_rate": 1.2794753848600305e-06, "loss": 0.01664125919342041, "memory(GiB)": 22.66, "step": 23973, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.778806484098366, "grad_norm": 1.519752860069275, "learning_rate": 1.279116554453646e-06, "loss": 0.013602621853351593, "memory(GiB)": 22.66, "step": 23974, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958193 }, { "epoch": 0.7788389695611214, "grad_norm": 0.29939723014831543, "learning_rate": 1.2787577669912065e-06, "loss": 0.012850168161094189, "memory(GiB)": 22.66, "step": 23975, "token_acc": 1.0, "train_speed(iter/s)": 0.958198 }, { "epoch": 0.7788714550238768, "grad_norm": 0.41605958342552185, "learning_rate": 1.2783990224768577e-06, "loss": 0.01545125711709261, "memory(GiB)": 22.66, "step": 23976, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958204 }, { "epoch": 0.7789039404866323, "grad_norm": 0.3519226908683777, "learning_rate": 1.278040320914734e-06, "loss": 0.01485910452902317, "memory(GiB)": 22.66, "step": 23977, "token_acc": 0.9939759036144579, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.7789364259493876, "grad_norm": 0.3390732705593109, "learning_rate": 1.2776816623089826e-06, "loss": 0.016039060428738594, "memory(GiB)": 22.66, "step": 23978, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.7789689114121431, "grad_norm": 0.4090333878993988, "learning_rate": 1.2773230466637371e-06, "loss": 0.01202383916825056, "memory(GiB)": 22.66, "step": 23979, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958221 }, { "epoch": 0.7790013968748984, "grad_norm": 0.27773550152778625, "learning_rate": 1.2769644739831405e-06, "loss": 0.00856622215360403, "memory(GiB)": 22.66, "step": 23980, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.7790338823376539, "grad_norm": 0.5207536220550537, "learning_rate": 1.2766059442713275e-06, "loss": 0.019993336871266365, "memory(GiB)": 22.66, "step": 23981, "token_acc": 0.9967213114754099, "train_speed(iter/s)": 0.958234 }, { "epoch": 0.7790663678004093, "grad_norm": 0.4095587432384491, "learning_rate": 1.2762474575324373e-06, "loss": 0.013467448763549328, "memory(GiB)": 22.66, "step": 23982, "token_acc": 1.0, "train_speed(iter/s)": 0.958241 }, { "epoch": 0.7790988532631647, "grad_norm": 0.33460837602615356, "learning_rate": 1.2758890137706088e-06, "loss": 0.008797685615718365, "memory(GiB)": 22.66, "step": 23983, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.958248 }, { "epoch": 0.7791313387259201, "grad_norm": 0.3912627398967743, "learning_rate": 1.2755306129899786e-06, "loss": 0.015163286589086056, "memory(GiB)": 22.66, "step": 23984, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.958254 }, { "epoch": 0.7791638241886756, "grad_norm": 0.34180065989494324, "learning_rate": 1.275172255194681e-06, "loss": 0.01119093969464302, "memory(GiB)": 22.66, "step": 23985, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958261 }, { "epoch": 0.7791963096514309, "grad_norm": 0.21355171501636505, "learning_rate": 1.2748139403888515e-06, "loss": 0.009666329249739647, "memory(GiB)": 22.66, "step": 23986, "token_acc": 1.0, "train_speed(iter/s)": 0.958268 }, { "epoch": 0.7792287951141864, "grad_norm": 0.3211008906364441, "learning_rate": 1.2744556685766308e-06, "loss": 0.010527156293392181, "memory(GiB)": 22.66, "step": 23987, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.7792612805769418, "grad_norm": 0.4437575340270996, "learning_rate": 1.2740974397621485e-06, "loss": 0.017294030636548996, "memory(GiB)": 22.66, "step": 23988, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.7792937660396972, "grad_norm": 0.2637085020542145, "learning_rate": 1.273739253949543e-06, "loss": 0.015109332278370857, "memory(GiB)": 22.66, "step": 23989, "token_acc": 1.0, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.7793262515024526, "grad_norm": 0.46707049012184143, "learning_rate": 1.273381111142944e-06, "loss": 0.011194809339940548, "memory(GiB)": 22.66, "step": 23990, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958293 }, { "epoch": 0.7793587369652081, "grad_norm": 0.4144552946090698, "learning_rate": 1.2730230113464876e-06, "loss": 0.018534183502197266, "memory(GiB)": 22.66, "step": 23991, "token_acc": 1.0, "train_speed(iter/s)": 0.958301 }, { "epoch": 0.7793912224279634, "grad_norm": 0.19665086269378662, "learning_rate": 1.272664954564306e-06, "loss": 0.0065253800712525845, "memory(GiB)": 22.66, "step": 23992, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.95831 }, { "epoch": 0.7794237078907189, "grad_norm": 0.3708096444606781, "learning_rate": 1.2723069408005334e-06, "loss": 0.01538475975394249, "memory(GiB)": 22.66, "step": 23993, "token_acc": 0.9785407725321889, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.7794561933534743, "grad_norm": 0.327832967042923, "learning_rate": 1.2719489700592985e-06, "loss": 0.012295257300138474, "memory(GiB)": 22.66, "step": 23994, "token_acc": 1.0, "train_speed(iter/s)": 0.958327 }, { "epoch": 0.7794886788162297, "grad_norm": 0.3871544599533081, "learning_rate": 1.2715910423447347e-06, "loss": 0.013107042759656906, "memory(GiB)": 22.66, "step": 23995, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958335 }, { "epoch": 0.7795211642789851, "grad_norm": 1.1929051876068115, "learning_rate": 1.2712331576609732e-06, "loss": 0.017184382304549217, "memory(GiB)": 22.66, "step": 23996, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958343 }, { "epoch": 0.7795536497417406, "grad_norm": 0.38271915912628174, "learning_rate": 1.2708753160121452e-06, "loss": 0.014907457865774632, "memory(GiB)": 22.66, "step": 23997, "token_acc": 1.0, "train_speed(iter/s)": 0.958352 }, { "epoch": 0.7795861352044959, "grad_norm": 0.4512614607810974, "learning_rate": 1.2705175174023776e-06, "loss": 0.017282569780945778, "memory(GiB)": 22.66, "step": 23998, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.95836 }, { "epoch": 0.7796186206672514, "grad_norm": 0.30194374918937683, "learning_rate": 1.2701597618358042e-06, "loss": 0.011221787892282009, "memory(GiB)": 22.66, "step": 23999, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958369 }, { "epoch": 0.7796511061300069, "grad_norm": 0.3445185720920563, "learning_rate": 1.2698020493165475e-06, "loss": 0.013609342277050018, "memory(GiB)": 22.66, "step": 24000, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958377 }, { "epoch": 0.7796511061300069, "eval_loss": 0.01292478758841753, "eval_runtime": 80.1108, "eval_samples_per_second": 124.203, "eval_steps_per_second": 3.882, "eval_token_acc": 0.9948105169404703, "step": 24000 }, { "epoch": 0.7796835915927622, "grad_norm": 0.22266165912151337, "learning_rate": 1.2694443798487438e-06, "loss": 0.009482831694185734, "memory(GiB)": 22.66, "step": 24001, "token_acc": 0.9945319994518296, "train_speed(iter/s)": 0.954902 }, { "epoch": 0.7797160770555177, "grad_norm": 0.38435590267181396, "learning_rate": 1.2690867534365154e-06, "loss": 0.011727276258170605, "memory(GiB)": 22.66, "step": 24002, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.954907 }, { "epoch": 0.7797485625182731, "grad_norm": 0.31107378005981445, "learning_rate": 1.2687291700839922e-06, "loss": 0.011604802682995796, "memory(GiB)": 22.66, "step": 24003, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.954913 }, { "epoch": 0.7797810479810285, "grad_norm": 0.37168288230895996, "learning_rate": 1.2683716297953003e-06, "loss": 0.01469756755977869, "memory(GiB)": 22.66, "step": 24004, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.954918 }, { "epoch": 0.7798135334437839, "grad_norm": 0.7831718921661377, "learning_rate": 1.2680141325745666e-06, "loss": 0.013981575146317482, "memory(GiB)": 22.66, "step": 24005, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.954924 }, { "epoch": 0.7798460189065394, "grad_norm": 0.2620677947998047, "learning_rate": 1.2676566784259186e-06, "loss": 0.010687001049518585, "memory(GiB)": 22.66, "step": 24006, "token_acc": 1.0, "train_speed(iter/s)": 0.95493 }, { "epoch": 0.7798785043692947, "grad_norm": 0.2508063018321991, "learning_rate": 1.267299267353479e-06, "loss": 0.008903510868549347, "memory(GiB)": 22.66, "step": 24007, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.954936 }, { "epoch": 0.7799109898320502, "grad_norm": 0.4745854139328003, "learning_rate": 1.2669418993613753e-06, "loss": 0.020063484087586403, "memory(GiB)": 22.66, "step": 24008, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.954941 }, { "epoch": 0.7799434752948056, "grad_norm": 0.3463887870311737, "learning_rate": 1.2665845744537274e-06, "loss": 0.01252184621989727, "memory(GiB)": 22.66, "step": 24009, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.954946 }, { "epoch": 0.779975960757561, "grad_norm": 0.3672451376914978, "learning_rate": 1.2662272926346663e-06, "loss": 0.013477502390742302, "memory(GiB)": 22.66, "step": 24010, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.954952 }, { "epoch": 0.7800084462203164, "grad_norm": 0.3868640065193176, "learning_rate": 1.2658700539083102e-06, "loss": 0.013396945782005787, "memory(GiB)": 22.66, "step": 24011, "token_acc": 1.0, "train_speed(iter/s)": 0.954958 }, { "epoch": 0.7800409316830719, "grad_norm": 0.3518381714820862, "learning_rate": 1.265512858278785e-06, "loss": 0.015316592529416084, "memory(GiB)": 22.66, "step": 24012, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.954964 }, { "epoch": 0.7800734171458272, "grad_norm": 0.16817204654216766, "learning_rate": 1.2651557057502113e-06, "loss": 0.009342166595160961, "memory(GiB)": 22.66, "step": 24013, "token_acc": 1.0, "train_speed(iter/s)": 0.954968 }, { "epoch": 0.7801059026085827, "grad_norm": 0.35531947016716003, "learning_rate": 1.2647985963267111e-06, "loss": 0.009964363649487495, "memory(GiB)": 22.66, "step": 24014, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.954973 }, { "epoch": 0.7801383880713381, "grad_norm": 0.3194860517978668, "learning_rate": 1.264441530012407e-06, "loss": 0.014756394550204277, "memory(GiB)": 22.66, "step": 24015, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95498 }, { "epoch": 0.7801708735340935, "grad_norm": 0.7152141332626343, "learning_rate": 1.2640845068114205e-06, "loss": 0.011735816486179829, "memory(GiB)": 22.66, "step": 24016, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.954986 }, { "epoch": 0.7802033589968489, "grad_norm": 0.3944947421550751, "learning_rate": 1.2637275267278704e-06, "loss": 0.010800285264849663, "memory(GiB)": 22.66, "step": 24017, "token_acc": 1.0, "train_speed(iter/s)": 0.954993 }, { "epoch": 0.7802358444596044, "grad_norm": 0.3690544068813324, "learning_rate": 1.2633705897658776e-06, "loss": 0.012518839910626411, "memory(GiB)": 22.66, "step": 24018, "token_acc": 1.0, "train_speed(iter/s)": 0.954999 }, { "epoch": 0.7802683299223597, "grad_norm": 0.31298795342445374, "learning_rate": 1.2630136959295613e-06, "loss": 0.01557097677141428, "memory(GiB)": 22.66, "step": 24019, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955005 }, { "epoch": 0.7803008153851152, "grad_norm": 0.4375324547290802, "learning_rate": 1.2626568452230414e-06, "loss": 0.009056555107235909, "memory(GiB)": 22.66, "step": 24020, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955012 }, { "epoch": 0.7803333008478706, "grad_norm": 0.34789979457855225, "learning_rate": 1.2623000376504368e-06, "loss": 0.015508659183979034, "memory(GiB)": 22.66, "step": 24021, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955019 }, { "epoch": 0.780365786310626, "grad_norm": 0.41883742809295654, "learning_rate": 1.2619432732158637e-06, "loss": 0.01751333847641945, "memory(GiB)": 22.66, "step": 24022, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955025 }, { "epoch": 0.7803982717733814, "grad_norm": 0.3807373642921448, "learning_rate": 1.2615865519234406e-06, "loss": 0.01704910397529602, "memory(GiB)": 22.66, "step": 24023, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955033 }, { "epoch": 0.7804307572361369, "grad_norm": 0.3085692822933197, "learning_rate": 1.2612298737772837e-06, "loss": 0.01417006365954876, "memory(GiB)": 22.66, "step": 24024, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955041 }, { "epoch": 0.7804632426988922, "grad_norm": 0.3387264013290405, "learning_rate": 1.2608732387815126e-06, "loss": 0.012604694813489914, "memory(GiB)": 22.66, "step": 24025, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955049 }, { "epoch": 0.7804957281616477, "grad_norm": 0.3153633177280426, "learning_rate": 1.260516646940239e-06, "loss": 0.007952660322189331, "memory(GiB)": 22.66, "step": 24026, "token_acc": 1.0, "train_speed(iter/s)": 0.955057 }, { "epoch": 0.7805282136244031, "grad_norm": 0.43781450390815735, "learning_rate": 1.2601600982575811e-06, "loss": 0.017572080716490746, "memory(GiB)": 22.66, "step": 24027, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955064 }, { "epoch": 0.7805606990871585, "grad_norm": 0.27516940236091614, "learning_rate": 1.2598035927376534e-06, "loss": 0.009994269348680973, "memory(GiB)": 22.66, "step": 24028, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955072 }, { "epoch": 0.7805931845499139, "grad_norm": 0.5176787972450256, "learning_rate": 1.2594471303845718e-06, "loss": 0.019171439111232758, "memory(GiB)": 22.66, "step": 24029, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.7806256700126694, "grad_norm": 0.21758121252059937, "learning_rate": 1.259090711202447e-06, "loss": 0.010800869204103947, "memory(GiB)": 22.66, "step": 24030, "token_acc": 1.0, "train_speed(iter/s)": 0.955087 }, { "epoch": 0.7806581554754247, "grad_norm": 0.33847010135650635, "learning_rate": 1.2587343351953962e-06, "loss": 0.012702882289886475, "memory(GiB)": 22.66, "step": 24031, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955095 }, { "epoch": 0.7806906409381802, "grad_norm": 0.2509092092514038, "learning_rate": 1.2583780023675286e-06, "loss": 0.013176771812140942, "memory(GiB)": 22.66, "step": 24032, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955103 }, { "epoch": 0.7807231264009356, "grad_norm": 0.8109124898910522, "learning_rate": 1.2580217127229594e-06, "loss": 0.01944883167743683, "memory(GiB)": 22.66, "step": 24033, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.955111 }, { "epoch": 0.780755611863691, "grad_norm": 0.3203507959842682, "learning_rate": 1.2576654662658022e-06, "loss": 0.01233682967722416, "memory(GiB)": 22.66, "step": 24034, "token_acc": 1.0, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.7807880973264464, "grad_norm": 0.4742402136325836, "learning_rate": 1.2573092630001648e-06, "loss": 0.018321659415960312, "memory(GiB)": 22.66, "step": 24035, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955128 }, { "epoch": 0.7808205827892019, "grad_norm": 0.35197749733924866, "learning_rate": 1.2569531029301617e-06, "loss": 0.016611233353614807, "memory(GiB)": 22.66, "step": 24036, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955136 }, { "epoch": 0.7808530682519572, "grad_norm": 0.33158254623413086, "learning_rate": 1.2565969860598992e-06, "loss": 0.01015527080744505, "memory(GiB)": 22.66, "step": 24037, "token_acc": 1.0, "train_speed(iter/s)": 0.955144 }, { "epoch": 0.7808855537147127, "grad_norm": 0.4126090705394745, "learning_rate": 1.256240912393492e-06, "loss": 0.01652980037033558, "memory(GiB)": 22.66, "step": 24038, "token_acc": 1.0, "train_speed(iter/s)": 0.955152 }, { "epoch": 0.780918039177468, "grad_norm": 0.28090935945510864, "learning_rate": 1.2558848819350473e-06, "loss": 0.010614296421408653, "memory(GiB)": 22.66, "step": 24039, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955161 }, { "epoch": 0.7809505246402235, "grad_norm": 0.36673277616500854, "learning_rate": 1.2555288946886751e-06, "loss": 0.014192294329404831, "memory(GiB)": 22.66, "step": 24040, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955168 }, { "epoch": 0.7809830101029789, "grad_norm": 0.2985195219516754, "learning_rate": 1.255172950658482e-06, "loss": 0.010802702978253365, "memory(GiB)": 22.66, "step": 24041, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955176 }, { "epoch": 0.7810154955657344, "grad_norm": 0.36141934990882874, "learning_rate": 1.2548170498485772e-06, "loss": 0.012619070708751678, "memory(GiB)": 22.66, "step": 24042, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.7810479810284897, "grad_norm": 0.342447429895401, "learning_rate": 1.254461192263069e-06, "loss": 0.011935537680983543, "memory(GiB)": 22.66, "step": 24043, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.7810804664912452, "grad_norm": 0.4563666582107544, "learning_rate": 1.2541053779060647e-06, "loss": 0.016145009547472, "memory(GiB)": 22.66, "step": 24044, "token_acc": 1.0, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.7811129519540005, "grad_norm": 0.3465754985809326, "learning_rate": 1.253749606781669e-06, "loss": 0.01678634062409401, "memory(GiB)": 22.66, "step": 24045, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955205 }, { "epoch": 0.781145437416756, "grad_norm": 0.3803204596042633, "learning_rate": 1.2533938788939892e-06, "loss": 0.01254388689994812, "memory(GiB)": 22.66, "step": 24046, "token_acc": 1.0, "train_speed(iter/s)": 0.955213 }, { "epoch": 0.7811779228795114, "grad_norm": 0.33528831601142883, "learning_rate": 1.253038194247131e-06, "loss": 0.01499126199632883, "memory(GiB)": 22.66, "step": 24047, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.7812104083422668, "grad_norm": 0.3394627571105957, "learning_rate": 1.2526825528451986e-06, "loss": 0.008635284379124641, "memory(GiB)": 22.66, "step": 24048, "token_acc": 1.0, "train_speed(iter/s)": 0.955229 }, { "epoch": 0.7812428938050222, "grad_norm": 0.42159396409988403, "learning_rate": 1.2523269546922995e-06, "loss": 0.012484581209719181, "memory(GiB)": 22.66, "step": 24049, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.7812753792677777, "grad_norm": 0.2917146384716034, "learning_rate": 1.251971399792533e-06, "loss": 0.013413907960057259, "memory(GiB)": 22.66, "step": 24050, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955245 }, { "epoch": 0.781307864730533, "grad_norm": 0.9364640712738037, "learning_rate": 1.2516158881500058e-06, "loss": 0.011482587084174156, "memory(GiB)": 22.66, "step": 24051, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955253 }, { "epoch": 0.7813403501932885, "grad_norm": 0.39492231607437134, "learning_rate": 1.2512604197688206e-06, "loss": 0.015324290841817856, "memory(GiB)": 22.66, "step": 24052, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955262 }, { "epoch": 0.7813728356560439, "grad_norm": 0.2555699944496155, "learning_rate": 1.2509049946530805e-06, "loss": 0.009923441335558891, "memory(GiB)": 22.66, "step": 24053, "token_acc": 0.996551724137931, "train_speed(iter/s)": 0.955269 }, { "epoch": 0.7814053211187993, "grad_norm": 0.38220515847206116, "learning_rate": 1.2505496128068856e-06, "loss": 0.012049257755279541, "memory(GiB)": 22.66, "step": 24054, "token_acc": 1.0, "train_speed(iter/s)": 0.955276 }, { "epoch": 0.7814378065815547, "grad_norm": 0.43953266739845276, "learning_rate": 1.2501942742343388e-06, "loss": 0.017614558339118958, "memory(GiB)": 22.66, "step": 24055, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.7814702920443102, "grad_norm": 0.2874963879585266, "learning_rate": 1.249838978939541e-06, "loss": 0.009195031598210335, "memory(GiB)": 22.66, "step": 24056, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.955289 }, { "epoch": 0.7815027775070655, "grad_norm": 0.3567129373550415, "learning_rate": 1.2494837269265942e-06, "loss": 0.015190396457910538, "memory(GiB)": 22.66, "step": 24057, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955297 }, { "epoch": 0.781535262969821, "grad_norm": 0.46843501925468445, "learning_rate": 1.249128518199596e-06, "loss": 0.02201054058969021, "memory(GiB)": 22.66, "step": 24058, "token_acc": 1.0, "train_speed(iter/s)": 0.955304 }, { "epoch": 0.7815677484325764, "grad_norm": 0.34368449449539185, "learning_rate": 1.2487733527626483e-06, "loss": 0.009252157062292099, "memory(GiB)": 22.66, "step": 24059, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95531 }, { "epoch": 0.7816002338953318, "grad_norm": 0.33498457074165344, "learning_rate": 1.2484182306198456e-06, "loss": 0.012521402910351753, "memory(GiB)": 22.66, "step": 24060, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955316 }, { "epoch": 0.7816327193580872, "grad_norm": 0.3540501296520233, "learning_rate": 1.2480631517752933e-06, "loss": 0.013969587162137032, "memory(GiB)": 22.66, "step": 24061, "token_acc": 1.0, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.7816652048208427, "grad_norm": 0.35490134358406067, "learning_rate": 1.2477081162330846e-06, "loss": 0.012489277869462967, "memory(GiB)": 22.66, "step": 24062, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955329 }, { "epoch": 0.781697690283598, "grad_norm": 0.3782983720302582, "learning_rate": 1.2473531239973203e-06, "loss": 0.017609726637601852, "memory(GiB)": 22.66, "step": 24063, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955334 }, { "epoch": 0.7817301757463535, "grad_norm": 0.46866998076438904, "learning_rate": 1.2469981750720932e-06, "loss": 0.00926467590034008, "memory(GiB)": 22.66, "step": 24064, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.781762661209109, "grad_norm": 0.33284255862236023, "learning_rate": 1.246643269461502e-06, "loss": 0.01319099310785532, "memory(GiB)": 22.66, "step": 24065, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955345 }, { "epoch": 0.7817951466718643, "grad_norm": 0.29499325156211853, "learning_rate": 1.2462884071696457e-06, "loss": 0.010476777330040932, "memory(GiB)": 22.66, "step": 24066, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.955351 }, { "epoch": 0.7818276321346198, "grad_norm": 0.36667773127555847, "learning_rate": 1.2459335882006158e-06, "loss": 0.011780526489019394, "memory(GiB)": 22.66, "step": 24067, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.7818601175973752, "grad_norm": 0.2699456512928009, "learning_rate": 1.2455788125585104e-06, "loss": 0.008481270633637905, "memory(GiB)": 22.66, "step": 24068, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.7818926030601306, "grad_norm": 0.4259924590587616, "learning_rate": 1.2452240802474213e-06, "loss": 0.014301438815891743, "memory(GiB)": 22.66, "step": 24069, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955368 }, { "epoch": 0.781925088522886, "grad_norm": 0.3172123432159424, "learning_rate": 1.2448693912714437e-06, "loss": 0.013722885400056839, "memory(GiB)": 22.66, "step": 24070, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955374 }, { "epoch": 0.7819575739856415, "grad_norm": 0.3286609947681427, "learning_rate": 1.2445147456346718e-06, "loss": 0.011563075706362724, "memory(GiB)": 22.66, "step": 24071, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95538 }, { "epoch": 0.7819900594483968, "grad_norm": 0.6110519766807556, "learning_rate": 1.2441601433411993e-06, "loss": 0.015623383224010468, "memory(GiB)": 22.66, "step": 24072, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.7820225449111523, "grad_norm": 0.3562787175178528, "learning_rate": 1.243805584395117e-06, "loss": 0.016338825225830078, "memory(GiB)": 22.66, "step": 24073, "token_acc": 1.0, "train_speed(iter/s)": 0.955391 }, { "epoch": 0.7820550303739077, "grad_norm": 0.26425135135650635, "learning_rate": 1.2434510688005169e-06, "loss": 0.008947769179940224, "memory(GiB)": 22.66, "step": 24074, "token_acc": 1.0, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.7820875158366631, "grad_norm": 0.45949143171310425, "learning_rate": 1.2430965965614915e-06, "loss": 0.012785207480192184, "memory(GiB)": 22.66, "step": 24075, "token_acc": 1.0, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.7821200012994185, "grad_norm": 0.3164941668510437, "learning_rate": 1.2427421676821338e-06, "loss": 0.011492043733596802, "memory(GiB)": 22.66, "step": 24076, "token_acc": 1.0, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.782152486762174, "grad_norm": 0.3631606698036194, "learning_rate": 1.2423877821665304e-06, "loss": 0.009580718353390694, "memory(GiB)": 22.66, "step": 24077, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.955416 }, { "epoch": 0.7821849722249293, "grad_norm": 0.25023511052131653, "learning_rate": 1.242033440018774e-06, "loss": 0.009852915070950985, "memory(GiB)": 22.66, "step": 24078, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.7822174576876848, "grad_norm": 0.232905313372612, "learning_rate": 1.241679141242953e-06, "loss": 0.012314091436564922, "memory(GiB)": 22.66, "step": 24079, "token_acc": 1.0, "train_speed(iter/s)": 0.955416 }, { "epoch": 0.7822499431504402, "grad_norm": 0.34458738565444946, "learning_rate": 1.241324885843157e-06, "loss": 0.01191677711904049, "memory(GiB)": 22.66, "step": 24080, "token_acc": 0.9963898916967509, "train_speed(iter/s)": 0.955423 }, { "epoch": 0.7822824286131956, "grad_norm": 0.3436889946460724, "learning_rate": 1.2409706738234762e-06, "loss": 0.012957578524947166, "memory(GiB)": 22.66, "step": 24081, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.95543 }, { "epoch": 0.782314914075951, "grad_norm": 0.3088352084159851, "learning_rate": 1.2406165051879959e-06, "loss": 0.011699195019900799, "memory(GiB)": 22.66, "step": 24082, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955439 }, { "epoch": 0.7823473995387065, "grad_norm": 0.38443633913993835, "learning_rate": 1.2402623799408042e-06, "loss": 0.009459340944886208, "memory(GiB)": 22.66, "step": 24083, "token_acc": 1.0, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.7823798850014618, "grad_norm": 0.43951064348220825, "learning_rate": 1.2399082980859895e-06, "loss": 0.01308361440896988, "memory(GiB)": 22.66, "step": 24084, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.955455 }, { "epoch": 0.7824123704642173, "grad_norm": 0.290547251701355, "learning_rate": 1.2395542596276389e-06, "loss": 0.0120835080742836, "memory(GiB)": 22.66, "step": 24085, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.7824448559269727, "grad_norm": 0.3230758309364319, "learning_rate": 1.239200264569836e-06, "loss": 0.011982245370745659, "memory(GiB)": 22.66, "step": 24086, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.7824773413897281, "grad_norm": 0.434352308511734, "learning_rate": 1.238846312916669e-06, "loss": 0.01925649866461754, "memory(GiB)": 22.66, "step": 24087, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.7825098268524835, "grad_norm": 0.3136923015117645, "learning_rate": 1.2384924046722185e-06, "loss": 0.01296199019998312, "memory(GiB)": 22.66, "step": 24088, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955486 }, { "epoch": 0.782542312315239, "grad_norm": 0.2821536064147949, "learning_rate": 1.2381385398405754e-06, "loss": 0.01028430461883545, "memory(GiB)": 22.66, "step": 24089, "token_acc": 1.0, "train_speed(iter/s)": 0.955494 }, { "epoch": 0.7825747977779943, "grad_norm": 0.3845894932746887, "learning_rate": 1.237784718425819e-06, "loss": 0.015234562568366528, "memory(GiB)": 22.66, "step": 24090, "token_acc": 1.0, "train_speed(iter/s)": 0.955503 }, { "epoch": 0.7826072832407498, "grad_norm": 0.2795824706554413, "learning_rate": 1.2374309404320355e-06, "loss": 0.01475046668201685, "memory(GiB)": 22.66, "step": 24091, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.7826397687035052, "grad_norm": 0.32257238030433655, "learning_rate": 1.2370772058633063e-06, "loss": 0.012397677637636662, "memory(GiB)": 22.66, "step": 24092, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.7826722541662606, "grad_norm": 0.21625453233718872, "learning_rate": 1.236723514723714e-06, "loss": 0.007986290380358696, "memory(GiB)": 22.66, "step": 24093, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.782704739629016, "grad_norm": 0.3269542455673218, "learning_rate": 1.2363698670173413e-06, "loss": 0.010975348763167858, "memory(GiB)": 22.66, "step": 24094, "token_acc": 0.9875, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.7827372250917715, "grad_norm": 0.3752276301383972, "learning_rate": 1.2360162627482702e-06, "loss": 0.017875347286462784, "memory(GiB)": 22.66, "step": 24095, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955545 }, { "epoch": 0.7827697105545268, "grad_norm": 0.5278077125549316, "learning_rate": 1.2356627019205819e-06, "loss": 0.022951219230890274, "memory(GiB)": 22.66, "step": 24096, "token_acc": 0.9935064935064936, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.7828021960172823, "grad_norm": 0.3691008388996124, "learning_rate": 1.2353091845383558e-06, "loss": 0.015755679458379745, "memory(GiB)": 22.66, "step": 24097, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.7828346814800377, "grad_norm": 0.35844308137893677, "learning_rate": 1.234955710605672e-06, "loss": 0.015799175947904587, "memory(GiB)": 22.66, "step": 24098, "token_acc": 1.0, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.7828671669427931, "grad_norm": 0.2647567689418793, "learning_rate": 1.2346022801266105e-06, "loss": 0.010581667535007, "memory(GiB)": 22.66, "step": 24099, "token_acc": 1.0, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.7828996524055485, "grad_norm": 0.459041953086853, "learning_rate": 1.234248893105252e-06, "loss": 0.021972553804516792, "memory(GiB)": 22.66, "step": 24100, "token_acc": 0.9745762711864406, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.782932137868304, "grad_norm": 0.31119996309280396, "learning_rate": 1.2338955495456722e-06, "loss": 0.014518212527036667, "memory(GiB)": 22.66, "step": 24101, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.7829646233310593, "grad_norm": 0.23910757899284363, "learning_rate": 1.23354224945195e-06, "loss": 0.013550244271755219, "memory(GiB)": 22.66, "step": 24102, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.7829971087938148, "grad_norm": 0.3680485785007477, "learning_rate": 1.2331889928281638e-06, "loss": 0.012543493881821632, "memory(GiB)": 22.66, "step": 24103, "token_acc": 0.996, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.7830295942565701, "grad_norm": 0.3366619944572449, "learning_rate": 1.2328357796783914e-06, "loss": 0.01075267605483532, "memory(GiB)": 22.66, "step": 24104, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.7830620797193256, "grad_norm": 0.3061065077781677, "learning_rate": 1.2324826100067067e-06, "loss": 0.01182107999920845, "memory(GiB)": 22.66, "step": 24105, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.783094565182081, "grad_norm": 0.7190432548522949, "learning_rate": 1.2321294838171877e-06, "loss": 0.013394927605986595, "memory(GiB)": 22.66, "step": 24106, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.7831270506448365, "grad_norm": 0.33387553691864014, "learning_rate": 1.2317764011139088e-06, "loss": 0.015115748159587383, "memory(GiB)": 22.66, "step": 24107, "token_acc": 1.0, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.7831595361075918, "grad_norm": 0.4031096398830414, "learning_rate": 1.2314233619009476e-06, "loss": 0.015583735890686512, "memory(GiB)": 22.66, "step": 24108, "token_acc": 1.0, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.7831920215703473, "grad_norm": 0.249100461602211, "learning_rate": 1.2310703661823753e-06, "loss": 0.010968299582600594, "memory(GiB)": 22.66, "step": 24109, "token_acc": 1.0, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.7832245070331026, "grad_norm": 6.064192771911621, "learning_rate": 1.2307174139622674e-06, "loss": 0.019926635548472404, "memory(GiB)": 22.66, "step": 24110, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.7832569924958581, "grad_norm": 0.29779401421546936, "learning_rate": 1.2303645052446972e-06, "loss": 0.005819229409098625, "memory(GiB)": 22.66, "step": 24111, "token_acc": 1.0, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.7832894779586135, "grad_norm": 0.2800537645816803, "learning_rate": 1.2300116400337385e-06, "loss": 0.010100550949573517, "memory(GiB)": 22.66, "step": 24112, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.783321963421369, "grad_norm": 0.40838533639907837, "learning_rate": 1.2296588183334646e-06, "loss": 0.01601993851363659, "memory(GiB)": 22.66, "step": 24113, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955698 }, { "epoch": 0.7833544488841243, "grad_norm": 0.27525126934051514, "learning_rate": 1.2293060401479446e-06, "loss": 0.015926871448755264, "memory(GiB)": 22.66, "step": 24114, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.7833869343468798, "grad_norm": 0.4005848467350006, "learning_rate": 1.2289533054812531e-06, "loss": 0.01612708903849125, "memory(GiB)": 22.66, "step": 24115, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.7834194198096351, "grad_norm": 0.40701907873153687, "learning_rate": 1.2286006143374567e-06, "loss": 0.014501017518341541, "memory(GiB)": 22.66, "step": 24116, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.955724 }, { "epoch": 0.7834519052723906, "grad_norm": 0.39348942041397095, "learning_rate": 1.228247966720632e-06, "loss": 0.017678476870059967, "memory(GiB)": 22.66, "step": 24117, "token_acc": 1.0, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.783484390735146, "grad_norm": 0.2743315100669861, "learning_rate": 1.227895362634845e-06, "loss": 0.011491895653307438, "memory(GiB)": 22.66, "step": 24118, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.7835168761979014, "grad_norm": 0.4103340804576874, "learning_rate": 1.2275428020841669e-06, "loss": 0.01790093258023262, "memory(GiB)": 22.66, "step": 24119, "token_acc": 0.984, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.7835493616606568, "grad_norm": 0.282214879989624, "learning_rate": 1.2271902850726647e-06, "loss": 0.011657405644655228, "memory(GiB)": 22.66, "step": 24120, "token_acc": 1.0, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.7835818471234123, "grad_norm": 0.4580567181110382, "learning_rate": 1.2268378116044083e-06, "loss": 0.016861001029610634, "memory(GiB)": 22.66, "step": 24121, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.7836143325861676, "grad_norm": 0.38399091362953186, "learning_rate": 1.226485381683466e-06, "loss": 0.013935107737779617, "memory(GiB)": 22.66, "step": 24122, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95577 }, { "epoch": 0.7836468180489231, "grad_norm": 0.4623676836490631, "learning_rate": 1.2261329953139057e-06, "loss": 0.015461307018995285, "memory(GiB)": 22.66, "step": 24123, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.7836793035116785, "grad_norm": 0.1893339604139328, "learning_rate": 1.2257806524997927e-06, "loss": 0.005988840479403734, "memory(GiB)": 22.66, "step": 24124, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.7837117889744339, "grad_norm": 0.38067805767059326, "learning_rate": 1.2254283532451944e-06, "loss": 0.018358632922172546, "memory(GiB)": 22.66, "step": 24125, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.955787 }, { "epoch": 0.7837442744371893, "grad_norm": 0.37658193707466125, "learning_rate": 1.2250760975541765e-06, "loss": 0.0116268340498209, "memory(GiB)": 22.66, "step": 24126, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.7837767598999448, "grad_norm": 0.5996602177619934, "learning_rate": 1.2247238854308052e-06, "loss": 0.014927576296031475, "memory(GiB)": 22.66, "step": 24127, "token_acc": 1.0, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.7838092453627002, "grad_norm": 0.3021543323993683, "learning_rate": 1.2243717168791463e-06, "loss": 0.011595591902732849, "memory(GiB)": 22.66, "step": 24128, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.7838417308254556, "grad_norm": 0.41659536957740784, "learning_rate": 1.2240195919032616e-06, "loss": 0.016110185533761978, "memory(GiB)": 22.66, "step": 24129, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95581 }, { "epoch": 0.7838742162882111, "grad_norm": 0.5789144039154053, "learning_rate": 1.223667510507217e-06, "loss": 0.013219386339187622, "memory(GiB)": 22.66, "step": 24130, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.7839067017509664, "grad_norm": 0.46816498041152954, "learning_rate": 1.2233154726950752e-06, "loss": 0.011392354965209961, "memory(GiB)": 22.66, "step": 24131, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.7839391872137219, "grad_norm": 0.2862231135368347, "learning_rate": 1.2229634784709017e-06, "loss": 0.008451041765511036, "memory(GiB)": 22.66, "step": 24132, "token_acc": 1.0, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.7839716726764773, "grad_norm": 0.31220078468322754, "learning_rate": 1.222611527838755e-06, "loss": 0.010760713368654251, "memory(GiB)": 22.66, "step": 24133, "token_acc": 1.0, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.7840041581392327, "grad_norm": 0.46074891090393066, "learning_rate": 1.2222596208026988e-06, "loss": 0.010331790894269943, "memory(GiB)": 22.66, "step": 24134, "token_acc": 1.0, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.7840366436019881, "grad_norm": 0.3603791296482086, "learning_rate": 1.2219077573667953e-06, "loss": 0.01917439140379429, "memory(GiB)": 22.66, "step": 24135, "token_acc": 0.981203007518797, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.7840691290647436, "grad_norm": 0.37644582986831665, "learning_rate": 1.221555937535106e-06, "loss": 0.018133897334337234, "memory(GiB)": 22.66, "step": 24136, "token_acc": 1.0, "train_speed(iter/s)": 0.95585 }, { "epoch": 0.7841016145274989, "grad_norm": 0.485929399728775, "learning_rate": 1.2212041613116887e-06, "loss": 0.018402883782982826, "memory(GiB)": 22.66, "step": 24137, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955857 }, { "epoch": 0.7841340999902544, "grad_norm": 0.377178430557251, "learning_rate": 1.2208524287006062e-06, "loss": 0.01334321778267622, "memory(GiB)": 22.66, "step": 24138, "token_acc": 1.0, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.7841665854530098, "grad_norm": 0.3199599087238312, "learning_rate": 1.2205007397059143e-06, "loss": 0.009516345337033272, "memory(GiB)": 22.66, "step": 24139, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.7841990709157652, "grad_norm": 0.3818913698196411, "learning_rate": 1.2201490943316773e-06, "loss": 0.010994397103786469, "memory(GiB)": 22.66, "step": 24140, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.7842315563785206, "grad_norm": 0.3514178395271301, "learning_rate": 1.2197974925819489e-06, "loss": 0.014497542753815651, "memory(GiB)": 22.66, "step": 24141, "token_acc": 0.995, "train_speed(iter/s)": 0.955879 }, { "epoch": 0.7842640418412761, "grad_norm": 0.41305968165397644, "learning_rate": 1.219445934460789e-06, "loss": 0.015891602262854576, "memory(GiB)": 22.66, "step": 24142, "token_acc": 1.0, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.7842965273040314, "grad_norm": 0.4331229627132416, "learning_rate": 1.2190944199722565e-06, "loss": 0.01475103572010994, "memory(GiB)": 22.66, "step": 24143, "token_acc": 1.0, "train_speed(iter/s)": 0.955894 }, { "epoch": 0.7843290127667869, "grad_norm": 0.3018326759338379, "learning_rate": 1.218742949120404e-06, "loss": 0.01263286080211401, "memory(GiB)": 22.66, "step": 24144, "token_acc": 1.0, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.7843614982295423, "grad_norm": 0.40042391419410706, "learning_rate": 1.2183915219092934e-06, "loss": 0.021121911704540253, "memory(GiB)": 22.66, "step": 24145, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.7843939836922977, "grad_norm": 0.23999936878681183, "learning_rate": 1.2180401383429763e-06, "loss": 0.010201705619692802, "memory(GiB)": 22.66, "step": 24146, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95592 }, { "epoch": 0.7844264691550531, "grad_norm": 0.2976580262184143, "learning_rate": 1.2176887984255115e-06, "loss": 0.010546550154685974, "memory(GiB)": 22.66, "step": 24147, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.7844589546178086, "grad_norm": 0.3124324381351471, "learning_rate": 1.2173375021609507e-06, "loss": 0.01134304329752922, "memory(GiB)": 22.66, "step": 24148, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955936 }, { "epoch": 0.7844914400805639, "grad_norm": 0.2506733536720276, "learning_rate": 1.2169862495533496e-06, "loss": 0.008344865404069424, "memory(GiB)": 22.66, "step": 24149, "token_acc": 1.0, "train_speed(iter/s)": 0.955945 }, { "epoch": 0.7845239255433194, "grad_norm": 0.48815175890922546, "learning_rate": 1.216635040606763e-06, "loss": 0.016953278332948685, "memory(GiB)": 22.66, "step": 24150, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.7845564110060748, "grad_norm": 0.27088120579719543, "learning_rate": 1.216283875325245e-06, "loss": 0.00705002574250102, "memory(GiB)": 22.66, "step": 24151, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955962 }, { "epoch": 0.7845888964688302, "grad_norm": 0.37403541803359985, "learning_rate": 1.2159327537128451e-06, "loss": 0.009569603949785233, "memory(GiB)": 22.66, "step": 24152, "token_acc": 1.0, "train_speed(iter/s)": 0.95597 }, { "epoch": 0.7846213819315856, "grad_norm": 0.3360881209373474, "learning_rate": 1.2155816757736182e-06, "loss": 0.014038750901818275, "memory(GiB)": 22.66, "step": 24153, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955979 }, { "epoch": 0.7846538673943411, "grad_norm": 0.2578682601451874, "learning_rate": 1.2152306415116154e-06, "loss": 0.010047387331724167, "memory(GiB)": 22.66, "step": 24154, "token_acc": 0.9940119760479041, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.7846863528570964, "grad_norm": 0.4222753942012787, "learning_rate": 1.2148796509308908e-06, "loss": 0.011718301102519035, "memory(GiB)": 22.66, "step": 24155, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.955996 }, { "epoch": 0.7847188383198519, "grad_norm": 0.24500414729118347, "learning_rate": 1.2145287040354908e-06, "loss": 0.009359665215015411, "memory(GiB)": 22.66, "step": 24156, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.7847513237826073, "grad_norm": 0.2787693738937378, "learning_rate": 1.2141778008294686e-06, "loss": 0.013308383524417877, "memory(GiB)": 22.66, "step": 24157, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.7847838092453627, "grad_norm": 0.24813170731067657, "learning_rate": 1.213826941316873e-06, "loss": 0.008725391700863838, "memory(GiB)": 22.66, "step": 24158, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.7848162947081181, "grad_norm": 0.4403923451900482, "learning_rate": 1.2134761255017542e-06, "loss": 0.013997347094118595, "memory(GiB)": 22.66, "step": 24159, "token_acc": 1.0, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.7848487801708736, "grad_norm": 0.32604873180389404, "learning_rate": 1.213125353388162e-06, "loss": 0.013360686600208282, "memory(GiB)": 22.66, "step": 24160, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.7848812656336289, "grad_norm": 0.310913622379303, "learning_rate": 1.212774624980142e-06, "loss": 0.011716127395629883, "memory(GiB)": 22.66, "step": 24161, "token_acc": 1.0, "train_speed(iter/s)": 0.956047 }, { "epoch": 0.7849137510963844, "grad_norm": 0.32217180728912354, "learning_rate": 1.212423940281744e-06, "loss": 0.012557038106024265, "memory(GiB)": 22.66, "step": 24162, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.7849462365591398, "grad_norm": 0.289002925157547, "learning_rate": 1.2120732992970147e-06, "loss": 0.012064510025084019, "memory(GiB)": 22.66, "step": 24163, "token_acc": 1.0, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.7849787220218952, "grad_norm": 0.6142279505729675, "learning_rate": 1.2117227020300027e-06, "loss": 0.015451077371835709, "memory(GiB)": 22.66, "step": 24164, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.7850112074846506, "grad_norm": 0.4282839298248291, "learning_rate": 1.2113721484847518e-06, "loss": 0.011294133961200714, "memory(GiB)": 22.66, "step": 24165, "token_acc": 1.0, "train_speed(iter/s)": 0.95608 }, { "epoch": 0.7850436929474061, "grad_norm": 0.33060595393180847, "learning_rate": 1.2110216386653101e-06, "loss": 0.01314431894570589, "memory(GiB)": 22.66, "step": 24166, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.7850761784101614, "grad_norm": 0.3869423270225525, "learning_rate": 1.2106711725757187e-06, "loss": 0.01448001153767109, "memory(GiB)": 22.66, "step": 24167, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.7851086638729169, "grad_norm": 0.5496406555175781, "learning_rate": 1.2103207502200286e-06, "loss": 0.016668563708662987, "memory(GiB)": 22.66, "step": 24168, "token_acc": 1.0, "train_speed(iter/s)": 0.956105 }, { "epoch": 0.7851411493356723, "grad_norm": 0.38211020827293396, "learning_rate": 1.2099703716022799e-06, "loss": 0.010921802371740341, "memory(GiB)": 22.66, "step": 24169, "token_acc": 1.0, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.7851736347984277, "grad_norm": 0.2518540918827057, "learning_rate": 1.2096200367265193e-06, "loss": 0.00935057271271944, "memory(GiB)": 22.66, "step": 24170, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.7852061202611831, "grad_norm": 0.38956648111343384, "learning_rate": 1.2092697455967866e-06, "loss": 0.02033945918083191, "memory(GiB)": 22.66, "step": 24171, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956129 }, { "epoch": 0.7852386057239386, "grad_norm": 0.3517683148384094, "learning_rate": 1.2089194982171248e-06, "loss": 0.010789450258016586, "memory(GiB)": 22.66, "step": 24172, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.7852710911866939, "grad_norm": 0.28285807371139526, "learning_rate": 1.2085692945915811e-06, "loss": 0.010251527652144432, "memory(GiB)": 22.66, "step": 24173, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956146 }, { "epoch": 0.7853035766494494, "grad_norm": 0.3151242733001709, "learning_rate": 1.208219134724193e-06, "loss": 0.011869250796735287, "memory(GiB)": 22.66, "step": 24174, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.7853360621122047, "grad_norm": 0.33357101678848267, "learning_rate": 1.207869018619004e-06, "loss": 0.011591752991080284, "memory(GiB)": 22.66, "step": 24175, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956162 }, { "epoch": 0.7853685475749602, "grad_norm": 0.23496593534946442, "learning_rate": 1.2075189462800524e-06, "loss": 0.007421229034662247, "memory(GiB)": 22.66, "step": 24176, "token_acc": 1.0, "train_speed(iter/s)": 0.95617 }, { "epoch": 0.7854010330377156, "grad_norm": 0.29842323064804077, "learning_rate": 1.2071689177113798e-06, "loss": 0.012030734680593014, "memory(GiB)": 22.66, "step": 24177, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956179 }, { "epoch": 0.785433518500471, "grad_norm": 0.3571968674659729, "learning_rate": 1.2068189329170266e-06, "loss": 0.00741283455863595, "memory(GiB)": 22.66, "step": 24178, "token_acc": 1.0, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.7854660039632264, "grad_norm": 0.28661394119262695, "learning_rate": 1.2064689919010324e-06, "loss": 0.009010816924273968, "memory(GiB)": 22.66, "step": 24179, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.7854984894259819, "grad_norm": 0.40423160791397095, "learning_rate": 1.2061190946674344e-06, "loss": 0.008935555815696716, "memory(GiB)": 22.66, "step": 24180, "token_acc": 1.0, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.7855309748887372, "grad_norm": 0.8656092882156372, "learning_rate": 1.2057692412202714e-06, "loss": 0.013871463015675545, "memory(GiB)": 22.66, "step": 24181, "token_acc": 1.0, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.7855634603514927, "grad_norm": 0.2098582237958908, "learning_rate": 1.2054194315635814e-06, "loss": 0.008030029013752937, "memory(GiB)": 22.66, "step": 24182, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.7855959458142481, "grad_norm": 0.2808418571949005, "learning_rate": 1.2050696657014033e-06, "loss": 0.012652638368308544, "memory(GiB)": 22.66, "step": 24183, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.7856284312770035, "grad_norm": 0.3799784481525421, "learning_rate": 1.204719943637771e-06, "loss": 0.013105109333992004, "memory(GiB)": 22.66, "step": 24184, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95623 }, { "epoch": 0.7856609167397589, "grad_norm": 0.7841196656227112, "learning_rate": 1.204370265376722e-06, "loss": 0.012816881760954857, "memory(GiB)": 22.66, "step": 24185, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.7856934022025144, "grad_norm": 0.4453785717487335, "learning_rate": 1.2040206309222917e-06, "loss": 0.01480493601411581, "memory(GiB)": 22.66, "step": 24186, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.7857258876652697, "grad_norm": 0.7869117259979248, "learning_rate": 1.2036710402785163e-06, "loss": 0.014544499106705189, "memory(GiB)": 22.66, "step": 24187, "token_acc": 1.0, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.7857583731280252, "grad_norm": 0.5022676587104797, "learning_rate": 1.203321493449431e-06, "loss": 0.016973240301012993, "memory(GiB)": 22.66, "step": 24188, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.7857908585907806, "grad_norm": 0.2610328197479248, "learning_rate": 1.202971990439068e-06, "loss": 0.007765633519738913, "memory(GiB)": 22.66, "step": 24189, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956259 }, { "epoch": 0.785823344053536, "grad_norm": 0.34854254126548767, "learning_rate": 1.2026225312514617e-06, "loss": 0.012303996831178665, "memory(GiB)": 22.66, "step": 24190, "token_acc": 1.0, "train_speed(iter/s)": 0.956265 }, { "epoch": 0.7858558295162914, "grad_norm": 0.3052569627761841, "learning_rate": 1.202273115890646e-06, "loss": 0.009948275983333588, "memory(GiB)": 22.66, "step": 24191, "token_acc": 1.0, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.7858883149790469, "grad_norm": 0.357113242149353, "learning_rate": 1.2019237443606546e-06, "loss": 0.012294416315853596, "memory(GiB)": 22.66, "step": 24192, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.956277 }, { "epoch": 0.7859208004418023, "grad_norm": 0.4184742569923401, "learning_rate": 1.201574416665517e-06, "loss": 0.010316992178559303, "memory(GiB)": 22.66, "step": 24193, "token_acc": 1.0, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.7859532859045577, "grad_norm": 0.48488110303878784, "learning_rate": 1.2012251328092673e-06, "loss": 0.007976565510034561, "memory(GiB)": 22.66, "step": 24194, "token_acc": 1.0, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.7859857713673132, "grad_norm": 0.3148473799228668, "learning_rate": 1.2008758927959336e-06, "loss": 0.008237836882472038, "memory(GiB)": 22.66, "step": 24195, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.7860182568300685, "grad_norm": 0.34839680790901184, "learning_rate": 1.200526696629551e-06, "loss": 0.007118622772395611, "memory(GiB)": 22.66, "step": 24196, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.9563 }, { "epoch": 0.786050742292824, "grad_norm": 0.5245556831359863, "learning_rate": 1.2001775443141461e-06, "loss": 0.008697301149368286, "memory(GiB)": 22.66, "step": 24197, "token_acc": 1.0, "train_speed(iter/s)": 0.956306 }, { "epoch": 0.7860832277555794, "grad_norm": 0.33487236499786377, "learning_rate": 1.1998284358537515e-06, "loss": 0.010027099400758743, "memory(GiB)": 22.66, "step": 24198, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956311 }, { "epoch": 0.7861157132183348, "grad_norm": 0.468674898147583, "learning_rate": 1.1994793712523933e-06, "loss": 0.012293693609535694, "memory(GiB)": 22.66, "step": 24199, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.7861481986810902, "grad_norm": 0.4223524332046509, "learning_rate": 1.1991303505141016e-06, "loss": 0.016728099435567856, "memory(GiB)": 22.66, "step": 24200, "token_acc": 1.0, "train_speed(iter/s)": 0.956321 }, { "epoch": 0.7861806841438457, "grad_norm": 0.3977855443954468, "learning_rate": 1.198781373642905e-06, "loss": 0.012383075430989265, "memory(GiB)": 22.66, "step": 24201, "token_acc": 1.0, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.786213169606601, "grad_norm": 0.40458911657333374, "learning_rate": 1.1984324406428315e-06, "loss": 0.01275215856730938, "memory(GiB)": 22.66, "step": 24202, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.7862456550693565, "grad_norm": 0.33997878432273865, "learning_rate": 1.1980835515179068e-06, "loss": 0.009530775249004364, "memory(GiB)": 22.66, "step": 24203, "token_acc": 1.0, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.7862781405321119, "grad_norm": 0.3710251450538635, "learning_rate": 1.197734706272158e-06, "loss": 0.011498551815748215, "memory(GiB)": 22.66, "step": 24204, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.7863106259948673, "grad_norm": 0.3267521560192108, "learning_rate": 1.197385904909611e-06, "loss": 0.010498006828129292, "memory(GiB)": 22.66, "step": 24205, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956352 }, { "epoch": 0.7863431114576227, "grad_norm": 0.37776342034339905, "learning_rate": 1.1970371474342923e-06, "loss": 0.00915392953902483, "memory(GiB)": 22.66, "step": 24206, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956358 }, { "epoch": 0.7863755969203782, "grad_norm": 0.3765139877796173, "learning_rate": 1.1966884338502284e-06, "loss": 0.011371567845344543, "memory(GiB)": 22.66, "step": 24207, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.7864080823831335, "grad_norm": 0.24948112666606903, "learning_rate": 1.19633976416144e-06, "loss": 0.011605875566601753, "memory(GiB)": 22.66, "step": 24208, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956373 }, { "epoch": 0.786440567845889, "grad_norm": 0.29017573595046997, "learning_rate": 1.195991138371954e-06, "loss": 0.009064357727766037, "memory(GiB)": 22.66, "step": 24209, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956381 }, { "epoch": 0.7864730533086444, "grad_norm": 0.3983914256095886, "learning_rate": 1.1956425564857931e-06, "loss": 0.010470226407051086, "memory(GiB)": 22.66, "step": 24210, "token_acc": 1.0, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.7865055387713998, "grad_norm": 0.43006348609924316, "learning_rate": 1.1952940185069822e-06, "loss": 0.013114288449287415, "memory(GiB)": 22.66, "step": 24211, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956398 }, { "epoch": 0.7865380242341552, "grad_norm": 0.32363682985305786, "learning_rate": 1.1949455244395408e-06, "loss": 0.00852878950536251, "memory(GiB)": 22.66, "step": 24212, "token_acc": 1.0, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.7865705096969107, "grad_norm": 0.46476730704307556, "learning_rate": 1.1945970742874929e-06, "loss": 0.011611423455178738, "memory(GiB)": 22.66, "step": 24213, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.786602995159666, "grad_norm": 0.3635227084159851, "learning_rate": 1.1942486680548592e-06, "loss": 0.012953139841556549, "memory(GiB)": 22.66, "step": 24214, "token_acc": 1.0, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.7866354806224215, "grad_norm": 0.8742650151252747, "learning_rate": 1.1939003057456633e-06, "loss": 0.019722124561667442, "memory(GiB)": 22.66, "step": 24215, "token_acc": 1.0, "train_speed(iter/s)": 0.956432 }, { "epoch": 0.7866679660851769, "grad_norm": 0.33717507123947144, "learning_rate": 1.1935519873639223e-06, "loss": 0.013818629086017609, "memory(GiB)": 22.66, "step": 24216, "token_acc": 1.0, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.7867004515479323, "grad_norm": 0.36282551288604736, "learning_rate": 1.1932037129136593e-06, "loss": 0.015802351757884026, "memory(GiB)": 22.66, "step": 24217, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.7867329370106877, "grad_norm": 1.1657921075820923, "learning_rate": 1.1928554823988887e-06, "loss": 0.01995820179581642, "memory(GiB)": 22.66, "step": 24218, "token_acc": 1.0, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.7867654224734432, "grad_norm": 0.36089128255844116, "learning_rate": 1.192507295823635e-06, "loss": 0.015877166762948036, "memory(GiB)": 22.66, "step": 24219, "token_acc": 1.0, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.7867979079361985, "grad_norm": 0.43960392475128174, "learning_rate": 1.1921591531919162e-06, "loss": 0.01413534302264452, "memory(GiB)": 22.66, "step": 24220, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.786830393398954, "grad_norm": 0.3812013268470764, "learning_rate": 1.1918110545077478e-06, "loss": 0.01124383695423603, "memory(GiB)": 22.66, "step": 24221, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956483 }, { "epoch": 0.7868628788617094, "grad_norm": 0.309835821390152, "learning_rate": 1.1914629997751498e-06, "loss": 0.011558860540390015, "memory(GiB)": 22.66, "step": 24222, "token_acc": 1.0, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.7868953643244648, "grad_norm": 0.3942182958126068, "learning_rate": 1.191114988998135e-06, "loss": 0.012870971113443375, "memory(GiB)": 22.66, "step": 24223, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.9565 }, { "epoch": 0.7869278497872202, "grad_norm": 0.37911397218704224, "learning_rate": 1.190767022180726e-06, "loss": 0.014839411713182926, "memory(GiB)": 22.66, "step": 24224, "token_acc": 1.0, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.7869603352499757, "grad_norm": 0.44581717252731323, "learning_rate": 1.1904190993269338e-06, "loss": 0.01470233965665102, "memory(GiB)": 22.66, "step": 24225, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.786992820712731, "grad_norm": 0.3450455665588379, "learning_rate": 1.1900712204407778e-06, "loss": 0.01273747906088829, "memory(GiB)": 22.66, "step": 24226, "token_acc": 1.0, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.7870253061754865, "grad_norm": 0.3662436604499817, "learning_rate": 1.1897233855262686e-06, "loss": 0.013074049726128578, "memory(GiB)": 22.66, "step": 24227, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.7870577916382419, "grad_norm": 0.3437352180480957, "learning_rate": 1.1893755945874237e-06, "loss": 0.015068890526890755, "memory(GiB)": 22.66, "step": 24228, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.7870902771009973, "grad_norm": 0.3294624388217926, "learning_rate": 1.1890278476282563e-06, "loss": 0.011019561439752579, "memory(GiB)": 22.66, "step": 24229, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.7871227625637527, "grad_norm": 0.24929441511631012, "learning_rate": 1.1886801446527807e-06, "loss": 0.008099505677819252, "memory(GiB)": 22.66, "step": 24230, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.7871552480265082, "grad_norm": 0.3142540454864502, "learning_rate": 1.1883324856650087e-06, "loss": 0.008183409459888935, "memory(GiB)": 22.66, "step": 24231, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.7871877334892635, "grad_norm": 0.23764236271381378, "learning_rate": 1.1879848706689524e-06, "loss": 0.00583793455734849, "memory(GiB)": 22.66, "step": 24232, "token_acc": 1.0, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.787220218952019, "grad_norm": 0.41507285833358765, "learning_rate": 1.1876372996686247e-06, "loss": 0.015104835852980614, "memory(GiB)": 22.66, "step": 24233, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956585 }, { "epoch": 0.7872527044147744, "grad_norm": 0.20920804142951965, "learning_rate": 1.1872897726680367e-06, "loss": 0.010006806813180447, "memory(GiB)": 22.66, "step": 24234, "token_acc": 1.0, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.7872851898775298, "grad_norm": 0.2523505389690399, "learning_rate": 1.1869422896712008e-06, "loss": 0.009571729227900505, "memory(GiB)": 22.66, "step": 24235, "token_acc": 1.0, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.7873176753402852, "grad_norm": 0.44375964999198914, "learning_rate": 1.1865948506821245e-06, "loss": 0.019846046343445778, "memory(GiB)": 22.66, "step": 24236, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.7873501608030407, "grad_norm": 0.2715999484062195, "learning_rate": 1.1862474557048198e-06, "loss": 0.012142598628997803, "memory(GiB)": 22.66, "step": 24237, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.787382646265796, "grad_norm": 0.3277071416378021, "learning_rate": 1.185900104743295e-06, "loss": 0.016879742965102196, "memory(GiB)": 22.66, "step": 24238, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.956626 }, { "epoch": 0.7874151317285515, "grad_norm": 0.28672438859939575, "learning_rate": 1.1855527978015614e-06, "loss": 0.011403048411011696, "memory(GiB)": 22.66, "step": 24239, "token_acc": 1.0, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.7874476171913068, "grad_norm": 0.7992831468582153, "learning_rate": 1.185205534883624e-06, "loss": 0.009787935763597488, "memory(GiB)": 22.66, "step": 24240, "token_acc": 1.0, "train_speed(iter/s)": 0.956643 }, { "epoch": 0.7874801026540623, "grad_norm": 0.5341904163360596, "learning_rate": 1.1848583159934917e-06, "loss": 0.02172672189772129, "memory(GiB)": 22.66, "step": 24241, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.7875125881168177, "grad_norm": 0.23120209574699402, "learning_rate": 1.1845111411351728e-06, "loss": 0.008510230109095573, "memory(GiB)": 22.66, "step": 24242, "token_acc": 1.0, "train_speed(iter/s)": 0.95666 }, { "epoch": 0.7875450735795732, "grad_norm": 0.3344620168209076, "learning_rate": 1.1841640103126756e-06, "loss": 0.009306667372584343, "memory(GiB)": 22.66, "step": 24243, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.7875775590423285, "grad_norm": 0.2177003175020218, "learning_rate": 1.1838169235300024e-06, "loss": 0.007270917762070894, "memory(GiB)": 22.66, "step": 24244, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956673 }, { "epoch": 0.787610044505084, "grad_norm": 0.4391409158706665, "learning_rate": 1.1834698807911633e-06, "loss": 0.017250005155801773, "memory(GiB)": 22.66, "step": 24245, "token_acc": 1.0, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.7876425299678393, "grad_norm": 0.38390839099884033, "learning_rate": 1.1831228821001584e-06, "loss": 0.0100704375654459, "memory(GiB)": 22.66, "step": 24246, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956687 }, { "epoch": 0.7876750154305948, "grad_norm": 0.2316761016845703, "learning_rate": 1.1827759274609985e-06, "loss": 0.009317818097770214, "memory(GiB)": 22.66, "step": 24247, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.7877075008933502, "grad_norm": 0.29262813925743103, "learning_rate": 1.1824290168776836e-06, "loss": 0.009710797108709812, "memory(GiB)": 22.66, "step": 24248, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.7877399863561056, "grad_norm": 0.43593892455101013, "learning_rate": 1.182082150354219e-06, "loss": 0.01705373451113701, "memory(GiB)": 22.66, "step": 24249, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.787772471818861, "grad_norm": 0.2576073408126831, "learning_rate": 1.1817353278946097e-06, "loss": 0.008415799587965012, "memory(GiB)": 22.66, "step": 24250, "token_acc": 1.0, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.7878049572816165, "grad_norm": 0.5058813095092773, "learning_rate": 1.1813885495028537e-06, "loss": 0.015547962859272957, "memory(GiB)": 22.66, "step": 24251, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.7878374427443718, "grad_norm": 0.2648012936115265, "learning_rate": 1.1810418151829595e-06, "loss": 0.012380938045680523, "memory(GiB)": 22.66, "step": 24252, "token_acc": 1.0, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.7878699282071273, "grad_norm": 0.48680081963539124, "learning_rate": 1.1806951249389242e-06, "loss": 0.015892788767814636, "memory(GiB)": 22.66, "step": 24253, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956729 }, { "epoch": 0.7879024136698827, "grad_norm": 0.3236028254032135, "learning_rate": 1.180348478774752e-06, "loss": 0.010336032137274742, "memory(GiB)": 22.66, "step": 24254, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.7879348991326381, "grad_norm": 0.34478992223739624, "learning_rate": 1.180001876694441e-06, "loss": 0.011865541338920593, "memory(GiB)": 22.66, "step": 24255, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956741 }, { "epoch": 0.7879673845953936, "grad_norm": 0.4860110878944397, "learning_rate": 1.1796553187019928e-06, "loss": 0.019049759954214096, "memory(GiB)": 22.66, "step": 24256, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956746 }, { "epoch": 0.787999870058149, "grad_norm": 0.2636626064777374, "learning_rate": 1.1793088048014074e-06, "loss": 0.010322127491235733, "memory(GiB)": 22.66, "step": 24257, "token_acc": 1.0, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.7880323555209044, "grad_norm": 0.5010448694229126, "learning_rate": 1.1789623349966844e-06, "loss": 0.017220187932252884, "memory(GiB)": 22.66, "step": 24258, "token_acc": 1.0, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.7880648409836598, "grad_norm": 0.29826903343200684, "learning_rate": 1.1786159092918209e-06, "loss": 0.010360547341406345, "memory(GiB)": 22.66, "step": 24259, "token_acc": 0.9900662251655629, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.7880973264464153, "grad_norm": 0.5221061706542969, "learning_rate": 1.1782695276908156e-06, "loss": 0.01335779670625925, "memory(GiB)": 22.66, "step": 24260, "token_acc": 1.0, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.7881298119091706, "grad_norm": 0.27580103278160095, "learning_rate": 1.1779231901976672e-06, "loss": 0.009760158136487007, "memory(GiB)": 22.66, "step": 24261, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.7881622973719261, "grad_norm": 0.1922648549079895, "learning_rate": 1.1775768968163737e-06, "loss": 0.0071725486777722836, "memory(GiB)": 22.66, "step": 24262, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.956778 }, { "epoch": 0.7881947828346815, "grad_norm": 0.4876236915588379, "learning_rate": 1.1772306475509287e-06, "loss": 0.018005773425102234, "memory(GiB)": 22.66, "step": 24263, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.788227268297437, "grad_norm": 0.33840349316596985, "learning_rate": 1.1768844424053305e-06, "loss": 0.015144472010433674, "memory(GiB)": 22.66, "step": 24264, "token_acc": 1.0, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.7882597537601923, "grad_norm": 0.5040385723114014, "learning_rate": 1.1765382813835741e-06, "loss": 0.019321955740451813, "memory(GiB)": 22.66, "step": 24265, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956798 }, { "epoch": 0.7882922392229478, "grad_norm": 0.380504846572876, "learning_rate": 1.1761921644896551e-06, "loss": 0.01479457039386034, "memory(GiB)": 22.66, "step": 24266, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.7883247246857031, "grad_norm": 0.2611048221588135, "learning_rate": 1.1758460917275694e-06, "loss": 0.01164681650698185, "memory(GiB)": 22.66, "step": 24267, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95681 }, { "epoch": 0.7883572101484586, "grad_norm": 0.44123363494873047, "learning_rate": 1.1755000631013087e-06, "loss": 0.016412705183029175, "memory(GiB)": 22.66, "step": 24268, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.788389695611214, "grad_norm": 0.3559018671512604, "learning_rate": 1.1751540786148668e-06, "loss": 0.012340707704424858, "memory(GiB)": 22.66, "step": 24269, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.7884221810739694, "grad_norm": 0.3030778765678406, "learning_rate": 1.1748081382722381e-06, "loss": 0.011284706182777882, "memory(GiB)": 22.66, "step": 24270, "token_acc": 1.0, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.7884546665367248, "grad_norm": 0.4447758197784424, "learning_rate": 1.1744622420774165e-06, "loss": 0.01985771209001541, "memory(GiB)": 22.66, "step": 24271, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.7884871519994803, "grad_norm": 0.26251745223999023, "learning_rate": 1.174116390034391e-06, "loss": 0.015291990712285042, "memory(GiB)": 22.66, "step": 24272, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.7885196374622356, "grad_norm": 3.258610725402832, "learning_rate": 1.1737705821471556e-06, "loss": 0.019958071410655975, "memory(GiB)": 22.66, "step": 24273, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956855 }, { "epoch": 0.7885521229249911, "grad_norm": 0.2646942734718323, "learning_rate": 1.1734248184196977e-06, "loss": 0.009987618774175644, "memory(GiB)": 22.66, "step": 24274, "token_acc": 1.0, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.7885846083877465, "grad_norm": 0.3394036591053009, "learning_rate": 1.1730790988560136e-06, "loss": 0.01392514817416668, "memory(GiB)": 22.66, "step": 24275, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.7886170938505019, "grad_norm": 0.33761098980903625, "learning_rate": 1.1727334234600884e-06, "loss": 0.013497933745384216, "memory(GiB)": 22.66, "step": 24276, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95688 }, { "epoch": 0.7886495793132573, "grad_norm": 0.32926517724990845, "learning_rate": 1.1723877922359156e-06, "loss": 0.012736841104924679, "memory(GiB)": 22.66, "step": 24277, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.7886820647760128, "grad_norm": 0.5585358142852783, "learning_rate": 1.1720422051874809e-06, "loss": 0.014450719580054283, "memory(GiB)": 22.66, "step": 24278, "token_acc": 0.9835390946502057, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.7887145502387681, "grad_norm": 0.5612354874610901, "learning_rate": 1.1716966623187738e-06, "loss": 0.009282439947128296, "memory(GiB)": 22.66, "step": 24279, "token_acc": 1.0, "train_speed(iter/s)": 0.956905 }, { "epoch": 0.7887470357015236, "grad_norm": 0.32827115058898926, "learning_rate": 1.1713511636337821e-06, "loss": 0.01571677252650261, "memory(GiB)": 22.66, "step": 24280, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.788779521164279, "grad_norm": 0.36774060130119324, "learning_rate": 1.1710057091364946e-06, "loss": 0.013484036549925804, "memory(GiB)": 22.66, "step": 24281, "token_acc": 0.996, "train_speed(iter/s)": 0.956921 }, { "epoch": 0.7888120066270344, "grad_norm": 0.30733522772789, "learning_rate": 1.1706602988308991e-06, "loss": 0.010541250929236412, "memory(GiB)": 22.66, "step": 24282, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.7888444920897898, "grad_norm": 0.32532164454460144, "learning_rate": 1.170314932720979e-06, "loss": 0.012530606240034103, "memory(GiB)": 22.66, "step": 24283, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.7888769775525453, "grad_norm": 0.17566922307014465, "learning_rate": 1.1699696108107216e-06, "loss": 0.006177912931889296, "memory(GiB)": 22.66, "step": 24284, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.7889094630153006, "grad_norm": 0.3825306296348572, "learning_rate": 1.169624333104113e-06, "loss": 0.017180752009153366, "memory(GiB)": 22.66, "step": 24285, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.7889419484780561, "grad_norm": 0.2610722780227661, "learning_rate": 1.169279099605139e-06, "loss": 0.008536223322153091, "memory(GiB)": 22.66, "step": 24286, "token_acc": 1.0, "train_speed(iter/s)": 0.956963 }, { "epoch": 0.7889744339408115, "grad_norm": 0.24635660648345947, "learning_rate": 1.1689339103177817e-06, "loss": 0.012893800623714924, "memory(GiB)": 22.66, "step": 24287, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.7890069194035669, "grad_norm": 0.2370322197675705, "learning_rate": 1.168588765246026e-06, "loss": 0.008785160258412361, "memory(GiB)": 22.66, "step": 24288, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.7890394048663223, "grad_norm": 0.38123300671577454, "learning_rate": 1.1682436643938556e-06, "loss": 0.014890393242239952, "memory(GiB)": 22.66, "step": 24289, "token_acc": 1.0, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.7890718903290778, "grad_norm": 0.27624455094337463, "learning_rate": 1.1678986077652542e-06, "loss": 0.013601058162748814, "memory(GiB)": 22.66, "step": 24290, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956996 }, { "epoch": 0.7891043757918331, "grad_norm": 0.4030202627182007, "learning_rate": 1.1675535953642032e-06, "loss": 0.008477414026856422, "memory(GiB)": 22.66, "step": 24291, "token_acc": 1.0, "train_speed(iter/s)": 0.957004 }, { "epoch": 0.7891368612545886, "grad_norm": 0.29809704422950745, "learning_rate": 1.1672086271946836e-06, "loss": 0.012059502303600311, "memory(GiB)": 22.66, "step": 24292, "token_acc": 1.0, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.789169346717344, "grad_norm": 0.41995304822921753, "learning_rate": 1.1668637032606784e-06, "loss": 0.010923978872597218, "memory(GiB)": 22.66, "step": 24293, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957021 }, { "epoch": 0.7892018321800994, "grad_norm": 0.9789479970932007, "learning_rate": 1.1665188235661695e-06, "loss": 0.02003747597336769, "memory(GiB)": 22.66, "step": 24294, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.7892343176428548, "grad_norm": 0.4649331867694855, "learning_rate": 1.166173988115134e-06, "loss": 0.01911497488617897, "memory(GiB)": 22.66, "step": 24295, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.957038 }, { "epoch": 0.7892668031056103, "grad_norm": 0.3747640550136566, "learning_rate": 1.1658291969115537e-06, "loss": 0.010538307949900627, "memory(GiB)": 22.66, "step": 24296, "token_acc": 1.0, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.7892992885683656, "grad_norm": 0.3670605719089508, "learning_rate": 1.1654844499594077e-06, "loss": 0.01481832005083561, "memory(GiB)": 22.66, "step": 24297, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.7893317740311211, "grad_norm": 0.3062133491039276, "learning_rate": 1.1651397472626746e-06, "loss": 0.01330237090587616, "memory(GiB)": 22.66, "step": 24298, "token_acc": 1.0, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.7893642594938765, "grad_norm": 0.4759431481361389, "learning_rate": 1.1647950888253345e-06, "loss": 0.020164016634225845, "memory(GiB)": 22.66, "step": 24299, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957072 }, { "epoch": 0.7893967449566319, "grad_norm": 0.3144283592700958, "learning_rate": 1.1644504746513624e-06, "loss": 0.010663093999028206, "memory(GiB)": 22.66, "step": 24300, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.7894292304193873, "grad_norm": 0.21771308779716492, "learning_rate": 1.1641059047447383e-06, "loss": 0.008817862719297409, "memory(GiB)": 22.66, "step": 24301, "token_acc": 1.0, "train_speed(iter/s)": 0.957089 }, { "epoch": 0.7894617158821428, "grad_norm": 0.20218296349048615, "learning_rate": 1.163761379109435e-06, "loss": 0.008450450375676155, "memory(GiB)": 22.66, "step": 24302, "token_acc": 1.0, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.7894942013448981, "grad_norm": 0.3635993003845215, "learning_rate": 1.1634168977494341e-06, "loss": 0.015612518414855003, "memory(GiB)": 22.66, "step": 24303, "token_acc": 1.0, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.7895266868076536, "grad_norm": 0.34592148661613464, "learning_rate": 1.163072460668707e-06, "loss": 0.013688959181308746, "memory(GiB)": 22.66, "step": 24304, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.789559172270409, "grad_norm": 0.28381380438804626, "learning_rate": 1.1627280678712326e-06, "loss": 0.010095058009028435, "memory(GiB)": 22.66, "step": 24305, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957117 }, { "epoch": 0.7895916577331644, "grad_norm": 0.35203713178634644, "learning_rate": 1.1623837193609821e-06, "loss": 0.011396214365959167, "memory(GiB)": 22.66, "step": 24306, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957123 }, { "epoch": 0.7896241431959198, "grad_norm": 0.22403547167778015, "learning_rate": 1.1620394151419312e-06, "loss": 0.009373015724122524, "memory(GiB)": 22.66, "step": 24307, "token_acc": 0.9941520467836257, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.7896566286586753, "grad_norm": 0.3028489649295807, "learning_rate": 1.161695155218054e-06, "loss": 0.010190528817474842, "memory(GiB)": 22.66, "step": 24308, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.7896891141214306, "grad_norm": 0.4532175064086914, "learning_rate": 1.1613509395933248e-06, "loss": 0.016462821513414383, "memory(GiB)": 22.66, "step": 24309, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.7897215995841861, "grad_norm": 0.4155319631099701, "learning_rate": 1.1610067682717129e-06, "loss": 0.015384897589683533, "memory(GiB)": 22.66, "step": 24310, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.957148 }, { "epoch": 0.7897540850469414, "grad_norm": 0.29480472207069397, "learning_rate": 1.1606626412571936e-06, "loss": 0.007905496284365654, "memory(GiB)": 22.66, "step": 24311, "token_acc": 1.0, "train_speed(iter/s)": 0.957154 }, { "epoch": 0.7897865705096969, "grad_norm": 0.4437802731990814, "learning_rate": 1.160318558553737e-06, "loss": 0.007892746478319168, "memory(GiB)": 22.66, "step": 24312, "token_acc": 1.0, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.7898190559724523, "grad_norm": 0.4067157208919525, "learning_rate": 1.159974520165315e-06, "loss": 0.013644321821630001, "memory(GiB)": 22.66, "step": 24313, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.7898515414352077, "grad_norm": 0.37707558274269104, "learning_rate": 1.1596305260958994e-06, "loss": 0.010466428473591805, "memory(GiB)": 22.66, "step": 24314, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957155 }, { "epoch": 0.7898840268979631, "grad_norm": 0.23805633187294006, "learning_rate": 1.1592865763494577e-06, "loss": 0.01301262155175209, "memory(GiB)": 22.66, "step": 24315, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.7899165123607186, "grad_norm": 0.2651667892932892, "learning_rate": 1.158942670929961e-06, "loss": 0.008586425334215164, "memory(GiB)": 22.66, "step": 24316, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.7899489978234739, "grad_norm": 0.3299887776374817, "learning_rate": 1.158598809841378e-06, "loss": 0.008714692667126656, "memory(GiB)": 22.66, "step": 24317, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.7899814832862294, "grad_norm": 0.33336004614830017, "learning_rate": 1.1582549930876797e-06, "loss": 0.011719334870576859, "memory(GiB)": 22.66, "step": 24318, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.7900139687489848, "grad_norm": 0.28323760628700256, "learning_rate": 1.1579112206728298e-06, "loss": 0.009198199957609177, "memory(GiB)": 22.66, "step": 24319, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957184 }, { "epoch": 0.7900464542117402, "grad_norm": 0.3716370165348053, "learning_rate": 1.157567492600799e-06, "loss": 0.015397181734442711, "memory(GiB)": 22.66, "step": 24320, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957189 }, { "epoch": 0.7900789396744957, "grad_norm": 0.2859354615211487, "learning_rate": 1.1572238088755533e-06, "loss": 0.010240619070827961, "memory(GiB)": 22.66, "step": 24321, "token_acc": 1.0, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.7901114251372511, "grad_norm": 0.3347811996936798, "learning_rate": 1.1568801695010607e-06, "loss": 0.01300995796918869, "memory(GiB)": 22.66, "step": 24322, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.7901439106000065, "grad_norm": 0.21736577153205872, "learning_rate": 1.1565365744812851e-06, "loss": 0.00558240432292223, "memory(GiB)": 22.66, "step": 24323, "token_acc": 1.0, "train_speed(iter/s)": 0.957207 }, { "epoch": 0.7901763960627619, "grad_norm": 0.3369784355163574, "learning_rate": 1.1561930238201945e-06, "loss": 0.011443649418652058, "memory(GiB)": 22.66, "step": 24324, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957213 }, { "epoch": 0.7902088815255174, "grad_norm": 0.3778741955757141, "learning_rate": 1.1558495175217494e-06, "loss": 0.014831382781267166, "memory(GiB)": 22.66, "step": 24325, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.7902413669882727, "grad_norm": 0.3588409423828125, "learning_rate": 1.1555060555899189e-06, "loss": 0.016897883266210556, "memory(GiB)": 22.66, "step": 24326, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957226 }, { "epoch": 0.7902738524510282, "grad_norm": 0.29383379220962524, "learning_rate": 1.1551626380286667e-06, "loss": 0.007708857301622629, "memory(GiB)": 22.66, "step": 24327, "token_acc": 1.0, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.7903063379137836, "grad_norm": 0.21785300970077515, "learning_rate": 1.1548192648419543e-06, "loss": 0.009656652808189392, "memory(GiB)": 22.66, "step": 24328, "token_acc": 1.0, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.790338823376539, "grad_norm": 0.3326318562030792, "learning_rate": 1.1544759360337465e-06, "loss": 0.012298556044697762, "memory(GiB)": 22.66, "step": 24329, "token_acc": 1.0, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.7903713088392944, "grad_norm": 0.32482054829597473, "learning_rate": 1.1541326516080026e-06, "loss": 0.01367184054106474, "memory(GiB)": 22.66, "step": 24330, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957251 }, { "epoch": 0.7904037943020499, "grad_norm": 0.6632511615753174, "learning_rate": 1.1537894115686887e-06, "loss": 0.01578040048480034, "memory(GiB)": 22.66, "step": 24331, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.7904362797648052, "grad_norm": 0.26463842391967773, "learning_rate": 1.1534462159197635e-06, "loss": 0.009709341451525688, "memory(GiB)": 22.66, "step": 24332, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.7904687652275607, "grad_norm": 0.21061764657497406, "learning_rate": 1.1531030646651897e-06, "loss": 0.00877310149371624, "memory(GiB)": 22.66, "step": 24333, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.7905012506903161, "grad_norm": 0.3812788128852844, "learning_rate": 1.1527599578089255e-06, "loss": 0.009632843546569347, "memory(GiB)": 22.66, "step": 24334, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.95728 }, { "epoch": 0.7905337361530715, "grad_norm": 0.4957507252693176, "learning_rate": 1.152416895354932e-06, "loss": 0.016545679420232773, "memory(GiB)": 22.66, "step": 24335, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.7905662216158269, "grad_norm": 0.3972487151622772, "learning_rate": 1.1520738773071688e-06, "loss": 0.01266350969672203, "memory(GiB)": 22.66, "step": 24336, "token_acc": 1.0, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.7905987070785824, "grad_norm": 0.2756710946559906, "learning_rate": 1.1517309036695962e-06, "loss": 0.011617165058851242, "memory(GiB)": 22.66, "step": 24337, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957305 }, { "epoch": 0.7906311925413377, "grad_norm": 0.4664328992366791, "learning_rate": 1.1513879744461692e-06, "loss": 0.017565041780471802, "memory(GiB)": 22.66, "step": 24338, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.7906636780040932, "grad_norm": 0.32979071140289307, "learning_rate": 1.1510450896408476e-06, "loss": 0.009803618304431438, "memory(GiB)": 22.66, "step": 24339, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.7906961634668486, "grad_norm": 0.5764124393463135, "learning_rate": 1.1507022492575887e-06, "loss": 0.02201061323285103, "memory(GiB)": 22.66, "step": 24340, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.95733 }, { "epoch": 0.790728648929604, "grad_norm": 0.4576478600502014, "learning_rate": 1.1503594533003503e-06, "loss": 0.016249874606728554, "memory(GiB)": 22.66, "step": 24341, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957338 }, { "epoch": 0.7907611343923594, "grad_norm": 0.3021961450576782, "learning_rate": 1.1500167017730867e-06, "loss": 0.012111488729715347, "memory(GiB)": 22.66, "step": 24342, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.7907936198551149, "grad_norm": 0.4074556231498718, "learning_rate": 1.1496739946797542e-06, "loss": 0.012904147617518902, "memory(GiB)": 22.66, "step": 24343, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957355 }, { "epoch": 0.7908261053178702, "grad_norm": 0.2840108573436737, "learning_rate": 1.1493313320243089e-06, "loss": 0.014165817759931087, "memory(GiB)": 22.66, "step": 24344, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957363 }, { "epoch": 0.7908585907806257, "grad_norm": 0.7422977089881897, "learning_rate": 1.1489887138107054e-06, "loss": 0.01522674411535263, "memory(GiB)": 22.66, "step": 24345, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.7908910762433811, "grad_norm": 0.4857739508152008, "learning_rate": 1.1486461400428988e-06, "loss": 0.016997089609503746, "memory(GiB)": 22.66, "step": 24346, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.7909235617061365, "grad_norm": 0.3559335172176361, "learning_rate": 1.1483036107248413e-06, "loss": 0.013733098283410072, "memory(GiB)": 22.66, "step": 24347, "token_acc": 1.0, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.7909560471688919, "grad_norm": 0.36429649591445923, "learning_rate": 1.1479611258604861e-06, "loss": 0.014357795007526875, "memory(GiB)": 22.66, "step": 24348, "token_acc": 1.0, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.7909885326316474, "grad_norm": 0.35460084676742554, "learning_rate": 1.1476186854537862e-06, "loss": 0.010531727224588394, "memory(GiB)": 22.66, "step": 24349, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957404 }, { "epoch": 0.7910210180944027, "grad_norm": 0.30025917291641235, "learning_rate": 1.1472762895086963e-06, "loss": 0.013641398400068283, "memory(GiB)": 22.66, "step": 24350, "token_acc": 1.0, "train_speed(iter/s)": 0.957413 }, { "epoch": 0.7910535035571582, "grad_norm": 0.286446750164032, "learning_rate": 1.1469339380291645e-06, "loss": 0.009020674973726273, "memory(GiB)": 22.66, "step": 24351, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.7910859890199136, "grad_norm": 0.39694955945014954, "learning_rate": 1.1465916310191444e-06, "loss": 0.018721502274274826, "memory(GiB)": 22.66, "step": 24352, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.791118474482669, "grad_norm": 0.3100016117095947, "learning_rate": 1.1462493684825831e-06, "loss": 0.0129221361130476, "memory(GiB)": 22.66, "step": 24353, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957437 }, { "epoch": 0.7911509599454244, "grad_norm": 0.3667135536670685, "learning_rate": 1.1459071504234364e-06, "loss": 0.008632481098175049, "memory(GiB)": 22.66, "step": 24354, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.7911834454081799, "grad_norm": 0.36275482177734375, "learning_rate": 1.1455649768456494e-06, "loss": 0.015567292459309101, "memory(GiB)": 22.66, "step": 24355, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.7912159308709352, "grad_norm": 0.412061870098114, "learning_rate": 1.1452228477531745e-06, "loss": 0.018116356804966927, "memory(GiB)": 22.66, "step": 24356, "token_acc": 1.0, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.7912484163336907, "grad_norm": 0.29393482208251953, "learning_rate": 1.144880763149957e-06, "loss": 0.01199485082179308, "memory(GiB)": 22.66, "step": 24357, "token_acc": 1.0, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.7912809017964461, "grad_norm": 0.2564515769481659, "learning_rate": 1.1445387230399456e-06, "loss": 0.015298791229724884, "memory(GiB)": 22.66, "step": 24358, "token_acc": 1.0, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.7913133872592015, "grad_norm": 0.2936429977416992, "learning_rate": 1.1441967274270915e-06, "loss": 0.006695937365293503, "memory(GiB)": 22.66, "step": 24359, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.7913458727219569, "grad_norm": 0.37387552857398987, "learning_rate": 1.1438547763153373e-06, "loss": 0.015623390674591064, "memory(GiB)": 22.66, "step": 24360, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.7913783581847124, "grad_norm": 0.38294166326522827, "learning_rate": 1.143512869708634e-06, "loss": 0.013217300176620483, "memory(GiB)": 22.66, "step": 24361, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.7914108436474677, "grad_norm": 0.33488017320632935, "learning_rate": 1.1431710076109226e-06, "loss": 0.008956272155046463, "memory(GiB)": 22.66, "step": 24362, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.7914433291102232, "grad_norm": 0.2961563169956207, "learning_rate": 1.1428291900261518e-06, "loss": 0.01208958588540554, "memory(GiB)": 22.66, "step": 24363, "token_acc": 1.0, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.7914758145729786, "grad_norm": 0.2738174796104431, "learning_rate": 1.1424874169582661e-06, "loss": 0.010493744164705276, "memory(GiB)": 22.66, "step": 24364, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.791508300035734, "grad_norm": 0.277925044298172, "learning_rate": 1.1421456884112114e-06, "loss": 0.007977675646543503, "memory(GiB)": 22.66, "step": 24365, "token_acc": 1.0, "train_speed(iter/s)": 0.957531 }, { "epoch": 0.7915407854984894, "grad_norm": 0.5488256812095642, "learning_rate": 1.1418040043889285e-06, "loss": 0.016007695347070694, "memory(GiB)": 22.66, "step": 24366, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957538 }, { "epoch": 0.7915732709612449, "grad_norm": 0.3470703363418579, "learning_rate": 1.1414623648953626e-06, "loss": 0.015160351060330868, "memory(GiB)": 22.66, "step": 24367, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.7916057564240002, "grad_norm": 1.7227541208267212, "learning_rate": 1.1411207699344574e-06, "loss": 0.006864349823445082, "memory(GiB)": 22.66, "step": 24368, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.95755 }, { "epoch": 0.7916382418867557, "grad_norm": 0.38469716906547546, "learning_rate": 1.1407792195101553e-06, "loss": 0.013970375061035156, "memory(GiB)": 22.66, "step": 24369, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.791670727349511, "grad_norm": 0.30624231696128845, "learning_rate": 1.1404377136263966e-06, "loss": 0.009097054600715637, "memory(GiB)": 22.66, "step": 24370, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957563 }, { "epoch": 0.7917032128122665, "grad_norm": 0.3643253743648529, "learning_rate": 1.1400962522871235e-06, "loss": 0.00840110331773758, "memory(GiB)": 22.66, "step": 24371, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957569 }, { "epoch": 0.7917356982750219, "grad_norm": 0.2927635610103607, "learning_rate": 1.1397548354962773e-06, "loss": 0.007313208654522896, "memory(GiB)": 22.66, "step": 24372, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.7917681837377774, "grad_norm": 0.28330639004707336, "learning_rate": 1.1394134632577986e-06, "loss": 0.008351510390639305, "memory(GiB)": 22.66, "step": 24373, "token_acc": 1.0, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.7918006692005327, "grad_norm": 0.419011652469635, "learning_rate": 1.139072135575628e-06, "loss": 0.012756901793181896, "memory(GiB)": 22.66, "step": 24374, "token_acc": 1.0, "train_speed(iter/s)": 0.957586 }, { "epoch": 0.7918331546632882, "grad_norm": 0.40285441279411316, "learning_rate": 1.138730852453702e-06, "loss": 0.013589726760983467, "memory(GiB)": 22.66, "step": 24375, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957592 }, { "epoch": 0.7918656401260435, "grad_norm": 0.4562394917011261, "learning_rate": 1.1383896138959621e-06, "loss": 0.013875775039196014, "memory(GiB)": 22.66, "step": 24376, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957597 }, { "epoch": 0.791898125588799, "grad_norm": 0.35895952582359314, "learning_rate": 1.1380484199063457e-06, "loss": 0.012508492916822433, "memory(GiB)": 22.66, "step": 24377, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957603 }, { "epoch": 0.7919306110515544, "grad_norm": 0.374749094247818, "learning_rate": 1.137707270488792e-06, "loss": 0.015194139443337917, "memory(GiB)": 22.66, "step": 24378, "token_acc": 1.0, "train_speed(iter/s)": 0.957608 }, { "epoch": 0.7919630965143098, "grad_norm": 0.3244166076183319, "learning_rate": 1.1373661656472358e-06, "loss": 0.010395949706435204, "memory(GiB)": 22.66, "step": 24379, "token_acc": 1.0, "train_speed(iter/s)": 0.957614 }, { "epoch": 0.7919955819770652, "grad_norm": 0.27877214550971985, "learning_rate": 1.1370251053856162e-06, "loss": 0.007643260061740875, "memory(GiB)": 22.66, "step": 24380, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.7920280674398207, "grad_norm": 0.3537542223930359, "learning_rate": 1.1366840897078663e-06, "loss": 0.011854699812829494, "memory(GiB)": 22.66, "step": 24381, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.792060552902576, "grad_norm": 0.45412230491638184, "learning_rate": 1.1363431186179264e-06, "loss": 0.01626906543970108, "memory(GiB)": 22.66, "step": 24382, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.7920930383653315, "grad_norm": 0.37548550963401794, "learning_rate": 1.1360021921197284e-06, "loss": 0.013755196705460548, "memory(GiB)": 22.66, "step": 24383, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.792125523828087, "grad_norm": 0.3764316737651825, "learning_rate": 1.1356613102172094e-06, "loss": 0.018008355051279068, "memory(GiB)": 22.66, "step": 24384, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957642 }, { "epoch": 0.7921580092908423, "grad_norm": 0.34949374198913574, "learning_rate": 1.1353204729143002e-06, "loss": 0.01400160975754261, "memory(GiB)": 22.66, "step": 24385, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957647 }, { "epoch": 0.7921904947535978, "grad_norm": 0.43678295612335205, "learning_rate": 1.1349796802149377e-06, "loss": 0.015020261518657207, "memory(GiB)": 22.66, "step": 24386, "token_acc": 1.0, "train_speed(iter/s)": 0.957652 }, { "epoch": 0.7922229802163532, "grad_norm": 0.2537340223789215, "learning_rate": 1.1346389321230532e-06, "loss": 0.012354686856269836, "memory(GiB)": 22.66, "step": 24387, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957658 }, { "epoch": 0.7922554656791087, "grad_norm": 0.4357631206512451, "learning_rate": 1.1342982286425808e-06, "loss": 0.012091548182070255, "memory(GiB)": 22.66, "step": 24388, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957663 }, { "epoch": 0.792287951141864, "grad_norm": 0.2977653741836548, "learning_rate": 1.133957569777453e-06, "loss": 0.010622235015034676, "memory(GiB)": 22.66, "step": 24389, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.7923204366046195, "grad_norm": 0.31060972809791565, "learning_rate": 1.133616955531598e-06, "loss": 0.011787822470068932, "memory(GiB)": 22.66, "step": 24390, "token_acc": 1.0, "train_speed(iter/s)": 0.957673 }, { "epoch": 0.7923529220673748, "grad_norm": 0.3644437789916992, "learning_rate": 1.1332763859089523e-06, "loss": 0.013081936165690422, "memory(GiB)": 22.66, "step": 24391, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.7923854075301303, "grad_norm": 0.5210173726081848, "learning_rate": 1.1329358609134423e-06, "loss": 0.017940843477845192, "memory(GiB)": 22.66, "step": 24392, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.7924178929928857, "grad_norm": 0.30911141633987427, "learning_rate": 1.132595380549001e-06, "loss": 0.00799260102212429, "memory(GiB)": 22.66, "step": 24393, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.7924503784556411, "grad_norm": 0.1747424155473709, "learning_rate": 1.1322549448195548e-06, "loss": 0.008878733031451702, "memory(GiB)": 22.66, "step": 24394, "token_acc": 1.0, "train_speed(iter/s)": 0.957695 }, { "epoch": 0.7924828639183965, "grad_norm": 0.3269880712032318, "learning_rate": 1.1319145537290344e-06, "loss": 0.016537092626094818, "memory(GiB)": 22.66, "step": 24395, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957701 }, { "epoch": 0.792515349381152, "grad_norm": 0.5149988532066345, "learning_rate": 1.131574207281369e-06, "loss": 0.012980839237570763, "memory(GiB)": 22.66, "step": 24396, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957708 }, { "epoch": 0.7925478348439073, "grad_norm": 0.4260852038860321, "learning_rate": 1.1312339054804872e-06, "loss": 0.01836502179503441, "memory(GiB)": 22.66, "step": 24397, "token_acc": 0.992, "train_speed(iter/s)": 0.957714 }, { "epoch": 0.7925803203066628, "grad_norm": 0.2732572555541992, "learning_rate": 1.130893648330314e-06, "loss": 0.00816885195672512, "memory(GiB)": 22.66, "step": 24398, "token_acc": 1.0, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.7926128057694182, "grad_norm": 0.2472591996192932, "learning_rate": 1.130553435834778e-06, "loss": 0.00751475477591157, "memory(GiB)": 22.66, "step": 24399, "token_acc": 1.0, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.7926452912321736, "grad_norm": 0.5468041300773621, "learning_rate": 1.130213267997805e-06, "loss": 0.018234610557556152, "memory(GiB)": 22.66, "step": 24400, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.957733 }, { "epoch": 0.792677776694929, "grad_norm": 0.41334354877471924, "learning_rate": 1.1298731448233236e-06, "loss": 0.012370231561362743, "memory(GiB)": 22.66, "step": 24401, "token_acc": 0.9899665551839465, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.7927102621576845, "grad_norm": 0.4536196291446686, "learning_rate": 1.129533066315256e-06, "loss": 0.02067127265036106, "memory(GiB)": 22.66, "step": 24402, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957747 }, { "epoch": 0.7927427476204398, "grad_norm": 0.2648983895778656, "learning_rate": 1.129193032477528e-06, "loss": 0.011714622378349304, "memory(GiB)": 22.66, "step": 24403, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.7927752330831953, "grad_norm": 0.4477485716342926, "learning_rate": 1.1288530433140643e-06, "loss": 0.017504263669252396, "memory(GiB)": 22.66, "step": 24404, "token_acc": 1.0, "train_speed(iter/s)": 0.957763 }, { "epoch": 0.7928077185459507, "grad_norm": 0.3999808430671692, "learning_rate": 1.1285130988287895e-06, "loss": 0.01245846226811409, "memory(GiB)": 22.66, "step": 24405, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.7928402040087061, "grad_norm": 0.31945520639419556, "learning_rate": 1.1281731990256273e-06, "loss": 0.012771913781762123, "memory(GiB)": 22.66, "step": 24406, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.95778 }, { "epoch": 0.7928726894714615, "grad_norm": 0.3790743052959442, "learning_rate": 1.1278333439084987e-06, "loss": 0.012424913235008717, "memory(GiB)": 22.66, "step": 24407, "token_acc": 1.0, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.792905174934217, "grad_norm": 0.4344188868999481, "learning_rate": 1.1274935334813275e-06, "loss": 0.009638478048145771, "memory(GiB)": 22.66, "step": 24408, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.7929376603969723, "grad_norm": 0.36264070868492126, "learning_rate": 1.1271537677480344e-06, "loss": 0.013042336329817772, "memory(GiB)": 22.66, "step": 24409, "token_acc": 1.0, "train_speed(iter/s)": 0.957805 }, { "epoch": 0.7929701458597278, "grad_norm": 0.33427488803863525, "learning_rate": 1.1268140467125439e-06, "loss": 0.017485031858086586, "memory(GiB)": 22.66, "step": 24410, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957813 }, { "epoch": 0.7930026313224832, "grad_norm": 0.34108468890190125, "learning_rate": 1.1264743703787723e-06, "loss": 0.01176894549280405, "memory(GiB)": 22.66, "step": 24411, "token_acc": 1.0, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.7930351167852386, "grad_norm": 0.21645662188529968, "learning_rate": 1.126134738750644e-06, "loss": 0.006327495910227299, "memory(GiB)": 22.66, "step": 24412, "token_acc": 1.0, "train_speed(iter/s)": 0.957829 }, { "epoch": 0.793067602247994, "grad_norm": 0.38643965125083923, "learning_rate": 1.1257951518320737e-06, "loss": 0.014347439631819725, "memory(GiB)": 22.66, "step": 24413, "token_acc": 0.9966216216216216, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.7931000877107495, "grad_norm": 0.530815064907074, "learning_rate": 1.125455609626987e-06, "loss": 0.02443571947515011, "memory(GiB)": 22.66, "step": 24414, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.7931325731735048, "grad_norm": 0.3821549415588379, "learning_rate": 1.1251161121392978e-06, "loss": 0.011618154123425484, "memory(GiB)": 22.66, "step": 24415, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.7931650586362603, "grad_norm": 0.32605838775634766, "learning_rate": 1.1247766593729282e-06, "loss": 0.012696598656475544, "memory(GiB)": 22.66, "step": 24416, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.7931975440990157, "grad_norm": 0.5362032651901245, "learning_rate": 1.124437251331792e-06, "loss": 0.01108718290925026, "memory(GiB)": 22.66, "step": 24417, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957871 }, { "epoch": 0.7932300295617711, "grad_norm": 0.41169998049736023, "learning_rate": 1.1240978880198084e-06, "loss": 0.02213609218597412, "memory(GiB)": 22.66, "step": 24418, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.7932625150245265, "grad_norm": 0.4561128616333008, "learning_rate": 1.1237585694408937e-06, "loss": 0.01658424735069275, "memory(GiB)": 22.66, "step": 24419, "token_acc": 0.984375, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.793295000487282, "grad_norm": 0.28102368116378784, "learning_rate": 1.1234192955989649e-06, "loss": 0.011311322450637817, "memory(GiB)": 22.66, "step": 24420, "token_acc": 1.0, "train_speed(iter/s)": 0.957896 }, { "epoch": 0.7933274859500373, "grad_norm": 0.5218408107757568, "learning_rate": 1.123080066497938e-06, "loss": 0.016583580523729324, "memory(GiB)": 22.66, "step": 24421, "token_acc": 1.0, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.7933599714127928, "grad_norm": 0.5698270201683044, "learning_rate": 1.1227408821417263e-06, "loss": 0.01886748895049095, "memory(GiB)": 22.66, "step": 24422, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957913 }, { "epoch": 0.7933924568755482, "grad_norm": 0.35520780086517334, "learning_rate": 1.1224017425342453e-06, "loss": 0.019082188606262207, "memory(GiB)": 22.66, "step": 24423, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.7934249423383036, "grad_norm": 0.23110970854759216, "learning_rate": 1.1220626476794095e-06, "loss": 0.008949709124863148, "memory(GiB)": 22.66, "step": 24424, "token_acc": 1.0, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.793457427801059, "grad_norm": 0.3034161329269409, "learning_rate": 1.121723597581133e-06, "loss": 0.010143827646970749, "memory(GiB)": 22.66, "step": 24425, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.957937 }, { "epoch": 0.7934899132638145, "grad_norm": 0.2976817488670349, "learning_rate": 1.1213845922433276e-06, "loss": 0.011584969237446785, "memory(GiB)": 22.66, "step": 24426, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.7935223987265698, "grad_norm": 0.30581924319267273, "learning_rate": 1.1210456316699058e-06, "loss": 0.01828393153846264, "memory(GiB)": 22.66, "step": 24427, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.7935548841893253, "grad_norm": 0.5633924007415771, "learning_rate": 1.1207067158647804e-06, "loss": 0.012220000848174095, "memory(GiB)": 22.66, "step": 24428, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.7935873696520807, "grad_norm": 0.43381112813949585, "learning_rate": 1.1203678448318644e-06, "loss": 0.016785643994808197, "memory(GiB)": 22.66, "step": 24429, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957971 }, { "epoch": 0.7936198551148361, "grad_norm": 0.32453250885009766, "learning_rate": 1.1200290185750662e-06, "loss": 0.015778448432683945, "memory(GiB)": 22.66, "step": 24430, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957979 }, { "epoch": 0.7936523405775915, "grad_norm": 0.26639899611473083, "learning_rate": 1.1196902370982966e-06, "loss": 0.011098229326307774, "memory(GiB)": 22.66, "step": 24431, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.793684826040347, "grad_norm": 0.39588528871536255, "learning_rate": 1.1193515004054667e-06, "loss": 0.015574540011584759, "memory(GiB)": 22.66, "step": 24432, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.957996 }, { "epoch": 0.7937173115031023, "grad_norm": 0.3718457818031311, "learning_rate": 1.119012808500487e-06, "loss": 0.009695383720099926, "memory(GiB)": 22.66, "step": 24433, "token_acc": 1.0, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.7937497969658578, "grad_norm": 0.2935248017311096, "learning_rate": 1.1186741613872636e-06, "loss": 0.013415148481726646, "memory(GiB)": 22.66, "step": 24434, "token_acc": 1.0, "train_speed(iter/s)": 0.958012 }, { "epoch": 0.7937822824286132, "grad_norm": 0.2660335898399353, "learning_rate": 1.1183355590697064e-06, "loss": 0.00857347808778286, "memory(GiB)": 22.66, "step": 24435, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.7938147678913686, "grad_norm": 0.27951645851135254, "learning_rate": 1.1179970015517238e-06, "loss": 0.010445548221468925, "memory(GiB)": 22.66, "step": 24436, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.793847253354124, "grad_norm": 0.31215089559555054, "learning_rate": 1.117658488837222e-06, "loss": 0.014578827656805515, "memory(GiB)": 22.66, "step": 24437, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.7938797388168795, "grad_norm": 0.25255802273750305, "learning_rate": 1.1173200209301104e-06, "loss": 0.01029939204454422, "memory(GiB)": 22.66, "step": 24438, "token_acc": 1.0, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.7939122242796348, "grad_norm": 0.3904670178890228, "learning_rate": 1.1169815978342924e-06, "loss": 0.014926481060683727, "memory(GiB)": 22.66, "step": 24439, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.7939447097423903, "grad_norm": 0.3589336574077606, "learning_rate": 1.1166432195536764e-06, "loss": 0.012978163547813892, "memory(GiB)": 22.66, "step": 24440, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.958055 }, { "epoch": 0.7939771952051456, "grad_norm": 0.3398312032222748, "learning_rate": 1.1163048860921633e-06, "loss": 0.013375332579016685, "memory(GiB)": 22.66, "step": 24441, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.7940096806679011, "grad_norm": 0.551325261592865, "learning_rate": 1.1159665974536637e-06, "loss": 0.020064380019903183, "memory(GiB)": 22.66, "step": 24442, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.958066 }, { "epoch": 0.7940421661306565, "grad_norm": 0.28969013690948486, "learning_rate": 1.115628353642078e-06, "loss": 0.014810536056756973, "memory(GiB)": 22.66, "step": 24443, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958072 }, { "epoch": 0.794074651593412, "grad_norm": 0.3589620292186737, "learning_rate": 1.1152901546613127e-06, "loss": 0.011642567813396454, "memory(GiB)": 22.66, "step": 24444, "token_acc": 1.0, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.7941071370561673, "grad_norm": 0.32372981309890747, "learning_rate": 1.114952000515268e-06, "loss": 0.010625304654240608, "memory(GiB)": 22.66, "step": 24445, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958084 }, { "epoch": 0.7941396225189228, "grad_norm": 0.42706191539764404, "learning_rate": 1.1146138912078485e-06, "loss": 0.012433070689439774, "memory(GiB)": 22.66, "step": 24446, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.958089 }, { "epoch": 0.7941721079816781, "grad_norm": 0.3649246096611023, "learning_rate": 1.114275826742956e-06, "loss": 0.01413322240114212, "memory(GiB)": 22.66, "step": 24447, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.958094 }, { "epoch": 0.7942045934444336, "grad_norm": 0.29295089840888977, "learning_rate": 1.1139378071244934e-06, "loss": 0.008572172373533249, "memory(GiB)": 22.66, "step": 24448, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.7942370789071891, "grad_norm": 0.32868602871894836, "learning_rate": 1.1135998323563595e-06, "loss": 0.00791042298078537, "memory(GiB)": 22.66, "step": 24449, "token_acc": 1.0, "train_speed(iter/s)": 0.958105 }, { "epoch": 0.7942695643699444, "grad_norm": 0.28948986530303955, "learning_rate": 1.1132619024424568e-06, "loss": 0.01071322150528431, "memory(GiB)": 22.66, "step": 24450, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958111 }, { "epoch": 0.7943020498326999, "grad_norm": 0.4605301320552826, "learning_rate": 1.1129240173866845e-06, "loss": 0.013270609080791473, "memory(GiB)": 22.66, "step": 24451, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.7943345352954553, "grad_norm": 0.41774365305900574, "learning_rate": 1.112586177192943e-06, "loss": 0.018387366086244583, "memory(GiB)": 22.66, "step": 24452, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.7943670207582108, "grad_norm": 0.3469279110431671, "learning_rate": 1.1122483818651325e-06, "loss": 0.013005206361413002, "memory(GiB)": 22.66, "step": 24453, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.7943995062209661, "grad_norm": 0.5141575336456299, "learning_rate": 1.1119106314071486e-06, "loss": 0.017005469650030136, "memory(GiB)": 22.66, "step": 24454, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958134 }, { "epoch": 0.7944319916837216, "grad_norm": 0.25650617480278015, "learning_rate": 1.1115729258228913e-06, "loss": 0.012295561842620373, "memory(GiB)": 22.66, "step": 24455, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.7944644771464769, "grad_norm": 0.2469799518585205, "learning_rate": 1.1112352651162578e-06, "loss": 0.009860594756901264, "memory(GiB)": 22.66, "step": 24456, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.7944969626092324, "grad_norm": 0.19400082528591156, "learning_rate": 1.1108976492911471e-06, "loss": 0.0075361523777246475, "memory(GiB)": 22.66, "step": 24457, "token_acc": 1.0, "train_speed(iter/s)": 0.958149 }, { "epoch": 0.7945294480719878, "grad_norm": 0.6171834468841553, "learning_rate": 1.110560078351452e-06, "loss": 0.012490060180425644, "memory(GiB)": 22.66, "step": 24458, "token_acc": 1.0, "train_speed(iter/s)": 0.958155 }, { "epoch": 0.7945619335347432, "grad_norm": 0.5557340979576111, "learning_rate": 1.1102225523010707e-06, "loss": 0.011451678350567818, "memory(GiB)": 22.66, "step": 24459, "token_acc": 1.0, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.7945944189974986, "grad_norm": 0.42120128870010376, "learning_rate": 1.1098850711438985e-06, "loss": 0.010781355202198029, "memory(GiB)": 22.66, "step": 24460, "token_acc": 1.0, "train_speed(iter/s)": 0.958167 }, { "epoch": 0.7946269044602541, "grad_norm": 0.3421129286289215, "learning_rate": 1.1095476348838314e-06, "loss": 0.01603398099541664, "memory(GiB)": 22.66, "step": 24461, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958176 }, { "epoch": 0.7946593899230094, "grad_norm": 0.33109167218208313, "learning_rate": 1.1092102435247616e-06, "loss": 0.012639999389648438, "memory(GiB)": 22.66, "step": 24462, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.7946918753857649, "grad_norm": 0.40343108773231506, "learning_rate": 1.1088728970705853e-06, "loss": 0.012745694257318974, "memory(GiB)": 22.66, "step": 24463, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958192 }, { "epoch": 0.7947243608485203, "grad_norm": 0.3592695891857147, "learning_rate": 1.1085355955251925e-06, "loss": 0.013470068573951721, "memory(GiB)": 22.66, "step": 24464, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.9582 }, { "epoch": 0.7947568463112757, "grad_norm": 0.34111374616622925, "learning_rate": 1.1081983388924794e-06, "loss": 0.008340934291481972, "memory(GiB)": 22.66, "step": 24465, "token_acc": 1.0, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.7947893317740311, "grad_norm": 0.49496400356292725, "learning_rate": 1.107861127176339e-06, "loss": 0.016687003895640373, "memory(GiB)": 22.66, "step": 24466, "token_acc": 1.0, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.7948218172367866, "grad_norm": 0.3576395511627197, "learning_rate": 1.1075239603806603e-06, "loss": 0.014086869545280933, "memory(GiB)": 22.66, "step": 24467, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.7948543026995419, "grad_norm": 0.43546319007873535, "learning_rate": 1.1071868385093377e-06, "loss": 0.020741526037454605, "memory(GiB)": 22.66, "step": 24468, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.958233 }, { "epoch": 0.7948867881622974, "grad_norm": 0.3869112432003021, "learning_rate": 1.1068497615662571e-06, "loss": 0.015745628625154495, "memory(GiB)": 22.66, "step": 24469, "token_acc": 0.9921875, "train_speed(iter/s)": 0.958241 }, { "epoch": 0.7949192736250528, "grad_norm": 0.16999435424804688, "learning_rate": 1.1065127295553146e-06, "loss": 0.00700626615434885, "memory(GiB)": 22.66, "step": 24470, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.95825 }, { "epoch": 0.7949517590878082, "grad_norm": 0.518719494342804, "learning_rate": 1.1061757424803964e-06, "loss": 0.02414221502840519, "memory(GiB)": 22.66, "step": 24471, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.958258 }, { "epoch": 0.7949842445505636, "grad_norm": 0.370193213224411, "learning_rate": 1.105838800345394e-06, "loss": 0.014459770172834396, "memory(GiB)": 22.66, "step": 24472, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.7950167300133191, "grad_norm": 0.25924545526504517, "learning_rate": 1.1055019031541942e-06, "loss": 0.00901961512863636, "memory(GiB)": 22.66, "step": 24473, "token_acc": 1.0, "train_speed(iter/s)": 0.958274 }, { "epoch": 0.7950492154760744, "grad_norm": 0.37938782572746277, "learning_rate": 1.1051650509106854e-06, "loss": 0.009347643703222275, "memory(GiB)": 22.66, "step": 24474, "token_acc": 1.0, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.7950817009388299, "grad_norm": 0.2964237928390503, "learning_rate": 1.1048282436187557e-06, "loss": 0.011081041768193245, "memory(GiB)": 22.66, "step": 24475, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95829 }, { "epoch": 0.7951141864015853, "grad_norm": 0.287560373544693, "learning_rate": 1.1044914812822944e-06, "loss": 0.008663071319460869, "memory(GiB)": 22.66, "step": 24476, "token_acc": 1.0, "train_speed(iter/s)": 0.958298 }, { "epoch": 0.7951466718643407, "grad_norm": 0.32192957401275635, "learning_rate": 1.1041547639051848e-06, "loss": 0.009865673258900642, "memory(GiB)": 22.66, "step": 24477, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.958306 }, { "epoch": 0.7951791573270961, "grad_norm": 0.41695380210876465, "learning_rate": 1.1038180914913143e-06, "loss": 0.014478788711130619, "memory(GiB)": 22.66, "step": 24478, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958314 }, { "epoch": 0.7952116427898516, "grad_norm": 0.4051300883293152, "learning_rate": 1.103481464044569e-06, "loss": 0.012607126496732235, "memory(GiB)": 22.66, "step": 24479, "token_acc": 0.9892857142857143, "train_speed(iter/s)": 0.958322 }, { "epoch": 0.7952441282526069, "grad_norm": 0.3319302797317505, "learning_rate": 1.1031448815688355e-06, "loss": 0.01311506051570177, "memory(GiB)": 22.66, "step": 24480, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.7952766137153624, "grad_norm": 0.4660256803035736, "learning_rate": 1.1028083440679954e-06, "loss": 0.013389632105827332, "memory(GiB)": 22.66, "step": 24481, "token_acc": 1.0, "train_speed(iter/s)": 0.958338 }, { "epoch": 0.7953090991781178, "grad_norm": 0.3102221190929413, "learning_rate": 1.1024718515459338e-06, "loss": 0.010305986739695072, "memory(GiB)": 22.66, "step": 24482, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.958346 }, { "epoch": 0.7953415846408732, "grad_norm": 0.20833253860473633, "learning_rate": 1.1021354040065345e-06, "loss": 0.009821919724345207, "memory(GiB)": 22.66, "step": 24483, "token_acc": 0.995, "train_speed(iter/s)": 0.958354 }, { "epoch": 0.7953740701036286, "grad_norm": 0.3684252202510834, "learning_rate": 1.101799001453681e-06, "loss": 0.01193708274513483, "memory(GiB)": 22.66, "step": 24484, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.7954065555663841, "grad_norm": 0.26875436305999756, "learning_rate": 1.1014626438912572e-06, "loss": 0.00966574065387249, "memory(GiB)": 22.66, "step": 24485, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958371 }, { "epoch": 0.7954390410291394, "grad_norm": 0.4856441020965576, "learning_rate": 1.1011263313231418e-06, "loss": 0.018447019159793854, "memory(GiB)": 22.66, "step": 24486, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958379 }, { "epoch": 0.7954715264918949, "grad_norm": 0.34029820561408997, "learning_rate": 1.1007900637532177e-06, "loss": 0.011623529717326164, "memory(GiB)": 22.66, "step": 24487, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958387 }, { "epoch": 0.7955040119546503, "grad_norm": 0.4270464777946472, "learning_rate": 1.1004538411853666e-06, "loss": 0.013136796653270721, "memory(GiB)": 22.66, "step": 24488, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.7955364974174057, "grad_norm": 0.5010902881622314, "learning_rate": 1.1001176636234688e-06, "loss": 0.017616238445043564, "memory(GiB)": 22.66, "step": 24489, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958403 }, { "epoch": 0.7955689828801611, "grad_norm": 0.3106914758682251, "learning_rate": 1.0997815310714039e-06, "loss": 0.012870880775153637, "memory(GiB)": 22.66, "step": 24490, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.958411 }, { "epoch": 0.7956014683429166, "grad_norm": 0.439671128988266, "learning_rate": 1.0994454435330514e-06, "loss": 0.016618814319372177, "memory(GiB)": 22.66, "step": 24491, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958419 }, { "epoch": 0.7956339538056719, "grad_norm": 0.2213938683271408, "learning_rate": 1.0991094010122882e-06, "loss": 0.009566363878548145, "memory(GiB)": 22.66, "step": 24492, "token_acc": 1.0, "train_speed(iter/s)": 0.958428 }, { "epoch": 0.7956664392684274, "grad_norm": 0.20504969358444214, "learning_rate": 1.0987734035129972e-06, "loss": 0.007790534757077694, "memory(GiB)": 22.66, "step": 24493, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958436 }, { "epoch": 0.7956989247311828, "grad_norm": 0.35217228531837463, "learning_rate": 1.0984374510390522e-06, "loss": 0.015699168667197227, "memory(GiB)": 22.66, "step": 24494, "token_acc": 1.0, "train_speed(iter/s)": 0.958444 }, { "epoch": 0.7957314101939382, "grad_norm": 0.3856555223464966, "learning_rate": 1.0981015435943333e-06, "loss": 0.010211889632046223, "memory(GiB)": 22.66, "step": 24495, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.7957638956566936, "grad_norm": 0.3346945345401764, "learning_rate": 1.0977656811827148e-06, "loss": 0.012287448160350323, "memory(GiB)": 22.66, "step": 24496, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.95846 }, { "epoch": 0.7957963811194491, "grad_norm": 0.46885189414024353, "learning_rate": 1.097429863808072e-06, "loss": 0.013536146841943264, "memory(GiB)": 22.66, "step": 24497, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.958467 }, { "epoch": 0.7958288665822044, "grad_norm": 0.40828898549079895, "learning_rate": 1.0970940914742867e-06, "loss": 0.012777949683368206, "memory(GiB)": 22.66, "step": 24498, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958475 }, { "epoch": 0.7958613520449599, "grad_norm": 0.3212814927101135, "learning_rate": 1.0967583641852287e-06, "loss": 0.017348498106002808, "memory(GiB)": 22.66, "step": 24499, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.958483 }, { "epoch": 0.7958938375077153, "grad_norm": 0.3893705904483795, "learning_rate": 1.0964226819447755e-06, "loss": 0.016552135348320007, "memory(GiB)": 22.66, "step": 24500, "token_acc": 1.0, "train_speed(iter/s)": 0.958491 }, { "epoch": 0.7958938375077153, "eval_loss": 0.012697945348918438, "eval_runtime": 81.407, "eval_samples_per_second": 122.225, "eval_steps_per_second": 3.82, "eval_token_acc": 0.9948774447973888, "step": 24500 }, { "epoch": 0.7959263229704707, "grad_norm": 0.3514114022254944, "learning_rate": 1.096087044756799e-06, "loss": 0.013978498056530952, "memory(GiB)": 22.66, "step": 24501, "token_acc": 0.9944963925359035, "train_speed(iter/s)": 0.955063 }, { "epoch": 0.7959588084332261, "grad_norm": 0.30571630597114563, "learning_rate": 1.0957514526251739e-06, "loss": 0.00853846874088049, "memory(GiB)": 22.66, "step": 24502, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955071 }, { "epoch": 0.7959912938959816, "grad_norm": 0.2722032368183136, "learning_rate": 1.095415905553774e-06, "loss": 0.011625710874795914, "memory(GiB)": 22.66, "step": 24503, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955079 }, { "epoch": 0.7960237793587369, "grad_norm": 0.41814687848091125, "learning_rate": 1.0950804035464725e-06, "loss": 0.012394196353852749, "memory(GiB)": 22.66, "step": 24504, "token_acc": 1.0, "train_speed(iter/s)": 0.955088 }, { "epoch": 0.7960562648214924, "grad_norm": 0.3547631800174713, "learning_rate": 1.094744946607139e-06, "loss": 0.015655847266316414, "memory(GiB)": 22.66, "step": 24505, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.7960887502842477, "grad_norm": 0.339508056640625, "learning_rate": 1.094409534739647e-06, "loss": 0.005877785384654999, "memory(GiB)": 22.66, "step": 24506, "token_acc": 1.0, "train_speed(iter/s)": 0.955104 }, { "epoch": 0.7961212357470032, "grad_norm": 0.29796531796455383, "learning_rate": 1.0940741679478678e-06, "loss": 0.010113779455423355, "memory(GiB)": 22.66, "step": 24507, "token_acc": 1.0, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.7961537212097586, "grad_norm": 0.2532845437526703, "learning_rate": 1.0937388462356724e-06, "loss": 0.010821813717484474, "memory(GiB)": 22.66, "step": 24508, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955121 }, { "epoch": 0.796186206672514, "grad_norm": 0.27917811274528503, "learning_rate": 1.0934035696069284e-06, "loss": 0.01325253676623106, "memory(GiB)": 22.66, "step": 24509, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.95512 }, { "epoch": 0.7962186921352694, "grad_norm": 0.431150883436203, "learning_rate": 1.093068338065507e-06, "loss": 0.013642638921737671, "memory(GiB)": 22.66, "step": 24510, "token_acc": 1.0, "train_speed(iter/s)": 0.955129 }, { "epoch": 0.7962511775980249, "grad_norm": 0.3567095398902893, "learning_rate": 1.0927331516152773e-06, "loss": 0.012639486230909824, "memory(GiB)": 22.66, "step": 24511, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.955136 }, { "epoch": 0.7962836630607804, "grad_norm": 0.33173367381095886, "learning_rate": 1.0923980102601072e-06, "loss": 0.012269390746951103, "memory(GiB)": 22.66, "step": 24512, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955143 }, { "epoch": 0.7963161485235357, "grad_norm": 0.37317538261413574, "learning_rate": 1.092062914003867e-06, "loss": 0.013616091571748257, "memory(GiB)": 22.66, "step": 24513, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955149 }, { "epoch": 0.7963486339862912, "grad_norm": 0.39865046739578247, "learning_rate": 1.0917278628504201e-06, "loss": 0.015986161306500435, "memory(GiB)": 22.66, "step": 24514, "token_acc": 0.9866220735785953, "train_speed(iter/s)": 0.955156 }, { "epoch": 0.7963811194490465, "grad_norm": 0.3130245506763458, "learning_rate": 1.0913928568036358e-06, "loss": 0.00977306254208088, "memory(GiB)": 22.66, "step": 24515, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955162 }, { "epoch": 0.796413604911802, "grad_norm": 0.3590072989463806, "learning_rate": 1.0910578958673807e-06, "loss": 0.013801690191030502, "memory(GiB)": 22.66, "step": 24516, "token_acc": 1.0, "train_speed(iter/s)": 0.955169 }, { "epoch": 0.7964460903745574, "grad_norm": 0.28805267810821533, "learning_rate": 1.0907229800455216e-06, "loss": 0.00818142294883728, "memory(GiB)": 22.66, "step": 24517, "token_acc": 1.0, "train_speed(iter/s)": 0.955175 }, { "epoch": 0.7964785758373129, "grad_norm": 0.3164918124675751, "learning_rate": 1.0903881093419204e-06, "loss": 0.011652622371912003, "memory(GiB)": 22.66, "step": 24518, "token_acc": 1.0, "train_speed(iter/s)": 0.955182 }, { "epoch": 0.7965110613000682, "grad_norm": 0.8157247304916382, "learning_rate": 1.0900532837604467e-06, "loss": 0.016835976392030716, "memory(GiB)": 22.66, "step": 24519, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955188 }, { "epoch": 0.7965435467628237, "grad_norm": 0.3620685338973999, "learning_rate": 1.0897185033049583e-06, "loss": 0.01555931381881237, "memory(GiB)": 22.66, "step": 24520, "token_acc": 0.9862068965517241, "train_speed(iter/s)": 0.955195 }, { "epoch": 0.796576032225579, "grad_norm": 0.6290795207023621, "learning_rate": 1.0893837679793261e-06, "loss": 0.011653877794742584, "memory(GiB)": 22.66, "step": 24521, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955201 }, { "epoch": 0.7966085176883345, "grad_norm": 0.2596692740917206, "learning_rate": 1.0890490777874091e-06, "loss": 0.008702327497303486, "memory(GiB)": 22.66, "step": 24522, "token_acc": 1.0, "train_speed(iter/s)": 0.955208 }, { "epoch": 0.7966410031510899, "grad_norm": 0.2634550631046295, "learning_rate": 1.0887144327330724e-06, "loss": 0.009594939649105072, "memory(GiB)": 22.66, "step": 24523, "token_acc": 1.0, "train_speed(iter/s)": 0.955214 }, { "epoch": 0.7966734886138453, "grad_norm": 0.2597130835056305, "learning_rate": 1.0883798328201762e-06, "loss": 0.010623148642480373, "memory(GiB)": 22.66, "step": 24524, "token_acc": 1.0, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.7967059740766007, "grad_norm": 0.3115696907043457, "learning_rate": 1.0880452780525825e-06, "loss": 0.013675621710717678, "memory(GiB)": 22.66, "step": 24525, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.7967384595393562, "grad_norm": 0.3116399645805359, "learning_rate": 1.0877107684341532e-06, "loss": 0.006830236408859491, "memory(GiB)": 22.66, "step": 24526, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955233 }, { "epoch": 0.7967709450021115, "grad_norm": 0.31346821784973145, "learning_rate": 1.0873763039687485e-06, "loss": 0.009722849354147911, "memory(GiB)": 22.66, "step": 24527, "token_acc": 1.0, "train_speed(iter/s)": 0.955239 }, { "epoch": 0.796803430464867, "grad_norm": 0.29371702671051025, "learning_rate": 1.0870418846602309e-06, "loss": 0.009833182208240032, "memory(GiB)": 22.66, "step": 24528, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955245 }, { "epoch": 0.7968359159276224, "grad_norm": 0.1902552992105484, "learning_rate": 1.0867075105124565e-06, "loss": 0.008658175356686115, "memory(GiB)": 22.66, "step": 24529, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955251 }, { "epoch": 0.7968684013903778, "grad_norm": 0.33194759488105774, "learning_rate": 1.086373181529286e-06, "loss": 0.009323816746473312, "memory(GiB)": 22.66, "step": 24530, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955255 }, { "epoch": 0.7969008868531332, "grad_norm": 0.3795997202396393, "learning_rate": 1.086038897714578e-06, "loss": 0.011221850290894508, "memory(GiB)": 22.66, "step": 24531, "token_acc": 0.996, "train_speed(iter/s)": 0.955261 }, { "epoch": 0.7969333723158887, "grad_norm": 0.464584082365036, "learning_rate": 1.085704659072192e-06, "loss": 0.013804960064589977, "memory(GiB)": 22.66, "step": 24532, "token_acc": 1.0, "train_speed(iter/s)": 0.955266 }, { "epoch": 0.796965857778644, "grad_norm": 0.27886494994163513, "learning_rate": 1.0853704656059828e-06, "loss": 0.008663070388138294, "memory(GiB)": 22.66, "step": 24533, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955272 }, { "epoch": 0.7969983432413995, "grad_norm": 0.33590543270111084, "learning_rate": 1.0850363173198086e-06, "loss": 0.008679969236254692, "memory(GiB)": 22.66, "step": 24534, "token_acc": 1.0, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.7970308287041549, "grad_norm": 0.25013452768325806, "learning_rate": 1.0847022142175267e-06, "loss": 0.006450011860579252, "memory(GiB)": 22.66, "step": 24535, "token_acc": 1.0, "train_speed(iter/s)": 0.955283 }, { "epoch": 0.7970633141669103, "grad_norm": 0.4314124882221222, "learning_rate": 1.0843681563029935e-06, "loss": 0.011862391605973244, "memory(GiB)": 22.66, "step": 24536, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955289 }, { "epoch": 0.7970957996296657, "grad_norm": 0.29731905460357666, "learning_rate": 1.0840341435800617e-06, "loss": 0.011187031865119934, "memory(GiB)": 22.66, "step": 24537, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955295 }, { "epoch": 0.7971282850924212, "grad_norm": 0.3510940968990326, "learning_rate": 1.0837001760525884e-06, "loss": 0.011040430516004562, "memory(GiB)": 22.66, "step": 24538, "token_acc": 1.0, "train_speed(iter/s)": 0.955301 }, { "epoch": 0.7971607705551765, "grad_norm": 0.24327298998832703, "learning_rate": 1.0833662537244276e-06, "loss": 0.006654412019997835, "memory(GiB)": 22.66, "step": 24539, "token_acc": 1.0, "train_speed(iter/s)": 0.955307 }, { "epoch": 0.797193256017932, "grad_norm": 0.36147940158843994, "learning_rate": 1.083032376599435e-06, "loss": 0.01119522750377655, "memory(GiB)": 22.66, "step": 24540, "token_acc": 1.0, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.7972257414806874, "grad_norm": 0.35015586018562317, "learning_rate": 1.0826985446814603e-06, "loss": 0.010533015243709087, "memory(GiB)": 22.66, "step": 24541, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.7972582269434428, "grad_norm": 0.3894248306751251, "learning_rate": 1.0823647579743595e-06, "loss": 0.015091201290488243, "memory(GiB)": 22.66, "step": 24542, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955326 }, { "epoch": 0.7972907124061982, "grad_norm": 0.380576491355896, "learning_rate": 1.082031016481982e-06, "loss": 0.012542123906314373, "memory(GiB)": 22.66, "step": 24543, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.7973231978689537, "grad_norm": 0.31748875975608826, "learning_rate": 1.081697320208182e-06, "loss": 0.01138489693403244, "memory(GiB)": 22.66, "step": 24544, "token_acc": 1.0, "train_speed(iter/s)": 0.955339 }, { "epoch": 0.797355683331709, "grad_norm": 0.22241243720054626, "learning_rate": 1.0813636691568114e-06, "loss": 0.008695404976606369, "memory(GiB)": 22.66, "step": 24545, "token_acc": 1.0, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.7973881687944645, "grad_norm": 0.4030798375606537, "learning_rate": 1.0810300633317189e-06, "loss": 0.014033777639269829, "memory(GiB)": 22.66, "step": 24546, "token_acc": 1.0, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.7974206542572199, "grad_norm": 0.271518737077713, "learning_rate": 1.0806965027367566e-06, "loss": 0.008846908807754517, "memory(GiB)": 22.66, "step": 24547, "token_acc": 1.0, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.7974531397199753, "grad_norm": 0.3243655562400818, "learning_rate": 1.0803629873757709e-06, "loss": 0.012043342925608158, "memory(GiB)": 22.66, "step": 24548, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.7974856251827307, "grad_norm": 0.32407817244529724, "learning_rate": 1.0800295172526154e-06, "loss": 0.015002603642642498, "memory(GiB)": 22.66, "step": 24549, "token_acc": 0.9923076923076923, "train_speed(iter/s)": 0.95538 }, { "epoch": 0.7975181106454862, "grad_norm": 0.45324158668518066, "learning_rate": 1.0796960923711358e-06, "loss": 0.015243355184793472, "memory(GiB)": 22.66, "step": 24550, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955388 }, { "epoch": 0.7975505961082415, "grad_norm": 0.29234054684638977, "learning_rate": 1.079362712735183e-06, "loss": 0.012548533268272877, "memory(GiB)": 22.66, "step": 24551, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955396 }, { "epoch": 0.797583081570997, "grad_norm": 0.3895333409309387, "learning_rate": 1.079029378348601e-06, "loss": 0.010707962326705456, "memory(GiB)": 22.66, "step": 24552, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.7976155670337524, "grad_norm": 0.24007061123847961, "learning_rate": 1.0786960892152387e-06, "loss": 0.008034502156078815, "memory(GiB)": 22.66, "step": 24553, "token_acc": 1.0, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.7976480524965078, "grad_norm": 0.26565125584602356, "learning_rate": 1.0783628453389427e-06, "loss": 0.009833933785557747, "memory(GiB)": 22.66, "step": 24554, "token_acc": 1.0, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.7976805379592632, "grad_norm": 0.24095800518989563, "learning_rate": 1.0780296467235612e-06, "loss": 0.008030148223042488, "memory(GiB)": 22.66, "step": 24555, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955429 }, { "epoch": 0.7977130234220187, "grad_norm": 0.24106138944625854, "learning_rate": 1.0776964933729356e-06, "loss": 0.008690329268574715, "memory(GiB)": 22.66, "step": 24556, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955438 }, { "epoch": 0.797745508884774, "grad_norm": 0.42180299758911133, "learning_rate": 1.0773633852909132e-06, "loss": 0.01809987798333168, "memory(GiB)": 22.66, "step": 24557, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.955446 }, { "epoch": 0.7977779943475295, "grad_norm": 0.28894567489624023, "learning_rate": 1.0770303224813389e-06, "loss": 0.009913611225783825, "memory(GiB)": 22.66, "step": 24558, "token_acc": 1.0, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.7978104798102849, "grad_norm": 0.4237997829914093, "learning_rate": 1.0766973049480556e-06, "loss": 0.01660921610891819, "memory(GiB)": 22.66, "step": 24559, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955463 }, { "epoch": 0.7978429652730403, "grad_norm": 0.3546450734138489, "learning_rate": 1.0763643326949086e-06, "loss": 0.013179875910282135, "memory(GiB)": 22.66, "step": 24560, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.7978754507357957, "grad_norm": 0.3091718256473541, "learning_rate": 1.0760314057257387e-06, "loss": 0.010964417830109596, "memory(GiB)": 22.66, "step": 24561, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.7979079361985512, "grad_norm": 0.47801944613456726, "learning_rate": 1.0756985240443884e-06, "loss": 0.022425802424550056, "memory(GiB)": 22.66, "step": 24562, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955486 }, { "epoch": 0.7979404216613065, "grad_norm": 0.472649484872818, "learning_rate": 1.0753656876547008e-06, "loss": 0.016942324116826057, "memory(GiB)": 22.66, "step": 24563, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.797972907124062, "grad_norm": 0.35759100317955017, "learning_rate": 1.0750328965605183e-06, "loss": 0.010748534463346004, "memory(GiB)": 22.66, "step": 24564, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955503 }, { "epoch": 0.7980053925868174, "grad_norm": 0.5955877900123596, "learning_rate": 1.074700150765679e-06, "loss": 0.014547733590006828, "memory(GiB)": 22.66, "step": 24565, "token_acc": 1.0, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.7980378780495728, "grad_norm": 0.5914284586906433, "learning_rate": 1.0743674502740242e-06, "loss": 0.01736173778772354, "memory(GiB)": 22.66, "step": 24566, "token_acc": 1.0, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.7980703635123282, "grad_norm": 0.1781303733587265, "learning_rate": 1.0740347950893943e-06, "loss": 0.005201103165745735, "memory(GiB)": 22.66, "step": 24567, "token_acc": 1.0, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.7981028489750837, "grad_norm": 0.31397417187690735, "learning_rate": 1.0737021852156294e-06, "loss": 0.009915090166032314, "memory(GiB)": 22.66, "step": 24568, "token_acc": 1.0, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.798135334437839, "grad_norm": 0.6634451746940613, "learning_rate": 1.0733696206565665e-06, "loss": 0.011758403852581978, "memory(GiB)": 22.66, "step": 24569, "token_acc": 1.0, "train_speed(iter/s)": 0.955543 }, { "epoch": 0.7981678199005945, "grad_norm": 0.48572787642478943, "learning_rate": 1.0730371014160452e-06, "loss": 0.016230247914791107, "memory(GiB)": 22.66, "step": 24570, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.7982003053633498, "grad_norm": 0.2931849956512451, "learning_rate": 1.0727046274978998e-06, "loss": 0.012222436256706715, "memory(GiB)": 22.66, "step": 24571, "token_acc": 0.993103448275862, "train_speed(iter/s)": 0.95556 }, { "epoch": 0.7982327908261053, "grad_norm": 0.44574323296546936, "learning_rate": 1.0723721989059731e-06, "loss": 0.012740574777126312, "memory(GiB)": 22.66, "step": 24572, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.7982652762888607, "grad_norm": 0.3864484429359436, "learning_rate": 1.0720398156440976e-06, "loss": 0.021704494953155518, "memory(GiB)": 22.66, "step": 24573, "token_acc": 1.0, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.7982977617516162, "grad_norm": 0.4427264630794525, "learning_rate": 1.0717074777161106e-06, "loss": 0.012331303209066391, "memory(GiB)": 22.66, "step": 24574, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955584 }, { "epoch": 0.7983302472143715, "grad_norm": 0.5194135904312134, "learning_rate": 1.071375185125849e-06, "loss": 0.013158155605196953, "memory(GiB)": 22.66, "step": 24575, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.798362732677127, "grad_norm": 0.3676297068595886, "learning_rate": 1.0710429378771441e-06, "loss": 0.01184767484664917, "memory(GiB)": 22.66, "step": 24576, "token_acc": 0.988, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.7983952181398825, "grad_norm": 0.5348950028419495, "learning_rate": 1.0707107359738362e-06, "loss": 0.013253092765808105, "memory(GiB)": 22.66, "step": 24577, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.7984277036026378, "grad_norm": 0.4036548435688019, "learning_rate": 1.0703785794197547e-06, "loss": 0.009541606530547142, "memory(GiB)": 22.66, "step": 24578, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.7984601890653933, "grad_norm": 0.3901045322418213, "learning_rate": 1.0700464682187362e-06, "loss": 0.01384813990443945, "memory(GiB)": 22.66, "step": 24579, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.955621 }, { "epoch": 0.7984926745281486, "grad_norm": 0.35275474190711975, "learning_rate": 1.069714402374611e-06, "loss": 0.007939827628433704, "memory(GiB)": 22.66, "step": 24580, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.955628 }, { "epoch": 0.7985251599909041, "grad_norm": 0.3317528963088989, "learning_rate": 1.0693823818912125e-06, "loss": 0.008782332763075829, "memory(GiB)": 22.66, "step": 24581, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.7985576454536595, "grad_norm": 0.24737752974033356, "learning_rate": 1.0690504067723729e-06, "loss": 0.011414699256420135, "memory(GiB)": 22.66, "step": 24582, "token_acc": 1.0, "train_speed(iter/s)": 0.955642 }, { "epoch": 0.798590130916415, "grad_norm": 0.45572930574417114, "learning_rate": 1.068718477021925e-06, "loss": 0.015693718567490578, "memory(GiB)": 22.66, "step": 24583, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.7986226163791703, "grad_norm": 0.32982075214385986, "learning_rate": 1.0683865926436977e-06, "loss": 0.00826563872396946, "memory(GiB)": 22.66, "step": 24584, "token_acc": 1.0, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.7986551018419258, "grad_norm": 0.39461079239845276, "learning_rate": 1.0680547536415214e-06, "loss": 0.016824930906295776, "memory(GiB)": 22.66, "step": 24585, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955662 }, { "epoch": 0.7986875873046811, "grad_norm": 0.23536011576652527, "learning_rate": 1.0677229600192268e-06, "loss": 0.00807502493262291, "memory(GiB)": 22.66, "step": 24586, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.7987200727674366, "grad_norm": 0.32836592197418213, "learning_rate": 1.0673912117806445e-06, "loss": 0.008338537998497486, "memory(GiB)": 22.66, "step": 24587, "token_acc": 1.0, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.798752558230192, "grad_norm": 0.2660430669784546, "learning_rate": 1.0670595089296004e-06, "loss": 0.009324794635176659, "memory(GiB)": 22.66, "step": 24588, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955679 }, { "epoch": 0.7987850436929474, "grad_norm": 0.33518198132514954, "learning_rate": 1.0667278514699248e-06, "loss": 0.014624186791479588, "memory(GiB)": 22.66, "step": 24589, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.7988175291557028, "grad_norm": 0.33989718556404114, "learning_rate": 1.0663962394054445e-06, "loss": 0.011650331318378448, "memory(GiB)": 22.66, "step": 24590, "token_acc": 1.0, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.7988500146184583, "grad_norm": 0.45325762033462524, "learning_rate": 1.066064672739987e-06, "loss": 0.010179783217608929, "memory(GiB)": 22.66, "step": 24591, "token_acc": 1.0, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.7988825000812136, "grad_norm": 0.4242748022079468, "learning_rate": 1.065733151477381e-06, "loss": 0.012077691033482552, "memory(GiB)": 22.66, "step": 24592, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955701 }, { "epoch": 0.7989149855439691, "grad_norm": 0.5562472343444824, "learning_rate": 1.0654016756214497e-06, "loss": 0.012850189581513405, "memory(GiB)": 22.66, "step": 24593, "token_acc": 1.0, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.7989474710067245, "grad_norm": 0.30549246072769165, "learning_rate": 1.0650702451760197e-06, "loss": 0.012908309698104858, "memory(GiB)": 22.66, "step": 24594, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.79897995646948, "grad_norm": 0.39537549018859863, "learning_rate": 1.0647388601449166e-06, "loss": 0.013773947954177856, "memory(GiB)": 22.66, "step": 24595, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955717 }, { "epoch": 0.7990124419322353, "grad_norm": 0.49179020524024963, "learning_rate": 1.064407520531966e-06, "loss": 0.018120288848876953, "memory(GiB)": 22.66, "step": 24596, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.7990449273949908, "grad_norm": 0.4524563252925873, "learning_rate": 1.06407622634099e-06, "loss": 0.016273099929094315, "memory(GiB)": 22.66, "step": 24597, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955728 }, { "epoch": 0.7990774128577461, "grad_norm": 0.2585150897502899, "learning_rate": 1.0637449775758141e-06, "loss": 0.009374291636049747, "memory(GiB)": 22.66, "step": 24598, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.955734 }, { "epoch": 0.7991098983205016, "grad_norm": 0.3540028929710388, "learning_rate": 1.0634137742402573e-06, "loss": 0.013951843604445457, "memory(GiB)": 22.66, "step": 24599, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.799142383783257, "grad_norm": 0.47022363543510437, "learning_rate": 1.0630826163381481e-06, "loss": 0.022511128336191177, "memory(GiB)": 22.66, "step": 24600, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.7991748692460124, "grad_norm": 0.4688105285167694, "learning_rate": 1.0627515038733045e-06, "loss": 0.011910817585885525, "memory(GiB)": 22.66, "step": 24601, "token_acc": 1.0, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.7992073547087678, "grad_norm": 0.21729065477848053, "learning_rate": 1.06242043684955e-06, "loss": 0.011681427247822285, "memory(GiB)": 22.66, "step": 24602, "token_acc": 1.0, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.7992398401715233, "grad_norm": 0.3255836069583893, "learning_rate": 1.0620894152707028e-06, "loss": 0.011424247175455093, "memory(GiB)": 22.66, "step": 24603, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955765 }, { "epoch": 0.7992723256342786, "grad_norm": 0.22470061480998993, "learning_rate": 1.0617584391405844e-06, "loss": 0.009615978226065636, "memory(GiB)": 22.66, "step": 24604, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.7993048110970341, "grad_norm": 0.4024839699268341, "learning_rate": 1.0614275084630182e-06, "loss": 0.01427600346505642, "memory(GiB)": 22.66, "step": 24605, "token_acc": 1.0, "train_speed(iter/s)": 0.95578 }, { "epoch": 0.7993372965597895, "grad_norm": 0.4399432837963104, "learning_rate": 1.061096623241819e-06, "loss": 0.013846774585545063, "memory(GiB)": 22.66, "step": 24606, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955788 }, { "epoch": 0.7993697820225449, "grad_norm": 0.20896495878696442, "learning_rate": 1.0607657834808089e-06, "loss": 0.007760751061141491, "memory(GiB)": 22.66, "step": 24607, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.7994022674853003, "grad_norm": 0.25798606872558594, "learning_rate": 1.0604349891838034e-06, "loss": 0.006769641302525997, "memory(GiB)": 22.66, "step": 24608, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955805 }, { "epoch": 0.7994347529480558, "grad_norm": 0.29328233003616333, "learning_rate": 1.060104240354622e-06, "loss": 0.008297627791762352, "memory(GiB)": 22.66, "step": 24609, "token_acc": 1.0, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.7994672384108111, "grad_norm": 0.34889090061187744, "learning_rate": 1.0597735369970814e-06, "loss": 0.012041395530104637, "memory(GiB)": 22.66, "step": 24610, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.7994997238735666, "grad_norm": 0.5314123630523682, "learning_rate": 1.0594428791149997e-06, "loss": 0.017502084374427795, "memory(GiB)": 22.66, "step": 24611, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.799532209336322, "grad_norm": 0.49922436475753784, "learning_rate": 1.0591122667121912e-06, "loss": 0.014418880455195904, "memory(GiB)": 22.66, "step": 24612, "token_acc": 0.996, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.7995646947990774, "grad_norm": 0.6693564057350159, "learning_rate": 1.058781699792472e-06, "loss": 0.013313943520188332, "memory(GiB)": 22.66, "step": 24613, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.7995971802618328, "grad_norm": 0.46686500310897827, "learning_rate": 1.0584511783596579e-06, "loss": 0.014467142522335052, "memory(GiB)": 22.66, "step": 24614, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955855 }, { "epoch": 0.7996296657245883, "grad_norm": 0.3200077414512634, "learning_rate": 1.058120702417565e-06, "loss": 0.013330187648534775, "memory(GiB)": 22.66, "step": 24615, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955863 }, { "epoch": 0.7996621511873436, "grad_norm": 0.3498998284339905, "learning_rate": 1.057790271970004e-06, "loss": 0.011002975516021252, "memory(GiB)": 22.66, "step": 24616, "token_acc": 1.0, "train_speed(iter/s)": 0.955872 }, { "epoch": 0.7996946366500991, "grad_norm": 0.3296642601490021, "learning_rate": 1.0574598870207907e-06, "loss": 0.013591939583420753, "memory(GiB)": 22.66, "step": 24617, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.7997271221128545, "grad_norm": 0.47478216886520386, "learning_rate": 1.0571295475737375e-06, "loss": 0.014429965987801552, "memory(GiB)": 22.66, "step": 24618, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955888 }, { "epoch": 0.7997596075756099, "grad_norm": 0.2554490268230438, "learning_rate": 1.0567992536326588e-06, "loss": 0.009604429826140404, "memory(GiB)": 22.66, "step": 24619, "token_acc": 1.0, "train_speed(iter/s)": 0.955896 }, { "epoch": 0.7997920930383653, "grad_norm": 0.33294957876205444, "learning_rate": 1.0564690052013637e-06, "loss": 0.013834178447723389, "memory(GiB)": 22.66, "step": 24620, "token_acc": 1.0, "train_speed(iter/s)": 0.955904 }, { "epoch": 0.7998245785011208, "grad_norm": 0.2500932514667511, "learning_rate": 1.0561388022836655e-06, "loss": 0.014146389439702034, "memory(GiB)": 22.66, "step": 24621, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.7998570639638761, "grad_norm": 0.3045482337474823, "learning_rate": 1.0558086448833744e-06, "loss": 0.010090786032378674, "memory(GiB)": 22.66, "step": 24622, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.7998895494266316, "grad_norm": 0.4179048240184784, "learning_rate": 1.055478533004301e-06, "loss": 0.012013457715511322, "memory(GiB)": 22.66, "step": 24623, "token_acc": 1.0, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.799922034889387, "grad_norm": 0.4022219479084015, "learning_rate": 1.0551484666502576e-06, "loss": 0.014600815251469612, "memory(GiB)": 22.66, "step": 24624, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955938 }, { "epoch": 0.7999545203521424, "grad_norm": 0.32033127546310425, "learning_rate": 1.0548184458250494e-06, "loss": 0.011506196111440659, "memory(GiB)": 22.66, "step": 24625, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.7999870058148978, "grad_norm": 0.33561262488365173, "learning_rate": 1.0544884705324893e-06, "loss": 0.011041020974516869, "memory(GiB)": 22.66, "step": 24626, "token_acc": 1.0, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.8000194912776533, "grad_norm": 0.4001062512397766, "learning_rate": 1.0541585407763804e-06, "loss": 0.01681762933731079, "memory(GiB)": 22.66, "step": 24627, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.8000519767404086, "grad_norm": 0.49090662598609924, "learning_rate": 1.0538286565605365e-06, "loss": 0.010060778819024563, "memory(GiB)": 22.66, "step": 24628, "token_acc": 1.0, "train_speed(iter/s)": 0.955972 }, { "epoch": 0.8000844622031641, "grad_norm": 0.4389113783836365, "learning_rate": 1.0534988178887618e-06, "loss": 0.012609078548848629, "memory(GiB)": 22.66, "step": 24629, "token_acc": 1.0, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.8001169476659195, "grad_norm": 0.3383038640022278, "learning_rate": 1.053169024764864e-06, "loss": 0.008845354430377483, "memory(GiB)": 22.66, "step": 24630, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.8001494331286749, "grad_norm": 0.41094279289245605, "learning_rate": 1.0528392771926477e-06, "loss": 0.01436311099678278, "memory(GiB)": 22.66, "step": 24631, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.8001819185914303, "grad_norm": 0.3331161141395569, "learning_rate": 1.05250957517592e-06, "loss": 0.01335262879729271, "memory(GiB)": 22.66, "step": 24632, "token_acc": 1.0, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.8002144040541858, "grad_norm": 0.41252461075782776, "learning_rate": 1.0521799187184862e-06, "loss": 0.010421660728752613, "memory(GiB)": 22.66, "step": 24633, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.8002468895169411, "grad_norm": 0.32206642627716064, "learning_rate": 1.0518503078241515e-06, "loss": 0.00889684073626995, "memory(GiB)": 22.66, "step": 24634, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.8002793749796966, "grad_norm": 0.39880597591400146, "learning_rate": 1.0515207424967178e-06, "loss": 0.01682676374912262, "memory(GiB)": 22.66, "step": 24635, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.800311860442452, "grad_norm": 0.49654483795166016, "learning_rate": 1.0511912227399906e-06, "loss": 0.013446413911879063, "memory(GiB)": 22.66, "step": 24636, "token_acc": 1.0, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.8003443459052074, "grad_norm": 0.6756476163864136, "learning_rate": 1.0508617485577722e-06, "loss": 0.01889844611287117, "memory(GiB)": 22.66, "step": 24637, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956046 }, { "epoch": 0.8003768313679628, "grad_norm": 0.313595712184906, "learning_rate": 1.0505323199538654e-06, "loss": 0.014355357736349106, "memory(GiB)": 22.66, "step": 24638, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.8004093168307183, "grad_norm": 0.21001194417476654, "learning_rate": 1.0502029369320738e-06, "loss": 0.008573738858103752, "memory(GiB)": 22.66, "step": 24639, "token_acc": 1.0, "train_speed(iter/s)": 0.956061 }, { "epoch": 0.8004418022934737, "grad_norm": 0.30644819140434265, "learning_rate": 1.0498735994961967e-06, "loss": 0.01090896874666214, "memory(GiB)": 22.66, "step": 24640, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956068 }, { "epoch": 0.8004742877562291, "grad_norm": 0.3846246302127838, "learning_rate": 1.049544307650035e-06, "loss": 0.020343752577900887, "memory(GiB)": 22.66, "step": 24641, "token_acc": 1.0, "train_speed(iter/s)": 0.956075 }, { "epoch": 0.8005067732189846, "grad_norm": 0.3109896779060364, "learning_rate": 1.0492150613973905e-06, "loss": 0.011669140309095383, "memory(GiB)": 22.66, "step": 24642, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.8005392586817399, "grad_norm": 0.3676508665084839, "learning_rate": 1.0488858607420644e-06, "loss": 0.012041732668876648, "memory(GiB)": 22.66, "step": 24643, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.8005717441444954, "grad_norm": 0.46204277873039246, "learning_rate": 1.0485567056878521e-06, "loss": 0.01446034200489521, "memory(GiB)": 22.66, "step": 24644, "token_acc": 1.0, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.8006042296072508, "grad_norm": 0.40694695711135864, "learning_rate": 1.0482275962385557e-06, "loss": 0.012317357584834099, "memory(GiB)": 22.66, "step": 24645, "token_acc": 1.0, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.8006367150700062, "grad_norm": 0.4285961985588074, "learning_rate": 1.0478985323979718e-06, "loss": 0.014365190640091896, "memory(GiB)": 22.66, "step": 24646, "token_acc": 0.9966996699669967, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.8006692005327616, "grad_norm": 0.2908966541290283, "learning_rate": 1.0475695141699011e-06, "loss": 0.012277591042220592, "memory(GiB)": 22.66, "step": 24647, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.800701685995517, "grad_norm": 0.2455594390630722, "learning_rate": 1.0472405415581372e-06, "loss": 0.009105163626372814, "memory(GiB)": 22.66, "step": 24648, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956119 }, { "epoch": 0.8007341714582724, "grad_norm": 0.3108706772327423, "learning_rate": 1.0469116145664798e-06, "loss": 0.015127833932638168, "memory(GiB)": 22.66, "step": 24649, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956125 }, { "epoch": 0.8007666569210279, "grad_norm": 0.359353631734848, "learning_rate": 1.0465827331987216e-06, "loss": 0.008402510546147823, "memory(GiB)": 22.66, "step": 24650, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95613 }, { "epoch": 0.8007991423837832, "grad_norm": 0.34059205651283264, "learning_rate": 1.0462538974586618e-06, "loss": 0.009751223027706146, "memory(GiB)": 22.66, "step": 24651, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956136 }, { "epoch": 0.8008316278465387, "grad_norm": 0.3845600485801697, "learning_rate": 1.0459251073500958e-06, "loss": 0.01741848699748516, "memory(GiB)": 22.66, "step": 24652, "token_acc": 1.0, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.8008641133092941, "grad_norm": 0.3491918444633484, "learning_rate": 1.0455963628768156e-06, "loss": 0.01425669714808464, "memory(GiB)": 22.66, "step": 24653, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.956146 }, { "epoch": 0.8008965987720496, "grad_norm": 0.4211832880973816, "learning_rate": 1.045267664042618e-06, "loss": 0.014817265793681145, "memory(GiB)": 22.66, "step": 24654, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.956151 }, { "epoch": 0.8009290842348049, "grad_norm": 0.3375876843929291, "learning_rate": 1.0449390108512925e-06, "loss": 0.016999460756778717, "memory(GiB)": 22.66, "step": 24655, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956156 }, { "epoch": 0.8009615696975604, "grad_norm": 0.4547085165977478, "learning_rate": 1.0446104033066379e-06, "loss": 0.01222988497465849, "memory(GiB)": 22.66, "step": 24656, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.956162 }, { "epoch": 0.8009940551603157, "grad_norm": 0.333314448595047, "learning_rate": 1.0442818414124424e-06, "loss": 0.012019302695989609, "memory(GiB)": 22.66, "step": 24657, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956167 }, { "epoch": 0.8010265406230712, "grad_norm": 0.318249374628067, "learning_rate": 1.0439533251725009e-06, "loss": 0.012540138326585293, "memory(GiB)": 22.66, "step": 24658, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.8010590260858266, "grad_norm": 0.393520325422287, "learning_rate": 1.043624854590602e-06, "loss": 0.014320960268378258, "memory(GiB)": 22.66, "step": 24659, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.956177 }, { "epoch": 0.801091511548582, "grad_norm": 0.3188652992248535, "learning_rate": 1.0432964296705383e-06, "loss": 0.013802774250507355, "memory(GiB)": 22.66, "step": 24660, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.8011239970113374, "grad_norm": 0.4320796728134155, "learning_rate": 1.0429680504161e-06, "loss": 0.015921583399176598, "memory(GiB)": 22.66, "step": 24661, "token_acc": 1.0, "train_speed(iter/s)": 0.956188 }, { "epoch": 0.8011564824740929, "grad_norm": 0.31441354751586914, "learning_rate": 1.0426397168310787e-06, "loss": 0.012491662055253983, "memory(GiB)": 22.66, "step": 24662, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956194 }, { "epoch": 0.8011889679368482, "grad_norm": 0.2611560821533203, "learning_rate": 1.042311428919261e-06, "loss": 0.007853572256863117, "memory(GiB)": 22.66, "step": 24663, "token_acc": 1.0, "train_speed(iter/s)": 0.956201 }, { "epoch": 0.8012214533996037, "grad_norm": 0.30596187710762024, "learning_rate": 1.041983186684437e-06, "loss": 0.012074138969182968, "memory(GiB)": 22.66, "step": 24664, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956208 }, { "epoch": 0.8012539388623591, "grad_norm": 0.35166624188423157, "learning_rate": 1.0416549901303951e-06, "loss": 0.01342783123254776, "memory(GiB)": 22.66, "step": 24665, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.8012864243251145, "grad_norm": 0.48592087626457214, "learning_rate": 1.0413268392609234e-06, "loss": 0.01614135503768921, "memory(GiB)": 22.66, "step": 24666, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.8013189097878699, "grad_norm": 0.3841615319252014, "learning_rate": 1.04099873407981e-06, "loss": 0.014412703923881054, "memory(GiB)": 22.66, "step": 24667, "token_acc": 1.0, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.8013513952506254, "grad_norm": 0.45699238777160645, "learning_rate": 1.0406706745908395e-06, "loss": 0.02052249014377594, "memory(GiB)": 22.66, "step": 24668, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.8013838807133807, "grad_norm": 0.39114996790885925, "learning_rate": 1.0403426607977996e-06, "loss": 0.01426499243825674, "memory(GiB)": 22.66, "step": 24669, "token_acc": 1.0, "train_speed(iter/s)": 0.956251 }, { "epoch": 0.8014163661761362, "grad_norm": 0.360029399394989, "learning_rate": 1.0400146927044751e-06, "loss": 0.013509085401892662, "memory(GiB)": 22.66, "step": 24670, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.956259 }, { "epoch": 0.8014488516388916, "grad_norm": 0.3367632031440735, "learning_rate": 1.0396867703146529e-06, "loss": 0.011426540091633797, "memory(GiB)": 22.66, "step": 24671, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.801481337101647, "grad_norm": 0.4490194022655487, "learning_rate": 1.039358893632116e-06, "loss": 0.016437385231256485, "memory(GiB)": 22.66, "step": 24672, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.8015138225644024, "grad_norm": 0.31523120403289795, "learning_rate": 1.0390310626606486e-06, "loss": 0.013456275686621666, "memory(GiB)": 22.66, "step": 24673, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956285 }, { "epoch": 0.8015463080271579, "grad_norm": 0.42804908752441406, "learning_rate": 1.0387032774040345e-06, "loss": 0.010191193781793118, "memory(GiB)": 22.66, "step": 24674, "token_acc": 1.0, "train_speed(iter/s)": 0.956293 }, { "epoch": 0.8015787934899132, "grad_norm": 0.2664320170879364, "learning_rate": 1.0383755378660588e-06, "loss": 0.007309765554964542, "memory(GiB)": 22.66, "step": 24675, "token_acc": 1.0, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.8016112789526687, "grad_norm": 0.38267287611961365, "learning_rate": 1.038047844050501e-06, "loss": 0.013420971110463142, "memory(GiB)": 22.66, "step": 24676, "token_acc": 1.0, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.8016437644154241, "grad_norm": 0.549416720867157, "learning_rate": 1.0377201959611456e-06, "loss": 0.01755107380449772, "memory(GiB)": 22.66, "step": 24677, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.8016762498781795, "grad_norm": 0.4235733449459076, "learning_rate": 1.0373925936017705e-06, "loss": 0.016621902585029602, "memory(GiB)": 22.66, "step": 24678, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956328 }, { "epoch": 0.8017087353409349, "grad_norm": 0.3434751629829407, "learning_rate": 1.0370650369761614e-06, "loss": 0.016582245007157326, "memory(GiB)": 22.66, "step": 24679, "token_acc": 1.0, "train_speed(iter/s)": 0.956336 }, { "epoch": 0.8017412208036904, "grad_norm": 0.3096446692943573, "learning_rate": 1.0367375260880952e-06, "loss": 0.014399598352611065, "memory(GiB)": 22.66, "step": 24680, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.8017737062664457, "grad_norm": 0.2664261758327484, "learning_rate": 1.0364100609413547e-06, "loss": 0.009815812110900879, "memory(GiB)": 22.66, "step": 24681, "token_acc": 1.0, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.8018061917292012, "grad_norm": 0.28434181213378906, "learning_rate": 1.0360826415397158e-06, "loss": 0.013571223244071007, "memory(GiB)": 22.66, "step": 24682, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956362 }, { "epoch": 0.8018386771919566, "grad_norm": 0.4407869875431061, "learning_rate": 1.0357552678869582e-06, "loss": 0.01748643070459366, "memory(GiB)": 22.66, "step": 24683, "token_acc": 0.996, "train_speed(iter/s)": 0.95637 }, { "epoch": 0.801871162654712, "grad_norm": 0.3830533027648926, "learning_rate": 1.0354279399868638e-06, "loss": 0.01584523357450962, "memory(GiB)": 22.66, "step": 24684, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.8019036481174674, "grad_norm": 0.3688548803329468, "learning_rate": 1.035100657843206e-06, "loss": 0.014072314836084843, "memory(GiB)": 22.66, "step": 24685, "token_acc": 0.9828767123287672, "train_speed(iter/s)": 0.956388 }, { "epoch": 0.8019361335802229, "grad_norm": 0.25233280658721924, "learning_rate": 1.034773421459766e-06, "loss": 0.007542959414422512, "memory(GiB)": 22.66, "step": 24686, "token_acc": 1.0, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.8019686190429782, "grad_norm": 0.378993421792984, "learning_rate": 1.0344462308403164e-06, "loss": 0.011375656351447105, "memory(GiB)": 22.66, "step": 24687, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956405 }, { "epoch": 0.8020011045057337, "grad_norm": 0.42687657475471497, "learning_rate": 1.0341190859886357e-06, "loss": 0.013269949704408646, "memory(GiB)": 22.66, "step": 24688, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956413 }, { "epoch": 0.8020335899684891, "grad_norm": 0.35277092456817627, "learning_rate": 1.0337919869084995e-06, "loss": 0.010669363662600517, "memory(GiB)": 22.66, "step": 24689, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.8020660754312445, "grad_norm": 0.40029221773147583, "learning_rate": 1.0334649336036839e-06, "loss": 0.012950694188475609, "memory(GiB)": 22.66, "step": 24690, "token_acc": 1.0, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.8020985608939999, "grad_norm": 0.39714592695236206, "learning_rate": 1.0331379260779612e-06, "loss": 0.009404166601598263, "memory(GiB)": 22.66, "step": 24691, "token_acc": 1.0, "train_speed(iter/s)": 0.956438 }, { "epoch": 0.8021310463567554, "grad_norm": 0.332856148481369, "learning_rate": 1.032810964335107e-06, "loss": 0.013096440583467484, "memory(GiB)": 22.66, "step": 24692, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.8021635318195107, "grad_norm": 0.4073954224586487, "learning_rate": 1.032484048378894e-06, "loss": 0.01616932451725006, "memory(GiB)": 22.66, "step": 24693, "token_acc": 1.0, "train_speed(iter/s)": 0.956454 }, { "epoch": 0.8021960172822662, "grad_norm": 0.38068246841430664, "learning_rate": 1.0321571782130973e-06, "loss": 0.016802236437797546, "memory(GiB)": 22.66, "step": 24694, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.8022285027450216, "grad_norm": 0.35082361102104187, "learning_rate": 1.031830353841487e-06, "loss": 0.01331282127648592, "memory(GiB)": 22.66, "step": 24695, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.802260988207777, "grad_norm": 0.3206033706665039, "learning_rate": 1.031503575267836e-06, "loss": 0.012983076274394989, "memory(GiB)": 22.66, "step": 24696, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.8022934736705324, "grad_norm": 0.30328628420829773, "learning_rate": 1.0311768424959152e-06, "loss": 0.012058401480317116, "memory(GiB)": 22.66, "step": 24697, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.8023259591332879, "grad_norm": 0.3783702850341797, "learning_rate": 1.0308501555294965e-06, "loss": 0.0141346650198102, "memory(GiB)": 22.66, "step": 24698, "token_acc": 1.0, "train_speed(iter/s)": 0.956497 }, { "epoch": 0.8023584445960432, "grad_norm": 0.40431663393974304, "learning_rate": 1.0305235143723513e-06, "loss": 0.011955762282013893, "memory(GiB)": 22.66, "step": 24699, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.8023909300587987, "grad_norm": 0.7858348488807678, "learning_rate": 1.030196919028247e-06, "loss": 0.017799926921725273, "memory(GiB)": 22.66, "step": 24700, "token_acc": 0.992, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.802423415521554, "grad_norm": 0.2635141611099243, "learning_rate": 1.0298703695009538e-06, "loss": 0.007882818579673767, "memory(GiB)": 22.66, "step": 24701, "token_acc": 1.0, "train_speed(iter/s)": 0.956522 }, { "epoch": 0.8024559009843095, "grad_norm": 0.2888326346874237, "learning_rate": 1.0295438657942408e-06, "loss": 0.009860443882644176, "memory(GiB)": 22.66, "step": 24702, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.8024883864470649, "grad_norm": 0.3060239553451538, "learning_rate": 1.029217407911877e-06, "loss": 0.016434453427791595, "memory(GiB)": 22.66, "step": 24703, "token_acc": 1.0, "train_speed(iter/s)": 0.956536 }, { "epoch": 0.8025208719098204, "grad_norm": 0.2885158658027649, "learning_rate": 1.0288909958576288e-06, "loss": 0.010342301800847054, "memory(GiB)": 22.66, "step": 24704, "token_acc": 1.0, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.8025533573725758, "grad_norm": 0.24762821197509766, "learning_rate": 1.0285646296352653e-06, "loss": 0.010847504250705242, "memory(GiB)": 22.66, "step": 24705, "token_acc": 1.0, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.8025858428353312, "grad_norm": 0.26836228370666504, "learning_rate": 1.0282383092485492e-06, "loss": 0.008402165025472641, "memory(GiB)": 22.66, "step": 24706, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.8026183282980867, "grad_norm": 0.44306665658950806, "learning_rate": 1.0279120347012522e-06, "loss": 0.01782170683145523, "memory(GiB)": 22.66, "step": 24707, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956562 }, { "epoch": 0.802650813760842, "grad_norm": 0.42358872294425964, "learning_rate": 1.0275858059971356e-06, "loss": 0.015634365379810333, "memory(GiB)": 22.66, "step": 24708, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.8026832992235975, "grad_norm": 0.3358098268508911, "learning_rate": 1.0272596231399674e-06, "loss": 0.011705264449119568, "memory(GiB)": 22.66, "step": 24709, "token_acc": 1.0, "train_speed(iter/s)": 0.956573 }, { "epoch": 0.8027157846863529, "grad_norm": 0.21688959002494812, "learning_rate": 1.0269334861335095e-06, "loss": 0.011467981152236462, "memory(GiB)": 22.66, "step": 24710, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956579 }, { "epoch": 0.8027482701491083, "grad_norm": 0.3269689381122589, "learning_rate": 1.0266073949815276e-06, "loss": 0.008799068629741669, "memory(GiB)": 22.66, "step": 24711, "token_acc": 1.0, "train_speed(iter/s)": 0.956584 }, { "epoch": 0.8027807556118637, "grad_norm": 0.336131751537323, "learning_rate": 1.0262813496877848e-06, "loss": 0.007803416345268488, "memory(GiB)": 22.66, "step": 24712, "token_acc": 1.0, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.8028132410746192, "grad_norm": 0.31217077374458313, "learning_rate": 1.0259553502560443e-06, "loss": 0.01365350466221571, "memory(GiB)": 22.66, "step": 24713, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.8028457265373745, "grad_norm": 0.3962884843349457, "learning_rate": 1.0256293966900698e-06, "loss": 0.01623496413230896, "memory(GiB)": 22.66, "step": 24714, "token_acc": 1.0, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.80287821200013, "grad_norm": 0.5126997828483582, "learning_rate": 1.0253034889936204e-06, "loss": 0.013180559501051903, "memory(GiB)": 22.66, "step": 24715, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.8029106974628853, "grad_norm": 0.3797551691532135, "learning_rate": 1.0249776271704592e-06, "loss": 0.01269933395087719, "memory(GiB)": 22.66, "step": 24716, "token_acc": 1.0, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.8029431829256408, "grad_norm": 0.26625216007232666, "learning_rate": 1.024651811224347e-06, "loss": 0.007798037491738796, "memory(GiB)": 22.66, "step": 24717, "token_acc": 1.0, "train_speed(iter/s)": 0.956618 }, { "epoch": 0.8029756683883962, "grad_norm": 0.2602511942386627, "learning_rate": 1.024326041159045e-06, "loss": 0.01091521792113781, "memory(GiB)": 22.66, "step": 24718, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.8030081538511517, "grad_norm": 0.3874458372592926, "learning_rate": 1.0240003169783113e-06, "loss": 0.015141678974032402, "memory(GiB)": 22.66, "step": 24719, "token_acc": 1.0, "train_speed(iter/s)": 0.956629 }, { "epoch": 0.803040639313907, "grad_norm": 0.42545783519744873, "learning_rate": 1.0236746386859058e-06, "loss": 0.014379063621163368, "memory(GiB)": 22.66, "step": 24720, "token_acc": 1.0, "train_speed(iter/s)": 0.956634 }, { "epoch": 0.8030731247766625, "grad_norm": 0.3199015259742737, "learning_rate": 1.0233490062855872e-06, "loss": 0.015675030648708344, "memory(GiB)": 22.66, "step": 24721, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956639 }, { "epoch": 0.8031056102394178, "grad_norm": 0.3567642271518707, "learning_rate": 1.0230234197811157e-06, "loss": 0.017194021493196487, "memory(GiB)": 22.66, "step": 24722, "token_acc": 1.0, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.8031380957021733, "grad_norm": 0.45526251196861267, "learning_rate": 1.0226978791762453e-06, "loss": 0.018603283911943436, "memory(GiB)": 22.66, "step": 24723, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95665 }, { "epoch": 0.8031705811649287, "grad_norm": 0.42580273747444153, "learning_rate": 1.0223723844747358e-06, "loss": 0.014273963868618011, "memory(GiB)": 22.66, "step": 24724, "token_acc": 0.995, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.8032030666276841, "grad_norm": 0.3574639856815338, "learning_rate": 1.022046935680343e-06, "loss": 0.015175758861005306, "memory(GiB)": 22.66, "step": 24725, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.8032355520904395, "grad_norm": 0.2935086786746979, "learning_rate": 1.0217215327968245e-06, "loss": 0.011697125621140003, "memory(GiB)": 22.66, "step": 24726, "token_acc": 1.0, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.803268037553195, "grad_norm": 0.33453109860420227, "learning_rate": 1.0213961758279333e-06, "loss": 0.011108524166047573, "memory(GiB)": 22.66, "step": 24727, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.8033005230159503, "grad_norm": 0.3176385164260864, "learning_rate": 1.021070864777426e-06, "loss": 0.010963074862957, "memory(GiB)": 22.66, "step": 24728, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956686 }, { "epoch": 0.8033330084787058, "grad_norm": 0.40395587682724, "learning_rate": 1.0207455996490567e-06, "loss": 0.014590740203857422, "memory(GiB)": 22.66, "step": 24729, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.8033654939414612, "grad_norm": 0.2857086956501007, "learning_rate": 1.0204203804465789e-06, "loss": 0.008896172046661377, "memory(GiB)": 22.66, "step": 24730, "token_acc": 1.0, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.8033979794042166, "grad_norm": 0.3042057752609253, "learning_rate": 1.0200952071737492e-06, "loss": 0.01640334725379944, "memory(GiB)": 22.66, "step": 24731, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95671 }, { "epoch": 0.803430464866972, "grad_norm": 0.38520485162734985, "learning_rate": 1.0197700798343158e-06, "loss": 0.014425056986510754, "memory(GiB)": 22.66, "step": 24732, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956719 }, { "epoch": 0.8034629503297275, "grad_norm": 0.42296046018600464, "learning_rate": 1.0194449984320348e-06, "loss": 0.014988952316343784, "memory(GiB)": 22.66, "step": 24733, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.8034954357924828, "grad_norm": 0.2657588720321655, "learning_rate": 1.0191199629706543e-06, "loss": 0.009411325678229332, "memory(GiB)": 22.66, "step": 24734, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956736 }, { "epoch": 0.8035279212552383, "grad_norm": 0.3724311292171478, "learning_rate": 1.0187949734539304e-06, "loss": 0.01923418417572975, "memory(GiB)": 22.66, "step": 24735, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.8035604067179937, "grad_norm": 0.2740197777748108, "learning_rate": 1.0184700298856103e-06, "loss": 0.010245313867926598, "memory(GiB)": 22.66, "step": 24736, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.8035928921807491, "grad_norm": 0.32773086428642273, "learning_rate": 1.018145132269447e-06, "loss": 0.0124078169465065, "memory(GiB)": 22.66, "step": 24737, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.8036253776435045, "grad_norm": 0.35773882269859314, "learning_rate": 1.0178202806091868e-06, "loss": 0.014590730890631676, "memory(GiB)": 22.66, "step": 24738, "token_acc": 1.0, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.80365786310626, "grad_norm": 0.3778424859046936, "learning_rate": 1.0174954749085814e-06, "loss": 0.01610850915312767, "memory(GiB)": 22.66, "step": 24739, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.8036903485690153, "grad_norm": 0.31913992762565613, "learning_rate": 1.0171707151713788e-06, "loss": 0.010958406142890453, "memory(GiB)": 22.66, "step": 24740, "token_acc": 1.0, "train_speed(iter/s)": 0.956788 }, { "epoch": 0.8037228340317708, "grad_norm": 0.28128674626350403, "learning_rate": 1.016846001401328e-06, "loss": 0.017364678904414177, "memory(GiB)": 22.66, "step": 24741, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956796 }, { "epoch": 0.8037553194945262, "grad_norm": 0.3200830817222595, "learning_rate": 1.0165213336021755e-06, "loss": 0.010080627165734768, "memory(GiB)": 22.66, "step": 24742, "token_acc": 1.0, "train_speed(iter/s)": 0.956804 }, { "epoch": 0.8037878049572816, "grad_norm": 0.32357195019721985, "learning_rate": 1.0161967117776683e-06, "loss": 0.01177113689482212, "memory(GiB)": 22.66, "step": 24743, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.803820290420037, "grad_norm": 0.2808304727077484, "learning_rate": 1.0158721359315538e-06, "loss": 0.014656780287623405, "memory(GiB)": 22.66, "step": 24744, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956822 }, { "epoch": 0.8038527758827925, "grad_norm": 0.3453066647052765, "learning_rate": 1.0155476060675773e-06, "loss": 0.012941134162247181, "memory(GiB)": 22.66, "step": 24745, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.8038852613455478, "grad_norm": 0.29064062237739563, "learning_rate": 1.015223122189486e-06, "loss": 0.011861942708492279, "memory(GiB)": 22.66, "step": 24746, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956838 }, { "epoch": 0.8039177468083033, "grad_norm": 0.3727914094924927, "learning_rate": 1.0148986843010227e-06, "loss": 0.008886134251952171, "memory(GiB)": 22.66, "step": 24747, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.8039502322710587, "grad_norm": 0.44490736722946167, "learning_rate": 1.0145742924059326e-06, "loss": 0.018894031643867493, "memory(GiB)": 22.66, "step": 24748, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956855 }, { "epoch": 0.8039827177338141, "grad_norm": 0.33988040685653687, "learning_rate": 1.0142499465079598e-06, "loss": 0.012495416216552258, "memory(GiB)": 22.66, "step": 24749, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.8040152031965695, "grad_norm": 0.42000535130500793, "learning_rate": 1.0139256466108482e-06, "loss": 0.01652936078608036, "memory(GiB)": 22.66, "step": 24750, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.804047688659325, "grad_norm": 0.4113904535770416, "learning_rate": 1.0136013927183397e-06, "loss": 0.005884448066353798, "memory(GiB)": 22.66, "step": 24751, "token_acc": 1.0, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.8040801741220803, "grad_norm": 0.2818301320075989, "learning_rate": 1.0132771848341766e-06, "loss": 0.012461373582482338, "memory(GiB)": 22.66, "step": 24752, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.8041126595848358, "grad_norm": 0.3750794529914856, "learning_rate": 1.0129530229621015e-06, "loss": 0.011443065479397774, "memory(GiB)": 22.66, "step": 24753, "token_acc": 1.0, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.8041451450475912, "grad_norm": 0.41100355982780457, "learning_rate": 1.012628907105856e-06, "loss": 0.011360508389770985, "memory(GiB)": 22.66, "step": 24754, "token_acc": 1.0, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.8041776305103466, "grad_norm": 0.48374396562576294, "learning_rate": 1.0123048372691791e-06, "loss": 0.017063740640878677, "memory(GiB)": 22.66, "step": 24755, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.804210115973102, "grad_norm": 0.36379435658454895, "learning_rate": 1.0119808134558134e-06, "loss": 0.01679823361337185, "memory(GiB)": 22.66, "step": 24756, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.8042426014358575, "grad_norm": 0.36215952038764954, "learning_rate": 1.0116568356694944e-06, "loss": 0.013676177710294724, "memory(GiB)": 22.66, "step": 24757, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956927 }, { "epoch": 0.8042750868986128, "grad_norm": 0.3299582600593567, "learning_rate": 1.0113329039139663e-06, "loss": 0.008048884570598602, "memory(GiB)": 22.66, "step": 24758, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.8043075723613683, "grad_norm": 0.3166240155696869, "learning_rate": 1.011009018192965e-06, "loss": 0.007051500026136637, "memory(GiB)": 22.66, "step": 24759, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.956944 }, { "epoch": 0.8043400578241237, "grad_norm": 0.2684798240661621, "learning_rate": 1.0106851785102278e-06, "loss": 0.00653950497508049, "memory(GiB)": 22.66, "step": 24760, "token_acc": 1.0, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.8043725432868791, "grad_norm": 0.2712266147136688, "learning_rate": 1.0103613848694954e-06, "loss": 0.014067070558667183, "memory(GiB)": 22.66, "step": 24761, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956962 }, { "epoch": 0.8044050287496345, "grad_norm": 0.26483428478240967, "learning_rate": 1.0100376372745002e-06, "loss": 0.00905003771185875, "memory(GiB)": 22.66, "step": 24762, "token_acc": 1.0, "train_speed(iter/s)": 0.956969 }, { "epoch": 0.80443751421239, "grad_norm": 0.363879531621933, "learning_rate": 1.0097139357289838e-06, "loss": 0.013147087767720222, "memory(GiB)": 22.66, "step": 24763, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.8044699996751453, "grad_norm": 0.29354366660118103, "learning_rate": 1.0093902802366785e-06, "loss": 0.008243405260145664, "memory(GiB)": 22.66, "step": 24764, "token_acc": 1.0, "train_speed(iter/s)": 0.956985 }, { "epoch": 0.8045024851379008, "grad_norm": 0.36457282304763794, "learning_rate": 1.009066670801322e-06, "loss": 0.01374169159680605, "memory(GiB)": 22.66, "step": 24765, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.8045349706006562, "grad_norm": 0.3155851662158966, "learning_rate": 1.0087431074266462e-06, "loss": 0.010386407375335693, "memory(GiB)": 22.66, "step": 24766, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956998 }, { "epoch": 0.8045674560634116, "grad_norm": 0.27675047516822815, "learning_rate": 1.008419590116388e-06, "loss": 0.012731127440929413, "memory(GiB)": 22.66, "step": 24767, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957004 }, { "epoch": 0.8045999415261671, "grad_norm": 0.40651825070381165, "learning_rate": 1.0080961188742805e-06, "loss": 0.01654394343495369, "memory(GiB)": 22.66, "step": 24768, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.8046324269889225, "grad_norm": 0.3759923577308655, "learning_rate": 1.0077726937040583e-06, "loss": 0.01412174478173256, "memory(GiB)": 22.66, "step": 24769, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.8046649124516779, "grad_norm": 0.41315141320228577, "learning_rate": 1.007449314609451e-06, "loss": 0.013863053172826767, "memory(GiB)": 22.66, "step": 24770, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.8046973979144333, "grad_norm": 0.29744628071784973, "learning_rate": 1.0071259815941932e-06, "loss": 0.013515623286366463, "memory(GiB)": 22.66, "step": 24771, "token_acc": 1.0, "train_speed(iter/s)": 0.957025 }, { "epoch": 0.8047298833771888, "grad_norm": 0.4029378592967987, "learning_rate": 1.0068026946620158e-06, "loss": 0.010977741330862045, "memory(GiB)": 22.66, "step": 24772, "token_acc": 1.0, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.8047623688399441, "grad_norm": 0.3493654727935791, "learning_rate": 1.0064794538166517e-06, "loss": 0.011485218070447445, "memory(GiB)": 22.66, "step": 24773, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.8047948543026996, "grad_norm": 0.34985288977622986, "learning_rate": 1.006156259061829e-06, "loss": 0.01263938657939434, "memory(GiB)": 22.66, "step": 24774, "token_acc": 1.0, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.804827339765455, "grad_norm": 0.4429846405982971, "learning_rate": 1.0058331104012787e-06, "loss": 0.011682532727718353, "memory(GiB)": 22.66, "step": 24775, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.8048598252282104, "grad_norm": 0.48005855083465576, "learning_rate": 1.0055100078387303e-06, "loss": 0.011757783591747284, "memory(GiB)": 22.66, "step": 24776, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957053 }, { "epoch": 0.8048923106909658, "grad_norm": 0.2844252586364746, "learning_rate": 1.0051869513779134e-06, "loss": 0.013336127623915672, "memory(GiB)": 22.66, "step": 24777, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.8049247961537213, "grad_norm": 0.3496011197566986, "learning_rate": 1.004863941022557e-06, "loss": 0.009876085445284843, "memory(GiB)": 22.66, "step": 24778, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.8049572816164766, "grad_norm": 0.3890765309333801, "learning_rate": 1.0045409767763869e-06, "loss": 0.016012042760849, "memory(GiB)": 22.66, "step": 24779, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957069 }, { "epoch": 0.8049897670792321, "grad_norm": 0.44248396158218384, "learning_rate": 1.0042180586431321e-06, "loss": 0.018418045714497566, "memory(GiB)": 22.66, "step": 24780, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.8050222525419874, "grad_norm": 0.3517795205116272, "learning_rate": 1.0038951866265195e-06, "loss": 0.008036622777581215, "memory(GiB)": 22.66, "step": 24781, "token_acc": 1.0, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.8050547380047429, "grad_norm": 0.23190255463123322, "learning_rate": 1.0035723607302766e-06, "loss": 0.00803943071514368, "memory(GiB)": 22.66, "step": 24782, "token_acc": 1.0, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.8050872234674983, "grad_norm": 0.2855995297431946, "learning_rate": 1.003249580958126e-06, "loss": 0.007065685465931892, "memory(GiB)": 22.66, "step": 24783, "token_acc": 1.0, "train_speed(iter/s)": 0.95709 }, { "epoch": 0.8051197089302538, "grad_norm": 0.3651939928531647, "learning_rate": 1.0029268473137972e-06, "loss": 0.01775549165904522, "memory(GiB)": 22.66, "step": 24784, "token_acc": 1.0, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.8051521943930091, "grad_norm": 0.3996383249759674, "learning_rate": 1.0026041598010095e-06, "loss": 0.019319215789437294, "memory(GiB)": 22.66, "step": 24785, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.8051846798557646, "grad_norm": 0.3885642886161804, "learning_rate": 1.0022815184234923e-06, "loss": 0.010084228590130806, "memory(GiB)": 22.66, "step": 24786, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.80521716531852, "grad_norm": 0.219168022274971, "learning_rate": 1.0019589231849669e-06, "loss": 0.011962233111262321, "memory(GiB)": 22.66, "step": 24787, "token_acc": 0.9826989619377162, "train_speed(iter/s)": 0.957116 }, { "epoch": 0.8052496507812754, "grad_norm": 0.3881450593471527, "learning_rate": 1.0016363740891577e-06, "loss": 0.015063930302858353, "memory(GiB)": 22.66, "step": 24788, "token_acc": 1.0, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.8052821362440308, "grad_norm": 0.21420423686504364, "learning_rate": 1.0013138711397857e-06, "loss": 0.0060660275630652905, "memory(GiB)": 22.66, "step": 24789, "token_acc": 1.0, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.8053146217067862, "grad_norm": 0.5185297727584839, "learning_rate": 1.0009914143405725e-06, "loss": 0.016185497865080833, "memory(GiB)": 22.66, "step": 24790, "token_acc": 0.98828125, "train_speed(iter/s)": 0.957142 }, { "epoch": 0.8053471071695416, "grad_norm": 0.3070037066936493, "learning_rate": 1.0006690036952433e-06, "loss": 0.012615412473678589, "memory(GiB)": 22.66, "step": 24791, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.8053795926322971, "grad_norm": 0.2563425302505493, "learning_rate": 1.0003466392075155e-06, "loss": 0.011456269770860672, "memory(GiB)": 22.66, "step": 24792, "token_acc": 1.0, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.8054120780950524, "grad_norm": 0.3977983593940735, "learning_rate": 1.0000243208811128e-06, "loss": 0.012402962893247604, "memory(GiB)": 22.66, "step": 24793, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.8054445635578079, "grad_norm": 0.3093717098236084, "learning_rate": 9.997020487197518e-07, "loss": 0.006906486116349697, "memory(GiB)": 22.66, "step": 24794, "token_acc": 1.0, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.8054770490205633, "grad_norm": 0.2677922248840332, "learning_rate": 9.99379822727154e-07, "loss": 0.01355617493391037, "memory(GiB)": 22.66, "step": 24795, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957185 }, { "epoch": 0.8055095344833187, "grad_norm": 0.23636166751384735, "learning_rate": 9.99057642907037e-07, "loss": 0.009666524827480316, "memory(GiB)": 22.66, "step": 24796, "token_acc": 1.0, "train_speed(iter/s)": 0.957194 }, { "epoch": 0.8055420199460741, "grad_norm": 0.7160748839378357, "learning_rate": 9.987355092631217e-07, "loss": 0.01803450658917427, "memory(GiB)": 22.66, "step": 24797, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.8055745054088296, "grad_norm": 0.5054014921188354, "learning_rate": 9.984134217991236e-07, "loss": 0.012435182929039001, "memory(GiB)": 22.66, "step": 24798, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.8056069908715849, "grad_norm": 0.426882803440094, "learning_rate": 9.980913805187603e-07, "loss": 0.014893172308802605, "memory(GiB)": 22.66, "step": 24799, "token_acc": 0.9847715736040609, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.8056394763343404, "grad_norm": 0.256921648979187, "learning_rate": 9.977693854257486e-07, "loss": 0.013557345606386662, "memory(GiB)": 22.66, "step": 24800, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957227 }, { "epoch": 0.8056719617970958, "grad_norm": 0.3506118357181549, "learning_rate": 9.974474365238068e-07, "loss": 0.01322510838508606, "memory(GiB)": 22.66, "step": 24801, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957234 }, { "epoch": 0.8057044472598512, "grad_norm": 0.2918873429298401, "learning_rate": 9.971255338166475e-07, "loss": 0.013185406103730202, "memory(GiB)": 22.66, "step": 24802, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.8057369327226066, "grad_norm": 0.34416550397872925, "learning_rate": 9.968036773079876e-07, "loss": 0.016100486740469933, "memory(GiB)": 22.66, "step": 24803, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.8057694181853621, "grad_norm": 0.34506359696388245, "learning_rate": 9.964818670015413e-07, "loss": 0.021129019558429718, "memory(GiB)": 22.66, "step": 24804, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.8058019036481174, "grad_norm": 0.3066966235637665, "learning_rate": 9.961601029010232e-07, "loss": 0.011878706514835358, "memory(GiB)": 22.66, "step": 24805, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957267 }, { "epoch": 0.8058343891108729, "grad_norm": 0.3568904995918274, "learning_rate": 9.958383850101477e-07, "loss": 0.008860484696924686, "memory(GiB)": 22.66, "step": 24806, "token_acc": 1.0, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.8058668745736283, "grad_norm": 0.5722857713699341, "learning_rate": 9.955167133326254e-07, "loss": 0.01553634088486433, "memory(GiB)": 22.66, "step": 24807, "token_acc": 0.988, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.8058993600363837, "grad_norm": 0.4391622841358185, "learning_rate": 9.951950878721706e-07, "loss": 0.01131952740252018, "memory(GiB)": 22.66, "step": 24808, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.8059318454991391, "grad_norm": 0.19877922534942627, "learning_rate": 9.948735086324946e-07, "loss": 0.007132749073207378, "memory(GiB)": 22.66, "step": 24809, "token_acc": 1.0, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.8059643309618946, "grad_norm": 0.36787736415863037, "learning_rate": 9.9455197561731e-07, "loss": 0.01564178057014942, "memory(GiB)": 22.66, "step": 24810, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.8059968164246499, "grad_norm": 0.34560495615005493, "learning_rate": 9.94230488830326e-07, "loss": 0.013168573379516602, "memory(GiB)": 22.66, "step": 24811, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957319 }, { "epoch": 0.8060293018874054, "grad_norm": 0.40628910064697266, "learning_rate": 9.939090482752551e-07, "loss": 0.018941426649689674, "memory(GiB)": 22.66, "step": 24812, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957327 }, { "epoch": 0.8060617873501608, "grad_norm": 0.3470725119113922, "learning_rate": 9.93587653955803e-07, "loss": 0.011759823188185692, "memory(GiB)": 22.66, "step": 24813, "token_acc": 1.0, "train_speed(iter/s)": 0.957336 }, { "epoch": 0.8060942728129162, "grad_norm": 0.2190103530883789, "learning_rate": 9.932663058756848e-07, "loss": 0.008065924048423767, "memory(GiB)": 22.66, "step": 24814, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.8061267582756716, "grad_norm": 0.45620059967041016, "learning_rate": 9.929450040386053e-07, "loss": 0.017468778416514397, "memory(GiB)": 22.66, "step": 24815, "token_acc": 1.0, "train_speed(iter/s)": 0.957352 }, { "epoch": 0.8061592437384271, "grad_norm": 0.3079918920993805, "learning_rate": 9.926237484482748e-07, "loss": 0.010891066864132881, "memory(GiB)": 22.66, "step": 24816, "token_acc": 1.0, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.8061917292011824, "grad_norm": 0.3731478154659271, "learning_rate": 9.923025391083974e-07, "loss": 0.010961784049868584, "memory(GiB)": 22.66, "step": 24817, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957369 }, { "epoch": 0.8062242146639379, "grad_norm": 0.330410361289978, "learning_rate": 9.919813760226854e-07, "loss": 0.012325793504714966, "memory(GiB)": 22.66, "step": 24818, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957377 }, { "epoch": 0.8062567001266933, "grad_norm": 0.3131643831729889, "learning_rate": 9.91660259194842e-07, "loss": 0.01206664927303791, "memory(GiB)": 22.66, "step": 24819, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.8062891855894487, "grad_norm": 0.2795941233634949, "learning_rate": 9.913391886285755e-07, "loss": 0.010213838890194893, "memory(GiB)": 22.66, "step": 24820, "token_acc": 1.0, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.8063216710522041, "grad_norm": 0.427288681268692, "learning_rate": 9.91018164327589e-07, "loss": 0.01076046098023653, "memory(GiB)": 22.66, "step": 24821, "token_acc": 1.0, "train_speed(iter/s)": 0.957403 }, { "epoch": 0.8063541565149596, "grad_norm": 0.33182424306869507, "learning_rate": 9.906971862955877e-07, "loss": 0.014271445572376251, "memory(GiB)": 22.66, "step": 24822, "token_acc": 1.0, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.8063866419777149, "grad_norm": 0.34236499667167664, "learning_rate": 9.903762545362793e-07, "loss": 0.011445915326476097, "memory(GiB)": 22.66, "step": 24823, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95742 }, { "epoch": 0.8064191274404704, "grad_norm": 0.3311457633972168, "learning_rate": 9.900553690533648e-07, "loss": 0.012760663405060768, "memory(GiB)": 22.66, "step": 24824, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.8064516129032258, "grad_norm": 0.2735287547111511, "learning_rate": 9.8973452985055e-07, "loss": 0.008376913145184517, "memory(GiB)": 22.66, "step": 24825, "token_acc": 1.0, "train_speed(iter/s)": 0.957438 }, { "epoch": 0.8064840983659812, "grad_norm": 0.3857104778289795, "learning_rate": 9.89413736931535e-07, "loss": 0.01313911285251379, "memory(GiB)": 22.66, "step": 24826, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.8065165838287366, "grad_norm": 0.3737737238407135, "learning_rate": 9.890929903000235e-07, "loss": 0.010358954779803753, "memory(GiB)": 22.66, "step": 24827, "token_acc": 1.0, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.8065490692914921, "grad_norm": 0.39792904257774353, "learning_rate": 9.887722899597175e-07, "loss": 0.013441845774650574, "memory(GiB)": 22.66, "step": 24828, "token_acc": 1.0, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.8065815547542474, "grad_norm": 0.45068636536598206, "learning_rate": 9.884516359143193e-07, "loss": 0.02095281146466732, "memory(GiB)": 22.66, "step": 24829, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957465 }, { "epoch": 0.8066140402170029, "grad_norm": 0.2559414505958557, "learning_rate": 9.881310281675273e-07, "loss": 0.007603625766932964, "memory(GiB)": 22.66, "step": 24830, "token_acc": 1.0, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.8066465256797583, "grad_norm": 0.4150327146053314, "learning_rate": 9.878104667230425e-07, "loss": 0.00852753035724163, "memory(GiB)": 22.66, "step": 24831, "token_acc": 1.0, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.8066790111425137, "grad_norm": 0.4901510775089264, "learning_rate": 9.87489951584566e-07, "loss": 0.012516144663095474, "memory(GiB)": 22.66, "step": 24832, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.8067114966052692, "grad_norm": 0.2909924387931824, "learning_rate": 9.87169482755797e-07, "loss": 0.009963501244783401, "memory(GiB)": 22.66, "step": 24833, "token_acc": 1.0, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.8067439820680246, "grad_norm": 0.3579276502132416, "learning_rate": 9.86849060240432e-07, "loss": 0.009336547926068306, "memory(GiB)": 22.66, "step": 24834, "token_acc": 1.0, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.80677646753078, "grad_norm": 0.33094748854637146, "learning_rate": 9.865286840421707e-07, "loss": 0.010354414582252502, "memory(GiB)": 22.66, "step": 24835, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957499 }, { "epoch": 0.8068089529935354, "grad_norm": 0.22715698182582855, "learning_rate": 9.862083541647094e-07, "loss": 0.00948982685804367, "memory(GiB)": 22.66, "step": 24836, "token_acc": 1.0, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.8068414384562909, "grad_norm": 0.2857367992401123, "learning_rate": 9.858880706117468e-07, "loss": 0.013151104561984539, "memory(GiB)": 22.66, "step": 24837, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95751 }, { "epoch": 0.8068739239190462, "grad_norm": 0.39135003089904785, "learning_rate": 9.8556783338698e-07, "loss": 0.010597913525998592, "memory(GiB)": 22.66, "step": 24838, "token_acc": 1.0, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.8069064093818017, "grad_norm": 0.34584981203079224, "learning_rate": 9.852476424941016e-07, "loss": 0.00877278484404087, "memory(GiB)": 22.66, "step": 24839, "token_acc": 1.0, "train_speed(iter/s)": 0.957521 }, { "epoch": 0.806938894844557, "grad_norm": 0.21209260821342468, "learning_rate": 9.849274979368095e-07, "loss": 0.01135275699198246, "memory(GiB)": 22.66, "step": 24840, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957526 }, { "epoch": 0.8069713803073125, "grad_norm": 0.34095072746276855, "learning_rate": 9.846073997187982e-07, "loss": 0.009769923985004425, "memory(GiB)": 22.66, "step": 24841, "token_acc": 1.0, "train_speed(iter/s)": 0.957531 }, { "epoch": 0.8070038657700679, "grad_norm": 0.29201218485832214, "learning_rate": 9.842873478437632e-07, "loss": 0.007857397198677063, "memory(GiB)": 22.66, "step": 24842, "token_acc": 1.0, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.8070363512328234, "grad_norm": 0.35764774680137634, "learning_rate": 9.839673423153956e-07, "loss": 0.01048118993639946, "memory(GiB)": 22.66, "step": 24843, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.8070688366955787, "grad_norm": 0.3922019302845001, "learning_rate": 9.836473831373915e-07, "loss": 0.01675471104681492, "memory(GiB)": 22.66, "step": 24844, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.8071013221583342, "grad_norm": 0.29659631848335266, "learning_rate": 9.833274703134404e-07, "loss": 0.011600314639508724, "memory(GiB)": 22.66, "step": 24845, "token_acc": 1.0, "train_speed(iter/s)": 0.957553 }, { "epoch": 0.8071338076210895, "grad_norm": 0.376203328371048, "learning_rate": 9.830076038472387e-07, "loss": 0.013773931190371513, "memory(GiB)": 22.66, "step": 24846, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.807166293083845, "grad_norm": 0.3359658122062683, "learning_rate": 9.826877837424741e-07, "loss": 0.009937655180692673, "memory(GiB)": 22.66, "step": 24847, "token_acc": 1.0, "train_speed(iter/s)": 0.957565 }, { "epoch": 0.8071987785466004, "grad_norm": 0.3650614619255066, "learning_rate": 9.823680100028415e-07, "loss": 0.01066831685602665, "memory(GiB)": 22.66, "step": 24848, "token_acc": 0.986159169550173, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.8072312640093559, "grad_norm": 0.3553481996059418, "learning_rate": 9.820482826320277e-07, "loss": 0.014970529824495316, "memory(GiB)": 22.66, "step": 24849, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957579 }, { "epoch": 0.8072637494721112, "grad_norm": 0.418865442276001, "learning_rate": 9.817286016337246e-07, "loss": 0.01558747049421072, "memory(GiB)": 22.66, "step": 24850, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.8072962349348667, "grad_norm": 0.24658650159835815, "learning_rate": 9.814089670116222e-07, "loss": 0.007733054459095001, "memory(GiB)": 22.66, "step": 24851, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957592 }, { "epoch": 0.807328720397622, "grad_norm": 0.31381848454475403, "learning_rate": 9.810893787694088e-07, "loss": 0.010543250478804111, "memory(GiB)": 22.66, "step": 24852, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.9576 }, { "epoch": 0.8073612058603775, "grad_norm": 0.3558180034160614, "learning_rate": 9.807698369107743e-07, "loss": 0.009547315537929535, "memory(GiB)": 22.66, "step": 24853, "token_acc": 0.9937888198757764, "train_speed(iter/s)": 0.957608 }, { "epoch": 0.8073936913231329, "grad_norm": 0.2949848771095276, "learning_rate": 9.80450341439404e-07, "loss": 0.010747005231678486, "memory(GiB)": 22.66, "step": 24854, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.957617 }, { "epoch": 0.8074261767858883, "grad_norm": 0.31972819566726685, "learning_rate": 9.80130892358987e-07, "loss": 0.012259559705853462, "memory(GiB)": 22.66, "step": 24855, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.8074586622486437, "grad_norm": 0.36225712299346924, "learning_rate": 9.798114896732103e-07, "loss": 0.014311525039374828, "memory(GiB)": 22.66, "step": 24856, "token_acc": 0.9896907216494846, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.8074911477113992, "grad_norm": 0.5177252888679504, "learning_rate": 9.794921333857605e-07, "loss": 0.011478274129331112, "memory(GiB)": 22.66, "step": 24857, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.957641 }, { "epoch": 0.8075236331741545, "grad_norm": 0.28012514114379883, "learning_rate": 9.791728235003217e-07, "loss": 0.01041132677346468, "memory(GiB)": 22.66, "step": 24858, "token_acc": 1.0, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.80755611863691, "grad_norm": 0.3889411389827728, "learning_rate": 9.788535600205807e-07, "loss": 0.014772901311516762, "memory(GiB)": 22.66, "step": 24859, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.8075886040996654, "grad_norm": 0.2832067012786865, "learning_rate": 9.785343429502213e-07, "loss": 0.00958352442830801, "memory(GiB)": 22.66, "step": 24860, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.957666 }, { "epoch": 0.8076210895624208, "grad_norm": 0.39013493061065674, "learning_rate": 9.782151722929296e-07, "loss": 0.01524580828845501, "memory(GiB)": 22.66, "step": 24861, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.8076535750251762, "grad_norm": 0.45310020446777344, "learning_rate": 9.778960480523863e-07, "loss": 0.015137875452637672, "memory(GiB)": 22.66, "step": 24862, "token_acc": 0.992, "train_speed(iter/s)": 0.957682 }, { "epoch": 0.8076860604879317, "grad_norm": 0.3685360848903656, "learning_rate": 9.775769702322762e-07, "loss": 0.01068025827407837, "memory(GiB)": 22.66, "step": 24863, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957691 }, { "epoch": 0.807718545950687, "grad_norm": 0.2766893804073334, "learning_rate": 9.772579388362814e-07, "loss": 0.010179484263062477, "memory(GiB)": 22.66, "step": 24864, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.8077510314134425, "grad_norm": 0.3978193402290344, "learning_rate": 9.769389538680856e-07, "loss": 0.01313023455440998, "memory(GiB)": 22.66, "step": 24865, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957707 }, { "epoch": 0.8077835168761979, "grad_norm": 0.24966517090797424, "learning_rate": 9.766200153313682e-07, "loss": 0.007445205003023148, "memory(GiB)": 22.66, "step": 24866, "token_acc": 1.0, "train_speed(iter/s)": 0.957715 }, { "epoch": 0.8078160023389533, "grad_norm": 0.38347384333610535, "learning_rate": 9.763011232298102e-07, "loss": 0.016150999814271927, "memory(GiB)": 22.66, "step": 24867, "token_acc": 1.0, "train_speed(iter/s)": 0.957723 }, { "epoch": 0.8078484878017087, "grad_norm": 0.3518744111061096, "learning_rate": 9.759822775670935e-07, "loss": 0.013302205130457878, "memory(GiB)": 22.66, "step": 24868, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.8078809732644642, "grad_norm": 0.2761451303958893, "learning_rate": 9.756634783468965e-07, "loss": 0.011758007109165192, "memory(GiB)": 22.66, "step": 24869, "token_acc": 1.0, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.8079134587272195, "grad_norm": 0.4216301143169403, "learning_rate": 9.753447255729015e-07, "loss": 0.013688020408153534, "memory(GiB)": 22.66, "step": 24870, "token_acc": 1.0, "train_speed(iter/s)": 0.957747 }, { "epoch": 0.807945944189975, "grad_norm": 0.3562673032283783, "learning_rate": 9.750260192487831e-07, "loss": 0.012353816069662571, "memory(GiB)": 22.66, "step": 24871, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957755 }, { "epoch": 0.8079784296527304, "grad_norm": 0.2529492974281311, "learning_rate": 9.747073593782236e-07, "loss": 0.011746309697628021, "memory(GiB)": 22.66, "step": 24872, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.8080109151154858, "grad_norm": 0.2255699783563614, "learning_rate": 9.74388745964896e-07, "loss": 0.006856391206383705, "memory(GiB)": 22.66, "step": 24873, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95777 }, { "epoch": 0.8080434005782412, "grad_norm": 0.2373199462890625, "learning_rate": 9.740701790124824e-07, "loss": 0.008491843938827515, "memory(GiB)": 22.66, "step": 24874, "token_acc": 1.0, "train_speed(iter/s)": 0.957778 }, { "epoch": 0.8080758860409967, "grad_norm": 0.38662344217300415, "learning_rate": 9.737516585246565e-07, "loss": 0.015500150620937347, "memory(GiB)": 22.66, "step": 24875, "token_acc": 0.977859778597786, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.808108371503752, "grad_norm": 0.5045509338378906, "learning_rate": 9.734331845050971e-07, "loss": 0.014494607225060463, "memory(GiB)": 22.66, "step": 24876, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.8081408569665075, "grad_norm": 0.33507803082466125, "learning_rate": 9.73114756957476e-07, "loss": 0.010846373625099659, "memory(GiB)": 22.66, "step": 24877, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.8081733424292629, "grad_norm": 0.36740320920944214, "learning_rate": 9.727963758854714e-07, "loss": 0.01692899316549301, "memory(GiB)": 22.66, "step": 24878, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.8082058278920183, "grad_norm": 0.4130153954029083, "learning_rate": 9.724780412927558e-07, "loss": 0.016612596809864044, "memory(GiB)": 22.66, "step": 24879, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.8082383133547737, "grad_norm": 0.3287791609764099, "learning_rate": 9.721597531830063e-07, "loss": 0.00989258661866188, "memory(GiB)": 22.66, "step": 24880, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.8082707988175292, "grad_norm": 0.3292231559753418, "learning_rate": 9.718415115598929e-07, "loss": 0.011988144367933273, "memory(GiB)": 22.66, "step": 24881, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957836 }, { "epoch": 0.8083032842802845, "grad_norm": 0.35101714730262756, "learning_rate": 9.715233164270893e-07, "loss": 0.011641139164566994, "memory(GiB)": 22.66, "step": 24882, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957844 }, { "epoch": 0.80833576974304, "grad_norm": 0.2995472848415375, "learning_rate": 9.712051677882689e-07, "loss": 0.00848033931106329, "memory(GiB)": 22.66, "step": 24883, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.8083682552057954, "grad_norm": 0.37915775179862976, "learning_rate": 9.708870656471032e-07, "loss": 0.019610682502388954, "memory(GiB)": 22.66, "step": 24884, "token_acc": 1.0, "train_speed(iter/s)": 0.957861 }, { "epoch": 0.8084007406685508, "grad_norm": 0.3123410642147064, "learning_rate": 9.70569010007265e-07, "loss": 0.013081916607916355, "memory(GiB)": 22.66, "step": 24885, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.8084332261313062, "grad_norm": 0.4112846553325653, "learning_rate": 9.702510008724219e-07, "loss": 0.01048186793923378, "memory(GiB)": 22.66, "step": 24886, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957877 }, { "epoch": 0.8084657115940617, "grad_norm": 0.26517680287361145, "learning_rate": 9.699330382462463e-07, "loss": 0.006886040326207876, "memory(GiB)": 22.66, "step": 24887, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.808498197056817, "grad_norm": 0.40414392948150635, "learning_rate": 9.69615122132407e-07, "loss": 0.011142078787088394, "memory(GiB)": 22.66, "step": 24888, "token_acc": 1.0, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.8085306825195725, "grad_norm": 0.3043113946914673, "learning_rate": 9.692972525345752e-07, "loss": 0.013046862557530403, "memory(GiB)": 22.66, "step": 24889, "token_acc": 1.0, "train_speed(iter/s)": 0.957899 }, { "epoch": 0.8085631679823279, "grad_norm": 0.24850870668888092, "learning_rate": 9.689794294564165e-07, "loss": 0.010359451174736023, "memory(GiB)": 22.66, "step": 24890, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957905 }, { "epoch": 0.8085956534450833, "grad_norm": 0.4207160770893097, "learning_rate": 9.686616529016002e-07, "loss": 0.01838799938559532, "memory(GiB)": 22.66, "step": 24891, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.8086281389078387, "grad_norm": 0.2862495481967926, "learning_rate": 9.683439228737945e-07, "loss": 0.010247313417494297, "memory(GiB)": 22.66, "step": 24892, "token_acc": 1.0, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.8086606243705942, "grad_norm": 0.3849083185195923, "learning_rate": 9.680262393766672e-07, "loss": 0.013854363933205605, "memory(GiB)": 22.66, "step": 24893, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.8086931098333495, "grad_norm": 0.22064977884292603, "learning_rate": 9.677086024138827e-07, "loss": 0.010383693501353264, "memory(GiB)": 22.66, "step": 24894, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.808725595296105, "grad_norm": 0.2503072917461395, "learning_rate": 9.67391011989109e-07, "loss": 0.009941300377249718, "memory(GiB)": 22.66, "step": 24895, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957933 }, { "epoch": 0.8087580807588605, "grad_norm": 0.4020251929759979, "learning_rate": 9.670734681060074e-07, "loss": 0.012394067831337452, "memory(GiB)": 22.66, "step": 24896, "token_acc": 1.0, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.8087905662216158, "grad_norm": 0.5903941988945007, "learning_rate": 9.667559707682489e-07, "loss": 0.017384208738803864, "memory(GiB)": 22.66, "step": 24897, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957943 }, { "epoch": 0.8088230516843713, "grad_norm": 0.3076155483722687, "learning_rate": 9.664385199794929e-07, "loss": 0.009179613552987576, "memory(GiB)": 22.66, "step": 24898, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957949 }, { "epoch": 0.8088555371471267, "grad_norm": 0.44601303339004517, "learning_rate": 9.661211157434059e-07, "loss": 0.014633812010288239, "memory(GiB)": 22.66, "step": 24899, "token_acc": 1.0, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.8088880226098821, "grad_norm": 0.6274792551994324, "learning_rate": 9.65803758063651e-07, "loss": 0.009290321730077267, "memory(GiB)": 22.66, "step": 24900, "token_acc": 0.9817518248175182, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.8089205080726375, "grad_norm": 0.23543842136859894, "learning_rate": 9.654864469438874e-07, "loss": 0.008583871647715569, "memory(GiB)": 22.66, "step": 24901, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957965 }, { "epoch": 0.808952993535393, "grad_norm": 0.4209285080432892, "learning_rate": 9.651691823877828e-07, "loss": 0.01636035367846489, "memory(GiB)": 22.66, "step": 24902, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957971 }, { "epoch": 0.8089854789981483, "grad_norm": 0.42853960394859314, "learning_rate": 9.648519643989956e-07, "loss": 0.016077883541584015, "memory(GiB)": 22.66, "step": 24903, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.8090179644609038, "grad_norm": 0.3694955110549927, "learning_rate": 9.645347929811882e-07, "loss": 0.012376287020742893, "memory(GiB)": 22.66, "step": 24904, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.8090504499236592, "grad_norm": 0.42917755246162415, "learning_rate": 9.642176681380195e-07, "loss": 0.01758570969104767, "memory(GiB)": 22.66, "step": 24905, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.8090829353864146, "grad_norm": 0.3394082486629486, "learning_rate": 9.6390058987315e-07, "loss": 0.012802844867110252, "memory(GiB)": 22.66, "step": 24906, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957991 }, { "epoch": 0.80911542084917, "grad_norm": 0.2831304669380188, "learning_rate": 9.635835581902403e-07, "loss": 0.015157114714384079, "memory(GiB)": 22.66, "step": 24907, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.8091479063119255, "grad_norm": 0.3732070326805115, "learning_rate": 9.632665730929497e-07, "loss": 0.01328884344547987, "memory(GiB)": 22.66, "step": 24908, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.8091803917746808, "grad_norm": 0.25222402811050415, "learning_rate": 9.629496345849349e-07, "loss": 0.0064233941957354546, "memory(GiB)": 22.66, "step": 24909, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.8092128772374363, "grad_norm": 0.3898361027240753, "learning_rate": 9.626327426698539e-07, "loss": 0.01146694552153349, "memory(GiB)": 22.66, "step": 24910, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.8092453627001917, "grad_norm": 0.4401806592941284, "learning_rate": 9.623158973513652e-07, "loss": 0.013694602064788342, "memory(GiB)": 22.66, "step": 24911, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.958023 }, { "epoch": 0.8092778481629471, "grad_norm": 0.29403936862945557, "learning_rate": 9.619990986331268e-07, "loss": 0.009172003716230392, "memory(GiB)": 22.66, "step": 24912, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.8093103336257025, "grad_norm": 0.40003249049186707, "learning_rate": 9.61682346518792e-07, "loss": 0.010815334506332874, "memory(GiB)": 22.66, "step": 24913, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.958036 }, { "epoch": 0.809342819088458, "grad_norm": 0.3201061189174652, "learning_rate": 9.61365641012017e-07, "loss": 0.010408580303192139, "memory(GiB)": 22.66, "step": 24914, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958043 }, { "epoch": 0.8093753045512133, "grad_norm": 0.4367913007736206, "learning_rate": 9.610489821164588e-07, "loss": 0.015268595889210701, "memory(GiB)": 22.66, "step": 24915, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.8094077900139688, "grad_norm": 0.26421424746513367, "learning_rate": 9.607323698357712e-07, "loss": 0.009021131321787834, "memory(GiB)": 22.66, "step": 24916, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.8094402754767241, "grad_norm": 0.4102844297885895, "learning_rate": 9.604158041736088e-07, "loss": 0.014752132818102837, "memory(GiB)": 22.66, "step": 24917, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.8094727609394796, "grad_norm": 0.3455064594745636, "learning_rate": 9.600992851336243e-07, "loss": 0.016604403033852577, "memory(GiB)": 22.66, "step": 24918, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.809505246402235, "grad_norm": 0.24028727412223816, "learning_rate": 9.597828127194709e-07, "loss": 0.005922744981944561, "memory(GiB)": 22.66, "step": 24919, "token_acc": 1.0, "train_speed(iter/s)": 0.958083 }, { "epoch": 0.8095377318649905, "grad_norm": 0.25379350781440735, "learning_rate": 9.59466386934801e-07, "loss": 0.01036376878619194, "memory(GiB)": 22.66, "step": 24920, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.8095702173277458, "grad_norm": 0.2666696012020111, "learning_rate": 9.591500077832688e-07, "loss": 0.012578731402754784, "memory(GiB)": 22.66, "step": 24921, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.958099 }, { "epoch": 0.8096027027905013, "grad_norm": 0.4006003141403198, "learning_rate": 9.588336752685218e-07, "loss": 0.011140760034322739, "memory(GiB)": 22.66, "step": 24922, "token_acc": 1.0, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.8096351882532566, "grad_norm": 0.3234108090400696, "learning_rate": 9.585173893942152e-07, "loss": 0.012995760887861252, "memory(GiB)": 22.66, "step": 24923, "token_acc": 1.0, "train_speed(iter/s)": 0.958116 }, { "epoch": 0.8096676737160121, "grad_norm": 0.45622003078460693, "learning_rate": 9.582011501639938e-07, "loss": 0.01846364513039589, "memory(GiB)": 22.66, "step": 24924, "token_acc": 0.9823788546255506, "train_speed(iter/s)": 0.958124 }, { "epoch": 0.8097001591787675, "grad_norm": 0.2727138102054596, "learning_rate": 9.578849575815137e-07, "loss": 0.008551822043955326, "memory(GiB)": 22.66, "step": 24925, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958132 }, { "epoch": 0.809732644641523, "grad_norm": 0.3754153847694397, "learning_rate": 9.575688116504194e-07, "loss": 0.015091591514647007, "memory(GiB)": 22.66, "step": 24926, "token_acc": 1.0, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.8097651301042783, "grad_norm": 0.30350613594055176, "learning_rate": 9.572527123743635e-07, "loss": 0.009389190003275871, "memory(GiB)": 22.66, "step": 24927, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958148 }, { "epoch": 0.8097976155670338, "grad_norm": 0.49216169118881226, "learning_rate": 9.569366597569906e-07, "loss": 0.009984882548451424, "memory(GiB)": 22.66, "step": 24928, "token_acc": 1.0, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.8098301010297891, "grad_norm": 0.32501524686813354, "learning_rate": 9.566206538019485e-07, "loss": 0.01369999349117279, "memory(GiB)": 22.66, "step": 24929, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.8098625864925446, "grad_norm": 0.40383949875831604, "learning_rate": 9.563046945128883e-07, "loss": 0.012044520117342472, "memory(GiB)": 22.66, "step": 24930, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.8098950719553, "grad_norm": 0.356189101934433, "learning_rate": 9.559887818934532e-07, "loss": 0.00894962903112173, "memory(GiB)": 22.66, "step": 24931, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958181 }, { "epoch": 0.8099275574180554, "grad_norm": 0.4042007625102997, "learning_rate": 9.556729159472917e-07, "loss": 0.016265233978629112, "memory(GiB)": 22.66, "step": 24932, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958189 }, { "epoch": 0.8099600428808108, "grad_norm": 0.3345261216163635, "learning_rate": 9.553570966780456e-07, "loss": 0.01222945936024189, "memory(GiB)": 22.66, "step": 24933, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.8099925283435663, "grad_norm": 0.38460609316825867, "learning_rate": 9.550413240893626e-07, "loss": 0.012978184036910534, "memory(GiB)": 22.66, "step": 24934, "token_acc": 1.0, "train_speed(iter/s)": 0.958205 }, { "epoch": 0.8100250138063216, "grad_norm": 0.3707998991012573, "learning_rate": 9.547255981848864e-07, "loss": 0.008057942613959312, "memory(GiB)": 22.66, "step": 24935, "token_acc": 1.0, "train_speed(iter/s)": 0.958212 }, { "epoch": 0.8100574992690771, "grad_norm": 0.27992963790893555, "learning_rate": 9.54409918968262e-07, "loss": 0.00856715440750122, "memory(GiB)": 22.66, "step": 24936, "token_acc": 1.0, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.8100899847318325, "grad_norm": 0.32916340231895447, "learning_rate": 9.540942864431307e-07, "loss": 0.012073526158928871, "memory(GiB)": 22.66, "step": 24937, "token_acc": 1.0, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.8101224701945879, "grad_norm": 0.3596322238445282, "learning_rate": 9.537787006131366e-07, "loss": 0.011647867038846016, "memory(GiB)": 22.66, "step": 24938, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958237 }, { "epoch": 0.8101549556573433, "grad_norm": 0.4898502230644226, "learning_rate": 9.534631614819218e-07, "loss": 0.013333579525351524, "memory(GiB)": 22.66, "step": 24939, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.8101874411200988, "grad_norm": 0.3688678741455078, "learning_rate": 9.531476690531288e-07, "loss": 0.012000255286693573, "memory(GiB)": 22.66, "step": 24940, "token_acc": 1.0, "train_speed(iter/s)": 0.958253 }, { "epoch": 0.8102199265828541, "grad_norm": 0.29056745767593384, "learning_rate": 9.528322233303972e-07, "loss": 0.008867643773555756, "memory(GiB)": 22.66, "step": 24941, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.958261 }, { "epoch": 0.8102524120456096, "grad_norm": 0.4615035653114319, "learning_rate": 9.525168243173688e-07, "loss": 0.019499491900205612, "memory(GiB)": 22.66, "step": 24942, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.810284897508365, "grad_norm": 0.3869079351425171, "learning_rate": 9.522014720176831e-07, "loss": 0.008561646565794945, "memory(GiB)": 22.66, "step": 24943, "token_acc": 1.0, "train_speed(iter/s)": 0.958277 }, { "epoch": 0.8103173829711204, "grad_norm": 0.2832391560077667, "learning_rate": 9.518861664349798e-07, "loss": 0.009199708700180054, "memory(GiB)": 22.66, "step": 24944, "token_acc": 1.0, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.8103498684338758, "grad_norm": 0.3198481798171997, "learning_rate": 9.515709075728996e-07, "loss": 0.00856725126504898, "memory(GiB)": 22.66, "step": 24945, "token_acc": 1.0, "train_speed(iter/s)": 0.95829 }, { "epoch": 0.8103823538966313, "grad_norm": 0.38941073417663574, "learning_rate": 9.512556954350788e-07, "loss": 0.0095132477581501, "memory(GiB)": 22.66, "step": 24946, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.958298 }, { "epoch": 0.8104148393593866, "grad_norm": 0.31121668219566345, "learning_rate": 9.509405300251556e-07, "loss": 0.01508923340588808, "memory(GiB)": 22.66, "step": 24947, "token_acc": 0.9757281553398058, "train_speed(iter/s)": 0.958306 }, { "epoch": 0.8104473248221421, "grad_norm": 0.43279898166656494, "learning_rate": 9.506254113467683e-07, "loss": 0.01487799920141697, "memory(GiB)": 22.66, "step": 24948, "token_acc": 1.0, "train_speed(iter/s)": 0.958312 }, { "epoch": 0.8104798102848975, "grad_norm": 0.3032623529434204, "learning_rate": 9.503103394035545e-07, "loss": 0.010551980696618557, "memory(GiB)": 22.66, "step": 24949, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.8105122957476529, "grad_norm": 0.3022342324256897, "learning_rate": 9.49995314199148e-07, "loss": 0.011203303933143616, "memory(GiB)": 22.66, "step": 24950, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.8105447812104083, "grad_norm": 0.45294198393821716, "learning_rate": 9.496803357371875e-07, "loss": 0.01384495198726654, "memory(GiB)": 22.66, "step": 24951, "token_acc": 1.0, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.8105772666731638, "grad_norm": 0.39721307158470154, "learning_rate": 9.493654040213046e-07, "loss": 0.008304646238684654, "memory(GiB)": 22.66, "step": 24952, "token_acc": 1.0, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.8106097521359191, "grad_norm": 0.31177371740341187, "learning_rate": 9.490505190551391e-07, "loss": 0.007802943699061871, "memory(GiB)": 22.66, "step": 24953, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.958341 }, { "epoch": 0.8106422375986746, "grad_norm": 0.277652382850647, "learning_rate": 9.487356808423203e-07, "loss": 0.01084250956773758, "memory(GiB)": 22.66, "step": 24954, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958346 }, { "epoch": 0.81067472306143, "grad_norm": 0.3351089358329773, "learning_rate": 9.484208893864861e-07, "loss": 0.013494143262505531, "memory(GiB)": 22.66, "step": 24955, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.958351 }, { "epoch": 0.8107072085241854, "grad_norm": 0.34679898619651794, "learning_rate": 9.481061446912659e-07, "loss": 0.017200332134962082, "memory(GiB)": 22.66, "step": 24956, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.958357 }, { "epoch": 0.8107396939869408, "grad_norm": 0.36966824531555176, "learning_rate": 9.477914467602939e-07, "loss": 0.01053091324865818, "memory(GiB)": 22.66, "step": 24957, "token_acc": 1.0, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.8107721794496963, "grad_norm": 0.3897083103656769, "learning_rate": 9.47476795597202e-07, "loss": 0.01494954340159893, "memory(GiB)": 22.66, "step": 24958, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.958368 }, { "epoch": 0.8108046649124516, "grad_norm": 0.3388482928276062, "learning_rate": 9.471621912056228e-07, "loss": 0.01037769764661789, "memory(GiB)": 22.66, "step": 24959, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.958373 }, { "epoch": 0.8108371503752071, "grad_norm": 0.2838001251220703, "learning_rate": 9.468476335891852e-07, "loss": 0.011782770045101643, "memory(GiB)": 22.66, "step": 24960, "token_acc": 1.0, "train_speed(iter/s)": 0.958379 }, { "epoch": 0.8108696358379626, "grad_norm": 0.2970700263977051, "learning_rate": 9.465331227515206e-07, "loss": 0.013462948612868786, "memory(GiB)": 22.66, "step": 24961, "token_acc": 1.0, "train_speed(iter/s)": 0.958384 }, { "epoch": 0.8109021213007179, "grad_norm": 0.27911901473999023, "learning_rate": 9.46218658696259e-07, "loss": 0.012427945621311665, "memory(GiB)": 22.66, "step": 24962, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.8109346067634734, "grad_norm": 0.37014320492744446, "learning_rate": 9.459042414270297e-07, "loss": 0.010075677186250687, "memory(GiB)": 22.66, "step": 24963, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.8109670922262288, "grad_norm": 0.29766297340393066, "learning_rate": 9.455898709474626e-07, "loss": 0.012074897065758705, "memory(GiB)": 22.66, "step": 24964, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.9584 }, { "epoch": 0.8109995776889842, "grad_norm": 0.2807174623012543, "learning_rate": 9.452755472611836e-07, "loss": 0.007814879529178143, "memory(GiB)": 22.66, "step": 24965, "token_acc": 1.0, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.8110320631517396, "grad_norm": 0.37549644708633423, "learning_rate": 9.449612703718214e-07, "loss": 0.00915441568940878, "memory(GiB)": 22.66, "step": 24966, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958411 }, { "epoch": 0.8110645486144951, "grad_norm": 0.28173762559890747, "learning_rate": 9.446470402830032e-07, "loss": 0.010495619848370552, "memory(GiB)": 22.66, "step": 24967, "token_acc": 1.0, "train_speed(iter/s)": 0.958417 }, { "epoch": 0.8110970340772504, "grad_norm": 0.47939348220825195, "learning_rate": 9.443328569983579e-07, "loss": 0.00967164896428585, "memory(GiB)": 22.66, "step": 24968, "token_acc": 1.0, "train_speed(iter/s)": 0.958424 }, { "epoch": 0.8111295195400059, "grad_norm": 0.31866633892059326, "learning_rate": 9.440187205215073e-07, "loss": 0.010505151003599167, "memory(GiB)": 22.66, "step": 24969, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95843 }, { "epoch": 0.8111620050027613, "grad_norm": 0.2706310451030731, "learning_rate": 9.437046308560799e-07, "loss": 0.007311876863241196, "memory(GiB)": 22.66, "step": 24970, "token_acc": 1.0, "train_speed(iter/s)": 0.958437 }, { "epoch": 0.8111944904655167, "grad_norm": 0.512732982635498, "learning_rate": 9.433905880056998e-07, "loss": 0.01697760447859764, "memory(GiB)": 22.66, "step": 24971, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.958443 }, { "epoch": 0.8112269759282721, "grad_norm": 0.23235416412353516, "learning_rate": 9.430765919739926e-07, "loss": 0.010031290352344513, "memory(GiB)": 22.66, "step": 24972, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958448 }, { "epoch": 0.8112594613910276, "grad_norm": 0.48349931836128235, "learning_rate": 9.427626427645803e-07, "loss": 0.01541292667388916, "memory(GiB)": 22.66, "step": 24973, "token_acc": 1.0, "train_speed(iter/s)": 0.958454 }, { "epoch": 0.8112919468537829, "grad_norm": 0.32447201013565063, "learning_rate": 9.424487403810883e-07, "loss": 0.011149993166327477, "memory(GiB)": 22.66, "step": 24974, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958461 }, { "epoch": 0.8113244323165384, "grad_norm": 0.36373189091682434, "learning_rate": 9.421348848271361e-07, "loss": 0.016043642535805702, "memory(GiB)": 22.66, "step": 24975, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.958467 }, { "epoch": 0.8113569177792938, "grad_norm": 0.40886300802230835, "learning_rate": 9.418210761063496e-07, "loss": 0.0162762813270092, "memory(GiB)": 22.66, "step": 24976, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.958473 }, { "epoch": 0.8113894032420492, "grad_norm": 0.29972735047340393, "learning_rate": 9.415073142223508e-07, "loss": 0.009542571380734444, "memory(GiB)": 22.66, "step": 24977, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958481 }, { "epoch": 0.8114218887048046, "grad_norm": 0.3334157466888428, "learning_rate": 9.41193599178758e-07, "loss": 0.012764476239681244, "memory(GiB)": 22.66, "step": 24978, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958489 }, { "epoch": 0.81145437416756, "grad_norm": 0.3622838854789734, "learning_rate": 9.408799309791944e-07, "loss": 0.013536088168621063, "memory(GiB)": 22.66, "step": 24979, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.8114868596303154, "grad_norm": 0.4289492666721344, "learning_rate": 9.405663096272766e-07, "loss": 0.01174466498196125, "memory(GiB)": 22.66, "step": 24980, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958505 }, { "epoch": 0.8115193450930709, "grad_norm": 0.27452775835990906, "learning_rate": 9.402527351266289e-07, "loss": 0.00920695811510086, "memory(GiB)": 22.66, "step": 24981, "token_acc": 1.0, "train_speed(iter/s)": 0.958513 }, { "epoch": 0.8115518305558262, "grad_norm": 0.42934179306030273, "learning_rate": 9.399392074808672e-07, "loss": 0.02085888385772705, "memory(GiB)": 22.66, "step": 24982, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958521 }, { "epoch": 0.8115843160185817, "grad_norm": 0.36772245168685913, "learning_rate": 9.396257266936126e-07, "loss": 0.006280918605625629, "memory(GiB)": 22.66, "step": 24983, "token_acc": 1.0, "train_speed(iter/s)": 0.958529 }, { "epoch": 0.8116168014813371, "grad_norm": 0.3763435482978821, "learning_rate": 9.3931229276848e-07, "loss": 0.015369834378361702, "memory(GiB)": 22.66, "step": 24984, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958536 }, { "epoch": 0.8116492869440926, "grad_norm": 0.43409258127212524, "learning_rate": 9.389989057090881e-07, "loss": 0.01143668033182621, "memory(GiB)": 22.66, "step": 24985, "token_acc": 1.0, "train_speed(iter/s)": 0.958544 }, { "epoch": 0.8116817724068479, "grad_norm": 0.27774906158447266, "learning_rate": 9.386855655190541e-07, "loss": 0.008229302242398262, "memory(GiB)": 22.66, "step": 24986, "token_acc": 1.0, "train_speed(iter/s)": 0.958552 }, { "epoch": 0.8117142578696034, "grad_norm": 0.3483646810054779, "learning_rate": 9.383722722019955e-07, "loss": 0.008719388395547867, "memory(GiB)": 22.66, "step": 24987, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.958559 }, { "epoch": 0.8117467433323587, "grad_norm": 0.3997791111469269, "learning_rate": 9.380590257615252e-07, "loss": 0.012522443197667599, "memory(GiB)": 22.66, "step": 24988, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.958541 }, { "epoch": 0.8117792287951142, "grad_norm": 0.3963198661804199, "learning_rate": 9.377458262012607e-07, "loss": 0.016575608402490616, "memory(GiB)": 22.66, "step": 24989, "token_acc": 1.0, "train_speed(iter/s)": 0.958549 }, { "epoch": 0.8118117142578696, "grad_norm": 0.37354397773742676, "learning_rate": 9.374326735248163e-07, "loss": 0.011943573132157326, "memory(GiB)": 22.66, "step": 24990, "token_acc": 1.0, "train_speed(iter/s)": 0.958556 }, { "epoch": 0.811844199720625, "grad_norm": 0.3990826904773712, "learning_rate": 9.371195677358058e-07, "loss": 0.014231368899345398, "memory(GiB)": 22.66, "step": 24991, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.958564 }, { "epoch": 0.8118766851833804, "grad_norm": 0.3686908483505249, "learning_rate": 9.368065088378447e-07, "loss": 0.016619855538010597, "memory(GiB)": 22.66, "step": 24992, "token_acc": 0.995, "train_speed(iter/s)": 0.958572 }, { "epoch": 0.8119091706461359, "grad_norm": 0.3981214463710785, "learning_rate": 9.36493496834543e-07, "loss": 0.016381124034523964, "memory(GiB)": 22.66, "step": 24993, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.95858 }, { "epoch": 0.8119416561088912, "grad_norm": 0.2158339023590088, "learning_rate": 9.361805317295153e-07, "loss": 0.007990164682269096, "memory(GiB)": 22.66, "step": 24994, "token_acc": 1.0, "train_speed(iter/s)": 0.958588 }, { "epoch": 0.8119741415716467, "grad_norm": 0.44563165307044983, "learning_rate": 9.358676135263728e-07, "loss": 0.019094791263341904, "memory(GiB)": 22.66, "step": 24995, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958594 }, { "epoch": 0.8120066270344021, "grad_norm": 0.3984435796737671, "learning_rate": 9.355547422287287e-07, "loss": 0.01143164373934269, "memory(GiB)": 22.66, "step": 24996, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.9586 }, { "epoch": 0.8120391124971575, "grad_norm": 0.4800044894218445, "learning_rate": 9.352419178401916e-07, "loss": 0.018145127221941948, "memory(GiB)": 22.66, "step": 24997, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958606 }, { "epoch": 0.8120715979599129, "grad_norm": 0.2948600947856903, "learning_rate": 9.349291403643729e-07, "loss": 0.010832687839865685, "memory(GiB)": 22.66, "step": 24998, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958613 }, { "epoch": 0.8121040834226684, "grad_norm": 0.30636435747146606, "learning_rate": 9.346164098048826e-07, "loss": 0.01596968062222004, "memory(GiB)": 22.66, "step": 24999, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958619 }, { "epoch": 0.8121365688854237, "grad_norm": 0.2428065836429596, "learning_rate": 9.343037261653309e-07, "loss": 0.010276737622916698, "memory(GiB)": 22.66, "step": 25000, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958626 }, { "epoch": 0.8121365688854237, "eval_loss": 0.012433567084372044, "eval_runtime": 80.6252, "eval_samples_per_second": 123.411, "eval_steps_per_second": 3.857, "eval_token_acc": 0.994903186280819, "step": 25000 }, { "epoch": 0.8121690543481792, "grad_norm": 0.3525344133377075, "learning_rate": 9.339910894493242e-07, "loss": 0.010612986981868744, "memory(GiB)": 22.66, "step": 25001, "token_acc": 0.9944121071012806, "train_speed(iter/s)": 0.95526 }, { "epoch": 0.8122015398109346, "grad_norm": 0.25251755118370056, "learning_rate": 9.336784996604741e-07, "loss": 0.010042032226920128, "memory(GiB)": 22.66, "step": 25002, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955265 }, { "epoch": 0.81223402527369, "grad_norm": 0.32783564925193787, "learning_rate": 9.333659568023834e-07, "loss": 0.013230636715888977, "memory(GiB)": 22.66, "step": 25003, "token_acc": 1.0, "train_speed(iter/s)": 0.95527 }, { "epoch": 0.8122665107364454, "grad_norm": 0.41273319721221924, "learning_rate": 9.330534608786651e-07, "loss": 0.010870417580008507, "memory(GiB)": 22.66, "step": 25004, "token_acc": 1.0, "train_speed(iter/s)": 0.955276 }, { "epoch": 0.8122989961992009, "grad_norm": 0.44731661677360535, "learning_rate": 9.327410118929215e-07, "loss": 0.013317469507455826, "memory(GiB)": 22.66, "step": 25005, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955282 }, { "epoch": 0.8123314816619562, "grad_norm": 0.2180517464876175, "learning_rate": 9.3242860984876e-07, "loss": 0.007871638052165508, "memory(GiB)": 22.66, "step": 25006, "token_acc": 1.0, "train_speed(iter/s)": 0.955287 }, { "epoch": 0.8123639671247117, "grad_norm": 0.4134092926979065, "learning_rate": 9.321162547497876e-07, "loss": 0.013132510706782341, "memory(GiB)": 22.66, "step": 25007, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.8123964525874671, "grad_norm": 0.3143620193004608, "learning_rate": 9.318039465996053e-07, "loss": 0.009671179577708244, "memory(GiB)": 22.66, "step": 25008, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955298 }, { "epoch": 0.8124289380502225, "grad_norm": 0.34488990902900696, "learning_rate": 9.31491685401823e-07, "loss": 0.013219164684414864, "memory(GiB)": 22.66, "step": 25009, "token_acc": 1.0, "train_speed(iter/s)": 0.955303 }, { "epoch": 0.8124614235129779, "grad_norm": 0.44003790616989136, "learning_rate": 9.311794711600402e-07, "loss": 0.0113518126308918, "memory(GiB)": 22.66, "step": 25010, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955308 }, { "epoch": 0.8124939089757334, "grad_norm": 0.5732659101486206, "learning_rate": 9.308673038778637e-07, "loss": 0.01070423237979412, "memory(GiB)": 22.66, "step": 25011, "token_acc": 1.0, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.8125263944384887, "grad_norm": 0.3950212597846985, "learning_rate": 9.305551835588928e-07, "loss": 0.00925475638359785, "memory(GiB)": 22.66, "step": 25012, "token_acc": 0.9948453608247423, "train_speed(iter/s)": 0.955319 }, { "epoch": 0.8125588799012442, "grad_norm": 0.35671523213386536, "learning_rate": 9.302431102067316e-07, "loss": 0.01610502228140831, "memory(GiB)": 22.66, "step": 25013, "token_acc": 1.0, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.8125913653639996, "grad_norm": 0.40921351313591003, "learning_rate": 9.29931083824982e-07, "loss": 0.01350770890712738, "memory(GiB)": 22.66, "step": 25014, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.95533 }, { "epoch": 0.812623850826755, "grad_norm": 0.39071258902549744, "learning_rate": 9.296191044172459e-07, "loss": 0.01405374612659216, "memory(GiB)": 22.66, "step": 25015, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955336 }, { "epoch": 0.8126563362895104, "grad_norm": 0.2899214029312134, "learning_rate": 9.293071719871221e-07, "loss": 0.010286478325724602, "memory(GiB)": 22.66, "step": 25016, "token_acc": 1.0, "train_speed(iter/s)": 0.955341 }, { "epoch": 0.8126888217522659, "grad_norm": 0.3166213631629944, "learning_rate": 9.289952865382113e-07, "loss": 0.007648369297385216, "memory(GiB)": 22.66, "step": 25017, "token_acc": 1.0, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.8127213072150212, "grad_norm": 0.35834771394729614, "learning_rate": 9.286834480741136e-07, "loss": 0.013541508466005325, "memory(GiB)": 22.66, "step": 25018, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955342 }, { "epoch": 0.8127537926777767, "grad_norm": 0.4161185920238495, "learning_rate": 9.283716565984285e-07, "loss": 0.01253361813724041, "memory(GiB)": 22.66, "step": 25019, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.8127862781405321, "grad_norm": 0.4569821059703827, "learning_rate": 9.280599121147532e-07, "loss": 0.012713141739368439, "memory(GiB)": 22.66, "step": 25020, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.8128187636032875, "grad_norm": 0.26419225335121155, "learning_rate": 9.277482146266858e-07, "loss": 0.015056531876325607, "memory(GiB)": 22.66, "step": 25021, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955359 }, { "epoch": 0.8128512490660429, "grad_norm": 0.4207475483417511, "learning_rate": 9.274365641378247e-07, "loss": 0.016715869307518005, "memory(GiB)": 22.66, "step": 25022, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.8128837345287984, "grad_norm": 0.31184670329093933, "learning_rate": 9.271249606517657e-07, "loss": 0.011346706189215183, "memory(GiB)": 22.66, "step": 25023, "token_acc": 1.0, "train_speed(iter/s)": 0.95537 }, { "epoch": 0.8129162199915538, "grad_norm": 0.3612969219684601, "learning_rate": 9.268134041721072e-07, "loss": 0.020520377904176712, "memory(GiB)": 22.66, "step": 25024, "token_acc": 0.988, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.8129487054543092, "grad_norm": 0.2558887302875519, "learning_rate": 9.265018947024423e-07, "loss": 0.010004112496972084, "memory(GiB)": 22.66, "step": 25025, "token_acc": 1.0, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.8129811909170647, "grad_norm": 0.3199198842048645, "learning_rate": 9.261904322463667e-07, "loss": 0.009742152877151966, "memory(GiB)": 22.66, "step": 25026, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955388 }, { "epoch": 0.81301367637982, "grad_norm": 0.31675437092781067, "learning_rate": 9.258790168074766e-07, "loss": 0.009021496400237083, "memory(GiB)": 22.66, "step": 25027, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.8130461618425755, "grad_norm": 0.4004494249820709, "learning_rate": 9.25567648389366e-07, "loss": 0.011778258718550205, "memory(GiB)": 22.66, "step": 25028, "token_acc": 1.0, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.8130786473053309, "grad_norm": 0.3201521337032318, "learning_rate": 9.252563269956266e-07, "loss": 0.010191665962338448, "memory(GiB)": 22.66, "step": 25029, "token_acc": 0.988, "train_speed(iter/s)": 0.955407 }, { "epoch": 0.8131111327680863, "grad_norm": 0.24537470936775208, "learning_rate": 9.249450526298537e-07, "loss": 0.007115937303751707, "memory(GiB)": 22.66, "step": 25030, "token_acc": 1.0, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.8131436182308417, "grad_norm": 0.3563985824584961, "learning_rate": 9.246338252956366e-07, "loss": 0.012225974351167679, "memory(GiB)": 22.66, "step": 25031, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.8131761036935972, "grad_norm": 0.419592022895813, "learning_rate": 9.24322644996572e-07, "loss": 0.013333950191736221, "memory(GiB)": 22.66, "step": 25032, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.8132085891563525, "grad_norm": 0.4141140282154083, "learning_rate": 9.240115117362475e-07, "loss": 0.011314673349261284, "memory(GiB)": 22.66, "step": 25033, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.813241074619108, "grad_norm": 0.3772013187408447, "learning_rate": 9.237004255182564e-07, "loss": 0.011154621839523315, "memory(GiB)": 22.66, "step": 25034, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955438 }, { "epoch": 0.8132735600818634, "grad_norm": 0.4432990252971649, "learning_rate": 9.233893863461868e-07, "loss": 0.011666743084788322, "memory(GiB)": 22.66, "step": 25035, "token_acc": 1.0, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.8133060455446188, "grad_norm": 0.46139660477638245, "learning_rate": 9.230783942236293e-07, "loss": 0.022902678698301315, "memory(GiB)": 22.66, "step": 25036, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.955451 }, { "epoch": 0.8133385310073742, "grad_norm": 0.29480552673339844, "learning_rate": 9.227674491541738e-07, "loss": 0.007949150167405605, "memory(GiB)": 22.66, "step": 25037, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.8133710164701297, "grad_norm": 0.3716128170490265, "learning_rate": 9.224565511414091e-07, "loss": 0.014318672008812428, "memory(GiB)": 22.66, "step": 25038, "token_acc": 1.0, "train_speed(iter/s)": 0.955463 }, { "epoch": 0.813403501932885, "grad_norm": 0.27683085203170776, "learning_rate": 9.221457001889234e-07, "loss": 0.016363080590963364, "memory(GiB)": 22.66, "step": 25039, "token_acc": 1.0, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.8134359873956405, "grad_norm": 0.23351003229618073, "learning_rate": 9.218348963003032e-07, "loss": 0.0061331503093242645, "memory(GiB)": 22.66, "step": 25040, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.8134684728583959, "grad_norm": 0.3928726613521576, "learning_rate": 9.215241394791364e-07, "loss": 0.01164565235376358, "memory(GiB)": 22.66, "step": 25041, "token_acc": 1.0, "train_speed(iter/s)": 0.955486 }, { "epoch": 0.8135009583211513, "grad_norm": 0.2432277351617813, "learning_rate": 9.212134297290093e-07, "loss": 0.00930863432586193, "memory(GiB)": 22.66, "step": 25042, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.8135334437839067, "grad_norm": 0.30967801809310913, "learning_rate": 9.209027670535092e-07, "loss": 0.016995228826999664, "memory(GiB)": 22.66, "step": 25043, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955503 }, { "epoch": 0.8135659292466622, "grad_norm": 0.3316503167152405, "learning_rate": 9.205921514562194e-07, "loss": 0.008220787160098553, "memory(GiB)": 22.66, "step": 25044, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.8135984147094175, "grad_norm": 0.4185953438282013, "learning_rate": 9.202815829407258e-07, "loss": 0.015391364693641663, "memory(GiB)": 22.66, "step": 25045, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.813630900172173, "grad_norm": 0.33142873644828796, "learning_rate": 9.199710615106128e-07, "loss": 0.012151568196713924, "memory(GiB)": 22.66, "step": 25046, "token_acc": 1.0, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.8136633856349283, "grad_norm": 0.4063740372657776, "learning_rate": 9.196605871694658e-07, "loss": 0.021832626312971115, "memory(GiB)": 22.66, "step": 25047, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.8136958710976838, "grad_norm": 0.27439552545547485, "learning_rate": 9.193501599208649e-07, "loss": 0.007650353945791721, "memory(GiB)": 22.66, "step": 25048, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955543 }, { "epoch": 0.8137283565604392, "grad_norm": 0.192823126912117, "learning_rate": 9.190397797683942e-07, "loss": 0.005862886551767588, "memory(GiB)": 22.66, "step": 25049, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.8137608420231947, "grad_norm": 0.25894200801849365, "learning_rate": 9.187294467156366e-07, "loss": 0.01318381354212761, "memory(GiB)": 22.66, "step": 25050, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.955558 }, { "epoch": 0.81379332748595, "grad_norm": 0.9585103392601013, "learning_rate": 9.184191607661747e-07, "loss": 0.019818276166915894, "memory(GiB)": 22.66, "step": 25051, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.955565 }, { "epoch": 0.8138258129487055, "grad_norm": 0.4433375895023346, "learning_rate": 9.181089219235873e-07, "loss": 0.013344802893698215, "memory(GiB)": 22.66, "step": 25052, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.8138582984114608, "grad_norm": 0.2860531806945801, "learning_rate": 9.177987301914554e-07, "loss": 0.008586529642343521, "memory(GiB)": 22.66, "step": 25053, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.8138907838742163, "grad_norm": 0.26779407262802124, "learning_rate": 9.174885855733596e-07, "loss": 0.008055872283875942, "memory(GiB)": 22.66, "step": 25054, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.95559 }, { "epoch": 0.8139232693369717, "grad_norm": 0.3404765725135803, "learning_rate": 9.17178488072879e-07, "loss": 0.010960390791296959, "memory(GiB)": 22.66, "step": 25055, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955599 }, { "epoch": 0.8139557547997271, "grad_norm": 0.35848191380500793, "learning_rate": 9.168684376935949e-07, "loss": 0.009304849430918694, "memory(GiB)": 22.66, "step": 25056, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.8139882402624825, "grad_norm": 0.2921890914440155, "learning_rate": 9.165584344390821e-07, "loss": 0.009355654008686543, "memory(GiB)": 22.66, "step": 25057, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.814020725725238, "grad_norm": 0.2598349153995514, "learning_rate": 9.162484783129211e-07, "loss": 0.00922198686748743, "memory(GiB)": 22.66, "step": 25058, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.8140532111879933, "grad_norm": 0.26761093735694885, "learning_rate": 9.159385693186856e-07, "loss": 0.011445797979831696, "memory(GiB)": 22.66, "step": 25059, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.8140856966507488, "grad_norm": 0.3321506381034851, "learning_rate": 9.156287074599573e-07, "loss": 0.015840260311961174, "memory(GiB)": 22.66, "step": 25060, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.8141181821135042, "grad_norm": 0.43093177676200867, "learning_rate": 9.15318892740309e-07, "loss": 0.02117900922894478, "memory(GiB)": 22.66, "step": 25061, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.8141506675762596, "grad_norm": 0.29068252444267273, "learning_rate": 9.150091251633186e-07, "loss": 0.010472025722265244, "memory(GiB)": 22.66, "step": 25062, "token_acc": 1.0, "train_speed(iter/s)": 0.955656 }, { "epoch": 0.814183153039015, "grad_norm": 0.19412219524383545, "learning_rate": 9.146994047325586e-07, "loss": 0.007247909903526306, "memory(GiB)": 22.66, "step": 25063, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.8142156385017705, "grad_norm": 0.28970110416412354, "learning_rate": 9.143897314516054e-07, "loss": 0.008676780387759209, "memory(GiB)": 22.66, "step": 25064, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955673 }, { "epoch": 0.8142481239645258, "grad_norm": 0.27363407611846924, "learning_rate": 9.140801053240322e-07, "loss": 0.010995033197104931, "memory(GiB)": 22.66, "step": 25065, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.8142806094272813, "grad_norm": 0.2905559539794922, "learning_rate": 9.137705263534147e-07, "loss": 0.010272325947880745, "memory(GiB)": 22.66, "step": 25066, "token_acc": 1.0, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.8143130948900367, "grad_norm": 1.39208984375, "learning_rate": 9.134609945433221e-07, "loss": 0.01819821447134018, "memory(GiB)": 22.66, "step": 25067, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.8143455803527921, "grad_norm": 0.44551220536231995, "learning_rate": 9.131515098973293e-07, "loss": 0.015112509950995445, "memory(GiB)": 22.66, "step": 25068, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.8143780658155475, "grad_norm": 0.4226625859737396, "learning_rate": 9.128420724190073e-07, "loss": 0.00937545858323574, "memory(GiB)": 22.66, "step": 25069, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955706 }, { "epoch": 0.814410551278303, "grad_norm": 0.32659438252449036, "learning_rate": 9.125326821119284e-07, "loss": 0.013160117901861668, "memory(GiB)": 22.66, "step": 25070, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.8144430367410583, "grad_norm": 0.3326665759086609, "learning_rate": 9.122233389796637e-07, "loss": 0.011251069605350494, "memory(GiB)": 22.66, "step": 25071, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955718 }, { "epoch": 0.8144755222038138, "grad_norm": 0.39461830258369446, "learning_rate": 9.119140430257812e-07, "loss": 0.021448809653520584, "memory(GiB)": 22.66, "step": 25072, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.8145080076665692, "grad_norm": 0.3271854519844055, "learning_rate": 9.116047942538514e-07, "loss": 0.015595389530062675, "memory(GiB)": 22.66, "step": 25073, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.8145404931293246, "grad_norm": 0.38630184531211853, "learning_rate": 9.112955926674443e-07, "loss": 0.010669520124793053, "memory(GiB)": 22.66, "step": 25074, "token_acc": 1.0, "train_speed(iter/s)": 0.955737 }, { "epoch": 0.81457297859208, "grad_norm": 0.3609731197357178, "learning_rate": 9.109864382701295e-07, "loss": 0.011982526630163193, "memory(GiB)": 22.66, "step": 25075, "token_acc": 0.990506329113924, "train_speed(iter/s)": 0.955744 }, { "epoch": 0.8146054640548355, "grad_norm": 0.30951398611068726, "learning_rate": 9.106773310654726e-07, "loss": 0.01649383455514908, "memory(GiB)": 22.66, "step": 25076, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.8146379495175908, "grad_norm": 0.3448353111743927, "learning_rate": 9.103682710570416e-07, "loss": 0.013189258053898811, "memory(GiB)": 22.66, "step": 25077, "token_acc": 1.0, "train_speed(iter/s)": 0.955757 }, { "epoch": 0.8146704349803463, "grad_norm": 0.3806893527507782, "learning_rate": 9.100592582484042e-07, "loss": 0.015677068382501602, "memory(GiB)": 22.66, "step": 25078, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.955763 }, { "epoch": 0.8147029204431017, "grad_norm": 0.3109828531742096, "learning_rate": 9.097502926431285e-07, "loss": 0.013995024375617504, "memory(GiB)": 22.66, "step": 25079, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.95577 }, { "epoch": 0.8147354059058571, "grad_norm": 0.3184830844402313, "learning_rate": 9.094413742447766e-07, "loss": 0.017078036442399025, "memory(GiB)": 22.66, "step": 25080, "token_acc": 1.0, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.8147678913686125, "grad_norm": 0.28115758299827576, "learning_rate": 9.09132503056917e-07, "loss": 0.007026524282991886, "memory(GiB)": 22.66, "step": 25081, "token_acc": 1.0, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.814800376831368, "grad_norm": 0.3406660258769989, "learning_rate": 9.088236790831112e-07, "loss": 0.013458735309541225, "memory(GiB)": 22.66, "step": 25082, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955787 }, { "epoch": 0.8148328622941233, "grad_norm": 0.2554999887943268, "learning_rate": 9.085149023269263e-07, "loss": 0.008053367026150227, "memory(GiB)": 22.66, "step": 25083, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955792 }, { "epoch": 0.8148653477568788, "grad_norm": 0.47309112548828125, "learning_rate": 9.082061727919267e-07, "loss": 0.017714854329824448, "memory(GiB)": 22.66, "step": 25084, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.8148978332196342, "grad_norm": 0.3262196481227875, "learning_rate": 9.078974904816728e-07, "loss": 0.011593136936426163, "memory(GiB)": 22.66, "step": 25085, "token_acc": 1.0, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.8149303186823896, "grad_norm": 0.4554528594017029, "learning_rate": 9.075888553997297e-07, "loss": 0.014939667657017708, "memory(GiB)": 22.66, "step": 25086, "token_acc": 1.0, "train_speed(iter/s)": 0.955803 }, { "epoch": 0.814962804145145, "grad_norm": 0.3276295065879822, "learning_rate": 9.072802675496556e-07, "loss": 0.011240369640290737, "memory(GiB)": 22.66, "step": 25087, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.8149952896079005, "grad_norm": 0.3692989647388458, "learning_rate": 9.06971726935017e-07, "loss": 0.01845337636768818, "memory(GiB)": 22.66, "step": 25088, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.8150277750706559, "grad_norm": 0.35044535994529724, "learning_rate": 9.066632335593717e-07, "loss": 0.010834140703082085, "memory(GiB)": 22.66, "step": 25089, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.8150602605334113, "grad_norm": 0.280769944190979, "learning_rate": 9.063547874262818e-07, "loss": 0.009393930435180664, "memory(GiB)": 22.66, "step": 25090, "token_acc": 1.0, "train_speed(iter/s)": 0.955828 }, { "epoch": 0.8150927459961668, "grad_norm": 0.3839925527572632, "learning_rate": 9.06046388539305e-07, "loss": 0.016097376123070717, "memory(GiB)": 22.66, "step": 25091, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.8151252314589221, "grad_norm": 0.3351419270038605, "learning_rate": 9.057380369020019e-07, "loss": 0.015020433813333511, "memory(GiB)": 22.66, "step": 25092, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.8151577169216776, "grad_norm": 0.3382064998149872, "learning_rate": 9.054297325179312e-07, "loss": 0.013025389984250069, "memory(GiB)": 22.66, "step": 25093, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.815190202384433, "grad_norm": 0.2542189657688141, "learning_rate": 9.051214753906529e-07, "loss": 0.009292596951127052, "memory(GiB)": 22.66, "step": 25094, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.8152226878471884, "grad_norm": 0.2902665436267853, "learning_rate": 9.048132655237213e-07, "loss": 0.008968515321612358, "memory(GiB)": 22.66, "step": 25095, "token_acc": 1.0, "train_speed(iter/s)": 0.955859 }, { "epoch": 0.8152551733099438, "grad_norm": 0.392692893743515, "learning_rate": 9.045051029206959e-07, "loss": 0.016965068876743317, "memory(GiB)": 22.66, "step": 25096, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955865 }, { "epoch": 0.8152876587726993, "grad_norm": 0.2793715000152588, "learning_rate": 9.04196987585132e-07, "loss": 0.00897354818880558, "memory(GiB)": 22.66, "step": 25097, "token_acc": 1.0, "train_speed(iter/s)": 0.955872 }, { "epoch": 0.8153201442354546, "grad_norm": 0.4186773896217346, "learning_rate": 9.03888919520588e-07, "loss": 0.012661601416766644, "memory(GiB)": 22.66, "step": 25098, "token_acc": 1.0, "train_speed(iter/s)": 0.955877 }, { "epoch": 0.8153526296982101, "grad_norm": 0.5147639513015747, "learning_rate": 9.035808987306166e-07, "loss": 0.009859325364232063, "memory(GiB)": 22.66, "step": 25099, "token_acc": 1.0, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.8153851151609655, "grad_norm": 0.43187400698661804, "learning_rate": 9.032729252187738e-07, "loss": 0.016169976443052292, "memory(GiB)": 22.66, "step": 25100, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.8154176006237209, "grad_norm": 0.28915995359420776, "learning_rate": 9.029649989886141e-07, "loss": 0.010232217609882355, "memory(GiB)": 22.66, "step": 25101, "token_acc": 1.0, "train_speed(iter/s)": 0.955896 }, { "epoch": 0.8154500860864763, "grad_norm": 0.4621361196041107, "learning_rate": 9.026571200436918e-07, "loss": 0.011149479076266289, "memory(GiB)": 22.66, "step": 25102, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.8154825715492318, "grad_norm": 0.35585659742355347, "learning_rate": 9.023492883875607e-07, "loss": 0.013360559940338135, "memory(GiB)": 22.66, "step": 25103, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.8155150570119871, "grad_norm": 0.33385467529296875, "learning_rate": 9.020415040237718e-07, "loss": 0.013840259984135628, "memory(GiB)": 22.66, "step": 25104, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955916 }, { "epoch": 0.8155475424747426, "grad_norm": 0.35465896129608154, "learning_rate": 9.017337669558784e-07, "loss": 0.012040916830301285, "memory(GiB)": 22.66, "step": 25105, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.815580027937498, "grad_norm": 0.2759658098220825, "learning_rate": 9.014260771874317e-07, "loss": 0.008391829207539558, "memory(GiB)": 22.66, "step": 25106, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955932 }, { "epoch": 0.8156125134002534, "grad_norm": 0.3332121968269348, "learning_rate": 9.011184347219848e-07, "loss": 0.007846061140298843, "memory(GiB)": 22.66, "step": 25107, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.8156449988630088, "grad_norm": 0.3893060088157654, "learning_rate": 9.008108395630854e-07, "loss": 0.016397060826420784, "memory(GiB)": 22.66, "step": 25108, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.955949 }, { "epoch": 0.8156774843257643, "grad_norm": 0.25834494829177856, "learning_rate": 9.005032917142864e-07, "loss": 0.008629638701677322, "memory(GiB)": 22.66, "step": 25109, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955956 }, { "epoch": 0.8157099697885196, "grad_norm": 0.3032507300376892, "learning_rate": 9.001957911791331e-07, "loss": 0.009441078640520573, "memory(GiB)": 22.66, "step": 25110, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.8157424552512751, "grad_norm": 0.33075958490371704, "learning_rate": 8.998883379611801e-07, "loss": 0.013848580420017242, "memory(GiB)": 22.66, "step": 25111, "token_acc": 1.0, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.8157749407140304, "grad_norm": 0.3941446542739868, "learning_rate": 8.995809320639714e-07, "loss": 0.012627405114471912, "memory(GiB)": 22.66, "step": 25112, "token_acc": 1.0, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.8158074261767859, "grad_norm": 0.33573096990585327, "learning_rate": 8.992735734910574e-07, "loss": 0.012778270058333874, "memory(GiB)": 22.66, "step": 25113, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.8158399116395413, "grad_norm": 0.3520109951496124, "learning_rate": 8.989662622459832e-07, "loss": 0.008054524660110474, "memory(GiB)": 22.66, "step": 25114, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.8158723971022968, "grad_norm": 0.5197400450706482, "learning_rate": 8.986589983322958e-07, "loss": 0.016407102346420288, "memory(GiB)": 22.66, "step": 25115, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.8159048825650521, "grad_norm": 0.31878283619880676, "learning_rate": 8.983517817535448e-07, "loss": 0.009207669645547867, "memory(GiB)": 22.66, "step": 25116, "token_acc": 1.0, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.8159373680278076, "grad_norm": 0.370944619178772, "learning_rate": 8.980446125132725e-07, "loss": 0.009352343156933784, "memory(GiB)": 22.66, "step": 25117, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956021 }, { "epoch": 0.815969853490563, "grad_norm": 0.7505722045898438, "learning_rate": 8.97737490615026e-07, "loss": 0.01936756819486618, "memory(GiB)": 22.66, "step": 25118, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.8160023389533184, "grad_norm": 0.3545168340206146, "learning_rate": 8.974304160623476e-07, "loss": 0.014615317806601524, "memory(GiB)": 22.66, "step": 25119, "token_acc": 1.0, "train_speed(iter/s)": 0.956037 }, { "epoch": 0.8160348244160738, "grad_norm": 0.3776707649230957, "learning_rate": 8.971233888587833e-07, "loss": 0.015726862475275993, "memory(GiB)": 22.66, "step": 25120, "token_acc": 1.0, "train_speed(iter/s)": 0.956045 }, { "epoch": 0.8160673098788293, "grad_norm": 0.3138318657875061, "learning_rate": 8.968164090078757e-07, "loss": 0.014725012704730034, "memory(GiB)": 22.66, "step": 25121, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956053 }, { "epoch": 0.8160997953415846, "grad_norm": 0.21502795815467834, "learning_rate": 8.965094765131687e-07, "loss": 0.006768429186195135, "memory(GiB)": 22.66, "step": 25122, "token_acc": 1.0, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.8161322808043401, "grad_norm": 0.8392050862312317, "learning_rate": 8.962025913782036e-07, "loss": 0.014138448983430862, "memory(GiB)": 22.66, "step": 25123, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956068 }, { "epoch": 0.8161647662670954, "grad_norm": 0.32943621277809143, "learning_rate": 8.958957536065227e-07, "loss": 0.013761686161160469, "memory(GiB)": 22.66, "step": 25124, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.8161972517298509, "grad_norm": 0.3084351718425751, "learning_rate": 8.955889632016674e-07, "loss": 0.014911307021975517, "memory(GiB)": 22.66, "step": 25125, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.8162297371926063, "grad_norm": 0.2898857891559601, "learning_rate": 8.95282220167179e-07, "loss": 0.0089956633746624, "memory(GiB)": 22.66, "step": 25126, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956089 }, { "epoch": 0.8162622226553617, "grad_norm": 0.36869633197784424, "learning_rate": 8.949755245065966e-07, "loss": 0.00863713026046753, "memory(GiB)": 22.66, "step": 25127, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956096 }, { "epoch": 0.8162947081181171, "grad_norm": 0.3523564636707306, "learning_rate": 8.9466887622346e-07, "loss": 0.01144685409963131, "memory(GiB)": 22.66, "step": 25128, "token_acc": 1.0, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.8163271935808726, "grad_norm": 0.38794204592704773, "learning_rate": 8.943622753213088e-07, "loss": 0.012805622071027756, "memory(GiB)": 22.66, "step": 25129, "token_acc": 1.0, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.8163596790436279, "grad_norm": 0.20738184452056885, "learning_rate": 8.940557218036816e-07, "loss": 0.005063402000814676, "memory(GiB)": 22.66, "step": 25130, "token_acc": 1.0, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.8163921645063834, "grad_norm": 0.4576207399368286, "learning_rate": 8.937492156741179e-07, "loss": 0.014901579357683659, "memory(GiB)": 22.66, "step": 25131, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.8164246499691388, "grad_norm": 0.29329830408096313, "learning_rate": 8.93442756936152e-07, "loss": 0.013383902609348297, "memory(GiB)": 22.66, "step": 25132, "token_acc": 1.0, "train_speed(iter/s)": 0.956129 }, { "epoch": 0.8164571354318942, "grad_norm": 0.38203245401382446, "learning_rate": 8.931363455933229e-07, "loss": 0.0112532377243042, "memory(GiB)": 22.66, "step": 25133, "token_acc": 1.0, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.8164896208946496, "grad_norm": 0.4134860336780548, "learning_rate": 8.928299816491664e-07, "loss": 0.012989334762096405, "memory(GiB)": 22.66, "step": 25134, "token_acc": 1.0, "train_speed(iter/s)": 0.956142 }, { "epoch": 0.8165221063574051, "grad_norm": 0.26757392287254333, "learning_rate": 8.925236651072194e-07, "loss": 0.0076445904560387135, "memory(GiB)": 22.66, "step": 25135, "token_acc": 1.0, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.8165545918201604, "grad_norm": 0.2142491489648819, "learning_rate": 8.922173959710156e-07, "loss": 0.011114997789263725, "memory(GiB)": 22.66, "step": 25136, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.8165870772829159, "grad_norm": 0.1954253762960434, "learning_rate": 8.919111742440917e-07, "loss": 0.006711141671985388, "memory(GiB)": 22.66, "step": 25137, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.956161 }, { "epoch": 0.8166195627456713, "grad_norm": 0.29092469811439514, "learning_rate": 8.916049999299781e-07, "loss": 0.0101426150649786, "memory(GiB)": 22.66, "step": 25138, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.8166520482084267, "grad_norm": 0.3358556628227234, "learning_rate": 8.912988730322136e-07, "loss": 0.009502585977315903, "memory(GiB)": 22.66, "step": 25139, "token_acc": 1.0, "train_speed(iter/s)": 0.956174 }, { "epoch": 0.8166845336711821, "grad_norm": 0.2766912281513214, "learning_rate": 8.909927935543272e-07, "loss": 0.010796122252941132, "memory(GiB)": 22.66, "step": 25140, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.8167170191339376, "grad_norm": 0.3052116334438324, "learning_rate": 8.906867614998549e-07, "loss": 0.010221054777503014, "memory(GiB)": 22.66, "step": 25141, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.8167495045966929, "grad_norm": 0.4502517879009247, "learning_rate": 8.903807768723249e-07, "loss": 0.015778787434101105, "memory(GiB)": 22.66, "step": 25142, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.8167819900594484, "grad_norm": 0.45122891664505005, "learning_rate": 8.900748396752712e-07, "loss": 0.009150737896561623, "memory(GiB)": 22.66, "step": 25143, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.9562 }, { "epoch": 0.8168144755222038, "grad_norm": 0.5337573885917664, "learning_rate": 8.897689499122242e-07, "loss": 0.019888319075107574, "memory(GiB)": 22.66, "step": 25144, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.8168469609849592, "grad_norm": 1.4813008308410645, "learning_rate": 8.894631075867138e-07, "loss": 0.011719981208443642, "memory(GiB)": 22.66, "step": 25145, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956211 }, { "epoch": 0.8168794464477146, "grad_norm": 0.42325910925865173, "learning_rate": 8.891573127022718e-07, "loss": 0.014909179881215096, "memory(GiB)": 22.66, "step": 25146, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956217 }, { "epoch": 0.8169119319104701, "grad_norm": 0.45753878355026245, "learning_rate": 8.888515652624247e-07, "loss": 0.013196694664657116, "memory(GiB)": 22.66, "step": 25147, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956223 }, { "epoch": 0.8169444173732254, "grad_norm": 0.22512418031692505, "learning_rate": 8.885458652707019e-07, "loss": 0.011449504643678665, "memory(GiB)": 22.66, "step": 25148, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956229 }, { "epoch": 0.8169769028359809, "grad_norm": 0.3132724165916443, "learning_rate": 8.882402127306327e-07, "loss": 0.009764844551682472, "memory(GiB)": 22.66, "step": 25149, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.956235 }, { "epoch": 0.8170093882987363, "grad_norm": 0.35407257080078125, "learning_rate": 8.879346076457452e-07, "loss": 0.011052154004573822, "memory(GiB)": 22.66, "step": 25150, "token_acc": 1.0, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.8170418737614917, "grad_norm": 0.2976963222026825, "learning_rate": 8.876290500195645e-07, "loss": 0.011658472940325737, "memory(GiB)": 22.66, "step": 25151, "token_acc": 1.0, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.8170743592242472, "grad_norm": 0.3419683277606964, "learning_rate": 8.87323539855618e-07, "loss": 0.011327528394758701, "memory(GiB)": 22.66, "step": 25152, "token_acc": 1.0, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.8171068446870026, "grad_norm": 0.282375693321228, "learning_rate": 8.870180771574316e-07, "loss": 0.01077251136302948, "memory(GiB)": 22.66, "step": 25153, "token_acc": 1.0, "train_speed(iter/s)": 0.956259 }, { "epoch": 0.817139330149758, "grad_norm": 0.2317168414592743, "learning_rate": 8.867126619285327e-07, "loss": 0.007999615743756294, "memory(GiB)": 22.66, "step": 25154, "token_acc": 1.0, "train_speed(iter/s)": 0.956266 }, { "epoch": 0.8171718156125134, "grad_norm": 0.600105345249176, "learning_rate": 8.864072941724428e-07, "loss": 0.01919727772474289, "memory(GiB)": 22.66, "step": 25155, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.8172043010752689, "grad_norm": 0.49688106775283813, "learning_rate": 8.86101973892688e-07, "loss": 0.012947136536240578, "memory(GiB)": 22.66, "step": 25156, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956277 }, { "epoch": 0.8172367865380242, "grad_norm": 0.35677284002304077, "learning_rate": 8.85796701092792e-07, "loss": 0.008263146504759789, "memory(GiB)": 22.66, "step": 25157, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.8172692720007797, "grad_norm": 0.3895145654678345, "learning_rate": 8.854914757762789e-07, "loss": 0.0124262860044837, "memory(GiB)": 22.66, "step": 25158, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.8173017574635351, "grad_norm": 0.34148240089416504, "learning_rate": 8.851862979466696e-07, "loss": 0.012959391809999943, "memory(GiB)": 22.66, "step": 25159, "token_acc": 0.995, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.8173342429262905, "grad_norm": 0.3369855582714081, "learning_rate": 8.848811676074875e-07, "loss": 0.015215637162327766, "memory(GiB)": 22.66, "step": 25160, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.8173667283890459, "grad_norm": 0.3679085969924927, "learning_rate": 8.845760847622537e-07, "loss": 0.012164373882114887, "memory(GiB)": 22.66, "step": 25161, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.956308 }, { "epoch": 0.8173992138518014, "grad_norm": 0.3255288302898407, "learning_rate": 8.842710494144896e-07, "loss": 0.009510970674455166, "memory(GiB)": 22.66, "step": 25162, "token_acc": 1.0, "train_speed(iter/s)": 0.956313 }, { "epoch": 0.8174316993145567, "grad_norm": 0.49785923957824707, "learning_rate": 8.83966061567717e-07, "loss": 0.01767697185277939, "memory(GiB)": 22.66, "step": 25163, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.8174641847773122, "grad_norm": 0.38217878341674805, "learning_rate": 8.836611212254531e-07, "loss": 0.010082220658659935, "memory(GiB)": 22.66, "step": 25164, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956325 }, { "epoch": 0.8174966702400676, "grad_norm": 0.33884602785110474, "learning_rate": 8.833562283912201e-07, "loss": 0.009160371497273445, "memory(GiB)": 22.66, "step": 25165, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956331 }, { "epoch": 0.817529155702823, "grad_norm": 0.2632478177547455, "learning_rate": 8.830513830685328e-07, "loss": 0.010758616030216217, "memory(GiB)": 22.66, "step": 25166, "token_acc": 1.0, "train_speed(iter/s)": 0.956338 }, { "epoch": 0.8175616411655784, "grad_norm": 0.42615026235580444, "learning_rate": 8.827465852609146e-07, "loss": 0.013714870437979698, "memory(GiB)": 22.66, "step": 25167, "token_acc": 1.0, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.8175941266283339, "grad_norm": 0.36065617203712463, "learning_rate": 8.824418349718794e-07, "loss": 0.010769743472337723, "memory(GiB)": 22.66, "step": 25168, "token_acc": 0.984, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.8176266120910892, "grad_norm": 0.30944374203681946, "learning_rate": 8.821371322049477e-07, "loss": 0.008858519606292248, "memory(GiB)": 22.66, "step": 25169, "token_acc": 1.0, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.8176590975538447, "grad_norm": 0.41563349962234497, "learning_rate": 8.818324769636327e-07, "loss": 0.011751421727240086, "memory(GiB)": 22.66, "step": 25170, "token_acc": 1.0, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.8176915830166, "grad_norm": 0.3906867802143097, "learning_rate": 8.815278692514522e-07, "loss": 0.012930842116475105, "memory(GiB)": 22.66, "step": 25171, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.956377 }, { "epoch": 0.8177240684793555, "grad_norm": 0.3256487548351288, "learning_rate": 8.812233090719219e-07, "loss": 0.010449958965182304, "memory(GiB)": 22.66, "step": 25172, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.8177565539421109, "grad_norm": 0.3508266806602478, "learning_rate": 8.809187964285576e-07, "loss": 0.010686926543712616, "memory(GiB)": 22.66, "step": 25173, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956393 }, { "epoch": 0.8177890394048664, "grad_norm": 0.4272962212562561, "learning_rate": 8.806143313248722e-07, "loss": 0.016643675044178963, "memory(GiB)": 22.66, "step": 25174, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.8178215248676217, "grad_norm": 0.5495294332504272, "learning_rate": 8.8030991376438e-07, "loss": 0.013583955354988575, "memory(GiB)": 22.66, "step": 25175, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.8178540103303772, "grad_norm": 0.32611432671546936, "learning_rate": 8.80005543750595e-07, "loss": 0.009634234942495823, "memory(GiB)": 22.66, "step": 25176, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.8178864957931326, "grad_norm": 0.26786255836486816, "learning_rate": 8.797012212870293e-07, "loss": 0.01257643848657608, "memory(GiB)": 22.66, "step": 25177, "token_acc": 1.0, "train_speed(iter/s)": 0.956425 }, { "epoch": 0.817918981255888, "grad_norm": 0.24066194891929626, "learning_rate": 8.793969463771967e-07, "loss": 0.008688625879585743, "memory(GiB)": 22.66, "step": 25178, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956433 }, { "epoch": 0.8179514667186434, "grad_norm": 0.5425986647605896, "learning_rate": 8.790927190246074e-07, "loss": 0.01722550578415394, "memory(GiB)": 22.66, "step": 25179, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.8179839521813989, "grad_norm": 0.30618563294410706, "learning_rate": 8.787885392327722e-07, "loss": 0.014726654626429081, "memory(GiB)": 22.66, "step": 25180, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.8180164376441542, "grad_norm": 0.2418271005153656, "learning_rate": 8.784844070052029e-07, "loss": 0.007955914363265038, "memory(GiB)": 22.66, "step": 25181, "token_acc": 1.0, "train_speed(iter/s)": 0.956458 }, { "epoch": 0.8180489231069097, "grad_norm": 0.3578474819660187, "learning_rate": 8.781803223454105e-07, "loss": 0.009673798456788063, "memory(GiB)": 22.66, "step": 25182, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.818081408569665, "grad_norm": 0.406838983297348, "learning_rate": 8.778762852569023e-07, "loss": 0.016156665980815887, "memory(GiB)": 22.66, "step": 25183, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.8181138940324205, "grad_norm": 0.3625636100769043, "learning_rate": 8.775722957431881e-07, "loss": 0.016842901706695557, "memory(GiB)": 22.66, "step": 25184, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.8181463794951759, "grad_norm": 0.33093976974487305, "learning_rate": 8.772683538077764e-07, "loss": 0.010676991194486618, "memory(GiB)": 22.66, "step": 25185, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956487 }, { "epoch": 0.8181788649579314, "grad_norm": 0.3313831090927124, "learning_rate": 8.769644594541765e-07, "loss": 0.010038953274488449, "memory(GiB)": 22.66, "step": 25186, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.8182113504206867, "grad_norm": 0.26670849323272705, "learning_rate": 8.766606126858935e-07, "loss": 0.009359311312437057, "memory(GiB)": 22.66, "step": 25187, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.8182438358834422, "grad_norm": 0.2748570144176483, "learning_rate": 8.763568135064366e-07, "loss": 0.012558317743241787, "memory(GiB)": 22.66, "step": 25188, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.8182763213461975, "grad_norm": 0.30349084734916687, "learning_rate": 8.760530619193075e-07, "loss": 0.015465800650417805, "memory(GiB)": 22.66, "step": 25189, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.818308806808953, "grad_norm": 0.473060667514801, "learning_rate": 8.757493579280179e-07, "loss": 0.013873156160116196, "memory(GiB)": 22.66, "step": 25190, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.8183412922717084, "grad_norm": 0.32593998312950134, "learning_rate": 8.75445701536069e-07, "loss": 0.011572152376174927, "memory(GiB)": 22.66, "step": 25191, "token_acc": 0.984375, "train_speed(iter/s)": 0.956527 }, { "epoch": 0.8183737777344638, "grad_norm": 0.4187873601913452, "learning_rate": 8.751420927469662e-07, "loss": 0.014094751328229904, "memory(GiB)": 22.66, "step": 25192, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956533 }, { "epoch": 0.8184062631972192, "grad_norm": 0.35414114594459534, "learning_rate": 8.74838531564215e-07, "loss": 0.01174173504114151, "memory(GiB)": 22.66, "step": 25193, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956539 }, { "epoch": 0.8184387486599747, "grad_norm": 0.2350732684135437, "learning_rate": 8.745350179913154e-07, "loss": 0.005923515651375055, "memory(GiB)": 22.66, "step": 25194, "token_acc": 1.0, "train_speed(iter/s)": 0.956545 }, { "epoch": 0.81847123412273, "grad_norm": 0.5227820873260498, "learning_rate": 8.742315520317751e-07, "loss": 0.017849253490567207, "memory(GiB)": 22.66, "step": 25195, "token_acc": 0.984375, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.8185037195854855, "grad_norm": 0.6106839776039124, "learning_rate": 8.739281336890925e-07, "loss": 0.010296173393726349, "memory(GiB)": 22.66, "step": 25196, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.8185362050482409, "grad_norm": 0.41994500160217285, "learning_rate": 8.736247629667733e-07, "loss": 0.015580445528030396, "memory(GiB)": 22.66, "step": 25197, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.8185686905109963, "grad_norm": 0.32665306329727173, "learning_rate": 8.733214398683144e-07, "loss": 0.014439224265515804, "memory(GiB)": 22.66, "step": 25198, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95657 }, { "epoch": 0.8186011759737517, "grad_norm": 0.2845149040222168, "learning_rate": 8.730181643972185e-07, "loss": 0.011921216733753681, "memory(GiB)": 22.66, "step": 25199, "token_acc": 1.0, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.8186336614365072, "grad_norm": 0.3607860505580902, "learning_rate": 8.727149365569865e-07, "loss": 0.014326095581054688, "memory(GiB)": 22.66, "step": 25200, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.8186661468992625, "grad_norm": 0.3305685520172119, "learning_rate": 8.724117563511186e-07, "loss": 0.015309510752558708, "memory(GiB)": 22.66, "step": 25201, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.956589 }, { "epoch": 0.818698632362018, "grad_norm": 0.30845773220062256, "learning_rate": 8.721086237831116e-07, "loss": 0.010112602263689041, "memory(GiB)": 22.66, "step": 25202, "token_acc": 1.0, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.8187311178247734, "grad_norm": 0.3245159089565277, "learning_rate": 8.718055388564655e-07, "loss": 0.011093981564044952, "memory(GiB)": 22.66, "step": 25203, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.8187636032875288, "grad_norm": 0.2686832845211029, "learning_rate": 8.715025015746781e-07, "loss": 0.009395287372171879, "memory(GiB)": 22.66, "step": 25204, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.8187960887502842, "grad_norm": 0.29174742102622986, "learning_rate": 8.71199511941248e-07, "loss": 0.009739670902490616, "memory(GiB)": 22.66, "step": 25205, "token_acc": 1.0, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.8188285742130397, "grad_norm": 0.29031604528427124, "learning_rate": 8.708965699596694e-07, "loss": 0.014213486574590206, "memory(GiB)": 22.66, "step": 25206, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.818861059675795, "grad_norm": 0.46957454085350037, "learning_rate": 8.705936756334405e-07, "loss": 0.01972762867808342, "memory(GiB)": 22.66, "step": 25207, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.8188935451385505, "grad_norm": 0.44555944204330444, "learning_rate": 8.702908289660567e-07, "loss": 0.017799358814954758, "memory(GiB)": 22.66, "step": 25208, "token_acc": 1.0, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.8189260306013059, "grad_norm": 0.3919121026992798, "learning_rate": 8.699880299610136e-07, "loss": 0.00971543975174427, "memory(GiB)": 22.66, "step": 25209, "token_acc": 1.0, "train_speed(iter/s)": 0.956641 }, { "epoch": 0.8189585160640613, "grad_norm": 0.46178996562957764, "learning_rate": 8.696852786218069e-07, "loss": 0.01635553501546383, "memory(GiB)": 22.66, "step": 25210, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956647 }, { "epoch": 0.8189910015268167, "grad_norm": 0.32160353660583496, "learning_rate": 8.69382574951928e-07, "loss": 0.01053115725517273, "memory(GiB)": 22.66, "step": 25211, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.956654 }, { "epoch": 0.8190234869895722, "grad_norm": 0.2927512526512146, "learning_rate": 8.690799189548721e-07, "loss": 0.009415303356945515, "memory(GiB)": 22.66, "step": 25212, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.95666 }, { "epoch": 0.8190559724523275, "grad_norm": 0.45291128754615784, "learning_rate": 8.687773106341319e-07, "loss": 0.01308436319231987, "memory(GiB)": 22.66, "step": 25213, "token_acc": 1.0, "train_speed(iter/s)": 0.956666 }, { "epoch": 0.819088457915083, "grad_norm": 0.30930691957473755, "learning_rate": 8.684747499932017e-07, "loss": 0.010670574381947517, "memory(GiB)": 22.66, "step": 25214, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.8191209433778384, "grad_norm": 0.33719730377197266, "learning_rate": 8.68172237035571e-07, "loss": 0.016074955463409424, "memory(GiB)": 22.66, "step": 25215, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.956678 }, { "epoch": 0.8191534288405938, "grad_norm": 0.42806440591812134, "learning_rate": 8.678697717647333e-07, "loss": 0.015455019660294056, "memory(GiB)": 22.66, "step": 25216, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956683 }, { "epoch": 0.8191859143033493, "grad_norm": 0.2628585696220398, "learning_rate": 8.675673541841756e-07, "loss": 0.01410639751702547, "memory(GiB)": 22.66, "step": 25217, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956689 }, { "epoch": 0.8192183997661047, "grad_norm": 0.36453747749328613, "learning_rate": 8.672649842973929e-07, "loss": 0.016998639330267906, "memory(GiB)": 22.66, "step": 25218, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.956695 }, { "epoch": 0.8192508852288601, "grad_norm": 0.25594663619995117, "learning_rate": 8.669626621078725e-07, "loss": 0.010790408588945866, "memory(GiB)": 22.66, "step": 25219, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956701 }, { "epoch": 0.8192833706916155, "grad_norm": 0.34856122732162476, "learning_rate": 8.666603876191048e-07, "loss": 0.00845001358538866, "memory(GiB)": 22.66, "step": 25220, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956707 }, { "epoch": 0.819315856154371, "grad_norm": 0.5404523015022278, "learning_rate": 8.663581608345767e-07, "loss": 0.012083170004189014, "memory(GiB)": 22.66, "step": 25221, "token_acc": 1.0, "train_speed(iter/s)": 0.956714 }, { "epoch": 0.8193483416171263, "grad_norm": 0.4255978465080261, "learning_rate": 8.660559817577757e-07, "loss": 0.015246832743287086, "memory(GiB)": 22.66, "step": 25222, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95672 }, { "epoch": 0.8193808270798818, "grad_norm": 0.3093792796134949, "learning_rate": 8.657538503921942e-07, "loss": 0.009698107838630676, "memory(GiB)": 22.66, "step": 25223, "token_acc": 0.9965753424657534, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.8194133125426372, "grad_norm": 0.3828398883342743, "learning_rate": 8.654517667413137e-07, "loss": 0.010977592319250107, "memory(GiB)": 22.66, "step": 25224, "token_acc": 1.0, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.8194457980053926, "grad_norm": 2.0093588829040527, "learning_rate": 8.651497308086248e-07, "loss": 0.01038048043847084, "memory(GiB)": 22.66, "step": 25225, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956738 }, { "epoch": 0.819478283468148, "grad_norm": 0.40343302488327026, "learning_rate": 8.648477425976088e-07, "loss": 0.015572570264339447, "memory(GiB)": 22.66, "step": 25226, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.8195107689309035, "grad_norm": 0.25222691893577576, "learning_rate": 8.645458021117564e-07, "loss": 0.009611362591385841, "memory(GiB)": 22.66, "step": 25227, "token_acc": 1.0, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.8195432543936588, "grad_norm": 0.339121013879776, "learning_rate": 8.642439093545479e-07, "loss": 0.014526590704917908, "memory(GiB)": 22.66, "step": 25228, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956756 }, { "epoch": 0.8195757398564143, "grad_norm": 0.30793297290802, "learning_rate": 8.639420643294711e-07, "loss": 0.011736834421753883, "memory(GiB)": 22.66, "step": 25229, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956763 }, { "epoch": 0.8196082253191697, "grad_norm": 0.5826669931411743, "learning_rate": 8.636402670400068e-07, "loss": 0.01107008196413517, "memory(GiB)": 22.66, "step": 25230, "token_acc": 1.0, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.8196407107819251, "grad_norm": 0.39103177189826965, "learning_rate": 8.633385174896392e-07, "loss": 0.009550472721457481, "memory(GiB)": 22.66, "step": 25231, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.8196731962446805, "grad_norm": 0.3547983467578888, "learning_rate": 8.630368156818508e-07, "loss": 0.0146673284471035, "memory(GiB)": 22.66, "step": 25232, "token_acc": 1.0, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.819705681707436, "grad_norm": 0.3749186098575592, "learning_rate": 8.627351616201252e-07, "loss": 0.009255555458366871, "memory(GiB)": 22.66, "step": 25233, "token_acc": 1.0, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.8197381671701913, "grad_norm": 0.45706209540367126, "learning_rate": 8.62433555307941e-07, "loss": 0.01473420299589634, "memory(GiB)": 22.66, "step": 25234, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.8197706526329468, "grad_norm": 0.43806055188179016, "learning_rate": 8.621319967487807e-07, "loss": 0.011945326812565327, "memory(GiB)": 22.66, "step": 25235, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.8198031380957022, "grad_norm": 0.38676372170448303, "learning_rate": 8.618304859461252e-07, "loss": 0.017325378954410553, "memory(GiB)": 22.66, "step": 25236, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.8198356235584576, "grad_norm": 0.26065921783447266, "learning_rate": 8.615290229034545e-07, "loss": 0.006343844346702099, "memory(GiB)": 22.66, "step": 25237, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956827 }, { "epoch": 0.819868109021213, "grad_norm": 0.2719683349132538, "learning_rate": 8.612276076242459e-07, "loss": 0.008585469797253609, "memory(GiB)": 22.66, "step": 25238, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.8199005944839685, "grad_norm": 0.3404221534729004, "learning_rate": 8.6092624011198e-07, "loss": 0.009133439511060715, "memory(GiB)": 22.66, "step": 25239, "token_acc": 1.0, "train_speed(iter/s)": 0.956843 }, { "epoch": 0.8199330799467238, "grad_norm": 0.37507393956184387, "learning_rate": 8.606249203701339e-07, "loss": 0.006304123438894749, "memory(GiB)": 22.66, "step": 25240, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.8199655654094793, "grad_norm": 0.25042256712913513, "learning_rate": 8.603236484021859e-07, "loss": 0.009087643586099148, "memory(GiB)": 22.66, "step": 25241, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.8199980508722347, "grad_norm": 0.34289678931236267, "learning_rate": 8.60022424211614e-07, "loss": 0.008247184567153454, "memory(GiB)": 22.66, "step": 25242, "token_acc": 1.0, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.8200305363349901, "grad_norm": 0.3290112018585205, "learning_rate": 8.597212478018923e-07, "loss": 0.01573106274008751, "memory(GiB)": 22.66, "step": 25243, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.8200630217977455, "grad_norm": 0.3361762762069702, "learning_rate": 8.594201191764984e-07, "loss": 0.00852169282734394, "memory(GiB)": 22.66, "step": 25244, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956882 }, { "epoch": 0.820095507260501, "grad_norm": 0.34054267406463623, "learning_rate": 8.59119038338907e-07, "loss": 0.013172326609492302, "memory(GiB)": 22.66, "step": 25245, "token_acc": 1.0, "train_speed(iter/s)": 0.956889 }, { "epoch": 0.8201279927232563, "grad_norm": 0.5781221985816956, "learning_rate": 8.588180052925949e-07, "loss": 0.014249429106712341, "memory(GiB)": 22.66, "step": 25246, "token_acc": 1.0, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.8201604781860118, "grad_norm": 0.39990127086639404, "learning_rate": 8.585170200410337e-07, "loss": 0.015729691833257675, "memory(GiB)": 22.66, "step": 25247, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956902 }, { "epoch": 0.8201929636487671, "grad_norm": 0.26914530992507935, "learning_rate": 8.582160825876995e-07, "loss": 0.009359557181596756, "memory(GiB)": 22.66, "step": 25248, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.8202254491115226, "grad_norm": 0.36793985962867737, "learning_rate": 8.579151929360624e-07, "loss": 0.014921430498361588, "memory(GiB)": 22.66, "step": 25249, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956914 }, { "epoch": 0.820257934574278, "grad_norm": 0.3239794969558716, "learning_rate": 8.57614351089599e-07, "loss": 0.01007846649736166, "memory(GiB)": 22.66, "step": 25250, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956921 }, { "epoch": 0.8202904200370335, "grad_norm": 0.35256505012512207, "learning_rate": 8.573135570517783e-07, "loss": 0.013233150355517864, "memory(GiB)": 22.66, "step": 25251, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956927 }, { "epoch": 0.8203229054997888, "grad_norm": 0.3561997711658478, "learning_rate": 8.570128108260745e-07, "loss": 0.008601158857345581, "memory(GiB)": 22.66, "step": 25252, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.8203553909625443, "grad_norm": 0.35020992159843445, "learning_rate": 8.567121124159561e-07, "loss": 0.007908601313829422, "memory(GiB)": 22.66, "step": 25253, "token_acc": 1.0, "train_speed(iter/s)": 0.956939 }, { "epoch": 0.8203878764252996, "grad_norm": 0.3077496588230133, "learning_rate": 8.564114618248931e-07, "loss": 0.013247782364487648, "memory(GiB)": 22.66, "step": 25254, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.8204203618880551, "grad_norm": 0.3192519545555115, "learning_rate": 8.561108590563594e-07, "loss": 0.008426635526120663, "memory(GiB)": 22.66, "step": 25255, "token_acc": 1.0, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.8204528473508105, "grad_norm": 0.23029695451259613, "learning_rate": 8.558103041138205e-07, "loss": 0.00825912319123745, "memory(GiB)": 22.66, "step": 25256, "token_acc": 1.0, "train_speed(iter/s)": 0.956959 }, { "epoch": 0.820485332813566, "grad_norm": 0.37900248169898987, "learning_rate": 8.555097970007481e-07, "loss": 0.01227032020688057, "memory(GiB)": 22.66, "step": 25257, "token_acc": 1.0, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.8205178182763213, "grad_norm": 0.3412496745586395, "learning_rate": 8.552093377206077e-07, "loss": 0.013178830966353416, "memory(GiB)": 22.66, "step": 25258, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.8205503037390768, "grad_norm": 0.34179797768592834, "learning_rate": 8.549089262768678e-07, "loss": 0.012084830552339554, "memory(GiB)": 22.66, "step": 25259, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956979 }, { "epoch": 0.8205827892018321, "grad_norm": 0.29054126143455505, "learning_rate": 8.546085626729967e-07, "loss": 0.007983608171343803, "memory(GiB)": 22.66, "step": 25260, "token_acc": 0.9932432432432432, "train_speed(iter/s)": 0.956985 }, { "epoch": 0.8206152746645876, "grad_norm": 0.41068384051322937, "learning_rate": 8.543082469124613e-07, "loss": 0.018232736736536026, "memory(GiB)": 22.66, "step": 25261, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.820647760127343, "grad_norm": 0.32231616973876953, "learning_rate": 8.540079789987255e-07, "loss": 0.013168904930353165, "memory(GiB)": 22.66, "step": 25262, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.8206802455900984, "grad_norm": 1.025733470916748, "learning_rate": 8.537077589352555e-07, "loss": 0.010574063286185265, "memory(GiB)": 22.66, "step": 25263, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.8207127310528538, "grad_norm": 0.3770994246006012, "learning_rate": 8.53407586725517e-07, "loss": 0.012817004695534706, "memory(GiB)": 22.66, "step": 25264, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.957012 }, { "epoch": 0.8207452165156093, "grad_norm": 0.4462122917175293, "learning_rate": 8.531074623729752e-07, "loss": 0.00927422009408474, "memory(GiB)": 22.66, "step": 25265, "token_acc": 1.0, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.8207777019783646, "grad_norm": 0.4613044559955597, "learning_rate": 8.528073858810914e-07, "loss": 0.0207945816218853, "memory(GiB)": 22.66, "step": 25266, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957029 }, { "epoch": 0.8208101874411201, "grad_norm": 0.2999730110168457, "learning_rate": 8.525073572533304e-07, "loss": 0.010848680511116982, "memory(GiB)": 22.66, "step": 25267, "token_acc": 1.0, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.8208426729038755, "grad_norm": 0.41448891162872314, "learning_rate": 8.522073764931549e-07, "loss": 0.01200881041586399, "memory(GiB)": 22.66, "step": 25268, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957043 }, { "epoch": 0.8208751583666309, "grad_norm": 0.4176025688648224, "learning_rate": 8.51907443604027e-07, "loss": 0.007958081550896168, "memory(GiB)": 22.66, "step": 25269, "token_acc": 1.0, "train_speed(iter/s)": 0.957048 }, { "epoch": 0.8209076438293863, "grad_norm": 0.4310806095600128, "learning_rate": 8.516075585894091e-07, "loss": 0.018350601196289062, "memory(GiB)": 22.66, "step": 25270, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957054 }, { "epoch": 0.8209401292921418, "grad_norm": 0.34437310695648193, "learning_rate": 8.513077214527605e-07, "loss": 0.010951196774840355, "memory(GiB)": 22.66, "step": 25271, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957061 }, { "epoch": 0.8209726147548971, "grad_norm": 0.694951593875885, "learning_rate": 8.510079321975429e-07, "loss": 0.01917221210896969, "memory(GiB)": 22.66, "step": 25272, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.957067 }, { "epoch": 0.8210051002176526, "grad_norm": 0.26793161034584045, "learning_rate": 8.507081908272158e-07, "loss": 0.008110194467008114, "memory(GiB)": 22.66, "step": 25273, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.821037585680408, "grad_norm": 0.3541888892650604, "learning_rate": 8.504084973452403e-07, "loss": 0.01095580868422985, "memory(GiB)": 22.66, "step": 25274, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.8210700711431634, "grad_norm": 0.39617881178855896, "learning_rate": 8.501088517550726e-07, "loss": 0.013131489977240562, "memory(GiB)": 22.66, "step": 25275, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.8211025566059188, "grad_norm": 0.364949107170105, "learning_rate": 8.498092540601732e-07, "loss": 0.011741168797016144, "memory(GiB)": 22.66, "step": 25276, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.8211350420686743, "grad_norm": 0.3689870238304138, "learning_rate": 8.495097042639966e-07, "loss": 0.011464590206742287, "memory(GiB)": 22.66, "step": 25277, "token_acc": 1.0, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.8211675275314296, "grad_norm": 0.4084530770778656, "learning_rate": 8.49210202370005e-07, "loss": 0.011793872341513634, "memory(GiB)": 22.66, "step": 25278, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957103 }, { "epoch": 0.8212000129941851, "grad_norm": 0.26686736941337585, "learning_rate": 8.489107483816506e-07, "loss": 0.0074973078444600105, "memory(GiB)": 22.66, "step": 25279, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.8212324984569406, "grad_norm": 0.33034542202949524, "learning_rate": 8.486113423023929e-07, "loss": 0.011333947069942951, "memory(GiB)": 22.66, "step": 25280, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957116 }, { "epoch": 0.8212649839196959, "grad_norm": 0.438251256942749, "learning_rate": 8.483119841356846e-07, "loss": 0.01605033688247204, "memory(GiB)": 22.66, "step": 25281, "token_acc": 1.0, "train_speed(iter/s)": 0.957121 }, { "epoch": 0.8212974693824514, "grad_norm": 0.31505945324897766, "learning_rate": 8.480126738849814e-07, "loss": 0.011041487567126751, "memory(GiB)": 22.66, "step": 25282, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.8213299548452068, "grad_norm": 0.41759201884269714, "learning_rate": 8.477134115537389e-07, "loss": 0.014512777328491211, "memory(GiB)": 22.66, "step": 25283, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.8213624403079622, "grad_norm": 0.3848998248577118, "learning_rate": 8.474141971454097e-07, "loss": 0.011167151853442192, "memory(GiB)": 22.66, "step": 25284, "token_acc": 1.0, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.8213949257707176, "grad_norm": 0.2821544408798218, "learning_rate": 8.471150306634496e-07, "loss": 0.007804764434695244, "memory(GiB)": 22.66, "step": 25285, "token_acc": 1.0, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.8214274112334731, "grad_norm": 0.3090348541736603, "learning_rate": 8.468159121113079e-07, "loss": 0.016340913251042366, "memory(GiB)": 22.66, "step": 25286, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957152 }, { "epoch": 0.8214598966962284, "grad_norm": 0.30623653531074524, "learning_rate": 8.465168414924386e-07, "loss": 0.007489484269171953, "memory(GiB)": 22.66, "step": 25287, "token_acc": 1.0, "train_speed(iter/s)": 0.957157 }, { "epoch": 0.8214923821589839, "grad_norm": 0.2515760064125061, "learning_rate": 8.462178188102927e-07, "loss": 0.01038740947842598, "memory(GiB)": 22.66, "step": 25288, "token_acc": 1.0, "train_speed(iter/s)": 0.957163 }, { "epoch": 0.8215248676217393, "grad_norm": 0.34039509296417236, "learning_rate": 8.459188440683236e-07, "loss": 0.010860923677682877, "memory(GiB)": 22.66, "step": 25289, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957169 }, { "epoch": 0.8215573530844947, "grad_norm": 0.2962622344493866, "learning_rate": 8.456199172699786e-07, "loss": 0.008604711852967739, "memory(GiB)": 22.66, "step": 25290, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957175 }, { "epoch": 0.8215898385472501, "grad_norm": 0.3796343505382538, "learning_rate": 8.453210384187094e-07, "loss": 0.011189773678779602, "memory(GiB)": 22.66, "step": 25291, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.8216223240100056, "grad_norm": 0.44905078411102295, "learning_rate": 8.450222075179654e-07, "loss": 0.011935102753341198, "memory(GiB)": 22.66, "step": 25292, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957189 }, { "epoch": 0.8216548094727609, "grad_norm": 0.33030661940574646, "learning_rate": 8.447234245711966e-07, "loss": 0.01262266468256712, "memory(GiB)": 22.66, "step": 25293, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957197 }, { "epoch": 0.8216872949355164, "grad_norm": 0.39331233501434326, "learning_rate": 8.444246895818486e-07, "loss": 0.010781479999423027, "memory(GiB)": 22.66, "step": 25294, "token_acc": 1.0, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.8217197803982718, "grad_norm": 0.25862735509872437, "learning_rate": 8.441260025533709e-07, "loss": 0.011227739043533802, "memory(GiB)": 22.66, "step": 25295, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957213 }, { "epoch": 0.8217522658610272, "grad_norm": 0.33398357033729553, "learning_rate": 8.438273634892108e-07, "loss": 0.016905268654227257, "memory(GiB)": 22.66, "step": 25296, "token_acc": 0.995, "train_speed(iter/s)": 0.957221 }, { "epoch": 0.8217847513237826, "grad_norm": 0.5859383344650269, "learning_rate": 8.435287723928159e-07, "loss": 0.011681376956403255, "memory(GiB)": 22.66, "step": 25297, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.8218172367865381, "grad_norm": 0.30203408002853394, "learning_rate": 8.432302292676298e-07, "loss": 0.0101427361369133, "memory(GiB)": 22.66, "step": 25298, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.8218497222492934, "grad_norm": 0.2623227536678314, "learning_rate": 8.429317341170995e-07, "loss": 0.011095095425844193, "memory(GiB)": 22.66, "step": 25299, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957245 }, { "epoch": 0.8218822077120489, "grad_norm": 0.24568235874176025, "learning_rate": 8.426332869446701e-07, "loss": 0.007742735557258129, "memory(GiB)": 22.66, "step": 25300, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957253 }, { "epoch": 0.8219146931748043, "grad_norm": 0.3839882016181946, "learning_rate": 8.423348877537863e-07, "loss": 0.01953054964542389, "memory(GiB)": 22.66, "step": 25301, "token_acc": 1.0, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.8219471786375597, "grad_norm": 0.2857322096824646, "learning_rate": 8.420365365478927e-07, "loss": 0.01003331784158945, "memory(GiB)": 22.66, "step": 25302, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.8219796641003151, "grad_norm": 0.28526222705841064, "learning_rate": 8.417382333304308e-07, "loss": 0.01071617566049099, "memory(GiB)": 22.66, "step": 25303, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.8220121495630706, "grad_norm": 0.4773532450199127, "learning_rate": 8.414399781048449e-07, "loss": 0.015716511756181717, "memory(GiB)": 22.66, "step": 25304, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957282 }, { "epoch": 0.8220446350258259, "grad_norm": 0.3377472758293152, "learning_rate": 8.411417708745745e-07, "loss": 0.010218566283583641, "memory(GiB)": 22.66, "step": 25305, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.8220771204885814, "grad_norm": 0.4472578167915344, "learning_rate": 8.408436116430662e-07, "loss": 0.015989437699317932, "memory(GiB)": 22.66, "step": 25306, "token_acc": 0.9884393063583815, "train_speed(iter/s)": 0.957295 }, { "epoch": 0.8221096059513368, "grad_norm": 0.22392185032367706, "learning_rate": 8.405455004137569e-07, "loss": 0.00910711195319891, "memory(GiB)": 22.66, "step": 25307, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.8221420914140922, "grad_norm": 0.3751474618911743, "learning_rate": 8.402474371900904e-07, "loss": 0.009182436391711235, "memory(GiB)": 22.66, "step": 25308, "token_acc": 1.0, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.8221745768768476, "grad_norm": 0.3260689377784729, "learning_rate": 8.399494219755033e-07, "loss": 0.009919669479131699, "memory(GiB)": 22.66, "step": 25309, "token_acc": 1.0, "train_speed(iter/s)": 0.957314 }, { "epoch": 0.8222070623396031, "grad_norm": 0.38440465927124023, "learning_rate": 8.396514547734375e-07, "loss": 0.013112539425492287, "memory(GiB)": 22.66, "step": 25310, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.8222395478023584, "grad_norm": 0.3160877823829651, "learning_rate": 8.393535355873311e-07, "loss": 0.017422614619135857, "memory(GiB)": 22.66, "step": 25311, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.8222720332651139, "grad_norm": 0.2764812409877777, "learning_rate": 8.390556644206238e-07, "loss": 0.01232058834284544, "memory(GiB)": 22.66, "step": 25312, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.8223045187278692, "grad_norm": 0.3342216908931732, "learning_rate": 8.38757841276751e-07, "loss": 0.014018983580172062, "memory(GiB)": 22.66, "step": 25313, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957339 }, { "epoch": 0.8223370041906247, "grad_norm": 0.3829229772090912, "learning_rate": 8.384600661591513e-07, "loss": 0.014084498398005962, "memory(GiB)": 22.66, "step": 25314, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.8223694896533801, "grad_norm": 0.7127435803413391, "learning_rate": 8.381623390712618e-07, "loss": 0.011404242366552353, "memory(GiB)": 22.66, "step": 25315, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957352 }, { "epoch": 0.8224019751161356, "grad_norm": 0.3997189700603485, "learning_rate": 8.37864660016518e-07, "loss": 0.014066331088542938, "memory(GiB)": 22.66, "step": 25316, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957358 }, { "epoch": 0.8224344605788909, "grad_norm": 0.3916967511177063, "learning_rate": 8.375670289983573e-07, "loss": 0.018574856221675873, "memory(GiB)": 22.66, "step": 25317, "token_acc": 1.0, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.8224669460416464, "grad_norm": 0.35672488808631897, "learning_rate": 8.372694460202118e-07, "loss": 0.016573268920183182, "memory(GiB)": 22.66, "step": 25318, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.8224994315044017, "grad_norm": 0.3381725549697876, "learning_rate": 8.369719110855174e-07, "loss": 0.011287536472082138, "memory(GiB)": 22.66, "step": 25319, "token_acc": 1.0, "train_speed(iter/s)": 0.957377 }, { "epoch": 0.8225319169671572, "grad_norm": 0.3544410765171051, "learning_rate": 8.366744241977081e-07, "loss": 0.012310678139328957, "memory(GiB)": 22.66, "step": 25320, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957384 }, { "epoch": 0.8225644024299126, "grad_norm": 0.4002969264984131, "learning_rate": 8.363769853602189e-07, "loss": 0.02458827942609787, "memory(GiB)": 22.66, "step": 25321, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.822596887892668, "grad_norm": 0.344372421503067, "learning_rate": 8.360795945764799e-07, "loss": 0.016735054552555084, "memory(GiB)": 22.66, "step": 25322, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.8226293733554234, "grad_norm": 0.3487471640110016, "learning_rate": 8.357822518499242e-07, "loss": 0.007016262039542198, "memory(GiB)": 22.66, "step": 25323, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957404 }, { "epoch": 0.8226618588181789, "grad_norm": 0.3299541771411896, "learning_rate": 8.354849571839841e-07, "loss": 0.011701397597789764, "memory(GiB)": 22.66, "step": 25324, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.957411 }, { "epoch": 0.8226943442809342, "grad_norm": 0.34706979990005493, "learning_rate": 8.351877105820916e-07, "loss": 0.010677853599190712, "memory(GiB)": 22.66, "step": 25325, "token_acc": 1.0, "train_speed(iter/s)": 0.95742 }, { "epoch": 0.8227268297436897, "grad_norm": 0.4252312481403351, "learning_rate": 8.348905120476753e-07, "loss": 0.012149369344115257, "memory(GiB)": 22.66, "step": 25326, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.8227593152064451, "grad_norm": 0.34055694937705994, "learning_rate": 8.345933615841673e-07, "loss": 0.013278920203447342, "memory(GiB)": 22.66, "step": 25327, "token_acc": 1.0, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.8227918006692005, "grad_norm": 0.3607310652732849, "learning_rate": 8.342962591949932e-07, "loss": 0.015908509492874146, "memory(GiB)": 22.66, "step": 25328, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.8228242861319559, "grad_norm": 0.26476696133613586, "learning_rate": 8.339992048835877e-07, "loss": 0.009561135433614254, "memory(GiB)": 22.66, "step": 25329, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957451 }, { "epoch": 0.8228567715947114, "grad_norm": 0.3875647783279419, "learning_rate": 8.337021986533749e-07, "loss": 0.010008098557591438, "memory(GiB)": 22.66, "step": 25330, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.8228892570574667, "grad_norm": 0.3768496513366699, "learning_rate": 8.334052405077842e-07, "loss": 0.008419630117714405, "memory(GiB)": 22.66, "step": 25331, "token_acc": 1.0, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.8229217425202222, "grad_norm": 0.2825620174407959, "learning_rate": 8.331083304502441e-07, "loss": 0.008359274826943874, "memory(GiB)": 22.66, "step": 25332, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.8229542279829776, "grad_norm": 0.47716107964515686, "learning_rate": 8.32811468484177e-07, "loss": 0.01058496255427599, "memory(GiB)": 22.66, "step": 25333, "token_acc": 1.0, "train_speed(iter/s)": 0.957476 }, { "epoch": 0.822986713445733, "grad_norm": 0.35170888900756836, "learning_rate": 8.325146546130153e-07, "loss": 0.009190364740788937, "memory(GiB)": 22.66, "step": 25334, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.8230191989084884, "grad_norm": 0.3078402876853943, "learning_rate": 8.322178888401794e-07, "loss": 0.011505928821861744, "memory(GiB)": 22.66, "step": 25335, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.8230516843712439, "grad_norm": 0.28627368807792664, "learning_rate": 8.319211711690983e-07, "loss": 0.010604100301861763, "memory(GiB)": 22.66, "step": 25336, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.8230841698339992, "grad_norm": 0.4584587812423706, "learning_rate": 8.316245016031926e-07, "loss": 0.01708238571882248, "memory(GiB)": 22.66, "step": 25337, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.9575 }, { "epoch": 0.8231166552967547, "grad_norm": 0.332972913980484, "learning_rate": 8.31327880145889e-07, "loss": 0.012379547581076622, "memory(GiB)": 22.66, "step": 25338, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.8231491407595101, "grad_norm": 0.29973500967025757, "learning_rate": 8.310313068006098e-07, "loss": 0.011205853894352913, "memory(GiB)": 22.66, "step": 25339, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.8231816262222655, "grad_norm": 0.36477962136268616, "learning_rate": 8.307347815707795e-07, "loss": 0.01023360900580883, "memory(GiB)": 22.66, "step": 25340, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.8232141116850209, "grad_norm": 0.4188794791698456, "learning_rate": 8.30438304459818e-07, "loss": 0.015482286922633648, "memory(GiB)": 22.66, "step": 25341, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.8232465971477764, "grad_norm": 0.4361419975757599, "learning_rate": 8.301418754711476e-07, "loss": 0.01883639208972454, "memory(GiB)": 22.66, "step": 25342, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.8232790826105317, "grad_norm": 1.0472573041915894, "learning_rate": 8.298454946081908e-07, "loss": 0.00988711230456829, "memory(GiB)": 22.66, "step": 25343, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.8233115680732872, "grad_norm": 0.3917415142059326, "learning_rate": 8.295491618743679e-07, "loss": 0.008133607916533947, "memory(GiB)": 22.66, "step": 25344, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.8233440535360427, "grad_norm": 0.3236757516860962, "learning_rate": 8.292528772730979e-07, "loss": 0.006684017833322287, "memory(GiB)": 22.66, "step": 25345, "token_acc": 1.0, "train_speed(iter/s)": 0.957549 }, { "epoch": 0.823376538998798, "grad_norm": 0.332133412361145, "learning_rate": 8.289566408078004e-07, "loss": 0.012657894752919674, "memory(GiB)": 22.66, "step": 25346, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957555 }, { "epoch": 0.8234090244615535, "grad_norm": 0.4842779338359833, "learning_rate": 8.286604524818953e-07, "loss": 0.015111192129552364, "memory(GiB)": 22.66, "step": 25347, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957561 }, { "epoch": 0.8234415099243089, "grad_norm": 0.468302845954895, "learning_rate": 8.283643122988005e-07, "loss": 0.018160372972488403, "memory(GiB)": 22.66, "step": 25348, "token_acc": 1.0, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.8234739953870643, "grad_norm": 0.4907063841819763, "learning_rate": 8.280682202619356e-07, "loss": 0.010810698382556438, "memory(GiB)": 22.66, "step": 25349, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.8235064808498197, "grad_norm": 0.35253608226776123, "learning_rate": 8.277721763747143e-07, "loss": 0.014275453984737396, "memory(GiB)": 22.66, "step": 25350, "token_acc": 1.0, "train_speed(iter/s)": 0.957578 }, { "epoch": 0.8235389663125752, "grad_norm": 0.4350159764289856, "learning_rate": 8.274761806405557e-07, "loss": 0.015735767781734467, "memory(GiB)": 22.66, "step": 25351, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.8235714517753305, "grad_norm": 0.3368360102176666, "learning_rate": 8.271802330628759e-07, "loss": 0.011668356135487556, "memory(GiB)": 22.66, "step": 25352, "token_acc": 1.0, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.823603937238086, "grad_norm": 0.2582084834575653, "learning_rate": 8.26884333645091e-07, "loss": 0.010943502187728882, "memory(GiB)": 22.66, "step": 25353, "token_acc": 1.0, "train_speed(iter/s)": 0.957597 }, { "epoch": 0.8236364227008414, "grad_norm": 0.42258089780807495, "learning_rate": 8.26588482390614e-07, "loss": 0.011100751347839832, "memory(GiB)": 22.66, "step": 25354, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.8236689081635968, "grad_norm": 0.3596913814544678, "learning_rate": 8.262926793028625e-07, "loss": 0.01318285521119833, "memory(GiB)": 22.66, "step": 25355, "token_acc": 0.9966329966329966, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.8237013936263522, "grad_norm": 0.2851141095161438, "learning_rate": 8.259969243852456e-07, "loss": 0.009729638695716858, "memory(GiB)": 22.66, "step": 25356, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.8237338790891077, "grad_norm": 0.4136703610420227, "learning_rate": 8.257012176411827e-07, "loss": 0.010375765152275562, "memory(GiB)": 22.66, "step": 25357, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957629 }, { "epoch": 0.823766364551863, "grad_norm": 0.3978676199913025, "learning_rate": 8.254055590740823e-07, "loss": 0.014430122449994087, "memory(GiB)": 22.66, "step": 25358, "token_acc": 1.0, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.8237988500146185, "grad_norm": 0.3636215925216675, "learning_rate": 8.251099486873593e-07, "loss": 0.009264794178307056, "memory(GiB)": 22.66, "step": 25359, "token_acc": 1.0, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.8238313354773739, "grad_norm": 0.2728734314441681, "learning_rate": 8.248143864844232e-07, "loss": 0.009940650314092636, "memory(GiB)": 22.66, "step": 25360, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.957653 }, { "epoch": 0.8238638209401293, "grad_norm": 0.3377588987350464, "learning_rate": 8.245188724686848e-07, "loss": 0.011737140826880932, "memory(GiB)": 22.66, "step": 25361, "token_acc": 1.0, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.8238963064028847, "grad_norm": 0.27653464674949646, "learning_rate": 8.242234066435583e-07, "loss": 0.011848898604512215, "memory(GiB)": 22.66, "step": 25362, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957668 }, { "epoch": 0.8239287918656402, "grad_norm": 0.7713191509246826, "learning_rate": 8.239279890124513e-07, "loss": 0.02036791667342186, "memory(GiB)": 22.66, "step": 25363, "token_acc": 1.0, "train_speed(iter/s)": 0.957675 }, { "epoch": 0.8239612773283955, "grad_norm": 0.3865746259689331, "learning_rate": 8.236326195787741e-07, "loss": 0.017140837386250496, "memory(GiB)": 22.66, "step": 25364, "token_acc": 0.9837837837837838, "train_speed(iter/s)": 0.957681 }, { "epoch": 0.823993762791151, "grad_norm": 0.3292192816734314, "learning_rate": 8.233372983459342e-07, "loss": 0.009352874010801315, "memory(GiB)": 22.66, "step": 25365, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957687 }, { "epoch": 0.8240262482539064, "grad_norm": 0.37075549364089966, "learning_rate": 8.23042025317341e-07, "loss": 0.011355282738804817, "memory(GiB)": 22.66, "step": 25366, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.8240587337166618, "grad_norm": 0.30450382828712463, "learning_rate": 8.227468004964028e-07, "loss": 0.014958567917346954, "memory(GiB)": 22.66, "step": 25367, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.8240912191794172, "grad_norm": 0.2766070067882538, "learning_rate": 8.224516238865271e-07, "loss": 0.006338318809866905, "memory(GiB)": 22.66, "step": 25368, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957705 }, { "epoch": 0.8241237046421727, "grad_norm": 0.3306836783885956, "learning_rate": 8.221564954911188e-07, "loss": 0.011816376820206642, "memory(GiB)": 22.66, "step": 25369, "token_acc": 1.0, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.824156190104928, "grad_norm": 0.3521166443824768, "learning_rate": 8.218614153135857e-07, "loss": 0.01413646899163723, "memory(GiB)": 22.66, "step": 25370, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.8241886755676835, "grad_norm": 0.38804951310157776, "learning_rate": 8.215663833573323e-07, "loss": 0.011672250926494598, "memory(GiB)": 22.66, "step": 25371, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957722 }, { "epoch": 0.8242211610304389, "grad_norm": 0.4481344521045685, "learning_rate": 8.21271399625766e-07, "loss": 0.011535311117768288, "memory(GiB)": 22.66, "step": 25372, "token_acc": 1.0, "train_speed(iter/s)": 0.957729 }, { "epoch": 0.8242536464931943, "grad_norm": 0.34688952565193176, "learning_rate": 8.209764641222884e-07, "loss": 0.013279430568218231, "memory(GiB)": 22.66, "step": 25373, "token_acc": 1.0, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.8242861319559497, "grad_norm": 0.34712332487106323, "learning_rate": 8.20681576850304e-07, "loss": 0.015322031453251839, "memory(GiB)": 22.66, "step": 25374, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.8243186174187052, "grad_norm": 0.31736230850219727, "learning_rate": 8.203867378132175e-07, "loss": 0.01120264083147049, "memory(GiB)": 22.66, "step": 25375, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.8243511028814605, "grad_norm": 0.6747690439224243, "learning_rate": 8.200919470144325e-07, "loss": 0.011783070862293243, "memory(GiB)": 22.66, "step": 25376, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957754 }, { "epoch": 0.824383588344216, "grad_norm": 0.4068145751953125, "learning_rate": 8.197972044573482e-07, "loss": 0.01366014126688242, "memory(GiB)": 22.66, "step": 25377, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957761 }, { "epoch": 0.8244160738069714, "grad_norm": 0.3723624348640442, "learning_rate": 8.19502510145368e-07, "loss": 0.01392442174255848, "memory(GiB)": 22.66, "step": 25378, "token_acc": 1.0, "train_speed(iter/s)": 0.957768 }, { "epoch": 0.8244485592697268, "grad_norm": 0.3716706335544586, "learning_rate": 8.192078640818934e-07, "loss": 0.012308446690440178, "memory(GiB)": 22.66, "step": 25379, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.957773 }, { "epoch": 0.8244810447324822, "grad_norm": 0.395652174949646, "learning_rate": 8.189132662703247e-07, "loss": 0.013156663626432419, "memory(GiB)": 22.66, "step": 25380, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.8245135301952377, "grad_norm": 0.43380922079086304, "learning_rate": 8.186187167140624e-07, "loss": 0.015157199464738369, "memory(GiB)": 22.66, "step": 25381, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957786 }, { "epoch": 0.824546015657993, "grad_norm": 0.37010616064071655, "learning_rate": 8.18324215416505e-07, "loss": 0.013843158259987831, "memory(GiB)": 22.66, "step": 25382, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957792 }, { "epoch": 0.8245785011207485, "grad_norm": 0.3393165171146393, "learning_rate": 8.180297623810529e-07, "loss": 0.012854759581387043, "memory(GiB)": 22.66, "step": 25383, "token_acc": 1.0, "train_speed(iter/s)": 0.9578 }, { "epoch": 0.8246109865835038, "grad_norm": 0.35132917761802673, "learning_rate": 8.177353576111013e-07, "loss": 0.013016263023018837, "memory(GiB)": 22.66, "step": 25384, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957808 }, { "epoch": 0.8246434720462593, "grad_norm": 0.3935837745666504, "learning_rate": 8.174410011100525e-07, "loss": 0.013053545728325844, "memory(GiB)": 22.66, "step": 25385, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957817 }, { "epoch": 0.8246759575090147, "grad_norm": 0.44438430666923523, "learning_rate": 8.171466928813005e-07, "loss": 0.014344373717904091, "memory(GiB)": 22.66, "step": 25386, "token_acc": 1.0, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.8247084429717702, "grad_norm": 0.40612658858299255, "learning_rate": 8.16852432928244e-07, "loss": 0.01790888048708439, "memory(GiB)": 22.66, "step": 25387, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.957833 }, { "epoch": 0.8247409284345255, "grad_norm": 0.2747974097728729, "learning_rate": 8.165582212542772e-07, "loss": 0.010904354974627495, "memory(GiB)": 22.66, "step": 25388, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.824773413897281, "grad_norm": 0.39901942014694214, "learning_rate": 8.162640578627967e-07, "loss": 0.011824477463960648, "memory(GiB)": 22.66, "step": 25389, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.8248058993600363, "grad_norm": 0.20395898818969727, "learning_rate": 8.159699427571971e-07, "loss": 0.007265864871442318, "memory(GiB)": 22.66, "step": 25390, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.8248383848227918, "grad_norm": 0.19146637618541718, "learning_rate": 8.15675875940875e-07, "loss": 0.010001113638281822, "memory(GiB)": 22.66, "step": 25391, "token_acc": 1.0, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.8248708702855472, "grad_norm": 0.271560400724411, "learning_rate": 8.153818574172212e-07, "loss": 0.00855887308716774, "memory(GiB)": 22.66, "step": 25392, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95787 }, { "epoch": 0.8249033557483026, "grad_norm": 0.3639358580112457, "learning_rate": 8.150878871896306e-07, "loss": 0.016159621998667717, "memory(GiB)": 22.66, "step": 25393, "token_acc": 1.0, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.824935841211058, "grad_norm": 0.34467652440071106, "learning_rate": 8.147939652614955e-07, "loss": 0.015332593582570553, "memory(GiB)": 22.66, "step": 25394, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.8249683266738135, "grad_norm": 0.393110066652298, "learning_rate": 8.145000916362089e-07, "loss": 0.010007762350142002, "memory(GiB)": 22.66, "step": 25395, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.8250008121365688, "grad_norm": 0.36583778262138367, "learning_rate": 8.142062663171635e-07, "loss": 0.013107937760651112, "memory(GiB)": 22.66, "step": 25396, "token_acc": 1.0, "train_speed(iter/s)": 0.957895 }, { "epoch": 0.8250332975993243, "grad_norm": 0.19833676517009735, "learning_rate": 8.139124893077471e-07, "loss": 0.010478049516677856, "memory(GiB)": 22.66, "step": 25397, "token_acc": 0.996, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.8250657830620797, "grad_norm": 0.3622932732105255, "learning_rate": 8.136187606113532e-07, "loss": 0.01635451801121235, "memory(GiB)": 22.66, "step": 25398, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.8250982685248351, "grad_norm": 0.5000638365745544, "learning_rate": 8.133250802313703e-07, "loss": 0.016781149432063103, "memory(GiB)": 22.66, "step": 25399, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.8251307539875905, "grad_norm": 0.22489255666732788, "learning_rate": 8.130314481711898e-07, "loss": 0.006849231198430061, "memory(GiB)": 22.66, "step": 25400, "token_acc": 1.0, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.825163239450346, "grad_norm": 0.37010425329208374, "learning_rate": 8.127378644341977e-07, "loss": 0.011838721111416817, "memory(GiB)": 22.66, "step": 25401, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957927 }, { "epoch": 0.8251957249131013, "grad_norm": 0.3447292149066925, "learning_rate": 8.124443290237844e-07, "loss": 0.014184290543198586, "memory(GiB)": 22.66, "step": 25402, "token_acc": 1.0, "train_speed(iter/s)": 0.957933 }, { "epoch": 0.8252282103758568, "grad_norm": 0.27564600110054016, "learning_rate": 8.121508419433366e-07, "loss": 0.006335503421723843, "memory(GiB)": 22.66, "step": 25403, "token_acc": 1.0, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.8252606958386122, "grad_norm": 0.42224037647247314, "learning_rate": 8.11857403196244e-07, "loss": 0.012581149116158485, "memory(GiB)": 22.66, "step": 25404, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.8252931813013676, "grad_norm": 0.17605747282505035, "learning_rate": 8.115640127858892e-07, "loss": 0.008210677653551102, "memory(GiB)": 22.66, "step": 25405, "token_acc": 1.0, "train_speed(iter/s)": 0.957952 }, { "epoch": 0.825325666764123, "grad_norm": 0.30819424986839294, "learning_rate": 8.112706707156626e-07, "loss": 0.008568299934267998, "memory(GiB)": 22.66, "step": 25406, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957959 }, { "epoch": 0.8253581522268785, "grad_norm": 0.377171128988266, "learning_rate": 8.109773769889451e-07, "loss": 0.01449789758771658, "memory(GiB)": 22.66, "step": 25407, "token_acc": 0.9894366197183099, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.8253906376896339, "grad_norm": 0.25727495551109314, "learning_rate": 8.106841316091252e-07, "loss": 0.009046918712556362, "memory(GiB)": 22.66, "step": 25408, "token_acc": 1.0, "train_speed(iter/s)": 0.957973 }, { "epoch": 0.8254231231523893, "grad_norm": 0.32550477981567383, "learning_rate": 8.103909345795874e-07, "loss": 0.010878256522119045, "memory(GiB)": 22.66, "step": 25409, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.8254556086151448, "grad_norm": 0.3428514003753662, "learning_rate": 8.100977859037134e-07, "loss": 0.01166713796555996, "memory(GiB)": 22.66, "step": 25410, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.8254880940779001, "grad_norm": 0.5005161762237549, "learning_rate": 8.098046855848896e-07, "loss": 0.016590693965554237, "memory(GiB)": 22.66, "step": 25411, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.957992 }, { "epoch": 0.8255205795406556, "grad_norm": 0.30778369307518005, "learning_rate": 8.095116336264935e-07, "loss": 0.010186953470110893, "memory(GiB)": 22.66, "step": 25412, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.825553065003411, "grad_norm": 0.2978764474391937, "learning_rate": 8.092186300319138e-07, "loss": 0.008258573710918427, "memory(GiB)": 22.66, "step": 25413, "token_acc": 1.0, "train_speed(iter/s)": 0.958005 }, { "epoch": 0.8255855504661664, "grad_norm": 0.3531089425086975, "learning_rate": 8.089256748045276e-07, "loss": 0.01324284728616476, "memory(GiB)": 22.66, "step": 25414, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958012 }, { "epoch": 0.8256180359289218, "grad_norm": 0.3669959604740143, "learning_rate": 8.086327679477185e-07, "loss": 0.011363307014107704, "memory(GiB)": 22.66, "step": 25415, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.958019 }, { "epoch": 0.8256505213916773, "grad_norm": 0.2701931893825531, "learning_rate": 8.083399094648647e-07, "loss": 0.009219314903020859, "memory(GiB)": 22.66, "step": 25416, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.8256830068544326, "grad_norm": 0.2533249855041504, "learning_rate": 8.080470993593476e-07, "loss": 0.012663343921303749, "memory(GiB)": 22.66, "step": 25417, "token_acc": 1.0, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.8257154923171881, "grad_norm": 0.4192073941230774, "learning_rate": 8.077543376345464e-07, "loss": 0.009622530080378056, "memory(GiB)": 22.66, "step": 25418, "token_acc": 1.0, "train_speed(iter/s)": 0.958043 }, { "epoch": 0.8257479777799435, "grad_norm": 0.4417513906955719, "learning_rate": 8.074616242938416e-07, "loss": 0.013419242575764656, "memory(GiB)": 22.66, "step": 25419, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.8257804632426989, "grad_norm": 0.31272566318511963, "learning_rate": 8.071689593406085e-07, "loss": 0.010565202683210373, "memory(GiB)": 22.66, "step": 25420, "token_acc": 1.0, "train_speed(iter/s)": 0.958059 }, { "epoch": 0.8258129487054543, "grad_norm": 0.5264574289321899, "learning_rate": 8.068763427782256e-07, "loss": 0.013886161148548126, "memory(GiB)": 22.66, "step": 25421, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.8258454341682098, "grad_norm": 0.34382912516593933, "learning_rate": 8.065837746100713e-07, "loss": 0.011857048608362675, "memory(GiB)": 22.66, "step": 25422, "token_acc": 1.0, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.8258779196309651, "grad_norm": 0.264023095369339, "learning_rate": 8.062912548395218e-07, "loss": 0.010547548532485962, "memory(GiB)": 22.66, "step": 25423, "token_acc": 1.0, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.8259104050937206, "grad_norm": 0.36222541332244873, "learning_rate": 8.05998783469954e-07, "loss": 0.012711865827441216, "memory(GiB)": 22.66, "step": 25424, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958087 }, { "epoch": 0.825942890556476, "grad_norm": 0.2142098993062973, "learning_rate": 8.057063605047416e-07, "loss": 0.0065620820969343185, "memory(GiB)": 22.66, "step": 25425, "token_acc": 1.0, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.8259753760192314, "grad_norm": 0.28928524255752563, "learning_rate": 8.054139859472599e-07, "loss": 0.009978746064007282, "memory(GiB)": 22.66, "step": 25426, "token_acc": 1.0, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.8260078614819868, "grad_norm": 0.3150707483291626, "learning_rate": 8.051216598008838e-07, "loss": 0.00829162448644638, "memory(GiB)": 22.66, "step": 25427, "token_acc": 1.0, "train_speed(iter/s)": 0.958106 }, { "epoch": 0.8260403469447423, "grad_norm": 0.35097774863243103, "learning_rate": 8.048293820689884e-07, "loss": 0.012560145929455757, "memory(GiB)": 22.66, "step": 25428, "token_acc": 1.0, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.8260728324074976, "grad_norm": 0.32289406657218933, "learning_rate": 8.045371527549444e-07, "loss": 0.00892257783561945, "memory(GiB)": 22.66, "step": 25429, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.958118 }, { "epoch": 0.8261053178702531, "grad_norm": 0.3301582336425781, "learning_rate": 8.042449718621265e-07, "loss": 0.010093837045133114, "memory(GiB)": 22.66, "step": 25430, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.8261378033330085, "grad_norm": 0.2962908148765564, "learning_rate": 8.039528393939055e-07, "loss": 0.012939861044287682, "memory(GiB)": 22.66, "step": 25431, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95813 }, { "epoch": 0.8261702887957639, "grad_norm": 0.4254043996334076, "learning_rate": 8.036607553536552e-07, "loss": 0.01277583185583353, "memory(GiB)": 22.66, "step": 25432, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.8262027742585193, "grad_norm": 0.3846626877784729, "learning_rate": 8.033687197447437e-07, "loss": 0.01324855349957943, "memory(GiB)": 22.66, "step": 25433, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.8262352597212748, "grad_norm": 0.20768685638904572, "learning_rate": 8.030767325705441e-07, "loss": 0.005257682874798775, "memory(GiB)": 22.66, "step": 25434, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.8262677451840301, "grad_norm": 0.4246254861354828, "learning_rate": 8.027847938344224e-07, "loss": 0.01753724366426468, "memory(GiB)": 22.66, "step": 25435, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958155 }, { "epoch": 0.8263002306467856, "grad_norm": 0.4342920482158661, "learning_rate": 8.024929035397532e-07, "loss": 0.013609173707664013, "memory(GiB)": 22.66, "step": 25436, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.958161 }, { "epoch": 0.826332716109541, "grad_norm": 0.9748532772064209, "learning_rate": 8.022010616899017e-07, "loss": 0.012973028235137463, "memory(GiB)": 22.66, "step": 25437, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958167 }, { "epoch": 0.8263652015722964, "grad_norm": 0.40614932775497437, "learning_rate": 8.019092682882385e-07, "loss": 0.01357967033982277, "memory(GiB)": 22.66, "step": 25438, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.8263976870350518, "grad_norm": 0.37372592091560364, "learning_rate": 8.016175233381285e-07, "loss": 0.014961231499910355, "memory(GiB)": 22.66, "step": 25439, "token_acc": 1.0, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.8264301724978073, "grad_norm": 0.3551552891731262, "learning_rate": 8.013258268429397e-07, "loss": 0.01855965331196785, "memory(GiB)": 22.66, "step": 25440, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.958185 }, { "epoch": 0.8264626579605626, "grad_norm": 0.3065510094165802, "learning_rate": 8.01034178806041e-07, "loss": 0.010826492682099342, "memory(GiB)": 22.66, "step": 25441, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95819 }, { "epoch": 0.8264951434233181, "grad_norm": 0.2670159637928009, "learning_rate": 8.00742579230796e-07, "loss": 0.007144204806536436, "memory(GiB)": 22.66, "step": 25442, "token_acc": 1.0, "train_speed(iter/s)": 0.958195 }, { "epoch": 0.8265276288860735, "grad_norm": 0.37624216079711914, "learning_rate": 8.004510281205719e-07, "loss": 0.009076775051653385, "memory(GiB)": 22.66, "step": 25443, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958203 }, { "epoch": 0.8265601143488289, "grad_norm": 0.28355666995048523, "learning_rate": 8.001595254787309e-07, "loss": 0.010353785008192062, "memory(GiB)": 22.66, "step": 25444, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.958211 }, { "epoch": 0.8265925998115843, "grad_norm": 0.3932335376739502, "learning_rate": 7.998680713086393e-07, "loss": 0.019266318529844284, "memory(GiB)": 22.66, "step": 25445, "token_acc": 1.0, "train_speed(iter/s)": 0.958219 }, { "epoch": 0.8266250852743398, "grad_norm": 0.4090616703033447, "learning_rate": 7.995766656136605e-07, "loss": 0.013352539390325546, "memory(GiB)": 22.66, "step": 25446, "token_acc": 1.0, "train_speed(iter/s)": 0.958228 }, { "epoch": 0.8266575707370951, "grad_norm": 0.391458660364151, "learning_rate": 7.992853083971591e-07, "loss": 0.016575690358877182, "memory(GiB)": 22.66, "step": 25447, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.958236 }, { "epoch": 0.8266900561998506, "grad_norm": 0.2238117903470993, "learning_rate": 7.989939996624951e-07, "loss": 0.008472541347146034, "memory(GiB)": 22.66, "step": 25448, "token_acc": 1.0, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.826722541662606, "grad_norm": 0.4145825505256653, "learning_rate": 7.987027394130314e-07, "loss": 0.0173963475972414, "memory(GiB)": 22.66, "step": 25449, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.8267550271253614, "grad_norm": 0.3562590479850769, "learning_rate": 7.984115276521309e-07, "loss": 0.010778852738440037, "memory(GiB)": 22.66, "step": 25450, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958261 }, { "epoch": 0.8267875125881168, "grad_norm": 0.37394848465919495, "learning_rate": 7.981203643831542e-07, "loss": 0.010484281927347183, "memory(GiB)": 22.66, "step": 25451, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.8268199980508723, "grad_norm": 0.30581697821617126, "learning_rate": 7.978292496094602e-07, "loss": 0.009749332442879677, "memory(GiB)": 22.66, "step": 25452, "token_acc": 1.0, "train_speed(iter/s)": 0.958277 }, { "epoch": 0.8268524835136276, "grad_norm": 0.2734805941581726, "learning_rate": 7.975381833344098e-07, "loss": 0.011371305212378502, "memory(GiB)": 22.66, "step": 25453, "token_acc": 1.0, "train_speed(iter/s)": 0.958285 }, { "epoch": 0.8268849689763831, "grad_norm": 0.24824945628643036, "learning_rate": 7.972471655613617e-07, "loss": 0.009358955547213554, "memory(GiB)": 22.66, "step": 25454, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.958291 }, { "epoch": 0.8269174544391384, "grad_norm": 0.3785208761692047, "learning_rate": 7.969561962936756e-07, "loss": 0.015875833109021187, "memory(GiB)": 22.66, "step": 25455, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958298 }, { "epoch": 0.8269499399018939, "grad_norm": 0.3214736580848694, "learning_rate": 7.966652755347104e-07, "loss": 0.011790875345468521, "memory(GiB)": 22.66, "step": 25456, "token_acc": 0.996, "train_speed(iter/s)": 0.958304 }, { "epoch": 0.8269824253646493, "grad_norm": 0.5709043145179749, "learning_rate": 7.963744032878213e-07, "loss": 0.019432686269283295, "memory(GiB)": 22.66, "step": 25457, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958311 }, { "epoch": 0.8270149108274047, "grad_norm": 0.3189460039138794, "learning_rate": 7.960835795563671e-07, "loss": 0.014224587008357048, "memory(GiB)": 22.66, "step": 25458, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958317 }, { "epoch": 0.8270473962901601, "grad_norm": 0.40112096071243286, "learning_rate": 7.95792804343703e-07, "loss": 0.015863681212067604, "memory(GiB)": 22.66, "step": 25459, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.8270798817529156, "grad_norm": 0.4115431010723114, "learning_rate": 7.955020776531874e-07, "loss": 0.015429528430104256, "memory(GiB)": 22.66, "step": 25460, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95833 }, { "epoch": 0.8271123672156709, "grad_norm": 0.4646648168563843, "learning_rate": 7.952113994881727e-07, "loss": 0.019764598459005356, "memory(GiB)": 22.66, "step": 25461, "token_acc": 1.0, "train_speed(iter/s)": 0.958336 }, { "epoch": 0.8271448526784264, "grad_norm": 0.2959018051624298, "learning_rate": 7.949207698520162e-07, "loss": 0.013147828169167042, "memory(GiB)": 22.66, "step": 25462, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.8271773381411818, "grad_norm": 0.4921087920665741, "learning_rate": 7.946301887480684e-07, "loss": 0.018238861113786697, "memory(GiB)": 22.66, "step": 25463, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958348 }, { "epoch": 0.8272098236039372, "grad_norm": 0.48662883043289185, "learning_rate": 7.943396561796879e-07, "loss": 0.021166248247027397, "memory(GiB)": 22.66, "step": 25464, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.958354 }, { "epoch": 0.8272423090666926, "grad_norm": 0.1935141235589981, "learning_rate": 7.940491721502247e-07, "loss": 0.008866834454238415, "memory(GiB)": 22.66, "step": 25465, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95836 }, { "epoch": 0.8272747945294481, "grad_norm": 0.3511008620262146, "learning_rate": 7.937587366630328e-07, "loss": 0.008582865819334984, "memory(GiB)": 22.66, "step": 25466, "token_acc": 1.0, "train_speed(iter/s)": 0.958366 }, { "epoch": 0.8273072799922034, "grad_norm": 0.7023964524269104, "learning_rate": 7.934683497214629e-07, "loss": 0.019474249333143234, "memory(GiB)": 22.66, "step": 25467, "token_acc": 1.0, "train_speed(iter/s)": 0.958372 }, { "epoch": 0.8273397654549589, "grad_norm": 0.2101859301328659, "learning_rate": 7.931780113288673e-07, "loss": 0.008992457762360573, "memory(GiB)": 22.66, "step": 25468, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958378 }, { "epoch": 0.8273722509177143, "grad_norm": 0.26288148760795593, "learning_rate": 7.92887721488596e-07, "loss": 0.008233046159148216, "memory(GiB)": 22.66, "step": 25469, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958384 }, { "epoch": 0.8274047363804697, "grad_norm": 0.31518885493278503, "learning_rate": 7.925974802040004e-07, "loss": 0.012661002576351166, "memory(GiB)": 22.66, "step": 25470, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95839 }, { "epoch": 0.8274372218432251, "grad_norm": 0.23090380430221558, "learning_rate": 7.92307287478431e-07, "loss": 0.00853803101927042, "memory(GiB)": 22.66, "step": 25471, "token_acc": 1.0, "train_speed(iter/s)": 0.958396 }, { "epoch": 0.8274697073059806, "grad_norm": 0.37957021594047546, "learning_rate": 7.920171433152352e-07, "loss": 0.009111268445849419, "memory(GiB)": 22.66, "step": 25472, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958401 }, { "epoch": 0.827502192768736, "grad_norm": 0.3632005453109741, "learning_rate": 7.917270477177619e-07, "loss": 0.011455176398158073, "memory(GiB)": 22.66, "step": 25473, "token_acc": 1.0, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.8275346782314914, "grad_norm": 0.31035158038139343, "learning_rate": 7.914370006893596e-07, "loss": 0.009919395670294762, "memory(GiB)": 22.66, "step": 25474, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.958412 }, { "epoch": 0.8275671636942469, "grad_norm": 0.4325971305370331, "learning_rate": 7.911470022333772e-07, "loss": 0.011435107327997684, "memory(GiB)": 22.66, "step": 25475, "token_acc": 1.0, "train_speed(iter/s)": 0.958417 }, { "epoch": 0.8275996491570022, "grad_norm": 0.35452204942703247, "learning_rate": 7.908570523531595e-07, "loss": 0.011128636077046394, "memory(GiB)": 22.66, "step": 25476, "token_acc": 1.0, "train_speed(iter/s)": 0.958423 }, { "epoch": 0.8276321346197577, "grad_norm": 0.2779185175895691, "learning_rate": 7.905671510520535e-07, "loss": 0.008674124255776405, "memory(GiB)": 22.66, "step": 25477, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.95843 }, { "epoch": 0.8276646200825131, "grad_norm": 0.5103667974472046, "learning_rate": 7.902772983334056e-07, "loss": 0.019540883600711823, "memory(GiB)": 22.66, "step": 25478, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.958437 }, { "epoch": 0.8276971055452685, "grad_norm": 0.36792904138565063, "learning_rate": 7.899874942005614e-07, "loss": 0.011284224689006805, "memory(GiB)": 22.66, "step": 25479, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.958444 }, { "epoch": 0.8277295910080239, "grad_norm": 0.31994813680648804, "learning_rate": 7.896977386568644e-07, "loss": 0.012676505371928215, "memory(GiB)": 22.66, "step": 25480, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.958451 }, { "epoch": 0.8277620764707794, "grad_norm": 0.37009096145629883, "learning_rate": 7.89408031705659e-07, "loss": 0.021445732563734055, "memory(GiB)": 22.66, "step": 25481, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958459 }, { "epoch": 0.8277945619335347, "grad_norm": 0.20624887943267822, "learning_rate": 7.891183733502894e-07, "loss": 0.007989378646016121, "memory(GiB)": 22.66, "step": 25482, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958465 }, { "epoch": 0.8278270473962902, "grad_norm": 0.7870636582374573, "learning_rate": 7.888287635941e-07, "loss": 0.021059032529592514, "memory(GiB)": 22.66, "step": 25483, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.8278595328590456, "grad_norm": 0.2352800965309143, "learning_rate": 7.885392024404298e-07, "loss": 0.008378003723919392, "memory(GiB)": 22.66, "step": 25484, "token_acc": 1.0, "train_speed(iter/s)": 0.958477 }, { "epoch": 0.827892018321801, "grad_norm": 0.4343225657939911, "learning_rate": 7.882496898926234e-07, "loss": 0.019112177193164825, "memory(GiB)": 22.66, "step": 25485, "token_acc": 1.0, "train_speed(iter/s)": 0.958482 }, { "epoch": 0.8279245037845564, "grad_norm": 0.30642062425613403, "learning_rate": 7.879602259540209e-07, "loss": 0.012595122680068016, "memory(GiB)": 22.66, "step": 25486, "token_acc": 1.0, "train_speed(iter/s)": 0.958487 }, { "epoch": 0.8279569892473119, "grad_norm": 0.28560692071914673, "learning_rate": 7.876708106279634e-07, "loss": 0.009561960585415363, "memory(GiB)": 22.66, "step": 25487, "token_acc": 1.0, "train_speed(iter/s)": 0.958493 }, { "epoch": 0.8279894747100672, "grad_norm": 0.5102300643920898, "learning_rate": 7.873814439177929e-07, "loss": 0.010230613872408867, "memory(GiB)": 22.66, "step": 25488, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.9585 }, { "epoch": 0.8280219601728227, "grad_norm": 0.24354387819766998, "learning_rate": 7.870921258268465e-07, "loss": 0.009336095303297043, "memory(GiB)": 22.66, "step": 25489, "token_acc": 1.0, "train_speed(iter/s)": 0.958505 }, { "epoch": 0.8280544456355781, "grad_norm": 0.4126155376434326, "learning_rate": 7.868028563584651e-07, "loss": 0.010147114284336567, "memory(GiB)": 22.66, "step": 25490, "token_acc": 1.0, "train_speed(iter/s)": 0.958511 }, { "epoch": 0.8280869310983335, "grad_norm": 0.3263026773929596, "learning_rate": 7.865136355159841e-07, "loss": 0.010658304207026958, "memory(GiB)": 22.66, "step": 25491, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.8281194165610889, "grad_norm": 0.23026928305625916, "learning_rate": 7.862244633027466e-07, "loss": 0.012563267722725868, "memory(GiB)": 22.66, "step": 25492, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.958523 }, { "epoch": 0.8281519020238444, "grad_norm": 0.3328065872192383, "learning_rate": 7.859353397220859e-07, "loss": 0.015050871297717094, "memory(GiB)": 22.66, "step": 25493, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958529 }, { "epoch": 0.8281843874865997, "grad_norm": 0.28100186586380005, "learning_rate": 7.856462647773417e-07, "loss": 0.010247945785522461, "memory(GiB)": 22.66, "step": 25494, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.958535 }, { "epoch": 0.8282168729493552, "grad_norm": 0.2977491319179535, "learning_rate": 7.853572384718478e-07, "loss": 0.011947022750973701, "memory(GiB)": 22.66, "step": 25495, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.958541 }, { "epoch": 0.8282493584121106, "grad_norm": 0.39130011200904846, "learning_rate": 7.850682608089405e-07, "loss": 0.015961868688464165, "memory(GiB)": 22.66, "step": 25496, "token_acc": 1.0, "train_speed(iter/s)": 0.958547 }, { "epoch": 0.828281843874866, "grad_norm": 0.3962123990058899, "learning_rate": 7.847793317919561e-07, "loss": 0.014921074733138084, "memory(GiB)": 22.66, "step": 25497, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.958553 }, { "epoch": 0.8283143293376214, "grad_norm": 0.3645492494106293, "learning_rate": 7.844904514242296e-07, "loss": 0.013237478211522102, "memory(GiB)": 22.66, "step": 25498, "token_acc": 1.0, "train_speed(iter/s)": 0.958559 }, { "epoch": 0.8283468148003769, "grad_norm": 0.38218921422958374, "learning_rate": 7.842016197090929e-07, "loss": 0.011239700019359589, "memory(GiB)": 22.66, "step": 25499, "token_acc": 1.0, "train_speed(iter/s)": 0.958564 }, { "epoch": 0.8283793002631322, "grad_norm": 0.3475150763988495, "learning_rate": 7.83912836649881e-07, "loss": 0.01036711223423481, "memory(GiB)": 22.66, "step": 25500, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95857 }, { "epoch": 0.8283793002631322, "eval_loss": 0.012145888060331345, "eval_runtime": 80.2991, "eval_samples_per_second": 123.912, "eval_steps_per_second": 3.873, "eval_token_acc": 0.9950902410604119, "step": 25500 }, { "epoch": 0.8284117857258877, "grad_norm": 0.37366193532943726, "learning_rate": 7.836241022499264e-07, "loss": 0.011725466698408127, "memory(GiB)": 22.66, "step": 25501, "token_acc": 0.9948090013833532, "train_speed(iter/s)": 0.955185 }, { "epoch": 0.828444271188643, "grad_norm": 0.3549055755138397, "learning_rate": 7.833354165125611e-07, "loss": 0.013627985492348671, "memory(GiB)": 22.66, "step": 25502, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955189 }, { "epoch": 0.8284767566513985, "grad_norm": 0.42977553606033325, "learning_rate": 7.830467794411195e-07, "loss": 0.01263401284813881, "memory(GiB)": 22.66, "step": 25503, "token_acc": 1.0, "train_speed(iter/s)": 0.955193 }, { "epoch": 0.8285092421141539, "grad_norm": 0.21302028000354767, "learning_rate": 7.827581910389292e-07, "loss": 0.007347516715526581, "memory(GiB)": 22.66, "step": 25504, "token_acc": 1.0, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.8285417275769094, "grad_norm": 0.21517911553382874, "learning_rate": 7.824696513093222e-07, "loss": 0.007583684753626585, "memory(GiB)": 22.66, "step": 25505, "token_acc": 1.0, "train_speed(iter/s)": 0.955201 }, { "epoch": 0.8285742130396647, "grad_norm": 0.35809430480003357, "learning_rate": 7.821811602556295e-07, "loss": 0.014856934547424316, "memory(GiB)": 22.66, "step": 25506, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.8286066985024202, "grad_norm": 0.36836448311805725, "learning_rate": 7.818927178811813e-07, "loss": 0.008706638589501381, "memory(GiB)": 22.66, "step": 25507, "token_acc": 1.0, "train_speed(iter/s)": 0.955211 }, { "epoch": 0.8286391839651756, "grad_norm": 0.32507404685020447, "learning_rate": 7.81604324189304e-07, "loss": 0.011583186686038971, "memory(GiB)": 22.66, "step": 25508, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.828671669427931, "grad_norm": 0.20349819958209991, "learning_rate": 7.813159791833275e-07, "loss": 0.007620109245181084, "memory(GiB)": 22.66, "step": 25509, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.8287041548906864, "grad_norm": 0.359986275434494, "learning_rate": 7.810276828665797e-07, "loss": 0.008721612393856049, "memory(GiB)": 22.66, "step": 25510, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.8287366403534419, "grad_norm": 0.4107610285282135, "learning_rate": 7.807394352423886e-07, "loss": 0.010058166459202766, "memory(GiB)": 22.66, "step": 25511, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.8287691258161972, "grad_norm": 0.18202821910381317, "learning_rate": 7.804512363140798e-07, "loss": 0.008362814784049988, "memory(GiB)": 22.66, "step": 25512, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.95524 }, { "epoch": 0.8288016112789527, "grad_norm": 0.37254294753074646, "learning_rate": 7.801630860849807e-07, "loss": 0.01213257946074009, "memory(GiB)": 22.66, "step": 25513, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955247 }, { "epoch": 0.828834096741708, "grad_norm": 0.6606698632240295, "learning_rate": 7.798749845584141e-07, "loss": 0.024335063993930817, "memory(GiB)": 22.66, "step": 25514, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.955255 }, { "epoch": 0.8288665822044635, "grad_norm": 0.2968241572380066, "learning_rate": 7.795869317377092e-07, "loss": 0.008442878723144531, "memory(GiB)": 22.66, "step": 25515, "token_acc": 1.0, "train_speed(iter/s)": 0.955263 }, { "epoch": 0.8288990676672189, "grad_norm": 0.2669544816017151, "learning_rate": 7.792989276261876e-07, "loss": 0.013313785195350647, "memory(GiB)": 22.66, "step": 25516, "token_acc": 1.0, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.8289315531299744, "grad_norm": 0.3037739098072052, "learning_rate": 7.790109722271738e-07, "loss": 0.011678216978907585, "memory(GiB)": 22.66, "step": 25517, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955279 }, { "epoch": 0.8289640385927297, "grad_norm": 0.3945123255252838, "learning_rate": 7.787230655439926e-07, "loss": 0.015559249557554722, "memory(GiB)": 22.66, "step": 25518, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955287 }, { "epoch": 0.8289965240554852, "grad_norm": 0.33840230107307434, "learning_rate": 7.784352075799639e-07, "loss": 0.009103856049478054, "memory(GiB)": 22.66, "step": 25519, "token_acc": 1.0, "train_speed(iter/s)": 0.955295 }, { "epoch": 0.8290290095182405, "grad_norm": 0.32888272404670715, "learning_rate": 7.781473983384136e-07, "loss": 0.010965602472424507, "memory(GiB)": 22.66, "step": 25520, "token_acc": 1.0, "train_speed(iter/s)": 0.955303 }, { "epoch": 0.829061494980996, "grad_norm": 0.3103739023208618, "learning_rate": 7.778596378226611e-07, "loss": 0.010232335887849331, "memory(GiB)": 22.66, "step": 25521, "token_acc": 1.0, "train_speed(iter/s)": 0.955311 }, { "epoch": 0.8290939804437514, "grad_norm": 0.24695317447185516, "learning_rate": 7.775719260360288e-07, "loss": 0.0065244403667747974, "memory(GiB)": 22.66, "step": 25522, "token_acc": 1.0, "train_speed(iter/s)": 0.955318 }, { "epoch": 0.8291264659065068, "grad_norm": 0.27791836857795715, "learning_rate": 7.77284262981835e-07, "loss": 0.009394720196723938, "memory(GiB)": 22.66, "step": 25523, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955326 }, { "epoch": 0.8291589513692622, "grad_norm": 0.24089719355106354, "learning_rate": 7.769966486634018e-07, "loss": 0.010765829123556614, "memory(GiB)": 22.66, "step": 25524, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955324 }, { "epoch": 0.8291914368320177, "grad_norm": 0.304423451423645, "learning_rate": 7.767090830840474e-07, "loss": 0.017789188772439957, "memory(GiB)": 22.66, "step": 25525, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.955332 }, { "epoch": 0.829223922294773, "grad_norm": 0.3869600296020508, "learning_rate": 7.76421566247093e-07, "loss": 0.013192282058298588, "memory(GiB)": 22.66, "step": 25526, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95534 }, { "epoch": 0.8292564077575285, "grad_norm": 0.18486657738685608, "learning_rate": 7.761340981558541e-07, "loss": 0.00603182427585125, "memory(GiB)": 22.66, "step": 25527, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.8292888932202839, "grad_norm": 0.31691619753837585, "learning_rate": 7.758466788136499e-07, "loss": 0.009050073102116585, "memory(GiB)": 22.66, "step": 25528, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.8293213786830393, "grad_norm": 0.533402681350708, "learning_rate": 7.755593082237972e-07, "loss": 0.015641730278730392, "memory(GiB)": 22.66, "step": 25529, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.8293538641457947, "grad_norm": 0.5642531514167786, "learning_rate": 7.752719863896135e-07, "loss": 0.012576724402606487, "memory(GiB)": 22.66, "step": 25530, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.8293863496085502, "grad_norm": 0.31874147057533264, "learning_rate": 7.749847133144134e-07, "loss": 0.017757661640644073, "memory(GiB)": 22.66, "step": 25531, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.8294188350713055, "grad_norm": 0.3190491497516632, "learning_rate": 7.746974890015135e-07, "loss": 0.010183671489357948, "memory(GiB)": 22.66, "step": 25532, "token_acc": 1.0, "train_speed(iter/s)": 0.955386 }, { "epoch": 0.829451320534061, "grad_norm": 1.253261923789978, "learning_rate": 7.74410313454228e-07, "loss": 0.01827988773584366, "memory(GiB)": 22.66, "step": 25533, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.8294838059968164, "grad_norm": 0.3487701714038849, "learning_rate": 7.741231866758719e-07, "loss": 0.013533873483538628, "memory(GiB)": 22.66, "step": 25534, "token_acc": 1.0, "train_speed(iter/s)": 0.955402 }, { "epoch": 0.8295162914595718, "grad_norm": 0.37182578444480896, "learning_rate": 7.738361086697599e-07, "loss": 0.011066652834415436, "memory(GiB)": 22.66, "step": 25535, "token_acc": 1.0, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.8295487769223273, "grad_norm": 0.48783308267593384, "learning_rate": 7.735490794392031e-07, "loss": 0.012538176029920578, "memory(GiB)": 22.66, "step": 25536, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955418 }, { "epoch": 0.8295812623850827, "grad_norm": 0.4653934836387634, "learning_rate": 7.732620989875151e-07, "loss": 0.013955810107290745, "memory(GiB)": 22.66, "step": 25537, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.8296137478478381, "grad_norm": 0.44893720746040344, "learning_rate": 7.729751673180086e-07, "loss": 0.012169990688562393, "memory(GiB)": 22.66, "step": 25538, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.8296462333105935, "grad_norm": 0.36116155982017517, "learning_rate": 7.726882844339962e-07, "loss": 0.007937022484838963, "memory(GiB)": 22.66, "step": 25539, "token_acc": 1.0, "train_speed(iter/s)": 0.955441 }, { "epoch": 0.829678718773349, "grad_norm": 0.44843849539756775, "learning_rate": 7.724014503387862e-07, "loss": 0.01693563163280487, "memory(GiB)": 22.66, "step": 25540, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.955449 }, { "epoch": 0.8297112042361043, "grad_norm": 0.4285893440246582, "learning_rate": 7.721146650356909e-07, "loss": 0.016912657767534256, "memory(GiB)": 22.66, "step": 25541, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955456 }, { "epoch": 0.8297436896988598, "grad_norm": 0.41469526290893555, "learning_rate": 7.718279285280178e-07, "loss": 0.018841413781046867, "memory(GiB)": 22.66, "step": 25542, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.8297761751616152, "grad_norm": 0.3578869104385376, "learning_rate": 7.715412408190809e-07, "loss": 0.01035941019654274, "memory(GiB)": 22.66, "step": 25543, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.8298086606243706, "grad_norm": 0.3598513901233673, "learning_rate": 7.712546019121842e-07, "loss": 0.014004416763782501, "memory(GiB)": 22.66, "step": 25544, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95548 }, { "epoch": 0.829841146087126, "grad_norm": 0.3782432973384857, "learning_rate": 7.709680118106394e-07, "loss": 0.009686053730547428, "memory(GiB)": 22.66, "step": 25545, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955489 }, { "epoch": 0.8298736315498815, "grad_norm": 0.3505699336528778, "learning_rate": 7.706814705177512e-07, "loss": 0.01490750815719366, "memory(GiB)": 22.66, "step": 25546, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.8299061170126368, "grad_norm": 0.34023037552833557, "learning_rate": 7.703949780368269e-07, "loss": 0.01055715512484312, "memory(GiB)": 22.66, "step": 25547, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.8299386024753923, "grad_norm": 0.26384612917900085, "learning_rate": 7.701085343711767e-07, "loss": 0.012616965919733047, "memory(GiB)": 22.66, "step": 25548, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.8299710879381477, "grad_norm": 0.4039948880672455, "learning_rate": 7.698221395241029e-07, "loss": 0.01215441059321165, "memory(GiB)": 22.66, "step": 25549, "token_acc": 1.0, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.8300035734009031, "grad_norm": 0.5004504919052124, "learning_rate": 7.695357934989128e-07, "loss": 0.015463167801499367, "memory(GiB)": 22.66, "step": 25550, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.8300360588636585, "grad_norm": 0.4747275710105896, "learning_rate": 7.692494962989094e-07, "loss": 0.016498804092407227, "memory(GiB)": 22.66, "step": 25551, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.830068544326414, "grad_norm": 0.3208385109901428, "learning_rate": 7.689632479273979e-07, "loss": 0.0121681559830904, "memory(GiB)": 22.66, "step": 25552, "token_acc": 1.0, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.8301010297891693, "grad_norm": 0.292886346578598, "learning_rate": 7.686770483876821e-07, "loss": 0.009994429536163807, "memory(GiB)": 22.66, "step": 25553, "token_acc": 1.0, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.8301335152519248, "grad_norm": 0.3218699097633362, "learning_rate": 7.683908976830661e-07, "loss": 0.011516802944242954, "memory(GiB)": 22.66, "step": 25554, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.8301660007146802, "grad_norm": 0.4481041431427002, "learning_rate": 7.681047958168502e-07, "loss": 0.02222120761871338, "memory(GiB)": 22.66, "step": 25555, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.8301984861774356, "grad_norm": 0.24179771542549133, "learning_rate": 7.678187427923378e-07, "loss": 0.0078039150685071945, "memory(GiB)": 22.66, "step": 25556, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.830230971640191, "grad_norm": 0.3339495360851288, "learning_rate": 7.675327386128301e-07, "loss": 0.010677680373191833, "memory(GiB)": 22.66, "step": 25557, "token_acc": 1.0, "train_speed(iter/s)": 0.955561 }, { "epoch": 0.8302634571029465, "grad_norm": 0.3583541214466095, "learning_rate": 7.672467832816288e-07, "loss": 0.024114716798067093, "memory(GiB)": 22.66, "step": 25558, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955567 }, { "epoch": 0.8302959425657018, "grad_norm": 0.3272097408771515, "learning_rate": 7.669608768020326e-07, "loss": 0.010821975767612457, "memory(GiB)": 22.66, "step": 25559, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.8303284280284573, "grad_norm": 0.5057342648506165, "learning_rate": 7.66675019177342e-07, "loss": 0.01570628583431244, "memory(GiB)": 22.66, "step": 25560, "token_acc": 1.0, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.8303609134912127, "grad_norm": 0.3511147201061249, "learning_rate": 7.663892104108561e-07, "loss": 0.014318243600428104, "memory(GiB)": 22.66, "step": 25561, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955583 }, { "epoch": 0.8303933989539681, "grad_norm": 0.619670569896698, "learning_rate": 7.661034505058735e-07, "loss": 0.012094054371118546, "memory(GiB)": 22.66, "step": 25562, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.955589 }, { "epoch": 0.8304258844167235, "grad_norm": 0.25790277123451233, "learning_rate": 7.658177394656941e-07, "loss": 0.007259208243340254, "memory(GiB)": 22.66, "step": 25563, "token_acc": 1.0, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.830458369879479, "grad_norm": 0.3462222218513489, "learning_rate": 7.65532077293612e-07, "loss": 0.01405705139040947, "memory(GiB)": 22.66, "step": 25564, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.8304908553422343, "grad_norm": 0.3604087829589844, "learning_rate": 7.652464639929258e-07, "loss": 0.013649356551468372, "memory(GiB)": 22.66, "step": 25565, "token_acc": 1.0, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.8305233408049898, "grad_norm": 0.5028094053268433, "learning_rate": 7.649608995669322e-07, "loss": 0.015367621555924416, "memory(GiB)": 22.66, "step": 25566, "token_acc": 1.0, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.8305558262677452, "grad_norm": 0.36969736218452454, "learning_rate": 7.646753840189275e-07, "loss": 0.0086824344471097, "memory(GiB)": 22.66, "step": 25567, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.8305883117305006, "grad_norm": 0.3191159963607788, "learning_rate": 7.643899173522052e-07, "loss": 0.01207398995757103, "memory(GiB)": 22.66, "step": 25568, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.830620797193256, "grad_norm": 0.20620712637901306, "learning_rate": 7.641044995700614e-07, "loss": 0.00888497568666935, "memory(GiB)": 22.66, "step": 25569, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.8306532826560115, "grad_norm": 0.4110415577888489, "learning_rate": 7.63819130675788e-07, "loss": 0.01376556046307087, "memory(GiB)": 22.66, "step": 25570, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.8306857681187668, "grad_norm": 0.2819347083568573, "learning_rate": 7.635338106726819e-07, "loss": 0.01450582779943943, "memory(GiB)": 22.66, "step": 25571, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95564 }, { "epoch": 0.8307182535815223, "grad_norm": 0.42156782746315, "learning_rate": 7.632485395640338e-07, "loss": 0.014653165824711323, "memory(GiB)": 22.66, "step": 25572, "token_acc": 1.0, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.8307507390442777, "grad_norm": 0.36613062024116516, "learning_rate": 7.629633173531375e-07, "loss": 0.0085678081959486, "memory(GiB)": 22.66, "step": 25573, "token_acc": 1.0, "train_speed(iter/s)": 0.955652 }, { "epoch": 0.8307832245070331, "grad_norm": 0.4035020172595978, "learning_rate": 7.626781440432829e-07, "loss": 0.013028411194682121, "memory(GiB)": 22.66, "step": 25574, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.8308157099697885, "grad_norm": 0.390543669462204, "learning_rate": 7.623930196377621e-07, "loss": 0.01542302593588829, "memory(GiB)": 22.66, "step": 25575, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.830848195432544, "grad_norm": 0.4544176459312439, "learning_rate": 7.621079441398665e-07, "loss": 0.01548305619508028, "memory(GiB)": 22.66, "step": 25576, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.8308806808952993, "grad_norm": 0.3491230607032776, "learning_rate": 7.618229175528869e-07, "loss": 0.009035972878336906, "memory(GiB)": 22.66, "step": 25577, "token_acc": 1.0, "train_speed(iter/s)": 0.955679 }, { "epoch": 0.8309131663580548, "grad_norm": 0.382394939661026, "learning_rate": 7.615379398801103e-07, "loss": 0.015082435682415962, "memory(GiB)": 22.66, "step": 25578, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.8309456518208101, "grad_norm": 0.4901717007160187, "learning_rate": 7.612530111248278e-07, "loss": 0.011026512831449509, "memory(GiB)": 22.66, "step": 25579, "token_acc": 1.0, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.8309781372835656, "grad_norm": 0.2561057507991791, "learning_rate": 7.609681312903272e-07, "loss": 0.00823111180216074, "memory(GiB)": 22.66, "step": 25580, "token_acc": 1.0, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.831010622746321, "grad_norm": 0.400297611951828, "learning_rate": 7.606833003798964e-07, "loss": 0.013659929856657982, "memory(GiB)": 22.66, "step": 25581, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.8310431082090765, "grad_norm": 0.411340594291687, "learning_rate": 7.603985183968238e-07, "loss": 0.016010530292987823, "memory(GiB)": 22.66, "step": 25582, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.8310755936718318, "grad_norm": 0.3070327937602997, "learning_rate": 7.60113785344394e-07, "loss": 0.013147398829460144, "memory(GiB)": 22.66, "step": 25583, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.955727 }, { "epoch": 0.8311080791345873, "grad_norm": 0.37510356307029724, "learning_rate": 7.598291012258946e-07, "loss": 0.015660367906093597, "memory(GiB)": 22.66, "step": 25584, "token_acc": 1.0, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.8311405645973426, "grad_norm": 0.31741297245025635, "learning_rate": 7.59544466044611e-07, "loss": 0.008138964883983135, "memory(GiB)": 22.66, "step": 25585, "token_acc": 1.0, "train_speed(iter/s)": 0.955743 }, { "epoch": 0.8311730500600981, "grad_norm": 0.325130432844162, "learning_rate": 7.592598798038297e-07, "loss": 0.01576261967420578, "memory(GiB)": 22.66, "step": 25586, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.8312055355228535, "grad_norm": 0.4436863660812378, "learning_rate": 7.589753425068325e-07, "loss": 0.012113897129893303, "memory(GiB)": 22.66, "step": 25587, "token_acc": 1.0, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.831238020985609, "grad_norm": 0.40472114086151123, "learning_rate": 7.586908541569044e-07, "loss": 0.018955763429403305, "memory(GiB)": 22.66, "step": 25588, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.8312705064483643, "grad_norm": 0.4642159640789032, "learning_rate": 7.584064147573295e-07, "loss": 0.017234664410352707, "memory(GiB)": 22.66, "step": 25589, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.8313029919111198, "grad_norm": 0.38787880539894104, "learning_rate": 7.581220243113908e-07, "loss": 0.01906481571495533, "memory(GiB)": 22.66, "step": 25590, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.8313354773738751, "grad_norm": 0.35802266001701355, "learning_rate": 7.578376828223688e-07, "loss": 0.01588643714785576, "memory(GiB)": 22.66, "step": 25591, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955791 }, { "epoch": 0.8313679628366306, "grad_norm": 0.4084358811378479, "learning_rate": 7.575533902935473e-07, "loss": 0.015082290396094322, "memory(GiB)": 22.66, "step": 25592, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.831400448299386, "grad_norm": 0.28182128071784973, "learning_rate": 7.572691467282045e-07, "loss": 0.008965566754341125, "memory(GiB)": 22.66, "step": 25593, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.8314329337621414, "grad_norm": 0.40357130765914917, "learning_rate": 7.569849521296235e-07, "loss": 0.01228305697441101, "memory(GiB)": 22.66, "step": 25594, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.8314654192248968, "grad_norm": 0.19953882694244385, "learning_rate": 7.567008065010851e-07, "loss": 0.0066376663744449615, "memory(GiB)": 22.66, "step": 25595, "token_acc": 1.0, "train_speed(iter/s)": 0.955823 }, { "epoch": 0.8314979046876523, "grad_norm": 0.39785459637641907, "learning_rate": 7.564167098458658e-07, "loss": 0.014072977006435394, "memory(GiB)": 22.66, "step": 25596, "token_acc": 1.0, "train_speed(iter/s)": 0.955831 }, { "epoch": 0.8315303901504076, "grad_norm": 0.36461082100868225, "learning_rate": 7.561326621672471e-07, "loss": 0.011496685445308685, "memory(GiB)": 22.66, "step": 25597, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.8315628756131631, "grad_norm": 0.4030500650405884, "learning_rate": 7.558486634685036e-07, "loss": 0.012186296284198761, "memory(GiB)": 22.66, "step": 25598, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.8315953610759185, "grad_norm": 0.23414312303066254, "learning_rate": 7.555647137529182e-07, "loss": 0.010535555891692638, "memory(GiB)": 22.66, "step": 25599, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955854 }, { "epoch": 0.8316278465386739, "grad_norm": 0.27435070276260376, "learning_rate": 7.552808130237638e-07, "loss": 0.013070790097117424, "memory(GiB)": 22.66, "step": 25600, "token_acc": 1.0, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.8316603320014294, "grad_norm": 0.29242849349975586, "learning_rate": 7.549969612843194e-07, "loss": 0.008250527083873749, "memory(GiB)": 22.66, "step": 25601, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.8316928174641848, "grad_norm": 0.4292454421520233, "learning_rate": 7.547131585378597e-07, "loss": 0.011492841877043247, "memory(GiB)": 22.66, "step": 25602, "token_acc": 1.0, "train_speed(iter/s)": 0.955878 }, { "epoch": 0.8317253029269402, "grad_norm": 0.4360997974872589, "learning_rate": 7.544294047876605e-07, "loss": 0.013539924286305904, "memory(GiB)": 22.66, "step": 25603, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.8317577883896956, "grad_norm": 0.39269718527793884, "learning_rate": 7.541457000369962e-07, "loss": 0.012923907488584518, "memory(GiB)": 22.66, "step": 25604, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955894 }, { "epoch": 0.8317902738524511, "grad_norm": 0.32319390773773193, "learning_rate": 7.538620442891436e-07, "loss": 0.01092899776995182, "memory(GiB)": 22.66, "step": 25605, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955901 }, { "epoch": 0.8318227593152064, "grad_norm": 0.33268415927886963, "learning_rate": 7.535784375473731e-07, "loss": 0.01122825313359499, "memory(GiB)": 22.66, "step": 25606, "token_acc": 1.0, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.8318552447779619, "grad_norm": 0.3370126187801361, "learning_rate": 7.532948798149592e-07, "loss": 0.015771936625242233, "memory(GiB)": 22.66, "step": 25607, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955917 }, { "epoch": 0.8318877302407173, "grad_norm": 0.37703031301498413, "learning_rate": 7.530113710951753e-07, "loss": 0.013503185473382473, "memory(GiB)": 22.66, "step": 25608, "token_acc": 1.0, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.8319202157034727, "grad_norm": 0.3297976851463318, "learning_rate": 7.527279113912922e-07, "loss": 0.009575096890330315, "memory(GiB)": 22.66, "step": 25609, "token_acc": 1.0, "train_speed(iter/s)": 0.95593 }, { "epoch": 0.8319527011662281, "grad_norm": 0.28072717785835266, "learning_rate": 7.524445007065839e-07, "loss": 0.007367909885942936, "memory(GiB)": 22.66, "step": 25610, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.8319851866289836, "grad_norm": 0.28798508644104004, "learning_rate": 7.521611390443184e-07, "loss": 0.015007270500063896, "memory(GiB)": 22.66, "step": 25611, "token_acc": 1.0, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.8320176720917389, "grad_norm": 0.3584453761577606, "learning_rate": 7.518778264077669e-07, "loss": 0.018054883927106857, "memory(GiB)": 22.66, "step": 25612, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.8320501575544944, "grad_norm": 0.4055600166320801, "learning_rate": 7.515945628001998e-07, "loss": 0.019637977704405785, "memory(GiB)": 22.66, "step": 25613, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955952 }, { "epoch": 0.8320826430172498, "grad_norm": 0.29522109031677246, "learning_rate": 7.513113482248868e-07, "loss": 0.012022237293422222, "memory(GiB)": 22.66, "step": 25614, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955958 }, { "epoch": 0.8321151284800052, "grad_norm": 0.2250281423330307, "learning_rate": 7.510281826850946e-07, "loss": 0.0050535425543785095, "memory(GiB)": 22.66, "step": 25615, "token_acc": 1.0, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.8321476139427606, "grad_norm": 0.2689094841480255, "learning_rate": 7.507450661840932e-07, "loss": 0.011536680161952972, "memory(GiB)": 22.66, "step": 25616, "token_acc": 1.0, "train_speed(iter/s)": 0.95597 }, { "epoch": 0.8321800994055161, "grad_norm": 0.3724392354488373, "learning_rate": 7.504619987251488e-07, "loss": 0.013181282207369804, "memory(GiB)": 22.66, "step": 25617, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.8322125848682714, "grad_norm": 0.4563109278678894, "learning_rate": 7.501789803115306e-07, "loss": 0.008878640830516815, "memory(GiB)": 22.66, "step": 25618, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955982 }, { "epoch": 0.8322450703310269, "grad_norm": 0.3005000352859497, "learning_rate": 7.49896010946502e-07, "loss": 0.007649296894669533, "memory(GiB)": 22.66, "step": 25619, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.8322775557937823, "grad_norm": 0.2734261155128479, "learning_rate": 7.496130906333321e-07, "loss": 0.011188813485205173, "memory(GiB)": 22.66, "step": 25620, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.8323100412565377, "grad_norm": 0.3757229149341583, "learning_rate": 7.493302193752817e-07, "loss": 0.008554171770811081, "memory(GiB)": 22.66, "step": 25621, "token_acc": 1.0, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.8323425267192931, "grad_norm": 0.4642147123813629, "learning_rate": 7.490473971756201e-07, "loss": 0.012001929804682732, "memory(GiB)": 22.66, "step": 25622, "token_acc": 1.0, "train_speed(iter/s)": 0.956003 }, { "epoch": 0.8323750121820486, "grad_norm": 0.339400976896286, "learning_rate": 7.487646240376089e-07, "loss": 0.012835142202675343, "memory(GiB)": 22.66, "step": 25623, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956009 }, { "epoch": 0.8324074976448039, "grad_norm": 0.2979773283004761, "learning_rate": 7.484818999645122e-07, "loss": 0.010396593250334263, "memory(GiB)": 22.66, "step": 25624, "token_acc": 1.0, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.8324399831075594, "grad_norm": 0.2657170593738556, "learning_rate": 7.481992249595943e-07, "loss": 0.012057885527610779, "memory(GiB)": 22.66, "step": 25625, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.8324724685703148, "grad_norm": 0.39722275733947754, "learning_rate": 7.479165990261145e-07, "loss": 0.015390537679195404, "memory(GiB)": 22.66, "step": 25626, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.8325049540330702, "grad_norm": 0.41423895955085754, "learning_rate": 7.476340221673384e-07, "loss": 0.014634359627962112, "memory(GiB)": 22.66, "step": 25627, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.8325374394958256, "grad_norm": 0.3381001055240631, "learning_rate": 7.473514943865246e-07, "loss": 0.010647343471646309, "memory(GiB)": 22.66, "step": 25628, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.8325699249585811, "grad_norm": 0.3482207655906677, "learning_rate": 7.470690156869359e-07, "loss": 0.011388810351490974, "memory(GiB)": 22.66, "step": 25629, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.8326024104213364, "grad_norm": 0.19494235515594482, "learning_rate": 7.467865860718293e-07, "loss": 0.004317770712077618, "memory(GiB)": 22.66, "step": 25630, "token_acc": 1.0, "train_speed(iter/s)": 0.956046 }, { "epoch": 0.8326348958840919, "grad_norm": 0.33109602332115173, "learning_rate": 7.465042055444688e-07, "loss": 0.013027092441916466, "memory(GiB)": 22.66, "step": 25631, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.8326673813468473, "grad_norm": 0.3640482425689697, "learning_rate": 7.462218741081096e-07, "loss": 0.01566864736378193, "memory(GiB)": 22.66, "step": 25632, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.8326998668096027, "grad_norm": 0.30568721890449524, "learning_rate": 7.459395917660134e-07, "loss": 0.010814795270562172, "memory(GiB)": 22.66, "step": 25633, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956062 }, { "epoch": 0.8327323522723581, "grad_norm": 0.21651405096054077, "learning_rate": 7.45657358521435e-07, "loss": 0.007743490859866142, "memory(GiB)": 22.66, "step": 25634, "token_acc": 1.0, "train_speed(iter/s)": 0.956067 }, { "epoch": 0.8327648377351136, "grad_norm": 0.32992562651634216, "learning_rate": 7.453751743776333e-07, "loss": 0.011851992458105087, "memory(GiB)": 22.66, "step": 25635, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.956071 }, { "epoch": 0.8327973231978689, "grad_norm": 0.327543705701828, "learning_rate": 7.450930393378653e-07, "loss": 0.016770901158452034, "memory(GiB)": 22.66, "step": 25636, "token_acc": 1.0, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.8328298086606244, "grad_norm": 0.2902670204639435, "learning_rate": 7.448109534053877e-07, "loss": 0.008565163239836693, "memory(GiB)": 22.66, "step": 25637, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.8328622941233798, "grad_norm": 0.4188113510608673, "learning_rate": 7.445289165834541e-07, "loss": 0.015101081691682339, "memory(GiB)": 22.66, "step": 25638, "token_acc": 0.9933110367892977, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.8328947795861352, "grad_norm": 0.4603644013404846, "learning_rate": 7.442469288753207e-07, "loss": 0.014377595856785774, "memory(GiB)": 22.66, "step": 25639, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956093 }, { "epoch": 0.8329272650488906, "grad_norm": 0.4692239761352539, "learning_rate": 7.439649902842427e-07, "loss": 0.01665189489722252, "memory(GiB)": 22.66, "step": 25640, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956098 }, { "epoch": 0.8329597505116461, "grad_norm": 0.5447942018508911, "learning_rate": 7.436831008134732e-07, "loss": 0.015018987469375134, "memory(GiB)": 22.66, "step": 25641, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.8329922359744014, "grad_norm": 0.43373072147369385, "learning_rate": 7.434012604662666e-07, "loss": 0.017837705090641975, "memory(GiB)": 22.66, "step": 25642, "token_acc": 1.0, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.8330247214371569, "grad_norm": 0.20022302865982056, "learning_rate": 7.431194692458743e-07, "loss": 0.008422216400504112, "memory(GiB)": 22.66, "step": 25643, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.8330572068999123, "grad_norm": 0.25556010007858276, "learning_rate": 7.428377271555492e-07, "loss": 0.00690656341612339, "memory(GiB)": 22.66, "step": 25644, "token_acc": 1.0, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.8330896923626677, "grad_norm": 0.3609062135219574, "learning_rate": 7.425560341985427e-07, "loss": 0.013559581711888313, "memory(GiB)": 22.66, "step": 25645, "token_acc": 1.0, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.8331221778254231, "grad_norm": 0.3853815495967865, "learning_rate": 7.422743903781077e-07, "loss": 0.014748871326446533, "memory(GiB)": 22.66, "step": 25646, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956134 }, { "epoch": 0.8331546632881786, "grad_norm": 0.37534886598587036, "learning_rate": 7.419927956974921e-07, "loss": 0.016624752432107925, "memory(GiB)": 22.66, "step": 25647, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.95614 }, { "epoch": 0.8331871487509339, "grad_norm": 0.3577054738998413, "learning_rate": 7.417112501599471e-07, "loss": 0.011457236483693123, "memory(GiB)": 22.66, "step": 25648, "token_acc": 1.0, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.8332196342136894, "grad_norm": 0.2382475733757019, "learning_rate": 7.414297537687215e-07, "loss": 0.01195642352104187, "memory(GiB)": 22.66, "step": 25649, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956156 }, { "epoch": 0.8332521196764447, "grad_norm": 0.32592371106147766, "learning_rate": 7.411483065270663e-07, "loss": 0.007628855295479298, "memory(GiB)": 22.66, "step": 25650, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956164 }, { "epoch": 0.8332846051392002, "grad_norm": 0.35758838057518005, "learning_rate": 7.40866908438227e-07, "loss": 0.01372618693858385, "memory(GiB)": 22.66, "step": 25651, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.8333170906019556, "grad_norm": 0.36636242270469666, "learning_rate": 7.405855595054534e-07, "loss": 0.010300097987055779, "memory(GiB)": 22.66, "step": 25652, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956179 }, { "epoch": 0.833349576064711, "grad_norm": 0.39987942576408386, "learning_rate": 7.403042597319898e-07, "loss": 0.01340891420841217, "memory(GiB)": 22.66, "step": 25653, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.8333820615274664, "grad_norm": 0.3480232357978821, "learning_rate": 7.400230091210864e-07, "loss": 0.011490419507026672, "memory(GiB)": 22.66, "step": 25654, "token_acc": 1.0, "train_speed(iter/s)": 0.956194 }, { "epoch": 0.8334145469902219, "grad_norm": 0.5122043490409851, "learning_rate": 7.397418076759866e-07, "loss": 0.015292550437152386, "memory(GiB)": 22.66, "step": 25655, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956202 }, { "epoch": 0.8334470324529772, "grad_norm": 0.3884831666946411, "learning_rate": 7.394606553999373e-07, "loss": 0.012847593054175377, "memory(GiB)": 22.66, "step": 25656, "token_acc": 1.0, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.8334795179157327, "grad_norm": 0.2342352420091629, "learning_rate": 7.391795522961831e-07, "loss": 0.00966107752174139, "memory(GiB)": 22.66, "step": 25657, "token_acc": 1.0, "train_speed(iter/s)": 0.956218 }, { "epoch": 0.8335120033784881, "grad_norm": 0.3645528256893158, "learning_rate": 7.388984983679659e-07, "loss": 0.013154283165931702, "memory(GiB)": 22.66, "step": 25658, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.8335444888412435, "grad_norm": 0.355785071849823, "learning_rate": 7.386174936185342e-07, "loss": 0.008914941921830177, "memory(GiB)": 22.66, "step": 25659, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.8335769743039989, "grad_norm": 0.3466227054595947, "learning_rate": 7.383365380511271e-07, "loss": 0.013839678838849068, "memory(GiB)": 22.66, "step": 25660, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.8336094597667544, "grad_norm": 0.3442785441875458, "learning_rate": 7.380556316689897e-07, "loss": 0.012283464893698692, "memory(GiB)": 22.66, "step": 25661, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956249 }, { "epoch": 0.8336419452295097, "grad_norm": 0.38869211077690125, "learning_rate": 7.377747744753621e-07, "loss": 0.013758691027760506, "memory(GiB)": 22.66, "step": 25662, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956257 }, { "epoch": 0.8336744306922652, "grad_norm": 0.20116522908210754, "learning_rate": 7.374939664734865e-07, "loss": 0.00717287277802825, "memory(GiB)": 22.66, "step": 25663, "token_acc": 1.0, "train_speed(iter/s)": 0.956265 }, { "epoch": 0.8337069161550207, "grad_norm": 0.34299737215042114, "learning_rate": 7.372132076666039e-07, "loss": 0.007454933598637581, "memory(GiB)": 22.66, "step": 25664, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956272 }, { "epoch": 0.833739401617776, "grad_norm": 0.34676843881607056, "learning_rate": 7.369324980579556e-07, "loss": 0.008710629306733608, "memory(GiB)": 22.66, "step": 25665, "token_acc": 1.0, "train_speed(iter/s)": 0.95628 }, { "epoch": 0.8337718870805315, "grad_norm": 0.3204548954963684, "learning_rate": 7.366518376507792e-07, "loss": 0.012166492640972137, "memory(GiB)": 22.66, "step": 25666, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956288 }, { "epoch": 0.8338043725432869, "grad_norm": 0.2775043249130249, "learning_rate": 7.363712264483152e-07, "loss": 0.007255101576447487, "memory(GiB)": 22.66, "step": 25667, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.8338368580060423, "grad_norm": 0.438338965177536, "learning_rate": 7.360906644538024e-07, "loss": 0.013388559222221375, "memory(GiB)": 22.66, "step": 25668, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.8338693434687977, "grad_norm": 0.21596527099609375, "learning_rate": 7.358101516704796e-07, "loss": 0.006938048172742128, "memory(GiB)": 22.66, "step": 25669, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956311 }, { "epoch": 0.8339018289315532, "grad_norm": 0.3342520296573639, "learning_rate": 7.355296881015822e-07, "loss": 0.010867234319448471, "memory(GiB)": 22.66, "step": 25670, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.8339343143943085, "grad_norm": 0.36885735392570496, "learning_rate": 7.35249273750348e-07, "loss": 0.019838878884911537, "memory(GiB)": 22.66, "step": 25671, "token_acc": 1.0, "train_speed(iter/s)": 0.956327 }, { "epoch": 0.833966799857064, "grad_norm": 0.4752058684825897, "learning_rate": 7.349689086200134e-07, "loss": 0.01446507591754198, "memory(GiB)": 22.66, "step": 25672, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956335 }, { "epoch": 0.8339992853198194, "grad_norm": 0.38676702976226807, "learning_rate": 7.346885927138147e-07, "loss": 0.012197965756058693, "memory(GiB)": 22.66, "step": 25673, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956343 }, { "epoch": 0.8340317707825748, "grad_norm": 0.224125936627388, "learning_rate": 7.344083260349877e-07, "loss": 0.008438081480562687, "memory(GiB)": 22.66, "step": 25674, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.8340642562453302, "grad_norm": 0.34262141585350037, "learning_rate": 7.341281085867646e-07, "loss": 0.00978896301239729, "memory(GiB)": 22.66, "step": 25675, "token_acc": 0.981651376146789, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.8340967417080857, "grad_norm": 0.3346044719219208, "learning_rate": 7.33847940372382e-07, "loss": 0.009901942685246468, "memory(GiB)": 22.66, "step": 25676, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.834129227170841, "grad_norm": 0.2825501561164856, "learning_rate": 7.335678213950714e-07, "loss": 0.013755694963037968, "memory(GiB)": 22.66, "step": 25677, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.8341617126335965, "grad_norm": 0.3938986361026764, "learning_rate": 7.332877516580683e-07, "loss": 0.009777408093214035, "memory(GiB)": 22.66, "step": 25678, "token_acc": 1.0, "train_speed(iter/s)": 0.956381 }, { "epoch": 0.8341941980963519, "grad_norm": 0.33743807673454285, "learning_rate": 7.330077311646022e-07, "loss": 0.00953914038836956, "memory(GiB)": 22.66, "step": 25679, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.8342266835591073, "grad_norm": 0.281354695558548, "learning_rate": 7.327277599179078e-07, "loss": 0.011209813877940178, "memory(GiB)": 22.66, "step": 25680, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.8342591690218627, "grad_norm": 0.35544145107269287, "learning_rate": 7.324478379212125e-07, "loss": 0.009637783281505108, "memory(GiB)": 22.66, "step": 25681, "token_acc": 1.0, "train_speed(iter/s)": 0.956404 }, { "epoch": 0.8342916544846182, "grad_norm": 0.2952708899974823, "learning_rate": 7.321679651777508e-07, "loss": 0.010461400263011456, "memory(GiB)": 22.66, "step": 25682, "token_acc": 1.0, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.8343241399473735, "grad_norm": 0.34465596079826355, "learning_rate": 7.318881416907508e-07, "loss": 0.008043872192502022, "memory(GiB)": 22.66, "step": 25683, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.834356625410129, "grad_norm": 0.3638538420200348, "learning_rate": 7.316083674634434e-07, "loss": 0.01441539078950882, "memory(GiB)": 22.66, "step": 25684, "token_acc": 1.0, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.8343891108728844, "grad_norm": 0.4128398299217224, "learning_rate": 7.313286424990556e-07, "loss": 0.01662192866206169, "memory(GiB)": 22.66, "step": 25685, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.8344215963356398, "grad_norm": 0.40081876516342163, "learning_rate": 7.310489668008158e-07, "loss": 0.014580175280570984, "memory(GiB)": 22.66, "step": 25686, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.8344540817983952, "grad_norm": 0.3053388297557831, "learning_rate": 7.307693403719552e-07, "loss": 0.012294318526983261, "memory(GiB)": 22.66, "step": 25687, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.8344865672611507, "grad_norm": 0.28061357140541077, "learning_rate": 7.304897632156971e-07, "loss": 0.008576495572924614, "memory(GiB)": 22.66, "step": 25688, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.834519052723906, "grad_norm": 0.34843921661376953, "learning_rate": 7.302102353352714e-07, "loss": 0.01038181409239769, "memory(GiB)": 22.66, "step": 25689, "token_acc": 1.0, "train_speed(iter/s)": 0.956451 }, { "epoch": 0.8345515381866615, "grad_norm": 0.456211656332016, "learning_rate": 7.299307567339015e-07, "loss": 0.016035709530115128, "memory(GiB)": 22.66, "step": 25690, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.8345840236494169, "grad_norm": 0.26891520619392395, "learning_rate": 7.296513274148137e-07, "loss": 0.014774519950151443, "memory(GiB)": 22.66, "step": 25691, "token_acc": 1.0, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.8346165091121723, "grad_norm": 0.35426008701324463, "learning_rate": 7.293719473812338e-07, "loss": 0.014278148300945759, "memory(GiB)": 22.66, "step": 25692, "token_acc": 1.0, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.8346489945749277, "grad_norm": 0.32101869583129883, "learning_rate": 7.290926166363871e-07, "loss": 0.010086854919791222, "memory(GiB)": 22.66, "step": 25693, "token_acc": 1.0, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.8346814800376832, "grad_norm": 0.4185769259929657, "learning_rate": 7.288133351834948e-07, "loss": 0.009029319509863853, "memory(GiB)": 22.66, "step": 25694, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.8347139655004385, "grad_norm": 0.24930927157402039, "learning_rate": 7.285341030257814e-07, "loss": 0.009765636175870895, "memory(GiB)": 22.66, "step": 25695, "token_acc": 1.0, "train_speed(iter/s)": 0.956483 }, { "epoch": 0.834746450963194, "grad_norm": 1.2186009883880615, "learning_rate": 7.282549201664702e-07, "loss": 0.013081083074212074, "memory(GiB)": 22.66, "step": 25696, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956488 }, { "epoch": 0.8347789364259494, "grad_norm": 0.49536123871803284, "learning_rate": 7.279757866087833e-07, "loss": 0.01064478512853384, "memory(GiB)": 22.66, "step": 25697, "token_acc": 1.0, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.8348114218887048, "grad_norm": 0.4467340409755707, "learning_rate": 7.276967023559417e-07, "loss": 0.0102302934974432, "memory(GiB)": 22.66, "step": 25698, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.8348439073514602, "grad_norm": 0.33492326736450195, "learning_rate": 7.27417667411166e-07, "loss": 0.00974468793720007, "memory(GiB)": 22.66, "step": 25699, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956504 }, { "epoch": 0.8348763928142157, "grad_norm": 0.7971648573875427, "learning_rate": 7.271386817776771e-07, "loss": 0.008449128828942776, "memory(GiB)": 22.66, "step": 25700, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.834908878276971, "grad_norm": 0.2959655821323395, "learning_rate": 7.268597454586951e-07, "loss": 0.01410038024187088, "memory(GiB)": 22.66, "step": 25701, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.8349413637397265, "grad_norm": 0.4870738089084625, "learning_rate": 7.265808584574408e-07, "loss": 0.01648861914873123, "memory(GiB)": 22.66, "step": 25702, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956519 }, { "epoch": 0.8349738492024819, "grad_norm": 0.3973841965198517, "learning_rate": 7.263020207771293e-07, "loss": 0.015735607594251633, "memory(GiB)": 22.66, "step": 25703, "token_acc": 1.0, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.8350063346652373, "grad_norm": 0.29958727955818176, "learning_rate": 7.260232324209815e-07, "loss": 0.010921215638518333, "memory(GiB)": 22.66, "step": 25704, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.8350388201279927, "grad_norm": 0.3725675642490387, "learning_rate": 7.257444933922137e-07, "loss": 0.011462662369012833, "memory(GiB)": 22.66, "step": 25705, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956536 }, { "epoch": 0.8350713055907482, "grad_norm": 0.3872505724430084, "learning_rate": 7.254658036940448e-07, "loss": 0.015487182885408401, "memory(GiB)": 22.66, "step": 25706, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.956544 }, { "epoch": 0.8351037910535035, "grad_norm": 0.29453811049461365, "learning_rate": 7.251871633296892e-07, "loss": 0.016921207308769226, "memory(GiB)": 22.66, "step": 25707, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.835136276516259, "grad_norm": 0.4621226489543915, "learning_rate": 7.249085723023641e-07, "loss": 0.018661197274923325, "memory(GiB)": 22.66, "step": 25708, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.8351687619790144, "grad_norm": 0.7163135409355164, "learning_rate": 7.24630030615282e-07, "loss": 0.011025946587324142, "memory(GiB)": 22.66, "step": 25709, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.8352012474417698, "grad_norm": 0.3788856565952301, "learning_rate": 7.243515382716626e-07, "loss": 0.009009309113025665, "memory(GiB)": 22.66, "step": 25710, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.8352337329045252, "grad_norm": 0.2508338987827301, "learning_rate": 7.240730952747154e-07, "loss": 0.007693071383982897, "memory(GiB)": 22.66, "step": 25711, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.8352662183672807, "grad_norm": 0.2967250645160675, "learning_rate": 7.237947016276576e-07, "loss": 0.011107636615633965, "memory(GiB)": 22.66, "step": 25712, "token_acc": 1.0, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.835298703830036, "grad_norm": 0.4557745158672333, "learning_rate": 7.235163573336995e-07, "loss": 0.012214445509016514, "memory(GiB)": 22.66, "step": 25713, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.8353311892927915, "grad_norm": 0.22999690473079681, "learning_rate": 7.232380623960544e-07, "loss": 0.008625692687928677, "memory(GiB)": 22.66, "step": 25714, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.8353636747555468, "grad_norm": 0.22322583198547363, "learning_rate": 7.229598168179347e-07, "loss": 0.00763353006914258, "memory(GiB)": 22.66, "step": 25715, "token_acc": 1.0, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.8353961602183023, "grad_norm": 0.2870516777038574, "learning_rate": 7.226816206025522e-07, "loss": 0.00928441621363163, "memory(GiB)": 22.66, "step": 25716, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956621 }, { "epoch": 0.8354286456810577, "grad_norm": 0.3291882574558258, "learning_rate": 7.22403473753116e-07, "loss": 0.01732069067656994, "memory(GiB)": 22.66, "step": 25717, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956629 }, { "epoch": 0.8354611311438132, "grad_norm": 0.26784029603004456, "learning_rate": 7.221253762728369e-07, "loss": 0.009565320797264576, "memory(GiB)": 22.66, "step": 25718, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.8354936166065685, "grad_norm": 0.4186142086982727, "learning_rate": 7.218473281649247e-07, "loss": 0.012672008946537971, "memory(GiB)": 22.66, "step": 25719, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.835526102069324, "grad_norm": 0.37875765562057495, "learning_rate": 7.215693294325893e-07, "loss": 0.012370629236102104, "memory(GiB)": 22.66, "step": 25720, "token_acc": 0.996, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.8355585875320793, "grad_norm": 0.5665546655654907, "learning_rate": 7.212913800790389e-07, "loss": 0.021025601774454117, "memory(GiB)": 22.66, "step": 25721, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.95666 }, { "epoch": 0.8355910729948348, "grad_norm": 0.27293384075164795, "learning_rate": 7.210134801074797e-07, "loss": 0.009012795984745026, "memory(GiB)": 22.66, "step": 25722, "token_acc": 1.0, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.8356235584575902, "grad_norm": 0.4961337149143219, "learning_rate": 7.207356295211204e-07, "loss": 0.013216255232691765, "memory(GiB)": 22.66, "step": 25723, "token_acc": 1.0, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.8356560439203456, "grad_norm": 0.37584662437438965, "learning_rate": 7.204578283231678e-07, "loss": 0.012065013870596886, "memory(GiB)": 22.66, "step": 25724, "token_acc": 1.0, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.835688529383101, "grad_norm": 0.6540655493736267, "learning_rate": 7.201800765168293e-07, "loss": 0.011554569005966187, "memory(GiB)": 22.66, "step": 25725, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.8357210148458565, "grad_norm": 0.3771311640739441, "learning_rate": 7.199023741053074e-07, "loss": 0.012856598943471909, "memory(GiB)": 22.66, "step": 25726, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.8357535003086118, "grad_norm": 0.3372993767261505, "learning_rate": 7.196247210918094e-07, "loss": 0.008864907547831535, "memory(GiB)": 22.66, "step": 25727, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.8357859857713673, "grad_norm": 0.25114262104034424, "learning_rate": 7.193471174795391e-07, "loss": 0.008532170206308365, "memory(GiB)": 22.66, "step": 25728, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.8358184712341228, "grad_norm": 0.2833173871040344, "learning_rate": 7.190695632717015e-07, "loss": 0.010168032720685005, "memory(GiB)": 22.66, "step": 25729, "token_acc": 1.0, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.8358509566968781, "grad_norm": 0.43255195021629333, "learning_rate": 7.18792058471498e-07, "loss": 0.016590459272265434, "memory(GiB)": 22.66, "step": 25730, "token_acc": 1.0, "train_speed(iter/s)": 0.956731 }, { "epoch": 0.8358834421596336, "grad_norm": 0.728931188583374, "learning_rate": 7.185146030821338e-07, "loss": 0.017424579709768295, "memory(GiB)": 22.66, "step": 25731, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.835915927622389, "grad_norm": 0.4651646316051483, "learning_rate": 7.182371971068075e-07, "loss": 0.01764661632478237, "memory(GiB)": 22.66, "step": 25732, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.956747 }, { "epoch": 0.8359484130851444, "grad_norm": 0.23416827619075775, "learning_rate": 7.179598405487237e-07, "loss": 0.01234380155801773, "memory(GiB)": 22.66, "step": 25733, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956754 }, { "epoch": 0.8359808985478998, "grad_norm": 0.32509562373161316, "learning_rate": 7.176825334110843e-07, "loss": 0.015011651441454887, "memory(GiB)": 22.66, "step": 25734, "token_acc": 1.0, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.8360133840106553, "grad_norm": 0.27575576305389404, "learning_rate": 7.174052756970867e-07, "loss": 0.009935598820447922, "memory(GiB)": 22.66, "step": 25735, "token_acc": 1.0, "train_speed(iter/s)": 0.95677 }, { "epoch": 0.8360458694734106, "grad_norm": 0.30837175250053406, "learning_rate": 7.17128067409934e-07, "loss": 0.013028259389102459, "memory(GiB)": 22.66, "step": 25736, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956778 }, { "epoch": 0.8360783549361661, "grad_norm": 0.3824048638343811, "learning_rate": 7.168509085528213e-07, "loss": 0.013078894466161728, "memory(GiB)": 22.66, "step": 25737, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.956786 }, { "epoch": 0.8361108403989215, "grad_norm": 0.48300454020500183, "learning_rate": 7.165737991289517e-07, "loss": 0.014974935911595821, "memory(GiB)": 22.66, "step": 25738, "token_acc": 0.9875, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.836143325861677, "grad_norm": 0.25932028889656067, "learning_rate": 7.162967391415216e-07, "loss": 0.0075049567967653275, "memory(GiB)": 22.66, "step": 25739, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.9568 }, { "epoch": 0.8361758113244323, "grad_norm": 0.30512312054634094, "learning_rate": 7.160197285937292e-07, "loss": 0.01439750101417303, "memory(GiB)": 22.66, "step": 25740, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.8362082967871878, "grad_norm": 0.42846402525901794, "learning_rate": 7.157427674887701e-07, "loss": 0.013569800183176994, "memory(GiB)": 22.66, "step": 25741, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.956816 }, { "epoch": 0.8362407822499431, "grad_norm": 0.5169863104820251, "learning_rate": 7.154658558298417e-07, "loss": 0.015503470785915852, "memory(GiB)": 22.66, "step": 25742, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.8362732677126986, "grad_norm": 0.25719696283340454, "learning_rate": 7.151889936201401e-07, "loss": 0.007449010387063026, "memory(GiB)": 22.66, "step": 25743, "token_acc": 1.0, "train_speed(iter/s)": 0.956832 }, { "epoch": 0.836305753175454, "grad_norm": 0.5587449669837952, "learning_rate": 7.149121808628617e-07, "loss": 0.0128257405012846, "memory(GiB)": 22.66, "step": 25744, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.8363382386382094, "grad_norm": 0.3118961751461029, "learning_rate": 7.146354175611992e-07, "loss": 0.012407382018864155, "memory(GiB)": 22.66, "step": 25745, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.8363707241009648, "grad_norm": 0.22641655802726746, "learning_rate": 7.143587037183475e-07, "loss": 0.008648635819554329, "memory(GiB)": 22.66, "step": 25746, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956855 }, { "epoch": 0.8364032095637203, "grad_norm": 0.3201759457588196, "learning_rate": 7.140820393375008e-07, "loss": 0.013333955779671669, "memory(GiB)": 22.66, "step": 25747, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.8364356950264756, "grad_norm": 0.39883649349212646, "learning_rate": 7.138054244218518e-07, "loss": 0.013010933995246887, "memory(GiB)": 22.66, "step": 25748, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956866 }, { "epoch": 0.8364681804892311, "grad_norm": 0.30018922686576843, "learning_rate": 7.135288589745942e-07, "loss": 0.008628110401332378, "memory(GiB)": 22.66, "step": 25749, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.8365006659519865, "grad_norm": 0.48534059524536133, "learning_rate": 7.132523429989174e-07, "loss": 0.0151091068983078, "memory(GiB)": 22.66, "step": 25750, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.8365331514147419, "grad_norm": 0.3782482445240021, "learning_rate": 7.129758764980149e-07, "loss": 0.013641648925840855, "memory(GiB)": 22.66, "step": 25751, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.8365656368774973, "grad_norm": 0.2716771960258484, "learning_rate": 7.126994594750763e-07, "loss": 0.008871002122759819, "memory(GiB)": 22.66, "step": 25752, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956886 }, { "epoch": 0.8365981223402528, "grad_norm": 0.3405885696411133, "learning_rate": 7.124230919332936e-07, "loss": 0.013172629289329052, "memory(GiB)": 22.66, "step": 25753, "token_acc": 1.0, "train_speed(iter/s)": 0.956891 }, { "epoch": 0.8366306078030081, "grad_norm": 0.3557722270488739, "learning_rate": 7.121467738758542e-07, "loss": 0.011394158005714417, "memory(GiB)": 22.66, "step": 25754, "token_acc": 1.0, "train_speed(iter/s)": 0.956896 }, { "epoch": 0.8366630932657636, "grad_norm": 0.21938395500183105, "learning_rate": 7.118705053059483e-07, "loss": 0.004702406469732523, "memory(GiB)": 22.66, "step": 25755, "token_acc": 1.0, "train_speed(iter/s)": 0.956902 }, { "epoch": 0.836695578728519, "grad_norm": 0.35280999541282654, "learning_rate": 7.115942862267639e-07, "loss": 0.008711340837180614, "memory(GiB)": 22.66, "step": 25756, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956907 }, { "epoch": 0.8367280641912744, "grad_norm": 0.3532426953315735, "learning_rate": 7.113181166414901e-07, "loss": 0.012585487216711044, "memory(GiB)": 22.66, "step": 25757, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.8367605496540298, "grad_norm": 0.23544926941394806, "learning_rate": 7.110419965533128e-07, "loss": 0.005411245860159397, "memory(GiB)": 22.66, "step": 25758, "token_acc": 1.0, "train_speed(iter/s)": 0.956918 }, { "epoch": 0.8367930351167853, "grad_norm": 0.3470626473426819, "learning_rate": 7.107659259654204e-07, "loss": 0.012749768793582916, "memory(GiB)": 22.66, "step": 25759, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.8368255205795406, "grad_norm": 0.3406502306461334, "learning_rate": 7.10489904880996e-07, "loss": 0.01403461117297411, "memory(GiB)": 22.66, "step": 25760, "token_acc": 1.0, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.8368580060422961, "grad_norm": 0.524147093296051, "learning_rate": 7.102139333032299e-07, "loss": 0.016879187896847725, "memory(GiB)": 22.66, "step": 25761, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.8368904915050515, "grad_norm": 0.4694339334964752, "learning_rate": 7.099380112353032e-07, "loss": 0.013314012438058853, "memory(GiB)": 22.66, "step": 25762, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.8369229769678069, "grad_norm": 0.39047834277153015, "learning_rate": 7.09662138680402e-07, "loss": 0.008798198774456978, "memory(GiB)": 22.66, "step": 25763, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.956943 }, { "epoch": 0.8369554624305623, "grad_norm": 0.3537552058696747, "learning_rate": 7.093863156417114e-07, "loss": 0.011494539678096771, "memory(GiB)": 22.66, "step": 25764, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.8369879478933178, "grad_norm": 0.29512348771095276, "learning_rate": 7.091105421224114e-07, "loss": 0.013148733414709568, "memory(GiB)": 22.66, "step": 25765, "token_acc": 0.996551724137931, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.8370204333560731, "grad_norm": 0.37635040283203125, "learning_rate": 7.088348181256888e-07, "loss": 0.012583794072270393, "memory(GiB)": 22.66, "step": 25766, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956959 }, { "epoch": 0.8370529188188286, "grad_norm": 0.4484800100326538, "learning_rate": 7.085591436547229e-07, "loss": 0.01271079108119011, "memory(GiB)": 22.66, "step": 25767, "token_acc": 1.0, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.837085404281584, "grad_norm": 0.41054025292396545, "learning_rate": 7.08283518712698e-07, "loss": 0.017825739458203316, "memory(GiB)": 22.66, "step": 25768, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956971 }, { "epoch": 0.8371178897443394, "grad_norm": 0.3428456485271454, "learning_rate": 7.08007943302792e-07, "loss": 0.011094634421169758, "memory(GiB)": 22.66, "step": 25769, "token_acc": 1.0, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.8371503752070948, "grad_norm": 0.3721626102924347, "learning_rate": 7.077324174281869e-07, "loss": 0.012557335197925568, "memory(GiB)": 22.66, "step": 25770, "token_acc": 1.0, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.8371828606698503, "grad_norm": 0.23795676231384277, "learning_rate": 7.074569410920623e-07, "loss": 0.006562924012541771, "memory(GiB)": 22.66, "step": 25771, "token_acc": 1.0, "train_speed(iter/s)": 0.956992 }, { "epoch": 0.8372153461326056, "grad_norm": 0.3814871609210968, "learning_rate": 7.071815142975997e-07, "loss": 0.012690698727965355, "memory(GiB)": 22.66, "step": 25772, "token_acc": 1.0, "train_speed(iter/s)": 0.957 }, { "epoch": 0.8372478315953611, "grad_norm": 0.3373960256576538, "learning_rate": 7.069061370479752e-07, "loss": 0.007378711365163326, "memory(GiB)": 22.66, "step": 25773, "token_acc": 1.0, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.8372803170581165, "grad_norm": 0.2579333186149597, "learning_rate": 7.066308093463675e-07, "loss": 0.010398698970675468, "memory(GiB)": 22.66, "step": 25774, "token_acc": 1.0, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.8373128025208719, "grad_norm": 0.42863649129867554, "learning_rate": 7.063555311959547e-07, "loss": 0.014908015727996826, "memory(GiB)": 22.66, "step": 25775, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.8373452879836273, "grad_norm": 0.3428359031677246, "learning_rate": 7.060803025999152e-07, "loss": 0.012208284810185432, "memory(GiB)": 22.66, "step": 25776, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957031 }, { "epoch": 0.8373777734463828, "grad_norm": 0.29799723625183105, "learning_rate": 7.058051235614233e-07, "loss": 0.0135301249101758, "memory(GiB)": 22.66, "step": 25777, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95704 }, { "epoch": 0.8374102589091381, "grad_norm": 0.5074038505554199, "learning_rate": 7.055299940836552e-07, "loss": 0.014079254120588303, "memory(GiB)": 22.66, "step": 25778, "token_acc": 1.0, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.8374427443718936, "grad_norm": 0.4968934655189514, "learning_rate": 7.052549141697873e-07, "loss": 0.010881468653678894, "memory(GiB)": 22.66, "step": 25779, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.837475229834649, "grad_norm": 0.4627647399902344, "learning_rate": 7.049798838229944e-07, "loss": 0.012533359229564667, "memory(GiB)": 22.66, "step": 25780, "token_acc": 1.0, "train_speed(iter/s)": 0.957063 }, { "epoch": 0.8375077152974044, "grad_norm": 0.42827484011650085, "learning_rate": 7.047049030464509e-07, "loss": 0.01898081973195076, "memory(GiB)": 22.66, "step": 25781, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.957071 }, { "epoch": 0.8375402007601598, "grad_norm": 0.4001208543777466, "learning_rate": 7.044299718433289e-07, "loss": 0.015254254452884197, "memory(GiB)": 22.66, "step": 25782, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957078 }, { "epoch": 0.8375726862229153, "grad_norm": 0.46494579315185547, "learning_rate": 7.041550902168021e-07, "loss": 0.015718266367912292, "memory(GiB)": 22.66, "step": 25783, "token_acc": 1.0, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.8376051716856706, "grad_norm": 0.4805202782154083, "learning_rate": 7.038802581700438e-07, "loss": 0.012151840142905712, "memory(GiB)": 22.66, "step": 25784, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.8376376571484261, "grad_norm": 0.1479041874408722, "learning_rate": 7.036054757062261e-07, "loss": 0.004420212469995022, "memory(GiB)": 22.66, "step": 25785, "token_acc": 1.0, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.8376701426111814, "grad_norm": 0.32446563243865967, "learning_rate": 7.033307428285191e-07, "loss": 0.015458695590496063, "memory(GiB)": 22.66, "step": 25786, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.8377026280739369, "grad_norm": 0.36563074588775635, "learning_rate": 7.030560595400948e-07, "loss": 0.011114985682070255, "memory(GiB)": 22.66, "step": 25787, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957117 }, { "epoch": 0.8377351135366923, "grad_norm": 0.23879191279411316, "learning_rate": 7.027814258441207e-07, "loss": 0.009130172431468964, "memory(GiB)": 22.66, "step": 25788, "token_acc": 1.0, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.8377675989994477, "grad_norm": 0.29293057322502136, "learning_rate": 7.025068417437708e-07, "loss": 0.009146960452198982, "memory(GiB)": 22.66, "step": 25789, "token_acc": 1.0, "train_speed(iter/s)": 0.957132 }, { "epoch": 0.8378000844622031, "grad_norm": 0.26595377922058105, "learning_rate": 7.02232307242211e-07, "loss": 0.009540487080812454, "memory(GiB)": 22.66, "step": 25790, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.8378325699249586, "grad_norm": 0.3124397099018097, "learning_rate": 7.019578223426116e-07, "loss": 0.00850680097937584, "memory(GiB)": 22.66, "step": 25791, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957148 }, { "epoch": 0.837865055387714, "grad_norm": 0.36722224950790405, "learning_rate": 7.016833870481382e-07, "loss": 0.013742867857217789, "memory(GiB)": 22.66, "step": 25792, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.8378975408504694, "grad_norm": 0.6841846704483032, "learning_rate": 7.014090013619601e-07, "loss": 0.015094118192791939, "memory(GiB)": 22.66, "step": 25793, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957164 }, { "epoch": 0.8379300263132249, "grad_norm": 0.3629113733768463, "learning_rate": 7.011346652872431e-07, "loss": 0.016271142289042473, "memory(GiB)": 22.66, "step": 25794, "token_acc": 0.9856115107913669, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.8379625117759802, "grad_norm": 0.3303416073322296, "learning_rate": 7.008603788271539e-07, "loss": 0.012745451182126999, "memory(GiB)": 22.66, "step": 25795, "token_acc": 1.0, "train_speed(iter/s)": 0.95718 }, { "epoch": 0.8379949972387357, "grad_norm": 0.31439492106437683, "learning_rate": 7.005861419848597e-07, "loss": 0.012274453416466713, "memory(GiB)": 22.66, "step": 25796, "token_acc": 0.988, "train_speed(iter/s)": 0.957187 }, { "epoch": 0.8380274827014911, "grad_norm": 0.3317478597164154, "learning_rate": 7.003119547635223e-07, "loss": 0.013859134167432785, "memory(GiB)": 22.66, "step": 25797, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.8380599681642465, "grad_norm": 0.4079991281032562, "learning_rate": 7.000378171663081e-07, "loss": 0.007442468777298927, "memory(GiB)": 22.66, "step": 25798, "token_acc": 1.0, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.8380924536270019, "grad_norm": 0.2516101598739624, "learning_rate": 6.997637291963804e-07, "loss": 0.011569652706384659, "memory(GiB)": 22.66, "step": 25799, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.8381249390897574, "grad_norm": 0.2653845250606537, "learning_rate": 6.994896908569043e-07, "loss": 0.006371994037181139, "memory(GiB)": 22.66, "step": 25800, "token_acc": 1.0, "train_speed(iter/s)": 0.957218 }, { "epoch": 0.8381574245525127, "grad_norm": 0.49343341588974, "learning_rate": 6.992157021510399e-07, "loss": 0.016492566093802452, "memory(GiB)": 22.66, "step": 25801, "token_acc": 1.0, "train_speed(iter/s)": 0.957226 }, { "epoch": 0.8381899100152682, "grad_norm": 0.3840838670730591, "learning_rate": 6.989417630819506e-07, "loss": 0.011090308427810669, "memory(GiB)": 22.66, "step": 25802, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.8382223954780236, "grad_norm": 0.28632229566574097, "learning_rate": 6.98667873652798e-07, "loss": 0.012325449846684933, "memory(GiB)": 22.66, "step": 25803, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.838254880940779, "grad_norm": 0.36872896552085876, "learning_rate": 6.983940338667445e-07, "loss": 0.014640744775533676, "memory(GiB)": 22.66, "step": 25804, "token_acc": 1.0, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.8382873664035344, "grad_norm": 0.5502004027366638, "learning_rate": 6.98120243726948e-07, "loss": 0.019438516348600388, "memory(GiB)": 22.66, "step": 25805, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.957258 }, { "epoch": 0.8383198518662899, "grad_norm": 0.3654179275035858, "learning_rate": 6.978465032365695e-07, "loss": 0.009662038646638393, "memory(GiB)": 22.66, "step": 25806, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957265 }, { "epoch": 0.8383523373290452, "grad_norm": 0.425868958234787, "learning_rate": 6.975728123987686e-07, "loss": 0.017764130607247353, "memory(GiB)": 22.66, "step": 25807, "token_acc": 0.990521327014218, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.8383848227918007, "grad_norm": 0.3901432752609253, "learning_rate": 6.972991712167055e-07, "loss": 0.012901395559310913, "memory(GiB)": 22.66, "step": 25808, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957278 }, { "epoch": 0.8384173082545561, "grad_norm": 0.5299177765846252, "learning_rate": 6.970255796935354e-07, "loss": 0.018049631267786026, "memory(GiB)": 22.66, "step": 25809, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.957284 }, { "epoch": 0.8384497937173115, "grad_norm": 0.2435033619403839, "learning_rate": 6.967520378324172e-07, "loss": 0.006746630650013685, "memory(GiB)": 22.66, "step": 25810, "token_acc": 1.0, "train_speed(iter/s)": 0.95729 }, { "epoch": 0.8384822791800669, "grad_norm": 0.6729013323783875, "learning_rate": 6.964785456365081e-07, "loss": 0.02057156339287758, "memory(GiB)": 22.66, "step": 25811, "token_acc": 0.9826086956521739, "train_speed(iter/s)": 0.957296 }, { "epoch": 0.8385147646428224, "grad_norm": 0.35453203320503235, "learning_rate": 6.962051031089651e-07, "loss": 0.010943124070763588, "memory(GiB)": 22.66, "step": 25812, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.8385472501055777, "grad_norm": 0.31834837794303894, "learning_rate": 6.95931710252944e-07, "loss": 0.00838879682123661, "memory(GiB)": 22.66, "step": 25813, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.8385797355683332, "grad_norm": 0.314984530210495, "learning_rate": 6.956583670715989e-07, "loss": 0.01075642742216587, "memory(GiB)": 22.66, "step": 25814, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.8386122210310886, "grad_norm": 0.28167879581451416, "learning_rate": 6.953850735680862e-07, "loss": 0.008982950821518898, "memory(GiB)": 22.66, "step": 25815, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.957317 }, { "epoch": 0.838644706493844, "grad_norm": 0.3771708905696869, "learning_rate": 6.951118297455573e-07, "loss": 0.012385834008455276, "memory(GiB)": 22.66, "step": 25816, "token_acc": 1.0, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.8386771919565994, "grad_norm": 0.3406261205673218, "learning_rate": 6.948386356071696e-07, "loss": 0.010463365353643894, "memory(GiB)": 22.66, "step": 25817, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.8387096774193549, "grad_norm": 0.34522247314453125, "learning_rate": 6.945654911560734e-07, "loss": 0.010108156129717827, "memory(GiB)": 22.66, "step": 25818, "token_acc": 1.0, "train_speed(iter/s)": 0.957334 }, { "epoch": 0.8387421628821102, "grad_norm": 0.2686313986778259, "learning_rate": 6.942923963954229e-07, "loss": 0.011905675753951073, "memory(GiB)": 22.66, "step": 25819, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957339 }, { "epoch": 0.8387746483448657, "grad_norm": 0.38682445883750916, "learning_rate": 6.940193513283677e-07, "loss": 0.019651338458061218, "memory(GiB)": 22.66, "step": 25820, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.8388071338076211, "grad_norm": 0.3802870512008667, "learning_rate": 6.937463559580609e-07, "loss": 0.014479290693998337, "memory(GiB)": 22.66, "step": 25821, "token_acc": 1.0, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.8388396192703765, "grad_norm": 0.466056227684021, "learning_rate": 6.934734102876523e-07, "loss": 0.018955666571855545, "memory(GiB)": 22.66, "step": 25822, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957355 }, { "epoch": 0.8388721047331319, "grad_norm": 0.37394118309020996, "learning_rate": 6.932005143202941e-07, "loss": 0.00956900604069233, "memory(GiB)": 22.66, "step": 25823, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.8389045901958874, "grad_norm": 0.24865667521953583, "learning_rate": 6.929276680591329e-07, "loss": 0.01267416961491108, "memory(GiB)": 22.66, "step": 25824, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.8389370756586427, "grad_norm": 0.2459506243467331, "learning_rate": 6.926548715073189e-07, "loss": 0.01262046117335558, "memory(GiB)": 22.66, "step": 25825, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957369 }, { "epoch": 0.8389695611213982, "grad_norm": 0.42351964116096497, "learning_rate": 6.923821246680007e-07, "loss": 0.013145300559699535, "memory(GiB)": 22.66, "step": 25826, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.8390020465841536, "grad_norm": 0.44816893339157104, "learning_rate": 6.921094275443269e-07, "loss": 0.01641000434756279, "memory(GiB)": 22.66, "step": 25827, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.839034532046909, "grad_norm": 0.46501195430755615, "learning_rate": 6.918367801394443e-07, "loss": 0.016525913029909134, "memory(GiB)": 22.66, "step": 25828, "token_acc": 0.99, "train_speed(iter/s)": 0.957387 }, { "epoch": 0.8390670175096644, "grad_norm": 0.3800429105758667, "learning_rate": 6.915641824564989e-07, "loss": 0.009991761296987534, "memory(GiB)": 22.66, "step": 25829, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957393 }, { "epoch": 0.8390995029724199, "grad_norm": 0.43322402238845825, "learning_rate": 6.912916344986376e-07, "loss": 0.020762953907251358, "memory(GiB)": 22.66, "step": 25830, "token_acc": 1.0, "train_speed(iter/s)": 0.957398 }, { "epoch": 0.8391319884351752, "grad_norm": 0.39879193902015686, "learning_rate": 6.91019136269005e-07, "loss": 0.0091934809461236, "memory(GiB)": 22.66, "step": 25831, "token_acc": 1.0, "train_speed(iter/s)": 0.957404 }, { "epoch": 0.8391644738979307, "grad_norm": 0.3176438808441162, "learning_rate": 6.907466877707486e-07, "loss": 0.012413237243890762, "memory(GiB)": 22.66, "step": 25832, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.95741 }, { "epoch": 0.8391969593606861, "grad_norm": 0.3475874066352844, "learning_rate": 6.904742890070093e-07, "loss": 0.01198264118283987, "memory(GiB)": 22.66, "step": 25833, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.8392294448234415, "grad_norm": 0.21850094199180603, "learning_rate": 6.902019399809329e-07, "loss": 0.01039062812924385, "memory(GiB)": 22.66, "step": 25834, "token_acc": 1.0, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.8392619302861969, "grad_norm": 0.27359387278556824, "learning_rate": 6.899296406956623e-07, "loss": 0.008860216476023197, "memory(GiB)": 22.66, "step": 25835, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.8392944157489524, "grad_norm": 0.46165353059768677, "learning_rate": 6.89657391154342e-07, "loss": 0.014815344475209713, "memory(GiB)": 22.66, "step": 25836, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957437 }, { "epoch": 0.8393269012117077, "grad_norm": 0.3873741328716278, "learning_rate": 6.893851913601107e-07, "loss": 0.012827256694436073, "memory(GiB)": 22.66, "step": 25837, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.8393593866744632, "grad_norm": 0.1610790640115738, "learning_rate": 6.891130413161134e-07, "loss": 0.004031787626445293, "memory(GiB)": 22.66, "step": 25838, "token_acc": 1.0, "train_speed(iter/s)": 0.957452 }, { "epoch": 0.8393918721372186, "grad_norm": 0.2798028588294983, "learning_rate": 6.888409410254871e-07, "loss": 0.015550859272480011, "memory(GiB)": 22.66, "step": 25839, "token_acc": 1.0, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.839424357599974, "grad_norm": 0.2733719050884247, "learning_rate": 6.885688904913751e-07, "loss": 0.013319268822669983, "memory(GiB)": 22.66, "step": 25840, "token_acc": 1.0, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.8394568430627294, "grad_norm": 0.32754719257354736, "learning_rate": 6.882968897169179e-07, "loss": 0.009473484009504318, "memory(GiB)": 22.66, "step": 25841, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.8394893285254849, "grad_norm": 0.3563523292541504, "learning_rate": 6.880249387052523e-07, "loss": 0.013324561528861523, "memory(GiB)": 22.66, "step": 25842, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957483 }, { "epoch": 0.8395218139882402, "grad_norm": 0.34853002429008484, "learning_rate": 6.877530374595193e-07, "loss": 0.011011523194611073, "memory(GiB)": 22.66, "step": 25843, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.95749 }, { "epoch": 0.8395542994509957, "grad_norm": 0.3764670491218567, "learning_rate": 6.87481185982854e-07, "loss": 0.010394737124443054, "memory(GiB)": 22.66, "step": 25844, "token_acc": 1.0, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.839586784913751, "grad_norm": 0.35926011204719543, "learning_rate": 6.872093842783978e-07, "loss": 0.012202595360577106, "memory(GiB)": 22.66, "step": 25845, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.8396192703765065, "grad_norm": 0.3254832327365875, "learning_rate": 6.869376323492844e-07, "loss": 0.012281827628612518, "memory(GiB)": 22.66, "step": 25846, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.8396517558392619, "grad_norm": 0.29587358236312866, "learning_rate": 6.866659301986522e-07, "loss": 0.007320912554860115, "memory(GiB)": 22.66, "step": 25847, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.8396842413020174, "grad_norm": 0.31161370873451233, "learning_rate": 6.863942778296351e-07, "loss": 0.009452738799154758, "memory(GiB)": 22.66, "step": 25848, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.957528 }, { "epoch": 0.8397167267647727, "grad_norm": 0.3741920292377472, "learning_rate": 6.861226752453698e-07, "loss": 0.015176697634160519, "memory(GiB)": 22.66, "step": 25849, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.8397492122275282, "grad_norm": 0.3637349605560303, "learning_rate": 6.858511224489906e-07, "loss": 0.011713622137904167, "memory(GiB)": 22.66, "step": 25850, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.8397816976902835, "grad_norm": 0.3027377128601074, "learning_rate": 6.855796194436326e-07, "loss": 0.00751093402504921, "memory(GiB)": 22.66, "step": 25851, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.839814183153039, "grad_norm": 0.4464414119720459, "learning_rate": 6.853081662324273e-07, "loss": 0.00995918083935976, "memory(GiB)": 22.66, "step": 25852, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95756 }, { "epoch": 0.8398466686157944, "grad_norm": 0.35465794801712036, "learning_rate": 6.850367628185084e-07, "loss": 0.01344836875796318, "memory(GiB)": 22.66, "step": 25853, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.8398791540785498, "grad_norm": 0.38145944476127625, "learning_rate": 6.847654092050082e-07, "loss": 0.011410940438508987, "memory(GiB)": 22.66, "step": 25854, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.8399116395413052, "grad_norm": 0.5595640540122986, "learning_rate": 6.844941053950605e-07, "loss": 0.008836610242724419, "memory(GiB)": 22.66, "step": 25855, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957583 }, { "epoch": 0.8399441250040607, "grad_norm": 0.23667368292808533, "learning_rate": 6.842228513917931e-07, "loss": 0.010103115811944008, "memory(GiB)": 22.66, "step": 25856, "token_acc": 0.9923664122137404, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.8399766104668162, "grad_norm": 0.2978716790676117, "learning_rate": 6.839516471983381e-07, "loss": 0.010097827762365341, "memory(GiB)": 22.66, "step": 25857, "token_acc": 1.0, "train_speed(iter/s)": 0.957599 }, { "epoch": 0.8400090959295715, "grad_norm": 0.4300699234008789, "learning_rate": 6.836804928178265e-07, "loss": 0.012761124409735203, "memory(GiB)": 22.66, "step": 25858, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.840041581392327, "grad_norm": 0.4071688950061798, "learning_rate": 6.834093882533866e-07, "loss": 0.012963852845132351, "memory(GiB)": 22.66, "step": 25859, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.8400740668550823, "grad_norm": 0.381677508354187, "learning_rate": 6.831383335081487e-07, "loss": 0.010967617854475975, "memory(GiB)": 22.66, "step": 25860, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957622 }, { "epoch": 0.8401065523178378, "grad_norm": 0.1939079612493515, "learning_rate": 6.828673285852394e-07, "loss": 0.004312511533498764, "memory(GiB)": 22.66, "step": 25861, "token_acc": 1.0, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.8401390377805932, "grad_norm": 0.2587653696537018, "learning_rate": 6.825963734877866e-07, "loss": 0.009088298305869102, "memory(GiB)": 22.66, "step": 25862, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.957638 }, { "epoch": 0.8401715232433487, "grad_norm": 0.32388100028038025, "learning_rate": 6.82325468218919e-07, "loss": 0.009059309959411621, "memory(GiB)": 22.66, "step": 25863, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.840204008706104, "grad_norm": 0.3457683026790619, "learning_rate": 6.820546127817629e-07, "loss": 0.007925665937364101, "memory(GiB)": 22.66, "step": 25864, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957653 }, { "epoch": 0.8402364941688595, "grad_norm": 0.6592937111854553, "learning_rate": 6.817838071794424e-07, "loss": 0.00939088687300682, "memory(GiB)": 22.66, "step": 25865, "token_acc": 1.0, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.8402689796316148, "grad_norm": 0.34020814299583435, "learning_rate": 6.815130514150858e-07, "loss": 0.01116708479821682, "memory(GiB)": 22.66, "step": 25866, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957668 }, { "epoch": 0.8403014650943703, "grad_norm": 0.3444163501262665, "learning_rate": 6.812423454918138e-07, "loss": 0.00834948755800724, "memory(GiB)": 22.66, "step": 25867, "token_acc": 1.0, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.8403339505571257, "grad_norm": 0.23903682827949524, "learning_rate": 6.80971689412756e-07, "loss": 0.006346582435071468, "memory(GiB)": 22.66, "step": 25868, "token_acc": 1.0, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.8403664360198811, "grad_norm": 0.3266766667366028, "learning_rate": 6.807010831810323e-07, "loss": 0.010422911494970322, "memory(GiB)": 22.66, "step": 25869, "token_acc": 1.0, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.8403989214826365, "grad_norm": 0.39428094029426575, "learning_rate": 6.804305267997685e-07, "loss": 0.01519967895001173, "memory(GiB)": 22.66, "step": 25870, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.840431406945392, "grad_norm": 0.30262476205825806, "learning_rate": 6.801600202720842e-07, "loss": 0.01308299507945776, "memory(GiB)": 22.66, "step": 25871, "token_acc": 1.0, "train_speed(iter/s)": 0.957698 }, { "epoch": 0.8404638924081473, "grad_norm": 0.23141619563102722, "learning_rate": 6.798895636011021e-07, "loss": 0.009029358625411987, "memory(GiB)": 22.66, "step": 25872, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.8404963778709028, "grad_norm": 0.6061406135559082, "learning_rate": 6.796191567899468e-07, "loss": 0.018155202269554138, "memory(GiB)": 22.66, "step": 25873, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.8405288633336582, "grad_norm": 0.5003453493118286, "learning_rate": 6.793487998417353e-07, "loss": 0.008705778047442436, "memory(GiB)": 22.66, "step": 25874, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957714 }, { "epoch": 0.8405613487964136, "grad_norm": 0.2491670846939087, "learning_rate": 6.790784927595912e-07, "loss": 0.005398174747824669, "memory(GiB)": 22.66, "step": 25875, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.840593834259169, "grad_norm": 0.22537019848823547, "learning_rate": 6.788082355466308e-07, "loss": 0.009288043715059757, "memory(GiB)": 22.66, "step": 25876, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.8406263197219245, "grad_norm": 0.34165439009666443, "learning_rate": 6.78538028205975e-07, "loss": 0.0068938541226089, "memory(GiB)": 22.66, "step": 25877, "token_acc": 1.0, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.8406588051846798, "grad_norm": 0.3824714720249176, "learning_rate": 6.78267870740742e-07, "loss": 0.011565925553441048, "memory(GiB)": 22.66, "step": 25878, "token_acc": 0.9808917197452229, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.8406912906474353, "grad_norm": 0.383050799369812, "learning_rate": 6.779977631540513e-07, "loss": 0.009979250840842724, "memory(GiB)": 22.66, "step": 25879, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95774 }, { "epoch": 0.8407237761101907, "grad_norm": 0.25495997071266174, "learning_rate": 6.777277054490178e-07, "loss": 0.01015528105199337, "memory(GiB)": 22.66, "step": 25880, "token_acc": 1.0, "train_speed(iter/s)": 0.957745 }, { "epoch": 0.8407562615729461, "grad_norm": 0.6561523675918579, "learning_rate": 6.7745769762876e-07, "loss": 0.017313379794359207, "memory(GiB)": 22.66, "step": 25881, "token_acc": 1.0, "train_speed(iter/s)": 0.957751 }, { "epoch": 0.8407887470357015, "grad_norm": 0.4321116805076599, "learning_rate": 6.771877396963927e-07, "loss": 0.019886065274477005, "memory(GiB)": 22.66, "step": 25882, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.840821232498457, "grad_norm": 0.40148165822029114, "learning_rate": 6.769178316550345e-07, "loss": 0.009980261325836182, "memory(GiB)": 22.66, "step": 25883, "token_acc": 1.0, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.8408537179612123, "grad_norm": 0.4254712164402008, "learning_rate": 6.766479735077969e-07, "loss": 0.01695343479514122, "memory(GiB)": 22.66, "step": 25884, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.8408862034239678, "grad_norm": 0.28408384323120117, "learning_rate": 6.763781652577967e-07, "loss": 0.00898872222751379, "memory(GiB)": 22.66, "step": 25885, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.8409186888867232, "grad_norm": 0.3314056098461151, "learning_rate": 6.761084069081463e-07, "loss": 0.01269066333770752, "memory(GiB)": 22.66, "step": 25886, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.8409511743494786, "grad_norm": 0.27054476737976074, "learning_rate": 6.758386984619608e-07, "loss": 0.009925602003932, "memory(GiB)": 22.66, "step": 25887, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957783 }, { "epoch": 0.840983659812234, "grad_norm": 0.23407188057899475, "learning_rate": 6.755690399223525e-07, "loss": 0.006647294852882624, "memory(GiB)": 22.66, "step": 25888, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95779 }, { "epoch": 0.8410161452749895, "grad_norm": 0.3425937592983246, "learning_rate": 6.752994312924327e-07, "loss": 0.01312689483165741, "memory(GiB)": 22.66, "step": 25889, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.8410486307377448, "grad_norm": 0.1355602890253067, "learning_rate": 6.750298725753135e-07, "loss": 0.0043848417699337006, "memory(GiB)": 22.66, "step": 25890, "token_acc": 1.0, "train_speed(iter/s)": 0.957801 }, { "epoch": 0.8410811162005003, "grad_norm": 1.1009482145309448, "learning_rate": 6.747603637741063e-07, "loss": 0.017966698855161667, "memory(GiB)": 22.66, "step": 25891, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.8411136016632557, "grad_norm": 0.3158438801765442, "learning_rate": 6.744909048919224e-07, "loss": 0.012305108830332756, "memory(GiB)": 22.66, "step": 25892, "token_acc": 1.0, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.8411460871260111, "grad_norm": 0.32831063866615295, "learning_rate": 6.742214959318693e-07, "loss": 0.008364353328943253, "memory(GiB)": 22.66, "step": 25893, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957818 }, { "epoch": 0.8411785725887665, "grad_norm": 0.2552748918533325, "learning_rate": 6.739521368970586e-07, "loss": 0.00748914759606123, "memory(GiB)": 22.66, "step": 25894, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.841211058051522, "grad_norm": 0.4442210793495178, "learning_rate": 6.736828277905966e-07, "loss": 0.01981380581855774, "memory(GiB)": 22.66, "step": 25895, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.8412435435142773, "grad_norm": 0.44452422857284546, "learning_rate": 6.734135686155951e-07, "loss": 0.0163503997027874, "memory(GiB)": 22.66, "step": 25896, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.8412760289770328, "grad_norm": 0.24781201779842377, "learning_rate": 6.731443593751586e-07, "loss": 0.01008511520922184, "memory(GiB)": 22.66, "step": 25897, "token_acc": 1.0, "train_speed(iter/s)": 0.957844 }, { "epoch": 0.8413085144397882, "grad_norm": 0.4101218581199646, "learning_rate": 6.728752000723965e-07, "loss": 0.008677621372044086, "memory(GiB)": 22.66, "step": 25898, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.8413409999025436, "grad_norm": 0.46367359161376953, "learning_rate": 6.726060907104126e-07, "loss": 0.01538031455129385, "memory(GiB)": 22.66, "step": 25899, "token_acc": 1.0, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.841373485365299, "grad_norm": 0.36555635929107666, "learning_rate": 6.723370312923144e-07, "loss": 0.010518375784158707, "memory(GiB)": 22.66, "step": 25900, "token_acc": 1.0, "train_speed(iter/s)": 0.957867 }, { "epoch": 0.8414059708280545, "grad_norm": 0.3994095027446747, "learning_rate": 6.720680218212066e-07, "loss": 0.018221372738480568, "memory(GiB)": 22.66, "step": 25901, "token_acc": 1.0, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.8414384562908098, "grad_norm": 0.3228706419467926, "learning_rate": 6.717990623001947e-07, "loss": 0.014416350051760674, "memory(GiB)": 22.66, "step": 25902, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.957882 }, { "epoch": 0.8414709417535653, "grad_norm": 0.35130560398101807, "learning_rate": 6.715301527323831e-07, "loss": 0.012436923570930958, "memory(GiB)": 22.66, "step": 25903, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95789 }, { "epoch": 0.8415034272163207, "grad_norm": 0.2323441505432129, "learning_rate": 6.71261293120874e-07, "loss": 0.010174773633480072, "memory(GiB)": 22.66, "step": 25904, "token_acc": 1.0, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.8415359126790761, "grad_norm": 0.3639977276325226, "learning_rate": 6.709924834687709e-07, "loss": 0.013193562626838684, "memory(GiB)": 22.66, "step": 25905, "token_acc": 1.0, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.8415683981418315, "grad_norm": 0.4871548116207123, "learning_rate": 6.707237237791764e-07, "loss": 0.014172011986374855, "memory(GiB)": 22.66, "step": 25906, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.841600883604587, "grad_norm": 0.25739362835884094, "learning_rate": 6.704550140551936e-07, "loss": 0.009281303733587265, "memory(GiB)": 22.66, "step": 25907, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.8416333690673423, "grad_norm": 0.22711390256881714, "learning_rate": 6.701863542999216e-07, "loss": 0.00801056157797575, "memory(GiB)": 22.66, "step": 25908, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.8416658545300978, "grad_norm": 0.2650115191936493, "learning_rate": 6.699177445164617e-07, "loss": 0.008061502128839493, "memory(GiB)": 22.66, "step": 25909, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.8416983399928532, "grad_norm": 0.2941231429576874, "learning_rate": 6.696491847079145e-07, "loss": 0.01529840286821127, "memory(GiB)": 22.66, "step": 25910, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.8417308254556086, "grad_norm": 0.28671160340309143, "learning_rate": 6.693806748773806e-07, "loss": 0.009320689365267754, "memory(GiB)": 22.66, "step": 25911, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.841763310918364, "grad_norm": 0.3067259192466736, "learning_rate": 6.691122150279567e-07, "loss": 0.010493841022253036, "memory(GiB)": 22.66, "step": 25912, "token_acc": 1.0, "train_speed(iter/s)": 0.957961 }, { "epoch": 0.8417957963811195, "grad_norm": 0.47468242049217224, "learning_rate": 6.688438051627422e-07, "loss": 0.011810312047600746, "memory(GiB)": 22.66, "step": 25913, "token_acc": 1.0, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.8418282818438748, "grad_norm": 0.396797239780426, "learning_rate": 6.685754452848347e-07, "loss": 0.014209916815161705, "memory(GiB)": 22.66, "step": 25914, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.8418607673066303, "grad_norm": 0.24579012393951416, "learning_rate": 6.683071353973331e-07, "loss": 0.009129397571086884, "memory(GiB)": 22.66, "step": 25915, "token_acc": 1.0, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.8418932527693856, "grad_norm": 0.3780289590358734, "learning_rate": 6.680388755033312e-07, "loss": 0.014149408787488937, "memory(GiB)": 22.66, "step": 25916, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957993 }, { "epoch": 0.8419257382321411, "grad_norm": 0.4076283574104309, "learning_rate": 6.677706656059268e-07, "loss": 0.01227990910410881, "memory(GiB)": 22.66, "step": 25917, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958001 }, { "epoch": 0.8419582236948965, "grad_norm": 0.4219512939453125, "learning_rate": 6.675025057082152e-07, "loss": 0.014477408491075039, "memory(GiB)": 22.66, "step": 25918, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.841990709157652, "grad_norm": 0.28408902883529663, "learning_rate": 6.67234395813291e-07, "loss": 0.010670339688658714, "memory(GiB)": 22.66, "step": 25919, "token_acc": 1.0, "train_speed(iter/s)": 0.958017 }, { "epoch": 0.8420231946204074, "grad_norm": 0.5331172943115234, "learning_rate": 6.669663359242495e-07, "loss": 0.019352085888385773, "memory(GiB)": 22.66, "step": 25920, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.8420556800831628, "grad_norm": 0.2580016255378723, "learning_rate": 6.666983260441834e-07, "loss": 0.007545183412730694, "memory(GiB)": 22.66, "step": 25921, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.8420881655459183, "grad_norm": 0.31633949279785156, "learning_rate": 6.664303661761867e-07, "loss": 0.011450836434960365, "memory(GiB)": 22.66, "step": 25922, "token_acc": 1.0, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.8421206510086736, "grad_norm": 0.3287931978702545, "learning_rate": 6.661624563233499e-07, "loss": 0.010536108165979385, "memory(GiB)": 22.66, "step": 25923, "token_acc": 1.0, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.8421531364714291, "grad_norm": 0.3421812057495117, "learning_rate": 6.658945964887681e-07, "loss": 0.014139503240585327, "memory(GiB)": 22.66, "step": 25924, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.958055 }, { "epoch": 0.8421856219341844, "grad_norm": 0.4069514870643616, "learning_rate": 6.656267866755306e-07, "loss": 0.019463995471596718, "memory(GiB)": 22.66, "step": 25925, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.8422181073969399, "grad_norm": 0.46688973903656006, "learning_rate": 6.653590268867305e-07, "loss": 0.017879003658890724, "memory(GiB)": 22.66, "step": 25926, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.8422505928596953, "grad_norm": 0.2599516808986664, "learning_rate": 6.650913171254553e-07, "loss": 0.008329510688781738, "memory(GiB)": 22.66, "step": 25927, "token_acc": 1.0, "train_speed(iter/s)": 0.958077 }, { "epoch": 0.8422830783224508, "grad_norm": 0.28956136107444763, "learning_rate": 6.648236573947958e-07, "loss": 0.012465454638004303, "memory(GiB)": 22.66, "step": 25928, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958083 }, { "epoch": 0.8423155637852061, "grad_norm": 0.40641677379608154, "learning_rate": 6.645560476978418e-07, "loss": 0.016439657658338547, "memory(GiB)": 22.66, "step": 25929, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.958089 }, { "epoch": 0.8423480492479616, "grad_norm": 0.3289453387260437, "learning_rate": 6.642884880376821e-07, "loss": 0.014850899577140808, "memory(GiB)": 22.66, "step": 25930, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.958096 }, { "epoch": 0.8423805347107169, "grad_norm": 0.33479395508766174, "learning_rate": 6.640209784174034e-07, "loss": 0.012377341277897358, "memory(GiB)": 22.66, "step": 25931, "token_acc": 1.0, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.8424130201734724, "grad_norm": 0.3726928234100342, "learning_rate": 6.637535188400935e-07, "loss": 0.012691374868154526, "memory(GiB)": 22.66, "step": 25932, "token_acc": 1.0, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.8424455056362278, "grad_norm": 0.3334837555885315, "learning_rate": 6.634861093088396e-07, "loss": 0.014193102717399597, "memory(GiB)": 22.66, "step": 25933, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.958114 }, { "epoch": 0.8424779910989832, "grad_norm": 0.33220019936561584, "learning_rate": 6.632187498267279e-07, "loss": 0.014237720519304276, "memory(GiB)": 22.66, "step": 25934, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.95812 }, { "epoch": 0.8425104765617386, "grad_norm": 0.33565753698349, "learning_rate": 6.629514403968457e-07, "loss": 0.007305067032575607, "memory(GiB)": 22.66, "step": 25935, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.8425429620244941, "grad_norm": 0.3410623371601105, "learning_rate": 6.626841810222745e-07, "loss": 0.010531257838010788, "memory(GiB)": 22.66, "step": 25936, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.8425754474872494, "grad_norm": 0.3735116124153137, "learning_rate": 6.624169717061013e-07, "loss": 0.013530844822525978, "memory(GiB)": 22.66, "step": 25937, "token_acc": 1.0, "train_speed(iter/s)": 0.958137 }, { "epoch": 0.8426079329500049, "grad_norm": 0.2851124703884125, "learning_rate": 6.621498124514092e-07, "loss": 0.010023344308137894, "memory(GiB)": 22.66, "step": 25938, "token_acc": 1.0, "train_speed(iter/s)": 0.958142 }, { "epoch": 0.8426404184127603, "grad_norm": 0.423012912273407, "learning_rate": 6.618827032612834e-07, "loss": 0.010875968262553215, "memory(GiB)": 22.66, "step": 25939, "token_acc": 1.0, "train_speed(iter/s)": 0.958147 }, { "epoch": 0.8426729038755157, "grad_norm": 0.4337041974067688, "learning_rate": 6.616156441388039e-07, "loss": 0.010569683276116848, "memory(GiB)": 22.66, "step": 25940, "token_acc": 1.0, "train_speed(iter/s)": 0.958152 }, { "epoch": 0.8427053893382711, "grad_norm": 0.32864224910736084, "learning_rate": 6.613486350870546e-07, "loss": 0.00863572582602501, "memory(GiB)": 22.66, "step": 25941, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.8427378748010266, "grad_norm": 0.23842425644397736, "learning_rate": 6.610816761091165e-07, "loss": 0.0067019984126091, "memory(GiB)": 22.66, "step": 25942, "token_acc": 1.0, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.8427703602637819, "grad_norm": 0.4669417142868042, "learning_rate": 6.608147672080717e-07, "loss": 0.016646921634674072, "memory(GiB)": 22.66, "step": 25943, "token_acc": 1.0, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.8428028457265374, "grad_norm": 0.8131937384605408, "learning_rate": 6.605479083869992e-07, "loss": 0.011972174048423767, "memory(GiB)": 22.66, "step": 25944, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.8428353311892928, "grad_norm": 0.41258564591407776, "learning_rate": 6.602810996489811e-07, "loss": 0.016767237335443497, "memory(GiB)": 22.66, "step": 25945, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.8428678166520482, "grad_norm": 0.3208343982696533, "learning_rate": 6.600143409970927e-07, "loss": 0.010529104620218277, "memory(GiB)": 22.66, "step": 25946, "token_acc": 1.0, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.8429003021148036, "grad_norm": 0.3641127347946167, "learning_rate": 6.597476324344177e-07, "loss": 0.011012900620698929, "memory(GiB)": 22.66, "step": 25947, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95819 }, { "epoch": 0.8429327875775591, "grad_norm": 0.3809100091457367, "learning_rate": 6.594809739640301e-07, "loss": 0.009479326196014881, "memory(GiB)": 22.66, "step": 25948, "token_acc": 1.0, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.8429652730403144, "grad_norm": 0.3180522322654724, "learning_rate": 6.592143655890099e-07, "loss": 0.01435692049562931, "memory(GiB)": 22.66, "step": 25949, "token_acc": 1.0, "train_speed(iter/s)": 0.958203 }, { "epoch": 0.8429977585030699, "grad_norm": 0.45464345812797546, "learning_rate": 6.589478073124345e-07, "loss": 0.017527077347040176, "memory(GiB)": 22.66, "step": 25950, "token_acc": 1.0, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.8430302439658253, "grad_norm": 0.33831262588500977, "learning_rate": 6.58681299137377e-07, "loss": 0.008990531787276268, "memory(GiB)": 22.66, "step": 25951, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.8430627294285807, "grad_norm": 0.41510894894599915, "learning_rate": 6.584148410669178e-07, "loss": 0.017615901306271553, "memory(GiB)": 22.66, "step": 25952, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.958223 }, { "epoch": 0.8430952148913361, "grad_norm": 0.42818593978881836, "learning_rate": 6.581484331041294e-07, "loss": 0.011791359633207321, "memory(GiB)": 22.66, "step": 25953, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.958229 }, { "epoch": 0.8431277003540916, "grad_norm": 0.3452799618244171, "learning_rate": 6.578820752520876e-07, "loss": 0.013223773799836636, "memory(GiB)": 22.66, "step": 25954, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.958235 }, { "epoch": 0.8431601858168469, "grad_norm": 0.32831132411956787, "learning_rate": 6.57615767513865e-07, "loss": 0.01221671886742115, "memory(GiB)": 22.66, "step": 25955, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95824 }, { "epoch": 0.8431926712796024, "grad_norm": 0.30308690667152405, "learning_rate": 6.573495098925358e-07, "loss": 0.011465717107057571, "memory(GiB)": 22.66, "step": 25956, "token_acc": 1.0, "train_speed(iter/s)": 0.958246 }, { "epoch": 0.8432251567423578, "grad_norm": 0.42321857810020447, "learning_rate": 6.570833023911738e-07, "loss": 0.014790360815823078, "memory(GiB)": 22.66, "step": 25957, "token_acc": 1.0, "train_speed(iter/s)": 0.958253 }, { "epoch": 0.8432576422051132, "grad_norm": 0.34527263045310974, "learning_rate": 6.568171450128519e-07, "loss": 0.010739203542470932, "memory(GiB)": 22.66, "step": 25958, "token_acc": 1.0, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.8432901276678686, "grad_norm": 0.30549192428588867, "learning_rate": 6.565510377606399e-07, "loss": 0.01277160458266735, "memory(GiB)": 22.66, "step": 25959, "token_acc": 1.0, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.8433226131306241, "grad_norm": 0.49864012002944946, "learning_rate": 6.562849806376098e-07, "loss": 0.013754558749496937, "memory(GiB)": 22.66, "step": 25960, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.958272 }, { "epoch": 0.8433550985933794, "grad_norm": 0.35589832067489624, "learning_rate": 6.560189736468325e-07, "loss": 0.015465752221643925, "memory(GiB)": 22.66, "step": 25961, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.958279 }, { "epoch": 0.8433875840561349, "grad_norm": 0.24779921770095825, "learning_rate": 6.557530167913795e-07, "loss": 0.009067404083907604, "memory(GiB)": 22.66, "step": 25962, "token_acc": 1.0, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.8434200695188903, "grad_norm": 0.3822821378707886, "learning_rate": 6.554871100743177e-07, "loss": 0.009062720462679863, "memory(GiB)": 22.66, "step": 25963, "token_acc": 1.0, "train_speed(iter/s)": 0.958295 }, { "epoch": 0.8434525549816457, "grad_norm": 0.24656420946121216, "learning_rate": 6.552212534987168e-07, "loss": 0.008959479629993439, "memory(GiB)": 22.66, "step": 25964, "token_acc": 1.0, "train_speed(iter/s)": 0.958303 }, { "epoch": 0.8434850404444011, "grad_norm": 0.3838318884372711, "learning_rate": 6.549554470676456e-07, "loss": 0.01843593828380108, "memory(GiB)": 22.66, "step": 25965, "token_acc": 1.0, "train_speed(iter/s)": 0.958311 }, { "epoch": 0.8435175259071566, "grad_norm": 0.40699976682662964, "learning_rate": 6.546896907841721e-07, "loss": 0.014437483623623848, "memory(GiB)": 22.66, "step": 25966, "token_acc": 0.9939759036144579, "train_speed(iter/s)": 0.958318 }, { "epoch": 0.8435500113699119, "grad_norm": 0.456861287355423, "learning_rate": 6.544239846513644e-07, "loss": 0.011778291314840317, "memory(GiB)": 22.66, "step": 25967, "token_acc": 1.0, "train_speed(iter/s)": 0.958326 }, { "epoch": 0.8435824968326674, "grad_norm": 0.42099669575691223, "learning_rate": 6.541583286722864e-07, "loss": 0.018639270216226578, "memory(GiB)": 22.66, "step": 25968, "token_acc": 0.9885931558935361, "train_speed(iter/s)": 0.958334 }, { "epoch": 0.8436149822954228, "grad_norm": 0.35740551352500916, "learning_rate": 6.538927228500053e-07, "loss": 0.013564633205533028, "memory(GiB)": 22.66, "step": 25969, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958342 }, { "epoch": 0.8436474677581782, "grad_norm": 0.22540748119354248, "learning_rate": 6.536271671875877e-07, "loss": 0.012251734733581543, "memory(GiB)": 22.66, "step": 25970, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.95835 }, { "epoch": 0.8436799532209336, "grad_norm": 0.3599194586277008, "learning_rate": 6.533616616880978e-07, "loss": 0.008913062512874603, "memory(GiB)": 22.66, "step": 25971, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958358 }, { "epoch": 0.8437124386836891, "grad_norm": 0.24381417036056519, "learning_rate": 6.530962063545993e-07, "loss": 0.006824500858783722, "memory(GiB)": 22.66, "step": 25972, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958366 }, { "epoch": 0.8437449241464444, "grad_norm": 0.29769405722618103, "learning_rate": 6.528308011901568e-07, "loss": 0.010953575372695923, "memory(GiB)": 22.66, "step": 25973, "token_acc": 1.0, "train_speed(iter/s)": 0.958374 }, { "epoch": 0.8437774096091999, "grad_norm": 0.37924665212631226, "learning_rate": 6.525654461978309e-07, "loss": 0.00798318162560463, "memory(GiB)": 22.66, "step": 25974, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958382 }, { "epoch": 0.8438098950719553, "grad_norm": 0.28237879276275635, "learning_rate": 6.523001413806884e-07, "loss": 0.008427218534052372, "memory(GiB)": 22.66, "step": 25975, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.8438423805347107, "grad_norm": 0.3666190505027771, "learning_rate": 6.52034886741788e-07, "loss": 0.009897276759147644, "memory(GiB)": 22.66, "step": 25976, "token_acc": 1.0, "train_speed(iter/s)": 0.958397 }, { "epoch": 0.8438748659974661, "grad_norm": 0.33751070499420166, "learning_rate": 6.517696822841935e-07, "loss": 0.010348543524742126, "memory(GiB)": 22.66, "step": 25977, "token_acc": 1.0, "train_speed(iter/s)": 0.958404 }, { "epoch": 0.8439073514602216, "grad_norm": 0.46806785464286804, "learning_rate": 6.515045280109633e-07, "loss": 0.010467196814715862, "memory(GiB)": 22.66, "step": 25978, "token_acc": 1.0, "train_speed(iter/s)": 0.958412 }, { "epoch": 0.8439398369229769, "grad_norm": 0.3156375288963318, "learning_rate": 6.512394239251568e-07, "loss": 0.011363644152879715, "memory(GiB)": 22.66, "step": 25979, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958419 }, { "epoch": 0.8439723223857324, "grad_norm": 0.38477781414985657, "learning_rate": 6.509743700298382e-07, "loss": 0.02114448882639408, "memory(GiB)": 22.66, "step": 25980, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.958427 }, { "epoch": 0.8440048078484877, "grad_norm": 0.29185134172439575, "learning_rate": 6.507093663280628e-07, "loss": 0.011713267304003239, "memory(GiB)": 22.66, "step": 25981, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.958435 }, { "epoch": 0.8440372933112432, "grad_norm": 0.6264895796775818, "learning_rate": 6.504444128228915e-07, "loss": 0.020239882171154022, "memory(GiB)": 22.66, "step": 25982, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958443 }, { "epoch": 0.8440697787739986, "grad_norm": 0.31297674775123596, "learning_rate": 6.501795095173796e-07, "loss": 0.009046115912497044, "memory(GiB)": 22.66, "step": 25983, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958451 }, { "epoch": 0.844102264236754, "grad_norm": 0.4065716862678528, "learning_rate": 6.499146564145858e-07, "loss": 0.019191166386008263, "memory(GiB)": 22.66, "step": 25984, "token_acc": 1.0, "train_speed(iter/s)": 0.958459 }, { "epoch": 0.8441347496995095, "grad_norm": 0.2684646248817444, "learning_rate": 6.496498535175672e-07, "loss": 0.0062086982652544975, "memory(GiB)": 22.66, "step": 25985, "token_acc": 1.0, "train_speed(iter/s)": 0.958466 }, { "epoch": 0.8441672351622649, "grad_norm": 0.376422643661499, "learning_rate": 6.4938510082938e-07, "loss": 0.011447161436080933, "memory(GiB)": 22.66, "step": 25986, "token_acc": 1.0, "train_speed(iter/s)": 0.958474 }, { "epoch": 0.8441997206250204, "grad_norm": 0.2916158139705658, "learning_rate": 6.491203983530786e-07, "loss": 0.010350599884986877, "memory(GiB)": 22.66, "step": 25987, "token_acc": 1.0, "train_speed(iter/s)": 0.95848 }, { "epoch": 0.8442322060877757, "grad_norm": 0.5084115862846375, "learning_rate": 6.488557460917189e-07, "loss": 0.018901051953434944, "memory(GiB)": 22.66, "step": 25988, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958486 }, { "epoch": 0.8442646915505312, "grad_norm": 0.34488075971603394, "learning_rate": 6.48591144048355e-07, "loss": 0.010207529179751873, "memory(GiB)": 22.66, "step": 25989, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.958491 }, { "epoch": 0.8442971770132865, "grad_norm": 0.421125590801239, "learning_rate": 6.483265922260423e-07, "loss": 0.01334466878324747, "memory(GiB)": 22.66, "step": 25990, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958497 }, { "epoch": 0.844329662476042, "grad_norm": 0.3176041543483734, "learning_rate": 6.480620906278317e-07, "loss": 0.012331759557127953, "memory(GiB)": 22.66, "step": 25991, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.958504 }, { "epoch": 0.8443621479387974, "grad_norm": 0.3511282801628113, "learning_rate": 6.47797639256777e-07, "loss": 0.012567849829792976, "memory(GiB)": 22.66, "step": 25992, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95851 }, { "epoch": 0.8443946334015529, "grad_norm": 0.4076865613460541, "learning_rate": 6.475332381159305e-07, "loss": 0.011909907683730125, "memory(GiB)": 22.66, "step": 25993, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.958515 }, { "epoch": 0.8444271188643082, "grad_norm": 0.29222798347473145, "learning_rate": 6.472688872083443e-07, "loss": 0.010337082669138908, "memory(GiB)": 22.66, "step": 25994, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.958521 }, { "epoch": 0.8444596043270637, "grad_norm": 0.23001433908939362, "learning_rate": 6.470045865370677e-07, "loss": 0.0059667108580470085, "memory(GiB)": 22.66, "step": 25995, "token_acc": 1.0, "train_speed(iter/s)": 0.958526 }, { "epoch": 0.844492089789819, "grad_norm": 0.35248833894729614, "learning_rate": 6.467403361051522e-07, "loss": 0.011245441623032093, "memory(GiB)": 22.66, "step": 25996, "token_acc": 1.0, "train_speed(iter/s)": 0.958533 }, { "epoch": 0.8445245752525745, "grad_norm": 0.3395334780216217, "learning_rate": 6.464761359156469e-07, "loss": 0.014925265684723854, "memory(GiB)": 22.66, "step": 25997, "token_acc": 1.0, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.8445570607153299, "grad_norm": 0.3197837769985199, "learning_rate": 6.462119859716015e-07, "loss": 0.012398825958371162, "memory(GiB)": 22.66, "step": 25998, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.958544 }, { "epoch": 0.8445895461780853, "grad_norm": 0.31016623973846436, "learning_rate": 6.459478862760659e-07, "loss": 0.01259807962924242, "memory(GiB)": 22.66, "step": 25999, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.958549 }, { "epoch": 0.8446220316408407, "grad_norm": 0.4390396773815155, "learning_rate": 6.456838368320861e-07, "loss": 0.016775403171777725, "memory(GiB)": 22.66, "step": 26000, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958554 }, { "epoch": 0.8446220316408407, "eval_loss": 0.012255133129656315, "eval_runtime": 81.0173, "eval_samples_per_second": 122.813, "eval_steps_per_second": 3.839, "eval_token_acc": 0.9950404741924468, "step": 26000 }, { "epoch": 0.8446545171035962, "grad_norm": 0.6708881258964539, "learning_rate": 6.454198376427117e-07, "loss": 0.016691159456968307, "memory(GiB)": 22.66, "step": 26001, "token_acc": 0.9947516272696129, "train_speed(iter/s)": 0.955303 }, { "epoch": 0.8446870025663515, "grad_norm": 0.6128807067871094, "learning_rate": 6.451558887109854e-07, "loss": 0.01835767552256584, "memory(GiB)": 22.66, "step": 26002, "token_acc": 1.0, "train_speed(iter/s)": 0.955308 }, { "epoch": 0.844719488029107, "grad_norm": 0.3375774323940277, "learning_rate": 6.448919900399592e-07, "loss": 0.012084638699889183, "memory(GiB)": 22.66, "step": 26003, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.8447519734918624, "grad_norm": 0.28032657504081726, "learning_rate": 6.446281416326749e-07, "loss": 0.011277401819825172, "memory(GiB)": 22.66, "step": 26004, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955321 }, { "epoch": 0.8447844589546178, "grad_norm": 0.36497780680656433, "learning_rate": 6.443643434921798e-07, "loss": 0.009652460925281048, "memory(GiB)": 22.66, "step": 26005, "token_acc": 1.0, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.8448169444173732, "grad_norm": 0.310499370098114, "learning_rate": 6.441005956215163e-07, "loss": 0.010225256904959679, "memory(GiB)": 22.66, "step": 26006, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.8448494298801287, "grad_norm": 0.3472634255886078, "learning_rate": 6.438368980237297e-07, "loss": 0.011494605801999569, "memory(GiB)": 22.66, "step": 26007, "token_acc": 0.9966996699669967, "train_speed(iter/s)": 0.955338 }, { "epoch": 0.844881915342884, "grad_norm": 0.29530593752861023, "learning_rate": 6.435732507018633e-07, "loss": 0.011074064299464226, "memory(GiB)": 22.66, "step": 26008, "token_acc": 1.0, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.8449144008056395, "grad_norm": 0.3621229827404022, "learning_rate": 6.433096536589612e-07, "loss": 0.016009390354156494, "memory(GiB)": 22.66, "step": 26009, "token_acc": 1.0, "train_speed(iter/s)": 0.955349 }, { "epoch": 0.8449468862683949, "grad_norm": 0.446970134973526, "learning_rate": 6.430461068980632e-07, "loss": 0.011626400984823704, "memory(GiB)": 22.66, "step": 26010, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.8449793717311503, "grad_norm": 0.4265630543231964, "learning_rate": 6.427826104222124e-07, "loss": 0.013689564540982246, "memory(GiB)": 22.66, "step": 26011, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95536 }, { "epoch": 0.8450118571939057, "grad_norm": 0.214692160487175, "learning_rate": 6.425191642344491e-07, "loss": 0.009228012524545193, "memory(GiB)": 22.66, "step": 26012, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955365 }, { "epoch": 0.8450443426566612, "grad_norm": 0.26418936252593994, "learning_rate": 6.42255768337815e-07, "loss": 0.010316621512174606, "memory(GiB)": 22.66, "step": 26013, "token_acc": 1.0, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.8450768281194165, "grad_norm": 0.2835904657840729, "learning_rate": 6.419924227353508e-07, "loss": 0.01241029892116785, "memory(GiB)": 22.66, "step": 26014, "token_acc": 1.0, "train_speed(iter/s)": 0.955376 }, { "epoch": 0.845109313582172, "grad_norm": 0.31098607182502747, "learning_rate": 6.41729127430093e-07, "loss": 0.01061050221323967, "memory(GiB)": 22.66, "step": 26015, "token_acc": 1.0, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.8451417990449274, "grad_norm": 0.3747483789920807, "learning_rate": 6.414658824250819e-07, "loss": 0.015808047726750374, "memory(GiB)": 22.66, "step": 26016, "token_acc": 1.0, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.8451742845076828, "grad_norm": 0.9861740469932556, "learning_rate": 6.412026877233563e-07, "loss": 0.011626871302723885, "memory(GiB)": 22.66, "step": 26017, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955396 }, { "epoch": 0.8452067699704382, "grad_norm": 0.3002094030380249, "learning_rate": 6.409395433279536e-07, "loss": 0.01216758880764246, "memory(GiB)": 22.66, "step": 26018, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.955402 }, { "epoch": 0.8452392554331937, "grad_norm": 0.3337247967720032, "learning_rate": 6.406764492419098e-07, "loss": 0.007900855503976345, "memory(GiB)": 22.66, "step": 26019, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.845271740895949, "grad_norm": 0.41952240467071533, "learning_rate": 6.40413405468262e-07, "loss": 0.014748998917639256, "memory(GiB)": 22.66, "step": 26020, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.8453042263587045, "grad_norm": 0.6674925684928894, "learning_rate": 6.401504120100461e-07, "loss": 0.01347571425139904, "memory(GiB)": 22.66, "step": 26021, "token_acc": 1.0, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.8453367118214599, "grad_norm": 0.2072562724351883, "learning_rate": 6.398874688702983e-07, "loss": 0.006643571425229311, "memory(GiB)": 22.66, "step": 26022, "token_acc": 1.0, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.8453691972842153, "grad_norm": 0.2141859084367752, "learning_rate": 6.396245760520514e-07, "loss": 0.0062870760448277, "memory(GiB)": 22.66, "step": 26023, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955434 }, { "epoch": 0.8454016827469707, "grad_norm": 0.36491307616233826, "learning_rate": 6.393617335583413e-07, "loss": 0.00943784136325121, "memory(GiB)": 22.66, "step": 26024, "token_acc": 1.0, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.8454341682097262, "grad_norm": 0.3261439800262451, "learning_rate": 6.390989413921995e-07, "loss": 0.0055608125403523445, "memory(GiB)": 22.66, "step": 26025, "token_acc": 1.0, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.8454666536724815, "grad_norm": 0.3554382622241974, "learning_rate": 6.388361995566606e-07, "loss": 0.01451907865703106, "memory(GiB)": 22.66, "step": 26026, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.845499139135237, "grad_norm": 0.42506784200668335, "learning_rate": 6.385735080547578e-07, "loss": 0.006936185993254185, "memory(GiB)": 22.66, "step": 26027, "token_acc": 1.0, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.8455316245979924, "grad_norm": 0.4642265737056732, "learning_rate": 6.383108668895211e-07, "loss": 0.012361813336610794, "memory(GiB)": 22.66, "step": 26028, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.8455641100607478, "grad_norm": 0.36301353573799133, "learning_rate": 6.380482760639828e-07, "loss": 0.014547955244779587, "memory(GiB)": 22.66, "step": 26029, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.8455965955235032, "grad_norm": 0.5009778738021851, "learning_rate": 6.377857355811712e-07, "loss": 0.019506053999066353, "memory(GiB)": 22.66, "step": 26030, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.8456290809862587, "grad_norm": 1.812650203704834, "learning_rate": 6.37523245444121e-07, "loss": 0.010558603331446648, "memory(GiB)": 22.66, "step": 26031, "token_acc": 0.9939759036144579, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.845661566449014, "grad_norm": 0.44982609152793884, "learning_rate": 6.372608056558571e-07, "loss": 0.012533874250948429, "memory(GiB)": 22.66, "step": 26032, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955481 }, { "epoch": 0.8456940519117695, "grad_norm": 0.4437296688556671, "learning_rate": 6.369984162194121e-07, "loss": 0.01397346705198288, "memory(GiB)": 22.66, "step": 26033, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.955489 }, { "epoch": 0.8457265373745249, "grad_norm": 0.19047893583774567, "learning_rate": 6.367360771378112e-07, "loss": 0.0036550546064972878, "memory(GiB)": 22.66, "step": 26034, "token_acc": 1.0, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.8457590228372803, "grad_norm": 0.3978557586669922, "learning_rate": 6.364737884140831e-07, "loss": 0.01443181000649929, "memory(GiB)": 22.66, "step": 26035, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.8457915083000357, "grad_norm": 0.2499149888753891, "learning_rate": 6.362115500512561e-07, "loss": 0.009776250459253788, "memory(GiB)": 22.66, "step": 26036, "token_acc": 1.0, "train_speed(iter/s)": 0.955513 }, { "epoch": 0.8458239937627912, "grad_norm": 0.32406026124954224, "learning_rate": 6.359493620523566e-07, "loss": 0.007603141479194164, "memory(GiB)": 22.66, "step": 26037, "token_acc": 1.0, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.8458564792255465, "grad_norm": 0.3598646819591522, "learning_rate": 6.356872244204087e-07, "loss": 0.008251870982348919, "memory(GiB)": 22.66, "step": 26038, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.955528 }, { "epoch": 0.845888964688302, "grad_norm": 0.28451991081237793, "learning_rate": 6.354251371584391e-07, "loss": 0.005923153832554817, "memory(GiB)": 22.66, "step": 26039, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955536 }, { "epoch": 0.8459214501510574, "grad_norm": 0.37662407755851746, "learning_rate": 6.35163100269473e-07, "loss": 0.013509380631148815, "memory(GiB)": 22.66, "step": 26040, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.8459539356138128, "grad_norm": 0.3653636574745178, "learning_rate": 6.349011137565342e-07, "loss": 0.010442038998007774, "memory(GiB)": 22.66, "step": 26041, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.8459864210765682, "grad_norm": 0.26492297649383545, "learning_rate": 6.346391776226479e-07, "loss": 0.007369093596935272, "memory(GiB)": 22.66, "step": 26042, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95556 }, { "epoch": 0.8460189065393237, "grad_norm": 0.2628381848335266, "learning_rate": 6.343772918708341e-07, "loss": 0.008740751072764397, "memory(GiB)": 22.66, "step": 26043, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.955568 }, { "epoch": 0.846051392002079, "grad_norm": 0.49787965416908264, "learning_rate": 6.341154565041174e-07, "loss": 0.015015212818980217, "memory(GiB)": 22.66, "step": 26044, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.8460838774648345, "grad_norm": 0.28309136629104614, "learning_rate": 6.338536715255189e-07, "loss": 0.009969191625714302, "memory(GiB)": 22.66, "step": 26045, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955583 }, { "epoch": 0.8461163629275898, "grad_norm": 0.5479093194007874, "learning_rate": 6.335919369380622e-07, "loss": 0.017147183418273926, "memory(GiB)": 22.66, "step": 26046, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.8461488483903453, "grad_norm": 0.4287148416042328, "learning_rate": 6.333302527447643e-07, "loss": 0.013269219547510147, "memory(GiB)": 22.66, "step": 26047, "token_acc": 1.0, "train_speed(iter/s)": 0.955598 }, { "epoch": 0.8461813338531008, "grad_norm": 0.46399378776550293, "learning_rate": 6.330686189486479e-07, "loss": 0.012656966224312782, "memory(GiB)": 22.66, "step": 26048, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.8462138193158562, "grad_norm": 0.267936646938324, "learning_rate": 6.328070355527316e-07, "loss": 0.008645293302834034, "memory(GiB)": 22.66, "step": 26049, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.8462463047786116, "grad_norm": 0.37025219202041626, "learning_rate": 6.325455025600363e-07, "loss": 0.00740706454962492, "memory(GiB)": 22.66, "step": 26050, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.846278790241367, "grad_norm": 0.3638569712638855, "learning_rate": 6.322840199735775e-07, "loss": 0.015799475833773613, "memory(GiB)": 22.66, "step": 26051, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.8463112757041225, "grad_norm": 0.29244115948677063, "learning_rate": 6.32022587796376e-07, "loss": 0.009819656610488892, "memory(GiB)": 22.66, "step": 26052, "token_acc": 1.0, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.8463437611668778, "grad_norm": 0.36476537585258484, "learning_rate": 6.317612060314448e-07, "loss": 0.014571920968592167, "memory(GiB)": 22.66, "step": 26053, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955645 }, { "epoch": 0.8463762466296333, "grad_norm": 4.970794677734375, "learning_rate": 6.314998746818063e-07, "loss": 0.015060478821396828, "memory(GiB)": 22.66, "step": 26054, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955653 }, { "epoch": 0.8464087320923886, "grad_norm": 0.3801976144313812, "learning_rate": 6.312385937504723e-07, "loss": 0.013605739921331406, "memory(GiB)": 22.66, "step": 26055, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.8464412175551441, "grad_norm": 0.39430519938468933, "learning_rate": 6.309773632404592e-07, "loss": 0.01208660937845707, "memory(GiB)": 22.66, "step": 26056, "token_acc": 1.0, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.8464737030178995, "grad_norm": 0.1821456253528595, "learning_rate": 6.307161831547831e-07, "loss": 0.005782881751656532, "memory(GiB)": 22.66, "step": 26057, "token_acc": 1.0, "train_speed(iter/s)": 0.955672 }, { "epoch": 0.846506188480655, "grad_norm": 0.36559733748435974, "learning_rate": 6.304550534964576e-07, "loss": 0.011221511289477348, "memory(GiB)": 22.66, "step": 26058, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.8465386739434103, "grad_norm": 0.31250637769699097, "learning_rate": 6.301939742684982e-07, "loss": 0.00882788561284542, "memory(GiB)": 22.66, "step": 26059, "token_acc": 1.0, "train_speed(iter/s)": 0.955685 }, { "epoch": 0.8465711594061658, "grad_norm": 0.45046427845954895, "learning_rate": 6.29932945473915e-07, "loss": 0.010419207625091076, "memory(GiB)": 22.66, "step": 26060, "token_acc": 1.0, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.8466036448689211, "grad_norm": 0.3466499447822571, "learning_rate": 6.296719671157231e-07, "loss": 0.012069609016180038, "memory(GiB)": 22.66, "step": 26061, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.8466361303316766, "grad_norm": 0.42721033096313477, "learning_rate": 6.294110391969315e-07, "loss": 0.015883998945355415, "memory(GiB)": 22.66, "step": 26062, "token_acc": 1.0, "train_speed(iter/s)": 0.955701 }, { "epoch": 0.846668615794432, "grad_norm": 0.3662865459918976, "learning_rate": 6.291501617205559e-07, "loss": 0.015070796944200993, "memory(GiB)": 22.66, "step": 26063, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955706 }, { "epoch": 0.8467011012571874, "grad_norm": 0.34355515241622925, "learning_rate": 6.288893346896041e-07, "loss": 0.013311349786818027, "memory(GiB)": 22.66, "step": 26064, "token_acc": 0.992, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.8467335867199428, "grad_norm": 0.6158459186553955, "learning_rate": 6.286285581070883e-07, "loss": 0.009175935760140419, "memory(GiB)": 22.66, "step": 26065, "token_acc": 1.0, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.8467660721826983, "grad_norm": 0.34362706542015076, "learning_rate": 6.283678319760161e-07, "loss": 0.009266961365938187, "memory(GiB)": 22.66, "step": 26066, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955724 }, { "epoch": 0.8467985576454536, "grad_norm": 1.1706452369689941, "learning_rate": 6.281071562993979e-07, "loss": 0.008823461830615997, "memory(GiB)": 22.66, "step": 26067, "token_acc": 1.0, "train_speed(iter/s)": 0.95573 }, { "epoch": 0.8468310431082091, "grad_norm": 0.47112414240837097, "learning_rate": 6.278465310802423e-07, "loss": 0.01450793445110321, "memory(GiB)": 22.66, "step": 26068, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.8468635285709645, "grad_norm": 0.241660475730896, "learning_rate": 6.275859563215575e-07, "loss": 0.010028529912233353, "memory(GiB)": 22.66, "step": 26069, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955742 }, { "epoch": 0.84689601403372, "grad_norm": 0.3278028666973114, "learning_rate": 6.2732543202635e-07, "loss": 0.011146919801831245, "memory(GiB)": 22.66, "step": 26070, "token_acc": 1.0, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.8469284994964753, "grad_norm": 0.516823410987854, "learning_rate": 6.270649581976268e-07, "loss": 0.01824306882917881, "memory(GiB)": 22.66, "step": 26071, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955754 }, { "epoch": 0.8469609849592308, "grad_norm": 0.40457406640052795, "learning_rate": 6.268045348383944e-07, "loss": 0.012126614339649677, "memory(GiB)": 22.66, "step": 26072, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.8469934704219861, "grad_norm": 0.3013545572757721, "learning_rate": 6.265441619516582e-07, "loss": 0.009438721463084221, "memory(GiB)": 22.66, "step": 26073, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.955765 }, { "epoch": 0.8470259558847416, "grad_norm": 0.39597949385643005, "learning_rate": 6.262838395404241e-07, "loss": 0.010482076555490494, "memory(GiB)": 22.66, "step": 26074, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.847058441347497, "grad_norm": 0.2783883512020111, "learning_rate": 6.260235676076954e-07, "loss": 0.007069717161357403, "memory(GiB)": 22.66, "step": 26075, "token_acc": 1.0, "train_speed(iter/s)": 0.955777 }, { "epoch": 0.8470909268102524, "grad_norm": 0.3726098835468292, "learning_rate": 6.257633461564766e-07, "loss": 0.014907065778970718, "memory(GiB)": 22.66, "step": 26076, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955784 }, { "epoch": 0.8471234122730078, "grad_norm": 0.47482070326805115, "learning_rate": 6.255031751897706e-07, "loss": 0.019633766263723373, "memory(GiB)": 22.66, "step": 26077, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.8471558977357633, "grad_norm": 0.42122799158096313, "learning_rate": 6.252430547105814e-07, "loss": 0.011310476809740067, "memory(GiB)": 22.66, "step": 26078, "token_acc": 1.0, "train_speed(iter/s)": 0.955796 }, { "epoch": 0.8471883831985186, "grad_norm": 0.2788389325141907, "learning_rate": 6.249829847219091e-07, "loss": 0.00820503942668438, "memory(GiB)": 22.66, "step": 26079, "token_acc": 1.0, "train_speed(iter/s)": 0.955802 }, { "epoch": 0.8472208686612741, "grad_norm": 0.4108540117740631, "learning_rate": 6.247229652267567e-07, "loss": 0.013947682455182076, "memory(GiB)": 22.66, "step": 26080, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.8472533541240295, "grad_norm": 0.30547988414764404, "learning_rate": 6.244629962281245e-07, "loss": 0.01463038194924593, "memory(GiB)": 22.66, "step": 26081, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.8472858395867849, "grad_norm": 0.2441890835762024, "learning_rate": 6.242030777290142e-07, "loss": 0.007625156082212925, "memory(GiB)": 22.66, "step": 26082, "token_acc": 1.0, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.8473183250495403, "grad_norm": 0.31164538860321045, "learning_rate": 6.239432097324233e-07, "loss": 0.008763022720813751, "memory(GiB)": 22.66, "step": 26083, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.8473508105122958, "grad_norm": 0.40925291180610657, "learning_rate": 6.236833922413532e-07, "loss": 0.012486517429351807, "memory(GiB)": 22.66, "step": 26084, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.8473832959750511, "grad_norm": 0.40673941373825073, "learning_rate": 6.234236252588e-07, "loss": 0.019251424819231033, "memory(GiB)": 22.66, "step": 26085, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95584 }, { "epoch": 0.8474157814378066, "grad_norm": 0.3671466112136841, "learning_rate": 6.231639087877656e-07, "loss": 0.016990330070257187, "memory(GiB)": 22.66, "step": 26086, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.847448266900562, "grad_norm": 0.3695243299007416, "learning_rate": 6.229042428312438e-07, "loss": 0.011646632105112076, "memory(GiB)": 22.66, "step": 26087, "token_acc": 1.0, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.8474807523633174, "grad_norm": 0.2779668867588043, "learning_rate": 6.226446273922327e-07, "loss": 0.009992046281695366, "memory(GiB)": 22.66, "step": 26088, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955859 }, { "epoch": 0.8475132378260728, "grad_norm": 0.2556656301021576, "learning_rate": 6.223850624737305e-07, "loss": 0.009398303925991058, "memory(GiB)": 22.66, "step": 26089, "token_acc": 1.0, "train_speed(iter/s)": 0.955866 }, { "epoch": 0.8475457232888283, "grad_norm": 0.22086402773857117, "learning_rate": 6.221255480787286e-07, "loss": 0.005492295138537884, "memory(GiB)": 22.66, "step": 26090, "token_acc": 1.0, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.8475782087515836, "grad_norm": 0.4228525459766388, "learning_rate": 6.218660842102269e-07, "loss": 0.020161647349596024, "memory(GiB)": 22.66, "step": 26091, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955877 }, { "epoch": 0.8476106942143391, "grad_norm": 0.4013778567314148, "learning_rate": 6.216066708712176e-07, "loss": 0.013882294297218323, "memory(GiB)": 22.66, "step": 26092, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.8476431796770945, "grad_norm": 0.3018800616264343, "learning_rate": 6.213473080646954e-07, "loss": 0.009206954389810562, "memory(GiB)": 22.66, "step": 26093, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95589 }, { "epoch": 0.8476756651398499, "grad_norm": 0.5032140016555786, "learning_rate": 6.210879957936522e-07, "loss": 0.01669495739042759, "memory(GiB)": 22.66, "step": 26094, "token_acc": 1.0, "train_speed(iter/s)": 0.955897 }, { "epoch": 0.8477081506026053, "grad_norm": 0.3400956690311432, "learning_rate": 6.208287340610819e-07, "loss": 0.012692606076598167, "memory(GiB)": 22.66, "step": 26095, "token_acc": 1.0, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.8477406360653608, "grad_norm": 0.4119206666946411, "learning_rate": 6.205695228699771e-07, "loss": 0.013059118762612343, "memory(GiB)": 22.66, "step": 26096, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95591 }, { "epoch": 0.8477731215281161, "grad_norm": 0.4734923541545868, "learning_rate": 6.203103622233291e-07, "loss": 0.013856923207640648, "memory(GiB)": 22.66, "step": 26097, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.8478056069908716, "grad_norm": 0.5020034909248352, "learning_rate": 6.200512521241286e-07, "loss": 0.01656024344265461, "memory(GiB)": 22.66, "step": 26098, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.955925 }, { "epoch": 0.847838092453627, "grad_norm": 0.38040122389793396, "learning_rate": 6.197921925753664e-07, "loss": 0.014608856290578842, "memory(GiB)": 22.66, "step": 26099, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955933 }, { "epoch": 0.8478705779163824, "grad_norm": 0.5557081699371338, "learning_rate": 6.195331835800317e-07, "loss": 0.0215863399207592, "memory(GiB)": 22.66, "step": 26100, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955941 }, { "epoch": 0.8479030633791378, "grad_norm": 0.2993793785572052, "learning_rate": 6.19274225141116e-07, "loss": 0.008335186168551445, "memory(GiB)": 22.66, "step": 26101, "token_acc": 1.0, "train_speed(iter/s)": 0.955949 }, { "epoch": 0.8479355488418933, "grad_norm": 0.47513365745544434, "learning_rate": 6.190153172616054e-07, "loss": 0.011277785524725914, "memory(GiB)": 22.66, "step": 26102, "token_acc": 1.0, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.8479680343046486, "grad_norm": 0.37893417477607727, "learning_rate": 6.18756459944489e-07, "loss": 0.013262107037007809, "memory(GiB)": 22.66, "step": 26103, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955965 }, { "epoch": 0.8480005197674041, "grad_norm": 0.3998287618160248, "learning_rate": 6.184976531927545e-07, "loss": 0.015865188091993332, "memory(GiB)": 22.66, "step": 26104, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.8480330052301595, "grad_norm": 0.3066272735595703, "learning_rate": 6.182388970093889e-07, "loss": 0.011459629982709885, "memory(GiB)": 22.66, "step": 26105, "token_acc": 1.0, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.8480654906929149, "grad_norm": 0.18236371874809265, "learning_rate": 6.179801913973799e-07, "loss": 0.006979013793170452, "memory(GiB)": 22.66, "step": 26106, "token_acc": 1.0, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.8480979761556703, "grad_norm": 0.364939421415329, "learning_rate": 6.1772153635971e-07, "loss": 0.014376426115632057, "memory(GiB)": 22.66, "step": 26107, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.8481304616184258, "grad_norm": 0.2401290237903595, "learning_rate": 6.174629318993669e-07, "loss": 0.006121618673205376, "memory(GiB)": 22.66, "step": 26108, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956004 }, { "epoch": 0.8481629470811811, "grad_norm": 0.3921893835067749, "learning_rate": 6.172043780193349e-07, "loss": 0.012368869967758656, "memory(GiB)": 22.66, "step": 26109, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.956012 }, { "epoch": 0.8481954325439366, "grad_norm": 0.3193855583667755, "learning_rate": 6.169458747225987e-07, "loss": 0.010827489197254181, "memory(GiB)": 22.66, "step": 26110, "token_acc": 1.0, "train_speed(iter/s)": 0.956019 }, { "epoch": 0.848227918006692, "grad_norm": 0.3087974786758423, "learning_rate": 6.1668742201214e-07, "loss": 0.009801211766898632, "memory(GiB)": 22.66, "step": 26111, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.8482604034694474, "grad_norm": 0.2938801646232605, "learning_rate": 6.164290198909439e-07, "loss": 0.011681096628308296, "memory(GiB)": 22.66, "step": 26112, "token_acc": 1.0, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.8482928889322029, "grad_norm": 0.3502506911754608, "learning_rate": 6.161706683619889e-07, "loss": 0.010505275800824165, "memory(GiB)": 22.66, "step": 26113, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956043 }, { "epoch": 0.8483253743949583, "grad_norm": 0.4080606698989868, "learning_rate": 6.159123674282613e-07, "loss": 0.01643148437142372, "memory(GiB)": 22.66, "step": 26114, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.8483578598577137, "grad_norm": 0.3501552939414978, "learning_rate": 6.156541170927394e-07, "loss": 0.014788597822189331, "memory(GiB)": 22.66, "step": 26115, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.8483903453204691, "grad_norm": 0.314241886138916, "learning_rate": 6.153959173584057e-07, "loss": 0.014608222059905529, "memory(GiB)": 22.66, "step": 26116, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956062 }, { "epoch": 0.8484228307832246, "grad_norm": 0.46626517176628113, "learning_rate": 6.151377682282378e-07, "loss": 0.008590498939156532, "memory(GiB)": 22.66, "step": 26117, "token_acc": 1.0, "train_speed(iter/s)": 0.956068 }, { "epoch": 0.8484553162459799, "grad_norm": 0.2751672565937042, "learning_rate": 6.14879669705215e-07, "loss": 0.009301070123910904, "memory(GiB)": 22.66, "step": 26118, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.8484878017087354, "grad_norm": 0.40835270285606384, "learning_rate": 6.146216217923196e-07, "loss": 0.012294951826334, "memory(GiB)": 22.66, "step": 26119, "token_acc": 1.0, "train_speed(iter/s)": 0.95608 }, { "epoch": 0.8485202871714908, "grad_norm": 0.374720960855484, "learning_rate": 6.143636244925267e-07, "loss": 0.00961260311305523, "memory(GiB)": 22.66, "step": 26120, "token_acc": 1.0, "train_speed(iter/s)": 0.956085 }, { "epoch": 0.8485527726342462, "grad_norm": 0.27514514327049255, "learning_rate": 6.141056778088161e-07, "loss": 0.010605650022625923, "memory(GiB)": 22.66, "step": 26121, "token_acc": 1.0, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.8485852580970016, "grad_norm": 0.3530980348587036, "learning_rate": 6.138477817441623e-07, "loss": 0.009522461332380772, "memory(GiB)": 22.66, "step": 26122, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.848617743559757, "grad_norm": 0.35784712433815, "learning_rate": 6.135899363015429e-07, "loss": 0.012706873938441277, "memory(GiB)": 22.66, "step": 26123, "token_acc": 1.0, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.8486502290225124, "grad_norm": 0.33764347434043884, "learning_rate": 6.133321414839338e-07, "loss": 0.016114328056573868, "memory(GiB)": 22.66, "step": 26124, "token_acc": 1.0, "train_speed(iter/s)": 0.956109 }, { "epoch": 0.8486827144852679, "grad_norm": 0.3297063112258911, "learning_rate": 6.130743972943121e-07, "loss": 0.009557521902024746, "memory(GiB)": 22.66, "step": 26125, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956115 }, { "epoch": 0.8487151999480232, "grad_norm": 0.3180774450302124, "learning_rate": 6.128167037356492e-07, "loss": 0.009027578867971897, "memory(GiB)": 22.66, "step": 26126, "token_acc": 0.9966442953020134, "train_speed(iter/s)": 0.95612 }, { "epoch": 0.8487476854107787, "grad_norm": 1.5003443956375122, "learning_rate": 6.125590608109211e-07, "loss": 0.010160477831959724, "memory(GiB)": 22.66, "step": 26127, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.8487801708735341, "grad_norm": 0.3421153426170349, "learning_rate": 6.123014685231016e-07, "loss": 0.013526221737265587, "memory(GiB)": 22.66, "step": 26128, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.8488126563362896, "grad_norm": 0.31146159768104553, "learning_rate": 6.120439268751632e-07, "loss": 0.008428149856626987, "memory(GiB)": 22.66, "step": 26129, "token_acc": 1.0, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.8488451417990449, "grad_norm": 0.4130975306034088, "learning_rate": 6.117864358700781e-07, "loss": 0.011256786063313484, "memory(GiB)": 22.66, "step": 26130, "token_acc": 1.0, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.8488776272618004, "grad_norm": 0.37172725796699524, "learning_rate": 6.115289955108178e-07, "loss": 0.014035645872354507, "memory(GiB)": 22.66, "step": 26131, "token_acc": 1.0, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.8489101127245557, "grad_norm": 0.3061077296733856, "learning_rate": 6.112716058003537e-07, "loss": 0.010182926431298256, "memory(GiB)": 22.66, "step": 26132, "token_acc": 1.0, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.8489425981873112, "grad_norm": 0.37065398693084717, "learning_rate": 6.110142667416586e-07, "loss": 0.012362897396087646, "memory(GiB)": 22.66, "step": 26133, "token_acc": 0.9965870307167235, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.8489750836500666, "grad_norm": 0.386227548122406, "learning_rate": 6.107569783376987e-07, "loss": 0.012337801977992058, "memory(GiB)": 22.66, "step": 26134, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956167 }, { "epoch": 0.849007569112822, "grad_norm": 0.20304788649082184, "learning_rate": 6.104997405914453e-07, "loss": 0.00488428445532918, "memory(GiB)": 22.66, "step": 26135, "token_acc": 1.0, "train_speed(iter/s)": 0.956174 }, { "epoch": 0.8490400545755774, "grad_norm": 0.3288829028606415, "learning_rate": 6.102425535058675e-07, "loss": 0.010537369176745415, "memory(GiB)": 22.66, "step": 26136, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.8490725400383329, "grad_norm": 0.2522812783718109, "learning_rate": 6.099854170839337e-07, "loss": 0.01590747758746147, "memory(GiB)": 22.66, "step": 26137, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956188 }, { "epoch": 0.8491050255010882, "grad_norm": 0.4152754247188568, "learning_rate": 6.097283313286123e-07, "loss": 0.016487006098031998, "memory(GiB)": 22.66, "step": 26138, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956194 }, { "epoch": 0.8491375109638437, "grad_norm": 0.3571341037750244, "learning_rate": 6.094712962428679e-07, "loss": 0.01050992775708437, "memory(GiB)": 22.66, "step": 26139, "token_acc": 1.0, "train_speed(iter/s)": 0.9562 }, { "epoch": 0.8491699964265991, "grad_norm": 0.2879014015197754, "learning_rate": 6.092143118296701e-07, "loss": 0.005192706361413002, "memory(GiB)": 22.66, "step": 26140, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.8492024818893545, "grad_norm": 0.531970739364624, "learning_rate": 6.089573780919806e-07, "loss": 0.020777646452188492, "memory(GiB)": 22.66, "step": 26141, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.8492349673521099, "grad_norm": 0.373395174741745, "learning_rate": 6.087004950327702e-07, "loss": 0.01065370999276638, "memory(GiB)": 22.66, "step": 26142, "token_acc": 1.0, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.8492674528148654, "grad_norm": 0.31270405650138855, "learning_rate": 6.084436626549994e-07, "loss": 0.0105581758543849, "memory(GiB)": 22.66, "step": 26143, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956226 }, { "epoch": 0.8492999382776207, "grad_norm": 0.3306601643562317, "learning_rate": 6.081868809616348e-07, "loss": 0.010924020782113075, "memory(GiB)": 22.66, "step": 26144, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.8493324237403762, "grad_norm": 0.3189944624900818, "learning_rate": 6.079301499556384e-07, "loss": 0.01404769066721201, "memory(GiB)": 22.66, "step": 26145, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956239 }, { "epoch": 0.8493649092031316, "grad_norm": 0.33945709466934204, "learning_rate": 6.076734696399733e-07, "loss": 0.009553520008921623, "memory(GiB)": 22.66, "step": 26146, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.849397394665887, "grad_norm": 0.3815416097640991, "learning_rate": 6.074168400176034e-07, "loss": 0.00943172350525856, "memory(GiB)": 22.66, "step": 26147, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.8494298801286424, "grad_norm": 0.33133432269096375, "learning_rate": 6.071602610914901e-07, "loss": 0.009286634624004364, "memory(GiB)": 22.66, "step": 26148, "token_acc": 1.0, "train_speed(iter/s)": 0.956256 }, { "epoch": 0.8494623655913979, "grad_norm": 0.25374317169189453, "learning_rate": 6.069037328645933e-07, "loss": 0.010702421888709068, "memory(GiB)": 22.66, "step": 26149, "token_acc": 1.0, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.8494948510541532, "grad_norm": 0.36330774426460266, "learning_rate": 6.066472553398744e-07, "loss": 0.01174818817526102, "memory(GiB)": 22.66, "step": 26150, "token_acc": 1.0, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.8495273365169087, "grad_norm": 0.33440375328063965, "learning_rate": 6.063908285202941e-07, "loss": 0.010738417506217957, "memory(GiB)": 22.66, "step": 26151, "token_acc": 1.0, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.8495598219796641, "grad_norm": 0.32446596026420593, "learning_rate": 6.061344524088114e-07, "loss": 0.014746811240911484, "memory(GiB)": 22.66, "step": 26152, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.8495923074424195, "grad_norm": 0.39823460578918457, "learning_rate": 6.058781270083868e-07, "loss": 0.014344148337841034, "memory(GiB)": 22.66, "step": 26153, "token_acc": 1.0, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.8496247929051749, "grad_norm": 0.5688152313232422, "learning_rate": 6.056218523219754e-07, "loss": 0.016668885946273804, "memory(GiB)": 22.66, "step": 26154, "token_acc": 1.0, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.8496572783679304, "grad_norm": 0.34091147780418396, "learning_rate": 6.053656283525372e-07, "loss": 0.011357059702277184, "memory(GiB)": 22.66, "step": 26155, "token_acc": 1.0, "train_speed(iter/s)": 0.9563 }, { "epoch": 0.8496897638306857, "grad_norm": 0.3797690272331238, "learning_rate": 6.05109455103029e-07, "loss": 0.013713639229536057, "memory(GiB)": 22.66, "step": 26156, "token_acc": 0.995, "train_speed(iter/s)": 0.956306 }, { "epoch": 0.8497222492934412, "grad_norm": 0.32053670287132263, "learning_rate": 6.048533325764078e-07, "loss": 0.01075682695955038, "memory(GiB)": 22.66, "step": 26157, "token_acc": 1.0, "train_speed(iter/s)": 0.956313 }, { "epoch": 0.8497547347561966, "grad_norm": 0.33066701889038086, "learning_rate": 6.045972607756284e-07, "loss": 0.007872043177485466, "memory(GiB)": 22.66, "step": 26158, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.849787220218952, "grad_norm": 0.9698542952537537, "learning_rate": 6.043412397036469e-07, "loss": 0.02092551440000534, "memory(GiB)": 22.66, "step": 26159, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.8498197056817074, "grad_norm": 0.4983598291873932, "learning_rate": 6.04085269363418e-07, "loss": 0.011413965374231339, "memory(GiB)": 22.66, "step": 26160, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.8498521911444629, "grad_norm": 0.3553076386451721, "learning_rate": 6.038293497578973e-07, "loss": 0.011994367465376854, "memory(GiB)": 22.66, "step": 26161, "token_acc": 1.0, "train_speed(iter/s)": 0.95634 }, { "epoch": 0.8498846766072182, "grad_norm": 0.25966936349868774, "learning_rate": 6.035734808900356e-07, "loss": 0.009083179756999016, "memory(GiB)": 22.66, "step": 26162, "token_acc": 1.0, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.8499171620699737, "grad_norm": 0.31837356090545654, "learning_rate": 6.03317662762789e-07, "loss": 0.01787426322698593, "memory(GiB)": 22.66, "step": 26163, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.8499496475327291, "grad_norm": 0.3705116808414459, "learning_rate": 6.030618953791062e-07, "loss": 0.01270019356161356, "memory(GiB)": 22.66, "step": 26164, "token_acc": 1.0, "train_speed(iter/s)": 0.956359 }, { "epoch": 0.8499821329954845, "grad_norm": 0.3988784849643707, "learning_rate": 6.028061787419426e-07, "loss": 0.013131191022694111, "memory(GiB)": 22.66, "step": 26165, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.8500146184582399, "grad_norm": 0.37622976303100586, "learning_rate": 6.025505128542497e-07, "loss": 0.008091377094388008, "memory(GiB)": 22.66, "step": 26166, "token_acc": 1.0, "train_speed(iter/s)": 0.956371 }, { "epoch": 0.8500471039209954, "grad_norm": 0.39063209295272827, "learning_rate": 6.022948977189757e-07, "loss": 0.013890601694583893, "memory(GiB)": 22.66, "step": 26167, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956378 }, { "epoch": 0.8500795893837507, "grad_norm": 0.34779196977615356, "learning_rate": 6.020393333390729e-07, "loss": 0.012364216148853302, "memory(GiB)": 22.66, "step": 26168, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956384 }, { "epoch": 0.8501120748465062, "grad_norm": 0.2544516921043396, "learning_rate": 6.017838197174875e-07, "loss": 0.008281029760837555, "memory(GiB)": 22.66, "step": 26169, "token_acc": 1.0, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.8501445603092616, "grad_norm": 0.27492597699165344, "learning_rate": 6.015283568571733e-07, "loss": 0.010474154725670815, "memory(GiB)": 22.66, "step": 26170, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.850177045772017, "grad_norm": 0.3974975347518921, "learning_rate": 6.012729447610748e-07, "loss": 0.010509871877729893, "memory(GiB)": 22.66, "step": 26171, "token_acc": 1.0, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.8502095312347724, "grad_norm": 0.4058707058429718, "learning_rate": 6.010175834321424e-07, "loss": 0.012249093502759933, "memory(GiB)": 22.66, "step": 26172, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.8502420166975279, "grad_norm": 0.366055965423584, "learning_rate": 6.007622728733214e-07, "loss": 0.009781899861991405, "memory(GiB)": 22.66, "step": 26173, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.8502745021602832, "grad_norm": 0.35960081219673157, "learning_rate": 6.00507013087559e-07, "loss": 0.009914358146488667, "memory(GiB)": 22.66, "step": 26174, "token_acc": 1.0, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.8503069876230387, "grad_norm": 0.28648096323013306, "learning_rate": 6.00251804077801e-07, "loss": 0.008616582490503788, "memory(GiB)": 22.66, "step": 26175, "token_acc": 1.0, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.8503394730857942, "grad_norm": 0.28993716835975647, "learning_rate": 5.999966458469947e-07, "loss": 0.008394865319132805, "memory(GiB)": 22.66, "step": 26176, "token_acc": 1.0, "train_speed(iter/s)": 0.956431 }, { "epoch": 0.8503719585485495, "grad_norm": 0.2118408977985382, "learning_rate": 5.997415383980826e-07, "loss": 0.00742016825824976, "memory(GiB)": 22.66, "step": 26177, "token_acc": 1.0, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.850404444011305, "grad_norm": 0.34600988030433655, "learning_rate": 5.99486481734009e-07, "loss": 0.011489695869386196, "memory(GiB)": 22.66, "step": 26178, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.8504369294740604, "grad_norm": 0.4314326047897339, "learning_rate": 5.992314758577195e-07, "loss": 0.016764050349593163, "memory(GiB)": 22.66, "step": 26179, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956447 }, { "epoch": 0.8504694149368158, "grad_norm": 0.3218303918838501, "learning_rate": 5.989765207721554e-07, "loss": 0.008710522204637527, "memory(GiB)": 22.66, "step": 26180, "token_acc": 1.0, "train_speed(iter/s)": 0.956451 }, { "epoch": 0.8505019003995712, "grad_norm": 0.3510212004184723, "learning_rate": 5.987216164802617e-07, "loss": 0.012123782187700272, "memory(GiB)": 22.66, "step": 26181, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.8505343858623267, "grad_norm": 0.5861954092979431, "learning_rate": 5.984667629849772e-07, "loss": 0.016262948513031006, "memory(GiB)": 22.66, "step": 26182, "token_acc": 1.0, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.850566871325082, "grad_norm": 0.36424148082733154, "learning_rate": 5.982119602892444e-07, "loss": 0.013969303108751774, "memory(GiB)": 22.66, "step": 26183, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.8505993567878375, "grad_norm": 0.22203199565410614, "learning_rate": 5.97957208396005e-07, "loss": 0.009115290828049183, "memory(GiB)": 22.66, "step": 26184, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.95647 }, { "epoch": 0.8506318422505929, "grad_norm": 0.3587304651737213, "learning_rate": 5.977025073081993e-07, "loss": 0.012188928201794624, "memory(GiB)": 22.66, "step": 26185, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.8506643277133483, "grad_norm": 0.3373873829841614, "learning_rate": 5.974478570287651e-07, "loss": 0.011812583543360233, "memory(GiB)": 22.66, "step": 26186, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.8506968131761037, "grad_norm": 0.28704336285591125, "learning_rate": 5.971932575606421e-07, "loss": 0.012924384325742722, "memory(GiB)": 22.66, "step": 26187, "token_acc": 1.0, "train_speed(iter/s)": 0.956487 }, { "epoch": 0.8507292986388592, "grad_norm": 0.38526660203933716, "learning_rate": 5.969387089067691e-07, "loss": 0.012845499441027641, "memory(GiB)": 22.66, "step": 26188, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.8507617841016145, "grad_norm": 0.4863872528076172, "learning_rate": 5.966842110700855e-07, "loss": 0.012552500702440739, "memory(GiB)": 22.66, "step": 26189, "token_acc": 1.0, "train_speed(iter/s)": 0.956497 }, { "epoch": 0.85079426956437, "grad_norm": 0.26536324620246887, "learning_rate": 5.964297640535255e-07, "loss": 0.01179501973092556, "memory(GiB)": 22.66, "step": 26190, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.8508267550271253, "grad_norm": 0.3851165771484375, "learning_rate": 5.961753678600279e-07, "loss": 0.010848093777894974, "memory(GiB)": 22.66, "step": 26191, "token_acc": 1.0, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.8508592404898808, "grad_norm": 0.33480358123779297, "learning_rate": 5.959210224925261e-07, "loss": 0.010061648674309254, "memory(GiB)": 22.66, "step": 26192, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.8508917259526362, "grad_norm": 0.2445971518754959, "learning_rate": 5.956667279539596e-07, "loss": 0.008869788609445095, "memory(GiB)": 22.66, "step": 26193, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956522 }, { "epoch": 0.8509242114153917, "grad_norm": 0.29443204402923584, "learning_rate": 5.954124842472603e-07, "loss": 0.009227635338902473, "memory(GiB)": 22.66, "step": 26194, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.850956696878147, "grad_norm": 0.3813375234603882, "learning_rate": 5.951582913753634e-07, "loss": 0.015275301411747932, "memory(GiB)": 22.66, "step": 26195, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956538 }, { "epoch": 0.8509891823409025, "grad_norm": 0.2518174350261688, "learning_rate": 5.949041493412039e-07, "loss": 0.008918298408389091, "memory(GiB)": 22.66, "step": 26196, "token_acc": 1.0, "train_speed(iter/s)": 0.956545 }, { "epoch": 0.8510216678036578, "grad_norm": 0.34477272629737854, "learning_rate": 5.94650058147711e-07, "loss": 0.011947005987167358, "memory(GiB)": 22.66, "step": 26197, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.8510541532664133, "grad_norm": 0.3816086947917938, "learning_rate": 5.943960177978225e-07, "loss": 0.01376175507903099, "memory(GiB)": 22.66, "step": 26198, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.8510866387291687, "grad_norm": 0.3541431725025177, "learning_rate": 5.941420282944665e-07, "loss": 0.01325357798486948, "memory(GiB)": 22.66, "step": 26199, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.8511191241919241, "grad_norm": 0.41721343994140625, "learning_rate": 5.938880896405774e-07, "loss": 0.014750435948371887, "memory(GiB)": 22.66, "step": 26200, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.8511516096546795, "grad_norm": 0.3157115876674652, "learning_rate": 5.936342018390823e-07, "loss": 0.009028090164065361, "memory(GiB)": 22.66, "step": 26201, "token_acc": 0.995, "train_speed(iter/s)": 0.956583 }, { "epoch": 0.851184095117435, "grad_norm": 0.432982474565506, "learning_rate": 5.933803648929137e-07, "loss": 0.01977817714214325, "memory(GiB)": 22.66, "step": 26202, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.8512165805801903, "grad_norm": 0.5488465428352356, "learning_rate": 5.931265788050011e-07, "loss": 0.011983719654381275, "memory(GiB)": 22.66, "step": 26203, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.8512490660429458, "grad_norm": 0.44094592332839966, "learning_rate": 5.928728435782744e-07, "loss": 0.01310027576982975, "memory(GiB)": 22.66, "step": 26204, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.8512815515057012, "grad_norm": 0.5324439406394958, "learning_rate": 5.926191592156593e-07, "loss": 0.014291804283857346, "memory(GiB)": 22.66, "step": 26205, "token_acc": 1.0, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.8513140369684566, "grad_norm": 0.28569814562797546, "learning_rate": 5.923655257200861e-07, "loss": 0.007802761159837246, "memory(GiB)": 22.66, "step": 26206, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.851346522431212, "grad_norm": 0.32290199398994446, "learning_rate": 5.92111943094481e-07, "loss": 0.014848203398287296, "memory(GiB)": 22.66, "step": 26207, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.8513790078939675, "grad_norm": 0.2745222747325897, "learning_rate": 5.918584113417719e-07, "loss": 0.009578109718859196, "memory(GiB)": 22.66, "step": 26208, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.8514114933567228, "grad_norm": 0.5263748168945312, "learning_rate": 5.91604930464883e-07, "loss": 0.018054675310850143, "memory(GiB)": 22.66, "step": 26209, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.8514439788194783, "grad_norm": 0.24239033460617065, "learning_rate": 5.913515004667409e-07, "loss": 0.012628598138689995, "memory(GiB)": 22.66, "step": 26210, "token_acc": 1.0, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.8514764642822337, "grad_norm": 0.24491465091705322, "learning_rate": 5.910981213502698e-07, "loss": 0.006418297067284584, "memory(GiB)": 22.66, "step": 26211, "token_acc": 1.0, "train_speed(iter/s)": 0.956661 }, { "epoch": 0.8515089497449891, "grad_norm": 0.6747642755508423, "learning_rate": 5.908447931183953e-07, "loss": 0.010901411063969135, "memory(GiB)": 22.66, "step": 26212, "token_acc": 1.0, "train_speed(iter/s)": 0.956668 }, { "epoch": 0.8515414352077445, "grad_norm": 0.4639739692211151, "learning_rate": 5.905915157740411e-07, "loss": 0.010975413955748081, "memory(GiB)": 22.66, "step": 26213, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.8515739206705, "grad_norm": 0.18222929537296295, "learning_rate": 5.903382893201288e-07, "loss": 0.00652841804549098, "memory(GiB)": 22.66, "step": 26214, "token_acc": 1.0, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.8516064061332553, "grad_norm": 0.26631665229797363, "learning_rate": 5.90085113759582e-07, "loss": 0.011470300145447254, "memory(GiB)": 22.66, "step": 26215, "token_acc": 1.0, "train_speed(iter/s)": 0.956692 }, { "epoch": 0.8516388915960108, "grad_norm": 0.32669347524642944, "learning_rate": 5.89831989095322e-07, "loss": 0.010060638189315796, "memory(GiB)": 22.66, "step": 26216, "token_acc": 0.992, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.8516713770587662, "grad_norm": 0.35778388381004333, "learning_rate": 5.895789153302722e-07, "loss": 0.01770959049463272, "memory(GiB)": 22.66, "step": 26217, "token_acc": 1.0, "train_speed(iter/s)": 0.956706 }, { "epoch": 0.8517038625215216, "grad_norm": 0.4008195996284485, "learning_rate": 5.893258924673512e-07, "loss": 0.006585761439055204, "memory(GiB)": 22.66, "step": 26218, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956714 }, { "epoch": 0.851736347984277, "grad_norm": 0.41210347414016724, "learning_rate": 5.890729205094808e-07, "loss": 0.013491843827068806, "memory(GiB)": 22.66, "step": 26219, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.956722 }, { "epoch": 0.8517688334470325, "grad_norm": 0.3713585138320923, "learning_rate": 5.888199994595778e-07, "loss": 0.012220678851008415, "memory(GiB)": 22.66, "step": 26220, "token_acc": 1.0, "train_speed(iter/s)": 0.956729 }, { "epoch": 0.8518013189097878, "grad_norm": 0.37019461393356323, "learning_rate": 5.885671293205653e-07, "loss": 0.013210058212280273, "memory(GiB)": 22.66, "step": 26221, "token_acc": 1.0, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.8518338043725433, "grad_norm": 0.25127947330474854, "learning_rate": 5.883143100953586e-07, "loss": 0.011120825074613094, "memory(GiB)": 22.66, "step": 26222, "token_acc": 1.0, "train_speed(iter/s)": 0.956745 }, { "epoch": 0.8518662898352987, "grad_norm": 0.5779949426651001, "learning_rate": 5.880615417868779e-07, "loss": 0.009426658973097801, "memory(GiB)": 22.66, "step": 26223, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.8518987752980541, "grad_norm": 0.29238268733024597, "learning_rate": 5.878088243980384e-07, "loss": 0.011413227766752243, "memory(GiB)": 22.66, "step": 26224, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.8519312607608095, "grad_norm": 0.35081762075424194, "learning_rate": 5.875561579317574e-07, "loss": 0.013375982642173767, "memory(GiB)": 22.66, "step": 26225, "token_acc": 1.0, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.851963746223565, "grad_norm": 0.1794898808002472, "learning_rate": 5.873035423909512e-07, "loss": 0.0032802866771817207, "memory(GiB)": 22.66, "step": 26226, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956771 }, { "epoch": 0.8519962316863203, "grad_norm": 0.2963564097881317, "learning_rate": 5.870509777785355e-07, "loss": 0.008291177451610565, "memory(GiB)": 22.66, "step": 26227, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956778 }, { "epoch": 0.8520287171490758, "grad_norm": 0.24825029075145721, "learning_rate": 5.867984640974261e-07, "loss": 0.008048679679632187, "memory(GiB)": 22.66, "step": 26228, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.956784 }, { "epoch": 0.8520612026118312, "grad_norm": 0.33457356691360474, "learning_rate": 5.865460013505358e-07, "loss": 0.012773450464010239, "memory(GiB)": 22.66, "step": 26229, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.8520936880745866, "grad_norm": 0.3706476092338562, "learning_rate": 5.862935895407784e-07, "loss": 0.010857928544282913, "memory(GiB)": 22.66, "step": 26230, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.852126173537342, "grad_norm": 0.40135982632637024, "learning_rate": 5.860412286710681e-07, "loss": 0.005982613191008568, "memory(GiB)": 22.66, "step": 26231, "token_acc": 1.0, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.8521586590000975, "grad_norm": 0.33616894483566284, "learning_rate": 5.857889187443177e-07, "loss": 0.008533412590622902, "memory(GiB)": 22.66, "step": 26232, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.956809 }, { "epoch": 0.8521911444628528, "grad_norm": 0.337237685918808, "learning_rate": 5.855366597634376e-07, "loss": 0.011665300466120243, "memory(GiB)": 22.66, "step": 26233, "token_acc": 1.0, "train_speed(iter/s)": 0.956815 }, { "epoch": 0.8522236299256083, "grad_norm": 0.2910948097705841, "learning_rate": 5.8528445173134e-07, "loss": 0.01040736585855484, "memory(GiB)": 22.66, "step": 26234, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.8522561153883637, "grad_norm": 0.4408292770385742, "learning_rate": 5.850322946509357e-07, "loss": 0.01077024731785059, "memory(GiB)": 22.66, "step": 26235, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.8522886008511191, "grad_norm": 0.2284095734357834, "learning_rate": 5.847801885251359e-07, "loss": 0.008437019772827625, "memory(GiB)": 22.66, "step": 26236, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.8523210863138745, "grad_norm": 0.5995502471923828, "learning_rate": 5.845281333568487e-07, "loss": 0.0068782600574195385, "memory(GiB)": 22.66, "step": 26237, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956837 }, { "epoch": 0.85235357177663, "grad_norm": 0.37989023327827454, "learning_rate": 5.84276129148984e-07, "loss": 0.007716996595263481, "memory(GiB)": 22.66, "step": 26238, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956842 }, { "epoch": 0.8523860572393853, "grad_norm": 0.37440142035484314, "learning_rate": 5.840241759044496e-07, "loss": 0.006598654203116894, "memory(GiB)": 22.66, "step": 26239, "token_acc": 1.0, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.8524185427021408, "grad_norm": 0.3319959044456482, "learning_rate": 5.837722736261553e-07, "loss": 0.010656345635652542, "memory(GiB)": 22.66, "step": 26240, "token_acc": 1.0, "train_speed(iter/s)": 0.956852 }, { "epoch": 0.8524510281648963, "grad_norm": 0.5698906183242798, "learning_rate": 5.835204223170054e-07, "loss": 0.011423170566558838, "memory(GiB)": 22.66, "step": 26241, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.8524835136276516, "grad_norm": 0.4277549088001251, "learning_rate": 5.832686219799083e-07, "loss": 0.014554662629961967, "memory(GiB)": 22.66, "step": 26242, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.8525159990904071, "grad_norm": 0.30786970257759094, "learning_rate": 5.830168726177704e-07, "loss": 0.007730406709015369, "memory(GiB)": 22.66, "step": 26243, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956866 }, { "epoch": 0.8525484845531625, "grad_norm": 0.29691997170448303, "learning_rate": 5.827651742334961e-07, "loss": 0.012444188818335533, "memory(GiB)": 22.66, "step": 26244, "token_acc": 0.9846153846153847, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.8525809700159179, "grad_norm": 0.4127582609653473, "learning_rate": 5.825135268299925e-07, "loss": 0.016820505261421204, "memory(GiB)": 22.66, "step": 26245, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.8526134554786733, "grad_norm": 0.26259106397628784, "learning_rate": 5.822619304101617e-07, "loss": 0.010156877338886261, "memory(GiB)": 22.66, "step": 26246, "token_acc": 1.0, "train_speed(iter/s)": 0.956882 }, { "epoch": 0.8526459409414288, "grad_norm": 0.40497827529907227, "learning_rate": 5.820103849769088e-07, "loss": 0.010641677305102348, "memory(GiB)": 22.66, "step": 26247, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.8526784264041841, "grad_norm": 0.25271713733673096, "learning_rate": 5.817588905331345e-07, "loss": 0.009241094812750816, "memory(GiB)": 22.66, "step": 26248, "token_acc": 1.0, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.8527109118669396, "grad_norm": 0.36615240573883057, "learning_rate": 5.81507447081745e-07, "loss": 0.01212427020072937, "memory(GiB)": 22.66, "step": 26249, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956899 }, { "epoch": 0.852743397329695, "grad_norm": 0.24219118058681488, "learning_rate": 5.812560546256401e-07, "loss": 0.010330164805054665, "memory(GiB)": 22.66, "step": 26250, "token_acc": 1.0, "train_speed(iter/s)": 0.956905 }, { "epoch": 0.8527758827924504, "grad_norm": 0.29587823152542114, "learning_rate": 5.810047131677221e-07, "loss": 0.010994420386850834, "memory(GiB)": 22.66, "step": 26251, "token_acc": 1.0, "train_speed(iter/s)": 0.956911 }, { "epoch": 0.8528083682552058, "grad_norm": 0.32991546392440796, "learning_rate": 5.807534227108908e-07, "loss": 0.0141300018876791, "memory(GiB)": 22.66, "step": 26252, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.956917 }, { "epoch": 0.8528408537179613, "grad_norm": 0.336612343788147, "learning_rate": 5.805021832580471e-07, "loss": 0.011162381619215012, "memory(GiB)": 22.66, "step": 26253, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956925 }, { "epoch": 0.8528733391807166, "grad_norm": 0.35864371061325073, "learning_rate": 5.8025099481209e-07, "loss": 0.012911029160022736, "memory(GiB)": 22.66, "step": 26254, "token_acc": 1.0, "train_speed(iter/s)": 0.956932 }, { "epoch": 0.8529058246434721, "grad_norm": 0.3478870987892151, "learning_rate": 5.799998573759203e-07, "loss": 0.013441715389490128, "memory(GiB)": 22.66, "step": 26255, "token_acc": 1.0, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.8529383101062274, "grad_norm": 0.8238123059272766, "learning_rate": 5.797487709524347e-07, "loss": 0.014313869178295135, "memory(GiB)": 22.66, "step": 26256, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.8529707955689829, "grad_norm": 0.31220030784606934, "learning_rate": 5.794977355445309e-07, "loss": 0.01036287285387516, "memory(GiB)": 22.66, "step": 26257, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956954 }, { "epoch": 0.8530032810317383, "grad_norm": 0.3150174617767334, "learning_rate": 5.792467511551076e-07, "loss": 0.010716566815972328, "memory(GiB)": 22.66, "step": 26258, "token_acc": 1.0, "train_speed(iter/s)": 0.956962 }, { "epoch": 0.8530357664944938, "grad_norm": 0.37488502264022827, "learning_rate": 5.789958177870603e-07, "loss": 0.00841793604195118, "memory(GiB)": 22.66, "step": 26259, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956969 }, { "epoch": 0.8530682519572491, "grad_norm": 0.4608815610408783, "learning_rate": 5.787449354432867e-07, "loss": 0.01895950362086296, "memory(GiB)": 22.66, "step": 26260, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956976 }, { "epoch": 0.8531007374200046, "grad_norm": 0.6331656575202942, "learning_rate": 5.784941041266806e-07, "loss": 0.011157318949699402, "memory(GiB)": 22.66, "step": 26261, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.85313322288276, "grad_norm": 0.3107597827911377, "learning_rate": 5.782433238401369e-07, "loss": 0.013566010631620884, "memory(GiB)": 22.66, "step": 26262, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956991 }, { "epoch": 0.8531657083455154, "grad_norm": 0.5804725885391235, "learning_rate": 5.779925945865511e-07, "loss": 0.010184606537222862, "memory(GiB)": 22.66, "step": 26263, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.8531981938082708, "grad_norm": 0.403099000453949, "learning_rate": 5.777419163688175e-07, "loss": 0.013885386288166046, "memory(GiB)": 22.66, "step": 26264, "token_acc": 1.0, "train_speed(iter/s)": 0.957006 }, { "epoch": 0.8532306792710262, "grad_norm": 0.24849897623062134, "learning_rate": 5.774912891898266e-07, "loss": 0.01023835688829422, "memory(GiB)": 22.66, "step": 26265, "token_acc": 0.992, "train_speed(iter/s)": 0.957014 }, { "epoch": 0.8532631647337816, "grad_norm": 0.2487681359052658, "learning_rate": 5.772407130524732e-07, "loss": 0.009288391098380089, "memory(GiB)": 22.66, "step": 26266, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.8532956501965371, "grad_norm": 0.392156720161438, "learning_rate": 5.769901879596484e-07, "loss": 0.009522700682282448, "memory(GiB)": 22.66, "step": 26267, "token_acc": 1.0, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.8533281356592924, "grad_norm": 0.40192627906799316, "learning_rate": 5.76739713914245e-07, "loss": 0.014528199099004269, "memory(GiB)": 22.66, "step": 26268, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.8533606211220479, "grad_norm": 0.27330729365348816, "learning_rate": 5.76489290919151e-07, "loss": 0.009619778022170067, "memory(GiB)": 22.66, "step": 26269, "token_acc": 1.0, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.8533931065848033, "grad_norm": 3.9425511360168457, "learning_rate": 5.762389189772599e-07, "loss": 0.009302333928644657, "memory(GiB)": 22.66, "step": 26270, "token_acc": 1.0, "train_speed(iter/s)": 0.957053 }, { "epoch": 0.8534255920475587, "grad_norm": 0.2859826683998108, "learning_rate": 5.759885980914575e-07, "loss": 0.011019391939043999, "memory(GiB)": 22.66, "step": 26271, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957061 }, { "epoch": 0.8534580775103141, "grad_norm": 0.3960764706134796, "learning_rate": 5.757383282646373e-07, "loss": 0.012612920254468918, "memory(GiB)": 22.66, "step": 26272, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.8534905629730696, "grad_norm": 0.41573765873908997, "learning_rate": 5.754881094996839e-07, "loss": 0.01012705359607935, "memory(GiB)": 22.66, "step": 26273, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.957076 }, { "epoch": 0.8535230484358249, "grad_norm": 0.35674598813056946, "learning_rate": 5.752379417994874e-07, "loss": 0.016864510253071785, "memory(GiB)": 22.66, "step": 26274, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957084 }, { "epoch": 0.8535555338985804, "grad_norm": 0.2572563588619232, "learning_rate": 5.749878251669349e-07, "loss": 0.008419954217970371, "memory(GiB)": 22.66, "step": 26275, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.8535880193613358, "grad_norm": 0.19901464879512787, "learning_rate": 5.747377596049108e-07, "loss": 0.005322723649442196, "memory(GiB)": 22.66, "step": 26276, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.8536205048240912, "grad_norm": 0.3174731731414795, "learning_rate": 5.74487745116305e-07, "loss": 0.007555911783128977, "memory(GiB)": 22.66, "step": 26277, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.957106 }, { "epoch": 0.8536529902868466, "grad_norm": 0.3495822846889496, "learning_rate": 5.742377817039996e-07, "loss": 0.00850141141563654, "memory(GiB)": 22.66, "step": 26278, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.8536854757496021, "grad_norm": 0.27524492144584656, "learning_rate": 5.739878693708822e-07, "loss": 0.0070936838164925575, "memory(GiB)": 22.66, "step": 26279, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.8537179612123574, "grad_norm": 0.22139900922775269, "learning_rate": 5.737380081198346e-07, "loss": 0.006549866870045662, "memory(GiB)": 22.66, "step": 26280, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.8537504466751129, "grad_norm": 0.3631691634654999, "learning_rate": 5.734881979537421e-07, "loss": 0.013904007151722908, "memory(GiB)": 22.66, "step": 26281, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957138 }, { "epoch": 0.8537829321378683, "grad_norm": 0.3556355834007263, "learning_rate": 5.732384388754869e-07, "loss": 0.014780525118112564, "memory(GiB)": 22.66, "step": 26282, "token_acc": 0.996, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.8538154176006237, "grad_norm": 0.35078030824661255, "learning_rate": 5.729887308879539e-07, "loss": 0.009708582423627377, "memory(GiB)": 22.66, "step": 26283, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.8538479030633791, "grad_norm": 0.3822813332080841, "learning_rate": 5.727390739940214e-07, "loss": 0.015438569709658623, "memory(GiB)": 22.66, "step": 26284, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.8538803885261346, "grad_norm": 0.4275681972503662, "learning_rate": 5.724894681965731e-07, "loss": 0.014115103520452976, "memory(GiB)": 22.66, "step": 26285, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957169 }, { "epoch": 0.8539128739888899, "grad_norm": 0.31750956177711487, "learning_rate": 5.722399134984896e-07, "loss": 0.010131629183888435, "memory(GiB)": 22.66, "step": 26286, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.8539453594516454, "grad_norm": 0.37033000588417053, "learning_rate": 5.719904099026513e-07, "loss": 0.017043527215719223, "memory(GiB)": 22.66, "step": 26287, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.8539778449144008, "grad_norm": 0.38108882308006287, "learning_rate": 5.71740957411937e-07, "loss": 0.008169487118721008, "memory(GiB)": 22.66, "step": 26288, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957189 }, { "epoch": 0.8540103303771562, "grad_norm": 0.2797929644584656, "learning_rate": 5.714915560292255e-07, "loss": 0.006804571487009525, "memory(GiB)": 22.66, "step": 26289, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957196 }, { "epoch": 0.8540428158399116, "grad_norm": 0.3402041792869568, "learning_rate": 5.712422057573957e-07, "loss": 0.015667259693145752, "memory(GiB)": 22.66, "step": 26290, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.8540753013026671, "grad_norm": 0.31489261984825134, "learning_rate": 5.709929065993258e-07, "loss": 0.007796446327120066, "memory(GiB)": 22.66, "step": 26291, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957207 }, { "epoch": 0.8541077867654224, "grad_norm": 0.56134033203125, "learning_rate": 5.707436585578941e-07, "loss": 0.018175508826971054, "memory(GiB)": 22.66, "step": 26292, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.8541402722281779, "grad_norm": 0.23925861716270447, "learning_rate": 5.704944616359743e-07, "loss": 0.010471399873495102, "memory(GiB)": 22.66, "step": 26293, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.8541727576909333, "grad_norm": 0.3348091244697571, "learning_rate": 5.702453158364441e-07, "loss": 0.012381099164485931, "memory(GiB)": 22.66, "step": 26294, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957225 }, { "epoch": 0.8542052431536887, "grad_norm": 0.4461129903793335, "learning_rate": 5.69996221162179e-07, "loss": 0.014524431899189949, "memory(GiB)": 22.66, "step": 26295, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.95723 }, { "epoch": 0.8542377286164441, "grad_norm": 0.3295672535896301, "learning_rate": 5.697471776160552e-07, "loss": 0.014743932522833347, "memory(GiB)": 22.66, "step": 26296, "token_acc": 1.0, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.8542702140791996, "grad_norm": 0.350282222032547, "learning_rate": 5.694981852009446e-07, "loss": 0.015497133135795593, "memory(GiB)": 22.66, "step": 26297, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95724 }, { "epoch": 0.8543026995419549, "grad_norm": 0.38087597489356995, "learning_rate": 5.692492439197222e-07, "loss": 0.015968073159456253, "memory(GiB)": 22.66, "step": 26298, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.8543351850047104, "grad_norm": 0.5280195474624634, "learning_rate": 5.690003537752597e-07, "loss": 0.01705201342701912, "memory(GiB)": 22.66, "step": 26299, "token_acc": 1.0, "train_speed(iter/s)": 0.957251 }, { "epoch": 0.8543676704674658, "grad_norm": 0.380666583776474, "learning_rate": 5.687515147704325e-07, "loss": 0.014740603975951672, "memory(GiB)": 22.66, "step": 26300, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.8544001559302212, "grad_norm": 0.35651689767837524, "learning_rate": 5.685027269081095e-07, "loss": 0.010731928050518036, "memory(GiB)": 22.66, "step": 26301, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.8544326413929766, "grad_norm": 0.3329695165157318, "learning_rate": 5.682539901911643e-07, "loss": 0.01010053139179945, "memory(GiB)": 22.66, "step": 26302, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.8544651268557321, "grad_norm": 0.5056743621826172, "learning_rate": 5.68005304622466e-07, "loss": 0.019261404871940613, "memory(GiB)": 22.66, "step": 26303, "token_acc": 1.0, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.8544976123184875, "grad_norm": 0.34623438119888306, "learning_rate": 5.677566702048848e-07, "loss": 0.013825183734297752, "memory(GiB)": 22.66, "step": 26304, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.8545300977812429, "grad_norm": 0.32391682267189026, "learning_rate": 5.675080869412924e-07, "loss": 0.010736064985394478, "memory(GiB)": 22.66, "step": 26305, "token_acc": 1.0, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.8545625832439984, "grad_norm": 0.4630385637283325, "learning_rate": 5.672595548345555e-07, "loss": 0.015092261135578156, "memory(GiB)": 22.66, "step": 26306, "token_acc": 1.0, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.8545950687067537, "grad_norm": 0.4339582920074463, "learning_rate": 5.67011073887544e-07, "loss": 0.008955403231084347, "memory(GiB)": 22.66, "step": 26307, "token_acc": 1.0, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.8546275541695092, "grad_norm": 0.2987464964389801, "learning_rate": 5.667626441031243e-07, "loss": 0.01029666792601347, "memory(GiB)": 22.66, "step": 26308, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.8546600396322646, "grad_norm": 0.36241981387138367, "learning_rate": 5.665142654841643e-07, "loss": 0.011892712675035, "memory(GiB)": 22.66, "step": 26309, "token_acc": 1.0, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.85469252509502, "grad_norm": 1.4225316047668457, "learning_rate": 5.662659380335306e-07, "loss": 0.012064216658473015, "memory(GiB)": 22.66, "step": 26310, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.8547250105577754, "grad_norm": 0.2940956950187683, "learning_rate": 5.660176617540902e-07, "loss": 0.009618124924600124, "memory(GiB)": 22.66, "step": 26311, "token_acc": 1.0, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.8547574960205309, "grad_norm": 0.34642064571380615, "learning_rate": 5.657694366487066e-07, "loss": 0.01092738751322031, "memory(GiB)": 22.66, "step": 26312, "token_acc": 1.0, "train_speed(iter/s)": 0.957318 }, { "epoch": 0.8547899814832862, "grad_norm": 0.23654469847679138, "learning_rate": 5.655212627202455e-07, "loss": 0.009106451645493507, "memory(GiB)": 22.66, "step": 26313, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.8548224669460417, "grad_norm": 0.24396288394927979, "learning_rate": 5.652731399715717e-07, "loss": 0.010898870415985584, "memory(GiB)": 22.66, "step": 26314, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957334 }, { "epoch": 0.854854952408797, "grad_norm": 0.26120465993881226, "learning_rate": 5.650250684055492e-07, "loss": 0.00974226277321577, "memory(GiB)": 22.66, "step": 26315, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.8548874378715525, "grad_norm": 0.3755740225315094, "learning_rate": 5.647770480250397e-07, "loss": 0.010841243900358677, "memory(GiB)": 22.66, "step": 26316, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.95735 }, { "epoch": 0.8549199233343079, "grad_norm": 0.4187479615211487, "learning_rate": 5.645290788329061e-07, "loss": 0.013283345848321915, "memory(GiB)": 22.66, "step": 26317, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.8549524087970634, "grad_norm": 0.2701074182987213, "learning_rate": 5.642811608320104e-07, "loss": 0.008392509073019028, "memory(GiB)": 22.66, "step": 26318, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.8549848942598187, "grad_norm": 0.36478352546691895, "learning_rate": 5.640332940252141e-07, "loss": 0.013019509613513947, "memory(GiB)": 22.66, "step": 26319, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.8550173797225742, "grad_norm": 0.3361184000968933, "learning_rate": 5.637854784153784e-07, "loss": 0.009787766262888908, "memory(GiB)": 22.66, "step": 26320, "token_acc": 1.0, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.8550498651853295, "grad_norm": 0.41026732325553894, "learning_rate": 5.635377140053628e-07, "loss": 0.01380501314997673, "memory(GiB)": 22.66, "step": 26321, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957389 }, { "epoch": 0.855082350648085, "grad_norm": 0.2962910830974579, "learning_rate": 5.632900007980263e-07, "loss": 0.010303394868969917, "memory(GiB)": 22.66, "step": 26322, "token_acc": 1.0, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.8551148361108404, "grad_norm": 0.3968397080898285, "learning_rate": 5.630423387962287e-07, "loss": 0.009969555772840977, "memory(GiB)": 22.66, "step": 26323, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957404 }, { "epoch": 0.8551473215735959, "grad_norm": 0.4915170967578888, "learning_rate": 5.62794728002829e-07, "loss": 0.01325308158993721, "memory(GiB)": 22.66, "step": 26324, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957412 }, { "epoch": 0.8551798070363512, "grad_norm": 0.402050644159317, "learning_rate": 5.62547168420683e-07, "loss": 0.013850456103682518, "memory(GiB)": 22.66, "step": 26325, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.8552122924991067, "grad_norm": 0.3349742889404297, "learning_rate": 5.622996600526498e-07, "loss": 0.010375861078500748, "memory(GiB)": 22.66, "step": 26326, "token_acc": 1.0, "train_speed(iter/s)": 0.957427 }, { "epoch": 0.855244777961862, "grad_norm": 0.3698970675468445, "learning_rate": 5.62052202901583e-07, "loss": 0.015389927662909031, "memory(GiB)": 22.66, "step": 26327, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957435 }, { "epoch": 0.8552772634246175, "grad_norm": 0.35719069838523865, "learning_rate": 5.618047969703427e-07, "loss": 0.013139104470610619, "memory(GiB)": 22.66, "step": 26328, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.8553097488873729, "grad_norm": 0.38908645510673523, "learning_rate": 5.615574422617814e-07, "loss": 0.011205226182937622, "memory(GiB)": 22.66, "step": 26329, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.95745 }, { "epoch": 0.8553422343501283, "grad_norm": 0.3747101426124573, "learning_rate": 5.613101387787562e-07, "loss": 0.008653711527585983, "memory(GiB)": 22.66, "step": 26330, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.8553747198128837, "grad_norm": 0.21700115501880646, "learning_rate": 5.610628865241185e-07, "loss": 0.009787974879145622, "memory(GiB)": 22.66, "step": 26331, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.8554072052756392, "grad_norm": 0.4281252920627594, "learning_rate": 5.608156855007229e-07, "loss": 0.015774955973029137, "memory(GiB)": 22.66, "step": 26332, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957474 }, { "epoch": 0.8554396907383945, "grad_norm": 0.3800264000892639, "learning_rate": 5.605685357114233e-07, "loss": 0.0159003846347332, "memory(GiB)": 22.66, "step": 26333, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957481 }, { "epoch": 0.85547217620115, "grad_norm": 0.349544495344162, "learning_rate": 5.603214371590715e-07, "loss": 0.011086220853030682, "memory(GiB)": 22.66, "step": 26334, "token_acc": 0.9939024390243902, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.8555046616639054, "grad_norm": 0.2277650684118271, "learning_rate": 5.60074389846521e-07, "loss": 0.008286520838737488, "memory(GiB)": 22.66, "step": 26335, "token_acc": 1.0, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.8555371471266608, "grad_norm": 0.4589839279651642, "learning_rate": 5.598273937766202e-07, "loss": 0.01626284047961235, "memory(GiB)": 22.66, "step": 26336, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957504 }, { "epoch": 0.8555696325894162, "grad_norm": 0.32933053374290466, "learning_rate": 5.595804489522211e-07, "loss": 0.01357837300747633, "memory(GiB)": 22.66, "step": 26337, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957512 }, { "epoch": 0.8556021180521717, "grad_norm": 0.3398424983024597, "learning_rate": 5.593335553761736e-07, "loss": 0.011166408658027649, "memory(GiB)": 22.66, "step": 26338, "token_acc": 0.9875, "train_speed(iter/s)": 0.95752 }, { "epoch": 0.855634603514927, "grad_norm": 0.3395174443721771, "learning_rate": 5.590867130513289e-07, "loss": 0.0124858058989048, "memory(GiB)": 22.66, "step": 26339, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957528 }, { "epoch": 0.8556670889776825, "grad_norm": 0.29659226536750793, "learning_rate": 5.58839921980533e-07, "loss": 0.010251936502754688, "memory(GiB)": 22.66, "step": 26340, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.8556995744404379, "grad_norm": 0.331500768661499, "learning_rate": 5.585931821666357e-07, "loss": 0.009501111693680286, "memory(GiB)": 22.66, "step": 26341, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.8557320599031933, "grad_norm": 0.2894749045372009, "learning_rate": 5.583464936124843e-07, "loss": 0.013720816932618618, "memory(GiB)": 22.66, "step": 26342, "token_acc": 1.0, "train_speed(iter/s)": 0.957549 }, { "epoch": 0.8557645453659487, "grad_norm": 0.33157846331596375, "learning_rate": 5.580998563209272e-07, "loss": 0.009126597084105015, "memory(GiB)": 22.66, "step": 26343, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957556 }, { "epoch": 0.8557970308287042, "grad_norm": 0.3536892533302307, "learning_rate": 5.578532702948092e-07, "loss": 0.012234071269631386, "memory(GiB)": 22.66, "step": 26344, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.8558295162914595, "grad_norm": 0.33436381816864014, "learning_rate": 5.576067355369769e-07, "loss": 0.00987071543931961, "memory(GiB)": 22.66, "step": 26345, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.855862001754215, "grad_norm": 0.4427472949028015, "learning_rate": 5.573602520502758e-07, "loss": 0.017316848039627075, "memory(GiB)": 22.66, "step": 26346, "token_acc": 1.0, "train_speed(iter/s)": 0.957579 }, { "epoch": 0.8558944872169704, "grad_norm": 0.2765193581581116, "learning_rate": 5.571138198375514e-07, "loss": 0.012054691091179848, "memory(GiB)": 22.66, "step": 26347, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957586 }, { "epoch": 0.8559269726797258, "grad_norm": 0.286539763212204, "learning_rate": 5.56867438901646e-07, "loss": 0.013722289353609085, "memory(GiB)": 22.66, "step": 26348, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.8559594581424812, "grad_norm": 0.5500001311302185, "learning_rate": 5.566211092454049e-07, "loss": 0.01596742868423462, "memory(GiB)": 22.66, "step": 26349, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.8559919436052367, "grad_norm": 0.4453389048576355, "learning_rate": 5.563748308716687e-07, "loss": 0.017339620739221573, "memory(GiB)": 22.66, "step": 26350, "token_acc": 1.0, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.856024429067992, "grad_norm": 0.28666993975639343, "learning_rate": 5.561286037832819e-07, "loss": 0.00699708703905344, "memory(GiB)": 22.66, "step": 26351, "token_acc": 1.0, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.8560569145307475, "grad_norm": 0.4026206433773041, "learning_rate": 5.558824279830877e-07, "loss": 0.01549097802489996, "memory(GiB)": 22.66, "step": 26352, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.8560893999935029, "grad_norm": 0.3664243221282959, "learning_rate": 5.556363034739237e-07, "loss": 0.011526345275342464, "memory(GiB)": 22.66, "step": 26353, "token_acc": 1.0, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.8561218854562583, "grad_norm": 0.28902849555015564, "learning_rate": 5.553902302586334e-07, "loss": 0.008939852938055992, "memory(GiB)": 22.66, "step": 26354, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.8561543709190137, "grad_norm": 0.41677308082580566, "learning_rate": 5.551442083400532e-07, "loss": 0.016705194488167763, "memory(GiB)": 22.66, "step": 26355, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.8561868563817692, "grad_norm": 0.3120468854904175, "learning_rate": 5.548982377210271e-07, "loss": 0.01710241660475731, "memory(GiB)": 22.66, "step": 26356, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957642 }, { "epoch": 0.8562193418445245, "grad_norm": 0.4358496069908142, "learning_rate": 5.546523184043911e-07, "loss": 0.01581590622663498, "memory(GiB)": 22.66, "step": 26357, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.85625182730728, "grad_norm": 0.3025479018688202, "learning_rate": 5.544064503929847e-07, "loss": 0.010861759074032307, "memory(GiB)": 22.66, "step": 26358, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.8562843127700354, "grad_norm": 0.25578397512435913, "learning_rate": 5.541606336896443e-07, "loss": 0.010038137435913086, "memory(GiB)": 22.66, "step": 26359, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.8563167982327908, "grad_norm": 0.3836587071418762, "learning_rate": 5.539148682972073e-07, "loss": 0.011924825608730316, "memory(GiB)": 22.66, "step": 26360, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.8563492836955462, "grad_norm": 0.5256235003471375, "learning_rate": 5.536691542185108e-07, "loss": 0.013331850990653038, "memory(GiB)": 22.66, "step": 26361, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.8563817691583017, "grad_norm": 0.163837730884552, "learning_rate": 5.534234914563907e-07, "loss": 0.006722742225974798, "memory(GiB)": 22.66, "step": 26362, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.856414254621057, "grad_norm": 0.6197023987770081, "learning_rate": 5.531778800136817e-07, "loss": 0.016806118190288544, "memory(GiB)": 22.66, "step": 26363, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.8564467400838125, "grad_norm": 0.48713958263397217, "learning_rate": 5.529323198932179e-07, "loss": 0.01374550350010395, "memory(GiB)": 22.66, "step": 26364, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.8564792255465679, "grad_norm": 0.43610092997550964, "learning_rate": 5.526868110978345e-07, "loss": 0.009511598385870457, "memory(GiB)": 22.66, "step": 26365, "token_acc": 1.0, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.8565117110093233, "grad_norm": 0.2468784600496292, "learning_rate": 5.524413536303646e-07, "loss": 0.012545714154839516, "memory(GiB)": 22.66, "step": 26366, "token_acc": 1.0, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.8565441964720787, "grad_norm": 0.3416289985179901, "learning_rate": 5.521959474936422e-07, "loss": 0.010857452638447285, "memory(GiB)": 22.66, "step": 26367, "token_acc": 0.996, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.8565766819348342, "grad_norm": 0.4384899437427521, "learning_rate": 5.519505926904978e-07, "loss": 0.010356277227401733, "memory(GiB)": 22.66, "step": 26368, "token_acc": 1.0, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.8566091673975896, "grad_norm": 0.3982643187046051, "learning_rate": 5.517052892237639e-07, "loss": 0.012568543665111065, "memory(GiB)": 22.66, "step": 26369, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.856641652860345, "grad_norm": 0.2606163024902344, "learning_rate": 5.514600370962713e-07, "loss": 0.005600514821708202, "memory(GiB)": 22.66, "step": 26370, "token_acc": 1.0, "train_speed(iter/s)": 0.957714 }, { "epoch": 0.8566741383231005, "grad_norm": 0.3655267655849457, "learning_rate": 5.512148363108521e-07, "loss": 0.012154609896242619, "memory(GiB)": 22.66, "step": 26371, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.8567066237858558, "grad_norm": 0.3644745349884033, "learning_rate": 5.50969686870334e-07, "loss": 0.015226059593260288, "memory(GiB)": 22.66, "step": 26372, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.8567391092486113, "grad_norm": 0.4278046190738678, "learning_rate": 5.507245887775469e-07, "loss": 0.014033954590559006, "memory(GiB)": 22.66, "step": 26373, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.8567715947113667, "grad_norm": 0.28507643938064575, "learning_rate": 5.504795420353204e-07, "loss": 0.010015126317739487, "memory(GiB)": 22.66, "step": 26374, "token_acc": 1.0, "train_speed(iter/s)": 0.957737 }, { "epoch": 0.8568040801741221, "grad_norm": 0.282952219247818, "learning_rate": 5.502345466464832e-07, "loss": 0.007830856367945671, "memory(GiB)": 22.66, "step": 26375, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.957744 }, { "epoch": 0.8568365656368775, "grad_norm": 0.36786606907844543, "learning_rate": 5.499896026138613e-07, "loss": 0.012387782335281372, "memory(GiB)": 22.66, "step": 26376, "token_acc": 1.0, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.856869051099633, "grad_norm": 0.2835075259208679, "learning_rate": 5.497447099402831e-07, "loss": 0.012007078155875206, "memory(GiB)": 22.66, "step": 26377, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.8569015365623883, "grad_norm": 0.42868462204933167, "learning_rate": 5.494998686285719e-07, "loss": 0.009520092979073524, "memory(GiB)": 22.66, "step": 26378, "token_acc": 1.0, "train_speed(iter/s)": 0.957763 }, { "epoch": 0.8569340220251438, "grad_norm": 0.39692726731300354, "learning_rate": 5.492550786815581e-07, "loss": 0.013915537856519222, "memory(GiB)": 22.66, "step": 26379, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.8569665074878992, "grad_norm": 0.2807602882385254, "learning_rate": 5.490103401020635e-07, "loss": 0.010118767619132996, "memory(GiB)": 22.66, "step": 26380, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.8569989929506546, "grad_norm": 0.2698768377304077, "learning_rate": 5.487656528929141e-07, "loss": 0.00850137509405613, "memory(GiB)": 22.66, "step": 26381, "token_acc": 1.0, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.85703147841341, "grad_norm": 0.22772017121315002, "learning_rate": 5.485210170569344e-07, "loss": 0.008240466937422752, "memory(GiB)": 22.66, "step": 26382, "token_acc": 1.0, "train_speed(iter/s)": 0.957787 }, { "epoch": 0.8570639638761655, "grad_norm": 0.4099855124950409, "learning_rate": 5.482764325969453e-07, "loss": 0.015312045812606812, "memory(GiB)": 22.66, "step": 26383, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.8570964493389208, "grad_norm": 0.37958529591560364, "learning_rate": 5.480318995157735e-07, "loss": 0.016513129696249962, "memory(GiB)": 22.66, "step": 26384, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.8571289348016763, "grad_norm": 0.3661831021308899, "learning_rate": 5.477874178162379e-07, "loss": 0.012884518131613731, "memory(GiB)": 22.66, "step": 26385, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957805 }, { "epoch": 0.8571614202644317, "grad_norm": 0.6041659712791443, "learning_rate": 5.475429875011623e-07, "loss": 0.013044541701674461, "memory(GiB)": 22.66, "step": 26386, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.957811 }, { "epoch": 0.8571939057271871, "grad_norm": 0.3216747045516968, "learning_rate": 5.472986085733656e-07, "loss": 0.011170654557645321, "memory(GiB)": 22.66, "step": 26387, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957817 }, { "epoch": 0.8572263911899425, "grad_norm": 0.2559853792190552, "learning_rate": 5.470542810356699e-07, "loss": 0.010679172351956367, "memory(GiB)": 22.66, "step": 26388, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.857258876652698, "grad_norm": 0.3298294246196747, "learning_rate": 5.468100048908947e-07, "loss": 0.010171369649469852, "memory(GiB)": 22.66, "step": 26389, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.8572913621154533, "grad_norm": 0.2588123381137848, "learning_rate": 5.4656578014186e-07, "loss": 0.008369764313101768, "memory(GiB)": 22.66, "step": 26390, "token_acc": 1.0, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.8573238475782088, "grad_norm": 0.3823733925819397, "learning_rate": 5.463216067913829e-07, "loss": 0.01052970439195633, "memory(GiB)": 22.66, "step": 26391, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957843 }, { "epoch": 0.8573563330409641, "grad_norm": 0.2875044345855713, "learning_rate": 5.460774848422823e-07, "loss": 0.009542432613670826, "memory(GiB)": 22.66, "step": 26392, "token_acc": 1.0, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.8573888185037196, "grad_norm": 0.33664456009864807, "learning_rate": 5.458334142973754e-07, "loss": 0.01119348593056202, "memory(GiB)": 22.66, "step": 26393, "token_acc": 1.0, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.857421303966475, "grad_norm": 0.4790077805519104, "learning_rate": 5.455893951594804e-07, "loss": 0.013689645566046238, "memory(GiB)": 22.66, "step": 26394, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.8574537894292305, "grad_norm": 0.303334504365921, "learning_rate": 5.453454274314118e-07, "loss": 0.010956859216094017, "memory(GiB)": 22.66, "step": 26395, "token_acc": 0.996, "train_speed(iter/s)": 0.957866 }, { "epoch": 0.8574862748919858, "grad_norm": 0.5136995911598206, "learning_rate": 5.451015111159858e-07, "loss": 0.014603162184357643, "memory(GiB)": 22.66, "step": 26396, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957873 }, { "epoch": 0.8575187603547413, "grad_norm": 0.32128503918647766, "learning_rate": 5.448576462160182e-07, "loss": 0.008689641952514648, "memory(GiB)": 22.66, "step": 26397, "token_acc": 1.0, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.8575512458174966, "grad_norm": 0.4231244623661041, "learning_rate": 5.446138327343231e-07, "loss": 0.016897859051823616, "memory(GiB)": 22.66, "step": 26398, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.957887 }, { "epoch": 0.8575837312802521, "grad_norm": 0.36112940311431885, "learning_rate": 5.443700706737154e-07, "loss": 0.011473345570266247, "memory(GiB)": 22.66, "step": 26399, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957895 }, { "epoch": 0.8576162167430075, "grad_norm": 0.5848538875579834, "learning_rate": 5.441263600370061e-07, "loss": 0.007093362044543028, "memory(GiB)": 22.66, "step": 26400, "token_acc": 1.0, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.857648702205763, "grad_norm": 0.2164636105298996, "learning_rate": 5.438827008270097e-07, "loss": 0.00930265337228775, "memory(GiB)": 22.66, "step": 26401, "token_acc": 1.0, "train_speed(iter/s)": 0.957909 }, { "epoch": 0.8576811876685183, "grad_norm": 0.32613804936408997, "learning_rate": 5.436390930465385e-07, "loss": 0.009053424000740051, "memory(GiB)": 22.66, "step": 26402, "token_acc": 1.0, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.8577136731312738, "grad_norm": 0.3635501265525818, "learning_rate": 5.433955366984039e-07, "loss": 0.01140227448195219, "memory(GiB)": 22.66, "step": 26403, "token_acc": 1.0, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.8577461585940291, "grad_norm": 0.30257147550582886, "learning_rate": 5.431520317854155e-07, "loss": 0.004666083492338657, "memory(GiB)": 22.66, "step": 26404, "token_acc": 1.0, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.8577786440567846, "grad_norm": 0.40175047516822815, "learning_rate": 5.42908578310386e-07, "loss": 0.012566572055220604, "memory(GiB)": 22.66, "step": 26405, "token_acc": 1.0, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.85781112951954, "grad_norm": 0.24258124828338623, "learning_rate": 5.42665176276122e-07, "loss": 0.006743531674146652, "memory(GiB)": 22.66, "step": 26406, "token_acc": 1.0, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.8578436149822954, "grad_norm": 0.4903284013271332, "learning_rate": 5.424218256854364e-07, "loss": 0.020658180117607117, "memory(GiB)": 22.66, "step": 26407, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957952 }, { "epoch": 0.8578761004450508, "grad_norm": 0.28126055002212524, "learning_rate": 5.42178526541135e-07, "loss": 0.009056220762431622, "memory(GiB)": 22.66, "step": 26408, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.8579085859078063, "grad_norm": 0.38336023688316345, "learning_rate": 5.419352788460274e-07, "loss": 0.00985698588192463, "memory(GiB)": 22.66, "step": 26409, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.8579410713705616, "grad_norm": 0.3872587978839874, "learning_rate": 5.416920826029199e-07, "loss": 0.01658138446509838, "memory(GiB)": 22.66, "step": 26410, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957976 }, { "epoch": 0.8579735568333171, "grad_norm": 0.47126197814941406, "learning_rate": 5.414489378146198e-07, "loss": 0.019239459186792374, "memory(GiB)": 22.66, "step": 26411, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957984 }, { "epoch": 0.8580060422960725, "grad_norm": 0.3460809290409088, "learning_rate": 5.41205844483933e-07, "loss": 0.011603164486587048, "memory(GiB)": 22.66, "step": 26412, "token_acc": 1.0, "train_speed(iter/s)": 0.957991 }, { "epoch": 0.8580385277588279, "grad_norm": 0.3018175959587097, "learning_rate": 5.409628026136654e-07, "loss": 0.009207308292388916, "memory(GiB)": 22.66, "step": 26413, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.8580710132215833, "grad_norm": 0.25241750478744507, "learning_rate": 5.407198122066226e-07, "loss": 0.012878520414233208, "memory(GiB)": 22.66, "step": 26414, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.8581034986843388, "grad_norm": 0.25918376445770264, "learning_rate": 5.40476873265608e-07, "loss": 0.008683105930685997, "memory(GiB)": 22.66, "step": 26415, "token_acc": 1.0, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.8581359841470941, "grad_norm": 0.4369952976703644, "learning_rate": 5.402339857934258e-07, "loss": 0.01226004958152771, "memory(GiB)": 22.66, "step": 26416, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.8581684696098496, "grad_norm": 0.3488195836544037, "learning_rate": 5.399911497928795e-07, "loss": 0.012121764943003654, "memory(GiB)": 22.66, "step": 26417, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.858200955072605, "grad_norm": 0.33335554599761963, "learning_rate": 5.397483652667723e-07, "loss": 0.009506141766905785, "memory(GiB)": 22.66, "step": 26418, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.8582334405353604, "grad_norm": 0.4025036692619324, "learning_rate": 5.395056322179043e-07, "loss": 0.00869489274919033, "memory(GiB)": 22.66, "step": 26419, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958034 }, { "epoch": 0.8582659259981158, "grad_norm": 0.4871773421764374, "learning_rate": 5.392629506490787e-07, "loss": 0.019000355154275894, "memory(GiB)": 22.66, "step": 26420, "token_acc": 1.0, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.8582984114608713, "grad_norm": 0.31808164715766907, "learning_rate": 5.390203205630956e-07, "loss": 0.013603400439023972, "memory(GiB)": 22.66, "step": 26421, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.8583308969236266, "grad_norm": 0.16857384145259857, "learning_rate": 5.387777419627571e-07, "loss": 0.007279150187969208, "memory(GiB)": 22.66, "step": 26422, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.8583633823863821, "grad_norm": 0.7923502922058105, "learning_rate": 5.385352148508599e-07, "loss": 0.013404187746345997, "memory(GiB)": 22.66, "step": 26423, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958057 }, { "epoch": 0.8583958678491375, "grad_norm": 0.3633841276168823, "learning_rate": 5.382927392302045e-07, "loss": 0.011474085971713066, "memory(GiB)": 22.66, "step": 26424, "token_acc": 1.0, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.8584283533118929, "grad_norm": 0.2471267580986023, "learning_rate": 5.380503151035893e-07, "loss": 0.006541277747601271, "memory(GiB)": 22.66, "step": 26425, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.958068 }, { "epoch": 0.8584608387746483, "grad_norm": 0.5174965262413025, "learning_rate": 5.378079424738136e-07, "loss": 0.013296077027916908, "memory(GiB)": 22.66, "step": 26426, "token_acc": 1.0, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.8584933242374038, "grad_norm": 0.3617708683013916, "learning_rate": 5.375656213436719e-07, "loss": 0.011037047952413559, "memory(GiB)": 22.66, "step": 26427, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.8585258097001591, "grad_norm": 0.4042273759841919, "learning_rate": 5.373233517159626e-07, "loss": 0.01153047475963831, "memory(GiB)": 22.66, "step": 26428, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.8585582951629146, "grad_norm": 0.4455944299697876, "learning_rate": 5.370811335934823e-07, "loss": 0.012219306081533432, "memory(GiB)": 22.66, "step": 26429, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.958092 }, { "epoch": 0.85859078062567, "grad_norm": 0.40882667899131775, "learning_rate": 5.368389669790252e-07, "loss": 0.011611747555434704, "memory(GiB)": 22.66, "step": 26430, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.8586232660884254, "grad_norm": 0.3829647898674011, "learning_rate": 5.365968518753878e-07, "loss": 0.007796387188136578, "memory(GiB)": 22.66, "step": 26431, "token_acc": 1.0, "train_speed(iter/s)": 0.958104 }, { "epoch": 0.8586557515511809, "grad_norm": 0.40854009985923767, "learning_rate": 5.363547882853631e-07, "loss": 0.017585575580596924, "memory(GiB)": 22.66, "step": 26432, "token_acc": 1.0, "train_speed(iter/s)": 0.95811 }, { "epoch": 0.8586882370139363, "grad_norm": 0.33493930101394653, "learning_rate": 5.361127762117458e-07, "loss": 0.013432083651423454, "memory(GiB)": 22.66, "step": 26433, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.8587207224766917, "grad_norm": 0.21736356616020203, "learning_rate": 5.358708156573261e-07, "loss": 0.007346475962549448, "memory(GiB)": 22.66, "step": 26434, "token_acc": 1.0, "train_speed(iter/s)": 0.958125 }, { "epoch": 0.8587532079394471, "grad_norm": 0.32557788491249084, "learning_rate": 5.356289066249016e-07, "loss": 0.010532457381486893, "memory(GiB)": 22.66, "step": 26435, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958133 }, { "epoch": 0.8587856934022026, "grad_norm": 0.2855123281478882, "learning_rate": 5.353870491172603e-07, "loss": 0.009809479117393494, "memory(GiB)": 22.66, "step": 26436, "token_acc": 1.0, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.8588181788649579, "grad_norm": 0.30688050389289856, "learning_rate": 5.351452431371962e-07, "loss": 0.011168602854013443, "memory(GiB)": 22.66, "step": 26437, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.958147 }, { "epoch": 0.8588506643277134, "grad_norm": 0.4122917056083679, "learning_rate": 5.34903488687497e-07, "loss": 0.011153839528560638, "memory(GiB)": 22.66, "step": 26438, "token_acc": 1.0, "train_speed(iter/s)": 0.958153 }, { "epoch": 0.8588831497904688, "grad_norm": 0.2322123944759369, "learning_rate": 5.346617857709552e-07, "loss": 0.006149878725409508, "memory(GiB)": 22.66, "step": 26439, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958159 }, { "epoch": 0.8589156352532242, "grad_norm": 0.49056556820869446, "learning_rate": 5.344201343903594e-07, "loss": 0.014093548059463501, "memory(GiB)": 22.66, "step": 26440, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958166 }, { "epoch": 0.8589481207159796, "grad_norm": 0.27602261304855347, "learning_rate": 5.341785345485001e-07, "loss": 0.012247681617736816, "memory(GiB)": 22.66, "step": 26441, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.8589806061787351, "grad_norm": 0.2438182383775711, "learning_rate": 5.339369862481636e-07, "loss": 0.00916270911693573, "memory(GiB)": 22.66, "step": 26442, "token_acc": 1.0, "train_speed(iter/s)": 0.958178 }, { "epoch": 0.8590130916414904, "grad_norm": 0.39072856307029724, "learning_rate": 5.336954894921381e-07, "loss": 0.014503461308777332, "memory(GiB)": 22.66, "step": 26443, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.8590455771042459, "grad_norm": 0.33822351694107056, "learning_rate": 5.33454044283212e-07, "loss": 0.010796505026519299, "memory(GiB)": 22.66, "step": 26444, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95819 }, { "epoch": 0.8590780625670013, "grad_norm": 0.341081440448761, "learning_rate": 5.332126506241708e-07, "loss": 0.013423949480056763, "memory(GiB)": 22.66, "step": 26445, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.8591105480297567, "grad_norm": 0.2997814118862152, "learning_rate": 5.329713085178017e-07, "loss": 0.010913534089922905, "memory(GiB)": 22.66, "step": 26446, "token_acc": 1.0, "train_speed(iter/s)": 0.958202 }, { "epoch": 0.8591430334925121, "grad_norm": 0.2711687684059143, "learning_rate": 5.327300179668887e-07, "loss": 0.012818878516554832, "memory(GiB)": 22.66, "step": 26447, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.8591755189552676, "grad_norm": 0.18512718379497528, "learning_rate": 5.324887789742167e-07, "loss": 0.006948923692107201, "memory(GiB)": 22.66, "step": 26448, "token_acc": 1.0, "train_speed(iter/s)": 0.958214 }, { "epoch": 0.8592080044180229, "grad_norm": 0.3509964346885681, "learning_rate": 5.322475915425712e-07, "loss": 0.010003045201301575, "memory(GiB)": 22.66, "step": 26449, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95822 }, { "epoch": 0.8592404898807784, "grad_norm": 0.44835397601127625, "learning_rate": 5.320064556747356e-07, "loss": 0.017222978174686432, "memory(GiB)": 22.66, "step": 26450, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.8592729753435338, "grad_norm": 0.3068494498729706, "learning_rate": 5.31765371373491e-07, "loss": 0.01203608326613903, "memory(GiB)": 22.66, "step": 26451, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.958231 }, { "epoch": 0.8593054608062892, "grad_norm": 0.38064396381378174, "learning_rate": 5.315243386416218e-07, "loss": 0.00976583268493414, "memory(GiB)": 22.66, "step": 26452, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958238 }, { "epoch": 0.8593379462690446, "grad_norm": 0.3851528763771057, "learning_rate": 5.312833574819093e-07, "loss": 0.015138278715312481, "memory(GiB)": 22.66, "step": 26453, "token_acc": 1.0, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.8593704317318, "grad_norm": 0.2909964919090271, "learning_rate": 5.310424278971354e-07, "loss": 0.010571170598268509, "memory(GiB)": 22.66, "step": 26454, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95825 }, { "epoch": 0.8594029171945554, "grad_norm": 0.36283183097839355, "learning_rate": 5.308015498900792e-07, "loss": 0.010715040378272533, "memory(GiB)": 22.66, "step": 26455, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958256 }, { "epoch": 0.8594354026573109, "grad_norm": 0.32642897963523865, "learning_rate": 5.305607234635224e-07, "loss": 0.014753539115190506, "memory(GiB)": 22.66, "step": 26456, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.958261 }, { "epoch": 0.8594678881200662, "grad_norm": 0.39870932698249817, "learning_rate": 5.303199486202415e-07, "loss": 0.012427695095539093, "memory(GiB)": 22.66, "step": 26457, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.958267 }, { "epoch": 0.8595003735828217, "grad_norm": 0.2906632721424103, "learning_rate": 5.300792253630194e-07, "loss": 0.007412828505039215, "memory(GiB)": 22.66, "step": 26458, "token_acc": 1.0, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.8595328590455771, "grad_norm": 0.4427134394645691, "learning_rate": 5.298385536946326e-07, "loss": 0.01244090311229229, "memory(GiB)": 22.66, "step": 26459, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958279 }, { "epoch": 0.8595653445083326, "grad_norm": 0.40862831473350525, "learning_rate": 5.29597933617858e-07, "loss": 0.008781332522630692, "memory(GiB)": 22.66, "step": 26460, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958285 }, { "epoch": 0.8595978299710879, "grad_norm": 0.2878670394420624, "learning_rate": 5.293573651354744e-07, "loss": 0.007691153325140476, "memory(GiB)": 22.66, "step": 26461, "token_acc": 1.0, "train_speed(iter/s)": 0.958293 }, { "epoch": 0.8596303154338434, "grad_norm": 0.44442570209503174, "learning_rate": 5.291168482502551e-07, "loss": 0.01433483138680458, "memory(GiB)": 22.66, "step": 26462, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958301 }, { "epoch": 0.8596628008965987, "grad_norm": 0.3463292717933655, "learning_rate": 5.288763829649801e-07, "loss": 0.008043899200856686, "memory(GiB)": 22.66, "step": 26463, "token_acc": 1.0, "train_speed(iter/s)": 0.958308 }, { "epoch": 0.8596952863593542, "grad_norm": 0.285415381193161, "learning_rate": 5.286359692824217e-07, "loss": 0.0059716831892728806, "memory(GiB)": 22.66, "step": 26464, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.958316 }, { "epoch": 0.8597277718221096, "grad_norm": 0.31263065338134766, "learning_rate": 5.283956072053564e-07, "loss": 0.00981459766626358, "memory(GiB)": 22.66, "step": 26465, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.958324 }, { "epoch": 0.859760257284865, "grad_norm": 0.5678107142448425, "learning_rate": 5.28155296736556e-07, "loss": 0.021301930770277977, "memory(GiB)": 22.66, "step": 26466, "token_acc": 1.0, "train_speed(iter/s)": 0.958332 }, { "epoch": 0.8597927427476204, "grad_norm": 2.724487543106079, "learning_rate": 5.279150378787973e-07, "loss": 0.013374237343668938, "memory(GiB)": 22.66, "step": 26467, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.958339 }, { "epoch": 0.8598252282103759, "grad_norm": 0.45305299758911133, "learning_rate": 5.2767483063485e-07, "loss": 0.02000388689339161, "memory(GiB)": 22.66, "step": 26468, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958347 }, { "epoch": 0.8598577136731312, "grad_norm": 0.3256540894508362, "learning_rate": 5.274346750074894e-07, "loss": 0.0070924605242908, "memory(GiB)": 22.66, "step": 26469, "token_acc": 1.0, "train_speed(iter/s)": 0.958355 }, { "epoch": 0.8598901991358867, "grad_norm": 1.152123212814331, "learning_rate": 5.271945709994842e-07, "loss": 0.012638477608561516, "memory(GiB)": 22.66, "step": 26470, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958362 }, { "epoch": 0.8599226845986421, "grad_norm": 0.4215013384819031, "learning_rate": 5.269545186136072e-07, "loss": 0.015302536077797413, "memory(GiB)": 22.66, "step": 26471, "token_acc": 1.0, "train_speed(iter/s)": 0.95837 }, { "epoch": 0.8599551700613975, "grad_norm": 0.4148717522621155, "learning_rate": 5.267145178526284e-07, "loss": 0.010301749221980572, "memory(GiB)": 22.66, "step": 26472, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958378 }, { "epoch": 0.8599876555241529, "grad_norm": 0.30106356739997864, "learning_rate": 5.26474568719319e-07, "loss": 0.01068976242095232, "memory(GiB)": 22.66, "step": 26473, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.958384 }, { "epoch": 0.8600201409869084, "grad_norm": 0.3602902889251709, "learning_rate": 5.262346712164463e-07, "loss": 0.013445544056594372, "memory(GiB)": 22.66, "step": 26474, "token_acc": 0.9897435897435898, "train_speed(iter/s)": 0.958389 }, { "epoch": 0.8600526264496637, "grad_norm": 0.40259474515914917, "learning_rate": 5.259948253467806e-07, "loss": 0.016066618263721466, "memory(GiB)": 22.66, "step": 26475, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958395 }, { "epoch": 0.8600851119124192, "grad_norm": 0.4993622601032257, "learning_rate": 5.257550311130888e-07, "loss": 0.01207075733691454, "memory(GiB)": 22.66, "step": 26476, "token_acc": 1.0, "train_speed(iter/s)": 0.9584 }, { "epoch": 0.8601175973751746, "grad_norm": 0.29382333159446716, "learning_rate": 5.255152885181398e-07, "loss": 0.007579799275845289, "memory(GiB)": 22.66, "step": 26477, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.958406 }, { "epoch": 0.86015008283793, "grad_norm": 0.4278647303581238, "learning_rate": 5.252755975647e-07, "loss": 0.016305021941661835, "memory(GiB)": 22.66, "step": 26478, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.958412 }, { "epoch": 0.8601825683006854, "grad_norm": 0.3452397286891937, "learning_rate": 5.250359582555353e-07, "loss": 0.008771456778049469, "memory(GiB)": 22.66, "step": 26479, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958418 }, { "epoch": 0.8602150537634409, "grad_norm": 0.2525261342525482, "learning_rate": 5.247963705934117e-07, "loss": 0.0076215979643166065, "memory(GiB)": 22.66, "step": 26480, "token_acc": 1.0, "train_speed(iter/s)": 0.958424 }, { "epoch": 0.8602475392261962, "grad_norm": 0.6049953699111938, "learning_rate": 5.245568345810947e-07, "loss": 0.018676752224564552, "memory(GiB)": 22.66, "step": 26481, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.958429 }, { "epoch": 0.8602800246889517, "grad_norm": 0.44989126920700073, "learning_rate": 5.243173502213495e-07, "loss": 0.011462355963885784, "memory(GiB)": 22.66, "step": 26482, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958435 }, { "epoch": 0.8603125101517071, "grad_norm": 0.3988092839717865, "learning_rate": 5.24077917516938e-07, "loss": 0.018894711509346962, "memory(GiB)": 22.66, "step": 26483, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.95844 }, { "epoch": 0.8603449956144625, "grad_norm": 2.06050181388855, "learning_rate": 5.238385364706249e-07, "loss": 0.011246699839830399, "memory(GiB)": 22.66, "step": 26484, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958446 }, { "epoch": 0.8603774810772179, "grad_norm": 0.3349909484386444, "learning_rate": 5.235992070851725e-07, "loss": 0.012896058149635792, "memory(GiB)": 22.66, "step": 26485, "token_acc": 1.0, "train_speed(iter/s)": 0.958452 }, { "epoch": 0.8604099665399734, "grad_norm": 0.451896995306015, "learning_rate": 5.233599293633451e-07, "loss": 0.011372164823114872, "memory(GiB)": 22.66, "step": 26486, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.958458 }, { "epoch": 0.8604424520027287, "grad_norm": 0.27233320474624634, "learning_rate": 5.231207033079011e-07, "loss": 0.009633623994886875, "memory(GiB)": 22.66, "step": 26487, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.958464 }, { "epoch": 0.8604749374654842, "grad_norm": 0.41865798830986023, "learning_rate": 5.228815289216038e-07, "loss": 0.015510257333517075, "memory(GiB)": 22.66, "step": 26488, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958471 }, { "epoch": 0.8605074229282396, "grad_norm": 0.27024775743484497, "learning_rate": 5.226424062072105e-07, "loss": 0.008221965283155441, "memory(GiB)": 22.66, "step": 26489, "token_acc": 1.0, "train_speed(iter/s)": 0.958476 }, { "epoch": 0.860539908390995, "grad_norm": 0.6701455116271973, "learning_rate": 5.224033351674846e-07, "loss": 0.021649811416864395, "memory(GiB)": 22.66, "step": 26490, "token_acc": 1.0, "train_speed(iter/s)": 0.958482 }, { "epoch": 0.8605723938537504, "grad_norm": 0.2644726037979126, "learning_rate": 5.221643158051847e-07, "loss": 0.00833054631948471, "memory(GiB)": 22.66, "step": 26491, "token_acc": 1.0, "train_speed(iter/s)": 0.958488 }, { "epoch": 0.8606048793165059, "grad_norm": 0.3127689063549042, "learning_rate": 5.219253481230674e-07, "loss": 0.010667865164577961, "memory(GiB)": 22.66, "step": 26492, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.958493 }, { "epoch": 0.8606373647792612, "grad_norm": 0.4404534697532654, "learning_rate": 5.216864321238929e-07, "loss": 0.014081261120736599, "memory(GiB)": 22.66, "step": 26493, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958501 }, { "epoch": 0.8606698502420167, "grad_norm": 0.29872527718544006, "learning_rate": 5.214475678104158e-07, "loss": 0.00974151398986578, "memory(GiB)": 22.66, "step": 26494, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958509 }, { "epoch": 0.8607023357047721, "grad_norm": 0.4418809115886688, "learning_rate": 5.212087551853967e-07, "loss": 0.015869813039898872, "memory(GiB)": 22.66, "step": 26495, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958516 }, { "epoch": 0.8607348211675275, "grad_norm": 0.3610171973705292, "learning_rate": 5.209699942515889e-07, "loss": 0.016326701268553734, "memory(GiB)": 22.66, "step": 26496, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958524 }, { "epoch": 0.860767306630283, "grad_norm": 0.27314382791519165, "learning_rate": 5.207312850117497e-07, "loss": 0.008102646097540855, "memory(GiB)": 22.66, "step": 26497, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958531 }, { "epoch": 0.8607997920930384, "grad_norm": 0.5782312154769897, "learning_rate": 5.204926274686328e-07, "loss": 0.018387237563729286, "memory(GiB)": 22.66, "step": 26498, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958539 }, { "epoch": 0.8608322775557938, "grad_norm": 0.3128182590007782, "learning_rate": 5.20254021624993e-07, "loss": 0.010978300124406815, "memory(GiB)": 22.66, "step": 26499, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958545 }, { "epoch": 0.8608647630185492, "grad_norm": 0.2944931387901306, "learning_rate": 5.200154674835839e-07, "loss": 0.007078580558300018, "memory(GiB)": 22.66, "step": 26500, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958551 }, { "epoch": 0.8608647630185492, "eval_loss": 0.01167321763932705, "eval_runtime": 80.6517, "eval_samples_per_second": 123.37, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9952961729278534, "step": 26500 }, { "epoch": 0.8608972484813047, "grad_norm": 0.22346021234989166, "learning_rate": 5.197769650471606e-07, "loss": 0.006907913833856583, "memory(GiB)": 22.66, "step": 26501, "token_acc": 0.9950527614087982, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.86092973394406, "grad_norm": 0.34468021988868713, "learning_rate": 5.195385143184728e-07, "loss": 0.011225124821066856, "memory(GiB)": 22.66, "step": 26502, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.8609622194068155, "grad_norm": 0.2893737554550171, "learning_rate": 5.193001153002747e-07, "loss": 0.008961442857980728, "memory(GiB)": 22.66, "step": 26503, "token_acc": 1.0, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.8609947048695709, "grad_norm": 0.292673796415329, "learning_rate": 5.190617679953169e-07, "loss": 0.009845107793807983, "memory(GiB)": 22.66, "step": 26504, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955399 }, { "epoch": 0.8610271903323263, "grad_norm": 0.39759042859077454, "learning_rate": 5.188234724063501e-07, "loss": 0.011207993142306805, "memory(GiB)": 22.66, "step": 26505, "token_acc": 0.994413407821229, "train_speed(iter/s)": 0.955404 }, { "epoch": 0.8610596757950817, "grad_norm": 0.4006885588169098, "learning_rate": 5.185852285361259e-07, "loss": 0.014019189402461052, "memory(GiB)": 22.66, "step": 26506, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955409 }, { "epoch": 0.8610921612578372, "grad_norm": 0.3658830225467682, "learning_rate": 5.183470363873921e-07, "loss": 0.013553242199122906, "memory(GiB)": 22.66, "step": 26507, "token_acc": 1.0, "train_speed(iter/s)": 0.955414 }, { "epoch": 0.8611246467205925, "grad_norm": 0.3927139937877655, "learning_rate": 5.181088959628983e-07, "loss": 0.013789297081530094, "memory(GiB)": 22.66, "step": 26508, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.95542 }, { "epoch": 0.861157132183348, "grad_norm": 0.2943229377269745, "learning_rate": 5.178708072653938e-07, "loss": 0.011852479539811611, "memory(GiB)": 22.66, "step": 26509, "token_acc": 0.9855769230769231, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.8611896176461034, "grad_norm": 0.3089984953403473, "learning_rate": 5.176327702976264e-07, "loss": 0.010856582783162594, "memory(GiB)": 22.66, "step": 26510, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955431 }, { "epoch": 0.8612221031088588, "grad_norm": 0.4055011570453644, "learning_rate": 5.173947850623418e-07, "loss": 0.007453545928001404, "memory(GiB)": 22.66, "step": 26511, "token_acc": 1.0, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.8612545885716142, "grad_norm": 0.33706924319267273, "learning_rate": 5.171568515622876e-07, "loss": 0.008179373107850552, "memory(GiB)": 22.66, "step": 26512, "token_acc": 1.0, "train_speed(iter/s)": 0.955442 }, { "epoch": 0.8612870740343697, "grad_norm": 0.5658156871795654, "learning_rate": 5.169189698002103e-07, "loss": 0.020328864455223083, "memory(GiB)": 22.66, "step": 26513, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955448 }, { "epoch": 0.861319559497125, "grad_norm": 0.3185473084449768, "learning_rate": 5.166811397788563e-07, "loss": 0.011224687099456787, "memory(GiB)": 22.66, "step": 26514, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955456 }, { "epoch": 0.8613520449598805, "grad_norm": 0.4286128878593445, "learning_rate": 5.164433615009679e-07, "loss": 0.010278388857841492, "memory(GiB)": 22.66, "step": 26515, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.8613845304226359, "grad_norm": 0.3365716338157654, "learning_rate": 5.162056349692918e-07, "loss": 0.008373476564884186, "memory(GiB)": 22.66, "step": 26516, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.8614170158853913, "grad_norm": 0.3729950785636902, "learning_rate": 5.159679601865686e-07, "loss": 0.01184402871876955, "memory(GiB)": 22.66, "step": 26517, "token_acc": 1.0, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.8614495013481467, "grad_norm": 0.23342597484588623, "learning_rate": 5.15730337155545e-07, "loss": 0.008565285243093967, "memory(GiB)": 22.66, "step": 26518, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955486 }, { "epoch": 0.8614819868109022, "grad_norm": 0.40103185176849365, "learning_rate": 5.15492765878961e-07, "loss": 0.013068604283034801, "memory(GiB)": 22.66, "step": 26519, "token_acc": 1.0, "train_speed(iter/s)": 0.955492 }, { "epoch": 0.8615144722736575, "grad_norm": 0.34957489371299744, "learning_rate": 5.1525524635956e-07, "loss": 0.010299718007445335, "memory(GiB)": 22.66, "step": 26520, "token_acc": 1.0, "train_speed(iter/s)": 0.9555 }, { "epoch": 0.861546957736413, "grad_norm": 0.4463037848472595, "learning_rate": 5.150177786000832e-07, "loss": 0.013323234394192696, "memory(GiB)": 22.66, "step": 26521, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.955508 }, { "epoch": 0.8615794431991683, "grad_norm": 0.1975046694278717, "learning_rate": 5.147803626032682e-07, "loss": 0.010971159674227238, "memory(GiB)": 22.66, "step": 26522, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955515 }, { "epoch": 0.8616119286619238, "grad_norm": 0.4200044572353363, "learning_rate": 5.145429983718603e-07, "loss": 0.01559714786708355, "memory(GiB)": 22.66, "step": 26523, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955522 }, { "epoch": 0.8616444141246792, "grad_norm": 0.5798022747039795, "learning_rate": 5.143056859085954e-07, "loss": 0.016698624938726425, "memory(GiB)": 22.66, "step": 26524, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.95553 }, { "epoch": 0.8616768995874347, "grad_norm": 0.289867103099823, "learning_rate": 5.140684252162137e-07, "loss": 0.008400587365031242, "memory(GiB)": 22.66, "step": 26525, "token_acc": 1.0, "train_speed(iter/s)": 0.955538 }, { "epoch": 0.86170938505019, "grad_norm": 0.4046870470046997, "learning_rate": 5.138312162974524e-07, "loss": 0.013908623717725277, "memory(GiB)": 22.66, "step": 26526, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.8617418705129455, "grad_norm": 0.29192498326301575, "learning_rate": 5.135940591550503e-07, "loss": 0.007211756426841021, "memory(GiB)": 22.66, "step": 26527, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.8617743559757008, "grad_norm": 0.3482901453971863, "learning_rate": 5.133569537917438e-07, "loss": 0.01039827335625887, "memory(GiB)": 22.66, "step": 26528, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955562 }, { "epoch": 0.8618068414384563, "grad_norm": 0.3135073781013489, "learning_rate": 5.13119900210271e-07, "loss": 0.010160104371607304, "memory(GiB)": 22.66, "step": 26529, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955569 }, { "epoch": 0.8618393269012117, "grad_norm": 0.332122266292572, "learning_rate": 5.128828984133654e-07, "loss": 0.012236922979354858, "memory(GiB)": 22.66, "step": 26530, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.8618718123639671, "grad_norm": 0.5988189578056335, "learning_rate": 5.126459484037632e-07, "loss": 0.01083064079284668, "memory(GiB)": 22.66, "step": 26531, "token_acc": 1.0, "train_speed(iter/s)": 0.955584 }, { "epoch": 0.8619042978267225, "grad_norm": 0.4872550666332245, "learning_rate": 5.124090501842e-07, "loss": 0.012748520821332932, "memory(GiB)": 22.66, "step": 26532, "token_acc": 1.0, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.861936783289478, "grad_norm": 0.544206440448761, "learning_rate": 5.121722037574101e-07, "loss": 0.01500706747174263, "memory(GiB)": 22.66, "step": 26533, "token_acc": 1.0, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.8619692687522333, "grad_norm": 0.4260346293449402, "learning_rate": 5.119354091261253e-07, "loss": 0.007986565120518208, "memory(GiB)": 22.66, "step": 26534, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.8620017542149888, "grad_norm": 0.18177947402000427, "learning_rate": 5.116986662930795e-07, "loss": 0.004738016054034233, "memory(GiB)": 22.66, "step": 26535, "token_acc": 1.0, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.8620342396777442, "grad_norm": 0.4066165089607239, "learning_rate": 5.114619752610045e-07, "loss": 0.015464218333363533, "memory(GiB)": 22.66, "step": 26536, "token_acc": 0.996551724137931, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.8620667251404996, "grad_norm": 0.4565853774547577, "learning_rate": 5.11225336032633e-07, "loss": 0.016756411641836166, "memory(GiB)": 22.66, "step": 26537, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.862099210603255, "grad_norm": 0.5228590369224548, "learning_rate": 5.109887486106968e-07, "loss": 0.007786882109940052, "memory(GiB)": 22.66, "step": 26538, "token_acc": 1.0, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.8621316960660105, "grad_norm": 0.2491615116596222, "learning_rate": 5.107522129979243e-07, "loss": 0.010831792838871479, "memory(GiB)": 22.66, "step": 26539, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.8621641815287658, "grad_norm": 0.471876859664917, "learning_rate": 5.105157291970458e-07, "loss": 0.008235589601099491, "memory(GiB)": 22.66, "step": 26540, "token_acc": 1.0, "train_speed(iter/s)": 0.955644 }, { "epoch": 0.8621966669915213, "grad_norm": 0.37092864513397217, "learning_rate": 5.102792972107918e-07, "loss": 0.013257636688649654, "memory(GiB)": 22.66, "step": 26541, "token_acc": 1.0, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.8622291524542767, "grad_norm": 0.5919777154922485, "learning_rate": 5.100429170418913e-07, "loss": 0.014721200801432133, "memory(GiB)": 22.66, "step": 26542, "token_acc": 1.0, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.8622616379170321, "grad_norm": 0.41199108958244324, "learning_rate": 5.098065886930709e-07, "loss": 0.017090551555156708, "memory(GiB)": 22.66, "step": 26543, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955663 }, { "epoch": 0.8622941233797875, "grad_norm": 0.45681828260421753, "learning_rate": 5.095703121670597e-07, "loss": 0.018543127924203873, "memory(GiB)": 22.66, "step": 26544, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.862326608842543, "grad_norm": 0.3607262969017029, "learning_rate": 5.09334087466582e-07, "loss": 0.012635432183742523, "memory(GiB)": 22.66, "step": 26545, "token_acc": 1.0, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.8623590943052983, "grad_norm": 0.46433722972869873, "learning_rate": 5.090979145943681e-07, "loss": 0.00832395814359188, "memory(GiB)": 22.66, "step": 26546, "token_acc": 1.0, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.8623915797680538, "grad_norm": 0.2947004735469818, "learning_rate": 5.088617935531404e-07, "loss": 0.010498994030058384, "memory(GiB)": 22.66, "step": 26547, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955686 }, { "epoch": 0.8624240652308092, "grad_norm": 0.4347395598888397, "learning_rate": 5.086257243456266e-07, "loss": 0.01627834513783455, "memory(GiB)": 22.66, "step": 26548, "token_acc": 0.9891696750902527, "train_speed(iter/s)": 0.955692 }, { "epoch": 0.8624565506935646, "grad_norm": 0.4142615795135498, "learning_rate": 5.083897069745486e-07, "loss": 0.012079273350536823, "memory(GiB)": 22.66, "step": 26549, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.86248903615632, "grad_norm": 0.3580942451953888, "learning_rate": 5.081537414426319e-07, "loss": 0.011477898806333542, "memory(GiB)": 22.66, "step": 26550, "token_acc": 1.0, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.8625215216190755, "grad_norm": 0.4401106834411621, "learning_rate": 5.079178277525993e-07, "loss": 0.01852976530790329, "memory(GiB)": 22.66, "step": 26551, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955711 }, { "epoch": 0.8625540070818308, "grad_norm": 0.5036683082580566, "learning_rate": 5.076819659071741e-07, "loss": 0.015828225761651993, "memory(GiB)": 22.66, "step": 26552, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.8625864925445863, "grad_norm": 0.4155603349208832, "learning_rate": 5.074461559090798e-07, "loss": 0.011928204447031021, "memory(GiB)": 22.66, "step": 26553, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.8626189780073417, "grad_norm": 0.29708370566368103, "learning_rate": 5.072103977610349e-07, "loss": 0.011605819687247276, "memory(GiB)": 22.66, "step": 26554, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955727 }, { "epoch": 0.8626514634700971, "grad_norm": 0.42951348423957825, "learning_rate": 5.06974691465762e-07, "loss": 0.012734371237456799, "memory(GiB)": 22.66, "step": 26555, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.8626839489328525, "grad_norm": 0.38024193048477173, "learning_rate": 5.067390370259811e-07, "loss": 0.01816920004785061, "memory(GiB)": 22.66, "step": 26556, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.862716434395608, "grad_norm": 0.3644256889820099, "learning_rate": 5.065034344444136e-07, "loss": 0.010601992718875408, "memory(GiB)": 22.66, "step": 26557, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955741 }, { "epoch": 0.8627489198583633, "grad_norm": 0.35150566697120667, "learning_rate": 5.062678837237761e-07, "loss": 0.010621681809425354, "memory(GiB)": 22.66, "step": 26558, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955747 }, { "epoch": 0.8627814053211188, "grad_norm": 3.200561046600342, "learning_rate": 5.060323848667887e-07, "loss": 0.013593867421150208, "memory(GiB)": 22.66, "step": 26559, "token_acc": 1.0, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.8628138907838743, "grad_norm": 0.27426713705062866, "learning_rate": 5.057969378761684e-07, "loss": 0.007734575308859348, "memory(GiB)": 22.66, "step": 26560, "token_acc": 1.0, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.8628463762466296, "grad_norm": 0.2981964945793152, "learning_rate": 5.055615427546351e-07, "loss": 0.010470274835824966, "memory(GiB)": 22.66, "step": 26561, "token_acc": 1.0, "train_speed(iter/s)": 0.955761 }, { "epoch": 0.8628788617093851, "grad_norm": 0.38126006722450256, "learning_rate": 5.05326199504902e-07, "loss": 0.009274126030504704, "memory(GiB)": 22.66, "step": 26562, "token_acc": 1.0, "train_speed(iter/s)": 0.955766 }, { "epoch": 0.8629113471721405, "grad_norm": 0.23693318665027618, "learning_rate": 5.050909081296873e-07, "loss": 0.009794553741812706, "memory(GiB)": 22.66, "step": 26563, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955772 }, { "epoch": 0.8629438326348959, "grad_norm": 0.3514486253261566, "learning_rate": 5.048556686317058e-07, "loss": 0.013903489336371422, "memory(GiB)": 22.66, "step": 26564, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.8629763180976513, "grad_norm": 0.22049446403980255, "learning_rate": 5.04620481013674e-07, "loss": 0.004915828350931406, "memory(GiB)": 22.66, "step": 26565, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.8630088035604068, "grad_norm": 0.29156890511512756, "learning_rate": 5.043853452783048e-07, "loss": 0.0075448621064424515, "memory(GiB)": 22.66, "step": 26566, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955788 }, { "epoch": 0.8630412890231621, "grad_norm": 0.22866471111774445, "learning_rate": 5.041502614283117e-07, "loss": 0.010088425129652023, "memory(GiB)": 22.66, "step": 26567, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955794 }, { "epoch": 0.8630737744859176, "grad_norm": 0.4878484308719635, "learning_rate": 5.03915229466409e-07, "loss": 0.016535451635718346, "memory(GiB)": 22.66, "step": 26568, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.863106259948673, "grad_norm": 0.4808371961116791, "learning_rate": 5.036802493953086e-07, "loss": 0.010503091849386692, "memory(GiB)": 22.66, "step": 26569, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.955805 }, { "epoch": 0.8631387454114284, "grad_norm": 0.29013073444366455, "learning_rate": 5.034453212177237e-07, "loss": 0.014805884100496769, "memory(GiB)": 22.66, "step": 26570, "token_acc": 1.0, "train_speed(iter/s)": 0.955811 }, { "epoch": 0.8631712308741838, "grad_norm": 0.21220891177654266, "learning_rate": 5.032104449363634e-07, "loss": 0.008922122418880463, "memory(GiB)": 22.66, "step": 26571, "token_acc": 1.0, "train_speed(iter/s)": 0.955815 }, { "epoch": 0.8632037163369393, "grad_norm": 0.3548314571380615, "learning_rate": 5.02975620553941e-07, "loss": 0.010905379429459572, "memory(GiB)": 22.66, "step": 26572, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.8632362017996946, "grad_norm": 0.35898441076278687, "learning_rate": 5.027408480731638e-07, "loss": 0.011669822037220001, "memory(GiB)": 22.66, "step": 26573, "token_acc": 1.0, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.8632686872624501, "grad_norm": 0.21645379066467285, "learning_rate": 5.025061274967446e-07, "loss": 0.009127110242843628, "memory(GiB)": 22.66, "step": 26574, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955838 }, { "epoch": 0.8633011727252055, "grad_norm": 0.820908784866333, "learning_rate": 5.022714588273902e-07, "loss": 0.014039370231330395, "memory(GiB)": 22.66, "step": 26575, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.8633336581879609, "grad_norm": 0.31567874550819397, "learning_rate": 5.020368420678106e-07, "loss": 0.010981066152453423, "memory(GiB)": 22.66, "step": 26576, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.8633661436507163, "grad_norm": 0.44359686970710754, "learning_rate": 5.018022772207115e-07, "loss": 0.013131612911820412, "memory(GiB)": 22.66, "step": 26577, "token_acc": 1.0, "train_speed(iter/s)": 0.955861 }, { "epoch": 0.8633986291134718, "grad_norm": 0.4891289174556732, "learning_rate": 5.015677642888011e-07, "loss": 0.014237084425985813, "memory(GiB)": 22.66, "step": 26578, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.8634311145762271, "grad_norm": 0.3059861958026886, "learning_rate": 5.013333032747863e-07, "loss": 0.010194487869739532, "memory(GiB)": 22.66, "step": 26579, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.8634636000389826, "grad_norm": 0.5425301194190979, "learning_rate": 5.010988941813739e-07, "loss": 0.014755425974726677, "memory(GiB)": 22.66, "step": 26580, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.863496085501738, "grad_norm": 0.4028645157814026, "learning_rate": 5.00864537011267e-07, "loss": 0.012187285348773003, "memory(GiB)": 22.66, "step": 26581, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.955891 }, { "epoch": 0.8635285709644934, "grad_norm": 0.27212080359458923, "learning_rate": 5.006302317671718e-07, "loss": 0.010029420256614685, "memory(GiB)": 22.66, "step": 26582, "token_acc": 1.0, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.8635610564272488, "grad_norm": 0.4867914915084839, "learning_rate": 5.003959784517926e-07, "loss": 0.012748723849654198, "memory(GiB)": 22.66, "step": 26583, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.8635935418900043, "grad_norm": 0.27579066157341003, "learning_rate": 5.001617770678325e-07, "loss": 0.011864837259054184, "memory(GiB)": 22.66, "step": 26584, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.8636260273527596, "grad_norm": 1.3419365882873535, "learning_rate": 4.999276276179954e-07, "loss": 0.01896766386926174, "memory(GiB)": 22.66, "step": 26585, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95592 }, { "epoch": 0.8636585128155151, "grad_norm": 0.47220954298973083, "learning_rate": 4.996935301049826e-07, "loss": 0.012972649186849594, "memory(GiB)": 22.66, "step": 26586, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.8636909982782704, "grad_norm": 0.40070483088493347, "learning_rate": 4.994594845314965e-07, "loss": 0.01203934196382761, "memory(GiB)": 22.66, "step": 26587, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.8637234837410259, "grad_norm": 0.23453381657600403, "learning_rate": 4.992254909002381e-07, "loss": 0.007401846349239349, "memory(GiB)": 22.66, "step": 26588, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955943 }, { "epoch": 0.8637559692037813, "grad_norm": 0.3363240361213684, "learning_rate": 4.989915492139086e-07, "loss": 0.007710580714046955, "memory(GiB)": 22.66, "step": 26589, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955951 }, { "epoch": 0.8637884546665368, "grad_norm": 0.2455376833677292, "learning_rate": 4.987576594752069e-07, "loss": 0.012407738715410233, "memory(GiB)": 22.66, "step": 26590, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955958 }, { "epoch": 0.8638209401292921, "grad_norm": 0.3257608711719513, "learning_rate": 4.985238216868326e-07, "loss": 0.00856383703649044, "memory(GiB)": 22.66, "step": 26591, "token_acc": 1.0, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.8638534255920476, "grad_norm": 0.4770951271057129, "learning_rate": 4.982900358514853e-07, "loss": 0.018255244940519333, "memory(GiB)": 22.66, "step": 26592, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955974 }, { "epoch": 0.863885911054803, "grad_norm": 0.2793556749820709, "learning_rate": 4.980563019718632e-07, "loss": 0.005855749361217022, "memory(GiB)": 22.66, "step": 26593, "token_acc": 1.0, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.8639183965175584, "grad_norm": 0.39415544271469116, "learning_rate": 4.978226200506631e-07, "loss": 0.010315682739019394, "memory(GiB)": 22.66, "step": 26594, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.8639508819803138, "grad_norm": 0.37585753202438354, "learning_rate": 4.975889900905829e-07, "loss": 0.014115921221673489, "memory(GiB)": 22.66, "step": 26595, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.8639833674430693, "grad_norm": 0.3210594356060028, "learning_rate": 4.973554120943164e-07, "loss": 0.014616166241466999, "memory(GiB)": 22.66, "step": 26596, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956004 }, { "epoch": 0.8640158529058246, "grad_norm": 0.3822108507156372, "learning_rate": 4.971218860645627e-07, "loss": 0.017335040494799614, "memory(GiB)": 22.66, "step": 26597, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956011 }, { "epoch": 0.8640483383685801, "grad_norm": 0.3487493395805359, "learning_rate": 4.968884120040163e-07, "loss": 0.009926547296345234, "memory(GiB)": 22.66, "step": 26598, "token_acc": 1.0, "train_speed(iter/s)": 0.956019 }, { "epoch": 0.8640808238313354, "grad_norm": 0.3937021493911743, "learning_rate": 4.966549899153705e-07, "loss": 0.010144894942641258, "memory(GiB)": 22.66, "step": 26599, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956027 }, { "epoch": 0.8641133092940909, "grad_norm": 0.34517794847488403, "learning_rate": 4.964216198013211e-07, "loss": 0.009145265445113182, "memory(GiB)": 22.66, "step": 26600, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956034 }, { "epoch": 0.8641457947568463, "grad_norm": 0.27768972516059875, "learning_rate": 4.961883016645586e-07, "loss": 0.007014710456132889, "memory(GiB)": 22.66, "step": 26601, "token_acc": 1.0, "train_speed(iter/s)": 0.956042 }, { "epoch": 0.8641782802196017, "grad_norm": 0.26413771510124207, "learning_rate": 4.959550355077797e-07, "loss": 0.009179238229990005, "memory(GiB)": 22.66, "step": 26602, "token_acc": 1.0, "train_speed(iter/s)": 0.956049 }, { "epoch": 0.8642107656823571, "grad_norm": 0.43227794766426086, "learning_rate": 4.957218213336734e-07, "loss": 0.013198822736740112, "memory(GiB)": 22.66, "step": 26603, "token_acc": 1.0, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.8642432511451126, "grad_norm": 0.38005372881889343, "learning_rate": 4.954886591449332e-07, "loss": 0.012857792899012566, "memory(GiB)": 22.66, "step": 26604, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.8642757366078679, "grad_norm": 0.46130508184432983, "learning_rate": 4.95255548944249e-07, "loss": 0.011320846155285835, "memory(GiB)": 22.66, "step": 26605, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956071 }, { "epoch": 0.8643082220706234, "grad_norm": 0.3280937373638153, "learning_rate": 4.950224907343109e-07, "loss": 0.011462929658591747, "memory(GiB)": 22.66, "step": 26606, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956077 }, { "epoch": 0.8643407075333788, "grad_norm": 0.3569686710834503, "learning_rate": 4.947894845178097e-07, "loss": 0.011354953050613403, "memory(GiB)": 22.66, "step": 26607, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.8643731929961342, "grad_norm": 0.32232362031936646, "learning_rate": 4.945565302974354e-07, "loss": 0.013393034227192402, "memory(GiB)": 22.66, "step": 26608, "token_acc": 0.9829787234042553, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.8644056784588896, "grad_norm": 0.32860615849494934, "learning_rate": 4.943236280758745e-07, "loss": 0.01041347160935402, "memory(GiB)": 22.66, "step": 26609, "token_acc": 1.0, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.8644381639216451, "grad_norm": 0.40045756101608276, "learning_rate": 4.940907778558157e-07, "loss": 0.007386535406112671, "memory(GiB)": 22.66, "step": 26610, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.8644706493844004, "grad_norm": 0.1941477209329605, "learning_rate": 4.938579796399467e-07, "loss": 0.007939524948596954, "memory(GiB)": 22.66, "step": 26611, "token_acc": 1.0, "train_speed(iter/s)": 0.956106 }, { "epoch": 0.8645031348471559, "grad_norm": 0.3468833565711975, "learning_rate": 4.936252334309555e-07, "loss": 0.007878695614635944, "memory(GiB)": 22.66, "step": 26612, "token_acc": 1.0, "train_speed(iter/s)": 0.956111 }, { "epoch": 0.8645356203099113, "grad_norm": 0.3001452386379242, "learning_rate": 4.933925392315258e-07, "loss": 0.005855748429894447, "memory(GiB)": 22.66, "step": 26613, "token_acc": 1.0, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.8645681057726667, "grad_norm": 0.5064783692359924, "learning_rate": 4.931598970443446e-07, "loss": 0.011373667977750301, "memory(GiB)": 22.66, "step": 26614, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.8646005912354221, "grad_norm": 0.4205581843852997, "learning_rate": 4.929273068720969e-07, "loss": 0.013835523277521133, "memory(GiB)": 22.66, "step": 26615, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.8646330766981776, "grad_norm": 0.25262579321861267, "learning_rate": 4.926947687174666e-07, "loss": 0.010513815097510815, "memory(GiB)": 22.66, "step": 26616, "token_acc": 0.996, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.8646655621609329, "grad_norm": 0.37770530581474304, "learning_rate": 4.924622825831393e-07, "loss": 0.01066850870847702, "memory(GiB)": 22.66, "step": 26617, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956136 }, { "epoch": 0.8646980476236884, "grad_norm": 0.3320395052433014, "learning_rate": 4.922298484717952e-07, "loss": 0.017288342118263245, "memory(GiB)": 22.66, "step": 26618, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95614 }, { "epoch": 0.8647305330864438, "grad_norm": 0.36768051981925964, "learning_rate": 4.919974663861194e-07, "loss": 0.00933990627527237, "memory(GiB)": 22.66, "step": 26619, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.956145 }, { "epoch": 0.8647630185491992, "grad_norm": 0.313838928937912, "learning_rate": 4.917651363287923e-07, "loss": 0.009746886789798737, "memory(GiB)": 22.66, "step": 26620, "token_acc": 1.0, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.8647955040119546, "grad_norm": 0.33980512619018555, "learning_rate": 4.91532858302497e-07, "loss": 0.015956435352563858, "memory(GiB)": 22.66, "step": 26621, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.8648279894747101, "grad_norm": 0.3579462170600891, "learning_rate": 4.913006323099123e-07, "loss": 0.007524712942540646, "memory(GiB)": 22.66, "step": 26622, "token_acc": 1.0, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.8648604749374654, "grad_norm": 0.5614064931869507, "learning_rate": 4.910684583537206e-07, "loss": 0.019601067528128624, "memory(GiB)": 22.66, "step": 26623, "token_acc": 1.0, "train_speed(iter/s)": 0.956164 }, { "epoch": 0.8648929604002209, "grad_norm": 0.40855658054351807, "learning_rate": 4.908363364365981e-07, "loss": 0.019549395889043808, "memory(GiB)": 22.66, "step": 26624, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.8649254458629764, "grad_norm": 0.3318856358528137, "learning_rate": 4.906042665612282e-07, "loss": 0.010397925972938538, "memory(GiB)": 22.66, "step": 26625, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.8649579313257317, "grad_norm": 0.2679998576641083, "learning_rate": 4.903722487302859e-07, "loss": 0.008840219117701054, "memory(GiB)": 22.66, "step": 26626, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.8649904167884872, "grad_norm": 0.30527210235595703, "learning_rate": 4.901402829464508e-07, "loss": 0.011142278090119362, "memory(GiB)": 22.66, "step": 26627, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.8650229022512426, "grad_norm": 0.5303703546524048, "learning_rate": 4.899083692123991e-07, "loss": 0.019769512116909027, "memory(GiB)": 22.66, "step": 26628, "token_acc": 1.0, "train_speed(iter/s)": 0.956189 }, { "epoch": 0.865055387713998, "grad_norm": 0.6256718039512634, "learning_rate": 4.896765075308063e-07, "loss": 0.01566971465945244, "memory(GiB)": 22.66, "step": 26629, "token_acc": 0.9826388888888888, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.8650878731767534, "grad_norm": 0.30371958017349243, "learning_rate": 4.894446979043515e-07, "loss": 0.009218527004122734, "memory(GiB)": 22.66, "step": 26630, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.9562 }, { "epoch": 0.8651203586395089, "grad_norm": 0.3515520989894867, "learning_rate": 4.892129403357082e-07, "loss": 0.009649140760302544, "memory(GiB)": 22.66, "step": 26631, "token_acc": 1.0, "train_speed(iter/s)": 0.956206 }, { "epoch": 0.8651528441022642, "grad_norm": 0.4899493157863617, "learning_rate": 4.889812348275519e-07, "loss": 0.010780712589621544, "memory(GiB)": 22.66, "step": 26632, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.8651853295650197, "grad_norm": 0.3940543532371521, "learning_rate": 4.887495813825555e-07, "loss": 0.009879613295197487, "memory(GiB)": 22.66, "step": 26633, "token_acc": 1.0, "train_speed(iter/s)": 0.956221 }, { "epoch": 0.8652178150277751, "grad_norm": 0.46157100796699524, "learning_rate": 4.885179800033935e-07, "loss": 0.015187431126832962, "memory(GiB)": 22.66, "step": 26634, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956229 }, { "epoch": 0.8652503004905305, "grad_norm": 0.2677735686302185, "learning_rate": 4.882864306927388e-07, "loss": 0.010627645999193192, "memory(GiB)": 22.66, "step": 26635, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.8652827859532859, "grad_norm": 0.28562653064727783, "learning_rate": 4.880549334532642e-07, "loss": 0.009984293952584267, "memory(GiB)": 22.66, "step": 26636, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956243 }, { "epoch": 0.8653152714160414, "grad_norm": 0.3298993408679962, "learning_rate": 4.878234882876404e-07, "loss": 0.010980427265167236, "memory(GiB)": 22.66, "step": 26637, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95625 }, { "epoch": 0.8653477568787967, "grad_norm": 0.25041666626930237, "learning_rate": 4.875920951985397e-07, "loss": 0.007040905766189098, "memory(GiB)": 22.66, "step": 26638, "token_acc": 1.0, "train_speed(iter/s)": 0.956258 }, { "epoch": 0.8653802423415522, "grad_norm": 0.36696726083755493, "learning_rate": 4.873607541886316e-07, "loss": 0.009561131708323956, "memory(GiB)": 22.66, "step": 26639, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956266 }, { "epoch": 0.8654127278043076, "grad_norm": 0.3531002104282379, "learning_rate": 4.871294652605879e-07, "loss": 0.011175988242030144, "memory(GiB)": 22.66, "step": 26640, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.865445213267063, "grad_norm": 0.29277119040489197, "learning_rate": 4.868982284170759e-07, "loss": 0.007428412325680256, "memory(GiB)": 22.66, "step": 26641, "token_acc": 1.0, "train_speed(iter/s)": 0.95628 }, { "epoch": 0.8654776987298184, "grad_norm": 0.18228530883789062, "learning_rate": 4.866670436607651e-07, "loss": 0.007688236888498068, "memory(GiB)": 22.66, "step": 26642, "token_acc": 1.0, "train_speed(iter/s)": 0.956288 }, { "epoch": 0.8655101841925739, "grad_norm": 0.39012643694877625, "learning_rate": 4.86435910994324e-07, "loss": 0.015080248937010765, "memory(GiB)": 22.66, "step": 26643, "token_acc": 1.0, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.8655426696553292, "grad_norm": 0.4742890000343323, "learning_rate": 4.862048304204198e-07, "loss": 0.0162583589553833, "memory(GiB)": 22.66, "step": 26644, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.8655751551180847, "grad_norm": 0.21377286314964294, "learning_rate": 4.85973801941721e-07, "loss": 0.005678130313754082, "memory(GiB)": 22.66, "step": 26645, "token_acc": 1.0, "train_speed(iter/s)": 0.956311 }, { "epoch": 0.86560764058084, "grad_norm": 0.35854265093803406, "learning_rate": 4.857428255608914e-07, "loss": 0.01417899876832962, "memory(GiB)": 22.66, "step": 26646, "token_acc": 1.0, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.8656401260435955, "grad_norm": 0.367022305727005, "learning_rate": 4.855119012805981e-07, "loss": 0.010978790000081062, "memory(GiB)": 22.66, "step": 26647, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956327 }, { "epoch": 0.8656726115063509, "grad_norm": 0.43616387248039246, "learning_rate": 4.852810291035065e-07, "loss": 0.008349860087037086, "memory(GiB)": 22.66, "step": 26648, "token_acc": 1.0, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.8657050969691064, "grad_norm": 0.3050037622451782, "learning_rate": 4.850502090322817e-07, "loss": 0.010547295212745667, "memory(GiB)": 22.66, "step": 26649, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.8657375824318617, "grad_norm": 0.3993988335132599, "learning_rate": 4.848194410695861e-07, "loss": 0.013627810403704643, "memory(GiB)": 22.66, "step": 26650, "token_acc": 1.0, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.8657700678946172, "grad_norm": 0.2930421531200409, "learning_rate": 4.845887252180842e-07, "loss": 0.010298597626388073, "memory(GiB)": 22.66, "step": 26651, "token_acc": 1.0, "train_speed(iter/s)": 0.956357 }, { "epoch": 0.8658025533573726, "grad_norm": 0.3358723521232605, "learning_rate": 4.843580614804372e-07, "loss": 0.014556271955370903, "memory(GiB)": 22.66, "step": 26652, "token_acc": 1.0, "train_speed(iter/s)": 0.956365 }, { "epoch": 0.865835038820128, "grad_norm": 0.4118451774120331, "learning_rate": 4.841274498593102e-07, "loss": 0.010812717489898205, "memory(GiB)": 22.66, "step": 26653, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956372 }, { "epoch": 0.8658675242828834, "grad_norm": 0.2940244972705841, "learning_rate": 4.838968903573615e-07, "loss": 0.009925447404384613, "memory(GiB)": 22.66, "step": 26654, "token_acc": 1.0, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.8659000097456389, "grad_norm": 0.3374879062175751, "learning_rate": 4.836663829772553e-07, "loss": 0.014829008840024471, "memory(GiB)": 22.66, "step": 26655, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956386 }, { "epoch": 0.8659324952083942, "grad_norm": 0.21669960021972656, "learning_rate": 4.834359277216489e-07, "loss": 0.009169417433440685, "memory(GiB)": 22.66, "step": 26656, "token_acc": 1.0, "train_speed(iter/s)": 0.956393 }, { "epoch": 0.8659649806711497, "grad_norm": 0.31351369619369507, "learning_rate": 4.832055245932038e-07, "loss": 0.009709118865430355, "memory(GiB)": 22.66, "step": 26657, "token_acc": 1.0, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.865997466133905, "grad_norm": 0.4133012890815735, "learning_rate": 4.829751735945787e-07, "loss": 0.012707548215985298, "memory(GiB)": 22.66, "step": 26658, "token_acc": 1.0, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.8660299515966605, "grad_norm": 0.44237831234931946, "learning_rate": 4.827448747284319e-07, "loss": 0.011816765181720257, "memory(GiB)": 22.66, "step": 26659, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.8660624370594159, "grad_norm": 0.33811521530151367, "learning_rate": 4.82514627997423e-07, "loss": 0.011463734321296215, "memory(GiB)": 22.66, "step": 26660, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.8660949225221714, "grad_norm": 0.25237640738487244, "learning_rate": 4.822844334042065e-07, "loss": 0.010634202510118484, "memory(GiB)": 22.66, "step": 26661, "token_acc": 1.0, "train_speed(iter/s)": 0.956432 }, { "epoch": 0.8661274079849267, "grad_norm": 0.311161607503891, "learning_rate": 4.820542909514414e-07, "loss": 0.007770064286887646, "memory(GiB)": 22.66, "step": 26662, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.8661598934476822, "grad_norm": 0.5269158482551575, "learning_rate": 4.818242006417828e-07, "loss": 0.010687184520065784, "memory(GiB)": 22.66, "step": 26663, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956447 }, { "epoch": 0.8661923789104375, "grad_norm": 0.37375760078430176, "learning_rate": 4.81594162477888e-07, "loss": 0.013538310304284096, "memory(GiB)": 22.66, "step": 26664, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.956455 }, { "epoch": 0.866224864373193, "grad_norm": 0.4351590871810913, "learning_rate": 4.813641764624089e-07, "loss": 0.01428433507680893, "memory(GiB)": 22.66, "step": 26665, "token_acc": 1.0, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.8662573498359484, "grad_norm": 0.4131452143192291, "learning_rate": 4.81134242598002e-07, "loss": 0.01258903183043003, "memory(GiB)": 22.66, "step": 26666, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.95647 }, { "epoch": 0.8662898352987038, "grad_norm": 0.25204795598983765, "learning_rate": 4.809043608873204e-07, "loss": 0.006558568682521582, "memory(GiB)": 22.66, "step": 26667, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.8663223207614592, "grad_norm": 0.4044869840145111, "learning_rate": 4.806745313330186e-07, "loss": 0.014637092128396034, "memory(GiB)": 22.66, "step": 26668, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956483 }, { "epoch": 0.8663548062242147, "grad_norm": 0.25490856170654297, "learning_rate": 4.804447539377466e-07, "loss": 0.009786315262317657, "memory(GiB)": 22.66, "step": 26669, "token_acc": 1.0, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.86638729168697, "grad_norm": 0.294744074344635, "learning_rate": 4.802150287041574e-07, "loss": 0.010189969092607498, "memory(GiB)": 22.66, "step": 26670, "token_acc": 1.0, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.8664197771497255, "grad_norm": 0.40990930795669556, "learning_rate": 4.799853556349033e-07, "loss": 0.012440569698810577, "memory(GiB)": 22.66, "step": 26671, "token_acc": 1.0, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.8664522626124809, "grad_norm": 0.33902111649513245, "learning_rate": 4.797557347326348e-07, "loss": 0.007671254687011242, "memory(GiB)": 22.66, "step": 26672, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.8664847480752363, "grad_norm": 0.44390547275543213, "learning_rate": 4.795261660000006e-07, "loss": 0.014365388080477715, "memory(GiB)": 22.66, "step": 26673, "token_acc": 1.0, "train_speed(iter/s)": 0.956511 }, { "epoch": 0.8665172335379917, "grad_norm": 0.3122430145740509, "learning_rate": 4.792966494396511e-07, "loss": 0.008820140734314919, "memory(GiB)": 22.66, "step": 26674, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956516 }, { "epoch": 0.8665497190007472, "grad_norm": 0.27639704942703247, "learning_rate": 4.790671850542356e-07, "loss": 0.007653489243239164, "memory(GiB)": 22.66, "step": 26675, "token_acc": 0.995, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.8665822044635025, "grad_norm": 0.16268984973430634, "learning_rate": 4.788377728464022e-07, "loss": 0.006212097592651844, "memory(GiB)": 22.66, "step": 26676, "token_acc": 1.0, "train_speed(iter/s)": 0.956526 }, { "epoch": 0.866614689926258, "grad_norm": 0.35016119480133057, "learning_rate": 4.786084128187995e-07, "loss": 0.012236570939421654, "memory(GiB)": 22.66, "step": 26677, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956531 }, { "epoch": 0.8666471753890134, "grad_norm": 0.3191995620727539, "learning_rate": 4.783791049740722e-07, "loss": 0.013543860986828804, "memory(GiB)": 22.66, "step": 26678, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.8666796608517688, "grad_norm": 0.3123575747013092, "learning_rate": 4.781498493148695e-07, "loss": 0.006746739149093628, "memory(GiB)": 22.66, "step": 26679, "token_acc": 0.9965397923875432, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.8667121463145242, "grad_norm": 0.3906831443309784, "learning_rate": 4.779206458438346e-07, "loss": 0.015248917043209076, "memory(GiB)": 22.66, "step": 26680, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.8667446317772797, "grad_norm": 0.42797568440437317, "learning_rate": 4.776914945636158e-07, "loss": 0.010260231792926788, "memory(GiB)": 22.66, "step": 26681, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.866777117240035, "grad_norm": 0.3193170130252838, "learning_rate": 4.774623954768559e-07, "loss": 0.010525461286306381, "memory(GiB)": 22.66, "step": 26682, "token_acc": 1.0, "train_speed(iter/s)": 0.956556 }, { "epoch": 0.8668096027027905, "grad_norm": 0.5244050025939941, "learning_rate": 4.772333485861996e-07, "loss": 0.016394317150115967, "memory(GiB)": 22.66, "step": 26683, "token_acc": 0.9800796812749004, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.8668420881655459, "grad_norm": 0.705396831035614, "learning_rate": 4.770043538942898e-07, "loss": 0.015839476138353348, "memory(GiB)": 22.66, "step": 26684, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956566 }, { "epoch": 0.8668745736283013, "grad_norm": 0.4169043302536011, "learning_rate": 4.767754114037698e-07, "loss": 0.014528900384902954, "memory(GiB)": 22.66, "step": 26685, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.956571 }, { "epoch": 0.8669070590910567, "grad_norm": 0.35715213418006897, "learning_rate": 4.765465211172821e-07, "loss": 0.010358576662838459, "memory(GiB)": 22.66, "step": 26686, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956576 }, { "epoch": 0.8669395445538122, "grad_norm": 0.3940920829772949, "learning_rate": 4.7631768303746885e-07, "loss": 0.011898682452738285, "memory(GiB)": 22.66, "step": 26687, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.8669720300165676, "grad_norm": 0.3759324550628662, "learning_rate": 4.760888971669697e-07, "loss": 0.009396960958838463, "memory(GiB)": 22.66, "step": 26688, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.956587 }, { "epoch": 0.867004515479323, "grad_norm": 0.5123164057731628, "learning_rate": 4.758601635084259e-07, "loss": 0.01612139865756035, "memory(GiB)": 22.66, "step": 26689, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956592 }, { "epoch": 0.8670370009420785, "grad_norm": 0.3557659089565277, "learning_rate": 4.7563148206447753e-07, "loss": 0.015077048912644386, "memory(GiB)": 22.66, "step": 26690, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.8670694864048338, "grad_norm": 0.3615093231201172, "learning_rate": 4.754028528377641e-07, "loss": 0.010888976976275444, "memory(GiB)": 22.66, "step": 26691, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.8671019718675893, "grad_norm": 0.4081140160560608, "learning_rate": 4.751742758309247e-07, "loss": 0.015297001227736473, "memory(GiB)": 22.66, "step": 26692, "token_acc": 1.0, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.8671344573303447, "grad_norm": 0.4099388122558594, "learning_rate": 4.7494575104659543e-07, "loss": 0.012281965464353561, "memory(GiB)": 22.66, "step": 26693, "token_acc": 1.0, "train_speed(iter/s)": 0.956615 }, { "epoch": 0.8671669427931001, "grad_norm": 0.4028679430484772, "learning_rate": 4.7471727848741545e-07, "loss": 0.014134230092167854, "memory(GiB)": 22.66, "step": 26694, "token_acc": 1.0, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.8671994282558555, "grad_norm": 0.21614807844161987, "learning_rate": 4.7448885815602087e-07, "loss": 0.007039161399006844, "memory(GiB)": 22.66, "step": 26695, "token_acc": 1.0, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.867231913718611, "grad_norm": 0.3464193046092987, "learning_rate": 4.742604900550496e-07, "loss": 0.015185303054749966, "memory(GiB)": 22.66, "step": 26696, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.8672643991813663, "grad_norm": 0.26425859332084656, "learning_rate": 4.7403217418713453e-07, "loss": 0.008686104789376259, "memory(GiB)": 22.66, "step": 26697, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.8672968846441218, "grad_norm": 0.24577832221984863, "learning_rate": 4.7380391055491306e-07, "loss": 0.011831961572170258, "memory(GiB)": 22.66, "step": 26698, "token_acc": 1.0, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.8673293701068772, "grad_norm": 0.292420357465744, "learning_rate": 4.73575699161018e-07, "loss": 0.010673828423023224, "memory(GiB)": 22.66, "step": 26699, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956661 }, { "epoch": 0.8673618555696326, "grad_norm": 0.3867889642715454, "learning_rate": 4.733475400080856e-07, "loss": 0.01648028939962387, "memory(GiB)": 22.66, "step": 26700, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.867394341032388, "grad_norm": 0.4716179370880127, "learning_rate": 4.73119433098746e-07, "loss": 0.01646464504301548, "memory(GiB)": 22.66, "step": 26701, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956675 }, { "epoch": 0.8674268264951435, "grad_norm": 0.3215339183807373, "learning_rate": 4.7289137843563483e-07, "loss": 0.009948089718818665, "memory(GiB)": 22.66, "step": 26702, "token_acc": 1.0, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.8674593119578988, "grad_norm": 0.31086021661758423, "learning_rate": 4.7266337602138056e-07, "loss": 0.011649437248706818, "memory(GiB)": 22.66, "step": 26703, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.8674917974206543, "grad_norm": 0.4982556998729706, "learning_rate": 4.724354258586189e-07, "loss": 0.0179927758872509, "memory(GiB)": 22.66, "step": 26704, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.8675242828834097, "grad_norm": 0.3634141981601715, "learning_rate": 4.722075279499777e-07, "loss": 0.010330483317375183, "memory(GiB)": 22.66, "step": 26705, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.8675567683461651, "grad_norm": 0.45898154377937317, "learning_rate": 4.7197968229808766e-07, "loss": 0.012558065354824066, "memory(GiB)": 22.66, "step": 26706, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95669 }, { "epoch": 0.8675892538089205, "grad_norm": 0.3119452893733978, "learning_rate": 4.7175188890558e-07, "loss": 0.010876988992094994, "memory(GiB)": 22.66, "step": 26707, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956697 }, { "epoch": 0.867621739271676, "grad_norm": 0.34720978140830994, "learning_rate": 4.715241477750809e-07, "loss": 0.011654568836092949, "memory(GiB)": 22.66, "step": 26708, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956704 }, { "epoch": 0.8676542247344313, "grad_norm": 0.3146180808544159, "learning_rate": 4.712964589092228e-07, "loss": 0.014108618721365929, "memory(GiB)": 22.66, "step": 26709, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956712 }, { "epoch": 0.8676867101971868, "grad_norm": 0.31181132793426514, "learning_rate": 4.7106882231062965e-07, "loss": 0.009132424369454384, "memory(GiB)": 22.66, "step": 26710, "token_acc": 1.0, "train_speed(iter/s)": 0.956719 }, { "epoch": 0.8677191956599422, "grad_norm": 0.6625661849975586, "learning_rate": 4.708412379819316e-07, "loss": 0.01363963633775711, "memory(GiB)": 22.66, "step": 26711, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.956727 }, { "epoch": 0.8677516811226976, "grad_norm": 0.3096235990524292, "learning_rate": 4.7061370592575264e-07, "loss": 0.011205136775970459, "memory(GiB)": 22.66, "step": 26712, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956734 }, { "epoch": 0.867784166585453, "grad_norm": 0.3324646055698395, "learning_rate": 4.7038622614472065e-07, "loss": 0.007329922169446945, "memory(GiB)": 22.66, "step": 26713, "token_acc": 1.0, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.8678166520482085, "grad_norm": 0.43954113125801086, "learning_rate": 4.7015879864146017e-07, "loss": 0.014258724637329578, "memory(GiB)": 22.66, "step": 26714, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.8678491375109638, "grad_norm": 0.16927647590637207, "learning_rate": 4.69931423418597e-07, "loss": 0.008179029449820518, "memory(GiB)": 22.66, "step": 26715, "token_acc": 1.0, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.8678816229737193, "grad_norm": 0.32759374380111694, "learning_rate": 4.697041004787545e-07, "loss": 0.008964127860963345, "memory(GiB)": 22.66, "step": 26716, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.8679141084364747, "grad_norm": 0.3571491837501526, "learning_rate": 4.694768298245556e-07, "loss": 0.010524865239858627, "memory(GiB)": 22.66, "step": 26717, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.8679465938992301, "grad_norm": 0.7701017260551453, "learning_rate": 4.692496114586248e-07, "loss": 0.018535085022449493, "memory(GiB)": 22.66, "step": 26718, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.8679790793619855, "grad_norm": 0.3793312609195709, "learning_rate": 4.690224453835851e-07, "loss": 0.013529134914278984, "memory(GiB)": 22.66, "step": 26719, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.868011564824741, "grad_norm": 0.3939932882785797, "learning_rate": 4.687953316020555e-07, "loss": 0.013357691466808319, "memory(GiB)": 22.66, "step": 26720, "token_acc": 1.0, "train_speed(iter/s)": 0.956795 }, { "epoch": 0.8680440502874963, "grad_norm": 0.334827721118927, "learning_rate": 4.685682701166594e-07, "loss": 0.01159297488629818, "memory(GiB)": 22.66, "step": 26721, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.8680765357502518, "grad_norm": 0.4416811466217041, "learning_rate": 4.6834126093001644e-07, "loss": 0.016209589317440987, "memory(GiB)": 22.66, "step": 26722, "token_acc": 1.0, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.8681090212130071, "grad_norm": 0.29702383279800415, "learning_rate": 4.6811430404474666e-07, "loss": 0.007856025360524654, "memory(GiB)": 22.66, "step": 26723, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956818 }, { "epoch": 0.8681415066757626, "grad_norm": 0.3638684153556824, "learning_rate": 4.6788739946347084e-07, "loss": 0.01072643045336008, "memory(GiB)": 22.66, "step": 26724, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.868173992138518, "grad_norm": 0.3797166645526886, "learning_rate": 4.6766054718880626e-07, "loss": 0.013746997341513634, "memory(GiB)": 22.66, "step": 26725, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956833 }, { "epoch": 0.8682064776012735, "grad_norm": 0.4735122621059418, "learning_rate": 4.674337472233709e-07, "loss": 0.017051106318831444, "memory(GiB)": 22.66, "step": 26726, "token_acc": 1.0, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.8682389630640288, "grad_norm": 0.38491037487983704, "learning_rate": 4.672069995697831e-07, "loss": 0.017018163576722145, "memory(GiB)": 22.66, "step": 26727, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.956848 }, { "epoch": 0.8682714485267843, "grad_norm": 0.38350871205329895, "learning_rate": 4.669803042306609e-07, "loss": 0.012981937266886234, "memory(GiB)": 22.66, "step": 26728, "token_acc": 0.984375, "train_speed(iter/s)": 0.956855 }, { "epoch": 0.8683039339895396, "grad_norm": 0.4506354331970215, "learning_rate": 4.667536612086182e-07, "loss": 0.016453687101602554, "memory(GiB)": 22.66, "step": 26729, "token_acc": 1.0, "train_speed(iter/s)": 0.956861 }, { "epoch": 0.8683364194522951, "grad_norm": 0.3246235251426697, "learning_rate": 4.66527070506273e-07, "loss": 0.013460133224725723, "memory(GiB)": 22.66, "step": 26730, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956867 }, { "epoch": 0.8683689049150505, "grad_norm": 0.2350921779870987, "learning_rate": 4.663005321262376e-07, "loss": 0.006361070089042187, "memory(GiB)": 22.66, "step": 26731, "token_acc": 1.0, "train_speed(iter/s)": 0.956872 }, { "epoch": 0.868401390377806, "grad_norm": 0.3475753366947174, "learning_rate": 4.660740460711299e-07, "loss": 0.015396376140415668, "memory(GiB)": 22.66, "step": 26732, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956878 }, { "epoch": 0.8684338758405613, "grad_norm": 0.3738759458065033, "learning_rate": 4.658476123435618e-07, "loss": 0.014066338539123535, "memory(GiB)": 22.66, "step": 26733, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956883 }, { "epoch": 0.8684663613033168, "grad_norm": 0.3171410858631134, "learning_rate": 4.6562123094614774e-07, "loss": 0.013360748067498207, "memory(GiB)": 22.66, "step": 26734, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.8684988467660721, "grad_norm": 0.4782167673110962, "learning_rate": 4.653949018814996e-07, "loss": 0.010267348960042, "memory(GiB)": 22.66, "step": 26735, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.8685313322288276, "grad_norm": 0.39788806438446045, "learning_rate": 4.651686251522286e-07, "loss": 0.005974072962999344, "memory(GiB)": 22.66, "step": 26736, "token_acc": 1.0, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.868563817691583, "grad_norm": 0.3378671705722809, "learning_rate": 4.649424007609493e-07, "loss": 0.009511658921837807, "memory(GiB)": 22.66, "step": 26737, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.8685963031543384, "grad_norm": 0.42094162106513977, "learning_rate": 4.6471622871027024e-07, "loss": 0.01310367789119482, "memory(GiB)": 22.66, "step": 26738, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.8686287886170938, "grad_norm": 0.30689433217048645, "learning_rate": 4.6449010900280254e-07, "loss": 0.009452476166188717, "memory(GiB)": 22.66, "step": 26739, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.8686612740798493, "grad_norm": 0.447992205619812, "learning_rate": 4.6426404164115534e-07, "loss": 0.014841174706816673, "memory(GiB)": 22.66, "step": 26740, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956918 }, { "epoch": 0.8686937595426046, "grad_norm": 0.355744868516922, "learning_rate": 4.640380266279382e-07, "loss": 0.010192769579589367, "memory(GiB)": 22.66, "step": 26741, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.8687262450053601, "grad_norm": 0.1488494724035263, "learning_rate": 4.6381206396575953e-07, "loss": 0.00546178687363863, "memory(GiB)": 22.66, "step": 26742, "token_acc": 1.0, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.8687587304681155, "grad_norm": 0.6396359801292419, "learning_rate": 4.635861536572284e-07, "loss": 0.0156235471367836, "memory(GiB)": 22.66, "step": 26743, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.8687912159308709, "grad_norm": 0.3658944070339203, "learning_rate": 4.633602957049493e-07, "loss": 0.007109959609806538, "memory(GiB)": 22.66, "step": 26744, "token_acc": 1.0, "train_speed(iter/s)": 0.956937 }, { "epoch": 0.8688237013936263, "grad_norm": 0.3356621265411377, "learning_rate": 4.6313449011153145e-07, "loss": 0.00887343380600214, "memory(GiB)": 22.66, "step": 26745, "token_acc": 1.0, "train_speed(iter/s)": 0.956942 }, { "epoch": 0.8688561868563818, "grad_norm": 0.3431251347064972, "learning_rate": 4.6290873687957985e-07, "loss": 0.017073428258299828, "memory(GiB)": 22.66, "step": 26746, "token_acc": 1.0, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.8688886723191371, "grad_norm": 0.2983524799346924, "learning_rate": 4.626830360117013e-07, "loss": 0.00924668088555336, "memory(GiB)": 22.66, "step": 26747, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956952 }, { "epoch": 0.8689211577818926, "grad_norm": 0.38463693857192993, "learning_rate": 4.6245738751049873e-07, "loss": 0.009237236343324184, "memory(GiB)": 22.66, "step": 26748, "token_acc": 0.9932203389830508, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.868953643244648, "grad_norm": 0.3671320676803589, "learning_rate": 4.622317913785773e-07, "loss": 0.014480626210570335, "memory(GiB)": 22.66, "step": 26749, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956961 }, { "epoch": 0.8689861287074034, "grad_norm": 0.33301830291748047, "learning_rate": 4.620062476185405e-07, "loss": 0.01148628257215023, "memory(GiB)": 22.66, "step": 26750, "token_acc": 1.0, "train_speed(iter/s)": 0.956966 }, { "epoch": 0.8690186141701588, "grad_norm": 0.2400088906288147, "learning_rate": 4.617807562329929e-07, "loss": 0.010241544805467129, "memory(GiB)": 22.66, "step": 26751, "token_acc": 1.0, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.8690510996329143, "grad_norm": 0.3323250412940979, "learning_rate": 4.6155531722453516e-07, "loss": 0.009984169155359268, "memory(GiB)": 22.66, "step": 26752, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.8690835850956697, "grad_norm": 0.24820993840694427, "learning_rate": 4.613299305957691e-07, "loss": 0.013270316645503044, "memory(GiB)": 22.66, "step": 26753, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.8691160705584251, "grad_norm": 0.27359500527381897, "learning_rate": 4.611045963492966e-07, "loss": 0.0158370528370142, "memory(GiB)": 22.66, "step": 26754, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.8691485560211806, "grad_norm": 0.33242976665496826, "learning_rate": 4.6087931448771885e-07, "loss": 0.013605667278170586, "memory(GiB)": 22.66, "step": 26755, "token_acc": 1.0, "train_speed(iter/s)": 0.957003 }, { "epoch": 0.8691810414839359, "grad_norm": 0.3176734447479248, "learning_rate": 4.6065408501363594e-07, "loss": 0.009587961249053478, "memory(GiB)": 22.66, "step": 26756, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957011 }, { "epoch": 0.8692135269466914, "grad_norm": 0.3685011863708496, "learning_rate": 4.604289079296459e-07, "loss": 0.012784965336322784, "memory(GiB)": 22.66, "step": 26757, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.8692460124094468, "grad_norm": 0.41481244564056396, "learning_rate": 4.6020378323834937e-07, "loss": 0.014206917025148869, "memory(GiB)": 22.66, "step": 26758, "token_acc": 0.9828326180257511, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.8692784978722022, "grad_norm": 0.2987121641635895, "learning_rate": 4.5997871094234146e-07, "loss": 0.011791625060141087, "memory(GiB)": 22.66, "step": 26759, "token_acc": 1.0, "train_speed(iter/s)": 0.957034 }, { "epoch": 0.8693109833349576, "grad_norm": 0.3571234941482544, "learning_rate": 4.597536910442241e-07, "loss": 0.008076739497482777, "memory(GiB)": 22.66, "step": 26760, "token_acc": 1.0, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.8693434687977131, "grad_norm": 0.4788154661655426, "learning_rate": 4.5952872354659117e-07, "loss": 0.011691797524690628, "memory(GiB)": 22.66, "step": 26761, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95705 }, { "epoch": 0.8693759542604684, "grad_norm": 0.4050116240978241, "learning_rate": 4.5930380845204125e-07, "loss": 0.01408517174422741, "memory(GiB)": 22.66, "step": 26762, "token_acc": 1.0, "train_speed(iter/s)": 0.957058 }, { "epoch": 0.8694084397232239, "grad_norm": 0.18183410167694092, "learning_rate": 4.590789457631678e-07, "loss": 0.007716567255556583, "memory(GiB)": 22.66, "step": 26763, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.8694409251859793, "grad_norm": 0.4564298391342163, "learning_rate": 4.588541354825676e-07, "loss": 0.01598639227449894, "memory(GiB)": 22.66, "step": 26764, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957072 }, { "epoch": 0.8694734106487347, "grad_norm": 0.36918115615844727, "learning_rate": 4.586293776128348e-07, "loss": 0.013122771866619587, "memory(GiB)": 22.66, "step": 26765, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.8695058961114901, "grad_norm": 0.24330198764801025, "learning_rate": 4.584046721565644e-07, "loss": 0.008194072172045708, "memory(GiB)": 22.66, "step": 26766, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.8695383815742456, "grad_norm": 0.32786741852760315, "learning_rate": 4.5818001911634824e-07, "loss": 0.010380243882536888, "memory(GiB)": 22.66, "step": 26767, "token_acc": 1.0, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.8695708670370009, "grad_norm": 0.27466362714767456, "learning_rate": 4.579554184947793e-07, "loss": 0.012760475277900696, "memory(GiB)": 22.66, "step": 26768, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.8696033524997564, "grad_norm": 0.35977083444595337, "learning_rate": 4.577308702944505e-07, "loss": 0.010197179391980171, "memory(GiB)": 22.66, "step": 26769, "token_acc": 1.0, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.8696358379625118, "grad_norm": 0.39239388704299927, "learning_rate": 4.5750637451795366e-07, "loss": 0.01203068345785141, "memory(GiB)": 22.66, "step": 26770, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.8696683234252672, "grad_norm": 0.5263389348983765, "learning_rate": 4.5728193116788e-07, "loss": 0.01704639196395874, "memory(GiB)": 22.66, "step": 26771, "token_acc": 1.0, "train_speed(iter/s)": 0.957125 }, { "epoch": 0.8697008088880226, "grad_norm": 0.28826120495796204, "learning_rate": 4.5705754024681806e-07, "loss": 0.007868615910410881, "memory(GiB)": 22.66, "step": 26772, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.8697332943507781, "grad_norm": 0.24510982632637024, "learning_rate": 4.568332017573585e-07, "loss": 0.006632080301642418, "memory(GiB)": 22.66, "step": 26773, "token_acc": 1.0, "train_speed(iter/s)": 0.957141 }, { "epoch": 0.8697657798135334, "grad_norm": 0.36948156356811523, "learning_rate": 4.566089157020914e-07, "loss": 0.014012887142598629, "memory(GiB)": 22.66, "step": 26774, "token_acc": 1.0, "train_speed(iter/s)": 0.957148 }, { "epoch": 0.8697982652762889, "grad_norm": 0.22541530430316925, "learning_rate": 4.5638468208360486e-07, "loss": 0.010541620664298534, "memory(GiB)": 22.66, "step": 26775, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.8698307507390443, "grad_norm": 0.29813897609710693, "learning_rate": 4.561605009044862e-07, "loss": 0.011752829886972904, "memory(GiB)": 22.66, "step": 26776, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957164 }, { "epoch": 0.8698632362017997, "grad_norm": 0.3811857998371124, "learning_rate": 4.559363721673232e-07, "loss": 0.011892421171069145, "memory(GiB)": 22.66, "step": 26777, "token_acc": 1.0, "train_speed(iter/s)": 0.957171 }, { "epoch": 0.8698957216645551, "grad_norm": 1.4773832559585571, "learning_rate": 4.5571229587470235e-07, "loss": 0.00925843883305788, "memory(GiB)": 22.66, "step": 26778, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.8699282071273106, "grad_norm": 0.45386558771133423, "learning_rate": 4.554882720292114e-07, "loss": 0.019050704315304756, "memory(GiB)": 22.66, "step": 26779, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957186 }, { "epoch": 0.8699606925900659, "grad_norm": 0.2391989678144455, "learning_rate": 4.5526430063343283e-07, "loss": 0.00951368361711502, "memory(GiB)": 22.66, "step": 26780, "token_acc": 1.0, "train_speed(iter/s)": 0.957193 }, { "epoch": 0.8699931780528214, "grad_norm": 0.2522490918636322, "learning_rate": 4.550403816899551e-07, "loss": 0.005010129883885384, "memory(GiB)": 22.66, "step": 26781, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.8700256635155768, "grad_norm": 0.20877684652805328, "learning_rate": 4.5481651520135837e-07, "loss": 0.006405252497643232, "memory(GiB)": 22.66, "step": 26782, "token_acc": 1.0, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.8700581489783322, "grad_norm": 0.2737811505794525, "learning_rate": 4.545927011702295e-07, "loss": 0.010042328387498856, "memory(GiB)": 22.66, "step": 26783, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957216 }, { "epoch": 0.8700906344410876, "grad_norm": 0.3290044367313385, "learning_rate": 4.5436893959915185e-07, "loss": 0.009146827273070812, "memory(GiB)": 22.66, "step": 26784, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.8701231199038431, "grad_norm": 0.322838693857193, "learning_rate": 4.541452304907057e-07, "loss": 0.009912352077662945, "memory(GiB)": 22.66, "step": 26785, "token_acc": 1.0, "train_speed(iter/s)": 0.957232 }, { "epoch": 0.8701556053665984, "grad_norm": 0.2831164002418518, "learning_rate": 4.539215738474756e-07, "loss": 0.008334174752235413, "memory(GiB)": 22.66, "step": 26786, "token_acc": 0.9927272727272727, "train_speed(iter/s)": 0.957239 }, { "epoch": 0.8701880908293539, "grad_norm": 0.4133111536502838, "learning_rate": 4.5369796967203894e-07, "loss": 0.014503196813166142, "memory(GiB)": 22.66, "step": 26787, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957247 }, { "epoch": 0.8702205762921092, "grad_norm": 0.4508664608001709, "learning_rate": 4.534744179669809e-07, "loss": 0.011546982452273369, "memory(GiB)": 22.66, "step": 26788, "token_acc": 1.0, "train_speed(iter/s)": 0.957254 }, { "epoch": 0.8702530617548647, "grad_norm": 0.4150179922580719, "learning_rate": 4.532509187348788e-07, "loss": 0.016168616712093353, "memory(GiB)": 22.66, "step": 26789, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957262 }, { "epoch": 0.8702855472176201, "grad_norm": 0.4113066494464874, "learning_rate": 4.5302747197831343e-07, "loss": 0.012198102660477161, "memory(GiB)": 22.66, "step": 26790, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.8703180326803756, "grad_norm": 0.2284127026796341, "learning_rate": 4.5280407769986213e-07, "loss": 0.009693183936178684, "memory(GiB)": 22.66, "step": 26791, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.8703505181431309, "grad_norm": 0.35530972480773926, "learning_rate": 4.525807359021039e-07, "loss": 0.016803426668047905, "memory(GiB)": 22.66, "step": 26792, "token_acc": 0.985, "train_speed(iter/s)": 0.957282 }, { "epoch": 0.8703830036058864, "grad_norm": 0.4943418502807617, "learning_rate": 4.5235744658761683e-07, "loss": 0.01506028976291418, "memory(GiB)": 22.66, "step": 26793, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.8704154890686417, "grad_norm": 0.46608543395996094, "learning_rate": 4.521342097589787e-07, "loss": 0.016173232346773148, "memory(GiB)": 22.66, "step": 26794, "token_acc": 0.9878542510121457, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.8704479745313972, "grad_norm": 0.5626645684242249, "learning_rate": 4.5191102541876365e-07, "loss": 0.015221117064356804, "memory(GiB)": 22.66, "step": 26795, "token_acc": 1.0, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.8704804599941526, "grad_norm": 0.40973758697509766, "learning_rate": 4.516878935695496e-07, "loss": 0.011015471071004868, "memory(GiB)": 22.66, "step": 26796, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.870512945456908, "grad_norm": 1.1683156490325928, "learning_rate": 4.5146481421391053e-07, "loss": 0.01999080739915371, "memory(GiB)": 22.66, "step": 26797, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.8705454309196634, "grad_norm": 0.19319143891334534, "learning_rate": 4.512417873544217e-07, "loss": 0.008096723817288876, "memory(GiB)": 22.66, "step": 26798, "token_acc": 1.0, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.8705779163824189, "grad_norm": 0.36128637194633484, "learning_rate": 4.510188129936577e-07, "loss": 0.007918260060250759, "memory(GiB)": 22.66, "step": 26799, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.957317 }, { "epoch": 0.8706104018451742, "grad_norm": 0.41349032521247864, "learning_rate": 4.507958911341909e-07, "loss": 0.011573495343327522, "memory(GiB)": 22.66, "step": 26800, "token_acc": 1.0, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.8706428873079297, "grad_norm": 0.34461960196495056, "learning_rate": 4.505730217785942e-07, "loss": 0.012816774658858776, "memory(GiB)": 22.66, "step": 26801, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.8706753727706851, "grad_norm": 0.47003915905952454, "learning_rate": 4.5035020492944005e-07, "loss": 0.017831772565841675, "memory(GiB)": 22.66, "step": 26802, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.8707078582334405, "grad_norm": 0.290926069021225, "learning_rate": 4.501274405893008e-07, "loss": 0.007405319716781378, "memory(GiB)": 22.66, "step": 26803, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.957338 }, { "epoch": 0.8707403436961959, "grad_norm": 0.35348325967788696, "learning_rate": 4.499047287607461e-07, "loss": 0.013605711050331593, "memory(GiB)": 22.66, "step": 26804, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957343 }, { "epoch": 0.8707728291589514, "grad_norm": 0.401579350233078, "learning_rate": 4.4968206944634716e-07, "loss": 0.012453618459403515, "memory(GiB)": 22.66, "step": 26805, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.8708053146217067, "grad_norm": 0.3787044584751129, "learning_rate": 4.4945946264867367e-07, "loss": 0.013054415583610535, "memory(GiB)": 22.66, "step": 26806, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957354 }, { "epoch": 0.8708378000844622, "grad_norm": 0.3805210590362549, "learning_rate": 4.492369083702952e-07, "loss": 0.00875388365238905, "memory(GiB)": 22.66, "step": 26807, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.8708702855472176, "grad_norm": 0.3657359480857849, "learning_rate": 4.4901440661377915e-07, "loss": 0.01822776347398758, "memory(GiB)": 22.66, "step": 26808, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957363 }, { "epoch": 0.870902771009973, "grad_norm": 0.9418458342552185, "learning_rate": 4.487919573816951e-07, "loss": 0.010125771164894104, "memory(GiB)": 22.66, "step": 26809, "token_acc": 1.0, "train_speed(iter/s)": 0.957368 }, { "epoch": 0.8709352564727284, "grad_norm": 0.34498634934425354, "learning_rate": 4.4856956067660774e-07, "loss": 0.011520527303218842, "memory(GiB)": 22.66, "step": 26810, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.8709677419354839, "grad_norm": 0.310722291469574, "learning_rate": 4.4834721650108714e-07, "loss": 0.011910032480955124, "memory(GiB)": 22.66, "step": 26811, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.8710002273982392, "grad_norm": 0.2998232841491699, "learning_rate": 4.4812492485769745e-07, "loss": 0.009554642252624035, "memory(GiB)": 22.66, "step": 26812, "token_acc": 1.0, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.8710327128609947, "grad_norm": 0.2247343808412552, "learning_rate": 4.479026857490043e-07, "loss": 0.009718697518110275, "memory(GiB)": 22.66, "step": 26813, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.8710651983237501, "grad_norm": 0.24664318561553955, "learning_rate": 4.4768049917757406e-07, "loss": 0.00814873818308115, "memory(GiB)": 22.66, "step": 26814, "token_acc": 1.0, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.8710976837865055, "grad_norm": 0.18112987279891968, "learning_rate": 4.47458365145968e-07, "loss": 0.006354579236358404, "memory(GiB)": 22.66, "step": 26815, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957394 }, { "epoch": 0.871130169249261, "grad_norm": 0.2552841007709503, "learning_rate": 4.4723628365675397e-07, "loss": 0.006700015626847744, "memory(GiB)": 22.66, "step": 26816, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.8711626547120164, "grad_norm": 0.4829489290714264, "learning_rate": 4.470142547124917e-07, "loss": 0.01819521188735962, "memory(GiB)": 22.66, "step": 26817, "token_acc": 1.0, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.8711951401747718, "grad_norm": 0.29760369658470154, "learning_rate": 4.4679227831574635e-07, "loss": 0.01222168654203415, "memory(GiB)": 22.66, "step": 26818, "token_acc": 1.0, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.8712276256375272, "grad_norm": 0.3332577645778656, "learning_rate": 4.465703544690769e-07, "loss": 0.01142945233732462, "memory(GiB)": 22.66, "step": 26819, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.8712601111002827, "grad_norm": 0.44304656982421875, "learning_rate": 4.463484831750464e-07, "loss": 0.016816742718219757, "memory(GiB)": 22.66, "step": 26820, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.957432 }, { "epoch": 0.871292596563038, "grad_norm": 0.39744439721107483, "learning_rate": 4.4612666443621557e-07, "loss": 0.013103285804390907, "memory(GiB)": 22.66, "step": 26821, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.8713250820257935, "grad_norm": 0.23859141767024994, "learning_rate": 4.459048982551445e-07, "loss": 0.00828571617603302, "memory(GiB)": 22.66, "step": 26822, "token_acc": 0.986784140969163, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.8713575674885489, "grad_norm": 0.39746540784835815, "learning_rate": 4.4568318463439175e-07, "loss": 0.02271854318678379, "memory(GiB)": 22.66, "step": 26823, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.8713900529513043, "grad_norm": 0.24368847906589508, "learning_rate": 4.45461523576517e-07, "loss": 0.0077602374367415905, "memory(GiB)": 22.66, "step": 26824, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.8714225384140597, "grad_norm": 0.2366674393415451, "learning_rate": 4.452399150840786e-07, "loss": 0.009226656518876553, "memory(GiB)": 22.66, "step": 26825, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.8714550238768152, "grad_norm": 0.4568463861942291, "learning_rate": 4.450183591596341e-07, "loss": 0.015358179807662964, "memory(GiB)": 22.66, "step": 26826, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957479 }, { "epoch": 0.8714875093395705, "grad_norm": 0.2994610667228699, "learning_rate": 4.447968558057403e-07, "loss": 0.012178249657154083, "memory(GiB)": 22.66, "step": 26827, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.871519994802326, "grad_norm": 0.3002415597438812, "learning_rate": 4.4457540502495347e-07, "loss": 0.010458480566740036, "memory(GiB)": 22.66, "step": 26828, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.8715524802650814, "grad_norm": 0.363661527633667, "learning_rate": 4.443540068198293e-07, "loss": 0.012681983411312103, "memory(GiB)": 22.66, "step": 26829, "token_acc": 1.0, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.8715849657278368, "grad_norm": 0.28133130073547363, "learning_rate": 4.441326611929236e-07, "loss": 0.015766901895403862, "memory(GiB)": 22.66, "step": 26830, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957511 }, { "epoch": 0.8716174511905922, "grad_norm": 0.33474209904670715, "learning_rate": 4.4391136814679213e-07, "loss": 0.008324026130139828, "memory(GiB)": 22.66, "step": 26831, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.8716499366533477, "grad_norm": 0.3041680157184601, "learning_rate": 4.436901276839867e-07, "loss": 0.008008410222828388, "memory(GiB)": 22.66, "step": 26832, "token_acc": 0.9918367346938776, "train_speed(iter/s)": 0.957526 }, { "epoch": 0.871682422116103, "grad_norm": 0.6186014413833618, "learning_rate": 4.434689398070613e-07, "loss": 0.01492428407073021, "memory(GiB)": 22.66, "step": 26833, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957533 }, { "epoch": 0.8717149075788585, "grad_norm": 0.24794119596481323, "learning_rate": 4.4324780451856895e-07, "loss": 0.006589068099856377, "memory(GiB)": 22.66, "step": 26834, "token_acc": 1.0, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.8717473930416139, "grad_norm": 0.44185635447502136, "learning_rate": 4.430267218210632e-07, "loss": 0.012473625130951405, "memory(GiB)": 22.66, "step": 26835, "token_acc": 1.0, "train_speed(iter/s)": 0.95755 }, { "epoch": 0.8717798785043693, "grad_norm": 0.47674474120140076, "learning_rate": 4.4280569171709366e-07, "loss": 0.014530014246702194, "memory(GiB)": 22.66, "step": 26836, "token_acc": 1.0, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.8718123639671247, "grad_norm": 0.5113825798034668, "learning_rate": 4.425847142092127e-07, "loss": 0.014040965586900711, "memory(GiB)": 22.66, "step": 26837, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.8718448494298802, "grad_norm": 0.33780786395072937, "learning_rate": 4.423637892999683e-07, "loss": 0.009671560488641262, "memory(GiB)": 22.66, "step": 26838, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.8718773348926355, "grad_norm": 0.7208609580993652, "learning_rate": 4.421429169919134e-07, "loss": 0.016866281628608704, "memory(GiB)": 22.66, "step": 26839, "token_acc": 1.0, "train_speed(iter/s)": 0.957582 }, { "epoch": 0.871909820355391, "grad_norm": 0.44171789288520813, "learning_rate": 4.419220972875954e-07, "loss": 0.010034840553998947, "memory(GiB)": 22.66, "step": 26840, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.8719423058181464, "grad_norm": 0.4223587214946747, "learning_rate": 4.417013301895634e-07, "loss": 0.011176072061061859, "memory(GiB)": 22.66, "step": 26841, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957597 }, { "epoch": 0.8719747912809018, "grad_norm": 0.2949812114238739, "learning_rate": 4.414806157003648e-07, "loss": 0.005402664188295603, "memory(GiB)": 22.66, "step": 26842, "token_acc": 1.0, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.8720072767436572, "grad_norm": 0.3148702383041382, "learning_rate": 4.4125995382254703e-07, "loss": 0.010825280100107193, "memory(GiB)": 22.66, "step": 26843, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.8720397622064127, "grad_norm": 0.2949313223361969, "learning_rate": 4.410393445586569e-07, "loss": 0.011422143317759037, "memory(GiB)": 22.66, "step": 26844, "token_acc": 1.0, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.872072247669168, "grad_norm": 0.3136722445487976, "learning_rate": 4.408187879112413e-07, "loss": 0.011486144736409187, "memory(GiB)": 22.66, "step": 26845, "token_acc": 1.0, "train_speed(iter/s)": 0.957628 }, { "epoch": 0.8721047331319235, "grad_norm": 0.31454557180404663, "learning_rate": 4.4059828388284543e-07, "loss": 0.012533912435173988, "memory(GiB)": 22.66, "step": 26846, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.8721372185946789, "grad_norm": 0.46096712350845337, "learning_rate": 4.403778324760133e-07, "loss": 0.009565304964780807, "memory(GiB)": 22.66, "step": 26847, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.8721697040574343, "grad_norm": 0.4687858819961548, "learning_rate": 4.401574336932901e-07, "loss": 0.016133897006511688, "memory(GiB)": 22.66, "step": 26848, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.8722021895201897, "grad_norm": 0.20320607721805573, "learning_rate": 4.3993708753721887e-07, "loss": 0.005904780235141516, "memory(GiB)": 22.66, "step": 26849, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.8722346749829452, "grad_norm": 0.38967040181159973, "learning_rate": 4.3971679401034416e-07, "loss": 0.009916737675666809, "memory(GiB)": 22.66, "step": 26850, "token_acc": 0.9967105263157895, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.8722671604457005, "grad_norm": 0.32957813143730164, "learning_rate": 4.3949655311520625e-07, "loss": 0.010734661482274532, "memory(GiB)": 22.66, "step": 26851, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957633 }, { "epoch": 0.872299645908456, "grad_norm": 0.366982638835907, "learning_rate": 4.3927636485434797e-07, "loss": 0.013555115088820457, "memory(GiB)": 22.66, "step": 26852, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.8723321313712114, "grad_norm": 0.23147982358932495, "learning_rate": 4.390562292303113e-07, "loss": 0.0074437083676457405, "memory(GiB)": 22.66, "step": 26853, "token_acc": 1.0, "train_speed(iter/s)": 0.957646 }, { "epoch": 0.8723646168339668, "grad_norm": 0.38400498032569885, "learning_rate": 4.38836146245637e-07, "loss": 0.01294176559895277, "memory(GiB)": 22.66, "step": 26854, "token_acc": 1.0, "train_speed(iter/s)": 0.957652 }, { "epoch": 0.8723971022967222, "grad_norm": 0.4026876389980316, "learning_rate": 4.3861611590286346e-07, "loss": 0.009723532944917679, "memory(GiB)": 22.66, "step": 26855, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.8724295877594777, "grad_norm": 0.2923945188522339, "learning_rate": 4.3839613820453154e-07, "loss": 0.00902356207370758, "memory(GiB)": 22.66, "step": 26856, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957654 }, { "epoch": 0.872462073222233, "grad_norm": 0.2691633701324463, "learning_rate": 4.3817621315317913e-07, "loss": 0.00798456184566021, "memory(GiB)": 22.66, "step": 26857, "token_acc": 1.0, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.8724945586849885, "grad_norm": 0.3338997960090637, "learning_rate": 4.3795634075134596e-07, "loss": 0.008599741384387016, "memory(GiB)": 22.66, "step": 26858, "token_acc": 1.0, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.8725270441477438, "grad_norm": 0.40780946612358093, "learning_rate": 4.3773652100156826e-07, "loss": 0.00795282144099474, "memory(GiB)": 22.66, "step": 26859, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957662 }, { "epoch": 0.8725595296104993, "grad_norm": 0.3496287763118744, "learning_rate": 4.3751675390638295e-07, "loss": 0.011294890195131302, "memory(GiB)": 22.66, "step": 26860, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.8725920150732547, "grad_norm": 0.4204584062099457, "learning_rate": 4.3729703946832736e-07, "loss": 0.01247446984052658, "memory(GiB)": 22.66, "step": 26861, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.8726245005360102, "grad_norm": 0.700365424156189, "learning_rate": 4.370773776899362e-07, "loss": 0.016416234895586967, "memory(GiB)": 22.66, "step": 26862, "token_acc": 1.0, "train_speed(iter/s)": 0.957671 }, { "epoch": 0.8726569859987655, "grad_norm": 0.688059389591217, "learning_rate": 4.3685776857374683e-07, "loss": 0.015250551514327526, "memory(GiB)": 22.66, "step": 26863, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.872689471461521, "grad_norm": 0.3281574249267578, "learning_rate": 4.3663821212229117e-07, "loss": 0.01026352122426033, "memory(GiB)": 22.66, "step": 26864, "token_acc": 1.0, "train_speed(iter/s)": 0.95767 }, { "epoch": 0.8727219569242763, "grad_norm": 0.33027154207229614, "learning_rate": 4.364187083381055e-07, "loss": 0.011074274778366089, "memory(GiB)": 22.66, "step": 26865, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.8727544423870318, "grad_norm": 0.3047460615634918, "learning_rate": 4.3619925722372004e-07, "loss": 0.006350214593112469, "memory(GiB)": 22.66, "step": 26866, "token_acc": 1.0, "train_speed(iter/s)": 0.957669 }, { "epoch": 0.8727869278497872, "grad_norm": 0.3428404927253723, "learning_rate": 4.359798587816716e-07, "loss": 0.011077551171183586, "memory(GiB)": 22.66, "step": 26867, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957674 }, { "epoch": 0.8728194133125426, "grad_norm": 0.2985116243362427, "learning_rate": 4.357605130144893e-07, "loss": 0.01378815621137619, "memory(GiB)": 22.66, "step": 26868, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95768 }, { "epoch": 0.872851898775298, "grad_norm": 0.32380688190460205, "learning_rate": 4.355412199247067e-07, "loss": 0.011677238158881664, "memory(GiB)": 22.66, "step": 26869, "token_acc": 1.0, "train_speed(iter/s)": 0.957687 }, { "epoch": 0.8728843842380535, "grad_norm": 0.40250176191329956, "learning_rate": 4.3532197951485165e-07, "loss": 0.014634931460022926, "memory(GiB)": 22.66, "step": 26870, "token_acc": 1.0, "train_speed(iter/s)": 0.957693 }, { "epoch": 0.8729168697008088, "grad_norm": 0.3517405688762665, "learning_rate": 4.351027917874584e-07, "loss": 0.01316311676055193, "memory(GiB)": 22.66, "step": 26871, "token_acc": 1.0, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.8729493551635643, "grad_norm": 0.40937042236328125, "learning_rate": 4.3488365674505374e-07, "loss": 0.01241166703402996, "memory(GiB)": 22.66, "step": 26872, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.8729818406263197, "grad_norm": 0.31934991478919983, "learning_rate": 4.3466457439016897e-07, "loss": 0.010502769611775875, "memory(GiB)": 22.66, "step": 26873, "token_acc": 1.0, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.8730143260890751, "grad_norm": 0.34374678134918213, "learning_rate": 4.344455447253304e-07, "loss": 0.00920240767300129, "memory(GiB)": 22.66, "step": 26874, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.8730468115518305, "grad_norm": 0.31878387928009033, "learning_rate": 4.3422656775306606e-07, "loss": 0.011560606770217419, "memory(GiB)": 22.66, "step": 26875, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.873079297014586, "grad_norm": 0.34283941984176636, "learning_rate": 4.340076434759061e-07, "loss": 0.012700800783932209, "memory(GiB)": 22.66, "step": 26876, "token_acc": 1.0, "train_speed(iter/s)": 0.957723 }, { "epoch": 0.8731117824773413, "grad_norm": 0.37898382544517517, "learning_rate": 4.337887718963746e-07, "loss": 0.011852337047457695, "memory(GiB)": 22.66, "step": 26877, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957723 }, { "epoch": 0.8731442679400968, "grad_norm": 0.282335489988327, "learning_rate": 4.33569953016999e-07, "loss": 0.012103203684091568, "memory(GiB)": 22.66, "step": 26878, "token_acc": 1.0, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.8731767534028522, "grad_norm": 0.2668963074684143, "learning_rate": 4.3335118684030286e-07, "loss": 0.009342623874545097, "memory(GiB)": 22.66, "step": 26879, "token_acc": 1.0, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.8732092388656076, "grad_norm": 0.3687770664691925, "learning_rate": 4.331324733688125e-07, "loss": 0.018233656883239746, "memory(GiB)": 22.66, "step": 26880, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.8732417243283631, "grad_norm": 0.3609815239906311, "learning_rate": 4.3291381260505193e-07, "loss": 0.009971875697374344, "memory(GiB)": 22.66, "step": 26881, "token_acc": 1.0, "train_speed(iter/s)": 0.957746 }, { "epoch": 0.8732742097911185, "grad_norm": 0.44413813948631287, "learning_rate": 4.3269520455154537e-07, "loss": 0.010171017609536648, "memory(GiB)": 22.66, "step": 26882, "token_acc": 1.0, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.8733066952538739, "grad_norm": 0.3341052830219269, "learning_rate": 4.324766492108146e-07, "loss": 0.010264663025736809, "memory(GiB)": 22.66, "step": 26883, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957753 }, { "epoch": 0.8733391807166293, "grad_norm": 0.5655564665794373, "learning_rate": 4.32258146585382e-07, "loss": 0.01773226633667946, "memory(GiB)": 22.66, "step": 26884, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.8733716661793848, "grad_norm": 0.27995848655700684, "learning_rate": 4.320396966777707e-07, "loss": 0.012637091800570488, "memory(GiB)": 22.66, "step": 26885, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.8734041516421401, "grad_norm": 0.3493840992450714, "learning_rate": 4.3182129949050186e-07, "loss": 0.011326518841087818, "memory(GiB)": 22.66, "step": 26886, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.8734366371048956, "grad_norm": 0.894362211227417, "learning_rate": 4.316029550260947e-07, "loss": 0.016966432332992554, "memory(GiB)": 22.66, "step": 26887, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.873469122567651, "grad_norm": 0.2961106300354004, "learning_rate": 4.3138466328706996e-07, "loss": 0.010651176795363426, "memory(GiB)": 22.66, "step": 26888, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.8735016080304064, "grad_norm": 0.7665298581123352, "learning_rate": 4.3116642427594725e-07, "loss": 0.01656925678253174, "memory(GiB)": 22.66, "step": 26889, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.8735340934931618, "grad_norm": 0.5095420479774475, "learning_rate": 4.3094823799524565e-07, "loss": 0.015872932970523834, "memory(GiB)": 22.66, "step": 26890, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.8735665789559173, "grad_norm": 0.4537733495235443, "learning_rate": 4.307301044474821e-07, "loss": 0.018007943406701088, "memory(GiB)": 22.66, "step": 26891, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.8735990644186726, "grad_norm": 0.38275760412216187, "learning_rate": 4.3051202363517454e-07, "loss": 0.012699870392680168, "memory(GiB)": 22.66, "step": 26892, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957758 }, { "epoch": 0.8736315498814281, "grad_norm": 0.3612266480922699, "learning_rate": 4.302939955608404e-07, "loss": 0.012240897864103317, "memory(GiB)": 22.66, "step": 26893, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.8736640353441835, "grad_norm": 0.23760995268821716, "learning_rate": 4.30076020226996e-07, "loss": 0.011611121706664562, "memory(GiB)": 22.66, "step": 26894, "token_acc": 1.0, "train_speed(iter/s)": 0.957774 }, { "epoch": 0.8736965208069389, "grad_norm": 0.34549185633659363, "learning_rate": 4.2985809763615773e-07, "loss": 0.01309262402355671, "memory(GiB)": 22.66, "step": 26895, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957781 }, { "epoch": 0.8737290062696943, "grad_norm": 0.27668994665145874, "learning_rate": 4.2964022779083846e-07, "loss": 0.006108763627707958, "memory(GiB)": 22.66, "step": 26896, "token_acc": 1.0, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.8737614917324498, "grad_norm": 0.2920267879962921, "learning_rate": 4.294224106935557e-07, "loss": 0.012969855219125748, "memory(GiB)": 22.66, "step": 26897, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.8737939771952051, "grad_norm": 0.4145457446575165, "learning_rate": 4.292046463468197e-07, "loss": 0.01188535988330841, "memory(GiB)": 22.66, "step": 26898, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957804 }, { "epoch": 0.8738264626579606, "grad_norm": 0.22315096855163574, "learning_rate": 4.289869347531472e-07, "loss": 0.008823045529425144, "memory(GiB)": 22.66, "step": 26899, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.873858948120716, "grad_norm": 0.39726969599723816, "learning_rate": 4.287692759150491e-07, "loss": 0.01906617172062397, "memory(GiB)": 22.66, "step": 26900, "token_acc": 1.0, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.8738914335834714, "grad_norm": 0.2607637047767639, "learning_rate": 4.2855166983503835e-07, "loss": 0.007212619297206402, "memory(GiB)": 22.66, "step": 26901, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957828 }, { "epoch": 0.8739239190462268, "grad_norm": 0.39151090383529663, "learning_rate": 4.283341165156252e-07, "loss": 0.013010561466217041, "memory(GiB)": 22.66, "step": 26902, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.957836 }, { "epoch": 0.8739564045089823, "grad_norm": 0.4122765064239502, "learning_rate": 4.281166159593214e-07, "loss": 0.0108133964240551, "memory(GiB)": 22.66, "step": 26903, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957843 }, { "epoch": 0.8739888899717376, "grad_norm": 0.25617483258247375, "learning_rate": 4.2789916816863676e-07, "loss": 0.005618661176413298, "memory(GiB)": 22.66, "step": 26904, "token_acc": 1.0, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.8740213754344931, "grad_norm": 0.28975817561149597, "learning_rate": 4.2768177314608195e-07, "loss": 0.011244146153330803, "memory(GiB)": 22.66, "step": 26905, "token_acc": 1.0, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.8740538608972485, "grad_norm": 0.33944079279899597, "learning_rate": 4.274644308941645e-07, "loss": 0.00937609188258648, "memory(GiB)": 22.66, "step": 26906, "token_acc": 1.0, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.8740863463600039, "grad_norm": 0.26494330167770386, "learning_rate": 4.2724714141539334e-07, "loss": 0.009853923693299294, "memory(GiB)": 22.66, "step": 26907, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.8741188318227593, "grad_norm": 0.30010196566581726, "learning_rate": 4.270299047122772e-07, "loss": 0.007849643938243389, "memory(GiB)": 22.66, "step": 26908, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957842 }, { "epoch": 0.8741513172855148, "grad_norm": 0.31438910961151123, "learning_rate": 4.2681272078732185e-07, "loss": 0.01183358021080494, "memory(GiB)": 22.66, "step": 26909, "token_acc": 1.0, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.8741838027482701, "grad_norm": 0.382438987493515, "learning_rate": 4.265955896430357e-07, "loss": 0.01449299044907093, "memory(GiB)": 22.66, "step": 26910, "token_acc": 1.0, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.8742162882110256, "grad_norm": 0.4644176661968231, "learning_rate": 4.26378511281923e-07, "loss": 0.012430299073457718, "memory(GiB)": 22.66, "step": 26911, "token_acc": 1.0, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.874248773673781, "grad_norm": 0.25544431805610657, "learning_rate": 4.261614857064894e-07, "loss": 0.008443962782621384, "memory(GiB)": 22.66, "step": 26912, "token_acc": 0.9890909090909091, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.8742812591365364, "grad_norm": 0.5239804983139038, "learning_rate": 4.259445129192402e-07, "loss": 0.017469115555286407, "memory(GiB)": 22.66, "step": 26913, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.8743137445992918, "grad_norm": 0.3098614811897278, "learning_rate": 4.2572759292268007e-07, "loss": 0.013398854993283749, "memory(GiB)": 22.66, "step": 26914, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.8743462300620473, "grad_norm": 0.30777642130851746, "learning_rate": 4.2551072571931085e-07, "loss": 0.008198460564017296, "memory(GiB)": 22.66, "step": 26915, "token_acc": 1.0, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.8743787155248026, "grad_norm": 0.3882046937942505, "learning_rate": 4.252939113116367e-07, "loss": 0.01208464428782463, "memory(GiB)": 22.66, "step": 26916, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957858 }, { "epoch": 0.8744112009875581, "grad_norm": 0.4411185681819916, "learning_rate": 4.2507714970215995e-07, "loss": 0.0094909081235528, "memory(GiB)": 22.66, "step": 26917, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.8744436864503135, "grad_norm": 0.5250511765480042, "learning_rate": 4.248604408933826e-07, "loss": 0.013150462880730629, "memory(GiB)": 22.66, "step": 26918, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.957867 }, { "epoch": 0.8744761719130689, "grad_norm": 0.2147035300731659, "learning_rate": 4.2464378488780486e-07, "loss": 0.004983353428542614, "memory(GiB)": 22.66, "step": 26919, "token_acc": 1.0, "train_speed(iter/s)": 0.957864 }, { "epoch": 0.8745086573758243, "grad_norm": 0.27204325795173645, "learning_rate": 4.2442718168792807e-07, "loss": 0.005745476111769676, "memory(GiB)": 22.66, "step": 26920, "token_acc": 1.0, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.8745411428385798, "grad_norm": 0.3643224537372589, "learning_rate": 4.2421063129625017e-07, "loss": 0.013401835225522518, "memory(GiB)": 22.66, "step": 26921, "token_acc": 1.0, "train_speed(iter/s)": 0.957866 }, { "epoch": 0.8745736283013351, "grad_norm": 0.5119950175285339, "learning_rate": 4.2399413371527256e-07, "loss": 0.011380847543478012, "memory(GiB)": 22.66, "step": 26922, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957872 }, { "epoch": 0.8746061137640906, "grad_norm": 0.3355921804904938, "learning_rate": 4.2377768894749426e-07, "loss": 0.008281180635094643, "memory(GiB)": 22.66, "step": 26923, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957879 }, { "epoch": 0.874638599226846, "grad_norm": 0.4268566071987152, "learning_rate": 4.235612969954117e-07, "loss": 0.01601385325193405, "memory(GiB)": 22.66, "step": 26924, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.957881 }, { "epoch": 0.8746710846896014, "grad_norm": 0.32411181926727295, "learning_rate": 4.2334495786152395e-07, "loss": 0.010194050148129463, "memory(GiB)": 22.66, "step": 26925, "token_acc": 1.0, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.8747035701523568, "grad_norm": 0.2905130982398987, "learning_rate": 4.2312867154832514e-07, "loss": 0.008181084878742695, "memory(GiB)": 22.66, "step": 26926, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957893 }, { "epoch": 0.8747360556151123, "grad_norm": 0.3097158968448639, "learning_rate": 4.229124380583144e-07, "loss": 0.009043244644999504, "memory(GiB)": 22.66, "step": 26927, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.8747685410778676, "grad_norm": 0.35718172788619995, "learning_rate": 4.226962573939858e-07, "loss": 0.01657932996749878, "memory(GiB)": 22.66, "step": 26928, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957904 }, { "epoch": 0.8748010265406231, "grad_norm": 0.38746416568756104, "learning_rate": 4.224801295578357e-07, "loss": 0.010835629887878895, "memory(GiB)": 22.66, "step": 26929, "token_acc": 1.0, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.8748335120033784, "grad_norm": 0.3539575934410095, "learning_rate": 4.2226405455235655e-07, "loss": 0.01140202209353447, "memory(GiB)": 22.66, "step": 26930, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.8748659974661339, "grad_norm": 0.4444233179092407, "learning_rate": 4.2204803238004357e-07, "loss": 0.012584902346134186, "memory(GiB)": 22.66, "step": 26931, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95792 }, { "epoch": 0.8748984829288893, "grad_norm": 0.50706547498703, "learning_rate": 4.2183206304338866e-07, "loss": 0.013587682507932186, "memory(GiB)": 22.66, "step": 26932, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.957926 }, { "epoch": 0.8749309683916447, "grad_norm": 0.3301369547843933, "learning_rate": 4.21616146544887e-07, "loss": 0.011496459133923054, "memory(GiB)": 22.66, "step": 26933, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.8749634538544001, "grad_norm": 0.34741631150245667, "learning_rate": 4.2140028288702727e-07, "loss": 0.01136719435453415, "memory(GiB)": 22.66, "step": 26934, "token_acc": 1.0, "train_speed(iter/s)": 0.957925 }, { "epoch": 0.8749959393171556, "grad_norm": 0.22921113669872284, "learning_rate": 4.2118447207230293e-07, "loss": 0.0055007124319672585, "memory(GiB)": 22.66, "step": 26935, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.95792 }, { "epoch": 0.8750284247799109, "grad_norm": 0.308630108833313, "learning_rate": 4.209687141032043e-07, "loss": 0.011258885264396667, "memory(GiB)": 22.66, "step": 26936, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957927 }, { "epoch": 0.8750609102426664, "grad_norm": 0.2807941734790802, "learning_rate": 4.2075300898222095e-07, "loss": 0.008230004459619522, "memory(GiB)": 22.66, "step": 26937, "token_acc": 1.0, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.8750933957054218, "grad_norm": 0.5306049585342407, "learning_rate": 4.205373567118437e-07, "loss": 0.019221527501940727, "memory(GiB)": 22.66, "step": 26938, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.957925 }, { "epoch": 0.8751258811681772, "grad_norm": 0.23579838871955872, "learning_rate": 4.2032175729456006e-07, "loss": 0.008274039253592491, "memory(GiB)": 22.66, "step": 26939, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.8751583666309326, "grad_norm": 0.4001912474632263, "learning_rate": 4.201062107328585e-07, "loss": 0.011918431147933006, "memory(GiB)": 22.66, "step": 26940, "token_acc": 1.0, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.8751908520936881, "grad_norm": 0.30586713552474976, "learning_rate": 4.1989071702922713e-07, "loss": 0.009621139615774155, "memory(GiB)": 22.66, "step": 26941, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.8752233375564434, "grad_norm": 0.266128808259964, "learning_rate": 4.196752761861539e-07, "loss": 0.011898483149707317, "memory(GiB)": 22.66, "step": 26942, "token_acc": 1.0, "train_speed(iter/s)": 0.957923 }, { "epoch": 0.8752558230191989, "grad_norm": 0.25024059414863586, "learning_rate": 4.194598882061235e-07, "loss": 0.005946718621999025, "memory(GiB)": 22.66, "step": 26943, "token_acc": 1.0, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.8752883084819544, "grad_norm": 0.364067941904068, "learning_rate": 4.192445530916228e-07, "loss": 0.012417306192219257, "memory(GiB)": 22.66, "step": 26944, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.957934 }, { "epoch": 0.8753207939447097, "grad_norm": 0.2504345774650574, "learning_rate": 4.190292708451371e-07, "loss": 0.007281758822500706, "memory(GiB)": 22.66, "step": 26945, "token_acc": 1.0, "train_speed(iter/s)": 0.957937 }, { "epoch": 0.8753532794074652, "grad_norm": 0.331217885017395, "learning_rate": 4.188140414691516e-07, "loss": 0.01281630340963602, "memory(GiB)": 22.66, "step": 26946, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.8753857648702206, "grad_norm": 0.4157428443431854, "learning_rate": 4.1859886496614877e-07, "loss": 0.01859426125884056, "memory(GiB)": 22.66, "step": 26947, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.957945 }, { "epoch": 0.875418250332976, "grad_norm": 0.3254733979701996, "learning_rate": 4.183837413386138e-07, "loss": 0.012913547456264496, "memory(GiB)": 22.66, "step": 26948, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95794 }, { "epoch": 0.8754507357957314, "grad_norm": 0.3489913046360016, "learning_rate": 4.1816867058902643e-07, "loss": 0.014842605218291283, "memory(GiB)": 22.66, "step": 26949, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.8754832212584869, "grad_norm": 0.3741697072982788, "learning_rate": 4.17953652719873e-07, "loss": 0.010441838763654232, "memory(GiB)": 22.66, "step": 26950, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.8755157067212422, "grad_norm": 0.43116775155067444, "learning_rate": 4.17738687733632e-07, "loss": 0.01570563018321991, "memory(GiB)": 22.66, "step": 26951, "token_acc": 0.9911894273127754, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.8755481921839977, "grad_norm": 0.3539189100265503, "learning_rate": 4.17523775632786e-07, "loss": 0.014781249687075615, "memory(GiB)": 22.66, "step": 26952, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.8755806776467531, "grad_norm": 0.3036310374736786, "learning_rate": 4.1730891641981574e-07, "loss": 0.015199922025203705, "memory(GiB)": 22.66, "step": 26953, "token_acc": 1.0, "train_speed(iter/s)": 0.957957 }, { "epoch": 0.8756131631095085, "grad_norm": 0.3678309917449951, "learning_rate": 4.1709411009719816e-07, "loss": 0.006674180738627911, "memory(GiB)": 22.66, "step": 26954, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957951 }, { "epoch": 0.8756456485722639, "grad_norm": 0.422151118516922, "learning_rate": 4.168793566674162e-07, "loss": 0.016623834148049355, "memory(GiB)": 22.66, "step": 26955, "token_acc": 1.0, "train_speed(iter/s)": 0.957957 }, { "epoch": 0.8756781340350194, "grad_norm": 0.28398144245147705, "learning_rate": 4.1666465613294513e-07, "loss": 0.01234220527112484, "memory(GiB)": 22.66, "step": 26956, "token_acc": 1.0, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.8757106194977747, "grad_norm": 0.2450638711452484, "learning_rate": 4.1645000849626583e-07, "loss": 0.00797492079436779, "memory(GiB)": 22.66, "step": 26957, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.8757431049605302, "grad_norm": 0.4259594678878784, "learning_rate": 4.162354137598534e-07, "loss": 0.015185603871941566, "memory(GiB)": 22.66, "step": 26958, "token_acc": 0.9819819819819819, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.8757755904232856, "grad_norm": 0.3129527270793915, "learning_rate": 4.160208719261849e-07, "loss": 0.015210317447781563, "memory(GiB)": 22.66, "step": 26959, "token_acc": 0.9818840579710145, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.875808075886041, "grad_norm": 0.30555883049964905, "learning_rate": 4.158063829977371e-07, "loss": 0.010820634663105011, "memory(GiB)": 22.66, "step": 26960, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.8758405613487964, "grad_norm": 0.15842558443546295, "learning_rate": 4.155919469769859e-07, "loss": 0.007149190176278353, "memory(GiB)": 22.66, "step": 26961, "token_acc": 1.0, "train_speed(iter/s)": 0.957978 }, { "epoch": 0.8758730468115519, "grad_norm": 0.616935133934021, "learning_rate": 4.153775638664043e-07, "loss": 0.012307746335864067, "memory(GiB)": 22.66, "step": 26962, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.8759055322743072, "grad_norm": 0.9992897510528564, "learning_rate": 4.15163233668468e-07, "loss": 0.013603388331830502, "memory(GiB)": 22.66, "step": 26963, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.8759380177370627, "grad_norm": 0.47089648246765137, "learning_rate": 4.149489563856507e-07, "loss": 0.012995166704058647, "memory(GiB)": 22.66, "step": 26964, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.8759705031998181, "grad_norm": 0.4355521500110626, "learning_rate": 4.147347320204254e-07, "loss": 0.02166306972503662, "memory(GiB)": 22.66, "step": 26965, "token_acc": 0.9874476987447699, "train_speed(iter/s)": 0.957992 }, { "epoch": 0.8760029886625735, "grad_norm": 0.26841890811920166, "learning_rate": 4.145205605752639e-07, "loss": 0.012941382825374603, "memory(GiB)": 22.66, "step": 26966, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.8760354741253289, "grad_norm": 0.28049978613853455, "learning_rate": 4.1430644205263825e-07, "loss": 0.01056200172752142, "memory(GiB)": 22.66, "step": 26967, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.8760679595880844, "grad_norm": 0.3945493996143341, "learning_rate": 4.1409237645501976e-07, "loss": 0.01222867053002119, "memory(GiB)": 22.66, "step": 26968, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.8761004450508397, "grad_norm": 0.3068530559539795, "learning_rate": 4.1387836378487866e-07, "loss": 0.009921970777213573, "memory(GiB)": 22.66, "step": 26969, "token_acc": 1.0, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.8761329305135952, "grad_norm": 0.2890995740890503, "learning_rate": 4.1366440404468686e-07, "loss": 0.012470392510294914, "memory(GiB)": 22.66, "step": 26970, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958011 }, { "epoch": 0.8761654159763506, "grad_norm": 0.25165390968322754, "learning_rate": 4.1345049723691135e-07, "loss": 0.005704428534954786, "memory(GiB)": 22.66, "step": 26971, "token_acc": 1.0, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.876197901439106, "grad_norm": 0.3837413489818573, "learning_rate": 4.1323664336402116e-07, "loss": 0.011023570783436298, "memory(GiB)": 22.66, "step": 26972, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.958023 }, { "epoch": 0.8762303869018614, "grad_norm": 0.4301580488681793, "learning_rate": 4.1302284242848545e-07, "loss": 0.0167359821498394, "memory(GiB)": 22.66, "step": 26973, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.8762628723646169, "grad_norm": 0.3707996904850006, "learning_rate": 4.1280909443277227e-07, "loss": 0.017045900225639343, "memory(GiB)": 22.66, "step": 26974, "token_acc": 1.0, "train_speed(iter/s)": 0.95802 }, { "epoch": 0.8762953578273722, "grad_norm": 0.4414590001106262, "learning_rate": 4.125953993793469e-07, "loss": 0.016488084569573402, "memory(GiB)": 22.66, "step": 26975, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.8763278432901277, "grad_norm": 0.3169879913330078, "learning_rate": 4.1238175727067733e-07, "loss": 0.008848830126225948, "memory(GiB)": 22.66, "step": 26976, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.876360328752883, "grad_norm": 0.30561041831970215, "learning_rate": 4.1216816810922653e-07, "loss": 0.010137850418686867, "memory(GiB)": 22.66, "step": 26977, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.958033 }, { "epoch": 0.8763928142156385, "grad_norm": 0.39643150568008423, "learning_rate": 4.119546318974632e-07, "loss": 0.013077663257718086, "memory(GiB)": 22.66, "step": 26978, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95804 }, { "epoch": 0.8764252996783939, "grad_norm": 0.4279642403125763, "learning_rate": 4.117411486378492e-07, "loss": 0.01231903862208128, "memory(GiB)": 22.66, "step": 26979, "token_acc": 1.0, "train_speed(iter/s)": 0.958046 }, { "epoch": 0.8764577851411494, "grad_norm": 0.3204006552696228, "learning_rate": 4.1152771833285035e-07, "loss": 0.00969387125223875, "memory(GiB)": 22.66, "step": 26980, "token_acc": 1.0, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.8764902706039047, "grad_norm": 0.3020123243331909, "learning_rate": 4.1131434098492797e-07, "loss": 0.009271437302231789, "memory(GiB)": 22.66, "step": 26981, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.8765227560666602, "grad_norm": 0.31405481696128845, "learning_rate": 4.1110101659654567e-07, "loss": 0.008630406111478806, "memory(GiB)": 22.66, "step": 26982, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.8765552415294156, "grad_norm": 0.3912127912044525, "learning_rate": 4.108877451701648e-07, "loss": 0.012152353301644325, "memory(GiB)": 22.66, "step": 26983, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958045 }, { "epoch": 0.876587726992171, "grad_norm": 0.39830175042152405, "learning_rate": 4.1067452670824847e-07, "loss": 0.012382911518216133, "memory(GiB)": 22.66, "step": 26984, "token_acc": 1.0, "train_speed(iter/s)": 0.958051 }, { "epoch": 0.8766202124549264, "grad_norm": 0.4060990810394287, "learning_rate": 4.1046136121325685e-07, "loss": 0.01360227819532156, "memory(GiB)": 22.66, "step": 26985, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958057 }, { "epoch": 0.8766526979176819, "grad_norm": 0.4668278098106384, "learning_rate": 4.1024824868764857e-07, "loss": 0.011296874843537807, "memory(GiB)": 22.66, "step": 26986, "token_acc": 1.0, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.8766851833804372, "grad_norm": 0.29721492528915405, "learning_rate": 4.1003518913388494e-07, "loss": 0.007769627030938864, "memory(GiB)": 22.66, "step": 26987, "token_acc": 0.996, "train_speed(iter/s)": 0.958064 }, { "epoch": 0.8767176688431927, "grad_norm": 0.7153883576393127, "learning_rate": 4.0982218255442407e-07, "loss": 0.013804704882204533, "memory(GiB)": 22.66, "step": 26988, "token_acc": 1.0, "train_speed(iter/s)": 0.95806 }, { "epoch": 0.876750154305948, "grad_norm": 0.23236072063446045, "learning_rate": 4.0960922895172615e-07, "loss": 0.007360196206718683, "memory(GiB)": 22.66, "step": 26989, "token_acc": 1.0, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.8767826397687035, "grad_norm": 0.37542590498924255, "learning_rate": 4.0939632832824593e-07, "loss": 0.012371169403195381, "memory(GiB)": 22.66, "step": 26990, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.95807 }, { "epoch": 0.8768151252314589, "grad_norm": 0.424826443195343, "learning_rate": 4.0918348068644253e-07, "loss": 0.009890352375805378, "memory(GiB)": 22.66, "step": 26991, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958078 }, { "epoch": 0.8768476106942144, "grad_norm": 0.3728736639022827, "learning_rate": 4.089706860287723e-07, "loss": 0.009978326968848705, "memory(GiB)": 22.66, "step": 26992, "token_acc": 1.0, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.8768800961569697, "grad_norm": 0.3991810977458954, "learning_rate": 4.0875794435769166e-07, "loss": 0.012721417471766472, "memory(GiB)": 22.66, "step": 26993, "token_acc": 1.0, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.8769125816197252, "grad_norm": 0.40663716197013855, "learning_rate": 4.085452556756547e-07, "loss": 0.015980549156665802, "memory(GiB)": 22.66, "step": 26994, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.8769450670824805, "grad_norm": 0.3308778703212738, "learning_rate": 4.083326199851162e-07, "loss": 0.009357692673802376, "memory(GiB)": 22.66, "step": 26995, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.876977552545236, "grad_norm": 0.37996917963027954, "learning_rate": 4.0812003728853135e-07, "loss": 0.008658814243972301, "memory(GiB)": 22.66, "step": 26996, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.8770100380079914, "grad_norm": 0.4214983284473419, "learning_rate": 4.079075075883532e-07, "loss": 0.012914080172777176, "memory(GiB)": 22.66, "step": 26997, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958114 }, { "epoch": 0.8770425234707468, "grad_norm": 0.4488368630409241, "learning_rate": 4.0769503088703424e-07, "loss": 0.017465446144342422, "memory(GiB)": 22.66, "step": 26998, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.8770750089335022, "grad_norm": 0.3364904224872589, "learning_rate": 4.074826071870264e-07, "loss": 0.009854821488261223, "memory(GiB)": 22.66, "step": 26999, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958129 }, { "epoch": 0.8771074943962577, "grad_norm": 0.24635908007621765, "learning_rate": 4.0727023649078267e-07, "loss": 0.005801275838166475, "memory(GiB)": 22.66, "step": 27000, "token_acc": 1.0, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.8771074943962577, "eval_loss": 0.011594435200095177, "eval_runtime": 80.1402, "eval_samples_per_second": 124.157, "eval_steps_per_second": 3.881, "eval_token_acc": 0.9953064695212256, "step": 27000 }, { "epoch": 0.877139979859013, "grad_norm": 0.3823513388633728, "learning_rate": 4.070579188007528e-07, "loss": 0.01040576584637165, "memory(GiB)": 22.66, "step": 27001, "token_acc": 0.9949221209383811, "train_speed(iter/s)": 0.955017 }, { "epoch": 0.8771724653217685, "grad_norm": 0.3272474706172943, "learning_rate": 4.068456541193888e-07, "loss": 0.01177784614264965, "memory(GiB)": 22.66, "step": 27002, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955021 }, { "epoch": 0.8772049507845239, "grad_norm": 0.45233985781669617, "learning_rate": 4.0663344244913803e-07, "loss": 0.01756526529788971, "memory(GiB)": 22.66, "step": 27003, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.955026 }, { "epoch": 0.8772374362472793, "grad_norm": 0.28895998001098633, "learning_rate": 4.064212837924525e-07, "loss": 0.010683891363441944, "memory(GiB)": 22.66, "step": 27004, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.955031 }, { "epoch": 0.8772699217100347, "grad_norm": 0.5026919841766357, "learning_rate": 4.062091781517774e-07, "loss": 0.01465732604265213, "memory(GiB)": 22.66, "step": 27005, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955036 }, { "epoch": 0.8773024071727902, "grad_norm": 0.3351183533668518, "learning_rate": 4.0599712552956474e-07, "loss": 0.008793622255325317, "memory(GiB)": 22.66, "step": 27006, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.8773348926355455, "grad_norm": 0.31150883436203003, "learning_rate": 4.0578512592825857e-07, "loss": 0.008646972477436066, "memory(GiB)": 22.66, "step": 27007, "token_acc": 1.0, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.877367378098301, "grad_norm": 0.40495777130126953, "learning_rate": 4.0557317935030815e-07, "loss": 0.013659536838531494, "memory(GiB)": 22.66, "step": 27008, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.955054 }, { "epoch": 0.8773998635610565, "grad_norm": 0.28044137358665466, "learning_rate": 4.053612857981576e-07, "loss": 0.007373523432761431, "memory(GiB)": 22.66, "step": 27009, "token_acc": 1.0, "train_speed(iter/s)": 0.955061 }, { "epoch": 0.8774323490238118, "grad_norm": 0.2594822943210602, "learning_rate": 4.051494452742533e-07, "loss": 0.009846888482570648, "memory(GiB)": 22.66, "step": 27010, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955067 }, { "epoch": 0.8774648344865673, "grad_norm": 0.3385161757469177, "learning_rate": 4.0493765778103987e-07, "loss": 0.010960815474390984, "memory(GiB)": 22.66, "step": 27011, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955073 }, { "epoch": 0.8774973199493227, "grad_norm": 0.30699238181114197, "learning_rate": 4.0472592332096273e-07, "loss": 0.009608135558664799, "memory(GiB)": 22.66, "step": 27012, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.955078 }, { "epoch": 0.8775298054120781, "grad_norm": 0.5277191996574402, "learning_rate": 4.045142418964637e-07, "loss": 0.014747027307748795, "memory(GiB)": 22.66, "step": 27013, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.8775622908748335, "grad_norm": 0.3418405055999756, "learning_rate": 4.04302613509987e-07, "loss": 0.010579450987279415, "memory(GiB)": 22.66, "step": 27014, "token_acc": 1.0, "train_speed(iter/s)": 0.95509 }, { "epoch": 0.877594776337589, "grad_norm": 0.3036395311355591, "learning_rate": 4.040910381639751e-07, "loss": 0.009648008272051811, "memory(GiB)": 22.66, "step": 27015, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955096 }, { "epoch": 0.8776272618003443, "grad_norm": 0.28505808115005493, "learning_rate": 4.0387951586086995e-07, "loss": 0.006156759802252054, "memory(GiB)": 22.66, "step": 27016, "token_acc": 1.0, "train_speed(iter/s)": 0.955101 }, { "epoch": 0.8776597472630998, "grad_norm": 0.38871362805366516, "learning_rate": 4.036680466031134e-07, "loss": 0.010800521820783615, "memory(GiB)": 22.66, "step": 27017, "token_acc": 1.0, "train_speed(iter/s)": 0.955108 }, { "epoch": 0.8776922327258552, "grad_norm": 0.21403983235359192, "learning_rate": 4.034566303931442e-07, "loss": 0.007541462779045105, "memory(GiB)": 22.66, "step": 27018, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955114 }, { "epoch": 0.8777247181886106, "grad_norm": 0.31253582239151, "learning_rate": 4.0324526723340363e-07, "loss": 0.013034231029450893, "memory(GiB)": 22.66, "step": 27019, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955119 }, { "epoch": 0.877757203651366, "grad_norm": 0.37178048491477966, "learning_rate": 4.0303395712633087e-07, "loss": 0.014986199326813221, "memory(GiB)": 22.66, "step": 27020, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955125 }, { "epoch": 0.8777896891141215, "grad_norm": 0.4055135250091553, "learning_rate": 4.028227000743662e-07, "loss": 0.015228049829602242, "memory(GiB)": 22.66, "step": 27021, "token_acc": 1.0, "train_speed(iter/s)": 0.95513 }, { "epoch": 0.8778221745768768, "grad_norm": 0.3128330111503601, "learning_rate": 4.026114960799449e-07, "loss": 0.011698910966515541, "memory(GiB)": 22.66, "step": 27022, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955135 }, { "epoch": 0.8778546600396323, "grad_norm": 0.30792635679244995, "learning_rate": 4.0240034514550606e-07, "loss": 0.013334715738892555, "memory(GiB)": 22.66, "step": 27023, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955141 }, { "epoch": 0.8778871455023877, "grad_norm": 0.2760826051235199, "learning_rate": 4.021892472734867e-07, "loss": 0.009599998593330383, "memory(GiB)": 22.66, "step": 27024, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955147 }, { "epoch": 0.8779196309651431, "grad_norm": 0.34819138050079346, "learning_rate": 4.0197820246632435e-07, "loss": 0.017228081822395325, "memory(GiB)": 22.66, "step": 27025, "token_acc": 1.0, "train_speed(iter/s)": 0.955153 }, { "epoch": 0.8779521164278985, "grad_norm": 0.3684872090816498, "learning_rate": 4.0176721072645195e-07, "loss": 0.011036721989512444, "memory(GiB)": 22.66, "step": 27026, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.955161 }, { "epoch": 0.877984601890654, "grad_norm": 0.21471646428108215, "learning_rate": 4.0155627205630766e-07, "loss": 0.008768337778747082, "memory(GiB)": 22.66, "step": 27027, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.955168 }, { "epoch": 0.8780170873534093, "grad_norm": 0.26266077160835266, "learning_rate": 4.0134538645832277e-07, "loss": 0.006451199296861887, "memory(GiB)": 22.66, "step": 27028, "token_acc": 1.0, "train_speed(iter/s)": 0.955175 }, { "epoch": 0.8780495728161648, "grad_norm": 0.7194840908050537, "learning_rate": 4.0113455393493426e-07, "loss": 0.011567043140530586, "memory(GiB)": 22.66, "step": 27029, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.955183 }, { "epoch": 0.8780820582789202, "grad_norm": 0.3320044279098511, "learning_rate": 4.00923774488573e-07, "loss": 0.01391204446554184, "memory(GiB)": 22.66, "step": 27030, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95519 }, { "epoch": 0.8781145437416756, "grad_norm": 0.46078041195869446, "learning_rate": 4.007130481216731e-07, "loss": 0.014046479016542435, "memory(GiB)": 22.66, "step": 27031, "token_acc": 1.0, "train_speed(iter/s)": 0.955197 }, { "epoch": 0.878147029204431, "grad_norm": 0.20945072174072266, "learning_rate": 4.005023748366671e-07, "loss": 0.008634669706225395, "memory(GiB)": 22.66, "step": 27032, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955204 }, { "epoch": 0.8781795146671865, "grad_norm": 0.3038821518421173, "learning_rate": 4.0029175463598416e-07, "loss": 0.009539197199046612, "memory(GiB)": 22.66, "step": 27033, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.955212 }, { "epoch": 0.8782120001299418, "grad_norm": 0.3935772776603699, "learning_rate": 4.0008118752205784e-07, "loss": 0.011896546930074692, "memory(GiB)": 22.66, "step": 27034, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.955219 }, { "epoch": 0.8782444855926973, "grad_norm": 0.343451589345932, "learning_rate": 3.9987067349731676e-07, "loss": 0.009566707536578178, "memory(GiB)": 22.66, "step": 27035, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.8782769710554527, "grad_norm": 0.22986121475696564, "learning_rate": 3.996602125641913e-07, "loss": 0.004712359048426151, "memory(GiB)": 22.66, "step": 27036, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955234 }, { "epoch": 0.8783094565182081, "grad_norm": 0.3566305935382843, "learning_rate": 3.994498047251094e-07, "loss": 0.011335642077028751, "memory(GiB)": 22.66, "step": 27037, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955241 }, { "epoch": 0.8783419419809635, "grad_norm": 0.27937865257263184, "learning_rate": 3.9923944998249975e-07, "loss": 0.011072062887251377, "memory(GiB)": 22.66, "step": 27038, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.878374427443719, "grad_norm": 0.4272262454032898, "learning_rate": 3.9902914833879034e-07, "loss": 0.01219065859913826, "memory(GiB)": 22.66, "step": 27039, "token_acc": 1.0, "train_speed(iter/s)": 0.955256 }, { "epoch": 0.8784069129064743, "grad_norm": 0.35576027631759644, "learning_rate": 3.9881889979640986e-07, "loss": 0.011597840115427971, "memory(GiB)": 22.66, "step": 27040, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955263 }, { "epoch": 0.8784393983692298, "grad_norm": 0.38157784938812256, "learning_rate": 3.9860870435778243e-07, "loss": 0.011671466752886772, "memory(GiB)": 22.66, "step": 27041, "token_acc": 0.9888268156424581, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.8784718838319852, "grad_norm": 0.4354855716228485, "learning_rate": 3.9839856202533443e-07, "loss": 0.015058813616633415, "memory(GiB)": 22.66, "step": 27042, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.955278 }, { "epoch": 0.8785043692947406, "grad_norm": 0.3226800560951233, "learning_rate": 3.981884728014923e-07, "loss": 0.013016758486628532, "memory(GiB)": 22.66, "step": 27043, "token_acc": 0.988, "train_speed(iter/s)": 0.955285 }, { "epoch": 0.878536854757496, "grad_norm": 0.4056975841522217, "learning_rate": 3.979784366886813e-07, "loss": 0.008443155325949192, "memory(GiB)": 22.66, "step": 27044, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.8785693402202515, "grad_norm": 0.33212557435035706, "learning_rate": 3.977684536893228e-07, "loss": 0.010981443338096142, "memory(GiB)": 22.66, "step": 27045, "token_acc": 1.0, "train_speed(iter/s)": 0.955299 }, { "epoch": 0.8786018256830068, "grad_norm": 0.383767694234848, "learning_rate": 3.9755852380584217e-07, "loss": 0.010974697768688202, "memory(GiB)": 22.66, "step": 27046, "token_acc": 1.0, "train_speed(iter/s)": 0.955305 }, { "epoch": 0.8786343111457623, "grad_norm": 0.16646896302700043, "learning_rate": 3.9734864704066236e-07, "loss": 0.005481769796460867, "memory(GiB)": 22.66, "step": 27047, "token_acc": 1.0, "train_speed(iter/s)": 0.955302 }, { "epoch": 0.8786667966085177, "grad_norm": 0.3383694291114807, "learning_rate": 3.971388233962048e-07, "loss": 0.003384770592674613, "memory(GiB)": 22.66, "step": 27048, "token_acc": 1.0, "train_speed(iter/s)": 0.955308 }, { "epoch": 0.8786992820712731, "grad_norm": 0.31035491824150085, "learning_rate": 3.9692905287489313e-07, "loss": 0.007772989571094513, "memory(GiB)": 22.66, "step": 27049, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.8787317675340285, "grad_norm": 0.30108642578125, "learning_rate": 3.967193354791454e-07, "loss": 0.011097634211182594, "memory(GiB)": 22.66, "step": 27050, "token_acc": 0.995, "train_speed(iter/s)": 0.955319 }, { "epoch": 0.878764252996784, "grad_norm": 0.38109883666038513, "learning_rate": 3.965096712113836e-07, "loss": 0.009318048134446144, "memory(GiB)": 22.66, "step": 27051, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955325 }, { "epoch": 0.8787967384595393, "grad_norm": 0.5594136118888855, "learning_rate": 3.963000600740274e-07, "loss": 0.009828973561525345, "memory(GiB)": 22.66, "step": 27052, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955331 }, { "epoch": 0.8788292239222948, "grad_norm": 0.2758834958076477, "learning_rate": 3.9609050206949716e-07, "loss": 0.010138102807104588, "memory(GiB)": 22.66, "step": 27053, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955337 }, { "epoch": 0.8788617093850501, "grad_norm": 0.3171955943107605, "learning_rate": 3.9588099720020925e-07, "loss": 0.013353897258639336, "memory(GiB)": 22.66, "step": 27054, "token_acc": 1.0, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.8788941948478056, "grad_norm": 0.4264858663082123, "learning_rate": 3.956715454685833e-07, "loss": 0.012926455587148666, "memory(GiB)": 22.66, "step": 27055, "token_acc": 1.0, "train_speed(iter/s)": 0.955348 }, { "epoch": 0.878926680310561, "grad_norm": 0.27424710988998413, "learning_rate": 3.9546214687703477e-07, "loss": 0.007008343003690243, "memory(GiB)": 22.66, "step": 27056, "token_acc": 1.0, "train_speed(iter/s)": 0.955342 }, { "epoch": 0.8789591657733165, "grad_norm": 0.28826597332954407, "learning_rate": 3.952528014279833e-07, "loss": 0.012004731222987175, "memory(GiB)": 22.66, "step": 27057, "token_acc": 1.0, "train_speed(iter/s)": 0.955349 }, { "epoch": 0.8789916512360718, "grad_norm": 0.3018281161785126, "learning_rate": 3.950435091238425e-07, "loss": 0.01157598290592432, "memory(GiB)": 22.66, "step": 27058, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955355 }, { "epoch": 0.8790241366988273, "grad_norm": 1.3700827360153198, "learning_rate": 3.948342699670299e-07, "loss": 0.014225357212126255, "memory(GiB)": 22.66, "step": 27059, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.95536 }, { "epoch": 0.8790566221615826, "grad_norm": 0.40119293332099915, "learning_rate": 3.9462508395995804e-07, "loss": 0.014251613058149815, "memory(GiB)": 22.66, "step": 27060, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.8790891076243381, "grad_norm": 0.39913633465766907, "learning_rate": 3.944159511050416e-07, "loss": 0.010484331287443638, "memory(GiB)": 22.66, "step": 27061, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955369 }, { "epoch": 0.8791215930870935, "grad_norm": 0.4679226875305176, "learning_rate": 3.9420687140469706e-07, "loss": 0.017231488600373268, "memory(GiB)": 22.66, "step": 27062, "token_acc": 0.9836956521739131, "train_speed(iter/s)": 0.955375 }, { "epoch": 0.879154078549849, "grad_norm": 0.22734765708446503, "learning_rate": 3.9399784486133464e-07, "loss": 0.009954029694199562, "memory(GiB)": 22.66, "step": 27063, "token_acc": 1.0, "train_speed(iter/s)": 0.955382 }, { "epoch": 0.8791865640126043, "grad_norm": 0.486468642950058, "learning_rate": 3.9378887147736857e-07, "loss": 0.01734737679362297, "memory(GiB)": 22.66, "step": 27064, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955388 }, { "epoch": 0.8792190494753598, "grad_norm": 0.22753804922103882, "learning_rate": 3.9357995125520853e-07, "loss": 0.007356445305049419, "memory(GiB)": 22.66, "step": 27065, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955395 }, { "epoch": 0.8792515349381151, "grad_norm": 0.3662896156311035, "learning_rate": 3.933710841972671e-07, "loss": 0.014444992877542973, "memory(GiB)": 22.66, "step": 27066, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.8792840204008706, "grad_norm": 0.3556748330593109, "learning_rate": 3.931622703059551e-07, "loss": 0.01525389589369297, "memory(GiB)": 22.66, "step": 27067, "token_acc": 1.0, "train_speed(iter/s)": 0.955406 }, { "epoch": 0.879316505863626, "grad_norm": 0.35917797684669495, "learning_rate": 3.9295350958368284e-07, "loss": 0.011753060854971409, "memory(GiB)": 22.66, "step": 27068, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955412 }, { "epoch": 0.8793489913263814, "grad_norm": 0.30815795063972473, "learning_rate": 3.927448020328584e-07, "loss": 0.010813141241669655, "memory(GiB)": 22.66, "step": 27069, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955418 }, { "epoch": 0.8793814767891368, "grad_norm": 0.5756612420082092, "learning_rate": 3.9253614765589085e-07, "loss": 0.013713780790567398, "memory(GiB)": 22.66, "step": 27070, "token_acc": 1.0, "train_speed(iter/s)": 0.955424 }, { "epoch": 0.8794139622518923, "grad_norm": 0.2857711613178253, "learning_rate": 3.92327546455189e-07, "loss": 0.0086539126932621, "memory(GiB)": 22.66, "step": 27071, "token_acc": 1.0, "train_speed(iter/s)": 0.95543 }, { "epoch": 0.8794464477146477, "grad_norm": 0.4006759524345398, "learning_rate": 3.921189984331608e-07, "loss": 0.011751986108720303, "memory(GiB)": 22.66, "step": 27072, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.955435 }, { "epoch": 0.8794789331774031, "grad_norm": 0.552141547203064, "learning_rate": 3.9191050359221107e-07, "loss": 0.014564459212124348, "memory(GiB)": 22.66, "step": 27073, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.8795114186401586, "grad_norm": 0.352628231048584, "learning_rate": 3.9170206193474835e-07, "loss": 0.011486491188406944, "memory(GiB)": 22.66, "step": 27074, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955446 }, { "epoch": 0.8795439041029139, "grad_norm": 0.6573164463043213, "learning_rate": 3.914936734631769e-07, "loss": 0.013927796855568886, "memory(GiB)": 22.66, "step": 27075, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.8795763895656694, "grad_norm": 0.3135473132133484, "learning_rate": 3.9128533817990253e-07, "loss": 0.008704964071512222, "memory(GiB)": 22.66, "step": 27076, "token_acc": 1.0, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.8796088750284248, "grad_norm": 0.5258913040161133, "learning_rate": 3.9107705608733057e-07, "loss": 0.014331185258924961, "memory(GiB)": 22.66, "step": 27077, "token_acc": 1.0, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.8796413604911802, "grad_norm": 0.20920290052890778, "learning_rate": 3.9086882718786247e-07, "loss": 0.009738299995660782, "memory(GiB)": 22.66, "step": 27078, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955466 }, { "epoch": 0.8796738459539356, "grad_norm": 0.2860080301761627, "learning_rate": 3.906606514839034e-07, "loss": 0.009349147789180279, "memory(GiB)": 22.66, "step": 27079, "token_acc": 1.0, "train_speed(iter/s)": 0.955472 }, { "epoch": 0.8797063314166911, "grad_norm": 0.42766809463500977, "learning_rate": 3.9045252897785493e-07, "loss": 0.01302922423928976, "memory(GiB)": 22.66, "step": 27080, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.8797388168794464, "grad_norm": 0.2963007092475891, "learning_rate": 3.902444596721211e-07, "loss": 0.011750796809792519, "memory(GiB)": 22.66, "step": 27081, "token_acc": 1.0, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.8797713023422019, "grad_norm": 0.2560359537601471, "learning_rate": 3.900364435691001e-07, "loss": 0.007249285001307726, "memory(GiB)": 22.66, "step": 27082, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.8798037878049573, "grad_norm": 0.3019956350326538, "learning_rate": 3.8982848067119604e-07, "loss": 0.01172097772359848, "memory(GiB)": 22.66, "step": 27083, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.955495 }, { "epoch": 0.8798362732677127, "grad_norm": 0.5415331125259399, "learning_rate": 3.896205709808054e-07, "loss": 0.013073291629552841, "memory(GiB)": 22.66, "step": 27084, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.9555 }, { "epoch": 0.8798687587304681, "grad_norm": 0.3536377251148224, "learning_rate": 3.8941271450033123e-07, "loss": 0.011584736406803131, "memory(GiB)": 22.66, "step": 27085, "token_acc": 1.0, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.8799012441932236, "grad_norm": 0.4527907073497772, "learning_rate": 3.892049112321705e-07, "loss": 0.011602109298110008, "memory(GiB)": 22.66, "step": 27086, "token_acc": 1.0, "train_speed(iter/s)": 0.9555 }, { "epoch": 0.8799337296559789, "grad_norm": 0.2593759298324585, "learning_rate": 3.8899716117872244e-07, "loss": 0.009538883343338966, "memory(GiB)": 22.66, "step": 27087, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.8799662151187344, "grad_norm": 0.15528711676597595, "learning_rate": 3.8878946434238394e-07, "loss": 0.0037496297154575586, "memory(GiB)": 22.66, "step": 27088, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955512 }, { "epoch": 0.8799987005814898, "grad_norm": 0.23467330634593964, "learning_rate": 3.8858182072555263e-07, "loss": 0.007197333499789238, "memory(GiB)": 22.66, "step": 27089, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955518 }, { "epoch": 0.8800311860442452, "grad_norm": 0.2672562003135681, "learning_rate": 3.8837423033062424e-07, "loss": 0.006367722991853952, "memory(GiB)": 22.66, "step": 27090, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.8800636715070006, "grad_norm": 0.3723817765712738, "learning_rate": 3.881666931599959e-07, "loss": 0.009515751153230667, "memory(GiB)": 22.66, "step": 27091, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955533 }, { "epoch": 0.8800961569697561, "grad_norm": 0.38431715965270996, "learning_rate": 3.879592092160633e-07, "loss": 0.010863369330763817, "memory(GiB)": 22.66, "step": 27092, "token_acc": 1.0, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.8801286424325114, "grad_norm": 0.3330497443675995, "learning_rate": 3.8775177850121857e-07, "loss": 0.012050114572048187, "memory(GiB)": 22.66, "step": 27093, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955547 }, { "epoch": 0.8801611278952669, "grad_norm": 0.5011404752731323, "learning_rate": 3.875444010178575e-07, "loss": 0.01689666137099266, "memory(GiB)": 22.66, "step": 27094, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955555 }, { "epoch": 0.8801936133580223, "grad_norm": 0.33965006470680237, "learning_rate": 3.873370767683732e-07, "loss": 0.007882981561124325, "memory(GiB)": 22.66, "step": 27095, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.8802260988207777, "grad_norm": 0.5055893063545227, "learning_rate": 3.8712980575515923e-07, "loss": 0.012945286929607391, "memory(GiB)": 22.66, "step": 27096, "token_acc": 0.9857651245551602, "train_speed(iter/s)": 0.95557 }, { "epoch": 0.8802585842835331, "grad_norm": 0.3420224189758301, "learning_rate": 3.8692258798060654e-07, "loss": 0.007595675066113472, "memory(GiB)": 22.66, "step": 27097, "token_acc": 0.9944444444444445, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.8802910697462886, "grad_norm": 0.28756096959114075, "learning_rate": 3.8671542344710653e-07, "loss": 0.011076909489929676, "memory(GiB)": 22.66, "step": 27098, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.8803235552090439, "grad_norm": 0.3706172704696655, "learning_rate": 3.865083121570512e-07, "loss": 0.012360906228423119, "memory(GiB)": 22.66, "step": 27099, "token_acc": 1.0, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.8803560406717994, "grad_norm": 0.385127454996109, "learning_rate": 3.8630125411283136e-07, "loss": 0.01155596598982811, "memory(GiB)": 22.66, "step": 27100, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.9556 }, { "epoch": 0.8803885261345548, "grad_norm": 0.2611393630504608, "learning_rate": 3.8609424931683515e-07, "loss": 0.007936855778098106, "memory(GiB)": 22.66, "step": 27101, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955607 }, { "epoch": 0.8804210115973102, "grad_norm": 0.37034058570861816, "learning_rate": 3.8588729777145226e-07, "loss": 0.012611180543899536, "memory(GiB)": 22.66, "step": 27102, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.8804534970600656, "grad_norm": 0.2572910487651825, "learning_rate": 3.856803994790709e-07, "loss": 0.010984879918396473, "memory(GiB)": 22.66, "step": 27103, "token_acc": 1.0, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.8804859825228211, "grad_norm": 0.4261287748813629, "learning_rate": 3.854735544420807e-07, "loss": 0.01328461617231369, "memory(GiB)": 22.66, "step": 27104, "token_acc": 1.0, "train_speed(iter/s)": 0.955626 }, { "epoch": 0.8805184679855764, "grad_norm": 0.37753406167030334, "learning_rate": 3.8526676266286645e-07, "loss": 0.013729069381952286, "memory(GiB)": 22.66, "step": 27105, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955632 }, { "epoch": 0.8805509534483319, "grad_norm": 0.2753923833370209, "learning_rate": 3.8506002414381737e-07, "loss": 0.008763938210904598, "memory(GiB)": 22.66, "step": 27106, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.8805834389110873, "grad_norm": 0.31943199038505554, "learning_rate": 3.8485333888731603e-07, "loss": 0.005389743484556675, "memory(GiB)": 22.66, "step": 27107, "token_acc": 1.0, "train_speed(iter/s)": 0.955642 }, { "epoch": 0.8806159243738427, "grad_norm": 0.19049537181854248, "learning_rate": 3.846467068957505e-07, "loss": 0.005786853842437267, "memory(GiB)": 22.66, "step": 27108, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.8806484098365981, "grad_norm": 0.34197333455085754, "learning_rate": 3.84440128171506e-07, "loss": 0.013099007308483124, "memory(GiB)": 22.66, "step": 27109, "token_acc": 1.0, "train_speed(iter/s)": 0.955653 }, { "epoch": 0.8806808952993536, "grad_norm": 0.4670714735984802, "learning_rate": 3.842336027169652e-07, "loss": 0.018924212083220482, "memory(GiB)": 22.66, "step": 27110, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.8807133807621089, "grad_norm": 0.37977108359336853, "learning_rate": 3.840271305345128e-07, "loss": 0.014744534157216549, "memory(GiB)": 22.66, "step": 27111, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.8807458662248644, "grad_norm": 0.263443261384964, "learning_rate": 3.8382071162652914e-07, "loss": 0.008047159761190414, "memory(GiB)": 22.66, "step": 27112, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.95567 }, { "epoch": 0.8807783516876198, "grad_norm": 0.3695604205131531, "learning_rate": 3.836143459954006e-07, "loss": 0.005476399324834347, "memory(GiB)": 22.66, "step": 27113, "token_acc": 1.0, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.8808108371503752, "grad_norm": 0.3542661666870117, "learning_rate": 3.834080336435064e-07, "loss": 0.011639096774160862, "memory(GiB)": 22.66, "step": 27114, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.8808433226131306, "grad_norm": 0.5870959758758545, "learning_rate": 3.832017745732286e-07, "loss": 0.014662300236523151, "memory(GiB)": 22.66, "step": 27115, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.8808758080758861, "grad_norm": 0.3916912376880646, "learning_rate": 3.8299556878694633e-07, "loss": 0.011969871819019318, "memory(GiB)": 22.66, "step": 27116, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.8809082935386414, "grad_norm": 0.4414900839328766, "learning_rate": 3.8278941628704046e-07, "loss": 0.013758406043052673, "memory(GiB)": 22.66, "step": 27117, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.8809407790013969, "grad_norm": 0.2553822696208954, "learning_rate": 3.825833170758902e-07, "loss": 0.008083690889179707, "memory(GiB)": 22.66, "step": 27118, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.8809732644641523, "grad_norm": 0.38600537180900574, "learning_rate": 3.823772711558754e-07, "loss": 0.012027108110487461, "memory(GiB)": 22.66, "step": 27119, "token_acc": 1.0, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.8810057499269077, "grad_norm": 0.28297698497772217, "learning_rate": 3.8217127852937184e-07, "loss": 0.009258486330509186, "memory(GiB)": 22.66, "step": 27120, "token_acc": 1.0, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.8810382353896631, "grad_norm": 0.30819594860076904, "learning_rate": 3.8196533919875824e-07, "loss": 0.012486685067415237, "memory(GiB)": 22.66, "step": 27121, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955711 }, { "epoch": 0.8810707208524186, "grad_norm": 0.37210455536842346, "learning_rate": 3.817594531664109e-07, "loss": 0.01634904183447361, "memory(GiB)": 22.66, "step": 27122, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955717 }, { "epoch": 0.8811032063151739, "grad_norm": 0.31418365240097046, "learning_rate": 3.8155362043470644e-07, "loss": 0.00965399481356144, "memory(GiB)": 22.66, "step": 27123, "token_acc": 1.0, "train_speed(iter/s)": 0.955721 }, { "epoch": 0.8811356917779294, "grad_norm": 0.5611592531204224, "learning_rate": 3.813478410060212e-07, "loss": 0.01510104164481163, "memory(GiB)": 22.66, "step": 27124, "token_acc": 1.0, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.8811681772406847, "grad_norm": 0.3835510313510895, "learning_rate": 3.811421148827288e-07, "loss": 0.008311796002089977, "memory(GiB)": 22.66, "step": 27125, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955736 }, { "epoch": 0.8812006627034402, "grad_norm": 0.4500840902328491, "learning_rate": 3.8093644206720347e-07, "loss": 0.013290764763951302, "memory(GiB)": 22.66, "step": 27126, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955744 }, { "epoch": 0.8812331481661956, "grad_norm": 0.2968180179595947, "learning_rate": 3.8073082256182e-07, "loss": 0.006074616219848394, "memory(GiB)": 22.66, "step": 27127, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.881265633628951, "grad_norm": 0.36132514476776123, "learning_rate": 3.8052525636895154e-07, "loss": 0.011853373609483242, "memory(GiB)": 22.66, "step": 27128, "token_acc": 1.0, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.8812981190917064, "grad_norm": 0.4355775713920593, "learning_rate": 3.8031974349097e-07, "loss": 0.01877707988023758, "memory(GiB)": 22.66, "step": 27129, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955765 }, { "epoch": 0.8813306045544619, "grad_norm": 0.24475443363189697, "learning_rate": 3.8011428393024685e-07, "loss": 0.010161658748984337, "memory(GiB)": 22.66, "step": 27130, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.8813630900172172, "grad_norm": 0.46284106373786926, "learning_rate": 3.7990887768915354e-07, "loss": 0.016597505658864975, "memory(GiB)": 22.66, "step": 27131, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955777 }, { "epoch": 0.8813955754799727, "grad_norm": 0.32311519980430603, "learning_rate": 3.7970352477006255e-07, "loss": 0.008809329941868782, "memory(GiB)": 22.66, "step": 27132, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955783 }, { "epoch": 0.8814280609427281, "grad_norm": 0.4084540903568268, "learning_rate": 3.7949822517534153e-07, "loss": 0.014769770205020905, "memory(GiB)": 22.66, "step": 27133, "token_acc": 1.0, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.8814605464054835, "grad_norm": 0.4196303188800812, "learning_rate": 3.792929789073618e-07, "loss": 0.014328762888908386, "memory(GiB)": 22.66, "step": 27134, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.955795 }, { "epoch": 0.8814930318682389, "grad_norm": 0.33853161334991455, "learning_rate": 3.7908778596848937e-07, "loss": 0.006976719945669174, "memory(GiB)": 22.66, "step": 27135, "token_acc": 1.0, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.8815255173309944, "grad_norm": 0.35528257489204407, "learning_rate": 3.7888264636109616e-07, "loss": 0.007804736495018005, "memory(GiB)": 22.66, "step": 27136, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.8815580027937499, "grad_norm": 0.5477991700172424, "learning_rate": 3.78677560087547e-07, "loss": 0.016374429687857628, "memory(GiB)": 22.66, "step": 27137, "token_acc": 0.9692307692307692, "train_speed(iter/s)": 0.955812 }, { "epoch": 0.8815904882565052, "grad_norm": 0.24461975693702698, "learning_rate": 3.784725271502093e-07, "loss": 0.008936228230595589, "memory(GiB)": 22.66, "step": 27138, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.955818 }, { "epoch": 0.8816229737192607, "grad_norm": 0.6092352867126465, "learning_rate": 3.7826754755145133e-07, "loss": 0.015630032867193222, "memory(GiB)": 22.66, "step": 27139, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955824 }, { "epoch": 0.881655459182016, "grad_norm": 0.2721898555755615, "learning_rate": 3.780626212936356e-07, "loss": 0.007559842895716429, "memory(GiB)": 22.66, "step": 27140, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955829 }, { "epoch": 0.8816879446447715, "grad_norm": 0.3864116072654724, "learning_rate": 3.778577483791307e-07, "loss": 0.012999370694160461, "memory(GiB)": 22.66, "step": 27141, "token_acc": 1.0, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.8817204301075269, "grad_norm": 0.47191962599754333, "learning_rate": 3.776529288102981e-07, "loss": 0.014305423945188522, "memory(GiB)": 22.66, "step": 27142, "token_acc": 0.992, "train_speed(iter/s)": 0.95584 }, { "epoch": 0.8817529155702823, "grad_norm": 0.5220856666564941, "learning_rate": 3.774481625895043e-07, "loss": 0.011788906529545784, "memory(GiB)": 22.66, "step": 27143, "token_acc": 1.0, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.8817854010330377, "grad_norm": 0.3210492432117462, "learning_rate": 3.772434497191102e-07, "loss": 0.012279357761144638, "memory(GiB)": 22.66, "step": 27144, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955852 }, { "epoch": 0.8818178864957932, "grad_norm": 0.3543759882450104, "learning_rate": 3.770387902014799e-07, "loss": 0.01002868078649044, "memory(GiB)": 22.66, "step": 27145, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.8818503719585485, "grad_norm": 0.512312650680542, "learning_rate": 3.76834184038975e-07, "loss": 0.010442335158586502, "memory(GiB)": 22.66, "step": 27146, "token_acc": 1.0, "train_speed(iter/s)": 0.955864 }, { "epoch": 0.881882857421304, "grad_norm": 0.3186814486980438, "learning_rate": 3.766296312339579e-07, "loss": 0.009586745873093605, "memory(GiB)": 22.66, "step": 27147, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955869 }, { "epoch": 0.8819153428840594, "grad_norm": 0.3742971122264862, "learning_rate": 3.764251317887879e-07, "loss": 0.011975719593465328, "memory(GiB)": 22.66, "step": 27148, "token_acc": 1.0, "train_speed(iter/s)": 0.955875 }, { "epoch": 0.8819478283468148, "grad_norm": 0.30083683133125305, "learning_rate": 3.7622068570582537e-07, "loss": 0.008920636028051376, "memory(GiB)": 22.66, "step": 27149, "token_acc": 1.0, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.8819803138095702, "grad_norm": 0.3412773311138153, "learning_rate": 3.7601629298743056e-07, "loss": 0.009149668738245964, "memory(GiB)": 22.66, "step": 27150, "token_acc": 1.0, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.8820127992723257, "grad_norm": 0.3905373811721802, "learning_rate": 3.7581195363596337e-07, "loss": 0.00989330280572176, "memory(GiB)": 22.66, "step": 27151, "token_acc": 1.0, "train_speed(iter/s)": 0.955893 }, { "epoch": 0.882045284735081, "grad_norm": 0.4826882481575012, "learning_rate": 3.7560766765378007e-07, "loss": 0.013419033028185368, "memory(GiB)": 22.66, "step": 27152, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.8820777701978365, "grad_norm": 0.22235974669456482, "learning_rate": 3.75403435043239e-07, "loss": 0.0036141257733106613, "memory(GiB)": 22.66, "step": 27153, "token_acc": 1.0, "train_speed(iter/s)": 0.955906 }, { "epoch": 0.8821102556605919, "grad_norm": 0.47304707765579224, "learning_rate": 3.75199255806698e-07, "loss": 0.015660420060157776, "memory(GiB)": 22.66, "step": 27154, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.8821427411233473, "grad_norm": 0.46559691429138184, "learning_rate": 3.7499512994651265e-07, "loss": 0.018484018743038177, "memory(GiB)": 22.66, "step": 27155, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95592 }, { "epoch": 0.8821752265861027, "grad_norm": 0.4588390290737152, "learning_rate": 3.747910574650404e-07, "loss": 0.014686454087495804, "memory(GiB)": 22.66, "step": 27156, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955928 }, { "epoch": 0.8822077120488582, "grad_norm": 0.3926749527454376, "learning_rate": 3.7458703836463495e-07, "loss": 0.012716980651021004, "memory(GiB)": 22.66, "step": 27157, "token_acc": 1.0, "train_speed(iter/s)": 0.955935 }, { "epoch": 0.8822401975116135, "grad_norm": 0.19837312400341034, "learning_rate": 3.7438307264765163e-07, "loss": 0.007273735012859106, "memory(GiB)": 22.66, "step": 27158, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.882272682974369, "grad_norm": 0.4746403098106384, "learning_rate": 3.741791603164441e-07, "loss": 0.01630162075161934, "memory(GiB)": 22.66, "step": 27159, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955949 }, { "epoch": 0.8823051684371244, "grad_norm": 0.5457342267036438, "learning_rate": 3.739753013733666e-07, "loss": 0.014046031050384045, "memory(GiB)": 22.66, "step": 27160, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.955956 }, { "epoch": 0.8823376538998798, "grad_norm": 0.2536773383617401, "learning_rate": 3.7377149582077055e-07, "loss": 0.008247777819633484, "memory(GiB)": 22.66, "step": 27161, "token_acc": 1.0, "train_speed(iter/s)": 0.955964 }, { "epoch": 0.8823701393626352, "grad_norm": 0.367847204208374, "learning_rate": 3.735677436610102e-07, "loss": 0.012119833379983902, "memory(GiB)": 22.66, "step": 27162, "token_acc": 1.0, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.8824026248253907, "grad_norm": 0.25249573588371277, "learning_rate": 3.733640448964337e-07, "loss": 0.007432634010910988, "memory(GiB)": 22.66, "step": 27163, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.882435110288146, "grad_norm": 0.5888431072235107, "learning_rate": 3.7316039952939586e-07, "loss": 0.011047035455703735, "memory(GiB)": 22.66, "step": 27164, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955982 }, { "epoch": 0.8824675957509015, "grad_norm": 0.3857809603214264, "learning_rate": 3.7295680756224473e-07, "loss": 0.01574721559882164, "memory(GiB)": 22.66, "step": 27165, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955988 }, { "epoch": 0.8825000812136569, "grad_norm": 0.33286887407302856, "learning_rate": 3.727532689973318e-07, "loss": 0.009044885635375977, "memory(GiB)": 22.66, "step": 27166, "token_acc": 1.0, "train_speed(iter/s)": 0.955994 }, { "epoch": 0.8825325666764123, "grad_norm": 0.39139533042907715, "learning_rate": 3.725497838370035e-07, "loss": 0.01583150401711464, "memory(GiB)": 22.66, "step": 27167, "token_acc": 1.0, "train_speed(iter/s)": 0.956 }, { "epoch": 0.8825650521391677, "grad_norm": 0.4426833987236023, "learning_rate": 3.723463520836101e-07, "loss": 0.01974010095000267, "memory(GiB)": 22.66, "step": 27168, "token_acc": 0.9853658536585366, "train_speed(iter/s)": 0.956006 }, { "epoch": 0.8825975376019232, "grad_norm": 0.3284057676792145, "learning_rate": 3.721429737394988e-07, "loss": 0.010135055519640446, "memory(GiB)": 22.66, "step": 27169, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956012 }, { "epoch": 0.8826300230646785, "grad_norm": 0.39611342549324036, "learning_rate": 3.719396488070176e-07, "loss": 0.010715109296143055, "memory(GiB)": 22.66, "step": 27170, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956018 }, { "epoch": 0.882662508527434, "grad_norm": 0.2829495966434479, "learning_rate": 3.717363772885141e-07, "loss": 0.008734158240258694, "memory(GiB)": 22.66, "step": 27171, "token_acc": 1.0, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.8826949939901894, "grad_norm": 0.2260587215423584, "learning_rate": 3.715331591863314e-07, "loss": 0.008520861156284809, "memory(GiB)": 22.66, "step": 27172, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.8827274794529448, "grad_norm": 0.4136374294757843, "learning_rate": 3.7132999450281703e-07, "loss": 0.010748195461928844, "memory(GiB)": 22.66, "step": 27173, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956034 }, { "epoch": 0.8827599649157002, "grad_norm": 0.4887087643146515, "learning_rate": 3.7112688324031477e-07, "loss": 0.008582895621657372, "memory(GiB)": 22.66, "step": 27174, "token_acc": 1.0, "train_speed(iter/s)": 0.956039 }, { "epoch": 0.8827924503784557, "grad_norm": 0.4571179151535034, "learning_rate": 3.7092382540117047e-07, "loss": 0.011696923524141312, "memory(GiB)": 22.66, "step": 27175, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956045 }, { "epoch": 0.882824935841211, "grad_norm": 0.31778618693351746, "learning_rate": 3.7072082098772553e-07, "loss": 0.006239590235054493, "memory(GiB)": 22.66, "step": 27176, "token_acc": 1.0, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.8828574213039665, "grad_norm": 0.20086248219013214, "learning_rate": 3.705178700023243e-07, "loss": 0.006507138255983591, "memory(GiB)": 22.66, "step": 27177, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956057 }, { "epoch": 0.8828899067667219, "grad_norm": 0.25982895493507385, "learning_rate": 3.7031497244730817e-07, "loss": 0.009406284429132938, "memory(GiB)": 22.66, "step": 27178, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.8829223922294773, "grad_norm": 0.47984862327575684, "learning_rate": 3.701121283250203e-07, "loss": 0.012097759172320366, "memory(GiB)": 22.66, "step": 27179, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.8829548776922327, "grad_norm": 0.2644762694835663, "learning_rate": 3.6990933763780044e-07, "loss": 0.00926374550908804, "memory(GiB)": 22.66, "step": 27180, "token_acc": 1.0, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.8829873631549882, "grad_norm": 0.30010202527046204, "learning_rate": 3.69706600387989e-07, "loss": 0.009231491014361382, "memory(GiB)": 22.66, "step": 27181, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95608 }, { "epoch": 0.8830198486177435, "grad_norm": 0.4217180609703064, "learning_rate": 3.695039165779263e-07, "loss": 0.013836350291967392, "memory(GiB)": 22.66, "step": 27182, "token_acc": 1.0, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.883052334080499, "grad_norm": 0.22728779911994934, "learning_rate": 3.6930128620995264e-07, "loss": 0.008331932127475739, "memory(GiB)": 22.66, "step": 27183, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956095 }, { "epoch": 0.8830848195432544, "grad_norm": 0.5211424231529236, "learning_rate": 3.6909870928640457e-07, "loss": 0.016419868916273117, "memory(GiB)": 22.66, "step": 27184, "token_acc": 0.9838709677419355, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.8831173050060098, "grad_norm": 0.32712090015411377, "learning_rate": 3.688961858096207e-07, "loss": 0.013932879082858562, "memory(GiB)": 22.66, "step": 27185, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956111 }, { "epoch": 0.8831497904687652, "grad_norm": 0.35355332493782043, "learning_rate": 3.686937157819398e-07, "loss": 0.008480115793645382, "memory(GiB)": 22.66, "step": 27186, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956118 }, { "epoch": 0.8831822759315207, "grad_norm": 0.34103742241859436, "learning_rate": 3.6849129920569717e-07, "loss": 0.008626646362245083, "memory(GiB)": 22.66, "step": 27187, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.883214761394276, "grad_norm": 0.2913689613342285, "learning_rate": 3.682889360832298e-07, "loss": 0.009398601949214935, "memory(GiB)": 22.66, "step": 27188, "token_acc": 1.0, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.8832472468570315, "grad_norm": 0.39079749584198, "learning_rate": 3.6808662641687256e-07, "loss": 0.013300796039402485, "memory(GiB)": 22.66, "step": 27189, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.8832797323197868, "grad_norm": 0.25015243887901306, "learning_rate": 3.678843702089618e-07, "loss": 0.010106513276696205, "memory(GiB)": 22.66, "step": 27190, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.956148 }, { "epoch": 0.8833122177825423, "grad_norm": 0.34633249044418335, "learning_rate": 3.676821674618286e-07, "loss": 0.00748237781226635, "memory(GiB)": 22.66, "step": 27191, "token_acc": 0.988950276243094, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.8833447032452977, "grad_norm": 0.5050609111785889, "learning_rate": 3.67480018177811e-07, "loss": 0.013783265836536884, "memory(GiB)": 22.66, "step": 27192, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.8833771887080532, "grad_norm": 0.3946472704410553, "learning_rate": 3.6727792235923877e-07, "loss": 0.016760388389229774, "memory(GiB)": 22.66, "step": 27193, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956166 }, { "epoch": 0.8834096741708085, "grad_norm": 0.2962179481983185, "learning_rate": 3.670758800084462e-07, "loss": 0.007569760084152222, "memory(GiB)": 22.66, "step": 27194, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956172 }, { "epoch": 0.883442159633564, "grad_norm": 0.43415573239326477, "learning_rate": 3.6687389112776304e-07, "loss": 0.011197369545698166, "memory(GiB)": 22.66, "step": 27195, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956177 }, { "epoch": 0.8834746450963193, "grad_norm": 0.33406195044517517, "learning_rate": 3.666719557195225e-07, "loss": 0.009179051965475082, "memory(GiB)": 22.66, "step": 27196, "token_acc": 1.0, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.8835071305590748, "grad_norm": 0.3934432864189148, "learning_rate": 3.664700737860544e-07, "loss": 0.015198016539216042, "memory(GiB)": 22.66, "step": 27197, "token_acc": 1.0, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.8835396160218302, "grad_norm": 0.35566428303718567, "learning_rate": 3.662682453296901e-07, "loss": 0.010678254067897797, "memory(GiB)": 22.66, "step": 27198, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956196 }, { "epoch": 0.8835721014845856, "grad_norm": 0.38911983370780945, "learning_rate": 3.660664703527561e-07, "loss": 0.01717551052570343, "memory(GiB)": 22.66, "step": 27199, "token_acc": 0.9860627177700348, "train_speed(iter/s)": 0.956202 }, { "epoch": 0.8836045869473411, "grad_norm": 0.2757842540740967, "learning_rate": 3.6586474885758336e-07, "loss": 0.010504772886633873, "memory(GiB)": 22.66, "step": 27200, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.8836370724100965, "grad_norm": 0.33485981822013855, "learning_rate": 3.656630808464995e-07, "loss": 0.010021751746535301, "memory(GiB)": 22.66, "step": 27201, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956213 }, { "epoch": 0.883669557872852, "grad_norm": 0.26512396335601807, "learning_rate": 3.6546146632183145e-07, "loss": 0.008615956641733646, "memory(GiB)": 22.66, "step": 27202, "token_acc": 1.0, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.8837020433356073, "grad_norm": 0.7320075035095215, "learning_rate": 3.6525990528590793e-07, "loss": 0.010846313089132309, "memory(GiB)": 22.66, "step": 27203, "token_acc": 1.0, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.8837345287983628, "grad_norm": 0.21702946722507477, "learning_rate": 3.650583977410527e-07, "loss": 0.006622034125030041, "memory(GiB)": 22.66, "step": 27204, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95623 }, { "epoch": 0.8837670142611181, "grad_norm": 0.429350882768631, "learning_rate": 3.648569436895932e-07, "loss": 0.017611362040042877, "memory(GiB)": 22.66, "step": 27205, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.8837994997238736, "grad_norm": 0.20539718866348267, "learning_rate": 3.646555431338533e-07, "loss": 0.005480234511196613, "memory(GiB)": 22.66, "step": 27206, "token_acc": 1.0, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.883831985186629, "grad_norm": 0.4348989427089691, "learning_rate": 3.644541960761594e-07, "loss": 0.016358649358153343, "memory(GiB)": 22.66, "step": 27207, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.956248 }, { "epoch": 0.8838644706493844, "grad_norm": 0.3368830978870392, "learning_rate": 3.642529025188329e-07, "loss": 0.014951237477362156, "memory(GiB)": 22.66, "step": 27208, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.8838969561121398, "grad_norm": 0.26271745562553406, "learning_rate": 3.640516624641982e-07, "loss": 0.01184028945863247, "memory(GiB)": 22.66, "step": 27209, "token_acc": 0.996, "train_speed(iter/s)": 0.956258 }, { "epoch": 0.8839294415748953, "grad_norm": 0.3824474513530731, "learning_rate": 3.638504759145778e-07, "loss": 0.016174308955669403, "memory(GiB)": 22.66, "step": 27210, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956264 }, { "epoch": 0.8839619270376506, "grad_norm": 0.4193640649318695, "learning_rate": 3.636493428722943e-07, "loss": 0.012017311528325081, "memory(GiB)": 22.66, "step": 27211, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.8839944125004061, "grad_norm": 0.37565726041793823, "learning_rate": 3.6344826333966745e-07, "loss": 0.011006408371031284, "memory(GiB)": 22.66, "step": 27212, "token_acc": 0.996, "train_speed(iter/s)": 0.956275 }, { "epoch": 0.8840268979631615, "grad_norm": 0.3650214970111847, "learning_rate": 3.632472373190199e-07, "loss": 0.012612747959792614, "memory(GiB)": 22.66, "step": 27213, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.884059383425917, "grad_norm": 0.34660810232162476, "learning_rate": 3.630462648126687e-07, "loss": 0.005517785903066397, "memory(GiB)": 22.66, "step": 27214, "token_acc": 1.0, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.8840918688886723, "grad_norm": 0.2677620053291321, "learning_rate": 3.628453458229364e-07, "loss": 0.007047373801469803, "memory(GiB)": 22.66, "step": 27215, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.8841243543514278, "grad_norm": 0.361555278301239, "learning_rate": 3.6264448035214174e-07, "loss": 0.012624623253941536, "memory(GiB)": 22.66, "step": 27216, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.8841568398141831, "grad_norm": 0.36018118262290955, "learning_rate": 3.6244366840260114e-07, "loss": 0.009765936061739922, "memory(GiB)": 22.66, "step": 27217, "token_acc": 1.0, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.8841893252769386, "grad_norm": 0.36160212755203247, "learning_rate": 3.6224290997663445e-07, "loss": 0.009516136720776558, "memory(GiB)": 22.66, "step": 27218, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956317 }, { "epoch": 0.884221810739694, "grad_norm": 0.30940330028533936, "learning_rate": 3.620422050765554e-07, "loss": 0.01002897322177887, "memory(GiB)": 22.66, "step": 27219, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956324 }, { "epoch": 0.8842542962024494, "grad_norm": 0.4106747508049011, "learning_rate": 3.6184155370468433e-07, "loss": 0.01688118651509285, "memory(GiB)": 22.66, "step": 27220, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.8842867816652048, "grad_norm": 0.5013177990913391, "learning_rate": 3.616409558633338e-07, "loss": 0.011730417609214783, "memory(GiB)": 22.66, "step": 27221, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.8843192671279603, "grad_norm": 0.23485100269317627, "learning_rate": 3.6144041155482145e-07, "loss": 0.0056967223063111305, "memory(GiB)": 22.66, "step": 27222, "token_acc": 1.0, "train_speed(iter/s)": 0.956345 }, { "epoch": 0.8843517525907156, "grad_norm": 0.554661214351654, "learning_rate": 3.6123992078145984e-07, "loss": 0.013449160382151604, "memory(GiB)": 22.66, "step": 27223, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.956351 }, { "epoch": 0.8843842380534711, "grad_norm": 0.34137672185897827, "learning_rate": 3.6103948354556327e-07, "loss": 0.013391758315265179, "memory(GiB)": 22.66, "step": 27224, "token_acc": 1.0, "train_speed(iter/s)": 0.956357 }, { "epoch": 0.8844167235162265, "grad_norm": 0.8479905128479004, "learning_rate": 3.60839099849446e-07, "loss": 0.008663026615977287, "memory(GiB)": 22.66, "step": 27225, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.8844492089789819, "grad_norm": 0.11055265367031097, "learning_rate": 3.606387696954211e-07, "loss": 0.0027665491215884686, "memory(GiB)": 22.66, "step": 27226, "token_acc": 1.0, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.8844816944417373, "grad_norm": 0.3128693997859955, "learning_rate": 3.6043849308579846e-07, "loss": 0.01144590973854065, "memory(GiB)": 22.66, "step": 27227, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.8845141799044928, "grad_norm": 0.23187598586082458, "learning_rate": 3.6023827002289125e-07, "loss": 0.007948619313538074, "memory(GiB)": 22.66, "step": 27228, "token_acc": 1.0, "train_speed(iter/s)": 0.95638 }, { "epoch": 0.8845466653672481, "grad_norm": 0.3472153842449188, "learning_rate": 3.6003810050900976e-07, "loss": 0.017966069281101227, "memory(GiB)": 22.66, "step": 27229, "token_acc": 0.9865319865319865, "train_speed(iter/s)": 0.956384 }, { "epoch": 0.8845791508300036, "grad_norm": 0.349775493144989, "learning_rate": 3.5983798454646443e-07, "loss": 0.009828408248722553, "memory(GiB)": 22.66, "step": 27230, "token_acc": 1.0, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.884611636292759, "grad_norm": 0.46118029952049255, "learning_rate": 3.5963792213756566e-07, "loss": 0.011019760742783546, "memory(GiB)": 22.66, "step": 27231, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956395 }, { "epoch": 0.8846441217555144, "grad_norm": 0.29602280259132385, "learning_rate": 3.5943791328462096e-07, "loss": 0.007684493437409401, "memory(GiB)": 22.66, "step": 27232, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.8846766072182698, "grad_norm": 0.4034661650657654, "learning_rate": 3.592379579899391e-07, "loss": 0.011594295501708984, "memory(GiB)": 22.66, "step": 27233, "token_acc": 1.0, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.8847090926810253, "grad_norm": 0.34502217173576355, "learning_rate": 3.590380562558277e-07, "loss": 0.008427826687693596, "memory(GiB)": 22.66, "step": 27234, "token_acc": 1.0, "train_speed(iter/s)": 0.956412 }, { "epoch": 0.8847415781437806, "grad_norm": 0.38801297545433044, "learning_rate": 3.5883820808459536e-07, "loss": 0.01338171400129795, "memory(GiB)": 22.66, "step": 27235, "token_acc": 0.9839357429718876, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.8847740636065361, "grad_norm": 0.3951495289802551, "learning_rate": 3.58638413478547e-07, "loss": 0.015211094170808792, "memory(GiB)": 22.66, "step": 27236, "token_acc": 0.9875, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.8848065490692915, "grad_norm": 0.2701873779296875, "learning_rate": 3.5843867243998854e-07, "loss": 0.008741458877921104, "memory(GiB)": 22.66, "step": 27237, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.8848390345320469, "grad_norm": 0.18676002323627472, "learning_rate": 3.582389849712259e-07, "loss": 0.009097982197999954, "memory(GiB)": 22.66, "step": 27238, "token_acc": 1.0, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.8848715199948023, "grad_norm": 0.42145398259162903, "learning_rate": 3.580393510745639e-07, "loss": 0.014910721220076084, "memory(GiB)": 22.66, "step": 27239, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956442 }, { "epoch": 0.8849040054575578, "grad_norm": 0.5026933550834656, "learning_rate": 3.578397707523057e-07, "loss": 0.012629006057977676, "memory(GiB)": 22.66, "step": 27240, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956448 }, { "epoch": 0.8849364909203131, "grad_norm": 0.32777518033981323, "learning_rate": 3.576402440067567e-07, "loss": 0.009767068549990654, "memory(GiB)": 22.66, "step": 27241, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956455 }, { "epoch": 0.8849689763830686, "grad_norm": 0.40613940358161926, "learning_rate": 3.5744077084021554e-07, "loss": 0.010922514833509922, "memory(GiB)": 22.66, "step": 27242, "token_acc": 1.0, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.885001461845824, "grad_norm": 0.24284400045871735, "learning_rate": 3.572413512549894e-07, "loss": 0.009522528387606144, "memory(GiB)": 22.66, "step": 27243, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956469 }, { "epoch": 0.8850339473085794, "grad_norm": 0.47677096724510193, "learning_rate": 3.5704198525337686e-07, "loss": 0.01661084219813347, "memory(GiB)": 22.66, "step": 27244, "token_acc": 1.0, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.8850664327713348, "grad_norm": 0.28434282541275024, "learning_rate": 3.5684267283768e-07, "loss": 0.009103729389607906, "memory(GiB)": 22.66, "step": 27245, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.8850989182340903, "grad_norm": 0.3136313557624817, "learning_rate": 3.566434140101982e-07, "loss": 0.012881139293313026, "memory(GiB)": 22.66, "step": 27246, "token_acc": 1.0, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.8851314036968456, "grad_norm": 0.36000755429267883, "learning_rate": 3.564442087732306e-07, "loss": 0.011951424181461334, "memory(GiB)": 22.66, "step": 27247, "token_acc": 0.9817351598173516, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.8851638891596011, "grad_norm": 0.31492742896080017, "learning_rate": 3.562450571290793e-07, "loss": 0.010683605447411537, "memory(GiB)": 22.66, "step": 27248, "token_acc": 1.0, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.8851963746223565, "grad_norm": 0.3202601373195648, "learning_rate": 3.560459590800397e-07, "loss": 0.007334292866289616, "memory(GiB)": 22.66, "step": 27249, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.8852288600851119, "grad_norm": 0.41524332761764526, "learning_rate": 3.5584691462841213e-07, "loss": 0.019212648272514343, "memory(GiB)": 22.66, "step": 27250, "token_acc": 1.0, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.8852613455478673, "grad_norm": 0.3961201012134552, "learning_rate": 3.5564792377649095e-07, "loss": 0.015734877437353134, "memory(GiB)": 22.66, "step": 27251, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.8852938310106228, "grad_norm": 0.4552018344402313, "learning_rate": 3.5544898652657476e-07, "loss": 0.015989139676094055, "memory(GiB)": 22.66, "step": 27252, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.8853263164733781, "grad_norm": 0.28867384791374207, "learning_rate": 3.5525010288095964e-07, "loss": 0.012330901809036732, "memory(GiB)": 22.66, "step": 27253, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.8853588019361336, "grad_norm": 0.22255997359752655, "learning_rate": 3.5505127284194085e-07, "loss": 0.0036914104130119085, "memory(GiB)": 22.66, "step": 27254, "token_acc": 1.0, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.885391287398889, "grad_norm": 0.3630341589450836, "learning_rate": 3.548524964118116e-07, "loss": 0.009922072291374207, "memory(GiB)": 22.66, "step": 27255, "token_acc": 1.0, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.8854237728616444, "grad_norm": 0.3090378940105438, "learning_rate": 3.546537735928679e-07, "loss": 0.011487802490592003, "memory(GiB)": 22.66, "step": 27256, "token_acc": 1.0, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.8854562583243998, "grad_norm": 0.2846752107143402, "learning_rate": 3.5445510438740226e-07, "loss": 0.006724067963659763, "memory(GiB)": 22.66, "step": 27257, "token_acc": 1.0, "train_speed(iter/s)": 0.956565 }, { "epoch": 0.8854887437871553, "grad_norm": 0.23973514139652252, "learning_rate": 3.542564887977085e-07, "loss": 0.010484413243830204, "memory(GiB)": 22.66, "step": 27258, "token_acc": 0.9838056680161943, "train_speed(iter/s)": 0.956571 }, { "epoch": 0.8855212292499106, "grad_norm": 0.40032246708869934, "learning_rate": 3.5405792682607796e-07, "loss": 0.011539597064256668, "memory(GiB)": 22.66, "step": 27259, "token_acc": 1.0, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.8855537147126661, "grad_norm": 0.6243445873260498, "learning_rate": 3.5385941847480233e-07, "loss": 0.016547342762351036, "memory(GiB)": 22.66, "step": 27260, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.8855862001754214, "grad_norm": 0.46510565280914307, "learning_rate": 3.536609637461735e-07, "loss": 0.017989864572882652, "memory(GiB)": 22.66, "step": 27261, "token_acc": 1.0, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.8856186856381769, "grad_norm": 0.3777138590812683, "learning_rate": 3.534625626424809e-07, "loss": 0.010893115773797035, "memory(GiB)": 22.66, "step": 27262, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956594 }, { "epoch": 0.8856511711009323, "grad_norm": 0.3701799213886261, "learning_rate": 3.5326421516601593e-07, "loss": 0.009780588559806347, "memory(GiB)": 22.66, "step": 27263, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956599 }, { "epoch": 0.8856836565636877, "grad_norm": 0.35346269607543945, "learning_rate": 3.530659213190657e-07, "loss": 0.006778322160243988, "memory(GiB)": 22.66, "step": 27264, "token_acc": 1.0, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.8857161420264432, "grad_norm": 0.37826329469680786, "learning_rate": 3.5286768110392e-07, "loss": 0.01422676257789135, "memory(GiB)": 22.66, "step": 27265, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.8857486274891986, "grad_norm": 0.3119800090789795, "learning_rate": 3.5266949452286646e-07, "loss": 0.010315535590052605, "memory(GiB)": 22.66, "step": 27266, "token_acc": 1.0, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.885781112951954, "grad_norm": 0.35017332434654236, "learning_rate": 3.5247136157819326e-07, "loss": 0.01463218592107296, "memory(GiB)": 22.66, "step": 27267, "token_acc": 1.0, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.8858135984147094, "grad_norm": 0.3329828381538391, "learning_rate": 3.5227328227218584e-07, "loss": 0.012084037065505981, "memory(GiB)": 22.66, "step": 27268, "token_acc": 1.0, "train_speed(iter/s)": 0.956628 }, { "epoch": 0.8858460838774649, "grad_norm": 0.2825744152069092, "learning_rate": 3.5207525660713173e-07, "loss": 0.012484611943364143, "memory(GiB)": 22.66, "step": 27269, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.8858785693402202, "grad_norm": 0.37484991550445557, "learning_rate": 3.518772845853136e-07, "loss": 0.009872540831565857, "memory(GiB)": 22.66, "step": 27270, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.95664 }, { "epoch": 0.8859110548029757, "grad_norm": 0.3016609251499176, "learning_rate": 3.516793662090201e-07, "loss": 0.006796636618673801, "memory(GiB)": 22.66, "step": 27271, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956646 }, { "epoch": 0.8859435402657311, "grad_norm": 0.25959381461143494, "learning_rate": 3.514815014805328e-07, "loss": 0.006392951589077711, "memory(GiB)": 22.66, "step": 27272, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.8859760257284865, "grad_norm": 0.36892446875572205, "learning_rate": 3.512836904021366e-07, "loss": 0.01114570815116167, "memory(GiB)": 22.66, "step": 27273, "token_acc": 1.0, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.8860085111912419, "grad_norm": 0.31124740839004517, "learning_rate": 3.510859329761135e-07, "loss": 0.010486328043043613, "memory(GiB)": 22.66, "step": 27274, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956663 }, { "epoch": 0.8860409966539974, "grad_norm": 0.31984207034111023, "learning_rate": 3.508882292047466e-07, "loss": 0.012199519202113152, "memory(GiB)": 22.66, "step": 27275, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.8860734821167527, "grad_norm": 0.42769157886505127, "learning_rate": 3.50690579090317e-07, "loss": 0.014182381331920624, "memory(GiB)": 22.66, "step": 27276, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.8861059675795082, "grad_norm": 0.30613258481025696, "learning_rate": 3.5049298263510666e-07, "loss": 0.011656554415822029, "memory(GiB)": 22.66, "step": 27277, "token_acc": 1.0, "train_speed(iter/s)": 0.956683 }, { "epoch": 0.8861384530422636, "grad_norm": 0.4323124289512634, "learning_rate": 3.5029543984139656e-07, "loss": 0.012639595195651054, "memory(GiB)": 22.66, "step": 27278, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.886170938505019, "grad_norm": 0.38203367590904236, "learning_rate": 3.5009795071146434e-07, "loss": 0.01045604795217514, "memory(GiB)": 22.66, "step": 27279, "token_acc": 0.9867549668874173, "train_speed(iter/s)": 0.956698 }, { "epoch": 0.8862034239677744, "grad_norm": 0.4970862865447998, "learning_rate": 3.4990051524759206e-07, "loss": 0.013273602351546288, "memory(GiB)": 22.66, "step": 27280, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.8862359094305299, "grad_norm": 0.3180732727050781, "learning_rate": 3.497031334520562e-07, "loss": 0.011258188635110855, "memory(GiB)": 22.66, "step": 27281, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.8862683948932852, "grad_norm": 0.35669994354248047, "learning_rate": 3.495058053271372e-07, "loss": 0.015273841097950935, "memory(GiB)": 22.66, "step": 27282, "token_acc": 1.0, "train_speed(iter/s)": 0.956717 }, { "epoch": 0.8863008803560407, "grad_norm": 0.2566255033016205, "learning_rate": 3.493085308751099e-07, "loss": 0.008669610135257244, "memory(GiB)": 22.66, "step": 27283, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.8863333658187961, "grad_norm": 0.4717848002910614, "learning_rate": 3.491113100982518e-07, "loss": 0.016963258385658264, "memory(GiB)": 22.66, "step": 27284, "token_acc": 0.988, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.8863658512815515, "grad_norm": 0.21915137767791748, "learning_rate": 3.4891414299883963e-07, "loss": 0.006260756403207779, "memory(GiB)": 22.66, "step": 27285, "token_acc": 1.0, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.8863983367443069, "grad_norm": 0.4121035039424896, "learning_rate": 3.4871702957914976e-07, "loss": 0.008638715371489525, "memory(GiB)": 22.66, "step": 27286, "token_acc": 1.0, "train_speed(iter/s)": 0.95674 }, { "epoch": 0.8864308222070624, "grad_norm": 0.4013417959213257, "learning_rate": 3.4851996984145544e-07, "loss": 0.011346017941832542, "memory(GiB)": 22.66, "step": 27287, "token_acc": 1.0, "train_speed(iter/s)": 0.956745 }, { "epoch": 0.8864633076698177, "grad_norm": 0.43306419253349304, "learning_rate": 3.48322963788032e-07, "loss": 0.01453747320920229, "memory(GiB)": 22.66, "step": 27288, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956751 }, { "epoch": 0.8864957931325732, "grad_norm": 0.5078442692756653, "learning_rate": 3.4812601142115263e-07, "loss": 0.014029810205101967, "memory(GiB)": 22.66, "step": 27289, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.8865282785953286, "grad_norm": 0.47637802362442017, "learning_rate": 3.479291127430917e-07, "loss": 0.017236214131116867, "memory(GiB)": 22.66, "step": 27290, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.886560764058084, "grad_norm": 0.36236703395843506, "learning_rate": 3.4773226775611955e-07, "loss": 0.013519125990569592, "memory(GiB)": 22.66, "step": 27291, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956768 }, { "epoch": 0.8865932495208394, "grad_norm": 0.37217551469802856, "learning_rate": 3.4753547646250995e-07, "loss": 0.01525834295898676, "memory(GiB)": 22.66, "step": 27292, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956774 }, { "epoch": 0.8866257349835949, "grad_norm": 0.3346637189388275, "learning_rate": 3.473387388645327e-07, "loss": 0.011275989934802055, "memory(GiB)": 22.66, "step": 27293, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.8866582204463502, "grad_norm": 0.604896068572998, "learning_rate": 3.4714205496445883e-07, "loss": 0.011696234345436096, "memory(GiB)": 22.66, "step": 27294, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.8866907059091057, "grad_norm": 0.33438462018966675, "learning_rate": 3.4694542476455983e-07, "loss": 0.007774848025292158, "memory(GiB)": 22.66, "step": 27295, "token_acc": 1.0, "train_speed(iter/s)": 0.956791 }, { "epoch": 0.8867231913718611, "grad_norm": 0.2834361493587494, "learning_rate": 3.467488482671033e-07, "loss": 0.008055251091718674, "memory(GiB)": 22.66, "step": 27296, "token_acc": 1.0, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.8867556768346165, "grad_norm": 0.3723765015602112, "learning_rate": 3.4655232547435804e-07, "loss": 0.011947644874453545, "memory(GiB)": 22.66, "step": 27297, "token_acc": 1.0, "train_speed(iter/s)": 0.956802 }, { "epoch": 0.8867881622973719, "grad_norm": 0.34620773792266846, "learning_rate": 3.463558563885927e-07, "loss": 0.012212160043418407, "memory(GiB)": 22.66, "step": 27298, "token_acc": 1.0, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.8868206477601274, "grad_norm": 0.2964284121990204, "learning_rate": 3.461594410120761e-07, "loss": 0.008238233625888824, "memory(GiB)": 22.66, "step": 27299, "token_acc": 1.0, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.8868531332228827, "grad_norm": 0.4571971893310547, "learning_rate": 3.459630793470725e-07, "loss": 0.01443460863083601, "memory(GiB)": 22.66, "step": 27300, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.956819 }, { "epoch": 0.8868856186856382, "grad_norm": 0.44754451513290405, "learning_rate": 3.4576677139585015e-07, "loss": 0.013150027953088284, "memory(GiB)": 22.66, "step": 27301, "token_acc": 1.0, "train_speed(iter/s)": 0.956827 }, { "epoch": 0.8869181041483936, "grad_norm": 0.3208288550376892, "learning_rate": 3.455705171606727e-07, "loss": 0.015366842970252037, "memory(GiB)": 22.66, "step": 27302, "token_acc": 1.0, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.886950589611149, "grad_norm": 0.4385974705219269, "learning_rate": 3.4537431664380837e-07, "loss": 0.015367040410637856, "memory(GiB)": 22.66, "step": 27303, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956841 }, { "epoch": 0.8869830750739044, "grad_norm": 0.41894230246543884, "learning_rate": 3.451781698475187e-07, "loss": 0.011158725246787071, "memory(GiB)": 22.66, "step": 27304, "token_acc": 1.0, "train_speed(iter/s)": 0.956849 }, { "epoch": 0.8870155605366599, "grad_norm": 0.28773555159568787, "learning_rate": 3.4498207677406913e-07, "loss": 0.011416242457926273, "memory(GiB)": 22.66, "step": 27305, "token_acc": 0.9851301115241635, "train_speed(iter/s)": 0.956856 }, { "epoch": 0.8870480459994152, "grad_norm": 0.47842708230018616, "learning_rate": 3.447860374257211e-07, "loss": 0.010007208213210106, "memory(GiB)": 22.66, "step": 27306, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956864 }, { "epoch": 0.8870805314621707, "grad_norm": 0.37245529890060425, "learning_rate": 3.44590051804739e-07, "loss": 0.01409229077398777, "memory(GiB)": 22.66, "step": 27307, "token_acc": 1.0, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.8871130169249261, "grad_norm": 0.2524896264076233, "learning_rate": 3.443941199133838e-07, "loss": 0.012297237291932106, "memory(GiB)": 22.66, "step": 27308, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.8871455023876815, "grad_norm": 0.232730895280838, "learning_rate": 3.441982417539169e-07, "loss": 0.0077041760087013245, "memory(GiB)": 22.66, "step": 27309, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956886 }, { "epoch": 0.8871779878504369, "grad_norm": 0.32452312111854553, "learning_rate": 3.4400241732859996e-07, "loss": 0.013272898271679878, "memory(GiB)": 22.66, "step": 27310, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.8872104733131924, "grad_norm": 0.41107243299484253, "learning_rate": 3.4380664663969163e-07, "loss": 0.013080919161438942, "memory(GiB)": 22.66, "step": 27311, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.8872429587759477, "grad_norm": 0.3155547082424164, "learning_rate": 3.4361092968945186e-07, "loss": 0.006951260380446911, "memory(GiB)": 22.66, "step": 27312, "token_acc": 0.9940828402366864, "train_speed(iter/s)": 0.956909 }, { "epoch": 0.8872754442387032, "grad_norm": 0.3272441327571869, "learning_rate": 3.434152664801393e-07, "loss": 0.007719411049038172, "memory(GiB)": 22.66, "step": 27313, "token_acc": 1.0, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.8873079297014586, "grad_norm": 0.34206846356391907, "learning_rate": 3.4321965701401395e-07, "loss": 0.01102842204272747, "memory(GiB)": 22.66, "step": 27314, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.887340415164214, "grad_norm": 0.3599191904067993, "learning_rate": 3.4302410129333054e-07, "loss": 0.016924602910876274, "memory(GiB)": 22.66, "step": 27315, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.8873729006269694, "grad_norm": 0.28442472219467163, "learning_rate": 3.428285993203473e-07, "loss": 0.008019371889531612, "memory(GiB)": 22.66, "step": 27316, "token_acc": 1.0, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.8874053860897249, "grad_norm": 0.407850056886673, "learning_rate": 3.426331510973213e-07, "loss": 0.011439596302807331, "memory(GiB)": 22.66, "step": 27317, "token_acc": 1.0, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.8874378715524802, "grad_norm": 0.39186614751815796, "learning_rate": 3.42437756626508e-07, "loss": 0.009659484028816223, "memory(GiB)": 22.66, "step": 27318, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.8874703570152357, "grad_norm": 0.25311753153800964, "learning_rate": 3.4224241591016173e-07, "loss": 0.005558658391237259, "memory(GiB)": 22.66, "step": 27319, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.887502842477991, "grad_norm": 0.36282166838645935, "learning_rate": 3.420471289505373e-07, "loss": 0.01106911338865757, "memory(GiB)": 22.66, "step": 27320, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.8875353279407465, "grad_norm": 0.39132219552993774, "learning_rate": 3.4185189574988843e-07, "loss": 0.013926228508353233, "memory(GiB)": 22.66, "step": 27321, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956962 }, { "epoch": 0.8875678134035019, "grad_norm": 0.3542754054069519, "learning_rate": 3.4165671631047003e-07, "loss": 0.009377877227962017, "memory(GiB)": 22.66, "step": 27322, "token_acc": 1.0, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.8876002988662574, "grad_norm": 0.3153008818626404, "learning_rate": 3.41461590634532e-07, "loss": 0.010174086317420006, "memory(GiB)": 22.66, "step": 27323, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956973 }, { "epoch": 0.8876327843290127, "grad_norm": 0.31474199891090393, "learning_rate": 3.4126651872432806e-07, "loss": 0.011051878333091736, "memory(GiB)": 22.66, "step": 27324, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956979 }, { "epoch": 0.8876652697917682, "grad_norm": 0.27684351801872253, "learning_rate": 3.4107150058210916e-07, "loss": 0.00748192984610796, "memory(GiB)": 22.66, "step": 27325, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.8876977552545235, "grad_norm": 0.37425312399864197, "learning_rate": 3.4087653621012627e-07, "loss": 0.011116026900708675, "memory(GiB)": 22.66, "step": 27326, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.95699 }, { "epoch": 0.887730240717279, "grad_norm": 0.30224746465682983, "learning_rate": 3.4068162561063047e-07, "loss": 0.010718252509832382, "memory(GiB)": 22.66, "step": 27327, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.8877627261800345, "grad_norm": 0.2789263427257538, "learning_rate": 3.4048676878586873e-07, "loss": 0.009452278725802898, "memory(GiB)": 22.66, "step": 27328, "token_acc": 1.0, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.8877952116427898, "grad_norm": 0.28825780749320984, "learning_rate": 3.402919657380926e-07, "loss": 0.006693762727081776, "memory(GiB)": 22.66, "step": 27329, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957007 }, { "epoch": 0.8878276971055453, "grad_norm": 0.32914453744888306, "learning_rate": 3.400972164695482e-07, "loss": 0.011707964353263378, "memory(GiB)": 22.66, "step": 27330, "token_acc": 1.0, "train_speed(iter/s)": 0.957013 }, { "epoch": 0.8878601825683007, "grad_norm": 0.5262058973312378, "learning_rate": 3.399025209824852e-07, "loss": 0.01281100045889616, "memory(GiB)": 22.66, "step": 27331, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.8878926680310562, "grad_norm": 0.3689258396625519, "learning_rate": 3.397078792791492e-07, "loss": 0.014993023127317429, "memory(GiB)": 22.66, "step": 27332, "token_acc": 1.0, "train_speed(iter/s)": 0.957024 }, { "epoch": 0.8879251534938115, "grad_norm": 0.2721691131591797, "learning_rate": 3.3951329136178765e-07, "loss": 0.011318757198750973, "memory(GiB)": 22.66, "step": 27333, "token_acc": 1.0, "train_speed(iter/s)": 0.957029 }, { "epoch": 0.887957638956567, "grad_norm": 0.30976971983909607, "learning_rate": 3.3931875723264505e-07, "loss": 0.01024786289781332, "memory(GiB)": 22.66, "step": 27334, "token_acc": 1.0, "train_speed(iter/s)": 0.957035 }, { "epoch": 0.8879901244193223, "grad_norm": 0.38375332951545715, "learning_rate": 3.3912427689396734e-07, "loss": 0.012419730424880981, "memory(GiB)": 22.66, "step": 27335, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957041 }, { "epoch": 0.8880226098820778, "grad_norm": 0.4329615533351898, "learning_rate": 3.389298503479993e-07, "loss": 0.014301026239991188, "memory(GiB)": 22.66, "step": 27336, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.8880550953448332, "grad_norm": 0.24927864968776703, "learning_rate": 3.387354775969853e-07, "loss": 0.00858233217149973, "memory(GiB)": 22.66, "step": 27337, "token_acc": 1.0, "train_speed(iter/s)": 0.957052 }, { "epoch": 0.8880875808075887, "grad_norm": 0.39713820815086365, "learning_rate": 3.3854115864316697e-07, "loss": 0.013881603255867958, "memory(GiB)": 22.66, "step": 27338, "token_acc": 1.0, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.888120066270344, "grad_norm": 0.20682275295257568, "learning_rate": 3.383468934887885e-07, "loss": 0.0071130553260445595, "memory(GiB)": 22.66, "step": 27339, "token_acc": 1.0, "train_speed(iter/s)": 0.957066 }, { "epoch": 0.8881525517330995, "grad_norm": 0.38991639018058777, "learning_rate": 3.3815268213609144e-07, "loss": 0.0124615877866745, "memory(GiB)": 22.66, "step": 27340, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.8881850371958548, "grad_norm": 0.305255651473999, "learning_rate": 3.379585245873174e-07, "loss": 0.011639246717095375, "memory(GiB)": 22.66, "step": 27341, "token_acc": 1.0, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.8882175226586103, "grad_norm": 0.31589993834495544, "learning_rate": 3.377644208447073e-07, "loss": 0.010417807847261429, "memory(GiB)": 22.66, "step": 27342, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957086 }, { "epoch": 0.8882500081213657, "grad_norm": 0.252915620803833, "learning_rate": 3.3757037091050106e-07, "loss": 0.008294297382235527, "memory(GiB)": 22.66, "step": 27343, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.8882824935841211, "grad_norm": 0.3298925459384918, "learning_rate": 3.373763747869385e-07, "loss": 0.010866424068808556, "memory(GiB)": 22.66, "step": 27344, "token_acc": 1.0, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.8883149790468765, "grad_norm": 0.4714685380458832, "learning_rate": 3.371824324762579e-07, "loss": 0.012265059165656567, "memory(GiB)": 22.66, "step": 27345, "token_acc": 1.0, "train_speed(iter/s)": 0.957104 }, { "epoch": 0.888347464509632, "grad_norm": 0.33355334401130676, "learning_rate": 3.369885439806997e-07, "loss": 0.009422853589057922, "memory(GiB)": 22.66, "step": 27346, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95711 }, { "epoch": 0.8883799499723873, "grad_norm": 0.41126707196235657, "learning_rate": 3.367947093024987e-07, "loss": 0.016783924773335457, "memory(GiB)": 22.66, "step": 27347, "token_acc": 1.0, "train_speed(iter/s)": 0.957116 }, { "epoch": 0.8884124354351428, "grad_norm": 0.3353872001171112, "learning_rate": 3.366009284438937e-07, "loss": 0.008947241120040417, "memory(GiB)": 22.66, "step": 27348, "token_acc": 1.0, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.8884449208978982, "grad_norm": 0.4838692247867584, "learning_rate": 3.3640720140712124e-07, "loss": 0.02000688761472702, "memory(GiB)": 22.66, "step": 27349, "token_acc": 1.0, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.8884774063606536, "grad_norm": 0.3471231162548065, "learning_rate": 3.362135281944179e-07, "loss": 0.016897030174732208, "memory(GiB)": 22.66, "step": 27350, "token_acc": 0.9826839826839827, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.888509891823409, "grad_norm": 0.3850303292274475, "learning_rate": 3.360199088080163e-07, "loss": 0.011135208420455456, "memory(GiB)": 22.66, "step": 27351, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957138 }, { "epoch": 0.8885423772861645, "grad_norm": 0.2915625274181366, "learning_rate": 3.3582634325015406e-07, "loss": 0.013571365736424923, "memory(GiB)": 22.66, "step": 27352, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.8885748627489198, "grad_norm": 0.4618780016899109, "learning_rate": 3.3563283152306227e-07, "loss": 0.01269657351076603, "memory(GiB)": 22.66, "step": 27353, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.8886073482116753, "grad_norm": 0.4004683792591095, "learning_rate": 3.3543937362897627e-07, "loss": 0.009986750781536102, "memory(GiB)": 22.66, "step": 27354, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.8886398336744307, "grad_norm": 0.3260456323623657, "learning_rate": 3.3524596957012933e-07, "loss": 0.012919098138809204, "memory(GiB)": 22.66, "step": 27355, "token_acc": 1.0, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.8886723191371861, "grad_norm": 0.2670954465866089, "learning_rate": 3.350526193487524e-07, "loss": 0.006902705412358046, "memory(GiB)": 22.66, "step": 27356, "token_acc": 1.0, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.8887048045999415, "grad_norm": 0.3867083787918091, "learning_rate": 3.348593229670777e-07, "loss": 0.012126191519200802, "memory(GiB)": 22.66, "step": 27357, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.888737290062697, "grad_norm": 0.30618011951446533, "learning_rate": 3.346660804273338e-07, "loss": 0.01178659126162529, "memory(GiB)": 22.66, "step": 27358, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.8887697755254523, "grad_norm": 0.2766020596027374, "learning_rate": 3.344728917317547e-07, "loss": 0.008372703567147255, "memory(GiB)": 22.66, "step": 27359, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957184 }, { "epoch": 0.8888022609882078, "grad_norm": 0.3017503023147583, "learning_rate": 3.3427975688256785e-07, "loss": 0.012506313621997833, "memory(GiB)": 22.66, "step": 27360, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.957191 }, { "epoch": 0.8888347464509632, "grad_norm": 0.42866164445877075, "learning_rate": 3.340866758820027e-07, "loss": 0.011535512283444405, "memory(GiB)": 22.66, "step": 27361, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957198 }, { "epoch": 0.8888672319137186, "grad_norm": 0.41014111042022705, "learning_rate": 3.338936487322875e-07, "loss": 0.01121248584240675, "memory(GiB)": 22.66, "step": 27362, "token_acc": 1.0, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.888899717376474, "grad_norm": 0.31535184383392334, "learning_rate": 3.337006754356498e-07, "loss": 0.014606079086661339, "memory(GiB)": 22.66, "step": 27363, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957212 }, { "epoch": 0.8889322028392295, "grad_norm": 0.3742661774158478, "learning_rate": 3.335077559943173e-07, "loss": 0.011796273291110992, "memory(GiB)": 22.66, "step": 27364, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95722 }, { "epoch": 0.8889646883019848, "grad_norm": 0.3690411150455475, "learning_rate": 3.3331489041051713e-07, "loss": 0.012287288904190063, "memory(GiB)": 22.66, "step": 27365, "token_acc": 0.9798387096774194, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.8889971737647403, "grad_norm": 0.40188395977020264, "learning_rate": 3.331220786864736e-07, "loss": 0.014400919899344444, "memory(GiB)": 22.66, "step": 27366, "token_acc": 0.996, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.8890296592274957, "grad_norm": 0.2802322506904602, "learning_rate": 3.3292932082441275e-07, "loss": 0.010420422069728374, "memory(GiB)": 22.66, "step": 27367, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.957242 }, { "epoch": 0.8890621446902511, "grad_norm": 0.357338011264801, "learning_rate": 3.3273661682656e-07, "loss": 0.012157079763710499, "memory(GiB)": 22.66, "step": 27368, "token_acc": 1.0, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.8890946301530065, "grad_norm": 0.30804964900016785, "learning_rate": 3.32543966695138e-07, "loss": 0.00951780378818512, "memory(GiB)": 22.66, "step": 27369, "token_acc": 1.0, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.889127115615762, "grad_norm": 0.3913327157497406, "learning_rate": 3.323513704323722e-07, "loss": 0.0199184101074934, "memory(GiB)": 22.66, "step": 27370, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.8891596010785173, "grad_norm": 0.40638643503189087, "learning_rate": 3.321588280404836e-07, "loss": 0.010543102398514748, "memory(GiB)": 22.66, "step": 27371, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957272 }, { "epoch": 0.8891920865412728, "grad_norm": 0.37890204787254333, "learning_rate": 3.319663395216949e-07, "loss": 0.01401458028703928, "memory(GiB)": 22.66, "step": 27372, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957279 }, { "epoch": 0.8892245720040282, "grad_norm": 0.27871936559677124, "learning_rate": 3.3177390487822757e-07, "loss": 0.01538256835192442, "memory(GiB)": 22.66, "step": 27373, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.8892570574667836, "grad_norm": 0.769547164440155, "learning_rate": 3.3158152411230326e-07, "loss": 0.020921306684613228, "memory(GiB)": 22.66, "step": 27374, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957294 }, { "epoch": 0.889289542929539, "grad_norm": 0.350540429353714, "learning_rate": 3.313891972261418e-07, "loss": 0.0087884571403265, "memory(GiB)": 22.66, "step": 27375, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957301 }, { "epoch": 0.8893220283922945, "grad_norm": 0.3785600960254669, "learning_rate": 3.311969242219626e-07, "loss": 0.009318930096924305, "memory(GiB)": 22.66, "step": 27376, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.8893545138550498, "grad_norm": 0.4517814815044403, "learning_rate": 3.310047051019849e-07, "loss": 0.014407849870622158, "memory(GiB)": 22.66, "step": 27377, "token_acc": 1.0, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.8893869993178053, "grad_norm": 0.4355950951576233, "learning_rate": 3.308125398684281e-07, "loss": 0.011047600768506527, "memory(GiB)": 22.66, "step": 27378, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957319 }, { "epoch": 0.8894194847805607, "grad_norm": 0.3091781735420227, "learning_rate": 3.306204285235087e-07, "loss": 0.011480177752673626, "memory(GiB)": 22.66, "step": 27379, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.8894519702433161, "grad_norm": 0.38165315985679626, "learning_rate": 3.3042837106944505e-07, "loss": 0.017052534967660904, "memory(GiB)": 22.66, "step": 27380, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957331 }, { "epoch": 0.8894844557060715, "grad_norm": 0.42590004205703735, "learning_rate": 3.302363675084519e-07, "loss": 0.015382744371891022, "memory(GiB)": 22.66, "step": 27381, "token_acc": 0.9934640522875817, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.889516941168827, "grad_norm": 0.2175307422876358, "learning_rate": 3.3004441784274753e-07, "loss": 0.007346725091338158, "memory(GiB)": 22.66, "step": 27382, "token_acc": 1.0, "train_speed(iter/s)": 0.957343 }, { "epoch": 0.8895494266315823, "grad_norm": 0.358314573764801, "learning_rate": 3.298525220745463e-07, "loss": 0.010044044815003872, "memory(GiB)": 22.66, "step": 27383, "token_acc": 1.0, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.8895819120943378, "grad_norm": 0.27397677302360535, "learning_rate": 3.2966068020606314e-07, "loss": 0.011123632080852985, "memory(GiB)": 22.66, "step": 27384, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957355 }, { "epoch": 0.8896143975570932, "grad_norm": 0.3328820765018463, "learning_rate": 3.2946889223951117e-07, "loss": 0.014787509106099606, "memory(GiB)": 22.66, "step": 27385, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.95736 }, { "epoch": 0.8896468830198486, "grad_norm": 0.30764588713645935, "learning_rate": 3.292771581771043e-07, "loss": 0.0076962606981396675, "memory(GiB)": 22.66, "step": 27386, "token_acc": 1.0, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.889679368482604, "grad_norm": 0.376322478055954, "learning_rate": 3.290854780210567e-07, "loss": 0.010667931288480759, "memory(GiB)": 22.66, "step": 27387, "token_acc": 0.995, "train_speed(iter/s)": 0.95737 }, { "epoch": 0.8897118539453595, "grad_norm": 0.2481212615966797, "learning_rate": 3.28893851773579e-07, "loss": 0.008415533229708672, "memory(GiB)": 22.66, "step": 27388, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957374 }, { "epoch": 0.8897443394081148, "grad_norm": 0.35299867391586304, "learning_rate": 3.287022794368844e-07, "loss": 0.010271516628563404, "memory(GiB)": 22.66, "step": 27389, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.8897768248708703, "grad_norm": 0.2820168137550354, "learning_rate": 3.2851076101318214e-07, "loss": 0.010227739810943604, "memory(GiB)": 22.66, "step": 27390, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957384 }, { "epoch": 0.8898093103336256, "grad_norm": 0.5041965246200562, "learning_rate": 3.2831929650468383e-07, "loss": 0.012851606123149395, "memory(GiB)": 22.66, "step": 27391, "token_acc": 1.0, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.8898417957963811, "grad_norm": 0.43023887276649475, "learning_rate": 3.281278859135983e-07, "loss": 0.018004240468144417, "memory(GiB)": 22.66, "step": 27392, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.8898742812591366, "grad_norm": 0.4019818902015686, "learning_rate": 3.279365292421366e-07, "loss": 0.013138974085450172, "memory(GiB)": 22.66, "step": 27393, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.889906766721892, "grad_norm": 0.7074756622314453, "learning_rate": 3.277452264925046e-07, "loss": 0.011082051321864128, "memory(GiB)": 22.66, "step": 27394, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.8899392521846474, "grad_norm": 0.35005781054496765, "learning_rate": 3.275539776669112e-07, "loss": 0.010961461812257767, "memory(GiB)": 22.66, "step": 27395, "token_acc": 1.0, "train_speed(iter/s)": 0.957413 }, { "epoch": 0.8899717376474028, "grad_norm": 0.3184170722961426, "learning_rate": 3.273627827675646e-07, "loss": 0.009721457958221436, "memory(GiB)": 22.66, "step": 27396, "token_acc": 1.0, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.8900042231101583, "grad_norm": 0.38616371154785156, "learning_rate": 3.271716417966708e-07, "loss": 0.014305498450994492, "memory(GiB)": 22.66, "step": 27397, "token_acc": 1.0, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.8900367085729136, "grad_norm": 0.27837416529655457, "learning_rate": 3.269805547564353e-07, "loss": 0.012162200175225735, "memory(GiB)": 22.66, "step": 27398, "token_acc": 0.9869565217391304, "train_speed(iter/s)": 0.95743 }, { "epoch": 0.8900691940356691, "grad_norm": 0.32191842794418335, "learning_rate": 3.267895216490641e-07, "loss": 0.013695644214749336, "memory(GiB)": 22.66, "step": 27399, "token_acc": 1.0, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.8901016794984244, "grad_norm": 0.4885897636413574, "learning_rate": 3.2659854247676205e-07, "loss": 0.013815789483487606, "memory(GiB)": 22.66, "step": 27400, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957441 }, { "epoch": 0.8901341649611799, "grad_norm": 0.26136794686317444, "learning_rate": 3.264076172417324e-07, "loss": 0.014828702434897423, "memory(GiB)": 22.66, "step": 27401, "token_acc": 1.0, "train_speed(iter/s)": 0.957446 }, { "epoch": 0.8901666504239353, "grad_norm": 0.4081391394138336, "learning_rate": 3.262167459461807e-07, "loss": 0.01751607283949852, "memory(GiB)": 22.66, "step": 27402, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.957452 }, { "epoch": 0.8901991358866908, "grad_norm": 0.23178458213806152, "learning_rate": 3.2602592859230787e-07, "loss": 0.008611798286437988, "memory(GiB)": 22.66, "step": 27403, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957457 }, { "epoch": 0.8902316213494461, "grad_norm": 0.43322131037712097, "learning_rate": 3.2583516518231664e-07, "loss": 0.01742362231016159, "memory(GiB)": 22.66, "step": 27404, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957462 }, { "epoch": 0.8902641068122016, "grad_norm": 0.3550926446914673, "learning_rate": 3.256444557184091e-07, "loss": 0.011030597612261772, "memory(GiB)": 22.66, "step": 27405, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.8902965922749569, "grad_norm": 0.2164900302886963, "learning_rate": 3.2545380020278626e-07, "loss": 0.005947712808847427, "memory(GiB)": 22.66, "step": 27406, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957474 }, { "epoch": 0.8903290777377124, "grad_norm": 0.3496401309967041, "learning_rate": 3.252631986376481e-07, "loss": 0.010763334110379219, "memory(GiB)": 22.66, "step": 27407, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.8903615632004678, "grad_norm": 0.2982942759990692, "learning_rate": 3.250726510251956e-07, "loss": 0.009037786163389683, "memory(GiB)": 22.66, "step": 27408, "token_acc": 1.0, "train_speed(iter/s)": 0.957485 }, { "epoch": 0.8903940486632232, "grad_norm": 0.3220828175544739, "learning_rate": 3.248821573676253e-07, "loss": 0.008465159684419632, "memory(GiB)": 22.66, "step": 27409, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957491 }, { "epoch": 0.8904265341259786, "grad_norm": 0.396762877702713, "learning_rate": 3.2469171766713883e-07, "loss": 0.013718256726861, "memory(GiB)": 22.66, "step": 27410, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.8904590195887341, "grad_norm": 0.14627589285373688, "learning_rate": 3.2450133192593215e-07, "loss": 0.004324242006987333, "memory(GiB)": 22.66, "step": 27411, "token_acc": 1.0, "train_speed(iter/s)": 0.957502 }, { "epoch": 0.8904915050514894, "grad_norm": 0.2811845541000366, "learning_rate": 3.243110001462041e-07, "loss": 0.01097598485648632, "memory(GiB)": 22.66, "step": 27412, "token_acc": 1.0, "train_speed(iter/s)": 0.957507 }, { "epoch": 0.8905239905142449, "grad_norm": 0.28177720308303833, "learning_rate": 3.2412072233014956e-07, "loss": 0.008863412775099277, "memory(GiB)": 22.66, "step": 27413, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957511 }, { "epoch": 0.8905564759770003, "grad_norm": 0.45732346177101135, "learning_rate": 3.2393049847996515e-07, "loss": 0.014326242730021477, "memory(GiB)": 22.66, "step": 27414, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957517 }, { "epoch": 0.8905889614397557, "grad_norm": 0.2912505567073822, "learning_rate": 3.2374032859784685e-07, "loss": 0.008665235713124275, "memory(GiB)": 22.66, "step": 27415, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957523 }, { "epoch": 0.8906214469025111, "grad_norm": 0.3077224791049957, "learning_rate": 3.2355021268598963e-07, "loss": 0.012590640224516392, "memory(GiB)": 22.66, "step": 27416, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.8906539323652666, "grad_norm": 1.372817039489746, "learning_rate": 3.233601507465878e-07, "loss": 0.015786172822117805, "memory(GiB)": 22.66, "step": 27417, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.8906864178280219, "grad_norm": 0.4781409502029419, "learning_rate": 3.2317014278183347e-07, "loss": 0.021046999841928482, "memory(GiB)": 22.66, "step": 27418, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.957541 }, { "epoch": 0.8907189032907774, "grad_norm": 0.37552523612976074, "learning_rate": 3.229801887939205e-07, "loss": 0.013550017029047012, "memory(GiB)": 22.66, "step": 27419, "token_acc": 1.0, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.8907513887535328, "grad_norm": 0.22487202286720276, "learning_rate": 3.227902887850415e-07, "loss": 0.008063048124313354, "memory(GiB)": 22.66, "step": 27420, "token_acc": 1.0, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.8907838742162882, "grad_norm": 0.2854154407978058, "learning_rate": 3.2260044275738865e-07, "loss": 0.007944010198116302, "memory(GiB)": 22.66, "step": 27421, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957561 }, { "epoch": 0.8908163596790436, "grad_norm": 3.3704922199249268, "learning_rate": 3.2241065071315136e-07, "loss": 0.010511863976716995, "memory(GiB)": 22.66, "step": 27422, "token_acc": 1.0, "train_speed(iter/s)": 0.957568 }, { "epoch": 0.8908488451417991, "grad_norm": 0.18096362054347992, "learning_rate": 3.2222091265452116e-07, "loss": 0.005727922078222036, "memory(GiB)": 22.66, "step": 27423, "token_acc": 0.996, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.8908813306045544, "grad_norm": 0.31978461146354675, "learning_rate": 3.220312285836874e-07, "loss": 0.009084992110729218, "memory(GiB)": 22.66, "step": 27424, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957583 }, { "epoch": 0.8909138160673099, "grad_norm": 0.27169185876846313, "learning_rate": 3.218415985028406e-07, "loss": 0.011282533407211304, "memory(GiB)": 22.66, "step": 27425, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.8909463015300653, "grad_norm": 0.38430893421173096, "learning_rate": 3.216520224141678e-07, "loss": 0.009626632556319237, "memory(GiB)": 22.66, "step": 27426, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.8909787869928207, "grad_norm": 0.32997259497642517, "learning_rate": 3.2146250031985736e-07, "loss": 0.011116905137896538, "memory(GiB)": 22.66, "step": 27427, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.8910112724555761, "grad_norm": 0.3361305594444275, "learning_rate": 3.212730322220969e-07, "loss": 0.016012603417038918, "memory(GiB)": 22.66, "step": 27428, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.8910437579183316, "grad_norm": 0.27652421593666077, "learning_rate": 3.2108361812307363e-07, "loss": 0.010036997497081757, "memory(GiB)": 22.66, "step": 27429, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.8910762433810869, "grad_norm": 0.3412593603134155, "learning_rate": 3.2089425802497243e-07, "loss": 0.01779702678322792, "memory(GiB)": 22.66, "step": 27430, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957628 }, { "epoch": 0.8911087288438424, "grad_norm": 0.39195558428764343, "learning_rate": 3.207049519299793e-07, "loss": 0.020177491009235382, "memory(GiB)": 22.66, "step": 27431, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.8911412143065978, "grad_norm": 0.3077807128429413, "learning_rate": 3.205156998402792e-07, "loss": 0.010982269421219826, "memory(GiB)": 22.66, "step": 27432, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.8911736997693532, "grad_norm": 0.2854413092136383, "learning_rate": 3.2032650175805645e-07, "loss": 0.009413428604602814, "memory(GiB)": 22.66, "step": 27433, "token_acc": 1.0, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.8912061852321086, "grad_norm": 0.3153984546661377, "learning_rate": 3.201373576854949e-07, "loss": 0.009383067488670349, "memory(GiB)": 22.66, "step": 27434, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.8912386706948641, "grad_norm": 0.31382641196250916, "learning_rate": 3.199482676247767e-07, "loss": 0.009834470227360725, "memory(GiB)": 22.66, "step": 27435, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.8912711561576194, "grad_norm": 0.3389790952205658, "learning_rate": 3.1975923157808507e-07, "loss": 0.011719192378222942, "memory(GiB)": 22.66, "step": 27436, "token_acc": 0.996, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.8913036416203749, "grad_norm": 0.36506590247154236, "learning_rate": 3.1957024954759997e-07, "loss": 0.014154075644910336, "memory(GiB)": 22.66, "step": 27437, "token_acc": 1.0, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.8913361270831303, "grad_norm": 0.3221427798271179, "learning_rate": 3.1938132153550515e-07, "loss": 0.01010420173406601, "memory(GiB)": 22.66, "step": 27438, "token_acc": 1.0, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.8913686125458857, "grad_norm": 0.17775258421897888, "learning_rate": 3.1919244754397893e-07, "loss": 0.00482090562582016, "memory(GiB)": 22.66, "step": 27439, "token_acc": 1.0, "train_speed(iter/s)": 0.957693 }, { "epoch": 0.8914010980086411, "grad_norm": 0.2976265251636505, "learning_rate": 3.190036275752034e-07, "loss": 0.013916819356381893, "memory(GiB)": 22.66, "step": 27440, "token_acc": 1.0, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.8914335834713966, "grad_norm": 0.2788204848766327, "learning_rate": 3.188148616313552e-07, "loss": 0.01042284443974495, "memory(GiB)": 22.66, "step": 27441, "token_acc": 1.0, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.8914660689341519, "grad_norm": 0.38360416889190674, "learning_rate": 3.1862614971461427e-07, "loss": 0.013178876601159573, "memory(GiB)": 22.66, "step": 27442, "token_acc": 1.0, "train_speed(iter/s)": 0.957712 }, { "epoch": 0.8914985543969074, "grad_norm": 0.23393601179122925, "learning_rate": 3.184374918271582e-07, "loss": 0.005286103580147028, "memory(GiB)": 22.66, "step": 27443, "token_acc": 1.0, "train_speed(iter/s)": 0.957718 }, { "epoch": 0.8915310398596628, "grad_norm": 0.197398379445076, "learning_rate": 3.182488879711654e-07, "loss": 0.008280857466161251, "memory(GiB)": 22.66, "step": 27444, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.8915635253224182, "grad_norm": 0.3461446762084961, "learning_rate": 3.1806033814881123e-07, "loss": 0.011820069514214993, "memory(GiB)": 22.66, "step": 27445, "token_acc": 1.0, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.8915960107851736, "grad_norm": 0.28823575377464294, "learning_rate": 3.178718423622723e-07, "loss": 0.010428277775645256, "memory(GiB)": 22.66, "step": 27446, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957736 }, { "epoch": 0.8916284962479291, "grad_norm": 0.41771671175956726, "learning_rate": 3.1768340061372473e-07, "loss": 0.011204364709556103, "memory(GiB)": 22.66, "step": 27447, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957742 }, { "epoch": 0.8916609817106844, "grad_norm": 0.24805893003940582, "learning_rate": 3.1749501290534226e-07, "loss": 0.007272415328770876, "memory(GiB)": 22.66, "step": 27448, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.8916934671734399, "grad_norm": 0.4679759442806244, "learning_rate": 3.1730667923930036e-07, "loss": 0.01369614526629448, "memory(GiB)": 22.66, "step": 27449, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957754 }, { "epoch": 0.8917259526361953, "grad_norm": 0.3918132483959198, "learning_rate": 3.171183996177718e-07, "loss": 0.01156992744654417, "memory(GiB)": 22.66, "step": 27450, "token_acc": 1.0, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.8917584380989507, "grad_norm": 0.26453229784965515, "learning_rate": 3.169301740429298e-07, "loss": 0.009313107468187809, "memory(GiB)": 22.66, "step": 27451, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.8917909235617061, "grad_norm": 0.4603445827960968, "learning_rate": 3.167420025169465e-07, "loss": 0.011457785032689571, "memory(GiB)": 22.66, "step": 27452, "token_acc": 1.0, "train_speed(iter/s)": 0.957771 }, { "epoch": 0.8918234090244616, "grad_norm": 0.3038943409919739, "learning_rate": 3.165538850419947e-07, "loss": 0.01677566207945347, "memory(GiB)": 22.66, "step": 27453, "token_acc": 1.0, "train_speed(iter/s)": 0.957778 }, { "epoch": 0.8918558944872169, "grad_norm": 0.33973243832588196, "learning_rate": 3.1636582162024364e-07, "loss": 0.007876346819102764, "memory(GiB)": 22.66, "step": 27454, "token_acc": 1.0, "train_speed(iter/s)": 0.957783 }, { "epoch": 0.8918883799499724, "grad_norm": 0.3596256971359253, "learning_rate": 3.161778122538656e-07, "loss": 0.010827631689608097, "memory(GiB)": 22.66, "step": 27455, "token_acc": 1.0, "train_speed(iter/s)": 0.95779 }, { "epoch": 0.8919208654127279, "grad_norm": 0.3740827143192291, "learning_rate": 3.159898569450293e-07, "loss": 0.011002304963767529, "memory(GiB)": 22.66, "step": 27456, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957796 }, { "epoch": 0.8919533508754832, "grad_norm": 0.34384578466415405, "learning_rate": 3.1580195569590523e-07, "loss": 0.011451237834990025, "memory(GiB)": 22.66, "step": 27457, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.8919858363382387, "grad_norm": 0.43230652809143066, "learning_rate": 3.156141085086606e-07, "loss": 0.015730496495962143, "memory(GiB)": 22.66, "step": 27458, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957807 }, { "epoch": 0.892018321800994, "grad_norm": 0.31295472383499146, "learning_rate": 3.154263153854648e-07, "loss": 0.008871686644852161, "memory(GiB)": 22.66, "step": 27459, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.8920508072637495, "grad_norm": 0.7271183729171753, "learning_rate": 3.1523857632848264e-07, "loss": 0.011641789227724075, "memory(GiB)": 22.66, "step": 27460, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.957816 }, { "epoch": 0.8920832927265049, "grad_norm": 0.2937883138656616, "learning_rate": 3.1505089133988474e-07, "loss": 0.008791817352175713, "memory(GiB)": 22.66, "step": 27461, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957821 }, { "epoch": 0.8921157781892604, "grad_norm": 0.4940241575241089, "learning_rate": 3.1486326042183434e-07, "loss": 0.015407416969537735, "memory(GiB)": 22.66, "step": 27462, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.8921482636520157, "grad_norm": 0.5079026818275452, "learning_rate": 3.1467568357649747e-07, "loss": 0.013103331439197063, "memory(GiB)": 22.66, "step": 27463, "token_acc": 1.0, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.8921807491147712, "grad_norm": 0.4009227454662323, "learning_rate": 3.1448816080604015e-07, "loss": 0.012831537052989006, "memory(GiB)": 22.66, "step": 27464, "token_acc": 1.0, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.8922132345775265, "grad_norm": 0.2667655348777771, "learning_rate": 3.143006921126246e-07, "loss": 0.0063146427273750305, "memory(GiB)": 22.66, "step": 27465, "token_acc": 0.996, "train_speed(iter/s)": 0.957843 }, { "epoch": 0.892245720040282, "grad_norm": 0.36426103115081787, "learning_rate": 3.1411327749841735e-07, "loss": 0.012203785590827465, "memory(GiB)": 22.66, "step": 27466, "token_acc": 0.992, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.8922782055030374, "grad_norm": 0.2776718735694885, "learning_rate": 3.1392591696557893e-07, "loss": 0.012184884399175644, "memory(GiB)": 22.66, "step": 27467, "token_acc": 0.9924242424242424, "train_speed(iter/s)": 0.957854 }, { "epoch": 0.8923106909657929, "grad_norm": 0.28330618143081665, "learning_rate": 3.137386105162732e-07, "loss": 0.010102648288011551, "memory(GiB)": 22.66, "step": 27468, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.8923431764285482, "grad_norm": 0.3722223937511444, "learning_rate": 3.135513581526606e-07, "loss": 0.011913884431123734, "memory(GiB)": 22.66, "step": 27469, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957864 }, { "epoch": 0.8923756618913037, "grad_norm": 0.32836291193962097, "learning_rate": 3.133641598769027e-07, "loss": 0.015578651800751686, "memory(GiB)": 22.66, "step": 27470, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.892408147354059, "grad_norm": 0.21258148550987244, "learning_rate": 3.1317701569116065e-07, "loss": 0.008105692453682423, "memory(GiB)": 22.66, "step": 27471, "token_acc": 1.0, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.8924406328168145, "grad_norm": 0.35713663697242737, "learning_rate": 3.1298992559759486e-07, "loss": 0.011130350641906261, "memory(GiB)": 22.66, "step": 27472, "token_acc": 1.0, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.8924731182795699, "grad_norm": 0.15928791463375092, "learning_rate": 3.128028895983626e-07, "loss": 0.009392107836902142, "memory(GiB)": 22.66, "step": 27473, "token_acc": 1.0, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.8925056037423253, "grad_norm": 0.35622110962867737, "learning_rate": 3.1261590769562367e-07, "loss": 0.014220993965864182, "memory(GiB)": 22.66, "step": 27474, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957892 }, { "epoch": 0.8925380892050807, "grad_norm": 0.45186474919319153, "learning_rate": 3.1242897989153643e-07, "loss": 0.01586339995265007, "memory(GiB)": 22.66, "step": 27475, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957898 }, { "epoch": 0.8925705746678362, "grad_norm": 0.29922905564308167, "learning_rate": 3.122421061882586e-07, "loss": 0.01095054391771555, "memory(GiB)": 22.66, "step": 27476, "token_acc": 1.0, "train_speed(iter/s)": 0.957903 }, { "epoch": 0.8926030601305915, "grad_norm": 0.2941409647464752, "learning_rate": 3.1205528658794505e-07, "loss": 0.008731620386242867, "memory(GiB)": 22.66, "step": 27477, "token_acc": 1.0, "train_speed(iter/s)": 0.957908 }, { "epoch": 0.892635545593347, "grad_norm": 0.40417027473449707, "learning_rate": 3.118685210927536e-07, "loss": 0.010782115161418915, "memory(GiB)": 22.66, "step": 27478, "token_acc": 1.0, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.8926680310561024, "grad_norm": 0.42089855670928955, "learning_rate": 3.116818097048391e-07, "loss": 0.016255415976047516, "memory(GiB)": 22.66, "step": 27479, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.8927005165188578, "grad_norm": 0.3974387049674988, "learning_rate": 3.114951524263565e-07, "loss": 0.012751287780702114, "memory(GiB)": 22.66, "step": 27480, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.8927330019816132, "grad_norm": 0.2733834385871887, "learning_rate": 3.113085492594614e-07, "loss": 0.009799407795071602, "memory(GiB)": 22.66, "step": 27481, "token_acc": 1.0, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.8927654874443687, "grad_norm": 0.3758440315723419, "learning_rate": 3.1112200020630535e-07, "loss": 0.011460397392511368, "memory(GiB)": 22.66, "step": 27482, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957944 }, { "epoch": 0.892797972907124, "grad_norm": 0.42618557810783386, "learning_rate": 3.109355052690421e-07, "loss": 0.015744123607873917, "memory(GiB)": 22.66, "step": 27483, "token_acc": 1.0, "train_speed(iter/s)": 0.957952 }, { "epoch": 0.8928304583698795, "grad_norm": 0.345592737197876, "learning_rate": 3.10749064449824e-07, "loss": 0.00803765282034874, "memory(GiB)": 22.66, "step": 27484, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957959 }, { "epoch": 0.8928629438326349, "grad_norm": 0.3289797902107239, "learning_rate": 3.1056267775080473e-07, "loss": 0.009824644774198532, "memory(GiB)": 22.66, "step": 27485, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.8928954292953903, "grad_norm": 0.3286782205104828, "learning_rate": 3.1037634517413206e-07, "loss": 0.010445061139762402, "memory(GiB)": 22.66, "step": 27486, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.8929279147581457, "grad_norm": 0.2473898082971573, "learning_rate": 3.101900667219598e-07, "loss": 0.01037411019206047, "memory(GiB)": 22.66, "step": 27487, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.957982 }, { "epoch": 0.8929604002209012, "grad_norm": 0.34354785084724426, "learning_rate": 3.100038423964341e-07, "loss": 0.010613636113703251, "memory(GiB)": 22.66, "step": 27488, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.8929928856836565, "grad_norm": 0.4031008780002594, "learning_rate": 3.0981767219970816e-07, "loss": 0.016608325764536858, "memory(GiB)": 22.66, "step": 27489, "token_acc": 0.98046875, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.893025371146412, "grad_norm": 0.3863926827907562, "learning_rate": 3.096315561339286e-07, "loss": 0.012628817930817604, "memory(GiB)": 22.66, "step": 27490, "token_acc": 1.0, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.8930578566091674, "grad_norm": 0.24152925610542297, "learning_rate": 3.0944549420124425e-07, "loss": 0.007617554627358913, "memory(GiB)": 22.66, "step": 27491, "token_acc": 1.0, "train_speed(iter/s)": 0.958012 }, { "epoch": 0.8930903420719228, "grad_norm": 0.3622128963470459, "learning_rate": 3.092594864038012e-07, "loss": 0.007424910552799702, "memory(GiB)": 22.66, "step": 27492, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958019 }, { "epoch": 0.8931228275346782, "grad_norm": 0.2852507531642914, "learning_rate": 3.0907353274374607e-07, "loss": 0.008624931797385216, "memory(GiB)": 22.66, "step": 27493, "token_acc": 1.0, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.8931553129974337, "grad_norm": 0.299851655960083, "learning_rate": 3.088876332232277e-07, "loss": 0.007081357296556234, "memory(GiB)": 22.66, "step": 27494, "token_acc": 1.0, "train_speed(iter/s)": 0.958033 }, { "epoch": 0.893187798460189, "grad_norm": 0.27637675404548645, "learning_rate": 3.0870178784438875e-07, "loss": 0.007796037010848522, "memory(GiB)": 22.66, "step": 27495, "token_acc": 1.0, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.8932202839229445, "grad_norm": 0.2748003304004669, "learning_rate": 3.0851599660937646e-07, "loss": 0.009792040102183819, "memory(GiB)": 22.66, "step": 27496, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.958048 }, { "epoch": 0.8932527693856999, "grad_norm": 0.29281163215637207, "learning_rate": 3.08330259520333e-07, "loss": 0.00992206297814846, "memory(GiB)": 22.66, "step": 27497, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.958056 }, { "epoch": 0.8932852548484553, "grad_norm": 0.44175103306770325, "learning_rate": 3.0814457657940334e-07, "loss": 0.008787223137915134, "memory(GiB)": 22.66, "step": 27498, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958064 }, { "epoch": 0.8933177403112107, "grad_norm": 0.38112014532089233, "learning_rate": 3.079589477887296e-07, "loss": 0.010065000504255295, "memory(GiB)": 22.66, "step": 27499, "token_acc": 0.995, "train_speed(iter/s)": 0.958071 }, { "epoch": 0.8933502257739662, "grad_norm": 0.38448160886764526, "learning_rate": 3.077733731504556e-07, "loss": 0.01620025560259819, "memory(GiB)": 22.66, "step": 27500, "token_acc": 0.9810606060606061, "train_speed(iter/s)": 0.958079 }, { "epoch": 0.8933502257739662, "eval_loss": 0.011429489590227604, "eval_runtime": 79.4839, "eval_samples_per_second": 125.183, "eval_steps_per_second": 3.913, "eval_token_acc": 0.9954008549604697, "step": 27500 }, { "epoch": 0.8933827112367215, "grad_norm": 0.3859098553657532, "learning_rate": 3.07587852666722e-07, "loss": 0.010738144628703594, "memory(GiB)": 22.66, "step": 27501, "token_acc": 0.9949584897388827, "train_speed(iter/s)": 0.955033 }, { "epoch": 0.893415196699477, "grad_norm": 0.3170775771141052, "learning_rate": 3.074023863396697e-07, "loss": 0.010793495923280716, "memory(GiB)": 22.66, "step": 27502, "token_acc": 1.0, "train_speed(iter/s)": 0.955038 }, { "epoch": 0.8934476821622324, "grad_norm": 0.5145307183265686, "learning_rate": 3.0721697417143926e-07, "loss": 0.012077229097485542, "memory(GiB)": 22.66, "step": 27503, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955042 }, { "epoch": 0.8934801676249878, "grad_norm": 0.30777138471603394, "learning_rate": 3.070316161641723e-07, "loss": 0.0071031441912055016, "memory(GiB)": 22.66, "step": 27504, "token_acc": 1.0, "train_speed(iter/s)": 0.955048 }, { "epoch": 0.8935126530877432, "grad_norm": 0.23642845451831818, "learning_rate": 3.0684631232000607e-07, "loss": 0.012623827904462814, "memory(GiB)": 22.66, "step": 27505, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955053 }, { "epoch": 0.8935451385504987, "grad_norm": 0.3659668266773224, "learning_rate": 3.0666106264107986e-07, "loss": 0.008899111300706863, "memory(GiB)": 22.66, "step": 27506, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955059 }, { "epoch": 0.893577624013254, "grad_norm": 0.32399705052375793, "learning_rate": 3.0647586712953147e-07, "loss": 0.010351584292948246, "memory(GiB)": 22.66, "step": 27507, "token_acc": 1.0, "train_speed(iter/s)": 0.955065 }, { "epoch": 0.8936101094760095, "grad_norm": 0.24926826357841492, "learning_rate": 3.062907257874997e-07, "loss": 0.008336851373314857, "memory(GiB)": 22.66, "step": 27508, "token_acc": 1.0, "train_speed(iter/s)": 0.955071 }, { "epoch": 0.8936425949387649, "grad_norm": 0.27949464321136475, "learning_rate": 3.061056386171196e-07, "loss": 0.00813797116279602, "memory(GiB)": 22.66, "step": 27509, "token_acc": 1.0, "train_speed(iter/s)": 0.955077 }, { "epoch": 0.8936750804015203, "grad_norm": 0.596858024597168, "learning_rate": 3.059206056205283e-07, "loss": 0.014819677919149399, "memory(GiB)": 22.66, "step": 27510, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955084 }, { "epoch": 0.8937075658642757, "grad_norm": 0.28675076365470886, "learning_rate": 3.057356267998607e-07, "loss": 0.007047874853014946, "memory(GiB)": 22.66, "step": 27511, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955091 }, { "epoch": 0.8937400513270312, "grad_norm": 0.2844356596469879, "learning_rate": 3.055507021572518e-07, "loss": 0.00975729338824749, "memory(GiB)": 22.66, "step": 27512, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955098 }, { "epoch": 0.8937725367897865, "grad_norm": 0.29224613308906555, "learning_rate": 3.053658316948371e-07, "loss": 0.011580363847315311, "memory(GiB)": 22.66, "step": 27513, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955105 }, { "epoch": 0.893805022252542, "grad_norm": 0.3077966272830963, "learning_rate": 3.051810154147483e-07, "loss": 0.010620207525789738, "memory(GiB)": 22.66, "step": 27514, "token_acc": 1.0, "train_speed(iter/s)": 0.955113 }, { "epoch": 0.8938375077152974, "grad_norm": 0.28438326716423035, "learning_rate": 3.049962533191203e-07, "loss": 0.010288845747709274, "memory(GiB)": 22.66, "step": 27515, "token_acc": 1.0, "train_speed(iter/s)": 0.95512 }, { "epoch": 0.8938699931780528, "grad_norm": 0.41658011078834534, "learning_rate": 3.0481154541008304e-07, "loss": 0.01443470735102892, "memory(GiB)": 22.66, "step": 27516, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955127 }, { "epoch": 0.8939024786408082, "grad_norm": 0.46556830406188965, "learning_rate": 3.04626891689771e-07, "loss": 0.010567809455096722, "memory(GiB)": 22.66, "step": 27517, "token_acc": 0.9968454258675079, "train_speed(iter/s)": 0.955134 }, { "epoch": 0.8939349641035637, "grad_norm": 0.4074091613292694, "learning_rate": 3.044422921603135e-07, "loss": 0.013674365356564522, "memory(GiB)": 22.66, "step": 27518, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955141 }, { "epoch": 0.893967449566319, "grad_norm": 0.4495393633842468, "learning_rate": 3.042577468238428e-07, "loss": 0.016140565276145935, "memory(GiB)": 22.66, "step": 27519, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955148 }, { "epoch": 0.8939999350290745, "grad_norm": 0.39256423711776733, "learning_rate": 3.0407325568248716e-07, "loss": 0.00839606486260891, "memory(GiB)": 22.66, "step": 27520, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955156 }, { "epoch": 0.89403242049183, "grad_norm": 0.2499207705259323, "learning_rate": 3.0388881873837594e-07, "loss": 0.010584680363535881, "memory(GiB)": 22.66, "step": 27521, "token_acc": 1.0, "train_speed(iter/s)": 0.955163 }, { "epoch": 0.8940649059545853, "grad_norm": 0.2714378237724304, "learning_rate": 3.037044359936381e-07, "loss": 0.008018961176276207, "memory(GiB)": 22.66, "step": 27522, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955171 }, { "epoch": 0.8940973914173408, "grad_norm": 0.42635878920555115, "learning_rate": 3.0352010745040297e-07, "loss": 0.012366036884486675, "memory(GiB)": 22.66, "step": 27523, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955177 }, { "epoch": 0.8941298768800962, "grad_norm": 0.37385112047195435, "learning_rate": 3.033358331107961e-07, "loss": 0.012546001002192497, "memory(GiB)": 22.66, "step": 27524, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955184 }, { "epoch": 0.8941623623428516, "grad_norm": 0.3697727918624878, "learning_rate": 3.031516129769446e-07, "loss": 0.010801349766552448, "memory(GiB)": 22.66, "step": 27525, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.955191 }, { "epoch": 0.894194847805607, "grad_norm": 0.3313676118850708, "learning_rate": 3.0296744705097526e-07, "loss": 0.010368824936449528, "memory(GiB)": 22.66, "step": 27526, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.955199 }, { "epoch": 0.8942273332683625, "grad_norm": 0.489532470703125, "learning_rate": 3.027833353350135e-07, "loss": 0.018247418105602264, "memory(GiB)": 22.66, "step": 27527, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955206 }, { "epoch": 0.8942598187311178, "grad_norm": 0.3142130672931671, "learning_rate": 3.0259927783118427e-07, "loss": 0.008166462182998657, "memory(GiB)": 22.66, "step": 27528, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955213 }, { "epoch": 0.8942923041938733, "grad_norm": 0.43510720133781433, "learning_rate": 3.0241527454161143e-07, "loss": 0.011606384068727493, "memory(GiB)": 22.66, "step": 27529, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.8943247896566286, "grad_norm": 0.22019162774085999, "learning_rate": 3.022313254684184e-07, "loss": 0.006450500804930925, "memory(GiB)": 22.66, "step": 27530, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955228 }, { "epoch": 0.8943572751193841, "grad_norm": 0.22012794017791748, "learning_rate": 3.0204743061372887e-07, "loss": 0.010806276462972164, "memory(GiB)": 22.66, "step": 27531, "token_acc": 1.0, "train_speed(iter/s)": 0.955235 }, { "epoch": 0.8943897605821395, "grad_norm": 0.4475368559360504, "learning_rate": 3.0186358997966515e-07, "loss": 0.009624800644814968, "memory(GiB)": 22.66, "step": 27532, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955241 }, { "epoch": 0.894422246044895, "grad_norm": 0.4576474726200104, "learning_rate": 3.0167980356834827e-07, "loss": 0.01014251820743084, "memory(GiB)": 22.66, "step": 27533, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955249 }, { "epoch": 0.8944547315076503, "grad_norm": 0.3607932925224304, "learning_rate": 3.014960713819004e-07, "loss": 0.009037344716489315, "memory(GiB)": 22.66, "step": 27534, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955257 }, { "epoch": 0.8944872169704058, "grad_norm": 0.33351874351501465, "learning_rate": 3.01312393422441e-07, "loss": 0.007360667921602726, "memory(GiB)": 22.66, "step": 27535, "token_acc": 1.0, "train_speed(iter/s)": 0.955264 }, { "epoch": 0.8945197024331611, "grad_norm": 0.2781679332256317, "learning_rate": 3.011287696920917e-07, "loss": 0.008081965148448944, "memory(GiB)": 22.66, "step": 27536, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955271 }, { "epoch": 0.8945521878959166, "grad_norm": 0.5324723124504089, "learning_rate": 3.009452001929697e-07, "loss": 0.009395770728588104, "memory(GiB)": 22.66, "step": 27537, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955279 }, { "epoch": 0.894584673358672, "grad_norm": 0.28097060322761536, "learning_rate": 3.0076168492719495e-07, "loss": 0.00856550969183445, "memory(GiB)": 22.66, "step": 27538, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.955286 }, { "epoch": 0.8946171588214274, "grad_norm": 0.6388089060783386, "learning_rate": 3.005782238968835e-07, "loss": 0.012089090421795845, "memory(GiB)": 22.66, "step": 27539, "token_acc": 1.0, "train_speed(iter/s)": 0.955293 }, { "epoch": 0.8946496442841828, "grad_norm": 0.3496410548686981, "learning_rate": 3.003948171041554e-07, "loss": 0.01240656990557909, "memory(GiB)": 22.66, "step": 27540, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.8946821297469383, "grad_norm": 0.41263678669929504, "learning_rate": 3.0021146455112726e-07, "loss": 0.015318389050662518, "memory(GiB)": 22.66, "step": 27541, "token_acc": 1.0, "train_speed(iter/s)": 0.955307 }, { "epoch": 0.8947146152096936, "grad_norm": 1.6234285831451416, "learning_rate": 3.000281662399129e-07, "loss": 0.011932481080293655, "memory(GiB)": 22.66, "step": 27542, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955313 }, { "epoch": 0.8947471006724491, "grad_norm": 0.35634884238243103, "learning_rate": 2.9984492217263016e-07, "loss": 0.012498370371758938, "memory(GiB)": 22.66, "step": 27543, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.95532 }, { "epoch": 0.8947795861352045, "grad_norm": 0.38964754343032837, "learning_rate": 2.996617323513912e-07, "loss": 0.013025866821408272, "memory(GiB)": 22.66, "step": 27544, "token_acc": 1.0, "train_speed(iter/s)": 0.955327 }, { "epoch": 0.89481207159796, "grad_norm": 0.4047486484050751, "learning_rate": 2.994785967783137e-07, "loss": 0.014254668727517128, "memory(GiB)": 22.66, "step": 27545, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.8948445570607153, "grad_norm": 0.2948513627052307, "learning_rate": 2.992955154555088e-07, "loss": 0.009002882987260818, "memory(GiB)": 22.66, "step": 27546, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955338 }, { "epoch": 0.8948770425234708, "grad_norm": 0.3693358302116394, "learning_rate": 2.99112488385091e-07, "loss": 0.014167051762342453, "memory(GiB)": 22.66, "step": 27547, "token_acc": 1.0, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.8949095279862261, "grad_norm": 0.2739719748497009, "learning_rate": 2.989295155691713e-07, "loss": 0.012081058695912361, "memory(GiB)": 22.66, "step": 27548, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955349 }, { "epoch": 0.8949420134489816, "grad_norm": 0.3393568992614746, "learning_rate": 2.9874659700986183e-07, "loss": 0.0095153097063303, "memory(GiB)": 22.66, "step": 27549, "token_acc": 1.0, "train_speed(iter/s)": 0.955354 }, { "epoch": 0.894974498911737, "grad_norm": 0.5066435933113098, "learning_rate": 2.9856373270927387e-07, "loss": 0.009419555775821209, "memory(GiB)": 22.66, "step": 27550, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955359 }, { "epoch": 0.8950069843744924, "grad_norm": 0.3857637941837311, "learning_rate": 2.9838092266951947e-07, "loss": 0.013114647939801216, "memory(GiB)": 22.66, "step": 27551, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.8950394698372478, "grad_norm": 0.37764763832092285, "learning_rate": 2.9819816689270584e-07, "loss": 0.013178031891584396, "memory(GiB)": 22.66, "step": 27552, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955368 }, { "epoch": 0.8950719553000033, "grad_norm": 0.2590397596359253, "learning_rate": 2.980154653809436e-07, "loss": 0.009616304188966751, "memory(GiB)": 22.66, "step": 27553, "token_acc": 1.0, "train_speed(iter/s)": 0.955373 }, { "epoch": 0.8951044407627586, "grad_norm": 0.460482120513916, "learning_rate": 2.9783281813634104e-07, "loss": 0.014721319079399109, "memory(GiB)": 22.66, "step": 27554, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.8951369262255141, "grad_norm": 0.30899864435195923, "learning_rate": 2.976502251610064e-07, "loss": 0.008531555533409119, "memory(GiB)": 22.66, "step": 27555, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955374 }, { "epoch": 0.8951694116882695, "grad_norm": 0.32146579027175903, "learning_rate": 2.974676864570475e-07, "loss": 0.00897118542343378, "memory(GiB)": 22.66, "step": 27556, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.955379 }, { "epoch": 0.8952018971510249, "grad_norm": 0.35328394174575806, "learning_rate": 2.972852020265704e-07, "loss": 0.01073203794658184, "memory(GiB)": 22.66, "step": 27557, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.8952343826137803, "grad_norm": 0.4857642352581024, "learning_rate": 2.971027718716807e-07, "loss": 0.008922389708459377, "memory(GiB)": 22.66, "step": 27558, "token_acc": 1.0, "train_speed(iter/s)": 0.95539 }, { "epoch": 0.8952668680765358, "grad_norm": 0.39435243606567383, "learning_rate": 2.9692039599448496e-07, "loss": 0.011022438295185566, "memory(GiB)": 22.66, "step": 27559, "token_acc": 1.0, "train_speed(iter/s)": 0.955394 }, { "epoch": 0.8952993535392911, "grad_norm": 0.2542456090450287, "learning_rate": 2.9673807439708825e-07, "loss": 0.008528126403689384, "memory(GiB)": 22.66, "step": 27560, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.9554 }, { "epoch": 0.8953318390020466, "grad_norm": 0.2651937007904053, "learning_rate": 2.9655580708159324e-07, "loss": 0.008220908232033253, "memory(GiB)": 22.66, "step": 27561, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955405 }, { "epoch": 0.895364324464802, "grad_norm": 0.3927132785320282, "learning_rate": 2.9637359405010503e-07, "loss": 0.009309470653533936, "memory(GiB)": 22.66, "step": 27562, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95541 }, { "epoch": 0.8953968099275574, "grad_norm": 0.37393033504486084, "learning_rate": 2.9619143530472574e-07, "loss": 0.01168433390557766, "memory(GiB)": 22.66, "step": 27563, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955415 }, { "epoch": 0.8954292953903128, "grad_norm": 0.31332916021347046, "learning_rate": 2.9600933084755866e-07, "loss": 0.00726857827976346, "memory(GiB)": 22.66, "step": 27564, "token_acc": 1.0, "train_speed(iter/s)": 0.95542 }, { "epoch": 0.8954617808530683, "grad_norm": 0.5830811262130737, "learning_rate": 2.958272806807044e-07, "loss": 0.013767747208476067, "memory(GiB)": 22.66, "step": 27565, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955427 }, { "epoch": 0.8954942663158236, "grad_norm": 0.46799713373184204, "learning_rate": 2.956452848062652e-07, "loss": 0.017021842300891876, "memory(GiB)": 22.66, "step": 27566, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.8955267517785791, "grad_norm": 0.2632973790168762, "learning_rate": 2.954633432263393e-07, "loss": 0.011153623461723328, "memory(GiB)": 22.66, "step": 27567, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955438 }, { "epoch": 0.8955592372413345, "grad_norm": 0.47837525606155396, "learning_rate": 2.9528145594303e-07, "loss": 0.01370216440409422, "memory(GiB)": 22.66, "step": 27568, "token_acc": 1.0, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.8955917227040899, "grad_norm": 0.41763973236083984, "learning_rate": 2.9509962295843355e-07, "loss": 0.0127370934933424, "memory(GiB)": 22.66, "step": 27569, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.95545 }, { "epoch": 0.8956242081668453, "grad_norm": 0.3345601260662079, "learning_rate": 2.9491784427464976e-07, "loss": 0.009818541817367077, "memory(GiB)": 22.66, "step": 27570, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955456 }, { "epoch": 0.8956566936296008, "grad_norm": 0.36895614862442017, "learning_rate": 2.9473611989377705e-07, "loss": 0.01172830443829298, "memory(GiB)": 22.66, "step": 27571, "token_acc": 1.0, "train_speed(iter/s)": 0.955462 }, { "epoch": 0.8956891790923561, "grad_norm": 0.32963332533836365, "learning_rate": 2.9455444981791094e-07, "loss": 0.010468300431966782, "memory(GiB)": 22.66, "step": 27572, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955468 }, { "epoch": 0.8957216645551116, "grad_norm": 0.269087553024292, "learning_rate": 2.9437283404915085e-07, "loss": 0.010429995134472847, "memory(GiB)": 22.66, "step": 27573, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955474 }, { "epoch": 0.895754150017867, "grad_norm": 0.3963136076927185, "learning_rate": 2.941912725895907e-07, "loss": 0.016426820307970047, "memory(GiB)": 22.66, "step": 27574, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955481 }, { "epoch": 0.8957866354806224, "grad_norm": 0.37522661685943604, "learning_rate": 2.9400976544132764e-07, "loss": 0.013722706586122513, "memory(GiB)": 22.66, "step": 27575, "token_acc": 0.99, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.8958191209433778, "grad_norm": 0.2948799729347229, "learning_rate": 2.938283126064545e-07, "loss": 0.008677545003592968, "memory(GiB)": 22.66, "step": 27576, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.8958516064061333, "grad_norm": 0.25971361994743347, "learning_rate": 2.936469140870668e-07, "loss": 0.009624327532947063, "memory(GiB)": 22.66, "step": 27577, "token_acc": 1.0, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.8958840918688886, "grad_norm": 0.28755903244018555, "learning_rate": 2.9346556988525733e-07, "loss": 0.008636260405182838, "memory(GiB)": 22.66, "step": 27578, "token_acc": 1.0, "train_speed(iter/s)": 0.955511 }, { "epoch": 0.8959165773316441, "grad_norm": 0.34665125608444214, "learning_rate": 2.9328428000312106e-07, "loss": 0.012026859447360039, "memory(GiB)": 22.66, "step": 27579, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.8959490627943995, "grad_norm": 0.3753244876861572, "learning_rate": 2.931030444427474e-07, "loss": 0.011404681019484997, "memory(GiB)": 22.66, "step": 27580, "token_acc": 1.0, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.8959815482571549, "grad_norm": 0.3900902569293976, "learning_rate": 2.9292186320622975e-07, "loss": 0.015864605084061623, "memory(GiB)": 22.66, "step": 27581, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.8960140337199103, "grad_norm": 0.2150978296995163, "learning_rate": 2.927407362956591e-07, "loss": 0.004684210754930973, "memory(GiB)": 22.66, "step": 27582, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.8960465191826658, "grad_norm": 0.32276099920272827, "learning_rate": 2.9255966371312616e-07, "loss": 0.010563584975898266, "memory(GiB)": 22.66, "step": 27583, "token_acc": 1.0, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.8960790046454212, "grad_norm": 0.45799243450164795, "learning_rate": 2.923786454607197e-07, "loss": 0.013873646035790443, "memory(GiB)": 22.66, "step": 27584, "token_acc": 0.9858156028368794, "train_speed(iter/s)": 0.955557 }, { "epoch": 0.8961114901081766, "grad_norm": 0.27065932750701904, "learning_rate": 2.921976815405297e-07, "loss": 0.00841593835502863, "memory(GiB)": 22.66, "step": 27585, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.8961439755709321, "grad_norm": 0.35937947034835815, "learning_rate": 2.9201677195464407e-07, "loss": 0.01088528148829937, "memory(GiB)": 22.66, "step": 27586, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.8961764610336874, "grad_norm": 0.36885762214660645, "learning_rate": 2.9183591670515155e-07, "loss": 0.013908146880567074, "memory(GiB)": 22.66, "step": 27587, "token_acc": 1.0, "train_speed(iter/s)": 0.955579 }, { "epoch": 0.8962089464964429, "grad_norm": 0.3573448359966278, "learning_rate": 2.916551157941394e-07, "loss": 0.012428316287696362, "memory(GiB)": 22.66, "step": 27588, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.955587 }, { "epoch": 0.8962414319591983, "grad_norm": 0.46430686116218567, "learning_rate": 2.9147436922369375e-07, "loss": 0.008598767220973969, "memory(GiB)": 22.66, "step": 27589, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955594 }, { "epoch": 0.8962739174219537, "grad_norm": 0.1841866821050644, "learning_rate": 2.9129367699590074e-07, "loss": 0.005441777408123016, "memory(GiB)": 22.66, "step": 27590, "token_acc": 1.0, "train_speed(iter/s)": 0.955602 }, { "epoch": 0.8963064028847091, "grad_norm": 0.2074878066778183, "learning_rate": 2.9111303911284584e-07, "loss": 0.011197690851986408, "memory(GiB)": 22.66, "step": 27591, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.8963388883474646, "grad_norm": 0.25267958641052246, "learning_rate": 2.9093245557661523e-07, "loss": 0.006858561187982559, "memory(GiB)": 22.66, "step": 27592, "token_acc": 1.0, "train_speed(iter/s)": 0.955617 }, { "epoch": 0.8963713738102199, "grad_norm": 0.27962180972099304, "learning_rate": 2.907519263892905e-07, "loss": 0.008530979976058006, "memory(GiB)": 22.66, "step": 27593, "token_acc": 1.0, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.8964038592729754, "grad_norm": 0.31822314858436584, "learning_rate": 2.905714515529573e-07, "loss": 0.008803663775324821, "memory(GiB)": 22.66, "step": 27594, "token_acc": 1.0, "train_speed(iter/s)": 0.955632 }, { "epoch": 0.8964363447357308, "grad_norm": 0.2727964520454407, "learning_rate": 2.903910310696967e-07, "loss": 0.006497421767562628, "memory(GiB)": 22.66, "step": 27595, "token_acc": 1.0, "train_speed(iter/s)": 0.955639 }, { "epoch": 0.8964688301984862, "grad_norm": 0.3622696101665497, "learning_rate": 2.902106649415931e-07, "loss": 0.007319378666579723, "memory(GiB)": 22.66, "step": 27596, "token_acc": 1.0, "train_speed(iter/s)": 0.955646 }, { "epoch": 0.8965013156612416, "grad_norm": 0.3600389063358307, "learning_rate": 2.900303531707266e-07, "loss": 0.013653922826051712, "memory(GiB)": 22.66, "step": 27597, "token_acc": 1.0, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.896533801123997, "grad_norm": 0.33840110898017883, "learning_rate": 2.898500957591799e-07, "loss": 0.013566631823778152, "memory(GiB)": 22.66, "step": 27598, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955661 }, { "epoch": 0.8965662865867524, "grad_norm": 0.33145493268966675, "learning_rate": 2.896698927090313e-07, "loss": 0.013164527714252472, "memory(GiB)": 22.66, "step": 27599, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.8965987720495079, "grad_norm": 0.35124385356903076, "learning_rate": 2.894897440223615e-07, "loss": 0.013386616483330727, "memory(GiB)": 22.66, "step": 27600, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955676 }, { "epoch": 0.8966312575122632, "grad_norm": 0.3337365984916687, "learning_rate": 2.893096497012493e-07, "loss": 0.009710729122161865, "memory(GiB)": 22.66, "step": 27601, "token_acc": 1.0, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.8966637429750187, "grad_norm": 0.2558043301105499, "learning_rate": 2.8912960974777414e-07, "loss": 0.009857576340436935, "memory(GiB)": 22.66, "step": 27602, "token_acc": 1.0, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.8966962284377741, "grad_norm": 0.28819331526756287, "learning_rate": 2.889496241640144e-07, "loss": 0.008940868079662323, "memory(GiB)": 22.66, "step": 27603, "token_acc": 1.0, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.8967287139005296, "grad_norm": 0.4763264060020447, "learning_rate": 2.8876969295204503e-07, "loss": 0.015420204028487206, "memory(GiB)": 22.66, "step": 27604, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.955706 }, { "epoch": 0.8967611993632849, "grad_norm": 0.3801266849040985, "learning_rate": 2.8858981611394445e-07, "loss": 0.012483339756727219, "memory(GiB)": 22.66, "step": 27605, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.8967936848260404, "grad_norm": 0.4204844832420349, "learning_rate": 2.8840999365178815e-07, "loss": 0.01304932963103056, "memory(GiB)": 22.66, "step": 27606, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.955718 }, { "epoch": 0.8968261702887957, "grad_norm": 0.3442613184452057, "learning_rate": 2.882302255676528e-07, "loss": 0.01158947590738535, "memory(GiB)": 22.66, "step": 27607, "token_acc": 1.0, "train_speed(iter/s)": 0.955723 }, { "epoch": 0.8968586557515512, "grad_norm": 0.4611777365207672, "learning_rate": 2.8805051186361064e-07, "loss": 0.013373837806284428, "memory(GiB)": 22.66, "step": 27608, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.8968911412143066, "grad_norm": 0.4905913174152374, "learning_rate": 2.878708525417373e-07, "loss": 0.02106468193233013, "memory(GiB)": 22.66, "step": 27609, "token_acc": 1.0, "train_speed(iter/s)": 0.955734 }, { "epoch": 0.896923626677062, "grad_norm": 0.3201128840446472, "learning_rate": 2.876912476041066e-07, "loss": 0.009166734293103218, "memory(GiB)": 22.66, "step": 27610, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.8969561121398174, "grad_norm": 0.3358275294303894, "learning_rate": 2.8751169705279135e-07, "loss": 0.01163606159389019, "memory(GiB)": 22.66, "step": 27611, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.8969885976025729, "grad_norm": 0.3848153054714203, "learning_rate": 2.873322008898627e-07, "loss": 0.014789406210184097, "memory(GiB)": 22.66, "step": 27612, "token_acc": 1.0, "train_speed(iter/s)": 0.955751 }, { "epoch": 0.8970210830653282, "grad_norm": 0.38225457072257996, "learning_rate": 2.871527591173928e-07, "loss": 0.012948105111718178, "memory(GiB)": 22.66, "step": 27613, "token_acc": 0.9840637450199203, "train_speed(iter/s)": 0.955756 }, { "epoch": 0.8970535685280837, "grad_norm": 0.5694188475608826, "learning_rate": 2.8697337173745286e-07, "loss": 0.019344378262758255, "memory(GiB)": 22.66, "step": 27614, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.8970860539908391, "grad_norm": 0.2713473439216614, "learning_rate": 2.867940387521145e-07, "loss": 0.007479079067707062, "memory(GiB)": 22.66, "step": 27615, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.8971185394535945, "grad_norm": 0.37580063939094543, "learning_rate": 2.86614760163445e-07, "loss": 0.009695924818515778, "memory(GiB)": 22.66, "step": 27616, "token_acc": 1.0, "train_speed(iter/s)": 0.955772 }, { "epoch": 0.8971510249163499, "grad_norm": 0.31687766313552856, "learning_rate": 2.8643553597351494e-07, "loss": 0.007442111149430275, "memory(GiB)": 22.66, "step": 27617, "token_acc": 1.0, "train_speed(iter/s)": 0.955777 }, { "epoch": 0.8971835103791054, "grad_norm": 0.24619269371032715, "learning_rate": 2.86256366184392e-07, "loss": 0.007116787601262331, "memory(GiB)": 22.66, "step": 27618, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.8972159958418607, "grad_norm": 0.6972640156745911, "learning_rate": 2.860772507981452e-07, "loss": 0.012122866697609425, "memory(GiB)": 22.66, "step": 27619, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955788 }, { "epoch": 0.8972484813046162, "grad_norm": 0.2884489893913269, "learning_rate": 2.858981898168417e-07, "loss": 0.011119373142719269, "memory(GiB)": 22.66, "step": 27620, "token_acc": 1.0, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.8972809667673716, "grad_norm": 0.22174544632434845, "learning_rate": 2.8571918324254656e-07, "loss": 0.00754308607429266, "memory(GiB)": 22.66, "step": 27621, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955798 }, { "epoch": 0.897313452230127, "grad_norm": 0.4983004629611969, "learning_rate": 2.8554023107732754e-07, "loss": 0.015233954414725304, "memory(GiB)": 22.66, "step": 27622, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.8973459376928824, "grad_norm": 0.45254606008529663, "learning_rate": 2.85361333323248e-07, "loss": 0.012524928897619247, "memory(GiB)": 22.66, "step": 27623, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.8973784231556379, "grad_norm": 0.32164958119392395, "learning_rate": 2.851824899823752e-07, "loss": 0.010417917743325233, "memory(GiB)": 22.66, "step": 27624, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.955814 }, { "epoch": 0.8974109086183932, "grad_norm": 0.24046674370765686, "learning_rate": 2.850037010567708e-07, "loss": 0.00842034537345171, "memory(GiB)": 22.66, "step": 27625, "token_acc": 1.0, "train_speed(iter/s)": 0.95582 }, { "epoch": 0.8974433940811487, "grad_norm": 0.2972293198108673, "learning_rate": 2.8482496654850036e-07, "loss": 0.018170014023780823, "memory(GiB)": 22.66, "step": 27626, "token_acc": 0.996, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.8974758795439041, "grad_norm": 0.29493987560272217, "learning_rate": 2.84646286459625e-07, "loss": 0.011035606265068054, "memory(GiB)": 22.66, "step": 27627, "token_acc": 0.9967105263157895, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.8975083650066595, "grad_norm": 0.35903334617614746, "learning_rate": 2.8446766079220756e-07, "loss": 0.012545734643936157, "memory(GiB)": 22.66, "step": 27628, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.955838 }, { "epoch": 0.8975408504694149, "grad_norm": 0.2767696976661682, "learning_rate": 2.8428908954830967e-07, "loss": 0.009189107455313206, "memory(GiB)": 22.66, "step": 27629, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955844 }, { "epoch": 0.8975733359321704, "grad_norm": 0.3650924563407898, "learning_rate": 2.841105727299931e-07, "loss": 0.010792291723191738, "memory(GiB)": 22.66, "step": 27630, "token_acc": 1.0, "train_speed(iter/s)": 0.95585 }, { "epoch": 0.8976058213949257, "grad_norm": 0.313973993062973, "learning_rate": 2.839321103393167e-07, "loss": 0.007490450516343117, "memory(GiB)": 22.66, "step": 27631, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.8976383068576812, "grad_norm": 0.27955499291419983, "learning_rate": 2.837537023783404e-07, "loss": 0.007984925992786884, "memory(GiB)": 22.66, "step": 27632, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.8976707923204366, "grad_norm": 0.3162325918674469, "learning_rate": 2.8357534884912386e-07, "loss": 0.010366326197981834, "memory(GiB)": 22.66, "step": 27633, "token_acc": 1.0, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.897703277783192, "grad_norm": 0.3529411852359772, "learning_rate": 2.8339704975372586e-07, "loss": 0.009123222902417183, "memory(GiB)": 22.66, "step": 27634, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.955872 }, { "epoch": 0.8977357632459474, "grad_norm": 0.34228935837745667, "learning_rate": 2.8321880509420364e-07, "loss": 0.012189227156341076, "memory(GiB)": 22.66, "step": 27635, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955878 }, { "epoch": 0.8977682487087029, "grad_norm": 0.300401508808136, "learning_rate": 2.830406148726139e-07, "loss": 0.008191616274416447, "memory(GiB)": 22.66, "step": 27636, "token_acc": 1.0, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.8978007341714582, "grad_norm": 0.31294143199920654, "learning_rate": 2.8286247909101393e-07, "loss": 0.009282746352255344, "memory(GiB)": 22.66, "step": 27637, "token_acc": 1.0, "train_speed(iter/s)": 0.955891 }, { "epoch": 0.8978332196342137, "grad_norm": 0.5759670734405518, "learning_rate": 2.8268439775145984e-07, "loss": 0.015054469928145409, "memory(GiB)": 22.66, "step": 27638, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955895 }, { "epoch": 0.8978657050969691, "grad_norm": 0.24079033732414246, "learning_rate": 2.8250637085600665e-07, "loss": 0.011208463460206985, "memory(GiB)": 22.66, "step": 27639, "token_acc": 1.0, "train_speed(iter/s)": 0.955902 }, { "epoch": 0.8978981905597245, "grad_norm": 0.30425095558166504, "learning_rate": 2.8232839840670824e-07, "loss": 0.008473068475723267, "memory(GiB)": 22.66, "step": 27640, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955909 }, { "epoch": 0.8979306760224799, "grad_norm": 0.29673731327056885, "learning_rate": 2.8215048040561964e-07, "loss": 0.008538058027625084, "memory(GiB)": 22.66, "step": 27641, "token_acc": 1.0, "train_speed(iter/s)": 0.955917 }, { "epoch": 0.8979631614852354, "grad_norm": 0.2549845576286316, "learning_rate": 2.819726168547937e-07, "loss": 0.008459245786070824, "memory(GiB)": 22.66, "step": 27642, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955924 }, { "epoch": 0.8979956469479907, "grad_norm": 0.5307831764221191, "learning_rate": 2.8179480775628375e-07, "loss": 0.015556909143924713, "memory(GiB)": 22.66, "step": 27643, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955932 }, { "epoch": 0.8980281324107462, "grad_norm": 0.3101383149623871, "learning_rate": 2.816170531121415e-07, "loss": 0.012306909076869488, "memory(GiB)": 22.66, "step": 27644, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.8980606178735016, "grad_norm": 0.21740423142910004, "learning_rate": 2.8143935292441914e-07, "loss": 0.005758093204349279, "memory(GiB)": 22.66, "step": 27645, "token_acc": 1.0, "train_speed(iter/s)": 0.955947 }, { "epoch": 0.898093103336257, "grad_norm": 0.38618040084838867, "learning_rate": 2.8126170719516567e-07, "loss": 0.011647865176200867, "memory(GiB)": 22.66, "step": 27646, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.8981255887990124, "grad_norm": 0.2513270080089569, "learning_rate": 2.8108411592643437e-07, "loss": 0.00894959643483162, "memory(GiB)": 22.66, "step": 27647, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955962 }, { "epoch": 0.8981580742617679, "grad_norm": 0.4441130459308624, "learning_rate": 2.80906579120272e-07, "loss": 0.01137719489634037, "memory(GiB)": 22.66, "step": 27648, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.8981905597245233, "grad_norm": 0.2953159809112549, "learning_rate": 2.8072909677872906e-07, "loss": 0.010273413732647896, "memory(GiB)": 22.66, "step": 27649, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955977 }, { "epoch": 0.8982230451872787, "grad_norm": 0.7495231032371521, "learning_rate": 2.8055166890385453e-07, "loss": 0.009840250946581364, "memory(GiB)": 22.66, "step": 27650, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955984 }, { "epoch": 0.8982555306500342, "grad_norm": 0.48220935463905334, "learning_rate": 2.8037429549769404e-07, "loss": 0.017039701342582703, "memory(GiB)": 22.66, "step": 27651, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955992 }, { "epoch": 0.8982880161127895, "grad_norm": 0.3425561487674713, "learning_rate": 2.801969765622969e-07, "loss": 0.015132633969187737, "memory(GiB)": 22.66, "step": 27652, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956 }, { "epoch": 0.898320501575545, "grad_norm": 0.3077642023563385, "learning_rate": 2.8001971209970833e-07, "loss": 0.009129654616117477, "memory(GiB)": 22.66, "step": 27653, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.8983529870383004, "grad_norm": 0.3541494309902191, "learning_rate": 2.7984250211197497e-07, "loss": 0.009492319077253342, "memory(GiB)": 22.66, "step": 27654, "token_acc": 1.0, "train_speed(iter/s)": 0.956014 }, { "epoch": 0.8983854725010558, "grad_norm": 0.459391713142395, "learning_rate": 2.796653466011412e-07, "loss": 0.01087542437016964, "memory(GiB)": 22.66, "step": 27655, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.8984179579638112, "grad_norm": 0.35468795895576477, "learning_rate": 2.7948824556925215e-07, "loss": 0.011005460284650326, "memory(GiB)": 22.66, "step": 27656, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.8984504434265667, "grad_norm": 0.38857656717300415, "learning_rate": 2.793111990183517e-07, "loss": 0.012450383976101875, "memory(GiB)": 22.66, "step": 27657, "token_acc": 1.0, "train_speed(iter/s)": 0.956037 }, { "epoch": 0.898482928889322, "grad_norm": 0.25632956624031067, "learning_rate": 2.791342069504838e-07, "loss": 0.008524483069777489, "memory(GiB)": 22.66, "step": 27658, "token_acc": 1.0, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.8985154143520775, "grad_norm": 0.2765018343925476, "learning_rate": 2.7895726936769006e-07, "loss": 0.007629699073731899, "memory(GiB)": 22.66, "step": 27659, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956051 }, { "epoch": 0.8985478998148329, "grad_norm": 0.36523666977882385, "learning_rate": 2.7878038627201286e-07, "loss": 0.017378751188516617, "memory(GiB)": 22.66, "step": 27660, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956059 }, { "epoch": 0.8985803852775883, "grad_norm": 0.41157376766204834, "learning_rate": 2.7860355766549383e-07, "loss": 0.009687546640634537, "memory(GiB)": 22.66, "step": 27661, "token_acc": 1.0, "train_speed(iter/s)": 0.956066 }, { "epoch": 0.8986128707403437, "grad_norm": 0.2627798914909363, "learning_rate": 2.784267835501753e-07, "loss": 0.00787731446325779, "memory(GiB)": 22.66, "step": 27662, "token_acc": 1.0, "train_speed(iter/s)": 0.956074 }, { "epoch": 0.8986453562030992, "grad_norm": 0.36200910806655884, "learning_rate": 2.7825006392809496e-07, "loss": 0.011141868308186531, "memory(GiB)": 22.66, "step": 27663, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956082 }, { "epoch": 0.8986778416658545, "grad_norm": 0.26639261841773987, "learning_rate": 2.780733988012935e-07, "loss": 0.00872751884162426, "memory(GiB)": 22.66, "step": 27664, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956089 }, { "epoch": 0.89871032712861, "grad_norm": 0.2227281630039215, "learning_rate": 2.7789678817180974e-07, "loss": 0.008885582908987999, "memory(GiB)": 22.66, "step": 27665, "token_acc": 1.0, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.8987428125913653, "grad_norm": 0.248404860496521, "learning_rate": 2.777202320416822e-07, "loss": 0.009237770922482014, "memory(GiB)": 22.66, "step": 27666, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.8987752980541208, "grad_norm": 0.2998256981372833, "learning_rate": 2.775437304129491e-07, "loss": 0.00780006218701601, "memory(GiB)": 22.66, "step": 27667, "token_acc": 1.0, "train_speed(iter/s)": 0.956105 }, { "epoch": 0.8988077835168762, "grad_norm": 0.22895479202270508, "learning_rate": 2.773672832876467e-07, "loss": 0.011850089766085148, "memory(GiB)": 22.66, "step": 27668, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.8988402689796317, "grad_norm": 0.2898179590702057, "learning_rate": 2.771908906678117e-07, "loss": 0.007258646190166473, "memory(GiB)": 22.66, "step": 27669, "token_acc": 1.0, "train_speed(iter/s)": 0.956116 }, { "epoch": 0.898872754442387, "grad_norm": 0.35799121856689453, "learning_rate": 2.770145525554796e-07, "loss": 0.009387233294546604, "memory(GiB)": 22.66, "step": 27670, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.8989052399051425, "grad_norm": 0.1707293838262558, "learning_rate": 2.7683826895268607e-07, "loss": 0.006039495579898357, "memory(GiB)": 22.66, "step": 27671, "token_acc": 1.0, "train_speed(iter/s)": 0.956127 }, { "epoch": 0.8989377253678978, "grad_norm": 0.3359852731227875, "learning_rate": 2.7666203986146557e-07, "loss": 0.009831038303673267, "memory(GiB)": 22.66, "step": 27672, "token_acc": 1.0, "train_speed(iter/s)": 0.956132 }, { "epoch": 0.8989702108306533, "grad_norm": 0.25079575181007385, "learning_rate": 2.76485865283852e-07, "loss": 0.0061152237467467785, "memory(GiB)": 22.66, "step": 27673, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.8990026962934087, "grad_norm": 0.4029199779033661, "learning_rate": 2.7630974522187715e-07, "loss": 0.01914413832128048, "memory(GiB)": 22.66, "step": 27674, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.8990351817561641, "grad_norm": 0.3547666072845459, "learning_rate": 2.761336796775771e-07, "loss": 0.013816393911838531, "memory(GiB)": 22.66, "step": 27675, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.8990676672189195, "grad_norm": 0.24886882305145264, "learning_rate": 2.7595766865298024e-07, "loss": 0.0078511293977499, "memory(GiB)": 22.66, "step": 27676, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.899100152681675, "grad_norm": 0.36401334404945374, "learning_rate": 2.7578171215012104e-07, "loss": 0.014807593077421188, "memory(GiB)": 22.66, "step": 27677, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.8991326381444303, "grad_norm": 0.4338110089302063, "learning_rate": 2.756058101710274e-07, "loss": 0.01574551686644554, "memory(GiB)": 22.66, "step": 27678, "token_acc": 1.0, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.8991651236071858, "grad_norm": 0.46102091670036316, "learning_rate": 2.754299627177309e-07, "loss": 0.011038939468562603, "memory(GiB)": 22.66, "step": 27679, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.95617 }, { "epoch": 0.8991976090699412, "grad_norm": 0.3817187547683716, "learning_rate": 2.752541697922617e-07, "loss": 0.009610076434910297, "memory(GiB)": 22.66, "step": 27680, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.8992300945326966, "grad_norm": 0.2547115683555603, "learning_rate": 2.7507843139664814e-07, "loss": 0.00803428515791893, "memory(GiB)": 22.66, "step": 27681, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.899262579995452, "grad_norm": 0.4678223431110382, "learning_rate": 2.749027475329186e-07, "loss": 0.014046905562281609, "memory(GiB)": 22.66, "step": 27682, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.8992950654582075, "grad_norm": 0.32701486349105835, "learning_rate": 2.747271182030992e-07, "loss": 0.01310865767300129, "memory(GiB)": 22.66, "step": 27683, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.8993275509209628, "grad_norm": 0.2140948474407196, "learning_rate": 2.7455154340921884e-07, "loss": 0.007672419771552086, "memory(GiB)": 22.66, "step": 27684, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956196 }, { "epoch": 0.8993600363837183, "grad_norm": 0.27706098556518555, "learning_rate": 2.7437602315330325e-07, "loss": 0.011167775839567184, "memory(GiB)": 22.66, "step": 27685, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956202 }, { "epoch": 0.8993925218464737, "grad_norm": 0.3288821280002594, "learning_rate": 2.742005574373785e-07, "loss": 0.007893359288573265, "memory(GiB)": 22.66, "step": 27686, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956208 }, { "epoch": 0.8994250073092291, "grad_norm": 0.39277195930480957, "learning_rate": 2.7402514626346853e-07, "loss": 0.016238661482930183, "memory(GiB)": 22.66, "step": 27687, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.8994574927719845, "grad_norm": 0.44234827160835266, "learning_rate": 2.7384978963359897e-07, "loss": 0.019046828150749207, "memory(GiB)": 22.66, "step": 27688, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.89948997823474, "grad_norm": 0.2422914206981659, "learning_rate": 2.736744875497932e-07, "loss": 0.008350403979420662, "memory(GiB)": 22.66, "step": 27689, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.8995224636974953, "grad_norm": 0.38570916652679443, "learning_rate": 2.734992400140757e-07, "loss": 0.012064460664987564, "memory(GiB)": 22.66, "step": 27690, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95623 }, { "epoch": 0.8995549491602508, "grad_norm": 0.2743261456489563, "learning_rate": 2.733240470284665e-07, "loss": 0.008180128410458565, "memory(GiB)": 22.66, "step": 27691, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.8995874346230062, "grad_norm": 0.32009032368659973, "learning_rate": 2.7314890859498955e-07, "loss": 0.010875029489398003, "memory(GiB)": 22.66, "step": 27692, "token_acc": 1.0, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.8996199200857616, "grad_norm": 0.46103644371032715, "learning_rate": 2.729738247156655e-07, "loss": 0.017108742147684097, "memory(GiB)": 22.66, "step": 27693, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.956248 }, { "epoch": 0.899652405548517, "grad_norm": 0.3237666189670563, "learning_rate": 2.7279879539251483e-07, "loss": 0.00887350831180811, "memory(GiB)": 22.66, "step": 27694, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.8996848910112725, "grad_norm": 0.30589646100997925, "learning_rate": 2.726238206275589e-07, "loss": 0.011045300401747227, "memory(GiB)": 22.66, "step": 27695, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.8997173764740278, "grad_norm": 0.31978195905685425, "learning_rate": 2.724489004228159e-07, "loss": 0.009769387543201447, "memory(GiB)": 22.66, "step": 27696, "token_acc": 1.0, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.8997498619367833, "grad_norm": 0.26038283109664917, "learning_rate": 2.7227403478030434e-07, "loss": 0.006516183260828257, "memory(GiB)": 22.66, "step": 27697, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.8997823473995387, "grad_norm": 0.2621971666812897, "learning_rate": 2.7209922370204314e-07, "loss": 0.00802172813564539, "memory(GiB)": 22.66, "step": 27698, "token_acc": 1.0, "train_speed(iter/s)": 0.956278 }, { "epoch": 0.8998148328622941, "grad_norm": 0.30350461602211, "learning_rate": 2.719244671900506e-07, "loss": 0.009838992729783058, "memory(GiB)": 22.66, "step": 27699, "token_acc": 0.9849624060150376, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.8998473183250495, "grad_norm": 0.2510480284690857, "learning_rate": 2.717497652463419e-07, "loss": 0.007017816416919231, "memory(GiB)": 22.66, "step": 27700, "token_acc": 1.0, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.899879803787805, "grad_norm": 0.4141891598701477, "learning_rate": 2.7157511787293423e-07, "loss": 0.01564955897629261, "memory(GiB)": 22.66, "step": 27701, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.8999122892505603, "grad_norm": 0.3156905472278595, "learning_rate": 2.714005250718432e-07, "loss": 0.010984355583786964, "memory(GiB)": 22.66, "step": 27702, "token_acc": 1.0, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.8999447747133158, "grad_norm": 0.22072134912014008, "learning_rate": 2.712259868450845e-07, "loss": 0.008225853554904461, "memory(GiB)": 22.66, "step": 27703, "token_acc": 1.0, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.8999772601760712, "grad_norm": 0.4025419056415558, "learning_rate": 2.7105150319467145e-07, "loss": 0.008624546229839325, "memory(GiB)": 22.66, "step": 27704, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956318 }, { "epoch": 0.9000097456388266, "grad_norm": 0.4416501224040985, "learning_rate": 2.708770741226191e-07, "loss": 0.012333831749856472, "memory(GiB)": 22.66, "step": 27705, "token_acc": 1.0, "train_speed(iter/s)": 0.956325 }, { "epoch": 0.900042231101582, "grad_norm": 0.3920712172985077, "learning_rate": 2.707026996309381e-07, "loss": 0.019960790872573853, "memory(GiB)": 22.66, "step": 27706, "token_acc": 0.9831223628691983, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.9000747165643375, "grad_norm": 0.4337107539176941, "learning_rate": 2.7052837972164404e-07, "loss": 0.015959637239575386, "memory(GiB)": 22.66, "step": 27707, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.95634 }, { "epoch": 0.9001072020270928, "grad_norm": 0.3734714686870575, "learning_rate": 2.703541143967464e-07, "loss": 0.009451878257095814, "memory(GiB)": 22.66, "step": 27708, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.956348 }, { "epoch": 0.9001396874898483, "grad_norm": 0.24630099534988403, "learning_rate": 2.7017990365825754e-07, "loss": 0.006875995546579361, "memory(GiB)": 22.66, "step": 27709, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956355 }, { "epoch": 0.9001721729526037, "grad_norm": 0.22097766399383545, "learning_rate": 2.7000574750818906e-07, "loss": 0.007661576848477125, "memory(GiB)": 22.66, "step": 27710, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.9002046584153591, "grad_norm": 0.35383737087249756, "learning_rate": 2.698316459485484e-07, "loss": 0.009626276791095734, "memory(GiB)": 22.66, "step": 27711, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.9002371438781146, "grad_norm": 0.5025368928909302, "learning_rate": 2.6965759898134714e-07, "loss": 0.016405733302235603, "memory(GiB)": 22.66, "step": 27712, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956377 }, { "epoch": 0.90026962934087, "grad_norm": 0.28290486335754395, "learning_rate": 2.694836066085932e-07, "loss": 0.008269913494586945, "memory(GiB)": 22.66, "step": 27713, "token_acc": 1.0, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.9003021148036254, "grad_norm": 0.3230966627597809, "learning_rate": 2.6930966883229545e-07, "loss": 0.01755090430378914, "memory(GiB)": 22.66, "step": 27714, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.956392 }, { "epoch": 0.9003346002663808, "grad_norm": 0.3315165936946869, "learning_rate": 2.6913578565445963e-07, "loss": 0.006999394856393337, "memory(GiB)": 22.66, "step": 27715, "token_acc": 1.0, "train_speed(iter/s)": 0.956399 }, { "epoch": 0.9003670857291363, "grad_norm": 0.2794799506664276, "learning_rate": 2.6896195707709403e-07, "loss": 0.006033408921211958, "memory(GiB)": 22.66, "step": 27716, "token_acc": 1.0, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.9003995711918916, "grad_norm": 0.22722655534744263, "learning_rate": 2.687881831022038e-07, "loss": 0.008844995871186256, "memory(GiB)": 22.66, "step": 27717, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.9004320566546471, "grad_norm": 0.27489593625068665, "learning_rate": 2.6861446373179667e-07, "loss": 0.009541620500385761, "memory(GiB)": 22.66, "step": 27718, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956422 }, { "epoch": 0.9004645421174025, "grad_norm": 0.31603848934173584, "learning_rate": 2.6844079896787445e-07, "loss": 0.008403019048273563, "memory(GiB)": 22.66, "step": 27719, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.9004970275801579, "grad_norm": 0.45747193694114685, "learning_rate": 2.6826718881244385e-07, "loss": 0.01417737640440464, "memory(GiB)": 22.66, "step": 27720, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956437 }, { "epoch": 0.9005295130429133, "grad_norm": 0.33267974853515625, "learning_rate": 2.680936332675077e-07, "loss": 0.012940960004925728, "memory(GiB)": 22.66, "step": 27721, "token_acc": 1.0, "train_speed(iter/s)": 0.956445 }, { "epoch": 0.9005619985056688, "grad_norm": 0.2845108211040497, "learning_rate": 2.6792013233506943e-07, "loss": 0.009817485697567463, "memory(GiB)": 22.66, "step": 27722, "token_acc": 0.9942196531791907, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.9005944839684241, "grad_norm": 0.3813202679157257, "learning_rate": 2.677466860171307e-07, "loss": 0.011394292116165161, "memory(GiB)": 22.66, "step": 27723, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.95646 }, { "epoch": 0.9006269694311796, "grad_norm": 0.6046414971351624, "learning_rate": 2.6757329431569446e-07, "loss": 0.01769929751753807, "memory(GiB)": 22.66, "step": 27724, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.900659454893935, "grad_norm": 0.22088409960269928, "learning_rate": 2.673999572327607e-07, "loss": 0.0102668646723032, "memory(GiB)": 22.66, "step": 27725, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.9006919403566904, "grad_norm": 0.3510591387748718, "learning_rate": 2.672266747703306e-07, "loss": 0.009908709675073624, "memory(GiB)": 22.66, "step": 27726, "token_acc": 0.9941176470588236, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.9007244258194458, "grad_norm": 0.3472224473953247, "learning_rate": 2.6705344693040424e-07, "loss": 0.010374002158641815, "memory(GiB)": 22.66, "step": 27727, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.9007569112822013, "grad_norm": 0.4989541471004486, "learning_rate": 2.668802737149806e-07, "loss": 0.013602964580059052, "memory(GiB)": 22.66, "step": 27728, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.9007893967449566, "grad_norm": 0.36230024695396423, "learning_rate": 2.66707155126058e-07, "loss": 0.013156496919691563, "memory(GiB)": 22.66, "step": 27729, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.9008218822077121, "grad_norm": 0.5058943033218384, "learning_rate": 2.66534091165635e-07, "loss": 0.008867592550814152, "memory(GiB)": 22.66, "step": 27730, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.9008543676704674, "grad_norm": 0.38546353578567505, "learning_rate": 2.663610818357093e-07, "loss": 0.01346286665648222, "memory(GiB)": 22.66, "step": 27731, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.9008868531332229, "grad_norm": 0.36899423599243164, "learning_rate": 2.6618812713827655e-07, "loss": 0.013959618285298347, "memory(GiB)": 22.66, "step": 27732, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.9009193385959783, "grad_norm": 0.36957529187202454, "learning_rate": 2.660152270753347e-07, "loss": 0.01774470880627632, "memory(GiB)": 22.66, "step": 27733, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.9009518240587338, "grad_norm": 0.43206003308296204, "learning_rate": 2.658423816488759e-07, "loss": 0.014817259274423122, "memory(GiB)": 22.66, "step": 27734, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956526 }, { "epoch": 0.9009843095214891, "grad_norm": 0.3030984699726105, "learning_rate": 2.6566959086089916e-07, "loss": 0.009026875719428062, "memory(GiB)": 22.66, "step": 27735, "token_acc": 0.988, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.9010167949842446, "grad_norm": 0.30476996302604675, "learning_rate": 2.654968547133957e-07, "loss": 0.013011300936341286, "memory(GiB)": 22.66, "step": 27736, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.901049280447, "grad_norm": 0.2800324559211731, "learning_rate": 2.6532417320836057e-07, "loss": 0.009718234650790691, "memory(GiB)": 22.66, "step": 27737, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.9010817659097554, "grad_norm": 0.35988399386405945, "learning_rate": 2.651515463477861e-07, "loss": 0.008509780280292034, "memory(GiB)": 22.66, "step": 27738, "token_acc": 1.0, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.9011142513725108, "grad_norm": 0.300706148147583, "learning_rate": 2.6497897413366505e-07, "loss": 0.010304688476026058, "memory(GiB)": 22.66, "step": 27739, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956554 }, { "epoch": 0.9011467368352662, "grad_norm": 0.40371832251548767, "learning_rate": 2.648064565679881e-07, "loss": 0.01299209613353014, "memory(GiB)": 22.66, "step": 27740, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956559 }, { "epoch": 0.9011792222980216, "grad_norm": 0.3348093032836914, "learning_rate": 2.646339936527481e-07, "loss": 0.011776143684983253, "memory(GiB)": 22.66, "step": 27741, "token_acc": 1.0, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.9012117077607771, "grad_norm": 0.26462796330451965, "learning_rate": 2.644615853899346e-07, "loss": 0.007898293435573578, "memory(GiB)": 22.66, "step": 27742, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.9012441932235324, "grad_norm": 0.29231294989585876, "learning_rate": 2.642892317815371e-07, "loss": 0.013258458115160465, "memory(GiB)": 22.66, "step": 27743, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.9012766786862879, "grad_norm": 0.4782739281654358, "learning_rate": 2.641169328295445e-07, "loss": 0.016677938401699066, "memory(GiB)": 22.66, "step": 27744, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.95658 }, { "epoch": 0.9013091641490433, "grad_norm": 0.2869164049625397, "learning_rate": 2.6394468853594647e-07, "loss": 0.007977660745382309, "memory(GiB)": 22.66, "step": 27745, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956586 }, { "epoch": 0.9013416496117987, "grad_norm": 0.33240655064582825, "learning_rate": 2.637724989027313e-07, "loss": 0.00981366727501154, "memory(GiB)": 22.66, "step": 27746, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956592 }, { "epoch": 0.9013741350745541, "grad_norm": 0.3343304395675659, "learning_rate": 2.6360036393188405e-07, "loss": 0.010243140161037445, "memory(GiB)": 22.66, "step": 27747, "token_acc": 1.0, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.9014066205373096, "grad_norm": 0.4073600769042969, "learning_rate": 2.634282836253932e-07, "loss": 0.011542538180947304, "memory(GiB)": 22.66, "step": 27748, "token_acc": 0.9947916666666666, "train_speed(iter/s)": 0.956604 }, { "epoch": 0.9014391060000649, "grad_norm": 0.33990949392318726, "learning_rate": 2.6325625798524434e-07, "loss": 0.01106588076800108, "memory(GiB)": 22.66, "step": 27749, "token_acc": 1.0, "train_speed(iter/s)": 0.95661 }, { "epoch": 0.9014715914628204, "grad_norm": 0.2965068221092224, "learning_rate": 2.6308428701342313e-07, "loss": 0.009983591735363007, "memory(GiB)": 22.66, "step": 27750, "token_acc": 0.996, "train_speed(iter/s)": 0.956616 }, { "epoch": 0.9015040769255758, "grad_norm": 0.3090052604675293, "learning_rate": 2.6291237071191355e-07, "loss": 0.010258857160806656, "memory(GiB)": 22.66, "step": 27751, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.9015365623883312, "grad_norm": 0.25328731536865234, "learning_rate": 2.6274050908270067e-07, "loss": 0.005968877114355564, "memory(GiB)": 22.66, "step": 27752, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.9015690478510866, "grad_norm": 0.32613861560821533, "learning_rate": 2.625687021277673e-07, "loss": 0.012388746254146099, "memory(GiB)": 22.66, "step": 27753, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956632 }, { "epoch": 0.9016015333138421, "grad_norm": 0.5105350017547607, "learning_rate": 2.623969498490969e-07, "loss": 0.010087406262755394, "memory(GiB)": 22.66, "step": 27754, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.9016340187765974, "grad_norm": 0.4208104610443115, "learning_rate": 2.622252522486712e-07, "loss": 0.015223506838083267, "memory(GiB)": 22.66, "step": 27755, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.9016665042393529, "grad_norm": 0.37044429779052734, "learning_rate": 2.6205360932847203e-07, "loss": 0.010693179443478584, "memory(GiB)": 22.66, "step": 27756, "token_acc": 1.0, "train_speed(iter/s)": 0.956651 }, { "epoch": 0.9016989897021083, "grad_norm": 0.3620319962501526, "learning_rate": 2.6188202109048047e-07, "loss": 0.009666244499385357, "memory(GiB)": 22.66, "step": 27757, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.9017314751648637, "grad_norm": 0.33372363448143005, "learning_rate": 2.617104875366766e-07, "loss": 0.007718295324593782, "memory(GiB)": 22.66, "step": 27758, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.956663 }, { "epoch": 0.9017639606276191, "grad_norm": 0.4082428216934204, "learning_rate": 2.6153900866904115e-07, "loss": 0.009280906058847904, "memory(GiB)": 22.66, "step": 27759, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.9017964460903746, "grad_norm": 0.26699575781822205, "learning_rate": 2.613675844895519e-07, "loss": 0.004633290693163872, "memory(GiB)": 22.66, "step": 27760, "token_acc": 1.0, "train_speed(iter/s)": 0.956674 }, { "epoch": 0.9018289315531299, "grad_norm": 0.35220611095428467, "learning_rate": 2.6119621500018843e-07, "loss": 0.01261405274271965, "memory(GiB)": 22.66, "step": 27761, "token_acc": 0.996, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.9018614170158854, "grad_norm": 0.37399205565452576, "learning_rate": 2.6102490020292636e-07, "loss": 0.01043619029223919, "memory(GiB)": 22.66, "step": 27762, "token_acc": 0.9802371541501976, "train_speed(iter/s)": 0.956686 }, { "epoch": 0.9018939024786408, "grad_norm": 0.4009850025177002, "learning_rate": 2.608536400997458e-07, "loss": 0.014690113253891468, "memory(GiB)": 22.66, "step": 27763, "token_acc": 1.0, "train_speed(iter/s)": 0.956692 }, { "epoch": 0.9019263879413962, "grad_norm": 0.3762221038341522, "learning_rate": 2.6068243469262177e-07, "loss": 0.013473881408572197, "memory(GiB)": 22.66, "step": 27764, "token_acc": 1.0, "train_speed(iter/s)": 0.956698 }, { "epoch": 0.9019588734041516, "grad_norm": 0.29696300625801086, "learning_rate": 2.605112839835311e-07, "loss": 0.010512599721550941, "memory(GiB)": 22.66, "step": 27765, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.9019913588669071, "grad_norm": 0.8807867765426636, "learning_rate": 2.6034018797444714e-07, "loss": 0.02115572988986969, "memory(GiB)": 22.66, "step": 27766, "token_acc": 1.0, "train_speed(iter/s)": 0.956712 }, { "epoch": 0.9020238443296624, "grad_norm": 0.36974382400512695, "learning_rate": 2.601691466673467e-07, "loss": 0.011583012528717518, "memory(GiB)": 22.66, "step": 27767, "token_acc": 1.0, "train_speed(iter/s)": 0.95672 }, { "epoch": 0.9020563297924179, "grad_norm": 0.347353458404541, "learning_rate": 2.5999816006420266e-07, "loss": 0.007424274459481239, "memory(GiB)": 22.66, "step": 27768, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.9020888152551733, "grad_norm": 0.3056192398071289, "learning_rate": 2.5982722816698944e-07, "loss": 0.008888482116162777, "memory(GiB)": 22.66, "step": 27769, "token_acc": 1.0, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.9021213007179287, "grad_norm": 0.2208566665649414, "learning_rate": 2.596563509776784e-07, "loss": 0.008907357230782509, "memory(GiB)": 22.66, "step": 27770, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.9021537861806841, "grad_norm": 0.32057711482048035, "learning_rate": 2.594855284982423e-07, "loss": 0.008590071462094784, "memory(GiB)": 22.66, "step": 27771, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.95675 }, { "epoch": 0.9021862716434396, "grad_norm": 0.42833977937698364, "learning_rate": 2.5931476073065345e-07, "loss": 0.01419634185731411, "memory(GiB)": 22.66, "step": 27772, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.9022187571061949, "grad_norm": 0.4787505865097046, "learning_rate": 2.591440476768814e-07, "loss": 0.015519567765295506, "memory(GiB)": 22.66, "step": 27773, "token_acc": 1.0, "train_speed(iter/s)": 0.956764 }, { "epoch": 0.9022512425689504, "grad_norm": 0.23559686541557312, "learning_rate": 2.5897338933889794e-07, "loss": 0.006262483540922403, "memory(GiB)": 22.66, "step": 27774, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.9022837280317058, "grad_norm": 0.4490550458431244, "learning_rate": 2.5880278571867146e-07, "loss": 0.008292124606668949, "memory(GiB)": 22.66, "step": 27775, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.9023162134944612, "grad_norm": 0.30526816844940186, "learning_rate": 2.586322368181715e-07, "loss": 0.011185027658939362, "memory(GiB)": 22.66, "step": 27776, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956786 }, { "epoch": 0.9023486989572167, "grad_norm": 0.3022310435771942, "learning_rate": 2.5846174263936595e-07, "loss": 0.010454666800796986, "memory(GiB)": 22.66, "step": 27777, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.9023811844199721, "grad_norm": 0.296805739402771, "learning_rate": 2.5829130318422324e-07, "loss": 0.009354040957987309, "memory(GiB)": 22.66, "step": 27778, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956802 }, { "epoch": 0.9024136698827275, "grad_norm": 0.3208896517753601, "learning_rate": 2.581209184547101e-07, "loss": 0.011070381850004196, "memory(GiB)": 22.66, "step": 27779, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956809 }, { "epoch": 0.9024461553454829, "grad_norm": 0.4096938967704773, "learning_rate": 2.5795058845279276e-07, "loss": 0.010269053280353546, "memory(GiB)": 22.66, "step": 27780, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.9024786408082384, "grad_norm": 0.48173388838768005, "learning_rate": 2.5778031318043686e-07, "loss": 0.014788763597607613, "memory(GiB)": 22.66, "step": 27781, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956823 }, { "epoch": 0.9025111262709937, "grad_norm": 0.4180133640766144, "learning_rate": 2.5761009263960914e-07, "loss": 0.01286840159446001, "memory(GiB)": 22.66, "step": 27782, "token_acc": 1.0, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.9025436117337492, "grad_norm": 0.47928228974342346, "learning_rate": 2.574399268322725e-07, "loss": 0.008922905661165714, "memory(GiB)": 22.66, "step": 27783, "token_acc": 1.0, "train_speed(iter/s)": 0.956838 }, { "epoch": 0.9025760971965046, "grad_norm": 0.3243679404258728, "learning_rate": 2.5726981576039147e-07, "loss": 0.011510586366057396, "memory(GiB)": 22.66, "step": 27784, "token_acc": 1.0, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.90260858265926, "grad_norm": 0.2947304844856262, "learning_rate": 2.5709975942592837e-07, "loss": 0.006556645035743713, "memory(GiB)": 22.66, "step": 27785, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956852 }, { "epoch": 0.9026410681220154, "grad_norm": 0.4020639657974243, "learning_rate": 2.569297578308483e-07, "loss": 0.011182479560375214, "memory(GiB)": 22.66, "step": 27786, "token_acc": 1.0, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.9026735535847709, "grad_norm": 0.5604761838912964, "learning_rate": 2.5675981097711135e-07, "loss": 0.014472149312496185, "memory(GiB)": 22.66, "step": 27787, "token_acc": 1.0, "train_speed(iter/s)": 0.956864 }, { "epoch": 0.9027060390475262, "grad_norm": 0.3539840877056122, "learning_rate": 2.5658991886667873e-07, "loss": 0.014424871653318405, "memory(GiB)": 22.66, "step": 27788, "token_acc": 1.0, "train_speed(iter/s)": 0.956869 }, { "epoch": 0.9027385245102817, "grad_norm": 0.30063673853874207, "learning_rate": 2.564200815015133e-07, "loss": 0.014066973701119423, "memory(GiB)": 22.66, "step": 27789, "token_acc": 1.0, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.902771009973037, "grad_norm": 0.27384862303733826, "learning_rate": 2.562502988835719e-07, "loss": 0.010138045996427536, "memory(GiB)": 22.66, "step": 27790, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.9028034954357925, "grad_norm": 0.3175945281982422, "learning_rate": 2.5608057101481784e-07, "loss": 0.011712776497006416, "memory(GiB)": 22.66, "step": 27791, "token_acc": 0.996, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.9028359808985479, "grad_norm": 0.32029077410697937, "learning_rate": 2.5591089789720694e-07, "loss": 0.006451837718486786, "memory(GiB)": 22.66, "step": 27792, "token_acc": 1.0, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.9028684663613034, "grad_norm": 0.31535065174102783, "learning_rate": 2.557412795326997e-07, "loss": 0.010077107697725296, "memory(GiB)": 22.66, "step": 27793, "token_acc": 1.0, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.9029009518240587, "grad_norm": 0.3152673542499542, "learning_rate": 2.555717159232518e-07, "loss": 0.01172107644379139, "memory(GiB)": 22.66, "step": 27794, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956903 }, { "epoch": 0.9029334372868142, "grad_norm": 0.36011362075805664, "learning_rate": 2.554022070708212e-07, "loss": 0.009929897263646126, "memory(GiB)": 22.66, "step": 27795, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.9029659227495695, "grad_norm": 0.218838170170784, "learning_rate": 2.552327529773641e-07, "loss": 0.005939612165093422, "memory(GiB)": 22.66, "step": 27796, "token_acc": 1.0, "train_speed(iter/s)": 0.956914 }, { "epoch": 0.902998408212325, "grad_norm": 0.3189839720726013, "learning_rate": 2.550633536448371e-07, "loss": 0.006659570150077343, "memory(GiB)": 22.66, "step": 27797, "token_acc": 1.0, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.9030308936750804, "grad_norm": 0.3677981197834015, "learning_rate": 2.548940090751939e-07, "loss": 0.012005047872662544, "memory(GiB)": 22.66, "step": 27798, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.956925 }, { "epoch": 0.9030633791378359, "grad_norm": 0.3375381529331207, "learning_rate": 2.5472471927038945e-07, "loss": 0.017182808369398117, "memory(GiB)": 22.66, "step": 27799, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956931 }, { "epoch": 0.9030958646005912, "grad_norm": 0.281144380569458, "learning_rate": 2.545554842323772e-07, "loss": 0.004714217968285084, "memory(GiB)": 22.66, "step": 27800, "token_acc": 1.0, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.9031283500633467, "grad_norm": 0.2809852957725525, "learning_rate": 2.543863039631123e-07, "loss": 0.0076200589537620544, "memory(GiB)": 22.66, "step": 27801, "token_acc": 1.0, "train_speed(iter/s)": 0.956942 }, { "epoch": 0.903160835526102, "grad_norm": 0.3349752128124237, "learning_rate": 2.5421717846454473e-07, "loss": 0.00889819860458374, "memory(GiB)": 22.66, "step": 27802, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956948 }, { "epoch": 0.9031933209888575, "grad_norm": 0.3766350746154785, "learning_rate": 2.5404810773862754e-07, "loss": 0.009475668892264366, "memory(GiB)": 22.66, "step": 27803, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.9032258064516129, "grad_norm": 0.3267945349216461, "learning_rate": 2.538790917873124e-07, "loss": 0.01031399890780449, "memory(GiB)": 22.66, "step": 27804, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956958 }, { "epoch": 0.9032582919143683, "grad_norm": 0.3079865574836731, "learning_rate": 2.5371013061254946e-07, "loss": 0.009598935022950172, "memory(GiB)": 22.66, "step": 27805, "token_acc": 0.995, "train_speed(iter/s)": 0.956964 }, { "epoch": 0.9032907773771237, "grad_norm": 0.41925573348999023, "learning_rate": 2.5354122421628936e-07, "loss": 0.014942320063710213, "memory(GiB)": 22.66, "step": 27806, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95697 }, { "epoch": 0.9033232628398792, "grad_norm": 0.49448832869529724, "learning_rate": 2.5337237260048053e-07, "loss": 0.015420645475387573, "memory(GiB)": 22.66, "step": 27807, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956975 }, { "epoch": 0.9033557483026345, "grad_norm": 0.454528272151947, "learning_rate": 2.53203575767072e-07, "loss": 0.01681016944348812, "memory(GiB)": 22.66, "step": 27808, "token_acc": 1.0, "train_speed(iter/s)": 0.956981 }, { "epoch": 0.90338823376539, "grad_norm": 0.2845172882080078, "learning_rate": 2.530348337180127e-07, "loss": 0.009392918087542057, "memory(GiB)": 22.66, "step": 27809, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.9034207192281454, "grad_norm": 0.3805501163005829, "learning_rate": 2.528661464552501e-07, "loss": 0.013723284937441349, "memory(GiB)": 22.66, "step": 27810, "token_acc": 1.0, "train_speed(iter/s)": 0.956993 }, { "epoch": 0.9034532046909008, "grad_norm": 0.4451642334461212, "learning_rate": 2.5269751398072975e-07, "loss": 0.010359692387282848, "memory(GiB)": 22.66, "step": 27811, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.9034856901536562, "grad_norm": 0.3576701879501343, "learning_rate": 2.5252893629639954e-07, "loss": 0.015947464853525162, "memory(GiB)": 22.66, "step": 27812, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.9035181756164117, "grad_norm": 0.40377601981163025, "learning_rate": 2.5236041340420294e-07, "loss": 0.012448898516595364, "memory(GiB)": 22.66, "step": 27813, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95701 }, { "epoch": 0.903550661079167, "grad_norm": 0.34450218081474304, "learning_rate": 2.5219194530608727e-07, "loss": 0.016309719532728195, "memory(GiB)": 22.66, "step": 27814, "token_acc": 1.0, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.9035831465419225, "grad_norm": 0.2916257977485657, "learning_rate": 2.520235320039954e-07, "loss": 0.010315087623894215, "memory(GiB)": 22.66, "step": 27815, "token_acc": 1.0, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.9036156320046779, "grad_norm": 0.17497295141220093, "learning_rate": 2.518551734998725e-07, "loss": 0.00813212338835001, "memory(GiB)": 22.66, "step": 27816, "token_acc": 1.0, "train_speed(iter/s)": 0.957028 }, { "epoch": 0.9036481174674333, "grad_norm": 0.25882649421691895, "learning_rate": 2.5168686979565984e-07, "loss": 0.0073495348915457726, "memory(GiB)": 22.66, "step": 27817, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957033 }, { "epoch": 0.9036806029301887, "grad_norm": 0.34902581572532654, "learning_rate": 2.5151862089329957e-07, "loss": 0.01425573043525219, "memory(GiB)": 22.66, "step": 27818, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.957039 }, { "epoch": 0.9037130883929442, "grad_norm": 0.42323037981987, "learning_rate": 2.5135042679473644e-07, "loss": 0.011940071359276772, "memory(GiB)": 22.66, "step": 27819, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957045 }, { "epoch": 0.9037455738556995, "grad_norm": 0.321376770734787, "learning_rate": 2.511822875019093e-07, "loss": 0.014165783300995827, "memory(GiB)": 22.66, "step": 27820, "token_acc": 0.99, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.903778059318455, "grad_norm": 0.35113969445228577, "learning_rate": 2.5101420301675947e-07, "loss": 0.017640933394432068, "memory(GiB)": 22.66, "step": 27821, "token_acc": 0.992, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.9038105447812104, "grad_norm": 0.21377946436405182, "learning_rate": 2.5084617334122594e-07, "loss": 0.005485782865434885, "memory(GiB)": 22.66, "step": 27822, "token_acc": 1.0, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.9038430302439658, "grad_norm": 0.9714651703834534, "learning_rate": 2.5067819847724885e-07, "loss": 0.0120764235034585, "memory(GiB)": 22.66, "step": 27823, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957067 }, { "epoch": 0.9038755157067212, "grad_norm": 0.3134087920188904, "learning_rate": 2.505102784267671e-07, "loss": 0.009531687013804913, "memory(GiB)": 22.66, "step": 27824, "token_acc": 1.0, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.9039080011694767, "grad_norm": 0.3262900114059448, "learning_rate": 2.5034241319171815e-07, "loss": 0.008737126365303993, "memory(GiB)": 22.66, "step": 27825, "token_acc": 1.0, "train_speed(iter/s)": 0.957079 }, { "epoch": 0.903940486632232, "grad_norm": 0.31799590587615967, "learning_rate": 2.501746027740393e-07, "loss": 0.013929084874689579, "memory(GiB)": 22.66, "step": 27826, "token_acc": 1.0, "train_speed(iter/s)": 0.957085 }, { "epoch": 0.9039729720949875, "grad_norm": 0.2905180752277374, "learning_rate": 2.500068471756678e-07, "loss": 0.009348198771476746, "memory(GiB)": 22.66, "step": 27827, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.9040054575577429, "grad_norm": 0.43837520480155945, "learning_rate": 2.498391463985389e-07, "loss": 0.01452643983066082, "memory(GiB)": 22.66, "step": 27828, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.957099 }, { "epoch": 0.9040379430204983, "grad_norm": 0.3343067169189453, "learning_rate": 2.496715004445899e-07, "loss": 0.011422804556787014, "memory(GiB)": 22.66, "step": 27829, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957106 }, { "epoch": 0.9040704284832537, "grad_norm": 0.3989446461200714, "learning_rate": 2.4950390931575366e-07, "loss": 0.011819845996797085, "memory(GiB)": 22.66, "step": 27830, "token_acc": 1.0, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.9041029139460092, "grad_norm": 0.3461554944515228, "learning_rate": 2.493363730139647e-07, "loss": 0.009183773770928383, "memory(GiB)": 22.66, "step": 27831, "token_acc": 1.0, "train_speed(iter/s)": 0.957121 }, { "epoch": 0.9041353994087645, "grad_norm": 0.3540217876434326, "learning_rate": 2.4916889154115776e-07, "loss": 0.009689540602266788, "memory(GiB)": 22.66, "step": 27832, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.90416788487152, "grad_norm": 0.35034921765327454, "learning_rate": 2.49001464899265e-07, "loss": 0.008696802891790867, "memory(GiB)": 22.66, "step": 27833, "token_acc": 1.0, "train_speed(iter/s)": 0.957136 }, { "epoch": 0.9042003703342754, "grad_norm": 0.2791554629802704, "learning_rate": 2.4883409309021945e-07, "loss": 0.01122994627803564, "memory(GiB)": 22.66, "step": 27834, "token_acc": 1.0, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.9042328557970308, "grad_norm": 0.3645268976688385, "learning_rate": 2.486667761159517e-07, "loss": 0.008572961203753948, "memory(GiB)": 22.66, "step": 27835, "token_acc": 1.0, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.9042653412597862, "grad_norm": 0.3144690692424774, "learning_rate": 2.48499513978393e-07, "loss": 0.015343556180596352, "memory(GiB)": 22.66, "step": 27836, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957158 }, { "epoch": 0.9042978267225417, "grad_norm": 0.4475899040699005, "learning_rate": 2.483323066794746e-07, "loss": 0.010495180264115334, "memory(GiB)": 22.66, "step": 27837, "token_acc": 0.9927536231884058, "train_speed(iter/s)": 0.957165 }, { "epoch": 0.904330312185297, "grad_norm": 0.502787709236145, "learning_rate": 2.4816515422112666e-07, "loss": 0.015661966055631638, "memory(GiB)": 22.66, "step": 27838, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.9043627976480525, "grad_norm": 0.31727802753448486, "learning_rate": 2.4799805660527644e-07, "loss": 0.010899955406785011, "memory(GiB)": 22.66, "step": 27839, "token_acc": 1.0, "train_speed(iter/s)": 0.95718 }, { "epoch": 0.904395283110808, "grad_norm": 0.2681120038032532, "learning_rate": 2.478310138338541e-07, "loss": 0.007478167302906513, "memory(GiB)": 22.66, "step": 27840, "token_acc": 1.0, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.9044277685735633, "grad_norm": 0.3235320746898651, "learning_rate": 2.476640259087859e-07, "loss": 0.010150820016860962, "memory(GiB)": 22.66, "step": 27841, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.9044602540363188, "grad_norm": 0.446296364068985, "learning_rate": 2.474970928320014e-07, "loss": 0.013888911344110966, "memory(GiB)": 22.66, "step": 27842, "token_acc": 1.0, "train_speed(iter/s)": 0.957202 }, { "epoch": 0.9044927394990742, "grad_norm": 0.37741968035697937, "learning_rate": 2.473302146054257e-07, "loss": 0.011621223762631416, "memory(GiB)": 22.66, "step": 27843, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.9045252249618296, "grad_norm": 0.5208486914634705, "learning_rate": 2.471633912309851e-07, "loss": 0.01224694587290287, "memory(GiB)": 22.66, "step": 27844, "token_acc": 1.0, "train_speed(iter/s)": 0.957217 }, { "epoch": 0.904557710424585, "grad_norm": 0.4125036597251892, "learning_rate": 2.469966227106052e-07, "loss": 0.005800148472189903, "memory(GiB)": 22.66, "step": 27845, "token_acc": 1.0, "train_speed(iter/s)": 0.957222 }, { "epoch": 0.9045901958873405, "grad_norm": 0.3092597723007202, "learning_rate": 2.468299090462095e-07, "loss": 0.008743812330067158, "memory(GiB)": 22.66, "step": 27846, "token_acc": 1.0, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.9046226813500958, "grad_norm": 0.19591954350471497, "learning_rate": 2.466632502397237e-07, "loss": 0.0065530696883797646, "memory(GiB)": 22.66, "step": 27847, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957234 }, { "epoch": 0.9046551668128513, "grad_norm": 0.438082754611969, "learning_rate": 2.464966462930707e-07, "loss": 0.012218071147799492, "memory(GiB)": 22.66, "step": 27848, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.95724 }, { "epoch": 0.9046876522756067, "grad_norm": 0.25888389348983765, "learning_rate": 2.463300972081739e-07, "loss": 0.006647699046880007, "memory(GiB)": 22.66, "step": 27849, "token_acc": 1.0, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.9047201377383621, "grad_norm": 0.3207775056362152, "learning_rate": 2.461636029869541e-07, "loss": 0.007625063415616751, "memory(GiB)": 22.66, "step": 27850, "token_acc": 1.0, "train_speed(iter/s)": 0.957252 }, { "epoch": 0.9047526232011175, "grad_norm": 0.21033385396003723, "learning_rate": 2.459971636313341e-07, "loss": 0.00835131760686636, "memory(GiB)": 22.66, "step": 27851, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957257 }, { "epoch": 0.904785108663873, "grad_norm": 0.3577066957950592, "learning_rate": 2.4583077914323406e-07, "loss": 0.010576806962490082, "memory(GiB)": 22.66, "step": 27852, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957263 }, { "epoch": 0.9048175941266283, "grad_norm": 0.3236771821975708, "learning_rate": 2.4566444952457527e-07, "loss": 0.008580999448895454, "memory(GiB)": 22.66, "step": 27853, "token_acc": 1.0, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.9048500795893838, "grad_norm": 0.26658937335014343, "learning_rate": 2.454981747772761e-07, "loss": 0.008597392588853836, "memory(GiB)": 22.66, "step": 27854, "token_acc": 1.0, "train_speed(iter/s)": 0.957274 }, { "epoch": 0.9048825650521392, "grad_norm": 0.4687611758708954, "learning_rate": 2.453319549032562e-07, "loss": 0.018525000661611557, "memory(GiB)": 22.66, "step": 27855, "token_acc": 0.9948717948717949, "train_speed(iter/s)": 0.95728 }, { "epoch": 0.9049150505148946, "grad_norm": 0.35706794261932373, "learning_rate": 2.451657899044335e-07, "loss": 0.01039208471775055, "memory(GiB)": 22.66, "step": 27856, "token_acc": 1.0, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.90494753597765, "grad_norm": 0.38144049048423767, "learning_rate": 2.449996797827275e-07, "loss": 0.012023995630443096, "memory(GiB)": 22.66, "step": 27857, "token_acc": 1.0, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.9049800214404055, "grad_norm": 0.47315549850463867, "learning_rate": 2.4483362454005346e-07, "loss": 0.01133220549672842, "memory(GiB)": 22.66, "step": 27858, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.9050125069031608, "grad_norm": 0.2498229295015335, "learning_rate": 2.446676241783286e-07, "loss": 0.01070713996887207, "memory(GiB)": 22.66, "step": 27859, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.9050449923659163, "grad_norm": 0.2721485197544098, "learning_rate": 2.4450167869946805e-07, "loss": 0.011727634817361832, "memory(GiB)": 22.66, "step": 27860, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.9050774778286717, "grad_norm": 0.3858266770839691, "learning_rate": 2.4433578810538874e-07, "loss": 0.01057598926126957, "memory(GiB)": 22.66, "step": 27861, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957314 }, { "epoch": 0.9051099632914271, "grad_norm": 0.3515029549598694, "learning_rate": 2.44169952398004e-07, "loss": 0.01228654570877552, "memory(GiB)": 22.66, "step": 27862, "token_acc": 1.0, "train_speed(iter/s)": 0.95732 }, { "epoch": 0.9051424487541825, "grad_norm": 0.2852313816547394, "learning_rate": 2.44004171579228e-07, "loss": 0.006802020128816366, "memory(GiB)": 22.66, "step": 27863, "token_acc": 1.0, "train_speed(iter/s)": 0.957326 }, { "epoch": 0.905174934216938, "grad_norm": 0.25153613090515137, "learning_rate": 2.4383844565097295e-07, "loss": 0.009210946038365364, "memory(GiB)": 22.66, "step": 27864, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.9052074196796933, "grad_norm": 0.2983904480934143, "learning_rate": 2.436727746151535e-07, "loss": 0.010521681979298592, "memory(GiB)": 22.66, "step": 27865, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.9052399051424488, "grad_norm": 0.4049784243106842, "learning_rate": 2.4350715847368147e-07, "loss": 0.013979922980070114, "memory(GiB)": 22.66, "step": 27866, "token_acc": 1.0, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.9052723906052041, "grad_norm": 0.3195970356464386, "learning_rate": 2.4334159722846694e-07, "loss": 0.008913656696677208, "memory(GiB)": 22.66, "step": 27867, "token_acc": 1.0, "train_speed(iter/s)": 0.957355 }, { "epoch": 0.9053048760679596, "grad_norm": 0.3311772048473358, "learning_rate": 2.4317609088142237e-07, "loss": 0.00965938251465559, "memory(GiB)": 22.66, "step": 27868, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.905337361530715, "grad_norm": 0.32540813088417053, "learning_rate": 2.43010639434455e-07, "loss": 0.011531641706824303, "memory(GiB)": 22.66, "step": 27869, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957367 }, { "epoch": 0.9053698469934705, "grad_norm": 0.3814873695373535, "learning_rate": 2.4284524288947844e-07, "loss": 0.016163069754838943, "memory(GiB)": 22.66, "step": 27870, "token_acc": 1.0, "train_speed(iter/s)": 0.957373 }, { "epoch": 0.9054023324562258, "grad_norm": 0.49969881772994995, "learning_rate": 2.426799012483988e-07, "loss": 0.009906978346407413, "memory(GiB)": 22.66, "step": 27871, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.9054348179189813, "grad_norm": 0.21526125073432922, "learning_rate": 2.4251461451312576e-07, "loss": 0.005343221593648195, "memory(GiB)": 22.66, "step": 27872, "token_acc": 1.0, "train_speed(iter/s)": 0.957384 }, { "epoch": 0.9054673033817366, "grad_norm": 0.4000243842601776, "learning_rate": 2.423493826855655e-07, "loss": 0.014726577326655388, "memory(GiB)": 22.66, "step": 27873, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95739 }, { "epoch": 0.9054997888444921, "grad_norm": 0.34210845828056335, "learning_rate": 2.421842057676255e-07, "loss": 0.014567899517714977, "memory(GiB)": 22.66, "step": 27874, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.9055322743072475, "grad_norm": 0.3315453827381134, "learning_rate": 2.420190837612124e-07, "loss": 0.008959472179412842, "memory(GiB)": 22.66, "step": 27875, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.905564759770003, "grad_norm": 0.4402911067008972, "learning_rate": 2.4185401666823315e-07, "loss": 0.009710706770420074, "memory(GiB)": 22.66, "step": 27876, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.9055972452327583, "grad_norm": 0.462240070104599, "learning_rate": 2.416890044905906e-07, "loss": 0.01179090328514576, "memory(GiB)": 22.66, "step": 27877, "token_acc": 1.0, "train_speed(iter/s)": 0.957414 }, { "epoch": 0.9056297306955138, "grad_norm": 0.37515026330947876, "learning_rate": 2.415240472301905e-07, "loss": 0.011879364028573036, "memory(GiB)": 22.66, "step": 27878, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.957419 }, { "epoch": 0.9056622161582691, "grad_norm": 0.3626443147659302, "learning_rate": 2.413591448889357e-07, "loss": 0.012989597395062447, "memory(GiB)": 22.66, "step": 27879, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957425 }, { "epoch": 0.9056947016210246, "grad_norm": 0.4082220196723938, "learning_rate": 2.411942974687309e-07, "loss": 0.017473697662353516, "memory(GiB)": 22.66, "step": 27880, "token_acc": 1.0, "train_speed(iter/s)": 0.957431 }, { "epoch": 0.90572718708378, "grad_norm": 0.36001282930374146, "learning_rate": 2.410295049714784e-07, "loss": 0.009252087213099003, "memory(GiB)": 22.66, "step": 27881, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957437 }, { "epoch": 0.9057596725465354, "grad_norm": 0.1726812869310379, "learning_rate": 2.408647673990788e-07, "loss": 0.007249760441482067, "memory(GiB)": 22.66, "step": 27882, "token_acc": 1.0, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.9057921580092908, "grad_norm": 0.44909894466400146, "learning_rate": 2.407000847534341e-07, "loss": 0.013789936900138855, "memory(GiB)": 22.66, "step": 27883, "token_acc": 1.0, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.9058246434720463, "grad_norm": 0.5011711120605469, "learning_rate": 2.4053545703644486e-07, "loss": 0.01807446777820587, "memory(GiB)": 22.66, "step": 27884, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.9058571289348016, "grad_norm": 0.30841976404190063, "learning_rate": 2.403708842500124e-07, "loss": 0.007945005781948566, "memory(GiB)": 22.66, "step": 27885, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957461 }, { "epoch": 0.9058896143975571, "grad_norm": 0.27194854617118835, "learning_rate": 2.402063663960341e-07, "loss": 0.0161130428314209, "memory(GiB)": 22.66, "step": 27886, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957467 }, { "epoch": 0.9059220998603125, "grad_norm": 0.41631650924682617, "learning_rate": 2.4004190347640954e-07, "loss": 0.013970490545034409, "memory(GiB)": 22.66, "step": 27887, "token_acc": 1.0, "train_speed(iter/s)": 0.957472 }, { "epoch": 0.9059545853230679, "grad_norm": 0.33538877964019775, "learning_rate": 2.398774954930366e-07, "loss": 0.01117146760225296, "memory(GiB)": 22.66, "step": 27888, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.9059870707858233, "grad_norm": 0.29933205246925354, "learning_rate": 2.397131424478144e-07, "loss": 0.008225075900554657, "memory(GiB)": 22.66, "step": 27889, "token_acc": 1.0, "train_speed(iter/s)": 0.957484 }, { "epoch": 0.9060195562485788, "grad_norm": 0.3645623028278351, "learning_rate": 2.3954884434263747e-07, "loss": 0.009550504386425018, "memory(GiB)": 22.66, "step": 27890, "token_acc": 1.0, "train_speed(iter/s)": 0.957489 }, { "epoch": 0.9060520417113341, "grad_norm": 0.2877647876739502, "learning_rate": 2.393846011794038e-07, "loss": 0.008073506876826286, "memory(GiB)": 22.66, "step": 27891, "token_acc": 1.0, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.9060845271740896, "grad_norm": 0.44087401032447815, "learning_rate": 2.3922041296000677e-07, "loss": 0.01233062893152237, "memory(GiB)": 22.66, "step": 27892, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.906117012636845, "grad_norm": 0.2570020258426666, "learning_rate": 2.3905627968634384e-07, "loss": 0.009053859859704971, "memory(GiB)": 22.66, "step": 27893, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.957507 }, { "epoch": 0.9061494980996004, "grad_norm": 0.46347537636756897, "learning_rate": 2.388922013603079e-07, "loss": 0.013037266209721565, "memory(GiB)": 22.66, "step": 27894, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957514 }, { "epoch": 0.9061819835623558, "grad_norm": 0.2192716896533966, "learning_rate": 2.38728177983793e-07, "loss": 0.005927503574639559, "memory(GiB)": 22.66, "step": 27895, "token_acc": 1.0, "train_speed(iter/s)": 0.957521 }, { "epoch": 0.9062144690251113, "grad_norm": 0.453857958316803, "learning_rate": 2.385642095586926e-07, "loss": 0.01803751476109028, "memory(GiB)": 22.66, "step": 27896, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.9062469544878666, "grad_norm": 0.3053169250488281, "learning_rate": 2.384002960868975e-07, "loss": 0.012548821046948433, "memory(GiB)": 22.66, "step": 27897, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.9062794399506221, "grad_norm": 0.23097297549247742, "learning_rate": 2.3823643757030169e-07, "loss": 0.01119060069322586, "memory(GiB)": 22.66, "step": 27898, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.9063119254133775, "grad_norm": 0.3720160126686096, "learning_rate": 2.380726340107953e-07, "loss": 0.010321184061467648, "memory(GiB)": 22.66, "step": 27899, "token_acc": 0.984, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.9063444108761329, "grad_norm": 0.3149207532405853, "learning_rate": 2.37908885410269e-07, "loss": 0.008614098653197289, "memory(GiB)": 22.66, "step": 27900, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957559 }, { "epoch": 0.9063768963388883, "grad_norm": 0.2351628541946411, "learning_rate": 2.3774519177061196e-07, "loss": 0.007674968335777521, "memory(GiB)": 22.66, "step": 27901, "token_acc": 1.0, "train_speed(iter/s)": 0.957567 }, { "epoch": 0.9064093818016438, "grad_norm": 0.3190203011035919, "learning_rate": 2.3758155309371367e-07, "loss": 0.011707013472914696, "memory(GiB)": 22.66, "step": 27902, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957574 }, { "epoch": 0.9064418672643991, "grad_norm": 0.24358102679252625, "learning_rate": 2.3741796938146323e-07, "loss": 0.008103880099952221, "memory(GiB)": 22.66, "step": 27903, "token_acc": 1.0, "train_speed(iter/s)": 0.957581 }, { "epoch": 0.9064743527271546, "grad_norm": 0.31325072050094604, "learning_rate": 2.372544406357491e-07, "loss": 0.009288148954510689, "memory(GiB)": 22.66, "step": 27904, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957587 }, { "epoch": 0.9065068381899101, "grad_norm": 0.3823189437389374, "learning_rate": 2.3709096685845757e-07, "loss": 0.009838877245783806, "memory(GiB)": 22.66, "step": 27905, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957593 }, { "epoch": 0.9065393236526654, "grad_norm": 0.4218984842300415, "learning_rate": 2.369275480514749e-07, "loss": 0.014013886451721191, "memory(GiB)": 22.66, "step": 27906, "token_acc": 1.0, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.9065718091154209, "grad_norm": 0.3614911735057831, "learning_rate": 2.3676418421668847e-07, "loss": 0.0131440544500947, "memory(GiB)": 22.66, "step": 27907, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.9066042945781763, "grad_norm": 0.24127741158008575, "learning_rate": 2.3660087535598398e-07, "loss": 0.006505257450044155, "memory(GiB)": 22.66, "step": 27908, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.9066367800409317, "grad_norm": 0.3886404037475586, "learning_rate": 2.3643762147124494e-07, "loss": 0.01367642916738987, "memory(GiB)": 22.66, "step": 27909, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.9066692655036871, "grad_norm": 0.4033394455909729, "learning_rate": 2.362744225643554e-07, "loss": 0.016768427565693855, "memory(GiB)": 22.66, "step": 27910, "token_acc": 0.993421052631579, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.9067017509664426, "grad_norm": 0.3402431011199951, "learning_rate": 2.3611127863719996e-07, "loss": 0.013448651880025864, "memory(GiB)": 22.66, "step": 27911, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.9067342364291979, "grad_norm": 0.48798519372940063, "learning_rate": 2.35948189691661e-07, "loss": 0.017139557749032974, "memory(GiB)": 22.66, "step": 27912, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.9067667218919534, "grad_norm": 0.3435947895050049, "learning_rate": 2.3578515572962147e-07, "loss": 0.020900091156363487, "memory(GiB)": 22.66, "step": 27913, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.9067992073547088, "grad_norm": 0.29349613189697266, "learning_rate": 2.3562217675296207e-07, "loss": 0.009268107824027538, "memory(GiB)": 22.66, "step": 27914, "token_acc": 1.0, "train_speed(iter/s)": 0.957643 }, { "epoch": 0.9068316928174642, "grad_norm": 0.3584074079990387, "learning_rate": 2.3545925276356352e-07, "loss": 0.010938001796603203, "memory(GiB)": 22.66, "step": 27915, "token_acc": 1.0, "train_speed(iter/s)": 0.957649 }, { "epoch": 0.9068641782802196, "grad_norm": 0.2530207931995392, "learning_rate": 2.352963837633071e-07, "loss": 0.010312803089618683, "memory(GiB)": 22.66, "step": 27916, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957655 }, { "epoch": 0.9068966637429751, "grad_norm": 0.27712777256965637, "learning_rate": 2.3513356975407298e-07, "loss": 0.010901818983256817, "memory(GiB)": 22.66, "step": 27917, "token_acc": 0.9931740614334471, "train_speed(iter/s)": 0.95766 }, { "epoch": 0.9069291492057304, "grad_norm": 0.24000613391399384, "learning_rate": 2.349708107377391e-07, "loss": 0.0069620367139577866, "memory(GiB)": 22.66, "step": 27918, "token_acc": 1.0, "train_speed(iter/s)": 0.957666 }, { "epoch": 0.9069616346684859, "grad_norm": 0.3446429371833801, "learning_rate": 2.3480810671618447e-07, "loss": 0.007000041659921408, "memory(GiB)": 22.66, "step": 27919, "token_acc": 1.0, "train_speed(iter/s)": 0.957673 }, { "epoch": 0.9069941201312413, "grad_norm": 0.40303879976272583, "learning_rate": 2.3464545769128544e-07, "loss": 0.01246360968798399, "memory(GiB)": 22.66, "step": 27920, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.9070266055939967, "grad_norm": 0.529853880405426, "learning_rate": 2.344828636649221e-07, "loss": 0.012511275708675385, "memory(GiB)": 22.66, "step": 27921, "token_acc": 0.9939024390243902, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.9070590910567521, "grad_norm": 0.39833658933639526, "learning_rate": 2.3432032463896858e-07, "loss": 0.014029551297426224, "memory(GiB)": 22.66, "step": 27922, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.95769 }, { "epoch": 0.9070915765195076, "grad_norm": 0.3653293550014496, "learning_rate": 2.3415784061530223e-07, "loss": 0.009105175733566284, "memory(GiB)": 22.66, "step": 27923, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957697 }, { "epoch": 0.9071240619822629, "grad_norm": 0.341981440782547, "learning_rate": 2.3399541159579708e-07, "loss": 0.019593965262174606, "memory(GiB)": 22.66, "step": 27924, "token_acc": 0.9804878048780488, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.9071565474450184, "grad_norm": 0.35183194279670715, "learning_rate": 2.3383303758232832e-07, "loss": 0.014233715832233429, "memory(GiB)": 22.66, "step": 27925, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.9071890329077738, "grad_norm": 0.4635959267616272, "learning_rate": 2.336707185767706e-07, "loss": 0.011716574430465698, "memory(GiB)": 22.66, "step": 27926, "token_acc": 1.0, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.9072215183705292, "grad_norm": 0.48363205790519714, "learning_rate": 2.3350845458099624e-07, "loss": 0.016324292868375778, "memory(GiB)": 22.66, "step": 27927, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.957724 }, { "epoch": 0.9072540038332846, "grad_norm": 0.24244791269302368, "learning_rate": 2.3334624559687936e-07, "loss": 0.008802378550171852, "memory(GiB)": 22.66, "step": 27928, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957731 }, { "epoch": 0.90728648929604, "grad_norm": 0.3932097554206848, "learning_rate": 2.3318409162629006e-07, "loss": 0.006997675634920597, "memory(GiB)": 22.66, "step": 27929, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.9073189747587954, "grad_norm": 0.26535290479660034, "learning_rate": 2.3302199267110138e-07, "loss": 0.006230262573808432, "memory(GiB)": 22.66, "step": 27930, "token_acc": 1.0, "train_speed(iter/s)": 0.957746 }, { "epoch": 0.9073514602215509, "grad_norm": 0.3921038806438446, "learning_rate": 2.3285994873318397e-07, "loss": 0.009401660412549973, "memory(GiB)": 22.66, "step": 27931, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957753 }, { "epoch": 0.9073839456843062, "grad_norm": 0.2689566910266876, "learning_rate": 2.32697959814408e-07, "loss": 0.007952344603836536, "memory(GiB)": 22.66, "step": 27932, "token_acc": 1.0, "train_speed(iter/s)": 0.95776 }, { "epoch": 0.9074164311470617, "grad_norm": 0.37316346168518066, "learning_rate": 2.325360259166426e-07, "loss": 0.013609004206955433, "memory(GiB)": 22.66, "step": 27933, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957767 }, { "epoch": 0.9074489166098171, "grad_norm": 0.3150579631328583, "learning_rate": 2.3237414704175676e-07, "loss": 0.009533651173114777, "memory(GiB)": 22.66, "step": 27934, "token_acc": 1.0, "train_speed(iter/s)": 0.957773 }, { "epoch": 0.9074814020725726, "grad_norm": 0.37824878096580505, "learning_rate": 2.3221232319161902e-07, "loss": 0.01124574989080429, "memory(GiB)": 22.66, "step": 27935, "token_acc": 1.0, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.9075138875353279, "grad_norm": 0.3892219364643097, "learning_rate": 2.3205055436809732e-07, "loss": 0.014227127656340599, "memory(GiB)": 22.66, "step": 27936, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957785 }, { "epoch": 0.9075463729980834, "grad_norm": 0.37053602933883667, "learning_rate": 2.3188884057305794e-07, "loss": 0.009154286235570908, "memory(GiB)": 22.66, "step": 27937, "token_acc": 1.0, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.9075788584608387, "grad_norm": 0.353082537651062, "learning_rate": 2.3172718180836774e-07, "loss": 0.01377265714108944, "memory(GiB)": 22.66, "step": 27938, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.9076113439235942, "grad_norm": 0.32117462158203125, "learning_rate": 2.315655780758924e-07, "loss": 0.0096969585865736, "memory(GiB)": 22.66, "step": 27939, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.9076438293863496, "grad_norm": 0.22273766994476318, "learning_rate": 2.3140402937749773e-07, "loss": 0.008539239875972271, "memory(GiB)": 22.66, "step": 27940, "token_acc": 1.0, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.907676314849105, "grad_norm": 0.38438886404037476, "learning_rate": 2.3124253571504661e-07, "loss": 0.012479490600526333, "memory(GiB)": 22.66, "step": 27941, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957814 }, { "epoch": 0.9077088003118604, "grad_norm": 0.4439863860607147, "learning_rate": 2.3108109709040426e-07, "loss": 0.015060026198625565, "memory(GiB)": 22.66, "step": 27942, "token_acc": 1.0, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.9077412857746159, "grad_norm": 0.383234441280365, "learning_rate": 2.3091971350543308e-07, "loss": 0.009946399368345737, "memory(GiB)": 22.66, "step": 27943, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957826 }, { "epoch": 0.9077737712373712, "grad_norm": 0.2555241286754608, "learning_rate": 2.30758384961996e-07, "loss": 0.008882205002009869, "memory(GiB)": 22.66, "step": 27944, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957832 }, { "epoch": 0.9078062567001267, "grad_norm": 0.27081766724586487, "learning_rate": 2.3059711146195596e-07, "loss": 0.01314135454595089, "memory(GiB)": 22.66, "step": 27945, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957837 }, { "epoch": 0.9078387421628821, "grad_norm": 0.43551239371299744, "learning_rate": 2.3043589300717262e-07, "loss": 0.01288798451423645, "memory(GiB)": 22.66, "step": 27946, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957843 }, { "epoch": 0.9078712276256375, "grad_norm": 0.302357941865921, "learning_rate": 2.302747295995078e-07, "loss": 0.00842484924942255, "memory(GiB)": 22.66, "step": 27947, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.9079037130883929, "grad_norm": 0.2967606484889984, "learning_rate": 2.3011362124082004e-07, "loss": 0.009553586132824421, "memory(GiB)": 22.66, "step": 27948, "token_acc": 1.0, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.9079361985511484, "grad_norm": 0.36711689829826355, "learning_rate": 2.2995256793297115e-07, "loss": 0.01129031740128994, "memory(GiB)": 22.66, "step": 27949, "token_acc": 0.9878048780487805, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.9079686840139037, "grad_norm": 0.4680570662021637, "learning_rate": 2.2979156967781747e-07, "loss": 0.018688753247261047, "memory(GiB)": 22.66, "step": 27950, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957868 }, { "epoch": 0.9080011694766592, "grad_norm": 0.31696686148643494, "learning_rate": 2.2963062647721912e-07, "loss": 0.010650360025465488, "memory(GiB)": 22.66, "step": 27951, "token_acc": 1.0, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.9080336549394146, "grad_norm": 0.47095298767089844, "learning_rate": 2.2946973833303242e-07, "loss": 0.016134943813085556, "memory(GiB)": 22.66, "step": 27952, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957878 }, { "epoch": 0.90806614040217, "grad_norm": 0.3141537010669708, "learning_rate": 2.2930890524711424e-07, "loss": 0.005863727070391178, "memory(GiB)": 22.66, "step": 27953, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957884 }, { "epoch": 0.9080986258649254, "grad_norm": 0.4776257276535034, "learning_rate": 2.2914812722132086e-07, "loss": 0.01961994357407093, "memory(GiB)": 22.66, "step": 27954, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957891 }, { "epoch": 0.9081311113276809, "grad_norm": 0.48967576026916504, "learning_rate": 2.2898740425750855e-07, "loss": 0.016098324209451675, "memory(GiB)": 22.66, "step": 27955, "token_acc": 1.0, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.9081635967904362, "grad_norm": 0.371834933757782, "learning_rate": 2.288267363575314e-07, "loss": 0.00844732765108347, "memory(GiB)": 22.66, "step": 27956, "token_acc": 1.0, "train_speed(iter/s)": 0.957903 }, { "epoch": 0.9081960822531917, "grad_norm": 1.2922090291976929, "learning_rate": 2.2866612352324458e-07, "loss": 0.0101960189640522, "memory(GiB)": 22.66, "step": 27957, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95791 }, { "epoch": 0.9082285677159471, "grad_norm": 0.27220645546913147, "learning_rate": 2.285055657565005e-07, "loss": 0.00874868594110012, "memory(GiB)": 22.66, "step": 27958, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957917 }, { "epoch": 0.9082610531787025, "grad_norm": 0.18914221227169037, "learning_rate": 2.2834506305915383e-07, "loss": 0.006749160587787628, "memory(GiB)": 22.66, "step": 27959, "token_acc": 0.9857142857142858, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.9082935386414579, "grad_norm": 0.3559624254703522, "learning_rate": 2.2818461543305638e-07, "loss": 0.013295813463628292, "memory(GiB)": 22.66, "step": 27960, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957931 }, { "epoch": 0.9083260241042134, "grad_norm": 0.4594677984714508, "learning_rate": 2.2802422288005888e-07, "loss": 0.009962810203433037, "memory(GiB)": 22.66, "step": 27961, "token_acc": 1.0, "train_speed(iter/s)": 0.957939 }, { "epoch": 0.9083585095669687, "grad_norm": 0.41113120317459106, "learning_rate": 2.2786388540201376e-07, "loss": 0.012797646224498749, "memory(GiB)": 22.66, "step": 27962, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957946 }, { "epoch": 0.9083909950297242, "grad_norm": 0.34213125705718994, "learning_rate": 2.2770360300077067e-07, "loss": 0.014944801107048988, "memory(GiB)": 22.66, "step": 27963, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.957952 }, { "epoch": 0.9084234804924796, "grad_norm": 0.27877211570739746, "learning_rate": 2.2754337567818085e-07, "loss": 0.009171799756586552, "memory(GiB)": 22.66, "step": 27964, "token_acc": 1.0, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.908455965955235, "grad_norm": 0.5006707310676575, "learning_rate": 2.2738320343609233e-07, "loss": 0.012702628038823605, "memory(GiB)": 22.66, "step": 27965, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.9084884514179904, "grad_norm": 0.4193805754184723, "learning_rate": 2.272230862763536e-07, "loss": 0.011310297064483166, "memory(GiB)": 22.66, "step": 27966, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957969 }, { "epoch": 0.9085209368807459, "grad_norm": 0.5521244406700134, "learning_rate": 2.2706302420081262e-07, "loss": 0.016819432377815247, "memory(GiB)": 22.66, "step": 27967, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.957975 }, { "epoch": 0.9085534223435013, "grad_norm": 0.24756278097629547, "learning_rate": 2.2690301721131845e-07, "loss": 0.007259658072143793, "memory(GiB)": 22.66, "step": 27968, "token_acc": 1.0, "train_speed(iter/s)": 0.95798 }, { "epoch": 0.9085859078062567, "grad_norm": 0.3623966872692108, "learning_rate": 2.2674306530971525e-07, "loss": 0.015576038509607315, "memory(GiB)": 22.66, "step": 27969, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.957986 }, { "epoch": 0.9086183932690122, "grad_norm": 0.31581851840019226, "learning_rate": 2.2658316849785144e-07, "loss": 0.011602475307881832, "memory(GiB)": 22.66, "step": 27970, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957992 }, { "epoch": 0.9086508787317675, "grad_norm": 0.3269953727722168, "learning_rate": 2.264233267775695e-07, "loss": 0.008950847201049328, "memory(GiB)": 22.66, "step": 27971, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.908683364194523, "grad_norm": 0.43032583594322205, "learning_rate": 2.2626354015071683e-07, "loss": 0.017519602552056313, "memory(GiB)": 22.66, "step": 27972, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.9087158496572784, "grad_norm": 0.4569973051548004, "learning_rate": 2.261038086191375e-07, "loss": 0.017564523965120316, "memory(GiB)": 22.66, "step": 27973, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.9087483351200338, "grad_norm": 0.401238352060318, "learning_rate": 2.2594413218467338e-07, "loss": 0.008470720611512661, "memory(GiB)": 22.66, "step": 27974, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.9087808205827892, "grad_norm": 0.2686106264591217, "learning_rate": 2.257845108491691e-07, "loss": 0.007893780246376991, "memory(GiB)": 22.66, "step": 27975, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.9088133060455447, "grad_norm": 0.3898642063140869, "learning_rate": 2.2562494461446427e-07, "loss": 0.012340886518359184, "memory(GiB)": 22.66, "step": 27976, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.9088457915083, "grad_norm": 0.22479358315467834, "learning_rate": 2.2546543348240413e-07, "loss": 0.005738088395446539, "memory(GiB)": 22.66, "step": 27977, "token_acc": 1.0, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.9088782769710555, "grad_norm": 0.27743738889694214, "learning_rate": 2.2530597745482664e-07, "loss": 0.006040925160050392, "memory(GiB)": 22.66, "step": 27978, "token_acc": 1.0, "train_speed(iter/s)": 0.958038 }, { "epoch": 0.9089107624338109, "grad_norm": 0.4366653859615326, "learning_rate": 2.2514657653357419e-07, "loss": 0.014388718642294407, "memory(GiB)": 22.66, "step": 27979, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.958044 }, { "epoch": 0.9089432478965663, "grad_norm": 0.30672284960746765, "learning_rate": 2.2498723072048478e-07, "loss": 0.00872030295431614, "memory(GiB)": 22.66, "step": 27980, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.958049 }, { "epoch": 0.9089757333593217, "grad_norm": 0.37420645356178284, "learning_rate": 2.2482794001739861e-07, "loss": 0.011711334809660912, "memory(GiB)": 22.66, "step": 27981, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.958055 }, { "epoch": 0.9090082188220772, "grad_norm": 0.3177590072154999, "learning_rate": 2.2466870442615363e-07, "loss": 0.009386787191033363, "memory(GiB)": 22.66, "step": 27982, "token_acc": 0.9892857142857143, "train_speed(iter/s)": 0.958061 }, { "epoch": 0.9090407042848325, "grad_norm": 0.625359833240509, "learning_rate": 2.245095239485884e-07, "loss": 0.012946904636919498, "memory(GiB)": 22.66, "step": 27983, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.958066 }, { "epoch": 0.909073189747588, "grad_norm": 0.42653846740722656, "learning_rate": 2.2435039858653861e-07, "loss": 0.01429704949259758, "memory(GiB)": 22.66, "step": 27984, "token_acc": 1.0, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.9091056752103434, "grad_norm": 0.3029423654079437, "learning_rate": 2.2419132834184121e-07, "loss": 0.010117966681718826, "memory(GiB)": 22.66, "step": 27985, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.9091381606730988, "grad_norm": 0.24125149846076965, "learning_rate": 2.24032313216333e-07, "loss": 0.011122625321149826, "memory(GiB)": 22.66, "step": 27986, "token_acc": 0.9799196787148594, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.9091706461358542, "grad_norm": 0.2297685146331787, "learning_rate": 2.2387335321184868e-07, "loss": 0.00952826626598835, "memory(GiB)": 22.66, "step": 27987, "token_acc": 0.985, "train_speed(iter/s)": 0.958095 }, { "epoch": 0.9092031315986097, "grad_norm": 0.3278895914554596, "learning_rate": 2.237144483302234e-07, "loss": 0.018010536208748817, "memory(GiB)": 22.66, "step": 27988, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.909235617061365, "grad_norm": 0.24309517443180084, "learning_rate": 2.2355559857329013e-07, "loss": 0.009077426046133041, "memory(GiB)": 22.66, "step": 27989, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958109 }, { "epoch": 0.9092681025241205, "grad_norm": 0.32986167073249817, "learning_rate": 2.2339680394288243e-07, "loss": 0.014055022969841957, "memory(GiB)": 22.66, "step": 27990, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.9093005879868759, "grad_norm": 0.3137108385562897, "learning_rate": 2.232380644408333e-07, "loss": 0.009166836738586426, "memory(GiB)": 22.66, "step": 27991, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.958124 }, { "epoch": 0.9093330734496313, "grad_norm": 0.5440956354141235, "learning_rate": 2.2307938006897568e-07, "loss": 0.018124092370271683, "memory(GiB)": 22.66, "step": 27992, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.9093655589123867, "grad_norm": 0.33743801712989807, "learning_rate": 2.2292075082913923e-07, "loss": 0.014815171249210835, "memory(GiB)": 22.66, "step": 27993, "token_acc": 0.99609375, "train_speed(iter/s)": 0.958139 }, { "epoch": 0.9093980443751422, "grad_norm": 0.4329203963279724, "learning_rate": 2.227621767231558e-07, "loss": 0.010807465761899948, "memory(GiB)": 22.66, "step": 27994, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.9094305298378975, "grad_norm": 0.36030396819114685, "learning_rate": 2.2260365775285509e-07, "loss": 0.01184444036334753, "memory(GiB)": 22.66, "step": 27995, "token_acc": 1.0, "train_speed(iter/s)": 0.958151 }, { "epoch": 0.909463015300653, "grad_norm": 0.660478949546814, "learning_rate": 2.2244519392006724e-07, "loss": 0.010767927393317223, "memory(GiB)": 22.66, "step": 27996, "token_acc": 1.0, "train_speed(iter/s)": 0.958157 }, { "epoch": 0.9094955007634083, "grad_norm": 0.2967352271080017, "learning_rate": 2.2228678522662083e-07, "loss": 0.011296816170215607, "memory(GiB)": 22.66, "step": 27997, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.958162 }, { "epoch": 0.9095279862261638, "grad_norm": 0.2826128602027893, "learning_rate": 2.221284316743444e-07, "loss": 0.008124755695462227, "memory(GiB)": 22.66, "step": 27998, "token_acc": 1.0, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.9095604716889192, "grad_norm": 0.33647334575653076, "learning_rate": 2.2197013326506366e-07, "loss": 0.012050913646817207, "memory(GiB)": 22.66, "step": 27999, "token_acc": 1.0, "train_speed(iter/s)": 0.958174 }, { "epoch": 0.9095929571516747, "grad_norm": 0.4243147075176239, "learning_rate": 2.2181189000060833e-07, "loss": 0.017694171518087387, "memory(GiB)": 22.66, "step": 28000, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95818 }, { "epoch": 0.9095929571516747, "eval_loss": 0.011312633752822876, "eval_runtime": 81.049, "eval_samples_per_second": 122.765, "eval_steps_per_second": 3.837, "eval_token_acc": 0.9954145837516324, "step": 28000 }, { "epoch": 0.90962544261443, "grad_norm": 0.511451244354248, "learning_rate": 2.21653701882803e-07, "loss": 0.014150600880384445, "memory(GiB)": 22.66, "step": 28001, "token_acc": 0.9950678850817224, "train_speed(iter/s)": 0.955179 }, { "epoch": 0.9096579280771855, "grad_norm": 0.434871107339859, "learning_rate": 2.2149556891347512e-07, "loss": 0.014925858937203884, "memory(GiB)": 22.66, "step": 28002, "token_acc": 1.0, "train_speed(iter/s)": 0.955186 }, { "epoch": 0.9096904135399408, "grad_norm": 0.24393713474273682, "learning_rate": 2.2133749109444714e-07, "loss": 0.004766727797687054, "memory(GiB)": 22.66, "step": 28003, "token_acc": 1.0, "train_speed(iter/s)": 0.955192 }, { "epoch": 0.9097228990026963, "grad_norm": 0.20783595740795135, "learning_rate": 2.2117946842754478e-07, "loss": 0.007131379097700119, "memory(GiB)": 22.66, "step": 28004, "token_acc": 1.0, "train_speed(iter/s)": 0.955198 }, { "epoch": 0.9097553844654517, "grad_norm": 0.2733546793460846, "learning_rate": 2.210215009145933e-07, "loss": 0.011757667176425457, "memory(GiB)": 22.66, "step": 28005, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955204 }, { "epoch": 0.9097878699282071, "grad_norm": 0.2856059968471527, "learning_rate": 2.208635885574134e-07, "loss": 0.010944589972496033, "memory(GiB)": 22.66, "step": 28006, "token_acc": 1.0, "train_speed(iter/s)": 0.95521 }, { "epoch": 0.9098203553909625, "grad_norm": 0.43487662076950073, "learning_rate": 2.207057313578298e-07, "loss": 0.014781329780817032, "memory(GiB)": 22.66, "step": 28007, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955216 }, { "epoch": 0.909852840853718, "grad_norm": 0.4965759515762329, "learning_rate": 2.205479293176621e-07, "loss": 0.020365171134471893, "memory(GiB)": 22.66, "step": 28008, "token_acc": 1.0, "train_speed(iter/s)": 0.955221 }, { "epoch": 0.9098853263164733, "grad_norm": 0.3190036416053772, "learning_rate": 2.2039018243873334e-07, "loss": 0.01258141826838255, "memory(GiB)": 22.66, "step": 28009, "token_acc": 1.0, "train_speed(iter/s)": 0.955227 }, { "epoch": 0.9099178117792288, "grad_norm": 0.27365782856941223, "learning_rate": 2.2023249072286312e-07, "loss": 0.012127317488193512, "memory(GiB)": 22.66, "step": 28010, "token_acc": 0.9855595667870036, "train_speed(iter/s)": 0.955233 }, { "epoch": 0.9099502972419842, "grad_norm": 0.30539679527282715, "learning_rate": 2.2007485417187224e-07, "loss": 0.0070501365698874, "memory(GiB)": 22.66, "step": 28011, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955238 }, { "epoch": 0.9099827827047396, "grad_norm": 0.27039965987205505, "learning_rate": 2.1991727278757923e-07, "loss": 0.009637832641601562, "memory(GiB)": 22.66, "step": 28012, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955244 }, { "epoch": 0.910015268167495, "grad_norm": 0.25665417313575745, "learning_rate": 2.1975974657180376e-07, "loss": 0.009007018990814686, "memory(GiB)": 22.66, "step": 28013, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955251 }, { "epoch": 0.9100477536302505, "grad_norm": 0.43154722452163696, "learning_rate": 2.196022755263627e-07, "loss": 0.014866216108202934, "memory(GiB)": 22.66, "step": 28014, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955256 }, { "epoch": 0.9100802390930058, "grad_norm": 0.37492281198501587, "learning_rate": 2.1944485965307461e-07, "loss": 0.013620695099234581, "memory(GiB)": 22.66, "step": 28015, "token_acc": 1.0, "train_speed(iter/s)": 0.955261 }, { "epoch": 0.9101127245557613, "grad_norm": 0.290679007768631, "learning_rate": 2.1928749895375524e-07, "loss": 0.00696506118401885, "memory(GiB)": 22.66, "step": 28016, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955267 }, { "epoch": 0.9101452100185167, "grad_norm": 0.22249357402324677, "learning_rate": 2.1913019343022147e-07, "loss": 0.007487993221729994, "memory(GiB)": 22.66, "step": 28017, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955273 }, { "epoch": 0.9101776954812721, "grad_norm": 0.32597437500953674, "learning_rate": 2.1897294308428795e-07, "loss": 0.010841240175068378, "memory(GiB)": 22.66, "step": 28018, "token_acc": 0.992, "train_speed(iter/s)": 0.955279 }, { "epoch": 0.9102101809440275, "grad_norm": 0.2659391760826111, "learning_rate": 2.1881574791777105e-07, "loss": 0.00914710946381092, "memory(GiB)": 22.66, "step": 28019, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.955284 }, { "epoch": 0.910242666406783, "grad_norm": 0.29036280512809753, "learning_rate": 2.1865860793248428e-07, "loss": 0.009789571166038513, "memory(GiB)": 22.66, "step": 28020, "token_acc": 1.0, "train_speed(iter/s)": 0.95529 }, { "epoch": 0.9102751518695383, "grad_norm": 0.4022171199321747, "learning_rate": 2.1850152313024066e-07, "loss": 0.010959722101688385, "memory(GiB)": 22.66, "step": 28021, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955296 }, { "epoch": 0.9103076373322938, "grad_norm": 0.5356190204620361, "learning_rate": 2.183444935128537e-07, "loss": 0.00971397664397955, "memory(GiB)": 22.66, "step": 28022, "token_acc": 1.0, "train_speed(iter/s)": 0.9553 }, { "epoch": 0.9103401227950492, "grad_norm": 0.3386479914188385, "learning_rate": 2.1818751908213533e-07, "loss": 0.009179230779409409, "memory(GiB)": 22.66, "step": 28023, "token_acc": 1.0, "train_speed(iter/s)": 0.955306 }, { "epoch": 0.9103726082578046, "grad_norm": 0.32104963064193726, "learning_rate": 2.180305998398985e-07, "loss": 0.012222404591739178, "memory(GiB)": 22.66, "step": 28024, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.955312 }, { "epoch": 0.91040509372056, "grad_norm": 0.3487330377101898, "learning_rate": 2.1787373578795235e-07, "loss": 0.0132060581818223, "memory(GiB)": 22.66, "step": 28025, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.955317 }, { "epoch": 0.9104375791833155, "grad_norm": 0.3282091021537781, "learning_rate": 2.177169269281093e-07, "loss": 0.010320847854018211, "memory(GiB)": 22.66, "step": 28026, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.9104700646460708, "grad_norm": 0.3091488182544708, "learning_rate": 2.1756017326217683e-07, "loss": 0.010504009202122688, "memory(GiB)": 22.66, "step": 28027, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955329 }, { "epoch": 0.9105025501088263, "grad_norm": 0.4344436526298523, "learning_rate": 2.1740347479196623e-07, "loss": 0.012348191812634468, "memory(GiB)": 22.66, "step": 28028, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.955335 }, { "epoch": 0.9105350355715817, "grad_norm": 0.2542383372783661, "learning_rate": 2.1724683151928494e-07, "loss": 0.014453059062361717, "memory(GiB)": 22.66, "step": 28029, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.955342 }, { "epoch": 0.9105675210343371, "grad_norm": 0.31627681851387024, "learning_rate": 2.1709024344594154e-07, "loss": 0.011270282790064812, "memory(GiB)": 22.66, "step": 28030, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95535 }, { "epoch": 0.9106000064970925, "grad_norm": 0.2724536061286926, "learning_rate": 2.1693371057374236e-07, "loss": 0.010128738358616829, "memory(GiB)": 22.66, "step": 28031, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.910632491959848, "grad_norm": 0.2740914225578308, "learning_rate": 2.1677723290449426e-07, "loss": 0.008799152448773384, "memory(GiB)": 22.66, "step": 28032, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955364 }, { "epoch": 0.9106649774226034, "grad_norm": 0.3688356578350067, "learning_rate": 2.1662081044000306e-07, "loss": 0.013567071408033371, "memory(GiB)": 22.66, "step": 28033, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.955371 }, { "epoch": 0.9106974628853588, "grad_norm": 0.38621917366981506, "learning_rate": 2.164644431820745e-07, "loss": 0.012619517743587494, "memory(GiB)": 22.66, "step": 28034, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.9107299483481143, "grad_norm": 0.34258028864860535, "learning_rate": 2.163081311325138e-07, "loss": 0.015041143633425236, "memory(GiB)": 22.66, "step": 28035, "token_acc": 1.0, "train_speed(iter/s)": 0.955385 }, { "epoch": 0.9107624338108696, "grad_norm": 0.26570945978164673, "learning_rate": 2.1615187429312402e-07, "loss": 0.007285476196557283, "memory(GiB)": 22.66, "step": 28036, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955372 }, { "epoch": 0.9107949192736251, "grad_norm": 0.35185980796813965, "learning_rate": 2.1599567266570866e-07, "loss": 0.010315832681953907, "memory(GiB)": 22.66, "step": 28037, "token_acc": 1.0, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.9108274047363805, "grad_norm": 0.3827483654022217, "learning_rate": 2.158395262520707e-07, "loss": 0.017082076519727707, "memory(GiB)": 22.66, "step": 28038, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.955384 }, { "epoch": 0.9108598901991359, "grad_norm": 0.35271093249320984, "learning_rate": 2.1568343505401323e-07, "loss": 0.011258463375270367, "memory(GiB)": 22.66, "step": 28039, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95539 }, { "epoch": 0.9108923756618913, "grad_norm": 0.2865050435066223, "learning_rate": 2.1552739907333586e-07, "loss": 0.008716823533177376, "memory(GiB)": 22.66, "step": 28040, "token_acc": 1.0, "train_speed(iter/s)": 0.955395 }, { "epoch": 0.9109248611246468, "grad_norm": 0.30793434381484985, "learning_rate": 2.1537141831184048e-07, "loss": 0.01011726912111044, "memory(GiB)": 22.66, "step": 28041, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.9109573465874021, "grad_norm": 0.40569472312927246, "learning_rate": 2.1521549277132736e-07, "loss": 0.01259095873683691, "memory(GiB)": 22.66, "step": 28042, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.955407 }, { "epoch": 0.9109898320501576, "grad_norm": 0.9567397236824036, "learning_rate": 2.1505962245359612e-07, "loss": 0.016558267176151276, "memory(GiB)": 22.66, "step": 28043, "token_acc": 1.0, "train_speed(iter/s)": 0.955413 }, { "epoch": 0.911022317512913, "grad_norm": 0.3215145468711853, "learning_rate": 2.1490380736044536e-07, "loss": 0.010180521756410599, "memory(GiB)": 22.66, "step": 28044, "token_acc": 1.0, "train_speed(iter/s)": 0.955419 }, { "epoch": 0.9110548029756684, "grad_norm": 0.32874801754951477, "learning_rate": 2.1474804749367362e-07, "loss": 0.012004535645246506, "memory(GiB)": 22.66, "step": 28045, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.955425 }, { "epoch": 0.9110872884384238, "grad_norm": 0.3500176668167114, "learning_rate": 2.145923428550789e-07, "loss": 0.008794104680418968, "memory(GiB)": 22.66, "step": 28046, "token_acc": 1.0, "train_speed(iter/s)": 0.955431 }, { "epoch": 0.9111197739011793, "grad_norm": 0.31967079639434814, "learning_rate": 2.1443669344645812e-07, "loss": 0.014599422924220562, "memory(GiB)": 22.66, "step": 28047, "token_acc": 1.0, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.9111522593639346, "grad_norm": 0.34434887766838074, "learning_rate": 2.1428109926960704e-07, "loss": 0.01048879325389862, "memory(GiB)": 22.66, "step": 28048, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955442 }, { "epoch": 0.9111847448266901, "grad_norm": 0.26970139145851135, "learning_rate": 2.1412556032632147e-07, "loss": 0.009823290631175041, "memory(GiB)": 22.66, "step": 28049, "token_acc": 1.0, "train_speed(iter/s)": 0.955447 }, { "epoch": 0.9112172302894455, "grad_norm": 0.35996773838996887, "learning_rate": 2.139700766183972e-07, "loss": 0.01021926011890173, "memory(GiB)": 22.66, "step": 28050, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955453 }, { "epoch": 0.9112497157522009, "grad_norm": 0.17148758471012115, "learning_rate": 2.1381464814762832e-07, "loss": 0.006066394504159689, "memory(GiB)": 22.66, "step": 28051, "token_acc": 1.0, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.9112822012149563, "grad_norm": 0.3324435353279114, "learning_rate": 2.1365927491581007e-07, "loss": 0.010511571541428566, "memory(GiB)": 22.66, "step": 28052, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.9113146866777118, "grad_norm": 0.45033159852027893, "learning_rate": 2.1350395692473269e-07, "loss": 0.013394990935921669, "memory(GiB)": 22.66, "step": 28053, "token_acc": 1.0, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.9113471721404671, "grad_norm": 0.5674231052398682, "learning_rate": 2.1334869417619197e-07, "loss": 0.0125386081635952, "memory(GiB)": 22.66, "step": 28054, "token_acc": 0.9892086330935251, "train_speed(iter/s)": 0.955477 }, { "epoch": 0.9113796576032226, "grad_norm": 0.2932415306568146, "learning_rate": 2.1319348667197647e-07, "loss": 0.00967970211058855, "memory(GiB)": 22.66, "step": 28055, "token_acc": 1.0, "train_speed(iter/s)": 0.955483 }, { "epoch": 0.911412143065978, "grad_norm": 0.2722916007041931, "learning_rate": 2.130383344138809e-07, "loss": 0.008694449439644814, "memory(GiB)": 22.66, "step": 28056, "token_acc": 1.0, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.9114446285287334, "grad_norm": 0.4337981641292572, "learning_rate": 2.1288323740369376e-07, "loss": 0.0137894656509161, "memory(GiB)": 22.66, "step": 28057, "token_acc": 0.9853479853479854, "train_speed(iter/s)": 0.955498 }, { "epoch": 0.9114771139914888, "grad_norm": 0.37426814436912537, "learning_rate": 2.1272819564320645e-07, "loss": 0.013083148747682571, "memory(GiB)": 22.66, "step": 28058, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.9115095994542443, "grad_norm": 0.3557678759098053, "learning_rate": 2.12573209134207e-07, "loss": 0.01198648288846016, "memory(GiB)": 22.66, "step": 28059, "token_acc": 1.0, "train_speed(iter/s)": 0.955513 }, { "epoch": 0.9115420849169996, "grad_norm": 0.593853235244751, "learning_rate": 2.1241827787848446e-07, "loss": 0.021897226572036743, "memory(GiB)": 22.66, "step": 28060, "token_acc": 0.9858490566037735, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.9115745703797551, "grad_norm": 0.21315763890743256, "learning_rate": 2.1226340187782746e-07, "loss": 0.006141985766589642, "memory(GiB)": 22.66, "step": 28061, "token_acc": 1.0, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.9116070558425104, "grad_norm": 0.2990134656429291, "learning_rate": 2.1210858113402343e-07, "loss": 0.010640067979693413, "memory(GiB)": 22.66, "step": 28062, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.9116395413052659, "grad_norm": 0.32710492610931396, "learning_rate": 2.119538156488593e-07, "loss": 0.01212066225707531, "memory(GiB)": 22.66, "step": 28063, "token_acc": 0.9884169884169884, "train_speed(iter/s)": 0.955533 }, { "epoch": 0.9116720267680213, "grad_norm": 0.3834071457386017, "learning_rate": 2.117991054241203e-07, "loss": 0.010501986369490623, "memory(GiB)": 22.66, "step": 28064, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.95554 }, { "epoch": 0.9117045122307768, "grad_norm": 0.3049578070640564, "learning_rate": 2.1164445046159275e-07, "loss": 0.011690528132021427, "memory(GiB)": 22.66, "step": 28065, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.9117369976935321, "grad_norm": 0.3900378346443176, "learning_rate": 2.1148985076306195e-07, "loss": 0.012848982587456703, "memory(GiB)": 22.66, "step": 28066, "token_acc": 0.995, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.9117694831562876, "grad_norm": 0.47306305170059204, "learning_rate": 2.1133530633031196e-07, "loss": 0.010572114028036594, "memory(GiB)": 22.66, "step": 28067, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955557 }, { "epoch": 0.911801968619043, "grad_norm": 0.25043824315071106, "learning_rate": 2.1118081716512583e-07, "loss": 0.0062436494044959545, "memory(GiB)": 22.66, "step": 28068, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.9118344540817984, "grad_norm": 0.4004097878932953, "learning_rate": 2.1102638326928714e-07, "loss": 0.015043983235955238, "memory(GiB)": 22.66, "step": 28069, "token_acc": 0.9804560260586319, "train_speed(iter/s)": 0.955569 }, { "epoch": 0.9118669395445538, "grad_norm": 0.4180492162704468, "learning_rate": 2.108720046445778e-07, "loss": 0.01395072415471077, "memory(GiB)": 22.66, "step": 28070, "token_acc": 1.0, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.9118994250073093, "grad_norm": 0.32097333669662476, "learning_rate": 2.1071768129278025e-07, "loss": 0.01760260760784149, "memory(GiB)": 22.66, "step": 28071, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.9119319104700646, "grad_norm": 0.35467424988746643, "learning_rate": 2.105634132156753e-07, "loss": 0.012856082990765572, "memory(GiB)": 22.66, "step": 28072, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.955585 }, { "epoch": 0.9119643959328201, "grad_norm": 0.29862353205680847, "learning_rate": 2.1040920041504265e-07, "loss": 0.009863731451332569, "memory(GiB)": 22.66, "step": 28073, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.9119968813955754, "grad_norm": 0.2992170453071594, "learning_rate": 2.1025504289266308e-07, "loss": 0.012189636006951332, "memory(GiB)": 22.66, "step": 28074, "token_acc": 1.0, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.9120293668583309, "grad_norm": 0.2531638443470001, "learning_rate": 2.1010094065031572e-07, "loss": 0.0072363922372460365, "memory(GiB)": 22.66, "step": 28075, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.9120618523210863, "grad_norm": 0.4316333532333374, "learning_rate": 2.099468936897786e-07, "loss": 0.013679218478500843, "memory(GiB)": 22.66, "step": 28076, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.9120943377838417, "grad_norm": 0.3281194269657135, "learning_rate": 2.0979290201283032e-07, "loss": 0.00923663005232811, "memory(GiB)": 22.66, "step": 28077, "token_acc": 1.0, "train_speed(iter/s)": 0.955614 }, { "epoch": 0.9121268232465971, "grad_norm": 0.32164114713668823, "learning_rate": 2.0963896562124665e-07, "loss": 0.008689546957612038, "memory(GiB)": 22.66, "step": 28078, "token_acc": 1.0, "train_speed(iter/s)": 0.955619 }, { "epoch": 0.9121593087093526, "grad_norm": 0.24895724654197693, "learning_rate": 2.0948508451680615e-07, "loss": 0.007048220373690128, "memory(GiB)": 22.66, "step": 28079, "token_acc": 1.0, "train_speed(iter/s)": 0.955625 }, { "epoch": 0.9121917941721079, "grad_norm": 0.26369360089302063, "learning_rate": 2.0933125870128356e-07, "loss": 0.010538166388869286, "memory(GiB)": 22.66, "step": 28080, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955631 }, { "epoch": 0.9122242796348634, "grad_norm": 0.4210728704929352, "learning_rate": 2.0917748817645466e-07, "loss": 0.009742009453475475, "memory(GiB)": 22.66, "step": 28081, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955638 }, { "epoch": 0.9122567650976188, "grad_norm": 0.48347216844558716, "learning_rate": 2.0902377294409472e-07, "loss": 0.015555750578641891, "memory(GiB)": 22.66, "step": 28082, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.955643 }, { "epoch": 0.9122892505603742, "grad_norm": 0.39271724224090576, "learning_rate": 2.0887011300597558e-07, "loss": 0.008069988340139389, "memory(GiB)": 22.66, "step": 28083, "token_acc": 0.995, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.9123217360231296, "grad_norm": 0.5808455944061279, "learning_rate": 2.087165083638737e-07, "loss": 0.012447825632989407, "memory(GiB)": 22.66, "step": 28084, "token_acc": 1.0, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.9123542214858851, "grad_norm": 0.2960348427295685, "learning_rate": 2.0856295901956037e-07, "loss": 0.008172018453478813, "memory(GiB)": 22.66, "step": 28085, "token_acc": 0.9966996699669967, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.9123867069486404, "grad_norm": 0.29971814155578613, "learning_rate": 2.0840946497480807e-07, "loss": 0.012203818187117577, "memory(GiB)": 22.66, "step": 28086, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955665 }, { "epoch": 0.9124191924113959, "grad_norm": 0.3969900906085968, "learning_rate": 2.0825602623138764e-07, "loss": 0.013287331908941269, "memory(GiB)": 22.66, "step": 28087, "token_acc": 1.0, "train_speed(iter/s)": 0.955671 }, { "epoch": 0.9124516778741513, "grad_norm": 0.36658233404159546, "learning_rate": 2.081026427910704e-07, "loss": 0.011939093470573425, "memory(GiB)": 22.66, "step": 28088, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.9124841633369067, "grad_norm": 0.33288639783859253, "learning_rate": 2.0794931465562661e-07, "loss": 0.015132324770092964, "memory(GiB)": 22.66, "step": 28089, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.9125166487996621, "grad_norm": 0.3969099223613739, "learning_rate": 2.0779604182682712e-07, "loss": 0.012761158868670464, "memory(GiB)": 22.66, "step": 28090, "token_acc": 0.9815668202764977, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.9125491342624176, "grad_norm": 0.2704465687274933, "learning_rate": 2.0764282430643823e-07, "loss": 0.012631071731448174, "memory(GiB)": 22.66, "step": 28091, "token_acc": 1.0, "train_speed(iter/s)": 0.955696 }, { "epoch": 0.9125816197251729, "grad_norm": 0.2489163875579834, "learning_rate": 2.0748966209623023e-07, "loss": 0.007047679275274277, "memory(GiB)": 22.66, "step": 28092, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.9126141051879284, "grad_norm": 0.29279080033302307, "learning_rate": 2.0733655519797058e-07, "loss": 0.007026670500636101, "memory(GiB)": 22.66, "step": 28093, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.9126465906506838, "grad_norm": 0.3160248100757599, "learning_rate": 2.0718350361342622e-07, "loss": 0.009973220527172089, "memory(GiB)": 22.66, "step": 28094, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955717 }, { "epoch": 0.9126790761134392, "grad_norm": 0.3954665958881378, "learning_rate": 2.0703050734436292e-07, "loss": 0.018142297863960266, "memory(GiB)": 22.66, "step": 28095, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955722 }, { "epoch": 0.9127115615761947, "grad_norm": 0.3184889554977417, "learning_rate": 2.068775663925471e-07, "loss": 0.009502191096544266, "memory(GiB)": 22.66, "step": 28096, "token_acc": 0.992, "train_speed(iter/s)": 0.955728 }, { "epoch": 0.9127440470389501, "grad_norm": 0.3896903395652771, "learning_rate": 2.0672468075974395e-07, "loss": 0.009984992444515228, "memory(GiB)": 22.66, "step": 28097, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955734 }, { "epoch": 0.9127765325017055, "grad_norm": 0.24496589601039886, "learning_rate": 2.0657185044771765e-07, "loss": 0.00921641755849123, "memory(GiB)": 22.66, "step": 28098, "token_acc": 1.0, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.9128090179644609, "grad_norm": 0.24959242343902588, "learning_rate": 2.064190754582329e-07, "loss": 0.008669372648000717, "memory(GiB)": 22.66, "step": 28099, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.9128415034272164, "grad_norm": 0.29100432991981506, "learning_rate": 2.0626635579305166e-07, "loss": 0.008197062648832798, "memory(GiB)": 22.66, "step": 28100, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955752 }, { "epoch": 0.9128739888899717, "grad_norm": 0.3180622160434723, "learning_rate": 2.061136914539369e-07, "loss": 0.010454856790602207, "memory(GiB)": 22.66, "step": 28101, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.9129064743527272, "grad_norm": 0.39907318353652954, "learning_rate": 2.0596108244265111e-07, "loss": 0.010767238214612007, "memory(GiB)": 22.66, "step": 28102, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.9129389598154826, "grad_norm": 0.2742573618888855, "learning_rate": 2.058085287609557e-07, "loss": 0.00932107213884592, "memory(GiB)": 22.66, "step": 28103, "token_acc": 1.0, "train_speed(iter/s)": 0.955769 }, { "epoch": 0.912971445278238, "grad_norm": 0.3534959852695465, "learning_rate": 2.0565603041061034e-07, "loss": 0.009383803233504295, "memory(GiB)": 22.66, "step": 28104, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.9130039307409934, "grad_norm": 0.2949480712413788, "learning_rate": 2.055035873933764e-07, "loss": 0.008070390671491623, "memory(GiB)": 22.66, "step": 28105, "token_acc": 1.0, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.9130364162037489, "grad_norm": 0.22579573094844818, "learning_rate": 2.0535119971101137e-07, "loss": 0.01028625387698412, "memory(GiB)": 22.66, "step": 28106, "token_acc": 1.0, "train_speed(iter/s)": 0.955787 }, { "epoch": 0.9130689016665042, "grad_norm": 0.3146984279155731, "learning_rate": 2.0519886736527605e-07, "loss": 0.011603940278291702, "memory(GiB)": 22.66, "step": 28107, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.9131013871292597, "grad_norm": 0.32649657130241394, "learning_rate": 2.0504659035792684e-07, "loss": 0.010926751419901848, "memory(GiB)": 22.66, "step": 28108, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.9131338725920151, "grad_norm": 0.6605165600776672, "learning_rate": 2.0489436869072344e-07, "loss": 0.010877353139221668, "memory(GiB)": 22.66, "step": 28109, "token_acc": 1.0, "train_speed(iter/s)": 0.955805 }, { "epoch": 0.9131663580547705, "grad_norm": 0.27773964405059814, "learning_rate": 2.0474220236541942e-07, "loss": 0.0070104957558214664, "memory(GiB)": 22.66, "step": 28110, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955811 }, { "epoch": 0.9131988435175259, "grad_norm": 0.48257264494895935, "learning_rate": 2.0459009138377395e-07, "loss": 0.0090871537104249, "memory(GiB)": 22.66, "step": 28111, "token_acc": 1.0, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.9132313289802814, "grad_norm": 0.40528300404548645, "learning_rate": 2.0443803574754174e-07, "loss": 0.011616367846727371, "memory(GiB)": 22.66, "step": 28112, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.955821 }, { "epoch": 0.9132638144430367, "grad_norm": 0.28442683815956116, "learning_rate": 2.042860354584769e-07, "loss": 0.005698493681848049, "memory(GiB)": 22.66, "step": 28113, "token_acc": 1.0, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.9132962999057922, "grad_norm": 0.2895800471305847, "learning_rate": 2.0413409051833477e-07, "loss": 0.010554354637861252, "memory(GiB)": 22.66, "step": 28114, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.955834 }, { "epoch": 0.9133287853685476, "grad_norm": 0.3756742477416992, "learning_rate": 2.0398220092886778e-07, "loss": 0.012225739657878876, "memory(GiB)": 22.66, "step": 28115, "token_acc": 1.0, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.913361270831303, "grad_norm": 0.33541789650917053, "learning_rate": 2.0383036669183065e-07, "loss": 0.008329031988978386, "memory(GiB)": 22.66, "step": 28116, "token_acc": 1.0, "train_speed(iter/s)": 0.955849 }, { "epoch": 0.9133937562940584, "grad_norm": 0.3183199167251587, "learning_rate": 2.036785878089742e-07, "loss": 0.011759399436414242, "memory(GiB)": 22.66, "step": 28117, "token_acc": 1.0, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.9134262417568139, "grad_norm": 0.2757047116756439, "learning_rate": 2.035268642820515e-07, "loss": 0.013992689549922943, "memory(GiB)": 22.66, "step": 28118, "token_acc": 1.0, "train_speed(iter/s)": 0.955863 }, { "epoch": 0.9134587272195692, "grad_norm": 0.24696116149425507, "learning_rate": 2.0337519611281164e-07, "loss": 0.006384718231856823, "memory(GiB)": 22.66, "step": 28119, "token_acc": 1.0, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.9134912126823247, "grad_norm": 0.4742991030216217, "learning_rate": 2.0322358330300662e-07, "loss": 0.020247958600521088, "memory(GiB)": 22.66, "step": 28120, "token_acc": 1.0, "train_speed(iter/s)": 0.955877 }, { "epoch": 0.91352369814508, "grad_norm": 0.3156444728374481, "learning_rate": 2.030720258543861e-07, "loss": 0.009832438081502914, "memory(GiB)": 22.66, "step": 28121, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.955885 }, { "epoch": 0.9135561836078355, "grad_norm": 0.36189004778862, "learning_rate": 2.0292052376869985e-07, "loss": 0.01423482783138752, "memory(GiB)": 22.66, "step": 28122, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955892 }, { "epoch": 0.9135886690705909, "grad_norm": 0.6442111134529114, "learning_rate": 2.0276907704769478e-07, "loss": 0.017299704253673553, "memory(GiB)": 22.66, "step": 28123, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.9559 }, { "epoch": 0.9136211545333464, "grad_norm": 0.3583650588989258, "learning_rate": 2.0261768569312e-07, "loss": 0.01041189394891262, "memory(GiB)": 22.66, "step": 28124, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955907 }, { "epoch": 0.9136536399961017, "grad_norm": 0.34519925713539124, "learning_rate": 2.0246634970672197e-07, "loss": 0.007869941182434559, "memory(GiB)": 22.66, "step": 28125, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955914 }, { "epoch": 0.9136861254588572, "grad_norm": 0.2703447639942169, "learning_rate": 2.023150690902481e-07, "loss": 0.011999879032373428, "memory(GiB)": 22.66, "step": 28126, "token_acc": 0.9895287958115183, "train_speed(iter/s)": 0.955922 }, { "epoch": 0.9137186109216126, "grad_norm": 0.44605231285095215, "learning_rate": 2.021638438454443e-07, "loss": 0.014095299877226353, "memory(GiB)": 22.66, "step": 28127, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.955929 }, { "epoch": 0.913751096384368, "grad_norm": 0.47798454761505127, "learning_rate": 2.0201267397405523e-07, "loss": 0.010788311250507832, "memory(GiB)": 22.66, "step": 28128, "token_acc": 1.0, "train_speed(iter/s)": 0.955937 }, { "epoch": 0.9137835818471234, "grad_norm": 0.41874560713768005, "learning_rate": 2.0186155947782616e-07, "loss": 0.008226200938224792, "memory(GiB)": 22.66, "step": 28129, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.955944 }, { "epoch": 0.9138160673098789, "grad_norm": 0.4911715090274811, "learning_rate": 2.017105003585007e-07, "loss": 0.01324344053864479, "memory(GiB)": 22.66, "step": 28130, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955951 }, { "epoch": 0.9138485527726342, "grad_norm": 0.45714491605758667, "learning_rate": 2.015594966178236e-07, "loss": 0.011878637596964836, "memory(GiB)": 22.66, "step": 28131, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955957 }, { "epoch": 0.9138810382353897, "grad_norm": 0.31212756037712097, "learning_rate": 2.0140854825753563e-07, "loss": 0.01128026656806469, "memory(GiB)": 22.66, "step": 28132, "token_acc": 1.0, "train_speed(iter/s)": 0.955963 }, { "epoch": 0.913913523698145, "grad_norm": 0.27896201610565186, "learning_rate": 2.012576552793799e-07, "loss": 0.007622674573212862, "memory(GiB)": 22.66, "step": 28133, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.9139460091609005, "grad_norm": 0.35863035917282104, "learning_rate": 2.0110681768509833e-07, "loss": 0.012974115088582039, "memory(GiB)": 22.66, "step": 28134, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955975 }, { "epoch": 0.9139784946236559, "grad_norm": 0.3034467101097107, "learning_rate": 2.0095603547643172e-07, "loss": 0.010396683588624, "memory(GiB)": 22.66, "step": 28135, "token_acc": 1.0, "train_speed(iter/s)": 0.955981 }, { "epoch": 0.9140109800864114, "grad_norm": 0.2864445447921753, "learning_rate": 2.0080530865511927e-07, "loss": 0.007684087846428156, "memory(GiB)": 22.66, "step": 28136, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.9140434655491667, "grad_norm": 0.2586193382740021, "learning_rate": 2.0065463722290236e-07, "loss": 0.00905270129442215, "memory(GiB)": 22.66, "step": 28137, "token_acc": 1.0, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.9140759510119222, "grad_norm": 0.4482172727584839, "learning_rate": 2.0050402118151734e-07, "loss": 0.008728528395295143, "memory(GiB)": 22.66, "step": 28138, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.9141084364746775, "grad_norm": 0.2468307763338089, "learning_rate": 2.003534605327051e-07, "loss": 0.006277638487517834, "memory(GiB)": 22.66, "step": 28139, "token_acc": 1.0, "train_speed(iter/s)": 0.956005 }, { "epoch": 0.914140921937433, "grad_norm": 0.27586695551872253, "learning_rate": 2.0020295527820144e-07, "loss": 0.013043523766100407, "memory(GiB)": 22.66, "step": 28140, "token_acc": 1.0, "train_speed(iter/s)": 0.956011 }, { "epoch": 0.9141734074001884, "grad_norm": 0.3757479190826416, "learning_rate": 2.000525054197455e-07, "loss": 0.013240756466984749, "memory(GiB)": 22.66, "step": 28141, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956017 }, { "epoch": 0.9142058928629438, "grad_norm": 0.35854923725128174, "learning_rate": 1.9990211095907096e-07, "loss": 0.011748414486646652, "memory(GiB)": 22.66, "step": 28142, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.9142383783256992, "grad_norm": 0.45480746030807495, "learning_rate": 1.9975177189791474e-07, "loss": 0.017288710922002792, "memory(GiB)": 22.66, "step": 28143, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.9142708637884547, "grad_norm": 0.4309855103492737, "learning_rate": 1.9960148823801373e-07, "loss": 0.0115596242249012, "memory(GiB)": 22.66, "step": 28144, "token_acc": 0.9883268482490273, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.91430334925121, "grad_norm": 0.4271840453147888, "learning_rate": 1.9945125998109994e-07, "loss": 0.007173303514719009, "memory(GiB)": 22.66, "step": 28145, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956042 }, { "epoch": 0.9143358347139655, "grad_norm": 0.3094366490840912, "learning_rate": 1.993010871289086e-07, "loss": 0.011358784511685371, "memory(GiB)": 22.66, "step": 28146, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.9143683201767209, "grad_norm": 0.33162736892700195, "learning_rate": 1.9915096968317228e-07, "loss": 0.01277791615575552, "memory(GiB)": 22.66, "step": 28147, "token_acc": 1.0, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.9144008056394763, "grad_norm": 0.4111834168434143, "learning_rate": 1.9900090764562342e-07, "loss": 0.013450609520077705, "memory(GiB)": 22.66, "step": 28148, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.9144332911022317, "grad_norm": 0.33877912163734436, "learning_rate": 1.9885090101799397e-07, "loss": 0.010069254785776138, "memory(GiB)": 22.66, "step": 28149, "token_acc": 1.0, "train_speed(iter/s)": 0.956066 }, { "epoch": 0.9144657765649872, "grad_norm": 0.3489975333213806, "learning_rate": 1.9870094980201593e-07, "loss": 0.009984808042645454, "memory(GiB)": 22.66, "step": 28150, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956072 }, { "epoch": 0.9144982620277425, "grad_norm": 0.28696170449256897, "learning_rate": 1.9855105399941954e-07, "loss": 0.010373571887612343, "memory(GiB)": 22.66, "step": 28151, "token_acc": 1.0, "train_speed(iter/s)": 0.956077 }, { "epoch": 0.914530747490498, "grad_norm": 0.4297735095024109, "learning_rate": 1.9840121361193399e-07, "loss": 0.018142148852348328, "memory(GiB)": 22.66, "step": 28152, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.9145632329532534, "grad_norm": 0.4050641357898712, "learning_rate": 1.9825142864128955e-07, "loss": 0.011883077211678028, "memory(GiB)": 22.66, "step": 28153, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956089 }, { "epoch": 0.9145957184160088, "grad_norm": 0.3536393940448761, "learning_rate": 1.9810169908921594e-07, "loss": 0.012362202629446983, "memory(GiB)": 22.66, "step": 28154, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.9146282038787642, "grad_norm": 0.46250927448272705, "learning_rate": 1.9795202495743848e-07, "loss": 0.010509848594665527, "memory(GiB)": 22.66, "step": 28155, "token_acc": 1.0, "train_speed(iter/s)": 0.9561 }, { "epoch": 0.9146606893415197, "grad_norm": 0.32232630252838135, "learning_rate": 1.9780240624768686e-07, "loss": 0.012160309590399265, "memory(GiB)": 22.66, "step": 28156, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956106 }, { "epoch": 0.914693174804275, "grad_norm": 0.26082348823547363, "learning_rate": 1.9765284296168697e-07, "loss": 0.006485086865723133, "memory(GiB)": 22.66, "step": 28157, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.9147256602670305, "grad_norm": 0.37324652075767517, "learning_rate": 1.9750333510116516e-07, "loss": 0.010930830612778664, "memory(GiB)": 22.66, "step": 28158, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.956117 }, { "epoch": 0.9147581457297859, "grad_norm": 0.30048245191574097, "learning_rate": 1.9735388266784729e-07, "loss": 0.007035171613097191, "memory(GiB)": 22.66, "step": 28159, "token_acc": 1.0, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.9147906311925413, "grad_norm": 0.4348924160003662, "learning_rate": 1.9720448566345807e-07, "loss": 0.013687760569155216, "memory(GiB)": 22.66, "step": 28160, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956129 }, { "epoch": 0.9148231166552968, "grad_norm": 0.3358961343765259, "learning_rate": 1.970551440897206e-07, "loss": 0.012492550536990166, "memory(GiB)": 22.66, "step": 28161, "token_acc": 1.0, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.9148556021180522, "grad_norm": 0.5101508498191833, "learning_rate": 1.9690585794836015e-07, "loss": 0.015024099498987198, "memory(GiB)": 22.66, "step": 28162, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.9148880875808076, "grad_norm": 0.1888892650604248, "learning_rate": 1.967566272410998e-07, "loss": 0.005956823937594891, "memory(GiB)": 22.66, "step": 28163, "token_acc": 1.0, "train_speed(iter/s)": 0.956146 }, { "epoch": 0.914920573043563, "grad_norm": 0.2250160574913025, "learning_rate": 1.966074519696598e-07, "loss": 0.005034560337662697, "memory(GiB)": 22.66, "step": 28164, "token_acc": 1.0, "train_speed(iter/s)": 0.956151 }, { "epoch": 0.9149530585063185, "grad_norm": 0.33024653792381287, "learning_rate": 1.9645833213576438e-07, "loss": 0.010783964768052101, "memory(GiB)": 22.66, "step": 28165, "token_acc": 1.0, "train_speed(iter/s)": 0.956157 }, { "epoch": 0.9149855439690738, "grad_norm": 0.21208740770816803, "learning_rate": 1.9630926774113157e-07, "loss": 0.00430875550955534, "memory(GiB)": 22.66, "step": 28166, "token_acc": 1.0, "train_speed(iter/s)": 0.956163 }, { "epoch": 0.9150180294318293, "grad_norm": 0.3121720254421234, "learning_rate": 1.96160258787485e-07, "loss": 0.012285804376006126, "memory(GiB)": 22.66, "step": 28167, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.956168 }, { "epoch": 0.9150505148945847, "grad_norm": 0.28272107243537903, "learning_rate": 1.9601130527654222e-07, "loss": 0.010901078581809998, "memory(GiB)": 22.66, "step": 28168, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.9150830003573401, "grad_norm": 0.3277624845504761, "learning_rate": 1.9586240721002346e-07, "loss": 0.011996342800557613, "memory(GiB)": 22.66, "step": 28169, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.9151154858200955, "grad_norm": 0.3248124122619629, "learning_rate": 1.957135645896463e-07, "loss": 0.010584680363535881, "memory(GiB)": 22.66, "step": 28170, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.915147971282851, "grad_norm": 0.3546634614467621, "learning_rate": 1.9556477741712932e-07, "loss": 0.011765740811824799, "memory(GiB)": 22.66, "step": 28171, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.9151804567456063, "grad_norm": 0.37689316272735596, "learning_rate": 1.954160456941895e-07, "loss": 0.008863363415002823, "memory(GiB)": 22.66, "step": 28172, "token_acc": 1.0, "train_speed(iter/s)": 0.956197 }, { "epoch": 0.9152129422083618, "grad_norm": 0.24779346585273743, "learning_rate": 1.9526736942254322e-07, "loss": 0.009543821215629578, "memory(GiB)": 22.66, "step": 28173, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956203 }, { "epoch": 0.9152454276711172, "grad_norm": 0.659080445766449, "learning_rate": 1.9511874860390744e-07, "loss": 0.009009705856442451, "memory(GiB)": 22.66, "step": 28174, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.9152779131338726, "grad_norm": 0.34696757793426514, "learning_rate": 1.9497018323999583e-07, "loss": 0.008924232795834541, "memory(GiB)": 22.66, "step": 28175, "token_acc": 1.0, "train_speed(iter/s)": 0.956217 }, { "epoch": 0.915310398596628, "grad_norm": 0.5516526699066162, "learning_rate": 1.9482167333252367e-07, "loss": 0.0080785620957613, "memory(GiB)": 22.66, "step": 28176, "token_acc": 1.0, "train_speed(iter/s)": 0.956224 }, { "epoch": 0.9153428840593835, "grad_norm": 0.2606215178966522, "learning_rate": 1.9467321888320513e-07, "loss": 0.007294666487723589, "memory(GiB)": 22.66, "step": 28177, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.9153753695221388, "grad_norm": 0.2598380744457245, "learning_rate": 1.9452481989375437e-07, "loss": 0.008335231803357601, "memory(GiB)": 22.66, "step": 28178, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956238 }, { "epoch": 0.9154078549848943, "grad_norm": 0.43944793939590454, "learning_rate": 1.943764763658823e-07, "loss": 0.011034892871975899, "memory(GiB)": 22.66, "step": 28179, "token_acc": 1.0, "train_speed(iter/s)": 0.956246 }, { "epoch": 0.9154403404476497, "grad_norm": 0.37595871090888977, "learning_rate": 1.9422818830130196e-07, "loss": 0.012008067220449448, "memory(GiB)": 22.66, "step": 28180, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.9154728259104051, "grad_norm": 0.2696186304092407, "learning_rate": 1.9407995570172534e-07, "loss": 0.011116080917418003, "memory(GiB)": 22.66, "step": 28181, "token_acc": 1.0, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.9155053113731605, "grad_norm": 0.5305784940719604, "learning_rate": 1.939317785688627e-07, "loss": 0.023181192576885223, "memory(GiB)": 22.66, "step": 28182, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956268 }, { "epoch": 0.915537796835916, "grad_norm": 0.3389091193675995, "learning_rate": 1.937836569044238e-07, "loss": 0.009916064329445362, "memory(GiB)": 22.66, "step": 28183, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956275 }, { "epoch": 0.9155702822986713, "grad_norm": 0.5576933026313782, "learning_rate": 1.936355907101184e-07, "loss": 0.013599179685115814, "memory(GiB)": 22.66, "step": 28184, "token_acc": 1.0, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.9156027677614268, "grad_norm": 0.23264525830745697, "learning_rate": 1.9348757998765566e-07, "loss": 0.005764545872807503, "memory(GiB)": 22.66, "step": 28185, "token_acc": 1.0, "train_speed(iter/s)": 0.956291 }, { "epoch": 0.9156352532241822, "grad_norm": 0.17550459504127502, "learning_rate": 1.9333962473874478e-07, "loss": 0.005918536800891161, "memory(GiB)": 22.66, "step": 28186, "token_acc": 1.0, "train_speed(iter/s)": 0.956298 }, { "epoch": 0.9156677386869376, "grad_norm": 0.38954758644104004, "learning_rate": 1.9319172496509108e-07, "loss": 0.01637134701013565, "memory(GiB)": 22.66, "step": 28187, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.915700224149693, "grad_norm": 0.45843881368637085, "learning_rate": 1.9304388066840263e-07, "loss": 0.013232861645519733, "memory(GiB)": 22.66, "step": 28188, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.9157327096124485, "grad_norm": 0.26012399792671204, "learning_rate": 1.9289609185038637e-07, "loss": 0.007155315950512886, "memory(GiB)": 22.66, "step": 28189, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.95632 }, { "epoch": 0.9157651950752038, "grad_norm": 0.34040534496307373, "learning_rate": 1.9274835851274708e-07, "loss": 0.01565106213092804, "memory(GiB)": 22.66, "step": 28190, "token_acc": 0.996, "train_speed(iter/s)": 0.956327 }, { "epoch": 0.9157976805379593, "grad_norm": 0.3494768440723419, "learning_rate": 1.9260068065719063e-07, "loss": 0.009710023179650307, "memory(GiB)": 22.66, "step": 28191, "token_acc": 1.0, "train_speed(iter/s)": 0.956334 }, { "epoch": 0.9158301660007147, "grad_norm": 0.3647393584251404, "learning_rate": 1.924530582854206e-07, "loss": 0.00995905976742506, "memory(GiB)": 22.66, "step": 28192, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.9158626514634701, "grad_norm": 0.24632447957992554, "learning_rate": 1.9230549139914179e-07, "loss": 0.006601303815841675, "memory(GiB)": 22.66, "step": 28193, "token_acc": 1.0, "train_speed(iter/s)": 0.956348 }, { "epoch": 0.9158951369262255, "grad_norm": 0.28418979048728943, "learning_rate": 1.9215798000005559e-07, "loss": 0.009135167114436626, "memory(GiB)": 22.66, "step": 28194, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956354 }, { "epoch": 0.915927622388981, "grad_norm": 0.30110642313957214, "learning_rate": 1.920105240898662e-07, "loss": 0.01534245815128088, "memory(GiB)": 22.66, "step": 28195, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.95636 }, { "epoch": 0.9159601078517363, "grad_norm": 0.35383763909339905, "learning_rate": 1.9186312367027448e-07, "loss": 0.017956241965293884, "memory(GiB)": 22.66, "step": 28196, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.956366 }, { "epoch": 0.9159925933144918, "grad_norm": 0.3801732063293457, "learning_rate": 1.9171577874298298e-07, "loss": 0.010251685976982117, "memory(GiB)": 22.66, "step": 28197, "token_acc": 1.0, "train_speed(iter/s)": 0.956372 }, { "epoch": 0.9160250787772471, "grad_norm": 0.29578906297683716, "learning_rate": 1.9156848930968974e-07, "loss": 0.01582474634051323, "memory(GiB)": 22.66, "step": 28198, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.956377 }, { "epoch": 0.9160575642400026, "grad_norm": 0.24127596616744995, "learning_rate": 1.9142125537209676e-07, "loss": 0.010282568633556366, "memory(GiB)": 22.66, "step": 28199, "token_acc": 1.0, "train_speed(iter/s)": 0.956383 }, { "epoch": 0.916090049702758, "grad_norm": 0.2788289189338684, "learning_rate": 1.912740769319027e-07, "loss": 0.009105764329433441, "memory(GiB)": 22.66, "step": 28200, "token_acc": 1.0, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.9161225351655135, "grad_norm": 0.26675722002983093, "learning_rate": 1.9112695399080615e-07, "loss": 0.007310478016734123, "memory(GiB)": 22.66, "step": 28201, "token_acc": 1.0, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.9161550206282688, "grad_norm": 0.49646633863449097, "learning_rate": 1.9097988655050526e-07, "loss": 0.014525728300213814, "memory(GiB)": 22.66, "step": 28202, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.9161875060910243, "grad_norm": 0.3249356150627136, "learning_rate": 1.9083287461269696e-07, "loss": 0.010366132482886314, "memory(GiB)": 22.66, "step": 28203, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.9162199915537796, "grad_norm": 0.26748383045196533, "learning_rate": 1.9068591817907823e-07, "loss": 0.007717996835708618, "memory(GiB)": 22.66, "step": 28204, "token_acc": 1.0, "train_speed(iter/s)": 0.956412 }, { "epoch": 0.9162524770165351, "grad_norm": 0.3224359154701233, "learning_rate": 1.905390172513455e-07, "loss": 0.010501261800527573, "memory(GiB)": 22.66, "step": 28205, "token_acc": 1.0, "train_speed(iter/s)": 0.956417 }, { "epoch": 0.9162849624792905, "grad_norm": 0.3012595474720001, "learning_rate": 1.9039217183119408e-07, "loss": 0.007482929155230522, "memory(GiB)": 22.66, "step": 28206, "token_acc": 1.0, "train_speed(iter/s)": 0.956423 }, { "epoch": 0.916317447942046, "grad_norm": 0.457584023475647, "learning_rate": 1.9024538192031816e-07, "loss": 0.012982640415430069, "memory(GiB)": 22.66, "step": 28207, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.9163499334048013, "grad_norm": 0.24667710065841675, "learning_rate": 1.9009864752041196e-07, "loss": 0.009046744555234909, "memory(GiB)": 22.66, "step": 28208, "token_acc": 1.0, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.9163824188675568, "grad_norm": 0.31659388542175293, "learning_rate": 1.8995196863316967e-07, "loss": 0.008860055357217789, "memory(GiB)": 22.66, "step": 28209, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.9164149043303121, "grad_norm": 0.2758505940437317, "learning_rate": 1.898053452602844e-07, "loss": 0.008267128840088844, "memory(GiB)": 22.66, "step": 28210, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.9164473897930676, "grad_norm": 0.2376347780227661, "learning_rate": 1.896587774034475e-07, "loss": 0.006898778956383467, "memory(GiB)": 22.66, "step": 28211, "token_acc": 1.0, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.916479875255823, "grad_norm": 0.29133734107017517, "learning_rate": 1.895122650643505e-07, "loss": 0.009573960676789284, "memory(GiB)": 22.66, "step": 28212, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956456 }, { "epoch": 0.9165123607185784, "grad_norm": 0.3095237612724304, "learning_rate": 1.8936580824468476e-07, "loss": 0.014489843510091305, "memory(GiB)": 22.66, "step": 28213, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.9165448461813338, "grad_norm": 0.42424264550209045, "learning_rate": 1.8921940694614116e-07, "loss": 0.01554887555539608, "memory(GiB)": 22.66, "step": 28214, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956468 }, { "epoch": 0.9165773316440893, "grad_norm": 0.24469724297523499, "learning_rate": 1.890730611704078e-07, "loss": 0.008536645211279392, "memory(GiB)": 22.66, "step": 28215, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956472 }, { "epoch": 0.9166098171068446, "grad_norm": 0.3669763207435608, "learning_rate": 1.8892677091917556e-07, "loss": 0.01628408208489418, "memory(GiB)": 22.66, "step": 28216, "token_acc": 0.9893238434163701, "train_speed(iter/s)": 0.956478 }, { "epoch": 0.9166423025696001, "grad_norm": 0.34006282687187195, "learning_rate": 1.8878053619413083e-07, "loss": 0.009457845240831375, "memory(GiB)": 22.66, "step": 28217, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956483 }, { "epoch": 0.9166747880323555, "grad_norm": 0.28248345851898193, "learning_rate": 1.8863435699696343e-07, "loss": 0.012542728334665298, "memory(GiB)": 22.66, "step": 28218, "token_acc": 1.0, "train_speed(iter/s)": 0.956489 }, { "epoch": 0.9167072734951109, "grad_norm": 0.36038509011268616, "learning_rate": 1.8848823332935916e-07, "loss": 0.014131488278508186, "memory(GiB)": 22.66, "step": 28219, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.9167397589578663, "grad_norm": 0.20424379408359528, "learning_rate": 1.8834216519300453e-07, "loss": 0.006550024729222059, "memory(GiB)": 22.66, "step": 28220, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.9167722444206218, "grad_norm": 0.22597242891788483, "learning_rate": 1.8819615258958646e-07, "loss": 0.00632612407207489, "memory(GiB)": 22.66, "step": 28221, "token_acc": 1.0, "train_speed(iter/s)": 0.956507 }, { "epoch": 0.9168047298833771, "grad_norm": 0.41429731249809265, "learning_rate": 1.8805019552078753e-07, "loss": 0.018573392182588577, "memory(GiB)": 22.66, "step": 28222, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.9168372153461326, "grad_norm": 1.7467780113220215, "learning_rate": 1.879042939882958e-07, "loss": 0.014739223755896091, "memory(GiB)": 22.66, "step": 28223, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956518 }, { "epoch": 0.9168697008088881, "grad_norm": 0.251604825258255, "learning_rate": 1.8775844799379272e-07, "loss": 0.007610836997628212, "memory(GiB)": 22.66, "step": 28224, "token_acc": 1.0, "train_speed(iter/s)": 0.956524 }, { "epoch": 0.9169021862716434, "grad_norm": 0.35870903730392456, "learning_rate": 1.876126575389625e-07, "loss": 0.012684688903391361, "memory(GiB)": 22.66, "step": 28225, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.9169346717343989, "grad_norm": 0.3888601064682007, "learning_rate": 1.8746692262548717e-07, "loss": 0.016458218917250633, "memory(GiB)": 22.66, "step": 28226, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.9169671571971543, "grad_norm": 0.4136539399623871, "learning_rate": 1.8732124325504863e-07, "loss": 0.008425242267549038, "memory(GiB)": 22.66, "step": 28227, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95654 }, { "epoch": 0.9169996426599097, "grad_norm": 0.45120716094970703, "learning_rate": 1.8717561942932895e-07, "loss": 0.009249728173017502, "memory(GiB)": 22.66, "step": 28228, "token_acc": 1.0, "train_speed(iter/s)": 0.956546 }, { "epoch": 0.9170321281226651, "grad_norm": 0.4432710111141205, "learning_rate": 1.8703005115000893e-07, "loss": 0.013666899874806404, "memory(GiB)": 22.66, "step": 28229, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.9170646135854206, "grad_norm": 0.36442244052886963, "learning_rate": 1.8688453841876786e-07, "loss": 0.012246111407876015, "memory(GiB)": 22.66, "step": 28230, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.9170970990481759, "grad_norm": 0.38352295756340027, "learning_rate": 1.8673908123728545e-07, "loss": 0.016321808099746704, "memory(GiB)": 22.66, "step": 28231, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.9171295845109314, "grad_norm": 0.27266812324523926, "learning_rate": 1.8659367960723984e-07, "loss": 0.008448205888271332, "memory(GiB)": 22.66, "step": 28232, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.9171620699736868, "grad_norm": 0.1989968866109848, "learning_rate": 1.8644833353031133e-07, "loss": 0.006186781916767359, "memory(GiB)": 22.66, "step": 28233, "token_acc": 1.0, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.9171945554364422, "grad_norm": 0.3436974883079529, "learning_rate": 1.863030430081747e-07, "loss": 0.014670115895569324, "memory(GiB)": 22.66, "step": 28234, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.9172270408991976, "grad_norm": 0.45729443430900574, "learning_rate": 1.8615780804250804e-07, "loss": 0.01180429570376873, "memory(GiB)": 22.66, "step": 28235, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.95659 }, { "epoch": 0.9172595263619531, "grad_norm": 0.17226542532444, "learning_rate": 1.8601262863498726e-07, "loss": 0.004746726714074612, "memory(GiB)": 22.66, "step": 28236, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956597 }, { "epoch": 0.9172920118247084, "grad_norm": 0.36152541637420654, "learning_rate": 1.8586750478728876e-07, "loss": 0.012415746226906776, "memory(GiB)": 22.66, "step": 28237, "token_acc": 1.0, "train_speed(iter/s)": 0.956605 }, { "epoch": 0.9173244972874639, "grad_norm": 0.26564961671829224, "learning_rate": 1.857224365010868e-07, "loss": 0.0073185088112950325, "memory(GiB)": 22.66, "step": 28238, "token_acc": 1.0, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.9173569827502193, "grad_norm": 0.21998581290245056, "learning_rate": 1.8557742377805555e-07, "loss": 0.007549822796136141, "memory(GiB)": 22.66, "step": 28239, "token_acc": 1.0, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.9173894682129747, "grad_norm": 0.4468923509120941, "learning_rate": 1.854324666198687e-07, "loss": 0.01053402479737997, "memory(GiB)": 22.66, "step": 28240, "token_acc": 1.0, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.9174219536757301, "grad_norm": 0.2581080198287964, "learning_rate": 1.8528756502819934e-07, "loss": 0.009670978412032127, "memory(GiB)": 22.66, "step": 28241, "token_acc": 1.0, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.9174544391384856, "grad_norm": 0.4408870041370392, "learning_rate": 1.8514271900472003e-07, "loss": 0.016279365867376328, "memory(GiB)": 22.66, "step": 28242, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956642 }, { "epoch": 0.9174869246012409, "grad_norm": 0.3108693063259125, "learning_rate": 1.8499792855110222e-07, "loss": 0.009286019951105118, "memory(GiB)": 22.66, "step": 28243, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.95665 }, { "epoch": 0.9175194100639964, "grad_norm": 0.3944307863712311, "learning_rate": 1.848531936690179e-07, "loss": 0.013073697686195374, "memory(GiB)": 22.66, "step": 28244, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956657 }, { "epoch": 0.9175518955267518, "grad_norm": 0.32711514830589294, "learning_rate": 1.8470851436013514e-07, "loss": 0.008069461211562157, "memory(GiB)": 22.66, "step": 28245, "token_acc": 1.0, "train_speed(iter/s)": 0.956664 }, { "epoch": 0.9175843809895072, "grad_norm": 0.36198264360427856, "learning_rate": 1.84563890626126e-07, "loss": 0.016451068222522736, "memory(GiB)": 22.66, "step": 28246, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956671 }, { "epoch": 0.9176168664522626, "grad_norm": 0.30968689918518066, "learning_rate": 1.8441932246865913e-07, "loss": 0.013369372114539146, "memory(GiB)": 22.66, "step": 28247, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956677 }, { "epoch": 0.9176493519150181, "grad_norm": 0.4557102918624878, "learning_rate": 1.8427480988940262e-07, "loss": 0.019087474793195724, "memory(GiB)": 22.66, "step": 28248, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.9176818373777734, "grad_norm": 0.18649743497371674, "learning_rate": 1.8413035289002456e-07, "loss": 0.005373049061745405, "memory(GiB)": 22.66, "step": 28249, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956693 }, { "epoch": 0.9177143228405289, "grad_norm": 0.33977389335632324, "learning_rate": 1.8398595147219146e-07, "loss": 0.011433402076363564, "memory(GiB)": 22.66, "step": 28250, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.9567 }, { "epoch": 0.9177468083032843, "grad_norm": 0.4239233434200287, "learning_rate": 1.8384160563757137e-07, "loss": 0.016506055369973183, "memory(GiB)": 22.66, "step": 28251, "token_acc": 1.0, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.9177792937660397, "grad_norm": 0.3842064440250397, "learning_rate": 1.8369731538782908e-07, "loss": 0.010123633779585361, "memory(GiB)": 22.66, "step": 28252, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956715 }, { "epoch": 0.9178117792287951, "grad_norm": 0.20013171434402466, "learning_rate": 1.8355308072463108e-07, "loss": 0.006590516306459904, "memory(GiB)": 22.66, "step": 28253, "token_acc": 1.0, "train_speed(iter/s)": 0.956722 }, { "epoch": 0.9178442646915506, "grad_norm": 0.31073200702667236, "learning_rate": 1.8340890164963987e-07, "loss": 0.013270216062664986, "memory(GiB)": 22.66, "step": 28254, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.95673 }, { "epoch": 0.9178767501543059, "grad_norm": 0.28611472249031067, "learning_rate": 1.8326477816452136e-07, "loss": 0.00874416995793581, "memory(GiB)": 22.66, "step": 28255, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.9179092356170614, "grad_norm": 0.33847910165786743, "learning_rate": 1.831207102709387e-07, "loss": 0.01478483248502016, "memory(GiB)": 22.66, "step": 28256, "token_acc": 0.984, "train_speed(iter/s)": 0.956743 }, { "epoch": 0.9179417210798168, "grad_norm": 0.3990948498249054, "learning_rate": 1.8297669797055438e-07, "loss": 0.011480582877993584, "memory(GiB)": 22.66, "step": 28257, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956749 }, { "epoch": 0.9179742065425722, "grad_norm": 0.404852956533432, "learning_rate": 1.8283274126502993e-07, "loss": 0.009153582155704498, "memory(GiB)": 22.66, "step": 28258, "token_acc": 1.0, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.9180066920053276, "grad_norm": 0.30047422647476196, "learning_rate": 1.8268884015602728e-07, "loss": 0.011070230044424534, "memory(GiB)": 22.66, "step": 28259, "token_acc": 1.0, "train_speed(iter/s)": 0.956761 }, { "epoch": 0.9180391774680831, "grad_norm": 0.33970749378204346, "learning_rate": 1.8254499464520737e-07, "loss": 0.01375550590455532, "memory(GiB)": 22.66, "step": 28260, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.9180716629308384, "grad_norm": 0.4620167016983032, "learning_rate": 1.8240120473423052e-07, "loss": 0.010091502219438553, "memory(GiB)": 22.66, "step": 28261, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956773 }, { "epoch": 0.9181041483935939, "grad_norm": 0.225479856133461, "learning_rate": 1.822574704247554e-07, "loss": 0.011217965744435787, "memory(GiB)": 22.66, "step": 28262, "token_acc": 1.0, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.9181366338563492, "grad_norm": 0.4126603901386261, "learning_rate": 1.821137917184418e-07, "loss": 0.010497254319489002, "memory(GiB)": 22.66, "step": 28263, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956784 }, { "epoch": 0.9181691193191047, "grad_norm": 0.341038316488266, "learning_rate": 1.8197016861694784e-07, "loss": 0.010917778126895428, "memory(GiB)": 22.66, "step": 28264, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.9182016047818601, "grad_norm": 0.3075181245803833, "learning_rate": 1.818266011219305e-07, "loss": 0.006972850300371647, "memory(GiB)": 22.66, "step": 28265, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956796 }, { "epoch": 0.9182340902446156, "grad_norm": 0.3523024022579193, "learning_rate": 1.816830892350474e-07, "loss": 0.012266404926776886, "memory(GiB)": 22.66, "step": 28266, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.956802 }, { "epoch": 0.9182665757073709, "grad_norm": 0.24695837497711182, "learning_rate": 1.815396329579544e-07, "loss": 0.007304942235350609, "memory(GiB)": 22.66, "step": 28267, "token_acc": 1.0, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.9182990611701264, "grad_norm": 0.4315822720527649, "learning_rate": 1.8139623229230796e-07, "loss": 0.010003453120589256, "memory(GiB)": 22.66, "step": 28268, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956814 }, { "epoch": 0.9183315466328817, "grad_norm": 0.23281025886535645, "learning_rate": 1.8125288723976176e-07, "loss": 0.009591849520802498, "memory(GiB)": 22.66, "step": 28269, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.9183640320956372, "grad_norm": 0.22239944338798523, "learning_rate": 1.8110959780197167e-07, "loss": 0.005376082845032215, "memory(GiB)": 22.66, "step": 28270, "token_acc": 1.0, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.9183965175583926, "grad_norm": 0.3974354565143585, "learning_rate": 1.809663639805903e-07, "loss": 0.015673860907554626, "memory(GiB)": 22.66, "step": 28271, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956832 }, { "epoch": 0.918429003021148, "grad_norm": 0.33147159218788147, "learning_rate": 1.8082318577727188e-07, "loss": 0.0075454106554389, "memory(GiB)": 22.66, "step": 28272, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956839 }, { "epoch": 0.9184614884839034, "grad_norm": 0.4322524666786194, "learning_rate": 1.8068006319366727e-07, "loss": 0.015192773193120956, "memory(GiB)": 22.66, "step": 28273, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956845 }, { "epoch": 0.9184939739466589, "grad_norm": 0.717000424861908, "learning_rate": 1.8053699623143016e-07, "loss": 0.011373905465006828, "memory(GiB)": 22.66, "step": 28274, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.95685 }, { "epoch": 0.9185264594094142, "grad_norm": 0.6320986747741699, "learning_rate": 1.8039398489220984e-07, "loss": 0.013295536860823631, "memory(GiB)": 22.66, "step": 28275, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.9185589448721697, "grad_norm": 0.23743510246276855, "learning_rate": 1.802510291776588e-07, "loss": 0.0053053661249578, "memory(GiB)": 22.66, "step": 28276, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956858 }, { "epoch": 0.9185914303349251, "grad_norm": 0.3744960129261017, "learning_rate": 1.8010812908942522e-07, "loss": 0.011783148162066936, "memory(GiB)": 22.66, "step": 28277, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.9186239157976805, "grad_norm": 0.3380591869354248, "learning_rate": 1.799652846291594e-07, "loss": 0.015725266188383102, "memory(GiB)": 22.66, "step": 28278, "token_acc": 1.0, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.9186564012604359, "grad_norm": 0.29309120774269104, "learning_rate": 1.7982249579850953e-07, "loss": 0.008495310321450233, "memory(GiB)": 22.66, "step": 28279, "token_acc": 1.0, "train_speed(iter/s)": 0.956873 }, { "epoch": 0.9186888867231914, "grad_norm": 0.5262428522109985, "learning_rate": 1.7967976259912368e-07, "loss": 0.017153557389974594, "memory(GiB)": 22.66, "step": 28280, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.9187213721859467, "grad_norm": 0.37385737895965576, "learning_rate": 1.7953708503264943e-07, "loss": 0.014810689724981785, "memory(GiB)": 22.66, "step": 28281, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.9187538576487022, "grad_norm": 0.3107038736343384, "learning_rate": 1.7939446310073272e-07, "loss": 0.013780388981103897, "memory(GiB)": 22.66, "step": 28282, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.9187863431114576, "grad_norm": 0.3708859384059906, "learning_rate": 1.7925189680502053e-07, "loss": 0.011808045208454132, "memory(GiB)": 22.66, "step": 28283, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.918818828574213, "grad_norm": 0.3297809958457947, "learning_rate": 1.7910938614715767e-07, "loss": 0.013382546603679657, "memory(GiB)": 22.66, "step": 28284, "token_acc": 0.9926470588235294, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.9188513140369684, "grad_norm": 0.25258415937423706, "learning_rate": 1.7896693112878949e-07, "loss": 0.007271988317370415, "memory(GiB)": 22.66, "step": 28285, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956907 }, { "epoch": 0.9188837994997239, "grad_norm": 0.3936120569705963, "learning_rate": 1.788245317515591e-07, "loss": 0.008055374026298523, "memory(GiB)": 22.66, "step": 28286, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956912 }, { "epoch": 0.9189162849624792, "grad_norm": 0.29279255867004395, "learning_rate": 1.7868218801711079e-07, "loss": 0.012706932611763477, "memory(GiB)": 22.66, "step": 28287, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956917 }, { "epoch": 0.9189487704252347, "grad_norm": 0.328008770942688, "learning_rate": 1.785398999270871e-07, "loss": 0.010726488195359707, "memory(GiB)": 22.66, "step": 28288, "token_acc": 1.0, "train_speed(iter/s)": 0.956922 }, { "epoch": 0.9189812558879902, "grad_norm": 0.3242766559123993, "learning_rate": 1.7839766748313115e-07, "loss": 0.013079337775707245, "memory(GiB)": 22.66, "step": 28289, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.9190137413507455, "grad_norm": 0.26763007044792175, "learning_rate": 1.7825549068688274e-07, "loss": 0.006443833000957966, "memory(GiB)": 22.66, "step": 28290, "token_acc": 1.0, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.919046226813501, "grad_norm": 0.40671804547309875, "learning_rate": 1.7811336953998392e-07, "loss": 0.01348915509879589, "memory(GiB)": 22.66, "step": 28291, "token_acc": 1.0, "train_speed(iter/s)": 0.956939 }, { "epoch": 0.9190787122762564, "grad_norm": 0.35320600867271423, "learning_rate": 1.7797130404407448e-07, "loss": 0.008722029626369476, "memory(GiB)": 22.66, "step": 28292, "token_acc": 1.0, "train_speed(iter/s)": 0.956945 }, { "epoch": 0.9191111977390118, "grad_norm": 0.4454967677593231, "learning_rate": 1.778292942007953e-07, "loss": 0.016633663326501846, "memory(GiB)": 22.66, "step": 28293, "token_acc": 1.0, "train_speed(iter/s)": 0.95695 }, { "epoch": 0.9191436832017672, "grad_norm": 0.8041191101074219, "learning_rate": 1.7768734001178346e-07, "loss": 0.01192399486899376, "memory(GiB)": 22.66, "step": 28294, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.9191761686645227, "grad_norm": 0.23077359795570374, "learning_rate": 1.775454414786787e-07, "loss": 0.009568744339048862, "memory(GiB)": 22.66, "step": 28295, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.919208654127278, "grad_norm": 0.4388561546802521, "learning_rate": 1.7740359860311696e-07, "loss": 0.014365487731993198, "memory(GiB)": 22.66, "step": 28296, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956971 }, { "epoch": 0.9192411395900335, "grad_norm": 0.3452226221561432, "learning_rate": 1.7726181138673747e-07, "loss": 0.010652555152773857, "memory(GiB)": 22.66, "step": 28297, "token_acc": 1.0, "train_speed(iter/s)": 0.956979 }, { "epoch": 0.9192736250527889, "grad_norm": 0.3392200469970703, "learning_rate": 1.7712007983117618e-07, "loss": 0.01944321021437645, "memory(GiB)": 22.66, "step": 28298, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.956986 }, { "epoch": 0.9193061105155443, "grad_norm": 0.3650323748588562, "learning_rate": 1.7697840393806786e-07, "loss": 0.010776222683489323, "memory(GiB)": 22.66, "step": 28299, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.9193385959782997, "grad_norm": 0.29583752155303955, "learning_rate": 1.76836783709049e-07, "loss": 0.006745155900716782, "memory(GiB)": 22.66, "step": 28300, "token_acc": 1.0, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.9193710814410552, "grad_norm": 0.4085671305656433, "learning_rate": 1.7669521914575161e-07, "loss": 0.013580191880464554, "memory(GiB)": 22.66, "step": 28301, "token_acc": 1.0, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.9194035669038105, "grad_norm": 0.4633098244667053, "learning_rate": 1.7655371024981271e-07, "loss": 0.018578633666038513, "memory(GiB)": 22.66, "step": 28302, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.919436052366566, "grad_norm": 0.4065389633178711, "learning_rate": 1.7641225702286323e-07, "loss": 0.011630786582827568, "memory(GiB)": 22.66, "step": 28303, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957023 }, { "epoch": 0.9194685378293214, "grad_norm": 0.35977399349212646, "learning_rate": 1.7627085946653743e-07, "loss": 0.011139093898236752, "memory(GiB)": 22.66, "step": 28304, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.9195010232920768, "grad_norm": 0.20058919489383698, "learning_rate": 1.7612951758246512e-07, "loss": 0.008089550770819187, "memory(GiB)": 22.66, "step": 28305, "token_acc": 1.0, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.9195335087548322, "grad_norm": 0.38404569029808044, "learning_rate": 1.759882313722794e-07, "loss": 0.010941112414002419, "memory(GiB)": 22.66, "step": 28306, "token_acc": 1.0, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.9195659942175877, "grad_norm": 0.28058549761772156, "learning_rate": 1.7584700083760952e-07, "loss": 0.009690148755908012, "memory(GiB)": 22.66, "step": 28307, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957052 }, { "epoch": 0.919598479680343, "grad_norm": 0.3725786805152893, "learning_rate": 1.75705825980087e-07, "loss": 0.008255560882389545, "memory(GiB)": 22.66, "step": 28308, "token_acc": 1.0, "train_speed(iter/s)": 0.957059 }, { "epoch": 0.9196309651430985, "grad_norm": 0.2894550859928131, "learning_rate": 1.7556470680133996e-07, "loss": 0.009182058274745941, "memory(GiB)": 22.66, "step": 28309, "token_acc": 1.0, "train_speed(iter/s)": 0.957066 }, { "epoch": 0.9196634506058539, "grad_norm": 0.4394661486148834, "learning_rate": 1.7542364330299766e-07, "loss": 0.018908651545643806, "memory(GiB)": 22.66, "step": 28310, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.9196959360686093, "grad_norm": 0.5354906916618347, "learning_rate": 1.7528263548668766e-07, "loss": 0.014117177575826645, "memory(GiB)": 22.66, "step": 28311, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.957081 }, { "epoch": 0.9197284215313647, "grad_norm": 0.30109792947769165, "learning_rate": 1.7514168335403813e-07, "loss": 0.008319862186908722, "memory(GiB)": 22.66, "step": 28312, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957088 }, { "epoch": 0.9197609069941202, "grad_norm": 0.4707763195037842, "learning_rate": 1.7500078690667554e-07, "loss": 0.01037282682955265, "memory(GiB)": 22.66, "step": 28313, "token_acc": 1.0, "train_speed(iter/s)": 0.957096 }, { "epoch": 0.9197933924568755, "grad_norm": 0.6053850650787354, "learning_rate": 1.7485994614622582e-07, "loss": 0.016570959240198135, "memory(GiB)": 22.66, "step": 28314, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957103 }, { "epoch": 0.919825877919631, "grad_norm": 0.33162444829940796, "learning_rate": 1.747191610743143e-07, "loss": 0.009647911414504051, "memory(GiB)": 22.66, "step": 28315, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957111 }, { "epoch": 0.9198583633823864, "grad_norm": 0.21744464337825775, "learning_rate": 1.7457843169256584e-07, "loss": 0.005688745528459549, "memory(GiB)": 22.66, "step": 28316, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.9198908488451418, "grad_norm": 0.2882484495639801, "learning_rate": 1.7443775800260575e-07, "loss": 0.006506809964776039, "memory(GiB)": 22.66, "step": 28317, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957126 }, { "epoch": 0.9199233343078972, "grad_norm": 0.3360336422920227, "learning_rate": 1.7429714000605612e-07, "loss": 0.010627810843288898, "memory(GiB)": 22.66, "step": 28318, "token_acc": 0.9871244635193133, "train_speed(iter/s)": 0.957132 }, { "epoch": 0.9199558197706527, "grad_norm": 0.2512548863887787, "learning_rate": 1.7415657770454063e-07, "loss": 0.008742095902562141, "memory(GiB)": 22.66, "step": 28319, "token_acc": 1.0, "train_speed(iter/s)": 0.957137 }, { "epoch": 0.919988305233408, "grad_norm": 0.43187960982322693, "learning_rate": 1.7401607109968133e-07, "loss": 0.014242020435631275, "memory(GiB)": 22.66, "step": 28320, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957143 }, { "epoch": 0.9200207906961635, "grad_norm": 0.30035674571990967, "learning_rate": 1.7387562019310023e-07, "loss": 0.0065211639739573, "memory(GiB)": 22.66, "step": 28321, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.9200532761589189, "grad_norm": 0.4334867298603058, "learning_rate": 1.7373522498641826e-07, "loss": 0.017491282895207405, "memory(GiB)": 22.66, "step": 28322, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.957155 }, { "epoch": 0.9200857616216743, "grad_norm": 0.3193756639957428, "learning_rate": 1.7359488548125524e-07, "loss": 0.010653980076313019, "memory(GiB)": 22.66, "step": 28323, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95716 }, { "epoch": 0.9201182470844297, "grad_norm": 0.3396525979042053, "learning_rate": 1.7345460167923045e-07, "loss": 0.009191173128783703, "memory(GiB)": 22.66, "step": 28324, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957166 }, { "epoch": 0.9201507325471852, "grad_norm": 0.4159294664859772, "learning_rate": 1.733143735819648e-07, "loss": 0.009758456610143185, "memory(GiB)": 22.66, "step": 28325, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.9201832180099405, "grad_norm": 0.6435844898223877, "learning_rate": 1.7317420119107475e-07, "loss": 0.0173008032143116, "memory(GiB)": 22.66, "step": 28326, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957177 }, { "epoch": 0.920215703472696, "grad_norm": 0.3166826665401459, "learning_rate": 1.7303408450817906e-07, "loss": 0.010350925847887993, "memory(GiB)": 22.66, "step": 28327, "token_acc": 1.0, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.9202481889354514, "grad_norm": 0.14800159633159637, "learning_rate": 1.7289402353489582e-07, "loss": 0.004767569247633219, "memory(GiB)": 22.66, "step": 28328, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957189 }, { "epoch": 0.9202806743982068, "grad_norm": 0.291782945394516, "learning_rate": 1.727540182728388e-07, "loss": 0.007400372065603733, "memory(GiB)": 22.66, "step": 28329, "token_acc": 1.0, "train_speed(iter/s)": 0.957194 }, { "epoch": 0.9203131598609622, "grad_norm": 0.25185641646385193, "learning_rate": 1.7261406872362607e-07, "loss": 0.013601046055555344, "memory(GiB)": 22.66, "step": 28330, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.9572 }, { "epoch": 0.9203456453237177, "grad_norm": 0.2720741629600525, "learning_rate": 1.7247417488887252e-07, "loss": 0.011244009248912334, "memory(GiB)": 22.66, "step": 28331, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.920378130786473, "grad_norm": 0.29968786239624023, "learning_rate": 1.723343367701924e-07, "loss": 0.011796308681368828, "memory(GiB)": 22.66, "step": 28332, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957211 }, { "epoch": 0.9204106162492285, "grad_norm": 0.27667000889778137, "learning_rate": 1.721945543691994e-07, "loss": 0.01122741773724556, "memory(GiB)": 22.66, "step": 28333, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957217 }, { "epoch": 0.9204431017119838, "grad_norm": 0.3807104825973511, "learning_rate": 1.720548276875067e-07, "loss": 0.013898522593080997, "memory(GiB)": 22.66, "step": 28334, "token_acc": 1.0, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.9204755871747393, "grad_norm": 0.25745269656181335, "learning_rate": 1.7191515672672798e-07, "loss": 0.008138933219015598, "memory(GiB)": 22.66, "step": 28335, "token_acc": 1.0, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.9205080726374947, "grad_norm": 0.3480350077152252, "learning_rate": 1.717755414884742e-07, "loss": 0.01636137068271637, "memory(GiB)": 22.66, "step": 28336, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.9205405581002502, "grad_norm": 0.40535950660705566, "learning_rate": 1.7163598197435682e-07, "loss": 0.012387482449412346, "memory(GiB)": 22.66, "step": 28337, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957237 }, { "epoch": 0.9205730435630055, "grad_norm": 0.2809444069862366, "learning_rate": 1.714964781859868e-07, "loss": 0.010469276458024979, "memory(GiB)": 22.66, "step": 28338, "token_acc": 1.0, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.920605529025761, "grad_norm": 0.31237563490867615, "learning_rate": 1.7135703012497452e-07, "loss": 0.011885426938533783, "memory(GiB)": 22.66, "step": 28339, "token_acc": 1.0, "train_speed(iter/s)": 0.957246 }, { "epoch": 0.9206380144885163, "grad_norm": 0.3751141428947449, "learning_rate": 1.7121763779292922e-07, "loss": 0.018189582973718643, "memory(GiB)": 22.66, "step": 28340, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957251 }, { "epoch": 0.9206704999512718, "grad_norm": 0.46476826071739197, "learning_rate": 1.710783011914585e-07, "loss": 0.011765887960791588, "memory(GiB)": 22.66, "step": 28341, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957256 }, { "epoch": 0.9207029854140272, "grad_norm": 0.49038416147232056, "learning_rate": 1.7093902032217225e-07, "loss": 0.00999964028596878, "memory(GiB)": 22.66, "step": 28342, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.9207354708767826, "grad_norm": 0.365587055683136, "learning_rate": 1.707997951866769e-07, "loss": 0.014199730008840561, "memory(GiB)": 22.66, "step": 28343, "token_acc": 0.9899497487437185, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.920767956339538, "grad_norm": 0.3497799336910248, "learning_rate": 1.7066062578657895e-07, "loss": 0.008162368088960648, "memory(GiB)": 22.66, "step": 28344, "token_acc": 1.0, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.9208004418022935, "grad_norm": 1.5536985397338867, "learning_rate": 1.7052151212348655e-07, "loss": 0.013588537462055683, "memory(GiB)": 22.66, "step": 28345, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.9208329272650488, "grad_norm": 0.21754740178585052, "learning_rate": 1.703824541990029e-07, "loss": 0.00697525916621089, "memory(GiB)": 22.66, "step": 28346, "token_acc": 1.0, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.9208654127278043, "grad_norm": 0.2862682342529297, "learning_rate": 1.7024345201473448e-07, "loss": 0.011866814456880093, "memory(GiB)": 22.66, "step": 28347, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.9208978981905597, "grad_norm": 0.4206051528453827, "learning_rate": 1.7010450557228442e-07, "loss": 0.01144135557115078, "memory(GiB)": 22.66, "step": 28348, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957293 }, { "epoch": 0.9209303836533151, "grad_norm": 0.20709766447544098, "learning_rate": 1.6996561487325757e-07, "loss": 0.005843223072588444, "memory(GiB)": 22.66, "step": 28349, "token_acc": 1.0, "train_speed(iter/s)": 0.957298 }, { "epoch": 0.9209628691160705, "grad_norm": 0.4664914011955261, "learning_rate": 1.69826779919256e-07, "loss": 0.013744133524596691, "memory(GiB)": 22.66, "step": 28350, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957304 }, { "epoch": 0.920995354578826, "grad_norm": 0.3582705557346344, "learning_rate": 1.6968800071188286e-07, "loss": 0.015451706945896149, "memory(GiB)": 22.66, "step": 28351, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.95731 }, { "epoch": 0.9210278400415814, "grad_norm": 0.41002485156059265, "learning_rate": 1.6954927725273796e-07, "loss": 0.012501724064350128, "memory(GiB)": 22.66, "step": 28352, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957315 }, { "epoch": 0.9210603255043368, "grad_norm": 0.32982128858566284, "learning_rate": 1.6941060954342504e-07, "loss": 0.01174192875623703, "memory(GiB)": 22.66, "step": 28353, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.9210928109670923, "grad_norm": 0.3787182867527008, "learning_rate": 1.692719975855428e-07, "loss": 0.009999062865972519, "memory(GiB)": 22.66, "step": 28354, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.9211252964298476, "grad_norm": 0.2764742970466614, "learning_rate": 1.6913344138069165e-07, "loss": 0.010155513882637024, "memory(GiB)": 22.66, "step": 28355, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.9211577818926031, "grad_norm": 0.5728891491889954, "learning_rate": 1.689949409304703e-07, "loss": 0.015145376324653625, "memory(GiB)": 22.66, "step": 28356, "token_acc": 1.0, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.9211902673553585, "grad_norm": 0.29590412974357605, "learning_rate": 1.6885649623647692e-07, "loss": 0.010832368396222591, "memory(GiB)": 22.66, "step": 28357, "token_acc": 1.0, "train_speed(iter/s)": 0.95735 }, { "epoch": 0.9212227528181139, "grad_norm": 0.3701147437095642, "learning_rate": 1.687181073003097e-07, "loss": 0.013941029086709023, "memory(GiB)": 22.66, "step": 28358, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.957357 }, { "epoch": 0.9212552382808693, "grad_norm": 0.21932823956012726, "learning_rate": 1.685797741235662e-07, "loss": 0.006580994930118322, "memory(GiB)": 22.66, "step": 28359, "token_acc": 1.0, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.9212877237436248, "grad_norm": 0.39598676562309265, "learning_rate": 1.6844149670784348e-07, "loss": 0.009503223933279514, "memory(GiB)": 22.66, "step": 28360, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.9213202092063801, "grad_norm": 0.345211923122406, "learning_rate": 1.6830327505473586e-07, "loss": 0.007315867580473423, "memory(GiB)": 22.66, "step": 28361, "token_acc": 1.0, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.9213526946691356, "grad_norm": 0.30779024958610535, "learning_rate": 1.6816510916583984e-07, "loss": 0.01434536837041378, "memory(GiB)": 22.66, "step": 28362, "token_acc": 1.0, "train_speed(iter/s)": 0.957387 }, { "epoch": 0.921385180131891, "grad_norm": 0.2364593744277954, "learning_rate": 1.6802699904274911e-07, "loss": 0.008366281166672707, "memory(GiB)": 22.66, "step": 28363, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957394 }, { "epoch": 0.9214176655946464, "grad_norm": 0.3872547745704651, "learning_rate": 1.6788894468705853e-07, "loss": 0.016412537544965744, "memory(GiB)": 22.66, "step": 28364, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957401 }, { "epoch": 0.9214501510574018, "grad_norm": 0.3471403121948242, "learning_rate": 1.6775094610036125e-07, "loss": 0.008550331927835941, "memory(GiB)": 22.66, "step": 28365, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.9214826365201573, "grad_norm": 0.45484259724617004, "learning_rate": 1.6761300328424935e-07, "loss": 0.017669323831796646, "memory(GiB)": 22.66, "step": 28366, "token_acc": 0.987012987012987, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.9215151219829126, "grad_norm": 0.18818698823451996, "learning_rate": 1.6747511624031487e-07, "loss": 0.0052945418283343315, "memory(GiB)": 22.66, "step": 28367, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957424 }, { "epoch": 0.9215476074456681, "grad_norm": 0.31616100668907166, "learning_rate": 1.67337284970151e-07, "loss": 0.009490042924880981, "memory(GiB)": 22.66, "step": 28368, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957431 }, { "epoch": 0.9215800929084235, "grad_norm": 0.321713924407959, "learning_rate": 1.6719950947534592e-07, "loss": 0.008355975151062012, "memory(GiB)": 22.66, "step": 28369, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957438 }, { "epoch": 0.9216125783711789, "grad_norm": 0.22650417685508728, "learning_rate": 1.670617897574911e-07, "loss": 0.008894752711057663, "memory(GiB)": 22.66, "step": 28370, "token_acc": 1.0, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.9216450638339343, "grad_norm": 0.19267407059669495, "learning_rate": 1.6692412581817585e-07, "loss": 0.006025067530572414, "memory(GiB)": 22.66, "step": 28371, "token_acc": 1.0, "train_speed(iter/s)": 0.957452 }, { "epoch": 0.9216775492966898, "grad_norm": 0.2681247591972351, "learning_rate": 1.6678651765899e-07, "loss": 0.009016284719109535, "memory(GiB)": 22.66, "step": 28372, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957459 }, { "epoch": 0.9217100347594451, "grad_norm": 0.3077169358730316, "learning_rate": 1.6664896528151953e-07, "loss": 0.011700771749019623, "memory(GiB)": 22.66, "step": 28373, "token_acc": 0.992, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.9217425202222006, "grad_norm": 0.3435606360435486, "learning_rate": 1.665114686873537e-07, "loss": 0.011157991364598274, "memory(GiB)": 22.66, "step": 28374, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.957474 }, { "epoch": 0.921775005684956, "grad_norm": 0.5343278050422668, "learning_rate": 1.6637402787807845e-07, "loss": 0.011019155383110046, "memory(GiB)": 22.66, "step": 28375, "token_acc": 1.0, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.9218074911477114, "grad_norm": 0.3915866017341614, "learning_rate": 1.6623664285528086e-07, "loss": 0.015875179320573807, "memory(GiB)": 22.66, "step": 28376, "token_acc": 1.0, "train_speed(iter/s)": 0.957488 }, { "epoch": 0.9218399766104668, "grad_norm": 0.4216293394565582, "learning_rate": 1.6609931362054632e-07, "loss": 0.014196837320923805, "memory(GiB)": 22.66, "step": 28377, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957495 }, { "epoch": 0.9218724620732223, "grad_norm": 0.3087557256221771, "learning_rate": 1.6596204017545914e-07, "loss": 0.011092780157923698, "memory(GiB)": 22.66, "step": 28378, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.9219049475359776, "grad_norm": 0.34150153398513794, "learning_rate": 1.658248225216047e-07, "loss": 0.006972525734454393, "memory(GiB)": 22.66, "step": 28379, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95751 }, { "epoch": 0.9219374329987331, "grad_norm": 0.2812388837337494, "learning_rate": 1.6568766066056508e-07, "loss": 0.004674472846090794, "memory(GiB)": 22.66, "step": 28380, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.9219699184614885, "grad_norm": 0.4034249186515808, "learning_rate": 1.655505545939251e-07, "loss": 0.011931915767490864, "memory(GiB)": 22.66, "step": 28381, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.9220024039242439, "grad_norm": 0.24769288301467896, "learning_rate": 1.6541350432326631e-07, "loss": 0.00938432291150093, "memory(GiB)": 22.66, "step": 28382, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95753 }, { "epoch": 0.9220348893869993, "grad_norm": 0.3878977298736572, "learning_rate": 1.6527650985017075e-07, "loss": 0.011191987432539463, "memory(GiB)": 22.66, "step": 28383, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.9220673748497548, "grad_norm": 0.41624724864959717, "learning_rate": 1.651395711762188e-07, "loss": 0.012438451871275902, "memory(GiB)": 22.66, "step": 28384, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957541 }, { "epoch": 0.9220998603125101, "grad_norm": 0.34996071457862854, "learning_rate": 1.6500268830299093e-07, "loss": 0.015629667788743973, "memory(GiB)": 22.66, "step": 28385, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.9221323457752656, "grad_norm": 0.3369464576244354, "learning_rate": 1.6486586123206804e-07, "loss": 0.008699539117515087, "memory(GiB)": 22.66, "step": 28386, "token_acc": 1.0, "train_speed(iter/s)": 0.957554 }, { "epoch": 0.922164831238021, "grad_norm": 0.3500173091888428, "learning_rate": 1.647290899650289e-07, "loss": 0.009894989430904388, "memory(GiB)": 22.66, "step": 28387, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.95756 }, { "epoch": 0.9221973167007764, "grad_norm": 0.39040449261665344, "learning_rate": 1.6459237450345166e-07, "loss": 0.012768788263201714, "memory(GiB)": 22.66, "step": 28388, "token_acc": 1.0, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.9222298021635318, "grad_norm": 0.3543328642845154, "learning_rate": 1.64455714848914e-07, "loss": 0.014893079176545143, "memory(GiB)": 22.66, "step": 28389, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957572 }, { "epoch": 0.9222622876262873, "grad_norm": 0.39910075068473816, "learning_rate": 1.6431911100299348e-07, "loss": 0.013448690995573997, "memory(GiB)": 22.66, "step": 28390, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957578 }, { "epoch": 0.9222947730890426, "grad_norm": 0.395931214094162, "learning_rate": 1.6418256296726663e-07, "loss": 0.017684293910861015, "memory(GiB)": 22.66, "step": 28391, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957583 }, { "epoch": 0.9223272585517981, "grad_norm": 0.49007007479667664, "learning_rate": 1.6404607074331003e-07, "loss": 0.014570188708603382, "memory(GiB)": 22.66, "step": 28392, "token_acc": 1.0, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.9223597440145535, "grad_norm": 0.14832769334316254, "learning_rate": 1.639096343326979e-07, "loss": 0.0029313876293599606, "memory(GiB)": 22.66, "step": 28393, "token_acc": 1.0, "train_speed(iter/s)": 0.957595 }, { "epoch": 0.9223922294773089, "grad_norm": 0.38102197647094727, "learning_rate": 1.6377325373700515e-07, "loss": 0.012566105462610722, "memory(GiB)": 22.66, "step": 28394, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.9576 }, { "epoch": 0.9224247149400643, "grad_norm": 0.3884086608886719, "learning_rate": 1.636369289578066e-07, "loss": 0.01076938770711422, "memory(GiB)": 22.66, "step": 28395, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.957605 }, { "epoch": 0.9224572004028198, "grad_norm": 0.2582155764102936, "learning_rate": 1.635006599966754e-07, "loss": 0.007241610903292894, "memory(GiB)": 22.66, "step": 28396, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.95761 }, { "epoch": 0.9224896858655751, "grad_norm": 0.33835268020629883, "learning_rate": 1.6336444685518316e-07, "loss": 0.01716594211757183, "memory(GiB)": 22.66, "step": 28397, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.957616 }, { "epoch": 0.9225221713283306, "grad_norm": 0.42332759499549866, "learning_rate": 1.63228289534903e-07, "loss": 0.011007588356733322, "memory(GiB)": 22.66, "step": 28398, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.922554656791086, "grad_norm": 0.24615707993507385, "learning_rate": 1.6309218803740588e-07, "loss": 0.006446138955652714, "memory(GiB)": 22.66, "step": 28399, "token_acc": 1.0, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.9225871422538414, "grad_norm": 0.34903979301452637, "learning_rate": 1.6295614236426337e-07, "loss": 0.006079004146158695, "memory(GiB)": 22.66, "step": 28400, "token_acc": 1.0, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.9226196277165968, "grad_norm": 0.398187518119812, "learning_rate": 1.6282015251704476e-07, "loss": 0.008944827131927013, "memory(GiB)": 22.66, "step": 28401, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.9226521131793523, "grad_norm": 0.3616594970226288, "learning_rate": 1.626842184973204e-07, "loss": 0.012698248028755188, "memory(GiB)": 22.66, "step": 28402, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.957639 }, { "epoch": 0.9226845986421076, "grad_norm": 0.2843025326728821, "learning_rate": 1.6254834030665745e-07, "loss": 0.009811117313802242, "memory(GiB)": 22.66, "step": 28403, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957644 }, { "epoch": 0.9227170841048631, "grad_norm": 0.2824625074863434, "learning_rate": 1.6241251794662627e-07, "loss": 0.008714629337191582, "memory(GiB)": 22.66, "step": 28404, "token_acc": 1.0, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.9227495695676184, "grad_norm": 1.13486647605896, "learning_rate": 1.622767514187934e-07, "loss": 0.01973223313689232, "memory(GiB)": 22.66, "step": 28405, "token_acc": 0.9822222222222222, "train_speed(iter/s)": 0.957655 }, { "epoch": 0.9227820550303739, "grad_norm": 0.34449881315231323, "learning_rate": 1.6214104072472537e-07, "loss": 0.013455139473080635, "memory(GiB)": 22.66, "step": 28406, "token_acc": 0.988, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.9228145404931293, "grad_norm": 0.3375910818576813, "learning_rate": 1.6200538586598925e-07, "loss": 0.010036124847829342, "memory(GiB)": 22.66, "step": 28407, "token_acc": 1.0, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.9228470259558847, "grad_norm": 0.3612278997898102, "learning_rate": 1.6186978684414933e-07, "loss": 0.01211589016020298, "memory(GiB)": 22.66, "step": 28408, "token_acc": 1.0, "train_speed(iter/s)": 0.957673 }, { "epoch": 0.9228795114186401, "grad_norm": 0.5769423842430115, "learning_rate": 1.6173424366077328e-07, "loss": 0.015715276822447777, "memory(GiB)": 22.66, "step": 28409, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.9229119968813956, "grad_norm": 0.2742244005203247, "learning_rate": 1.615987563174226e-07, "loss": 0.008054085075855255, "memory(GiB)": 22.66, "step": 28410, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.9229444823441509, "grad_norm": 0.16963593661785126, "learning_rate": 1.614633248156633e-07, "loss": 0.006465058773756027, "memory(GiB)": 22.66, "step": 28411, "token_acc": 1.0, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.9229769678069064, "grad_norm": 0.4113403856754303, "learning_rate": 1.613279491570563e-07, "loss": 0.0170968659222126, "memory(GiB)": 22.66, "step": 28412, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.9230094532696618, "grad_norm": 0.5099891424179077, "learning_rate": 1.6119262934316482e-07, "loss": 0.013881473802030087, "memory(GiB)": 22.66, "step": 28413, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.9230419387324172, "grad_norm": 0.40739279985427856, "learning_rate": 1.610573653755515e-07, "loss": 0.011084619909524918, "memory(GiB)": 22.66, "step": 28414, "token_acc": 1.0, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.9230744241951726, "grad_norm": 0.41677531599998474, "learning_rate": 1.6092215725577677e-07, "loss": 0.012509679421782494, "memory(GiB)": 22.66, "step": 28415, "token_acc": 1.0, "train_speed(iter/s)": 0.957714 }, { "epoch": 0.9231069096579281, "grad_norm": 0.4169730842113495, "learning_rate": 1.60787004985401e-07, "loss": 0.012891561724245548, "memory(GiB)": 22.66, "step": 28416, "token_acc": 1.0, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.9231393951206835, "grad_norm": 0.30165138840675354, "learning_rate": 1.6065190856598356e-07, "loss": 0.0073463995940983295, "memory(GiB)": 22.66, "step": 28417, "token_acc": 1.0, "train_speed(iter/s)": 0.957728 }, { "epoch": 0.9231718805834389, "grad_norm": 0.3295544981956482, "learning_rate": 1.6051686799908484e-07, "loss": 0.013622734695672989, "memory(GiB)": 22.66, "step": 28418, "token_acc": 0.9894179894179894, "train_speed(iter/s)": 0.957735 }, { "epoch": 0.9232043660461944, "grad_norm": 0.34840089082717896, "learning_rate": 1.6038188328626303e-07, "loss": 0.005539943464100361, "memory(GiB)": 22.66, "step": 28419, "token_acc": 1.0, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.9232368515089497, "grad_norm": 0.18455854058265686, "learning_rate": 1.6024695442907577e-07, "loss": 0.0061153750866651535, "memory(GiB)": 22.66, "step": 28420, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.9232693369717052, "grad_norm": 0.34748974442481995, "learning_rate": 1.6011208142907963e-07, "loss": 0.009600233286619186, "memory(GiB)": 22.66, "step": 28421, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957757 }, { "epoch": 0.9233018224344606, "grad_norm": 0.41017910838127136, "learning_rate": 1.5997726428783223e-07, "loss": 0.01549520343542099, "memory(GiB)": 22.66, "step": 28422, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.957765 }, { "epoch": 0.923334307897216, "grad_norm": 0.28795143961906433, "learning_rate": 1.5984250300688954e-07, "loss": 0.012444580905139446, "memory(GiB)": 22.66, "step": 28423, "token_acc": 1.0, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.9233667933599714, "grad_norm": 0.2258991003036499, "learning_rate": 1.5970779758780698e-07, "loss": 0.007219144143164158, "memory(GiB)": 22.66, "step": 28424, "token_acc": 1.0, "train_speed(iter/s)": 0.957779 }, { "epoch": 0.9233992788227269, "grad_norm": 0.243122398853302, "learning_rate": 1.595731480321383e-07, "loss": 0.007015075068920851, "memory(GiB)": 22.66, "step": 28425, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957787 }, { "epoch": 0.9234317642854822, "grad_norm": 0.3768639862537384, "learning_rate": 1.5943855434143784e-07, "loss": 0.015231970697641373, "memory(GiB)": 22.66, "step": 28426, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957795 }, { "epoch": 0.9234642497482377, "grad_norm": 0.30157580971717834, "learning_rate": 1.5930401651725934e-07, "loss": 0.010022307746112347, "memory(GiB)": 22.66, "step": 28427, "token_acc": 1.0, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.9234967352109931, "grad_norm": 0.17882852256298065, "learning_rate": 1.5916953456115657e-07, "loss": 0.00675366073846817, "memory(GiB)": 22.66, "step": 28428, "token_acc": 1.0, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.9235292206737485, "grad_norm": 0.3242287337779999, "learning_rate": 1.590351084746794e-07, "loss": 0.010337058454751968, "memory(GiB)": 22.66, "step": 28429, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957816 }, { "epoch": 0.9235617061365039, "grad_norm": 0.3218347430229187, "learning_rate": 1.5890073825938103e-07, "loss": 0.013421033509075642, "memory(GiB)": 22.66, "step": 28430, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.9235941915992594, "grad_norm": 0.3108152747154236, "learning_rate": 1.5876642391681018e-07, "loss": 0.008019737899303436, "memory(GiB)": 22.66, "step": 28431, "token_acc": 1.0, "train_speed(iter/s)": 0.957831 }, { "epoch": 0.9236266770620147, "grad_norm": 0.1792299598455429, "learning_rate": 1.5863216544852012e-07, "loss": 0.0104824835434556, "memory(GiB)": 22.66, "step": 28432, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957838 }, { "epoch": 0.9236591625247702, "grad_norm": 0.2885362505912781, "learning_rate": 1.584979628560579e-07, "loss": 0.006972114555537701, "memory(GiB)": 22.66, "step": 28433, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.9236916479875256, "grad_norm": 0.382495641708374, "learning_rate": 1.5836381614097397e-07, "loss": 0.011094225570559502, "memory(GiB)": 22.66, "step": 28434, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957852 }, { "epoch": 0.923724133450281, "grad_norm": 0.38600921630859375, "learning_rate": 1.5822972530481483e-07, "loss": 0.009913822636008263, "memory(GiB)": 22.66, "step": 28435, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957859 }, { "epoch": 0.9237566189130364, "grad_norm": 0.3503485918045044, "learning_rate": 1.5809569034912875e-07, "loss": 0.01501917652785778, "memory(GiB)": 22.66, "step": 28436, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957867 }, { "epoch": 0.9237891043757919, "grad_norm": 0.2972496449947357, "learning_rate": 1.5796171127546444e-07, "loss": 0.006115857046097517, "memory(GiB)": 22.66, "step": 28437, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.9238215898385472, "grad_norm": 0.2723955810070038, "learning_rate": 1.578277880853657e-07, "loss": 0.0094264792278409, "memory(GiB)": 22.66, "step": 28438, "token_acc": 1.0, "train_speed(iter/s)": 0.957881 }, { "epoch": 0.9238540753013027, "grad_norm": 0.3282011151313782, "learning_rate": 1.576939207803796e-07, "loss": 0.008389524184167385, "memory(GiB)": 22.66, "step": 28439, "token_acc": 1.0, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.9238865607640581, "grad_norm": 0.31886374950408936, "learning_rate": 1.5756010936205046e-07, "loss": 0.013877850957214832, "memory(GiB)": 22.66, "step": 28440, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.957896 }, { "epoch": 0.9239190462268135, "grad_norm": 0.4821629226207733, "learning_rate": 1.5742635383192318e-07, "loss": 0.011629706248641014, "memory(GiB)": 22.66, "step": 28441, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957903 }, { "epoch": 0.9239515316895689, "grad_norm": 0.3550679087638855, "learning_rate": 1.5729265419154093e-07, "loss": 0.006442088633775711, "memory(GiB)": 22.66, "step": 28442, "token_acc": 1.0, "train_speed(iter/s)": 0.957911 }, { "epoch": 0.9239840171523244, "grad_norm": 0.37092041969299316, "learning_rate": 1.5715901044244752e-07, "loss": 0.010426204651594162, "memory(GiB)": 22.66, "step": 28443, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.9240165026150797, "grad_norm": 0.43300268054008484, "learning_rate": 1.5702542258618448e-07, "loss": 0.015066593885421753, "memory(GiB)": 22.66, "step": 28444, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.9240489880778352, "grad_norm": 0.4120161235332489, "learning_rate": 1.5689189062429443e-07, "loss": 0.010613779537379742, "memory(GiB)": 22.66, "step": 28445, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95793 }, { "epoch": 0.9240814735405906, "grad_norm": 0.3446548581123352, "learning_rate": 1.5675841455831732e-07, "loss": 0.009854204021394253, "memory(GiB)": 22.66, "step": 28446, "token_acc": 1.0, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.924113959003346, "grad_norm": 0.4740627408027649, "learning_rate": 1.5662499438979571e-07, "loss": 0.015023157000541687, "memory(GiB)": 22.66, "step": 28447, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957942 }, { "epoch": 0.9241464444661014, "grad_norm": 0.3861595690250397, "learning_rate": 1.5649163012026735e-07, "loss": 0.013052540831267834, "memory(GiB)": 22.66, "step": 28448, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.9241789299288569, "grad_norm": 0.3857392370700836, "learning_rate": 1.5635832175127265e-07, "loss": 0.013972634449601173, "memory(GiB)": 22.66, "step": 28449, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.9242114153916122, "grad_norm": 0.17042584717273712, "learning_rate": 1.5622506928434922e-07, "loss": 0.003640013514086604, "memory(GiB)": 22.66, "step": 28450, "token_acc": 1.0, "train_speed(iter/s)": 0.95796 }, { "epoch": 0.9242439008543677, "grad_norm": 0.303995817899704, "learning_rate": 1.5609187272103587e-07, "loss": 0.011522360146045685, "memory(GiB)": 22.66, "step": 28451, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.924276386317123, "grad_norm": 0.36603349447250366, "learning_rate": 1.559587320628697e-07, "loss": 0.011239239946007729, "memory(GiB)": 22.66, "step": 28452, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957972 }, { "epoch": 0.9243088717798785, "grad_norm": 0.425980806350708, "learning_rate": 1.5582564731138728e-07, "loss": 0.019713742658495903, "memory(GiB)": 22.66, "step": 28453, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.957978 }, { "epoch": 0.9243413572426339, "grad_norm": 0.2117355465888977, "learning_rate": 1.55692618468124e-07, "loss": 0.008322201669216156, "memory(GiB)": 22.66, "step": 28454, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.957983 }, { "epoch": 0.9243738427053894, "grad_norm": 0.38828620314598083, "learning_rate": 1.555596455346159e-07, "loss": 0.011371606960892677, "memory(GiB)": 22.66, "step": 28455, "token_acc": 1.0, "train_speed(iter/s)": 0.957989 }, { "epoch": 0.9244063281681447, "grad_norm": 0.4442720115184784, "learning_rate": 1.554267285123978e-07, "loss": 0.016766518354415894, "memory(GiB)": 22.66, "step": 28456, "token_acc": 1.0, "train_speed(iter/s)": 0.957994 }, { "epoch": 0.9244388136309002, "grad_norm": 0.42674246430397034, "learning_rate": 1.5529386740300246e-07, "loss": 0.012897319160401821, "memory(GiB)": 22.66, "step": 28457, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957999 }, { "epoch": 0.9244712990936556, "grad_norm": 0.374747097492218, "learning_rate": 1.5516106220796523e-07, "loss": 0.011244980618357658, "memory(GiB)": 22.66, "step": 28458, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.958004 }, { "epoch": 0.924503784556411, "grad_norm": 0.37310707569122314, "learning_rate": 1.550283129288166e-07, "loss": 0.010915800929069519, "memory(GiB)": 22.66, "step": 28459, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.9245362700191664, "grad_norm": 0.2675514817237854, "learning_rate": 1.5489561956709087e-07, "loss": 0.009123685769736767, "memory(GiB)": 22.66, "step": 28460, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.9245687554819219, "grad_norm": 0.2665998041629791, "learning_rate": 1.547629821243185e-07, "loss": 0.008246046490967274, "memory(GiB)": 22.66, "step": 28461, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958018 }, { "epoch": 0.9246012409446772, "grad_norm": 0.2989453077316284, "learning_rate": 1.5463040060202994e-07, "loss": 0.008116213604807854, "memory(GiB)": 22.66, "step": 28462, "token_acc": 1.0, "train_speed(iter/s)": 0.958023 }, { "epoch": 0.9246337264074327, "grad_norm": 0.3097210228443146, "learning_rate": 1.5449787500175617e-07, "loss": 0.008189731277525425, "memory(GiB)": 22.66, "step": 28463, "token_acc": 1.0, "train_speed(iter/s)": 0.958028 }, { "epoch": 0.924666211870188, "grad_norm": 0.15697252750396729, "learning_rate": 1.5436540532502542e-07, "loss": 0.00716445641592145, "memory(GiB)": 22.66, "step": 28464, "token_acc": 1.0, "train_speed(iter/s)": 0.958033 }, { "epoch": 0.9246986973329435, "grad_norm": 0.2949252724647522, "learning_rate": 1.5423299157336814e-07, "loss": 0.007811041548848152, "memory(GiB)": 22.66, "step": 28465, "token_acc": 1.0, "train_speed(iter/s)": 0.958037 }, { "epoch": 0.9247311827956989, "grad_norm": 0.4168708920478821, "learning_rate": 1.5410063374831142e-07, "loss": 0.012973235920071602, "memory(GiB)": 22.66, "step": 28466, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.9247636682584544, "grad_norm": 0.330750048160553, "learning_rate": 1.539683318513835e-07, "loss": 0.010922716930508614, "memory(GiB)": 22.66, "step": 28467, "token_acc": 1.0, "train_speed(iter/s)": 0.958047 }, { "epoch": 0.9247961537212097, "grad_norm": 0.19306036829948425, "learning_rate": 1.538360858841109e-07, "loss": 0.008533606305718422, "memory(GiB)": 22.66, "step": 28468, "token_acc": 1.0, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.9248286391839652, "grad_norm": 0.3077998161315918, "learning_rate": 1.5370389584802024e-07, "loss": 0.013698257505893707, "memory(GiB)": 22.66, "step": 28469, "token_acc": 1.0, "train_speed(iter/s)": 0.958057 }, { "epoch": 0.9248611246467205, "grad_norm": 0.3012567460536957, "learning_rate": 1.5357176174463696e-07, "loss": 0.011253231205046177, "memory(GiB)": 22.66, "step": 28470, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.924893610109476, "grad_norm": 0.4026206135749817, "learning_rate": 1.5343968357548643e-07, "loss": 0.012437675148248672, "memory(GiB)": 22.66, "step": 28471, "token_acc": 1.0, "train_speed(iter/s)": 0.958068 }, { "epoch": 0.9249260955722314, "grad_norm": 0.28733310103416443, "learning_rate": 1.5330766134209252e-07, "loss": 0.009083286859095097, "memory(GiB)": 22.66, "step": 28472, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.958074 }, { "epoch": 0.9249585810349868, "grad_norm": 0.3368132710456848, "learning_rate": 1.5317569504597896e-07, "loss": 0.012634655460715294, "memory(GiB)": 22.66, "step": 28473, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.9249910664977422, "grad_norm": 0.41541317105293274, "learning_rate": 1.5304378468866898e-07, "loss": 0.015444464981555939, "memory(GiB)": 22.66, "step": 28474, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958085 }, { "epoch": 0.9250235519604977, "grad_norm": 0.26281535625457764, "learning_rate": 1.5291193027168526e-07, "loss": 0.0045015448704361916, "memory(GiB)": 22.66, "step": 28475, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958093 }, { "epoch": 0.925056037423253, "grad_norm": 0.44977012276649475, "learning_rate": 1.5278013179654938e-07, "loss": 0.011268623173236847, "memory(GiB)": 22.66, "step": 28476, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.9581 }, { "epoch": 0.9250885228860085, "grad_norm": 0.3205297887325287, "learning_rate": 1.5264838926478233e-07, "loss": 0.009283218532800674, "memory(GiB)": 22.66, "step": 28477, "token_acc": 1.0, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.9251210083487639, "grad_norm": 0.23331701755523682, "learning_rate": 1.5251670267790397e-07, "loss": 0.008840560913085938, "memory(GiB)": 22.66, "step": 28478, "token_acc": 1.0, "train_speed(iter/s)": 0.958114 }, { "epoch": 0.9251534938115193, "grad_norm": 0.3303832411766052, "learning_rate": 1.523850720374359e-07, "loss": 0.009244075044989586, "memory(GiB)": 22.66, "step": 28479, "token_acc": 0.9921875, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.9251859792742748, "grad_norm": 0.29585400223731995, "learning_rate": 1.522534973448958e-07, "loss": 0.01107075810432434, "memory(GiB)": 22.66, "step": 28480, "token_acc": 0.9964028776978417, "train_speed(iter/s)": 0.958129 }, { "epoch": 0.9252184647370302, "grad_norm": 0.46115025877952576, "learning_rate": 1.52121978601803e-07, "loss": 0.014173990115523338, "memory(GiB)": 22.66, "step": 28481, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.9252509501997856, "grad_norm": 0.47739291191101074, "learning_rate": 1.5199051580967404e-07, "loss": 0.012398159131407738, "memory(GiB)": 22.66, "step": 28482, "token_acc": 1.0, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.925283435662541, "grad_norm": 0.2915172278881073, "learning_rate": 1.518591089700283e-07, "loss": 0.009958530776202679, "memory(GiB)": 22.66, "step": 28483, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.958151 }, { "epoch": 0.9253159211252965, "grad_norm": 0.24260807037353516, "learning_rate": 1.5172775808438123e-07, "loss": 0.006735692732036114, "memory(GiB)": 22.66, "step": 28484, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.9253484065880518, "grad_norm": 0.2578287124633789, "learning_rate": 1.5159646315424826e-07, "loss": 0.010127897374331951, "memory(GiB)": 22.66, "step": 28485, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958165 }, { "epoch": 0.9253808920508073, "grad_norm": 0.4334237277507782, "learning_rate": 1.5146522418114594e-07, "loss": 0.013538704253733158, "memory(GiB)": 22.66, "step": 28486, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.9254133775135627, "grad_norm": 0.2983388900756836, "learning_rate": 1.5133404116658757e-07, "loss": 0.011784913018345833, "memory(GiB)": 22.66, "step": 28487, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958179 }, { "epoch": 0.9254458629763181, "grad_norm": 0.30551964044570923, "learning_rate": 1.5120291411208854e-07, "loss": 0.008832572028040886, "memory(GiB)": 22.66, "step": 28488, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.9254783484390735, "grad_norm": 0.3377011716365814, "learning_rate": 1.5107184301916156e-07, "loss": 0.01361852791160345, "memory(GiB)": 22.66, "step": 28489, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958193 }, { "epoch": 0.925510833901829, "grad_norm": 0.4278887212276459, "learning_rate": 1.5094082788931986e-07, "loss": 0.012390801683068275, "memory(GiB)": 22.66, "step": 28490, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.9255433193645843, "grad_norm": 0.2259032279253006, "learning_rate": 1.5080986872407442e-07, "loss": 0.008898704312741756, "memory(GiB)": 22.66, "step": 28491, "token_acc": 1.0, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.9255758048273398, "grad_norm": 0.40659022331237793, "learning_rate": 1.5067896552493745e-07, "loss": 0.01203938852995634, "memory(GiB)": 22.66, "step": 28492, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.9256082902900952, "grad_norm": 0.265678346157074, "learning_rate": 1.5054811829341932e-07, "loss": 0.007923519238829613, "memory(GiB)": 22.66, "step": 28493, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958222 }, { "epoch": 0.9256407757528506, "grad_norm": 0.4222440719604492, "learning_rate": 1.5041732703103106e-07, "loss": 0.012034405022859573, "memory(GiB)": 22.66, "step": 28494, "token_acc": 1.0, "train_speed(iter/s)": 0.95823 }, { "epoch": 0.925673261215606, "grad_norm": 0.20825226604938507, "learning_rate": 1.5028659173928152e-07, "loss": 0.0061222827062010765, "memory(GiB)": 22.66, "step": 28495, "token_acc": 1.0, "train_speed(iter/s)": 0.958237 }, { "epoch": 0.9257057466783615, "grad_norm": 0.28144681453704834, "learning_rate": 1.5015591241967942e-07, "loss": 0.010120860300958157, "memory(GiB)": 22.66, "step": 28496, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.9257382321411168, "grad_norm": 0.3689858317375183, "learning_rate": 1.5002528907373304e-07, "loss": 0.01544357929378748, "memory(GiB)": 22.66, "step": 28497, "token_acc": 1.0, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.9257707176038723, "grad_norm": 0.2540019154548645, "learning_rate": 1.498947217029506e-07, "loss": 0.007448982913047075, "memory(GiB)": 22.66, "step": 28498, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.9258032030666277, "grad_norm": 0.24159303307533264, "learning_rate": 1.497642103088387e-07, "loss": 0.00616676639765501, "memory(GiB)": 22.66, "step": 28499, "token_acc": 0.9939024390243902, "train_speed(iter/s)": 0.958267 }, { "epoch": 0.9258356885293831, "grad_norm": 0.16196154057979584, "learning_rate": 1.4963375489290334e-07, "loss": 0.005148525349795818, "memory(GiB)": 22.66, "step": 28500, "token_acc": 1.0, "train_speed(iter/s)": 0.958275 }, { "epoch": 0.9258356885293831, "eval_loss": 0.011271455325186253, "eval_runtime": 80.7945, "eval_samples_per_second": 123.152, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9954540540262254, "step": 28500 }, { "epoch": 0.9258681739921385, "grad_norm": 0.27708056569099426, "learning_rate": 1.4950335545664994e-07, "loss": 0.013655909337103367, "memory(GiB)": 22.66, "step": 28501, "token_acc": 0.9950284869069793, "train_speed(iter/s)": 0.955309 }, { "epoch": 0.925900659454894, "grad_norm": 0.24615462124347687, "learning_rate": 1.493730120015835e-07, "loss": 0.006469826679676771, "memory(GiB)": 22.66, "step": 28502, "token_acc": 1.0, "train_speed(iter/s)": 0.955314 }, { "epoch": 0.9259331449176493, "grad_norm": 0.5219894647598267, "learning_rate": 1.4924272452920995e-07, "loss": 0.00903349183499813, "memory(GiB)": 22.66, "step": 28503, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.955318 }, { "epoch": 0.9259656303804048, "grad_norm": 0.21809403598308563, "learning_rate": 1.4911249304103036e-07, "loss": 0.0058270590379834175, "memory(GiB)": 22.66, "step": 28504, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955323 }, { "epoch": 0.9259981158431602, "grad_norm": 0.4013623893260956, "learning_rate": 1.4898231753854962e-07, "loss": 0.011618769727647305, "memory(GiB)": 22.66, "step": 28505, "token_acc": 1.0, "train_speed(iter/s)": 0.955328 }, { "epoch": 0.9260306013059156, "grad_norm": 0.33465075492858887, "learning_rate": 1.488521980232699e-07, "loss": 0.012905333191156387, "memory(GiB)": 22.66, "step": 28506, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955333 }, { "epoch": 0.926063086768671, "grad_norm": 0.5082982182502747, "learning_rate": 1.487221344966927e-07, "loss": 0.01902598887681961, "memory(GiB)": 22.66, "step": 28507, "token_acc": 0.9931506849315068, "train_speed(iter/s)": 0.955338 }, { "epoch": 0.9260955722314265, "grad_norm": 0.38131365180015564, "learning_rate": 1.4859212696031854e-07, "loss": 0.01787550188601017, "memory(GiB)": 22.66, "step": 28508, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.955343 }, { "epoch": 0.9261280576941818, "grad_norm": 0.24330951273441315, "learning_rate": 1.48462175415649e-07, "loss": 0.007639420218765736, "memory(GiB)": 22.66, "step": 28509, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.955347 }, { "epoch": 0.9261605431569373, "grad_norm": 0.3530791401863098, "learning_rate": 1.4833227986418286e-07, "loss": 0.013013827614486217, "memory(GiB)": 22.66, "step": 28510, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955352 }, { "epoch": 0.9261930286196927, "grad_norm": 0.2813924252986908, "learning_rate": 1.4820244030742005e-07, "loss": 0.007218425162136555, "memory(GiB)": 22.66, "step": 28511, "token_acc": 0.994475138121547, "train_speed(iter/s)": 0.955357 }, { "epoch": 0.9262255140824481, "grad_norm": 0.682224452495575, "learning_rate": 1.4807265674685878e-07, "loss": 0.013553151860833168, "memory(GiB)": 22.66, "step": 28512, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955363 }, { "epoch": 0.9262579995452035, "grad_norm": 0.44714662432670593, "learning_rate": 1.4794292918399678e-07, "loss": 0.01228890847414732, "memory(GiB)": 22.66, "step": 28513, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.926290485007959, "grad_norm": 0.33024537563323975, "learning_rate": 1.478132576203317e-07, "loss": 0.010320084169507027, "memory(GiB)": 22.66, "step": 28514, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955373 }, { "epoch": 0.9263229704707143, "grad_norm": 0.49319329857826233, "learning_rate": 1.4768364205735963e-07, "loss": 0.011036589741706848, "memory(GiB)": 22.66, "step": 28515, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.9263554559334698, "grad_norm": 0.37029948830604553, "learning_rate": 1.475540824965771e-07, "loss": 0.016189562156796455, "memory(GiB)": 22.66, "step": 28516, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955383 }, { "epoch": 0.9263879413962252, "grad_norm": 0.3653709590435028, "learning_rate": 1.4742457893947848e-07, "loss": 0.010745573788881302, "memory(GiB)": 22.66, "step": 28517, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955389 }, { "epoch": 0.9264204268589806, "grad_norm": 0.2349797487258911, "learning_rate": 1.472951313875598e-07, "loss": 0.007838267832994461, "memory(GiB)": 22.66, "step": 28518, "token_acc": 1.0, "train_speed(iter/s)": 0.955395 }, { "epoch": 0.926452912321736, "grad_norm": 0.29211002588272095, "learning_rate": 1.471657398423132e-07, "loss": 0.008795171976089478, "memory(GiB)": 22.66, "step": 28519, "token_acc": 1.0, "train_speed(iter/s)": 0.955401 }, { "epoch": 0.9264853977844915, "grad_norm": 0.2519737482070923, "learning_rate": 1.4703640430523415e-07, "loss": 0.012191064655780792, "memory(GiB)": 22.66, "step": 28520, "token_acc": 1.0, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.9265178832472468, "grad_norm": 0.3348829448223114, "learning_rate": 1.4690712477781366e-07, "loss": 0.01041177473962307, "memory(GiB)": 22.66, "step": 28521, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.955415 }, { "epoch": 0.9265503687100023, "grad_norm": 0.28749725222587585, "learning_rate": 1.46777901261545e-07, "loss": 0.008025970309972763, "memory(GiB)": 22.66, "step": 28522, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955422 }, { "epoch": 0.9265828541727577, "grad_norm": 0.3021930754184723, "learning_rate": 1.466487337579181e-07, "loss": 0.009424615651369095, "memory(GiB)": 22.66, "step": 28523, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955429 }, { "epoch": 0.9266153396355131, "grad_norm": 0.3968026041984558, "learning_rate": 1.4651962226842508e-07, "loss": 0.0117044597864151, "memory(GiB)": 22.66, "step": 28524, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.9266478250982685, "grad_norm": 0.32982370257377625, "learning_rate": 1.4639056679455533e-07, "loss": 0.008564211428165436, "memory(GiB)": 22.66, "step": 28525, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955443 }, { "epoch": 0.926680310561024, "grad_norm": 0.5680990815162659, "learning_rate": 1.462615673377993e-07, "loss": 0.018055509775877, "memory(GiB)": 22.66, "step": 28526, "token_acc": 0.9831932773109243, "train_speed(iter/s)": 0.95545 }, { "epoch": 0.9267127960237793, "grad_norm": 0.3701324164867401, "learning_rate": 1.461326238996441e-07, "loss": 0.008254257962107658, "memory(GiB)": 22.66, "step": 28527, "token_acc": 1.0, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.9267452814865348, "grad_norm": 0.35415777564048767, "learning_rate": 1.4600373648157916e-07, "loss": 0.008236158639192581, "memory(GiB)": 22.66, "step": 28528, "token_acc": 1.0, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.9267777669492901, "grad_norm": 0.30355173349380493, "learning_rate": 1.4587490508509216e-07, "loss": 0.01216891035437584, "memory(GiB)": 22.66, "step": 28529, "token_acc": 1.0, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.9268102524120456, "grad_norm": 0.23234571516513824, "learning_rate": 1.4574612971166912e-07, "loss": 0.008664676919579506, "memory(GiB)": 22.66, "step": 28530, "token_acc": 1.0, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.926842737874801, "grad_norm": 0.3973748981952667, "learning_rate": 1.4561741036279719e-07, "loss": 0.016709432005882263, "memory(GiB)": 22.66, "step": 28531, "token_acc": 1.0, "train_speed(iter/s)": 0.955485 }, { "epoch": 0.9268752233375565, "grad_norm": 0.4372067153453827, "learning_rate": 1.454887470399613e-07, "loss": 0.012067139148712158, "memory(GiB)": 22.66, "step": 28532, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.955492 }, { "epoch": 0.9269077088003118, "grad_norm": 0.27111393213272095, "learning_rate": 1.4536013974464636e-07, "loss": 0.007613286841660738, "memory(GiB)": 22.66, "step": 28533, "token_acc": 1.0, "train_speed(iter/s)": 0.955499 }, { "epoch": 0.9269401942630673, "grad_norm": 0.29937034845352173, "learning_rate": 1.452315884783373e-07, "loss": 0.010246160440146923, "memory(GiB)": 22.66, "step": 28534, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.9269726797258226, "grad_norm": 0.445685476064682, "learning_rate": 1.4510309324251736e-07, "loss": 0.012059707194566727, "memory(GiB)": 22.66, "step": 28535, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955513 }, { "epoch": 0.9270051651885781, "grad_norm": 0.3474572002887726, "learning_rate": 1.4497465403866984e-07, "loss": 0.011615367606282234, "memory(GiB)": 22.66, "step": 28536, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.9270376506513335, "grad_norm": 0.4243484437465668, "learning_rate": 1.448462708682763e-07, "loss": 0.014138182625174522, "memory(GiB)": 22.66, "step": 28537, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955527 }, { "epoch": 0.927070136114089, "grad_norm": 0.3784171938896179, "learning_rate": 1.4471794373281888e-07, "loss": 0.011044515296816826, "memory(GiB)": 22.66, "step": 28538, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.955534 }, { "epoch": 0.9271026215768443, "grad_norm": 0.30729585886001587, "learning_rate": 1.4458967263377978e-07, "loss": 0.011157834902405739, "memory(GiB)": 22.66, "step": 28539, "token_acc": 0.9929577464788732, "train_speed(iter/s)": 0.955542 }, { "epoch": 0.9271351070395998, "grad_norm": 0.38230234384536743, "learning_rate": 1.4446145757263718e-07, "loss": 0.009095018729567528, "memory(GiB)": 22.66, "step": 28540, "token_acc": 1.0, "train_speed(iter/s)": 0.955549 }, { "epoch": 0.9271675925023551, "grad_norm": 0.3188706636428833, "learning_rate": 1.443332985508733e-07, "loss": 0.008663805201649666, "memory(GiB)": 22.66, "step": 28541, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.955556 }, { "epoch": 0.9272000779651106, "grad_norm": 0.3674618899822235, "learning_rate": 1.4420519556996416e-07, "loss": 0.011532684788107872, "memory(GiB)": 22.66, "step": 28542, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.927232563427866, "grad_norm": 0.16448502242565155, "learning_rate": 1.440771486313919e-07, "loss": 0.004736844450235367, "memory(GiB)": 22.66, "step": 28543, "token_acc": 1.0, "train_speed(iter/s)": 0.95557 }, { "epoch": 0.9272650488906214, "grad_norm": 0.24015741050243378, "learning_rate": 1.4394915773663197e-07, "loss": 0.010332634672522545, "memory(GiB)": 22.66, "step": 28544, "token_acc": 1.0, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.9272975343533769, "grad_norm": 0.20931200683116913, "learning_rate": 1.438212228871616e-07, "loss": 0.007824067957699299, "memory(GiB)": 22.66, "step": 28545, "token_acc": 0.995, "train_speed(iter/s)": 0.955584 }, { "epoch": 0.9273300198161323, "grad_norm": 0.35617804527282715, "learning_rate": 1.43693344084459e-07, "loss": 0.009471060708165169, "memory(GiB)": 22.66, "step": 28546, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955591 }, { "epoch": 0.9273625052788877, "grad_norm": 0.34533098340034485, "learning_rate": 1.4356552132999746e-07, "loss": 0.00983462668955326, "memory(GiB)": 22.66, "step": 28547, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955598 }, { "epoch": 0.9273949907416431, "grad_norm": 0.2747572660446167, "learning_rate": 1.434377546252552e-07, "loss": 0.007861030288040638, "memory(GiB)": 22.66, "step": 28548, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955606 }, { "epoch": 0.9274274762043986, "grad_norm": 0.427969366312027, "learning_rate": 1.4331004397170445e-07, "loss": 0.010955609381198883, "memory(GiB)": 22.66, "step": 28549, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955612 }, { "epoch": 0.9274599616671539, "grad_norm": 0.2949381470680237, "learning_rate": 1.4318238937082062e-07, "loss": 0.00874786265194416, "memory(GiB)": 22.66, "step": 28550, "token_acc": 1.0, "train_speed(iter/s)": 0.955618 }, { "epoch": 0.9274924471299094, "grad_norm": 0.5063877105712891, "learning_rate": 1.430547908240759e-07, "loss": 0.011679070070385933, "memory(GiB)": 22.66, "step": 28551, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.955624 }, { "epoch": 0.9275249325926648, "grad_norm": 0.40011832118034363, "learning_rate": 1.4292724833294358e-07, "loss": 0.011161107569932938, "memory(GiB)": 22.66, "step": 28552, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.9275574180554202, "grad_norm": 0.1991082727909088, "learning_rate": 1.4279976189889523e-07, "loss": 0.008009667508304119, "memory(GiB)": 22.66, "step": 28553, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.9275899035181756, "grad_norm": 0.29935553669929504, "learning_rate": 1.4267233152340353e-07, "loss": 0.007182060740888119, "memory(GiB)": 22.66, "step": 28554, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.9276223889809311, "grad_norm": 0.2042120397090912, "learning_rate": 1.4254495720793736e-07, "loss": 0.006564781069755554, "memory(GiB)": 22.66, "step": 28555, "token_acc": 1.0, "train_speed(iter/s)": 0.955647 }, { "epoch": 0.9276548744436864, "grad_norm": 0.2558738887310028, "learning_rate": 1.4241763895396777e-07, "loss": 0.006373135838657618, "memory(GiB)": 22.66, "step": 28556, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955653 }, { "epoch": 0.9276873599064419, "grad_norm": 0.40928375720977783, "learning_rate": 1.4229037676296408e-07, "loss": 0.009914928115904331, "memory(GiB)": 22.66, "step": 28557, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955659 }, { "epoch": 0.9277198453691973, "grad_norm": 0.4367976784706116, "learning_rate": 1.4216317063639574e-07, "loss": 0.01480044424533844, "memory(GiB)": 22.66, "step": 28558, "token_acc": 0.9934426229508196, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.9277523308319527, "grad_norm": 0.2594057321548462, "learning_rate": 1.4203602057572928e-07, "loss": 0.008380136452615261, "memory(GiB)": 22.66, "step": 28559, "token_acc": 1.0, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.9277848162947081, "grad_norm": 0.3405762016773224, "learning_rate": 1.4190892658243304e-07, "loss": 0.012893324717879295, "memory(GiB)": 22.66, "step": 28560, "token_acc": 1.0, "train_speed(iter/s)": 0.955674 }, { "epoch": 0.9278173017574636, "grad_norm": 0.400484174489975, "learning_rate": 1.4178188865797414e-07, "loss": 0.017166506499052048, "memory(GiB)": 22.66, "step": 28561, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955679 }, { "epoch": 0.9278497872202189, "grad_norm": 0.5255091190338135, "learning_rate": 1.4165490680381866e-07, "loss": 0.015393339097499847, "memory(GiB)": 22.66, "step": 28562, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.9278822726829744, "grad_norm": 0.35512396693229675, "learning_rate": 1.4152798102143206e-07, "loss": 0.016575269401073456, "memory(GiB)": 22.66, "step": 28563, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955688 }, { "epoch": 0.9279147581457298, "grad_norm": 0.23522990942001343, "learning_rate": 1.414011113122793e-07, "loss": 0.0061340732499957085, "memory(GiB)": 22.66, "step": 28564, "token_acc": 1.0, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.9279472436084852, "grad_norm": 0.4577750265598297, "learning_rate": 1.4127429767782363e-07, "loss": 0.01152694784104824, "memory(GiB)": 22.66, "step": 28565, "token_acc": 1.0, "train_speed(iter/s)": 0.955698 }, { "epoch": 0.9279797290712406, "grad_norm": 0.4359140992164612, "learning_rate": 1.4114754011953002e-07, "loss": 0.010581575334072113, "memory(GiB)": 22.66, "step": 28566, "token_acc": 1.0, "train_speed(iter/s)": 0.955703 }, { "epoch": 0.9280122145339961, "grad_norm": 0.21629777550697327, "learning_rate": 1.4102083863886173e-07, "loss": 0.006789141800254583, "memory(GiB)": 22.66, "step": 28567, "token_acc": 1.0, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.9280446999967514, "grad_norm": 0.4095762073993683, "learning_rate": 1.4089419323727926e-07, "loss": 0.01481542643159628, "memory(GiB)": 22.66, "step": 28568, "token_acc": 0.992, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.9280771854595069, "grad_norm": 0.6599063277244568, "learning_rate": 1.4076760391624588e-07, "loss": 0.01727336458861828, "memory(GiB)": 22.66, "step": 28569, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.9281096709222623, "grad_norm": 0.41986075043678284, "learning_rate": 1.4064107067722098e-07, "loss": 0.013581791892647743, "memory(GiB)": 22.66, "step": 28570, "token_acc": 1.0, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.9281421563850177, "grad_norm": 0.39615797996520996, "learning_rate": 1.405145935216673e-07, "loss": 0.008976025506854057, "memory(GiB)": 22.66, "step": 28571, "token_acc": 1.0, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.9281746418477731, "grad_norm": 0.3237619698047638, "learning_rate": 1.4038817245104196e-07, "loss": 0.012712191790342331, "memory(GiB)": 22.66, "step": 28572, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.9282071273105286, "grad_norm": 0.30460092425346375, "learning_rate": 1.4026180746680605e-07, "loss": 0.008851699531078339, "memory(GiB)": 22.66, "step": 28573, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.9282396127732839, "grad_norm": 0.46394452452659607, "learning_rate": 1.4013549857041675e-07, "loss": 0.01707514375448227, "memory(GiB)": 22.66, "step": 28574, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.9282720982360394, "grad_norm": 0.4008900821208954, "learning_rate": 1.4000924576333175e-07, "loss": 0.013109667226672173, "memory(GiB)": 22.66, "step": 28575, "token_acc": 1.0, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.9283045836987948, "grad_norm": 0.36895474791526794, "learning_rate": 1.3988304904700988e-07, "loss": 0.01336762961000204, "memory(GiB)": 22.66, "step": 28576, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.9283370691615502, "grad_norm": 0.41397374868392944, "learning_rate": 1.3975690842290556e-07, "loss": 0.014041846618056297, "memory(GiB)": 22.66, "step": 28577, "token_acc": 1.0, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.9283695546243056, "grad_norm": 0.2683519124984741, "learning_rate": 1.396308238924765e-07, "loss": 0.006581766530871391, "memory(GiB)": 22.66, "step": 28578, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955753 }, { "epoch": 0.9284020400870611, "grad_norm": 0.32459405064582825, "learning_rate": 1.39504795457176e-07, "loss": 0.009617198258638382, "memory(GiB)": 22.66, "step": 28579, "token_acc": 1.0, "train_speed(iter/s)": 0.955759 }, { "epoch": 0.9284345255498164, "grad_norm": 0.2979029715061188, "learning_rate": 1.3937882311845897e-07, "loss": 0.009660394862294197, "memory(GiB)": 22.66, "step": 28580, "token_acc": 1.0, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.9284670110125719, "grad_norm": 0.44248032569885254, "learning_rate": 1.3925290687778036e-07, "loss": 0.015464972704648972, "memory(GiB)": 22.66, "step": 28581, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.9284994964753273, "grad_norm": 0.21328546106815338, "learning_rate": 1.391270467365935e-07, "loss": 0.009421169757843018, "memory(GiB)": 22.66, "step": 28582, "token_acc": 1.0, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.9285319819380827, "grad_norm": 0.3144594430923462, "learning_rate": 1.390012426963494e-07, "loss": 0.009466076269745827, "memory(GiB)": 22.66, "step": 28583, "token_acc": 1.0, "train_speed(iter/s)": 0.955788 }, { "epoch": 0.9285644674008381, "grad_norm": 0.3017503321170807, "learning_rate": 1.3887549475850138e-07, "loss": 0.007999589666724205, "memory(GiB)": 22.66, "step": 28584, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.955795 }, { "epoch": 0.9285969528635936, "grad_norm": 0.35000625252723694, "learning_rate": 1.387498029244999e-07, "loss": 0.008623408153653145, "memory(GiB)": 22.66, "step": 28585, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955802 }, { "epoch": 0.9286294383263489, "grad_norm": 0.4738132059574127, "learning_rate": 1.3862416719579662e-07, "loss": 0.01328978966921568, "memory(GiB)": 22.66, "step": 28586, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.955809 }, { "epoch": 0.9286619237891044, "grad_norm": 0.3301876187324524, "learning_rate": 1.3849858757384038e-07, "loss": 0.011387597769498825, "memory(GiB)": 22.66, "step": 28587, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955816 }, { "epoch": 0.9286944092518598, "grad_norm": 0.5128579139709473, "learning_rate": 1.3837306406008167e-07, "loss": 0.01440996490418911, "memory(GiB)": 22.66, "step": 28588, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955822 }, { "epoch": 0.9287268947146152, "grad_norm": 0.19264084100723267, "learning_rate": 1.3824759665596764e-07, "loss": 0.006348667200654745, "memory(GiB)": 22.66, "step": 28589, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955828 }, { "epoch": 0.9287593801773706, "grad_norm": 0.4071347713470459, "learning_rate": 1.381221853629483e-07, "loss": 0.013284181244671345, "memory(GiB)": 22.66, "step": 28590, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.9287918656401261, "grad_norm": 0.3456459939479828, "learning_rate": 1.3799683018246967e-07, "loss": 0.007426878437399864, "memory(GiB)": 22.66, "step": 28591, "token_acc": 1.0, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.9288243511028814, "grad_norm": 0.31558796763420105, "learning_rate": 1.3787153111597895e-07, "loss": 0.006051559001207352, "memory(GiB)": 22.66, "step": 28592, "token_acc": 1.0, "train_speed(iter/s)": 0.955849 }, { "epoch": 0.9288568365656369, "grad_norm": 0.3718532919883728, "learning_rate": 1.377462881649222e-07, "loss": 0.00914215762168169, "memory(GiB)": 22.66, "step": 28593, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.9288893220283923, "grad_norm": 0.379749059677124, "learning_rate": 1.376211013307449e-07, "loss": 0.011264318600296974, "memory(GiB)": 22.66, "step": 28594, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955863 }, { "epoch": 0.9289218074911477, "grad_norm": 0.37938475608825684, "learning_rate": 1.3749597061489205e-07, "loss": 0.01686044968664646, "memory(GiB)": 22.66, "step": 28595, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955871 }, { "epoch": 0.9289542929539031, "grad_norm": 0.2959417998790741, "learning_rate": 1.3737089601880748e-07, "loss": 0.007679350208491087, "memory(GiB)": 22.66, "step": 28596, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955878 }, { "epoch": 0.9289867784166586, "grad_norm": 0.3968542814254761, "learning_rate": 1.372458775439356e-07, "loss": 0.011655628681182861, "memory(GiB)": 22.66, "step": 28597, "token_acc": 1.0, "train_speed(iter/s)": 0.955885 }, { "epoch": 0.9290192638794139, "grad_norm": 0.21183568239212036, "learning_rate": 1.3712091519171745e-07, "loss": 0.007693436928093433, "memory(GiB)": 22.66, "step": 28598, "token_acc": 1.0, "train_speed(iter/s)": 0.955892 }, { "epoch": 0.9290517493421694, "grad_norm": 0.37789371609687805, "learning_rate": 1.3699600896359743e-07, "loss": 0.024271436035633087, "memory(GiB)": 22.66, "step": 28599, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955899 }, { "epoch": 0.9290842348049247, "grad_norm": 0.24790674448013306, "learning_rate": 1.3687115886101553e-07, "loss": 0.005164319649338722, "memory(GiB)": 22.66, "step": 28600, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.955907 }, { "epoch": 0.9291167202676802, "grad_norm": 0.4002301096916199, "learning_rate": 1.3674636488541394e-07, "loss": 0.008444767445325851, "memory(GiB)": 22.66, "step": 28601, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.9291492057304356, "grad_norm": 0.48965585231781006, "learning_rate": 1.3662162703823146e-07, "loss": 0.014191294088959694, "memory(GiB)": 22.66, "step": 28602, "token_acc": 1.0, "train_speed(iter/s)": 0.955921 }, { "epoch": 0.929181691193191, "grad_norm": 0.2932121157646179, "learning_rate": 1.3649694532090917e-07, "loss": 0.01055259257555008, "memory(GiB)": 22.66, "step": 28603, "token_acc": 1.0, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.9292141766559464, "grad_norm": 0.36652401089668274, "learning_rate": 1.3637231973488485e-07, "loss": 0.011784674599766731, "memory(GiB)": 22.66, "step": 28604, "token_acc": 1.0, "train_speed(iter/s)": 0.955934 }, { "epoch": 0.9292466621187019, "grad_norm": 0.6183063387870789, "learning_rate": 1.362477502815973e-07, "loss": 0.012138249352574348, "memory(GiB)": 22.66, "step": 28605, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.955942 }, { "epoch": 0.9292791475814572, "grad_norm": 0.4620888829231262, "learning_rate": 1.361232369624854e-07, "loss": 0.012909195385873318, "memory(GiB)": 22.66, "step": 28606, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.955948 }, { "epoch": 0.9293116330442127, "grad_norm": 0.2969605028629303, "learning_rate": 1.359987797789841e-07, "loss": 0.011156845837831497, "memory(GiB)": 22.66, "step": 28607, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.9293441185069682, "grad_norm": 0.301325261592865, "learning_rate": 1.3587437873253118e-07, "loss": 0.007418273016810417, "memory(GiB)": 22.66, "step": 28608, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955962 }, { "epoch": 0.9293766039697235, "grad_norm": 0.4148987829685211, "learning_rate": 1.3575003382456208e-07, "loss": 0.015556429512798786, "memory(GiB)": 22.66, "step": 28609, "token_acc": 1.0, "train_speed(iter/s)": 0.955969 }, { "epoch": 0.929409089432479, "grad_norm": 0.3088587820529938, "learning_rate": 1.3562574505651237e-07, "loss": 0.009010079316794872, "memory(GiB)": 22.66, "step": 28610, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.9294415748952344, "grad_norm": 0.25041159987449646, "learning_rate": 1.3550151242981535e-07, "loss": 0.0094190314412117, "memory(GiB)": 22.66, "step": 28611, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955983 }, { "epoch": 0.9294740603579899, "grad_norm": 0.18963837623596191, "learning_rate": 1.3537733594590597e-07, "loss": 0.008366633206605911, "memory(GiB)": 22.66, "step": 28612, "token_acc": 1.0, "train_speed(iter/s)": 0.955991 }, { "epoch": 0.9295065458207452, "grad_norm": 0.26663273572921753, "learning_rate": 1.352532156062164e-07, "loss": 0.006274568848311901, "memory(GiB)": 22.66, "step": 28613, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955997 }, { "epoch": 0.9295390312835007, "grad_norm": 0.36203983426094055, "learning_rate": 1.3512915141218054e-07, "loss": 0.010884071700274944, "memory(GiB)": 22.66, "step": 28614, "token_acc": 1.0, "train_speed(iter/s)": 0.956003 }, { "epoch": 0.929571516746256, "grad_norm": 0.3329112231731415, "learning_rate": 1.3500514336522885e-07, "loss": 0.013346869498491287, "memory(GiB)": 22.66, "step": 28615, "token_acc": 0.9966101694915255, "train_speed(iter/s)": 0.956009 }, { "epoch": 0.9296040022090115, "grad_norm": 0.3040532171726227, "learning_rate": 1.34881191466793e-07, "loss": 0.010715310461819172, "memory(GiB)": 22.66, "step": 28616, "token_acc": 1.0, "train_speed(iter/s)": 0.956015 }, { "epoch": 0.9296364876717669, "grad_norm": 0.34663090109825134, "learning_rate": 1.3475729571830353e-07, "loss": 0.012138600461184978, "memory(GiB)": 22.66, "step": 28617, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.95602 }, { "epoch": 0.9296689731345223, "grad_norm": 0.41982752084732056, "learning_rate": 1.346334561211915e-07, "loss": 0.012223831377923489, "memory(GiB)": 22.66, "step": 28618, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.9297014585972777, "grad_norm": 0.4310319125652313, "learning_rate": 1.3450967267688407e-07, "loss": 0.018149031326174736, "memory(GiB)": 22.66, "step": 28619, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.95603 }, { "epoch": 0.9297339440600332, "grad_norm": 0.2201579362154007, "learning_rate": 1.3438594538681182e-07, "loss": 0.008538158610463142, "memory(GiB)": 22.66, "step": 28620, "token_acc": 0.9936305732484076, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.9297664295227885, "grad_norm": 0.308495432138443, "learning_rate": 1.3426227425240079e-07, "loss": 0.010090360417962074, "memory(GiB)": 22.66, "step": 28621, "token_acc": 1.0, "train_speed(iter/s)": 0.95604 }, { "epoch": 0.929798914985544, "grad_norm": 0.24871720373630524, "learning_rate": 1.3413865927507984e-07, "loss": 0.008452183566987514, "memory(GiB)": 22.66, "step": 28622, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.956045 }, { "epoch": 0.9298314004482994, "grad_norm": 0.3436284065246582, "learning_rate": 1.3401510045627565e-07, "loss": 0.0107195433229208, "memory(GiB)": 22.66, "step": 28623, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.9298638859110548, "grad_norm": 0.36418917775154114, "learning_rate": 1.3389159779741366e-07, "loss": 0.011152287013828754, "memory(GiB)": 22.66, "step": 28624, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.9298963713738102, "grad_norm": 0.5030372738838196, "learning_rate": 1.337681512999195e-07, "loss": 0.012020528316497803, "memory(GiB)": 22.66, "step": 28625, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956059 }, { "epoch": 0.9299288568365657, "grad_norm": 0.43858999013900757, "learning_rate": 1.3364476096521695e-07, "loss": 0.011591721326112747, "memory(GiB)": 22.66, "step": 28626, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.929961342299321, "grad_norm": 0.3144809305667877, "learning_rate": 1.3352142679473214e-07, "loss": 0.013582509011030197, "memory(GiB)": 22.66, "step": 28627, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.9299938277620765, "grad_norm": 0.2990797162055969, "learning_rate": 1.3339814878988667e-07, "loss": 0.008213724009692669, "memory(GiB)": 22.66, "step": 28628, "token_acc": 1.0, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.9300263132248319, "grad_norm": 0.3049381375312805, "learning_rate": 1.3327492695210442e-07, "loss": 0.009968201629817486, "memory(GiB)": 22.66, "step": 28629, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.9300587986875873, "grad_norm": 0.3058222532272339, "learning_rate": 1.331517612828065e-07, "loss": 0.009953835979104042, "memory(GiB)": 22.66, "step": 28630, "token_acc": 1.0, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.9300912841503427, "grad_norm": 0.40133312344551086, "learning_rate": 1.3302865178341505e-07, "loss": 0.007073376327753067, "memory(GiB)": 22.66, "step": 28631, "token_acc": 1.0, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.9301237696130982, "grad_norm": 0.4456751048564911, "learning_rate": 1.3290559845535124e-07, "loss": 0.014439425431191921, "memory(GiB)": 22.66, "step": 28632, "token_acc": 1.0, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.9301562550758535, "grad_norm": 0.3202371299266815, "learning_rate": 1.3278260130003497e-07, "loss": 0.009083514101803303, "memory(GiB)": 22.66, "step": 28633, "token_acc": 1.0, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.930188740538609, "grad_norm": 0.34610968828201294, "learning_rate": 1.3265966031888565e-07, "loss": 0.011169211938977242, "memory(GiB)": 22.66, "step": 28634, "token_acc": 1.0, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.9302212260013644, "grad_norm": 0.3227512836456299, "learning_rate": 1.3253677551332224e-07, "loss": 0.008406350389122963, "memory(GiB)": 22.66, "step": 28635, "token_acc": 1.0, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.9302537114641198, "grad_norm": 0.2528807520866394, "learning_rate": 1.3241394688476294e-07, "loss": 0.011117203161120415, "memory(GiB)": 22.66, "step": 28636, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.9302861969268752, "grad_norm": 0.27658167481422424, "learning_rate": 1.3229117443462502e-07, "loss": 0.008100055158138275, "memory(GiB)": 22.66, "step": 28637, "token_acc": 1.0, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.9303186823896307, "grad_norm": 0.3751644194126129, "learning_rate": 1.3216845816432676e-07, "loss": 0.011445892043411732, "memory(GiB)": 22.66, "step": 28638, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.930351167852386, "grad_norm": 0.9099860787391663, "learning_rate": 1.3204579807528318e-07, "loss": 0.015363438054919243, "memory(GiB)": 22.66, "step": 28639, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.9303836533151415, "grad_norm": 0.28117573261260986, "learning_rate": 1.319231941689103e-07, "loss": 0.009393515065312386, "memory(GiB)": 22.66, "step": 28640, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956135 }, { "epoch": 0.9304161387778969, "grad_norm": 0.295187771320343, "learning_rate": 1.318006464466226e-07, "loss": 0.009136034175753593, "memory(GiB)": 22.66, "step": 28641, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956141 }, { "epoch": 0.9304486242406523, "grad_norm": 0.4998553693294525, "learning_rate": 1.3167815490983615e-07, "loss": 0.01418121624737978, "memory(GiB)": 22.66, "step": 28642, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956147 }, { "epoch": 0.9304811097034077, "grad_norm": 0.2738209068775177, "learning_rate": 1.315557195599626e-07, "loss": 0.006951144430786371, "memory(GiB)": 22.66, "step": 28643, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.9305135951661632, "grad_norm": 0.3518946170806885, "learning_rate": 1.3143334039841637e-07, "loss": 0.00712438253685832, "memory(GiB)": 22.66, "step": 28644, "token_acc": 1.0, "train_speed(iter/s)": 0.956162 }, { "epoch": 0.9305460806289185, "grad_norm": 0.4088415503501892, "learning_rate": 1.313110174266091e-07, "loss": 0.015215696766972542, "memory(GiB)": 22.66, "step": 28645, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956169 }, { "epoch": 0.930578566091674, "grad_norm": 0.32224395871162415, "learning_rate": 1.3118875064595303e-07, "loss": 0.008336197584867477, "memory(GiB)": 22.66, "step": 28646, "token_acc": 1.0, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.9306110515544294, "grad_norm": 0.3510737121105194, "learning_rate": 1.3106654005785925e-07, "loss": 0.00884639099240303, "memory(GiB)": 22.66, "step": 28647, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956183 }, { "epoch": 0.9306435370171848, "grad_norm": 0.3323605954647064, "learning_rate": 1.3094438566373878e-07, "loss": 0.00986885093152523, "memory(GiB)": 22.66, "step": 28648, "token_acc": 1.0, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.9306760224799402, "grad_norm": 0.3338789939880371, "learning_rate": 1.3082228746499948e-07, "loss": 0.011206124909222126, "memory(GiB)": 22.66, "step": 28649, "token_acc": 1.0, "train_speed(iter/s)": 0.956197 }, { "epoch": 0.9307085079426957, "grad_norm": 0.33561384677886963, "learning_rate": 1.3070024546305239e-07, "loss": 0.008990241214632988, "memory(GiB)": 22.66, "step": 28650, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956204 }, { "epoch": 0.930740993405451, "grad_norm": 0.41175055503845215, "learning_rate": 1.305782596593058e-07, "loss": 0.01678631082177162, "memory(GiB)": 22.66, "step": 28651, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.9307734788682065, "grad_norm": 0.4227798581123352, "learning_rate": 1.30456330055167e-07, "loss": 0.00856385100632906, "memory(GiB)": 22.66, "step": 28652, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.9308059643309619, "grad_norm": 0.29746928811073303, "learning_rate": 1.303344566520437e-07, "loss": 0.009153395891189575, "memory(GiB)": 22.66, "step": 28653, "token_acc": 1.0, "train_speed(iter/s)": 0.956226 }, { "epoch": 0.9308384497937173, "grad_norm": 0.36872538924217224, "learning_rate": 1.3021263945134144e-07, "loss": 0.011492401361465454, "memory(GiB)": 22.66, "step": 28654, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.9308709352564727, "grad_norm": 0.3686753809452057, "learning_rate": 1.30090878454468e-07, "loss": 0.01062441524118185, "memory(GiB)": 22.66, "step": 28655, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.95624 }, { "epoch": 0.9309034207192282, "grad_norm": 0.228094682097435, "learning_rate": 1.2996917366282725e-07, "loss": 0.008477328345179558, "memory(GiB)": 22.66, "step": 28656, "token_acc": 1.0, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.9309359061819835, "grad_norm": 0.21930952370166779, "learning_rate": 1.298475250778247e-07, "loss": 0.005776833742856979, "memory(GiB)": 22.66, "step": 28657, "token_acc": 1.0, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.930968391644739, "grad_norm": 0.48839935660362244, "learning_rate": 1.297259327008632e-07, "loss": 0.013362050987780094, "memory(GiB)": 22.66, "step": 28658, "token_acc": 1.0, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.9310008771074944, "grad_norm": 0.3341578245162964, "learning_rate": 1.2960439653334711e-07, "loss": 0.015888851135969162, "memory(GiB)": 22.66, "step": 28659, "token_acc": 1.0, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.9310333625702498, "grad_norm": 0.39334607124328613, "learning_rate": 1.2948291657667867e-07, "loss": 0.01571105606853962, "memory(GiB)": 22.66, "step": 28660, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956275 }, { "epoch": 0.9310658480330052, "grad_norm": 0.33233121037483215, "learning_rate": 1.2936149283226061e-07, "loss": 0.008064793422818184, "memory(GiB)": 22.66, "step": 28661, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.9310983334957607, "grad_norm": 0.3416092097759247, "learning_rate": 1.29240125301493e-07, "loss": 0.007702344097197056, "memory(GiB)": 22.66, "step": 28662, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956289 }, { "epoch": 0.931130818958516, "grad_norm": 0.3721105456352234, "learning_rate": 1.291188139857774e-07, "loss": 0.011755680665373802, "memory(GiB)": 22.66, "step": 28663, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956296 }, { "epoch": 0.9311633044212715, "grad_norm": 0.3670421540737152, "learning_rate": 1.2899755888651332e-07, "loss": 0.00930621288716793, "memory(GiB)": 22.66, "step": 28664, "token_acc": 0.995, "train_speed(iter/s)": 0.956303 }, { "epoch": 0.9311957898840268, "grad_norm": 0.4083081781864166, "learning_rate": 1.2887636000510184e-07, "loss": 0.012705905362963676, "memory(GiB)": 22.66, "step": 28665, "token_acc": 1.0, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.9312282753467823, "grad_norm": 0.3793955445289612, "learning_rate": 1.2875521734294017e-07, "loss": 0.019593123346567154, "memory(GiB)": 22.66, "step": 28666, "token_acc": 0.9852216748768473, "train_speed(iter/s)": 0.956318 }, { "epoch": 0.9312607608095377, "grad_norm": 0.31789782643318176, "learning_rate": 1.2863413090142663e-07, "loss": 0.011448154225945473, "memory(GiB)": 22.66, "step": 28667, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956325 }, { "epoch": 0.9312932462722932, "grad_norm": 0.31623154878616333, "learning_rate": 1.2851310068195898e-07, "loss": 0.013287406414747238, "memory(GiB)": 22.66, "step": 28668, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.9313257317350485, "grad_norm": 0.33139193058013916, "learning_rate": 1.2839212668593393e-07, "loss": 0.014006752520799637, "memory(GiB)": 22.66, "step": 28669, "token_acc": 0.985781990521327, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.931358217197804, "grad_norm": 0.3157908618450165, "learning_rate": 1.2827120891474864e-07, "loss": 0.007299511227756739, "memory(GiB)": 22.66, "step": 28670, "token_acc": 1.0, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.9313907026605593, "grad_norm": 0.3253282308578491, "learning_rate": 1.2815034736979704e-07, "loss": 0.012432437390089035, "memory(GiB)": 22.66, "step": 28671, "token_acc": 1.0, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.9314231881233148, "grad_norm": 0.2723449766635895, "learning_rate": 1.280295420524752e-07, "loss": 0.007832331582903862, "memory(GiB)": 22.66, "step": 28672, "token_acc": 1.0, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.9314556735860703, "grad_norm": 0.43991217017173767, "learning_rate": 1.2790879296417702e-07, "loss": 0.011975626461207867, "memory(GiB)": 22.66, "step": 28673, "token_acc": 1.0, "train_speed(iter/s)": 0.956368 }, { "epoch": 0.9314881590488256, "grad_norm": 0.3226129710674286, "learning_rate": 1.2778810010629639e-07, "loss": 0.015765443444252014, "memory(GiB)": 22.66, "step": 28674, "token_acc": 1.0, "train_speed(iter/s)": 0.956375 }, { "epoch": 0.9315206445115811, "grad_norm": 0.2899547815322876, "learning_rate": 1.2766746348022553e-07, "loss": 0.008219091221690178, "memory(GiB)": 22.66, "step": 28675, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.9315531299743365, "grad_norm": 0.47627687454223633, "learning_rate": 1.2754688308735773e-07, "loss": 0.023129869252443314, "memory(GiB)": 22.66, "step": 28676, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.931585615437092, "grad_norm": 0.4526110589504242, "learning_rate": 1.2742635892908306e-07, "loss": 0.011473778635263443, "memory(GiB)": 22.66, "step": 28677, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.9316181008998473, "grad_norm": 0.19495031237602234, "learning_rate": 1.273058910067948e-07, "loss": 0.010085627436637878, "memory(GiB)": 22.66, "step": 28678, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956402 }, { "epoch": 0.9316505863626028, "grad_norm": 0.6870061755180359, "learning_rate": 1.271854793218813e-07, "loss": 0.013514403253793716, "memory(GiB)": 22.66, "step": 28679, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.9316830718253581, "grad_norm": 0.2663682699203491, "learning_rate": 1.2706512387573366e-07, "loss": 0.007528525311499834, "memory(GiB)": 22.66, "step": 28680, "token_acc": 1.0, "train_speed(iter/s)": 0.956411 }, { "epoch": 0.9317155572881136, "grad_norm": 0.31527405977249146, "learning_rate": 1.2694482466973969e-07, "loss": 0.00792014505714178, "memory(GiB)": 22.66, "step": 28681, "token_acc": 1.0, "train_speed(iter/s)": 0.956416 }, { "epoch": 0.931748042750869, "grad_norm": 0.40874719619750977, "learning_rate": 1.2682458170528877e-07, "loss": 0.008636100217700005, "memory(GiB)": 22.66, "step": 28682, "token_acc": 1.0, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.9317805282136244, "grad_norm": 0.43337973952293396, "learning_rate": 1.267043949837682e-07, "loss": 0.015429754741489887, "memory(GiB)": 22.66, "step": 28683, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956425 }, { "epoch": 0.9318130136763798, "grad_norm": 0.45545414090156555, "learning_rate": 1.265842645065657e-07, "loss": 0.01664840802550316, "memory(GiB)": 22.66, "step": 28684, "token_acc": 1.0, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.9318454991391353, "grad_norm": 0.4692830443382263, "learning_rate": 1.264641902750674e-07, "loss": 0.013874861411750317, "memory(GiB)": 22.66, "step": 28685, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.9318779846018906, "grad_norm": 0.3671194314956665, "learning_rate": 1.2634417229065832e-07, "loss": 0.014211796224117279, "memory(GiB)": 22.66, "step": 28686, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.9319104700646461, "grad_norm": 0.38313305377960205, "learning_rate": 1.2622421055472455e-07, "loss": 0.012042418122291565, "memory(GiB)": 22.66, "step": 28687, "token_acc": 1.0, "train_speed(iter/s)": 0.956444 }, { "epoch": 0.9319429555274015, "grad_norm": 0.3572779595851898, "learning_rate": 1.2610430506865056e-07, "loss": 0.009035776369273663, "memory(GiB)": 22.66, "step": 28688, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956449 }, { "epoch": 0.931975440990157, "grad_norm": 0.27373620867729187, "learning_rate": 1.2598445583382025e-07, "loss": 0.01218319684267044, "memory(GiB)": 22.66, "step": 28689, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956454 }, { "epoch": 0.9320079264529123, "grad_norm": 0.34591907262802124, "learning_rate": 1.258646628516158e-07, "loss": 0.01584884710609913, "memory(GiB)": 22.66, "step": 28690, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956459 }, { "epoch": 0.9320404119156678, "grad_norm": 0.3316612243652344, "learning_rate": 1.2574492612342116e-07, "loss": 0.009890655055642128, "memory(GiB)": 22.66, "step": 28691, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.9320728973784231, "grad_norm": 0.46288684010505676, "learning_rate": 1.256252456506174e-07, "loss": 0.010572018101811409, "memory(GiB)": 22.66, "step": 28692, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956468 }, { "epoch": 0.9321053828411786, "grad_norm": 0.3821541368961334, "learning_rate": 1.2550562143458678e-07, "loss": 0.01179420668631792, "memory(GiB)": 22.66, "step": 28693, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956473 }, { "epoch": 0.932137868303934, "grad_norm": 0.46075788140296936, "learning_rate": 1.2538605347670873e-07, "loss": 0.018684878945350647, "memory(GiB)": 22.66, "step": 28694, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.956477 }, { "epoch": 0.9321703537666894, "grad_norm": 0.2551915943622589, "learning_rate": 1.2526654177836384e-07, "loss": 0.008004972711205482, "memory(GiB)": 22.66, "step": 28695, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.9322028392294448, "grad_norm": 0.36004889011383057, "learning_rate": 1.2514708634093097e-07, "loss": 0.008350292220711708, "memory(GiB)": 22.66, "step": 28696, "token_acc": 1.0, "train_speed(iter/s)": 0.956487 }, { "epoch": 0.9322353246922003, "grad_norm": 0.38658034801483154, "learning_rate": 1.250276871657896e-07, "loss": 0.007376138586550951, "memory(GiB)": 22.66, "step": 28697, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.9322678101549556, "grad_norm": 0.39147520065307617, "learning_rate": 1.2490834425431698e-07, "loss": 0.008923057466745377, "memory(GiB)": 22.66, "step": 28698, "token_acc": 1.0, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.9323002956177111, "grad_norm": 0.3196757137775421, "learning_rate": 1.2478905760789084e-07, "loss": 0.012792734429240227, "memory(GiB)": 22.66, "step": 28699, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956502 }, { "epoch": 0.9323327810804665, "grad_norm": 0.33130720257759094, "learning_rate": 1.2466982722788789e-07, "loss": 0.007299119606614113, "memory(GiB)": 22.66, "step": 28700, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.9323652665432219, "grad_norm": 0.3157222270965576, "learning_rate": 1.245506531156837e-07, "loss": 0.013878227211534977, "memory(GiB)": 22.66, "step": 28701, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.9323977520059773, "grad_norm": 0.3207974433898926, "learning_rate": 1.244315352726555e-07, "loss": 0.007773444056510925, "memory(GiB)": 22.66, "step": 28702, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.9324302374687328, "grad_norm": 0.3295300602912903, "learning_rate": 1.2431247370017552e-07, "loss": 0.014442015439271927, "memory(GiB)": 22.66, "step": 28703, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.9324627229314881, "grad_norm": 0.29887980222702026, "learning_rate": 1.2419346839961988e-07, "loss": 0.009035486727952957, "memory(GiB)": 22.66, "step": 28704, "token_acc": 1.0, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.9324952083942436, "grad_norm": 0.28807082772254944, "learning_rate": 1.2407451937236083e-07, "loss": 0.010714937001466751, "memory(GiB)": 22.66, "step": 28705, "token_acc": 1.0, "train_speed(iter/s)": 0.956542 }, { "epoch": 0.932527693856999, "grad_norm": 0.5117172598838806, "learning_rate": 1.2395562661977167e-07, "loss": 0.02262917160987854, "memory(GiB)": 22.66, "step": 28706, "token_acc": 1.0, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.9325601793197544, "grad_norm": 0.24313046038150787, "learning_rate": 1.238367901432247e-07, "loss": 0.007293635979294777, "memory(GiB)": 22.66, "step": 28707, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.9325926647825098, "grad_norm": 0.2615516781806946, "learning_rate": 1.237180099440921e-07, "loss": 0.009074380621314049, "memory(GiB)": 22.66, "step": 28708, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.9326251502452653, "grad_norm": 0.20947766304016113, "learning_rate": 1.2359928602374284e-07, "loss": 0.006509609520435333, "memory(GiB)": 22.66, "step": 28709, "token_acc": 1.0, "train_speed(iter/s)": 0.956571 }, { "epoch": 0.9326576357080206, "grad_norm": 0.3627653121948242, "learning_rate": 1.2348061838354852e-07, "loss": 0.014868182130157948, "memory(GiB)": 22.66, "step": 28710, "token_acc": 1.0, "train_speed(iter/s)": 0.956579 }, { "epoch": 0.9326901211707761, "grad_norm": 0.2874446511268616, "learning_rate": 1.2336200702487867e-07, "loss": 0.0077795726247131824, "memory(GiB)": 22.66, "step": 28711, "token_acc": 1.0, "train_speed(iter/s)": 0.956586 }, { "epoch": 0.9327226066335315, "grad_norm": 0.2671692967414856, "learning_rate": 1.2324345194910215e-07, "loss": 0.008434567600488663, "memory(GiB)": 22.66, "step": 28712, "token_acc": 1.0, "train_speed(iter/s)": 0.956593 }, { "epoch": 0.9327550920962869, "grad_norm": 0.2429673671722412, "learning_rate": 1.2312495315758733e-07, "loss": 0.00747502688318491, "memory(GiB)": 22.66, "step": 28713, "token_acc": 0.995, "train_speed(iter/s)": 0.956599 }, { "epoch": 0.9327875775590423, "grad_norm": 0.368684858083725, "learning_rate": 1.2300651065170088e-07, "loss": 0.010851163417100906, "memory(GiB)": 22.66, "step": 28714, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.9328200630217978, "grad_norm": 0.2857230305671692, "learning_rate": 1.2288812443281119e-07, "loss": 0.013040627352893353, "memory(GiB)": 22.66, "step": 28715, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.9328525484845531, "grad_norm": 0.20373868942260742, "learning_rate": 1.2276979450228376e-07, "loss": 0.008975526317954063, "memory(GiB)": 22.66, "step": 28716, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95662 }, { "epoch": 0.9328850339473086, "grad_norm": 0.2621056139469147, "learning_rate": 1.226515208614848e-07, "loss": 0.0069257644936442375, "memory(GiB)": 22.66, "step": 28717, "token_acc": 1.0, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.932917519410064, "grad_norm": 0.3807247281074524, "learning_rate": 1.2253330351177873e-07, "loss": 0.01155146211385727, "memory(GiB)": 22.66, "step": 28718, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956633 }, { "epoch": 0.9329500048728194, "grad_norm": 0.3256508708000183, "learning_rate": 1.2241514245453002e-07, "loss": 0.013072367757558823, "memory(GiB)": 22.66, "step": 28719, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95664 }, { "epoch": 0.9329824903355748, "grad_norm": 0.3358555734157562, "learning_rate": 1.2229703769110257e-07, "loss": 0.010855883359909058, "memory(GiB)": 22.66, "step": 28720, "token_acc": 0.992, "train_speed(iter/s)": 0.956648 }, { "epoch": 0.9330149757983303, "grad_norm": 0.3609668016433716, "learning_rate": 1.221789892228603e-07, "loss": 0.011967858299612999, "memory(GiB)": 22.66, "step": 28721, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.956655 }, { "epoch": 0.9330474612610856, "grad_norm": 0.24449771642684937, "learning_rate": 1.2206099705116438e-07, "loss": 0.010926764458417892, "memory(GiB)": 22.66, "step": 28722, "token_acc": 0.988, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.9330799467238411, "grad_norm": 0.5164763331413269, "learning_rate": 1.2194306117737696e-07, "loss": 0.012546398676931858, "memory(GiB)": 22.66, "step": 28723, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956669 }, { "epoch": 0.9331124321865965, "grad_norm": 0.32881465554237366, "learning_rate": 1.2182518160285928e-07, "loss": 0.00925283320248127, "memory(GiB)": 22.66, "step": 28724, "token_acc": 1.0, "train_speed(iter/s)": 0.956676 }, { "epoch": 0.9331449176493519, "grad_norm": 0.2630230784416199, "learning_rate": 1.2170735832897184e-07, "loss": 0.01062887255102396, "memory(GiB)": 22.66, "step": 28725, "token_acc": 1.0, "train_speed(iter/s)": 0.956683 }, { "epoch": 0.9331774031121073, "grad_norm": 0.2732231020927429, "learning_rate": 1.215895913570747e-07, "loss": 0.0070500560104846954, "memory(GiB)": 22.66, "step": 28726, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95669 }, { "epoch": 0.9332098885748628, "grad_norm": 0.3015739619731903, "learning_rate": 1.2147188068852733e-07, "loss": 0.007133240345865488, "memory(GiB)": 22.66, "step": 28727, "token_acc": 1.0, "train_speed(iter/s)": 0.956697 }, { "epoch": 0.9332423740376181, "grad_norm": 0.2630874514579773, "learning_rate": 1.213542263246864e-07, "loss": 0.011228907853364944, "memory(GiB)": 22.66, "step": 28728, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956704 }, { "epoch": 0.9332748595003736, "grad_norm": 0.3218283951282501, "learning_rate": 1.2123662826691196e-07, "loss": 0.00971587747335434, "memory(GiB)": 22.66, "step": 28729, "token_acc": 1.0, "train_speed(iter/s)": 0.956711 }, { "epoch": 0.933307344963129, "grad_norm": 0.2179437279701233, "learning_rate": 1.2111908651656067e-07, "loss": 0.008438327349722385, "memory(GiB)": 22.66, "step": 28730, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956718 }, { "epoch": 0.9333398304258844, "grad_norm": 0.3731023669242859, "learning_rate": 1.2100160107498814e-07, "loss": 0.00927972886711359, "memory(GiB)": 22.66, "step": 28731, "token_acc": 1.0, "train_speed(iter/s)": 0.956725 }, { "epoch": 0.9333723158886398, "grad_norm": 0.38026055693626404, "learning_rate": 1.2088417194355216e-07, "loss": 0.0074999043717980385, "memory(GiB)": 22.66, "step": 28732, "token_acc": 1.0, "train_speed(iter/s)": 0.956732 }, { "epoch": 0.9334048013513953, "grad_norm": 0.3377625346183777, "learning_rate": 1.2076679912360556e-07, "loss": 0.011814475059509277, "memory(GiB)": 22.66, "step": 28733, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956739 }, { "epoch": 0.9334372868141506, "grad_norm": 0.35467666387557983, "learning_rate": 1.2064948261650556e-07, "loss": 0.009586092084646225, "memory(GiB)": 22.66, "step": 28734, "token_acc": 1.0, "train_speed(iter/s)": 0.956746 }, { "epoch": 0.9334697722769061, "grad_norm": 0.38856154680252075, "learning_rate": 1.205322224236044e-07, "loss": 0.010492494329810143, "memory(GiB)": 22.66, "step": 28735, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956753 }, { "epoch": 0.9335022577396616, "grad_norm": 0.41063809394836426, "learning_rate": 1.204150185462566e-07, "loss": 0.010903395712375641, "memory(GiB)": 22.66, "step": 28736, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95676 }, { "epoch": 0.9335347432024169, "grad_norm": 0.46433910727500916, "learning_rate": 1.2029787098581325e-07, "loss": 0.01076943427324295, "memory(GiB)": 22.66, "step": 28737, "token_acc": 0.9921875, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.9335672286651724, "grad_norm": 0.29790347814559937, "learning_rate": 1.2018077974362775e-07, "loss": 0.010860498994588852, "memory(GiB)": 22.66, "step": 28738, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956774 }, { "epoch": 0.9335997141279277, "grad_norm": 0.44072720408439636, "learning_rate": 1.2006374482105064e-07, "loss": 0.011415080167353153, "memory(GiB)": 22.66, "step": 28739, "token_acc": 1.0, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.9336321995906832, "grad_norm": 0.299630731344223, "learning_rate": 1.1994676621943369e-07, "loss": 0.007519339211285114, "memory(GiB)": 22.66, "step": 28740, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.9336646850534386, "grad_norm": 0.4347153306007385, "learning_rate": 1.198298439401263e-07, "loss": 0.015074169263243675, "memory(GiB)": 22.66, "step": 28741, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.933697170516194, "grad_norm": 0.4833689033985138, "learning_rate": 1.19712977984478e-07, "loss": 0.01178508810698986, "memory(GiB)": 22.66, "step": 28742, "token_acc": 1.0, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.9337296559789494, "grad_norm": 0.31700602173805237, "learning_rate": 1.1959616835383714e-07, "loss": 0.011178683489561081, "memory(GiB)": 22.66, "step": 28743, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956799 }, { "epoch": 0.9337621414417049, "grad_norm": 0.33071428537368774, "learning_rate": 1.1947941504955262e-07, "loss": 0.010756277479231358, "memory(GiB)": 22.66, "step": 28744, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.9337946269044602, "grad_norm": 0.19374561309814453, "learning_rate": 1.1936271807297172e-07, "loss": 0.010293148458003998, "memory(GiB)": 22.66, "step": 28745, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.9338271123672157, "grad_norm": 0.3556666076183319, "learning_rate": 1.1924607742544114e-07, "loss": 0.009391170926392078, "memory(GiB)": 22.66, "step": 28746, "token_acc": 1.0, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.9338595978299711, "grad_norm": 0.2835533618927002, "learning_rate": 1.1912949310830702e-07, "loss": 0.010199911892414093, "memory(GiB)": 22.66, "step": 28747, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.9338920832927265, "grad_norm": 0.22241748869419098, "learning_rate": 1.1901296512291494e-07, "loss": 0.0061624618247151375, "memory(GiB)": 22.66, "step": 28748, "token_acc": 1.0, "train_speed(iter/s)": 0.956822 }, { "epoch": 0.9339245687554819, "grad_norm": 0.6426481008529663, "learning_rate": 1.1889649347060994e-07, "loss": 0.009142103604972363, "memory(GiB)": 22.66, "step": 28749, "token_acc": 1.0, "train_speed(iter/s)": 0.956826 }, { "epoch": 0.9339570542182374, "grad_norm": 0.33196982741355896, "learning_rate": 1.1878007815273595e-07, "loss": 0.013849182054400444, "memory(GiB)": 22.66, "step": 28750, "token_acc": 1.0, "train_speed(iter/s)": 0.956831 }, { "epoch": 0.9339895396809927, "grad_norm": 0.24209602177143097, "learning_rate": 1.186637191706369e-07, "loss": 0.0092323524877429, "memory(GiB)": 22.66, "step": 28751, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956836 }, { "epoch": 0.9340220251437482, "grad_norm": 0.590204119682312, "learning_rate": 1.1854741652565561e-07, "loss": 0.012736944481730461, "memory(GiB)": 22.66, "step": 28752, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.9340545106065036, "grad_norm": 0.3711838722229004, "learning_rate": 1.1843117021913431e-07, "loss": 0.011003023013472557, "memory(GiB)": 22.66, "step": 28753, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956845 }, { "epoch": 0.934086996069259, "grad_norm": 0.4278026223182678, "learning_rate": 1.1831498025241472e-07, "loss": 0.009502747096121311, "memory(GiB)": 22.66, "step": 28754, "token_acc": 1.0, "train_speed(iter/s)": 0.95685 }, { "epoch": 0.9341194815320144, "grad_norm": 0.2656918466091156, "learning_rate": 1.18198846626838e-07, "loss": 0.007784221787005663, "memory(GiB)": 22.66, "step": 28755, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.9341519669947699, "grad_norm": 0.36072590947151184, "learning_rate": 1.1808276934374308e-07, "loss": 0.007478989660739899, "memory(GiB)": 22.66, "step": 28756, "token_acc": 1.0, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.9341844524575252, "grad_norm": 0.3755078911781311, "learning_rate": 1.179667484044722e-07, "loss": 0.013996539637446404, "memory(GiB)": 22.66, "step": 28757, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.9342169379202807, "grad_norm": 3.081186056137085, "learning_rate": 1.1785078381036208e-07, "loss": 0.014415962621569633, "memory(GiB)": 22.66, "step": 28758, "token_acc": 1.0, "train_speed(iter/s)": 0.956869 }, { "epoch": 0.9342494233830361, "grad_norm": 0.30955052375793457, "learning_rate": 1.1773487556275275e-07, "loss": 0.00945435930043459, "memory(GiB)": 22.66, "step": 28759, "token_acc": 1.0, "train_speed(iter/s)": 0.956874 }, { "epoch": 0.9342819088457915, "grad_norm": 0.34661349654197693, "learning_rate": 1.1761902366298095e-07, "loss": 0.009019879624247551, "memory(GiB)": 22.66, "step": 28760, "token_acc": 1.0, "train_speed(iter/s)": 0.956878 }, { "epoch": 0.9343143943085469, "grad_norm": 0.38330018520355225, "learning_rate": 1.1750322811238335e-07, "loss": 0.012342605739831924, "memory(GiB)": 22.66, "step": 28761, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.9343468797713024, "grad_norm": 0.3474545478820801, "learning_rate": 1.1738748891229835e-07, "loss": 0.010542918927967548, "memory(GiB)": 22.66, "step": 28762, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956889 }, { "epoch": 0.9343793652340577, "grad_norm": 0.24170315265655518, "learning_rate": 1.172718060640593e-07, "loss": 0.009653184562921524, "memory(GiB)": 22.66, "step": 28763, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.9344118506968132, "grad_norm": 0.21607309579849243, "learning_rate": 1.171561795690035e-07, "loss": 0.0044164410792291164, "memory(GiB)": 22.66, "step": 28764, "token_acc": 1.0, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.9344443361595686, "grad_norm": 0.3611374497413635, "learning_rate": 1.1704060942846429e-07, "loss": 0.014109659940004349, "memory(GiB)": 22.66, "step": 28765, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.956909 }, { "epoch": 0.934476821622324, "grad_norm": 0.4595281779766083, "learning_rate": 1.1692509564377508e-07, "loss": 0.015074714086949825, "memory(GiB)": 22.66, "step": 28766, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.9345093070850794, "grad_norm": 0.36197197437286377, "learning_rate": 1.1680963821626978e-07, "loss": 0.016426807269454002, "memory(GiB)": 22.66, "step": 28767, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.9345417925478349, "grad_norm": 0.3222101926803589, "learning_rate": 1.1669423714728123e-07, "loss": 0.009901161305606365, "memory(GiB)": 22.66, "step": 28768, "token_acc": 1.0, "train_speed(iter/s)": 0.95693 }, { "epoch": 0.9345742780105902, "grad_norm": 0.24561816453933716, "learning_rate": 1.1657889243814058e-07, "loss": 0.006321266293525696, "memory(GiB)": 22.66, "step": 28769, "token_acc": 1.0, "train_speed(iter/s)": 0.956937 }, { "epoch": 0.9346067634733457, "grad_norm": 0.37214529514312744, "learning_rate": 1.1646360409017899e-07, "loss": 0.014134593307971954, "memory(GiB)": 22.66, "step": 28770, "token_acc": 0.99, "train_speed(iter/s)": 0.956944 }, { "epoch": 0.9346392489361011, "grad_norm": 0.3816022574901581, "learning_rate": 1.1634837210472761e-07, "loss": 0.015410000458359718, "memory(GiB)": 22.66, "step": 28771, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.9346717343988565, "grad_norm": 0.2951173186302185, "learning_rate": 1.1623319648311648e-07, "loss": 0.006709532346576452, "memory(GiB)": 22.66, "step": 28772, "token_acc": 1.0, "train_speed(iter/s)": 0.956958 }, { "epoch": 0.9347042198616119, "grad_norm": 0.41701406240463257, "learning_rate": 1.16118077226674e-07, "loss": 0.011391237378120422, "memory(GiB)": 22.66, "step": 28773, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956965 }, { "epoch": 0.9347367053243674, "grad_norm": 0.3202686309814453, "learning_rate": 1.1600301433672967e-07, "loss": 0.010248512029647827, "memory(GiB)": 22.66, "step": 28774, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956973 }, { "epoch": 0.9347691907871227, "grad_norm": 0.25682365894317627, "learning_rate": 1.1588800781461073e-07, "loss": 0.00955953635275364, "memory(GiB)": 22.66, "step": 28775, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95698 }, { "epoch": 0.9348016762498782, "grad_norm": 0.3282513916492462, "learning_rate": 1.1577305766164504e-07, "loss": 0.013112431392073631, "memory(GiB)": 22.66, "step": 28776, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956987 }, { "epoch": 0.9348341617126336, "grad_norm": 0.2058895230293274, "learning_rate": 1.1565816387915985e-07, "loss": 0.006745701655745506, "memory(GiB)": 22.66, "step": 28777, "token_acc": 1.0, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.934866647175389, "grad_norm": 0.21108649671077728, "learning_rate": 1.1554332646847965e-07, "loss": 0.004038523882627487, "memory(GiB)": 22.66, "step": 28778, "token_acc": 1.0, "train_speed(iter/s)": 0.957001 }, { "epoch": 0.9348991326381444, "grad_norm": 0.2664460241794586, "learning_rate": 1.1542854543093119e-07, "loss": 0.010616648942232132, "memory(GiB)": 22.66, "step": 28779, "token_acc": 0.9966101694915255, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.9349316181008999, "grad_norm": 0.2531157433986664, "learning_rate": 1.1531382076783836e-07, "loss": 0.006399193778634071, "memory(GiB)": 22.66, "step": 28780, "token_acc": 1.0, "train_speed(iter/s)": 0.957015 }, { "epoch": 0.9349641035636552, "grad_norm": 0.4069998562335968, "learning_rate": 1.1519915248052627e-07, "loss": 0.015827005729079247, "memory(GiB)": 22.66, "step": 28781, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957023 }, { "epoch": 0.9349965890264107, "grad_norm": 0.31703296303749084, "learning_rate": 1.1508454057031659e-07, "loss": 0.013190858066082, "memory(GiB)": 22.66, "step": 28782, "token_acc": 1.0, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.9350290744891661, "grad_norm": 0.26894402503967285, "learning_rate": 1.1496998503853385e-07, "loss": 0.009338947013020515, "memory(GiB)": 22.66, "step": 28783, "token_acc": 1.0, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.9350615599519215, "grad_norm": 0.26138123869895935, "learning_rate": 1.1485548588649864e-07, "loss": 0.010975442826747894, "memory(GiB)": 22.66, "step": 28784, "token_acc": 1.0, "train_speed(iter/s)": 0.957043 }, { "epoch": 0.9350940454146769, "grad_norm": 0.5522728562355042, "learning_rate": 1.1474104311553436e-07, "loss": 0.008527770638465881, "memory(GiB)": 22.66, "step": 28785, "token_acc": 1.0, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.9351265308774324, "grad_norm": 0.5005068182945251, "learning_rate": 1.1462665672695993e-07, "loss": 0.014012258499860764, "memory(GiB)": 22.66, "step": 28786, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.957057 }, { "epoch": 0.9351590163401877, "grad_norm": 0.3417256474494934, "learning_rate": 1.1451232672209656e-07, "loss": 0.007453794125467539, "memory(GiB)": 22.66, "step": 28787, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957064 }, { "epoch": 0.9351915018029432, "grad_norm": 0.31269723176956177, "learning_rate": 1.1439805310226314e-07, "loss": 0.008409231901168823, "memory(GiB)": 22.66, "step": 28788, "token_acc": 1.0, "train_speed(iter/s)": 0.95707 }, { "epoch": 0.9352239872656986, "grad_norm": 0.7029168009757996, "learning_rate": 1.1428383586877922e-07, "loss": 0.015908725559711456, "memory(GiB)": 22.66, "step": 28789, "token_acc": 1.0, "train_speed(iter/s)": 0.957077 }, { "epoch": 0.935256472728454, "grad_norm": 0.3000878393650055, "learning_rate": 1.1416967502296261e-07, "loss": 0.010604927316308022, "memory(GiB)": 22.66, "step": 28790, "token_acc": 0.9868421052631579, "train_speed(iter/s)": 0.957084 }, { "epoch": 0.9352889581912094, "grad_norm": 0.4340772330760956, "learning_rate": 1.1405557056613059e-07, "loss": 0.018258729949593544, "memory(GiB)": 22.66, "step": 28791, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.957091 }, { "epoch": 0.9353214436539649, "grad_norm": 0.5302206873893738, "learning_rate": 1.1394152249960155e-07, "loss": 0.014786476269364357, "memory(GiB)": 22.66, "step": 28792, "token_acc": 1.0, "train_speed(iter/s)": 0.957098 }, { "epoch": 0.9353539291167202, "grad_norm": 0.3848631978034973, "learning_rate": 1.1382753082469e-07, "loss": 0.012938357889652252, "memory(GiB)": 22.66, "step": 28793, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957106 }, { "epoch": 0.9353864145794757, "grad_norm": 0.45833995938301086, "learning_rate": 1.1371359554271211e-07, "loss": 0.01608085446059704, "memory(GiB)": 22.66, "step": 28794, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957112 }, { "epoch": 0.935418900042231, "grad_norm": 0.3262222707271576, "learning_rate": 1.1359971665498293e-07, "loss": 0.014120545238256454, "memory(GiB)": 22.66, "step": 28795, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957119 }, { "epoch": 0.9354513855049865, "grad_norm": 0.3762470781803131, "learning_rate": 1.1348589416281752e-07, "loss": 0.012692316435277462, "memory(GiB)": 22.66, "step": 28796, "token_acc": 1.0, "train_speed(iter/s)": 0.957126 }, { "epoch": 0.9354838709677419, "grad_norm": 0.3407544195652008, "learning_rate": 1.1337212806752762e-07, "loss": 0.010075820609927177, "memory(GiB)": 22.66, "step": 28797, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.9355163564304974, "grad_norm": 0.37571555376052856, "learning_rate": 1.1325841837042828e-07, "loss": 0.014259319752454758, "memory(GiB)": 22.66, "step": 28798, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.9355488418932527, "grad_norm": 0.9262025356292725, "learning_rate": 1.1314476507283068e-07, "loss": 0.01444518193602562, "memory(GiB)": 22.66, "step": 28799, "token_acc": 1.0, "train_speed(iter/s)": 0.957145 }, { "epoch": 0.9355813273560082, "grad_norm": 0.3202968239784241, "learning_rate": 1.1303116817604765e-07, "loss": 0.00895543023943901, "memory(GiB)": 22.66, "step": 28800, "token_acc": 1.0, "train_speed(iter/s)": 0.95715 }, { "epoch": 0.9356138128187637, "grad_norm": 0.25587141513824463, "learning_rate": 1.129176276813887e-07, "loss": 0.009477348066866398, "memory(GiB)": 22.66, "step": 28801, "token_acc": 1.0, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.935646298281519, "grad_norm": 0.3115138113498688, "learning_rate": 1.12804143590165e-07, "loss": 0.010490698739886284, "memory(GiB)": 22.66, "step": 28802, "token_acc": 1.0, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.9356787837442745, "grad_norm": 0.28137004375457764, "learning_rate": 1.1269071590368663e-07, "loss": 0.009684177115559578, "memory(GiB)": 22.66, "step": 28803, "token_acc": 0.9903846153846154, "train_speed(iter/s)": 0.957166 }, { "epoch": 0.9357112692070298, "grad_norm": 0.37658727169036865, "learning_rate": 1.1257734462326253e-07, "loss": 0.010464332066476345, "memory(GiB)": 22.66, "step": 28804, "token_acc": 1.0, "train_speed(iter/s)": 0.957171 }, { "epoch": 0.9357437546697853, "grad_norm": 0.666428804397583, "learning_rate": 1.1246402975020054e-07, "loss": 0.009628446772694588, "memory(GiB)": 22.66, "step": 28805, "token_acc": 0.9900662251655629, "train_speed(iter/s)": 0.957176 }, { "epoch": 0.9357762401325407, "grad_norm": 0.46135222911834717, "learning_rate": 1.1235077128580907e-07, "loss": 0.011877518147230148, "memory(GiB)": 22.66, "step": 28806, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.9358087255952962, "grad_norm": 0.35375234484672546, "learning_rate": 1.1223756923139484e-07, "loss": 0.008591856807470322, "memory(GiB)": 22.66, "step": 28807, "token_acc": 1.0, "train_speed(iter/s)": 0.957185 }, { "epoch": 0.9358412110580515, "grad_norm": 0.25873398780822754, "learning_rate": 1.1212442358826514e-07, "loss": 0.00683039054274559, "memory(GiB)": 22.66, "step": 28808, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.95719 }, { "epoch": 0.935873696520807, "grad_norm": 0.311087429523468, "learning_rate": 1.120113343577256e-07, "loss": 0.006689545698463917, "memory(GiB)": 22.66, "step": 28809, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.9359061819835623, "grad_norm": 0.3120036721229553, "learning_rate": 1.1189830154108072e-07, "loss": 0.012572390958666801, "memory(GiB)": 22.66, "step": 28810, "token_acc": 0.9877551020408163, "train_speed(iter/s)": 0.957199 }, { "epoch": 0.9359386674463178, "grad_norm": 0.37279099225997925, "learning_rate": 1.1178532513963614e-07, "loss": 0.011324923485517502, "memory(GiB)": 22.66, "step": 28811, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957205 }, { "epoch": 0.9359711529090732, "grad_norm": 0.24004150927066803, "learning_rate": 1.1167240515469413e-07, "loss": 0.008529005572199821, "memory(GiB)": 22.66, "step": 28812, "token_acc": 0.9941860465116279, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.9360036383718287, "grad_norm": 0.31836071610450745, "learning_rate": 1.1155954158755922e-07, "loss": 0.012167107313871384, "memory(GiB)": 22.66, "step": 28813, "token_acc": 1.0, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.936036123834584, "grad_norm": 0.3669424057006836, "learning_rate": 1.1144673443953424e-07, "loss": 0.01399723719805479, "memory(GiB)": 22.66, "step": 28814, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957219 }, { "epoch": 0.9360686092973395, "grad_norm": 0.34663453698158264, "learning_rate": 1.1133398371192039e-07, "loss": 0.010211285203695297, "memory(GiB)": 22.66, "step": 28815, "token_acc": 0.9849056603773585, "train_speed(iter/s)": 0.957224 }, { "epoch": 0.9361010947600948, "grad_norm": 0.35252586007118225, "learning_rate": 1.1122128940601884e-07, "loss": 0.009644078090786934, "memory(GiB)": 22.66, "step": 28816, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.9361335802228503, "grad_norm": 0.35021302103996277, "learning_rate": 1.1110865152313022e-07, "loss": 0.01043334137648344, "memory(GiB)": 22.66, "step": 28817, "token_acc": 1.0, "train_speed(iter/s)": 0.957233 }, { "epoch": 0.9361660656856057, "grad_norm": 0.6772491931915283, "learning_rate": 1.109960700645557e-07, "loss": 0.010219021700322628, "memory(GiB)": 22.66, "step": 28818, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957238 }, { "epoch": 0.9361985511483611, "grad_norm": 0.22679707407951355, "learning_rate": 1.1088354503159371e-07, "loss": 0.008290184661746025, "memory(GiB)": 22.66, "step": 28819, "token_acc": 1.0, "train_speed(iter/s)": 0.957244 }, { "epoch": 0.9362310366111165, "grad_norm": 0.3345344662666321, "learning_rate": 1.1077107642554263e-07, "loss": 0.008127509616315365, "memory(GiB)": 22.66, "step": 28820, "token_acc": 1.0, "train_speed(iter/s)": 0.957249 }, { "epoch": 0.936263522073872, "grad_norm": 0.716859757900238, "learning_rate": 1.1065866424770033e-07, "loss": 0.017538536339998245, "memory(GiB)": 22.66, "step": 28821, "token_acc": 0.995, "train_speed(iter/s)": 0.957254 }, { "epoch": 0.9362960075366273, "grad_norm": 0.4205721318721771, "learning_rate": 1.1054630849936577e-07, "loss": 0.011347385123372078, "memory(GiB)": 22.66, "step": 28822, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.9363284929993828, "grad_norm": 0.4310600161552429, "learning_rate": 1.10434009181834e-07, "loss": 0.009098190814256668, "memory(GiB)": 22.66, "step": 28823, "token_acc": 1.0, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.9363609784621382, "grad_norm": 0.32069236040115356, "learning_rate": 1.1032176629640234e-07, "loss": 0.011512914672493935, "memory(GiB)": 22.66, "step": 28824, "token_acc": 1.0, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.9363934639248936, "grad_norm": 0.4469776153564453, "learning_rate": 1.1020957984436531e-07, "loss": 0.014353018254041672, "memory(GiB)": 22.66, "step": 28825, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.936425949387649, "grad_norm": 0.43963825702667236, "learning_rate": 1.1009744982701798e-07, "loss": 0.01028912328183651, "memory(GiB)": 22.66, "step": 28826, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.9364584348504045, "grad_norm": 0.25061094760894775, "learning_rate": 1.099853762456543e-07, "loss": 0.009521694853901863, "memory(GiB)": 22.66, "step": 28827, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957288 }, { "epoch": 0.9364909203131598, "grad_norm": 0.4057862460613251, "learning_rate": 1.0987335910156826e-07, "loss": 0.012984571047127247, "memory(GiB)": 22.66, "step": 28828, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957295 }, { "epoch": 0.9365234057759153, "grad_norm": 0.26795077323913574, "learning_rate": 1.0976139839605271e-07, "loss": 0.00952948909252882, "memory(GiB)": 22.66, "step": 28829, "token_acc": 1.0, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.9365558912386707, "grad_norm": 0.30474087595939636, "learning_rate": 1.0964949413039883e-07, "loss": 0.010498973540961742, "memory(GiB)": 22.66, "step": 28830, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.9365883767014261, "grad_norm": 0.7365502715110779, "learning_rate": 1.0953764630589892e-07, "loss": 0.010253231972455978, "memory(GiB)": 22.66, "step": 28831, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957316 }, { "epoch": 0.9366208621641815, "grad_norm": 0.4439665377140045, "learning_rate": 1.0942585492384472e-07, "loss": 0.011312317103147507, "memory(GiB)": 22.66, "step": 28832, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.936653347626937, "grad_norm": 0.3020007014274597, "learning_rate": 1.0931411998552466e-07, "loss": 0.01047097984701395, "memory(GiB)": 22.66, "step": 28833, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.95733 }, { "epoch": 0.9366858330896923, "grad_norm": 0.22685576975345612, "learning_rate": 1.0920244149222936e-07, "loss": 0.008396714925765991, "memory(GiB)": 22.66, "step": 28834, "token_acc": 1.0, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.9367183185524478, "grad_norm": 0.29282885789871216, "learning_rate": 1.0909081944524725e-07, "loss": 0.0077527668327093124, "memory(GiB)": 22.66, "step": 28835, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.9367508040152032, "grad_norm": 0.33004310727119446, "learning_rate": 1.0897925384586727e-07, "loss": 0.00882743950933218, "memory(GiB)": 22.66, "step": 28836, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957351 }, { "epoch": 0.9367832894779586, "grad_norm": 0.44957229495048523, "learning_rate": 1.088677446953762e-07, "loss": 0.009937172755599022, "memory(GiB)": 22.66, "step": 28837, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957358 }, { "epoch": 0.936815774940714, "grad_norm": 0.3398285210132599, "learning_rate": 1.0875629199506188e-07, "loss": 0.013954594731330872, "memory(GiB)": 22.66, "step": 28838, "token_acc": 1.0, "train_speed(iter/s)": 0.957365 }, { "epoch": 0.9368482604034695, "grad_norm": 0.5957232713699341, "learning_rate": 1.0864489574620996e-07, "loss": 0.015304304659366608, "memory(GiB)": 22.66, "step": 28839, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957372 }, { "epoch": 0.9368807458662248, "grad_norm": 0.32967519760131836, "learning_rate": 1.0853355595010607e-07, "loss": 0.007592167239636183, "memory(GiB)": 22.66, "step": 28840, "token_acc": 1.0, "train_speed(iter/s)": 0.957379 }, { "epoch": 0.9369132313289803, "grad_norm": 0.29515770077705383, "learning_rate": 1.0842227260803584e-07, "loss": 0.013764861971139908, "memory(GiB)": 22.66, "step": 28841, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.9369457167917357, "grad_norm": 0.39598318934440613, "learning_rate": 1.0831104572128326e-07, "loss": 0.01111440546810627, "memory(GiB)": 22.66, "step": 28842, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957393 }, { "epoch": 0.9369782022544911, "grad_norm": 0.35430505871772766, "learning_rate": 1.0819987529113229e-07, "loss": 0.010694622993469238, "memory(GiB)": 22.66, "step": 28843, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.9370106877172465, "grad_norm": 0.27965837717056274, "learning_rate": 1.0808876131886525e-07, "loss": 0.008701132610440254, "memory(GiB)": 22.66, "step": 28844, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.937043173180002, "grad_norm": 0.29735633730888367, "learning_rate": 1.07977703805765e-07, "loss": 0.008962885476648808, "memory(GiB)": 22.66, "step": 28845, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.9370756586427573, "grad_norm": 0.4459228813648224, "learning_rate": 1.0786670275311329e-07, "loss": 0.015795940533280373, "memory(GiB)": 22.66, "step": 28846, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.9371081441055128, "grad_norm": 0.2633180320262909, "learning_rate": 1.0775575816219187e-07, "loss": 0.007664923556149006, "memory(GiB)": 22.66, "step": 28847, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.957429 }, { "epoch": 0.9371406295682682, "grad_norm": 0.3078470230102539, "learning_rate": 1.0764487003428026e-07, "loss": 0.008773917332291603, "memory(GiB)": 22.66, "step": 28848, "token_acc": 1.0, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.9371731150310236, "grad_norm": 0.3117506206035614, "learning_rate": 1.0753403837065801e-07, "loss": 0.014449835754930973, "memory(GiB)": 22.66, "step": 28849, "token_acc": 1.0, "train_speed(iter/s)": 0.957443 }, { "epoch": 0.937205600493779, "grad_norm": 0.27879345417022705, "learning_rate": 1.0742326317260521e-07, "loss": 0.006385771557688713, "memory(GiB)": 22.66, "step": 28850, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95745 }, { "epoch": 0.9372380859565345, "grad_norm": 0.3422192633152008, "learning_rate": 1.0731254444140027e-07, "loss": 0.009349875152111053, "memory(GiB)": 22.66, "step": 28851, "token_acc": 0.9932432432432432, "train_speed(iter/s)": 0.957457 }, { "epoch": 0.9372705714192898, "grad_norm": 0.2750528156757355, "learning_rate": 1.072018821783205e-07, "loss": 0.007304200902581215, "memory(GiB)": 22.66, "step": 28852, "token_acc": 1.0, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.9373030568820453, "grad_norm": 0.3305946886539459, "learning_rate": 1.0709127638464268e-07, "loss": 0.014062692411243916, "memory(GiB)": 22.66, "step": 28853, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.9373355423448007, "grad_norm": 0.287496954202652, "learning_rate": 1.0698072706164464e-07, "loss": 0.0058350395411252975, "memory(GiB)": 22.66, "step": 28854, "token_acc": 1.0, "train_speed(iter/s)": 0.957478 }, { "epoch": 0.9373680278075561, "grad_norm": 0.8268285989761353, "learning_rate": 1.068702342106015e-07, "loss": 0.012332217767834663, "memory(GiB)": 22.66, "step": 28855, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.957484 }, { "epoch": 0.9374005132703115, "grad_norm": 0.3669736385345459, "learning_rate": 1.0675979783278889e-07, "loss": 0.014983197674155235, "memory(GiB)": 22.66, "step": 28856, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.957491 }, { "epoch": 0.937432998733067, "grad_norm": 0.3228449821472168, "learning_rate": 1.066494179294808e-07, "loss": 0.010212564840912819, "memory(GiB)": 22.66, "step": 28857, "token_acc": 0.9811320754716981, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.9374654841958223, "grad_norm": 0.4513106942176819, "learning_rate": 1.0653909450195122e-07, "loss": 0.011561492457985878, "memory(GiB)": 22.66, "step": 28858, "token_acc": 1.0, "train_speed(iter/s)": 0.957505 }, { "epoch": 0.9374979696585778, "grad_norm": 0.26093974709510803, "learning_rate": 1.0642882755147355e-07, "loss": 0.00935374479740858, "memory(GiB)": 22.66, "step": 28859, "token_acc": 1.0, "train_speed(iter/s)": 0.95751 }, { "epoch": 0.9375304551213332, "grad_norm": 0.4043012261390686, "learning_rate": 1.0631861707932124e-07, "loss": 0.010226194746792316, "memory(GiB)": 22.66, "step": 28860, "token_acc": 1.0, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.9375629405840886, "grad_norm": 0.21008454263210297, "learning_rate": 1.0620846308676547e-07, "loss": 0.007569807581603527, "memory(GiB)": 22.66, "step": 28861, "token_acc": 1.0, "train_speed(iter/s)": 0.957521 }, { "epoch": 0.937595426046844, "grad_norm": 0.4194948673248291, "learning_rate": 1.0609836557507747e-07, "loss": 0.010735406540334225, "memory(GiB)": 22.66, "step": 28862, "token_acc": 1.0, "train_speed(iter/s)": 0.957526 }, { "epoch": 0.9376279115095995, "grad_norm": 0.363035649061203, "learning_rate": 1.0598832454552733e-07, "loss": 0.013249441049993038, "memory(GiB)": 22.66, "step": 28863, "token_acc": 0.9850187265917603, "train_speed(iter/s)": 0.957532 }, { "epoch": 0.9376603969723549, "grad_norm": 0.3167443573474884, "learning_rate": 1.0587833999938735e-07, "loss": 0.010533051565289497, "memory(GiB)": 22.66, "step": 28864, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957537 }, { "epoch": 0.9376928824351103, "grad_norm": 0.4126091003417969, "learning_rate": 1.0576841193792431e-07, "loss": 0.01567188650369644, "memory(GiB)": 22.66, "step": 28865, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.9377253678978658, "grad_norm": 0.3319317102432251, "learning_rate": 1.0565854036240886e-07, "loss": 0.01131446287035942, "memory(GiB)": 22.66, "step": 28866, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957547 }, { "epoch": 0.9377578533606211, "grad_norm": 0.4571932256221771, "learning_rate": 1.055487252741072e-07, "loss": 0.01312185823917389, "memory(GiB)": 22.66, "step": 28867, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.957552 }, { "epoch": 0.9377903388233766, "grad_norm": 0.40348511934280396, "learning_rate": 1.0543896667428777e-07, "loss": 0.01555495522916317, "memory(GiB)": 22.66, "step": 28868, "token_acc": 0.988, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.937822824286132, "grad_norm": 0.2824867367744446, "learning_rate": 1.0532926456421788e-07, "loss": 0.006946545094251633, "memory(GiB)": 22.66, "step": 28869, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957562 }, { "epoch": 0.9378553097488874, "grad_norm": 0.20570296049118042, "learning_rate": 1.0521961894516264e-07, "loss": 0.0067529696971178055, "memory(GiB)": 22.66, "step": 28870, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957566 }, { "epoch": 0.9378877952116428, "grad_norm": 0.25341156125068665, "learning_rate": 1.051100298183888e-07, "loss": 0.011485358700156212, "memory(GiB)": 22.66, "step": 28871, "token_acc": 1.0, "train_speed(iter/s)": 0.95757 }, { "epoch": 0.9379202806743983, "grad_norm": 0.42272913455963135, "learning_rate": 1.0500049718515982e-07, "loss": 0.01807268336415291, "memory(GiB)": 22.66, "step": 28872, "token_acc": 0.992, "train_speed(iter/s)": 0.957575 }, { "epoch": 0.9379527661371536, "grad_norm": 0.27943918108940125, "learning_rate": 1.0489102104673965e-07, "loss": 0.010687116533517838, "memory(GiB)": 22.66, "step": 28873, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.95758 }, { "epoch": 0.9379852515999091, "grad_norm": 0.43337589502334595, "learning_rate": 1.0478160140439342e-07, "loss": 0.018349818885326385, "memory(GiB)": 22.66, "step": 28874, "token_acc": 0.988929889298893, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.9380177370626644, "grad_norm": 0.4680449962615967, "learning_rate": 1.0467223825938289e-07, "loss": 0.01631404459476471, "memory(GiB)": 22.66, "step": 28875, "token_acc": 1.0, "train_speed(iter/s)": 0.957589 }, { "epoch": 0.9380502225254199, "grad_norm": 0.5199422240257263, "learning_rate": 1.0456293161296981e-07, "loss": 0.013595848344266415, "memory(GiB)": 22.66, "step": 28876, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957594 }, { "epoch": 0.9380827079881753, "grad_norm": 0.3581886887550354, "learning_rate": 1.0445368146641655e-07, "loss": 0.011062823235988617, "memory(GiB)": 22.66, "step": 28877, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.957599 }, { "epoch": 0.9381151934509308, "grad_norm": 0.3252236247062683, "learning_rate": 1.0434448782098372e-07, "loss": 0.010507189668715, "memory(GiB)": 22.66, "step": 28878, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.9381476789136861, "grad_norm": 0.42125800251960754, "learning_rate": 1.04235350677932e-07, "loss": 0.013418857008218765, "memory(GiB)": 22.66, "step": 28879, "token_acc": 0.9927007299270073, "train_speed(iter/s)": 0.957609 }, { "epoch": 0.9381801643764416, "grad_norm": 0.3030972480773926, "learning_rate": 1.0412627003852039e-07, "loss": 0.01235734298825264, "memory(GiB)": 22.66, "step": 28880, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.9382126498391969, "grad_norm": 0.3325293958187103, "learning_rate": 1.0401724590400786e-07, "loss": 0.006666318979114294, "memory(GiB)": 22.66, "step": 28881, "token_acc": 1.0, "train_speed(iter/s)": 0.957621 }, { "epoch": 0.9382451353019524, "grad_norm": 0.23293402791023254, "learning_rate": 1.0390827827565287e-07, "loss": 0.007506258320063353, "memory(GiB)": 22.66, "step": 28882, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957626 }, { "epoch": 0.9382776207647078, "grad_norm": 0.4099006652832031, "learning_rate": 1.0379936715471328e-07, "loss": 0.013248689472675323, "memory(GiB)": 22.66, "step": 28883, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.957632 }, { "epoch": 0.9383101062274632, "grad_norm": 0.3890896737575531, "learning_rate": 1.0369051254244588e-07, "loss": 0.011509748175740242, "memory(GiB)": 22.66, "step": 28884, "token_acc": 0.995, "train_speed(iter/s)": 0.957637 }, { "epoch": 0.9383425916902186, "grad_norm": 0.454139769077301, "learning_rate": 1.0358171444010634e-07, "loss": 0.01580466702580452, "memory(GiB)": 22.66, "step": 28885, "token_acc": 0.9940476190476191, "train_speed(iter/s)": 0.957642 }, { "epoch": 0.9383750771529741, "grad_norm": 0.2839096486568451, "learning_rate": 1.0347297284895142e-07, "loss": 0.007959656417369843, "memory(GiB)": 22.66, "step": 28886, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957648 }, { "epoch": 0.9384075626157294, "grad_norm": 0.19085149466991425, "learning_rate": 1.0336428777023511e-07, "loss": 0.005075057968497276, "memory(GiB)": 22.66, "step": 28887, "token_acc": 1.0, "train_speed(iter/s)": 0.957653 }, { "epoch": 0.9384400480784849, "grad_norm": 0.3510003387928009, "learning_rate": 1.0325565920521308e-07, "loss": 0.008426926098763943, "memory(GiB)": 22.66, "step": 28888, "token_acc": 1.0, "train_speed(iter/s)": 0.957659 }, { "epoch": 0.9384725335412403, "grad_norm": 0.3163674771785736, "learning_rate": 1.0314708715513822e-07, "loss": 0.012041732668876648, "memory(GiB)": 22.66, "step": 28889, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957664 }, { "epoch": 0.9385050190039957, "grad_norm": 0.19273413717746735, "learning_rate": 1.030385716212634e-07, "loss": 0.006946219131350517, "memory(GiB)": 22.66, "step": 28890, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.957671 }, { "epoch": 0.9385375044667511, "grad_norm": 0.39838510751724243, "learning_rate": 1.0293011260484098e-07, "loss": 0.012813152745366096, "memory(GiB)": 22.66, "step": 28891, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957678 }, { "epoch": 0.9385699899295066, "grad_norm": 0.3301692605018616, "learning_rate": 1.0282171010712383e-07, "loss": 0.012775005772709846, "memory(GiB)": 22.66, "step": 28892, "token_acc": 1.0, "train_speed(iter/s)": 0.957685 }, { "epoch": 0.9386024753922619, "grad_norm": 0.35910534858703613, "learning_rate": 1.027133641293615e-07, "loss": 0.010091852396726608, "memory(GiB)": 22.66, "step": 28893, "token_acc": 0.9965870307167235, "train_speed(iter/s)": 0.957692 }, { "epoch": 0.9386349608550174, "grad_norm": 0.36474135518074036, "learning_rate": 1.0260507467280633e-07, "loss": 0.010937062092125416, "memory(GiB)": 22.66, "step": 28894, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.957699 }, { "epoch": 0.9386674463177728, "grad_norm": 0.37251558899879456, "learning_rate": 1.0249684173870567e-07, "loss": 0.01068770233541727, "memory(GiB)": 22.66, "step": 28895, "token_acc": 1.0, "train_speed(iter/s)": 0.957706 }, { "epoch": 0.9386999317805282, "grad_norm": 0.2842774987220764, "learning_rate": 1.023886653283107e-07, "loss": 0.008317476138472557, "memory(GiB)": 22.66, "step": 28896, "token_acc": 0.9897610921501706, "train_speed(iter/s)": 0.957713 }, { "epoch": 0.9387324172432836, "grad_norm": 0.3598170280456543, "learning_rate": 1.0228054544286881e-07, "loss": 0.011744377203285694, "memory(GiB)": 22.66, "step": 28897, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.95772 }, { "epoch": 0.9387649027060391, "grad_norm": 0.30704471468925476, "learning_rate": 1.0217248208362895e-07, "loss": 0.01258244551718235, "memory(GiB)": 22.66, "step": 28898, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957727 }, { "epoch": 0.9387973881687944, "grad_norm": 0.3123703896999359, "learning_rate": 1.0206447525183738e-07, "loss": 0.007001995109021664, "memory(GiB)": 22.66, "step": 28899, "token_acc": 1.0, "train_speed(iter/s)": 0.957734 }, { "epoch": 0.9388298736315499, "grad_norm": 0.23679694533348083, "learning_rate": 1.0195652494874086e-07, "loss": 0.006577400490641594, "memory(GiB)": 22.66, "step": 28900, "token_acc": 1.0, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.9388623590943053, "grad_norm": 1.149584174156189, "learning_rate": 1.0184863117558507e-07, "loss": 0.011277103796601295, "memory(GiB)": 22.66, "step": 28901, "token_acc": 1.0, "train_speed(iter/s)": 0.957748 }, { "epoch": 0.9388948445570607, "grad_norm": 0.36580538749694824, "learning_rate": 1.0174079393361569e-07, "loss": 0.009729194454848766, "memory(GiB)": 22.66, "step": 28902, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957755 }, { "epoch": 0.9389273300198161, "grad_norm": 0.6459742188453674, "learning_rate": 1.0163301322407726e-07, "loss": 0.02356785535812378, "memory(GiB)": 22.66, "step": 28903, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957762 }, { "epoch": 0.9389598154825716, "grad_norm": 0.35023343563079834, "learning_rate": 1.0152528904821324e-07, "loss": 0.010010234080255032, "memory(GiB)": 22.66, "step": 28904, "token_acc": 1.0, "train_speed(iter/s)": 0.957769 }, { "epoch": 0.9389923009453269, "grad_norm": 0.38454893231391907, "learning_rate": 1.0141762140726763e-07, "loss": 0.009801715612411499, "memory(GiB)": 22.66, "step": 28905, "token_acc": 1.0, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.9390247864080824, "grad_norm": 0.4084140360355377, "learning_rate": 1.0131001030248222e-07, "loss": 0.009547460824251175, "memory(GiB)": 22.66, "step": 28906, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.9390572718708378, "grad_norm": 0.317098468542099, "learning_rate": 1.012024557350999e-07, "loss": 0.010426477529108524, "memory(GiB)": 22.66, "step": 28907, "token_acc": 1.0, "train_speed(iter/s)": 0.957791 }, { "epoch": 0.9390897573335932, "grad_norm": 0.2827926278114319, "learning_rate": 1.0109495770636135e-07, "loss": 0.008212309330701828, "memory(GiB)": 22.66, "step": 28908, "token_acc": 0.9963768115942029, "train_speed(iter/s)": 0.957798 }, { "epoch": 0.9391222427963486, "grad_norm": 0.34956812858581543, "learning_rate": 1.0098751621750724e-07, "loss": 0.00795400608330965, "memory(GiB)": 22.66, "step": 28909, "token_acc": 1.0, "train_speed(iter/s)": 0.957805 }, { "epoch": 0.9391547282591041, "grad_norm": 0.29319408535957336, "learning_rate": 1.0088013126977769e-07, "loss": 0.005249332636594772, "memory(GiB)": 22.66, "step": 28910, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957812 }, { "epoch": 0.9391872137218594, "grad_norm": 0.35993415117263794, "learning_rate": 1.0077280286441283e-07, "loss": 0.009838799014687538, "memory(GiB)": 22.66, "step": 28911, "token_acc": 1.0, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.9392196991846149, "grad_norm": 0.33509859442710876, "learning_rate": 1.0066553100264997e-07, "loss": 0.01248255930840969, "memory(GiB)": 22.66, "step": 28912, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957827 }, { "epoch": 0.9392521846473703, "grad_norm": 0.41743147373199463, "learning_rate": 1.0055831568572872e-07, "loss": 0.008005491457879543, "memory(GiB)": 22.66, "step": 28913, "token_acc": 1.0, "train_speed(iter/s)": 0.957834 }, { "epoch": 0.9392846701101257, "grad_norm": 0.6003343462944031, "learning_rate": 1.0045115691488472e-07, "loss": 0.017492733895778656, "memory(GiB)": 22.66, "step": 28914, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957841 }, { "epoch": 0.9393171555728811, "grad_norm": 0.36159655451774597, "learning_rate": 1.0034405469135533e-07, "loss": 0.012671059928834438, "memory(GiB)": 22.66, "step": 28915, "token_acc": 0.9866666666666667, "train_speed(iter/s)": 0.957848 }, { "epoch": 0.9393496410356366, "grad_norm": 0.35389167070388794, "learning_rate": 1.0023700901637845e-07, "loss": 0.012766380794346333, "memory(GiB)": 22.66, "step": 28916, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957855 }, { "epoch": 0.9393821264983919, "grad_norm": 0.9694411158561707, "learning_rate": 1.0013001989118697e-07, "loss": 0.01166479755192995, "memory(GiB)": 22.66, "step": 28917, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957862 }, { "epoch": 0.9394146119611474, "grad_norm": 0.414549857378006, "learning_rate": 1.0002308731701715e-07, "loss": 0.010483724996447563, "memory(GiB)": 22.66, "step": 28918, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.9394470974239028, "grad_norm": 0.18796634674072266, "learning_rate": 9.991621129510243e-08, "loss": 0.0057046180590987206, "memory(GiB)": 22.66, "step": 28919, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.9394795828866582, "grad_norm": 0.32981544733047485, "learning_rate": 9.980939182667682e-08, "loss": 0.01816198229789734, "memory(GiB)": 22.66, "step": 28920, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.9395120683494136, "grad_norm": 0.3010324537754059, "learning_rate": 9.970262891297322e-08, "loss": 0.012013956904411316, "memory(GiB)": 22.66, "step": 28921, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.9395445538121691, "grad_norm": 0.36932072043418884, "learning_rate": 9.959592255522344e-08, "loss": 0.015659157186746597, "memory(GiB)": 22.66, "step": 28922, "token_acc": 1.0, "train_speed(iter/s)": 0.957891 }, { "epoch": 0.9395770392749244, "grad_norm": 0.3466937839984894, "learning_rate": 9.948927275465925e-08, "loss": 0.013365568593144417, "memory(GiB)": 22.66, "step": 28923, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957897 }, { "epoch": 0.9396095247376799, "grad_norm": 0.44500964879989624, "learning_rate": 9.93826795125108e-08, "loss": 0.01300761103630066, "memory(GiB)": 22.66, "step": 28924, "token_acc": 0.9927797833935018, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.9396420102004353, "grad_norm": 0.2854440212249756, "learning_rate": 9.92761428300093e-08, "loss": 0.00857330858707428, "memory(GiB)": 22.66, "step": 28925, "token_acc": 1.0, "train_speed(iter/s)": 0.957907 }, { "epoch": 0.9396744956631907, "grad_norm": 0.3252990245819092, "learning_rate": 9.916966270838435e-08, "loss": 0.007603409234434366, "memory(GiB)": 22.66, "step": 28926, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.957913 }, { "epoch": 0.9397069811259461, "grad_norm": 0.3436300754547119, "learning_rate": 9.906323914886384e-08, "loss": 0.01282704621553421, "memory(GiB)": 22.66, "step": 28927, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957918 }, { "epoch": 0.9397394665887016, "grad_norm": 0.2636498808860779, "learning_rate": 9.895687215267679e-08, "loss": 0.009900417178869247, "memory(GiB)": 22.66, "step": 28928, "token_acc": 1.0, "train_speed(iter/s)": 0.957924 }, { "epoch": 0.939771952051457, "grad_norm": 0.1809270977973938, "learning_rate": 9.885056172105112e-08, "loss": 0.006896571721881628, "memory(GiB)": 22.66, "step": 28929, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.9398044375142124, "grad_norm": 0.3778250515460968, "learning_rate": 9.874430785521305e-08, "loss": 0.009322521276772022, "memory(GiB)": 22.66, "step": 28930, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.957934 }, { "epoch": 0.9398369229769679, "grad_norm": 0.40275678038597107, "learning_rate": 9.863811055638938e-08, "loss": 0.0047737909480929375, "memory(GiB)": 22.66, "step": 28931, "token_acc": 1.0, "train_speed(iter/s)": 0.957938 }, { "epoch": 0.9398694084397232, "grad_norm": 0.8332924842834473, "learning_rate": 9.853196982580582e-08, "loss": 0.017597351223230362, "memory(GiB)": 22.66, "step": 28932, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957943 }, { "epoch": 0.9399018939024787, "grad_norm": 0.29974254965782166, "learning_rate": 9.842588566468636e-08, "loss": 0.010645594447851181, "memory(GiB)": 22.66, "step": 28933, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.939934379365234, "grad_norm": 0.37777578830718994, "learning_rate": 9.83198580742567e-08, "loss": 0.012659179978072643, "memory(GiB)": 22.66, "step": 28934, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.9399668648279895, "grad_norm": 0.38556426763534546, "learning_rate": 9.821388705574031e-08, "loss": 0.009178401902318, "memory(GiB)": 22.66, "step": 28935, "token_acc": 1.0, "train_speed(iter/s)": 0.957957 }, { "epoch": 0.9399993502907449, "grad_norm": 0.4437060058116913, "learning_rate": 9.810797261035954e-08, "loss": 0.016397040337324142, "memory(GiB)": 22.66, "step": 28936, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.957962 }, { "epoch": 0.9400318357535004, "grad_norm": 0.2760474383831024, "learning_rate": 9.80021147393373e-08, "loss": 0.007333814166486263, "memory(GiB)": 22.66, "step": 28937, "token_acc": 1.0, "train_speed(iter/s)": 0.957967 }, { "epoch": 0.9400643212162557, "grad_norm": 0.411390483379364, "learning_rate": 9.789631344389483e-08, "loss": 0.014390437863767147, "memory(GiB)": 22.66, "step": 28938, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.957971 }, { "epoch": 0.9400968066790112, "grad_norm": 0.332315593957901, "learning_rate": 9.77905687252545e-08, "loss": 0.009169444441795349, "memory(GiB)": 22.66, "step": 28939, "token_acc": 1.0, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.9401292921417665, "grad_norm": 0.2880227863788605, "learning_rate": 9.768488058463533e-08, "loss": 0.010362276807427406, "memory(GiB)": 22.66, "step": 28940, "token_acc": 1.0, "train_speed(iter/s)": 0.957982 }, { "epoch": 0.940161777604522, "grad_norm": 0.38214901089668274, "learning_rate": 9.757924902325744e-08, "loss": 0.009692072868347168, "memory(GiB)": 22.66, "step": 28941, "token_acc": 1.0, "train_speed(iter/s)": 0.957987 }, { "epoch": 0.9401942630672774, "grad_norm": 0.24958361685276031, "learning_rate": 9.747367404233987e-08, "loss": 0.006555670872330666, "memory(GiB)": 22.66, "step": 28942, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957991 }, { "epoch": 0.9402267485300329, "grad_norm": 0.32834699749946594, "learning_rate": 9.73681556431022e-08, "loss": 0.012533939443528652, "memory(GiB)": 22.66, "step": 28943, "token_acc": 1.0, "train_speed(iter/s)": 0.957997 }, { "epoch": 0.9402592339927882, "grad_norm": 0.23783555626869202, "learning_rate": 9.726269382676068e-08, "loss": 0.00608113594353199, "memory(GiB)": 22.66, "step": 28944, "token_acc": 1.0, "train_speed(iter/s)": 0.958002 }, { "epoch": 0.9402917194555437, "grad_norm": 0.40897348523139954, "learning_rate": 9.715728859453322e-08, "loss": 0.011806197464466095, "memory(GiB)": 22.66, "step": 28945, "token_acc": 0.98828125, "train_speed(iter/s)": 0.958007 }, { "epoch": 0.940324204918299, "grad_norm": 0.3197054862976074, "learning_rate": 9.705193994763606e-08, "loss": 0.013155915774405003, "memory(GiB)": 22.66, "step": 28946, "token_acc": 1.0, "train_speed(iter/s)": 0.958013 }, { "epoch": 0.9403566903810545, "grad_norm": 0.27867233753204346, "learning_rate": 9.694664788728602e-08, "loss": 0.007222680374979973, "memory(GiB)": 22.66, "step": 28947, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958018 }, { "epoch": 0.9403891758438099, "grad_norm": 0.3475838303565979, "learning_rate": 9.68414124146977e-08, "loss": 0.011923476122319698, "memory(GiB)": 22.66, "step": 28948, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958023 }, { "epoch": 0.9404216613065653, "grad_norm": 0.35746821761131287, "learning_rate": 9.673623353108508e-08, "loss": 0.007966886274516582, "memory(GiB)": 22.66, "step": 28949, "token_acc": 1.0, "train_speed(iter/s)": 0.958027 }, { "epoch": 0.9404541467693207, "grad_norm": 0.4062017798423767, "learning_rate": 9.663111123766277e-08, "loss": 0.011710267513990402, "memory(GiB)": 22.66, "step": 28950, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.9404866322320762, "grad_norm": 0.4155786335468292, "learning_rate": 9.652604553564315e-08, "loss": 0.015214631333947182, "memory(GiB)": 22.66, "step": 28951, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958038 }, { "epoch": 0.9405191176948315, "grad_norm": 0.4731941521167755, "learning_rate": 9.64210364262408e-08, "loss": 0.010718152858316898, "memory(GiB)": 22.66, "step": 28952, "token_acc": 0.992, "train_speed(iter/s)": 0.958043 }, { "epoch": 0.940551603157587, "grad_norm": 0.36180856823921204, "learning_rate": 9.631608391066528e-08, "loss": 0.011986501514911652, "memory(GiB)": 22.66, "step": 28953, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.958049 }, { "epoch": 0.9405840886203424, "grad_norm": 0.3130704462528229, "learning_rate": 9.621118799012952e-08, "loss": 0.0139778358861804, "memory(GiB)": 22.66, "step": 28954, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958055 }, { "epoch": 0.9406165740830978, "grad_norm": 0.4259040653705597, "learning_rate": 9.610634866584312e-08, "loss": 0.016548145562410355, "memory(GiB)": 22.66, "step": 28955, "token_acc": 1.0, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.9406490595458532, "grad_norm": 0.559167206287384, "learning_rate": 9.600156593901677e-08, "loss": 0.023421894758939743, "memory(GiB)": 22.66, "step": 28956, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.958068 }, { "epoch": 0.9406815450086087, "grad_norm": 0.3096812963485718, "learning_rate": 9.58968398108595e-08, "loss": 0.010040709748864174, "memory(GiB)": 22.66, "step": 28957, "token_acc": 1.0, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.940714030471364, "grad_norm": 0.25057676434516907, "learning_rate": 9.579217028258037e-08, "loss": 0.007695955224335194, "memory(GiB)": 22.66, "step": 28958, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.958081 }, { "epoch": 0.9407465159341195, "grad_norm": 0.4798973500728607, "learning_rate": 9.568755735538671e-08, "loss": 0.007950701750814915, "memory(GiB)": 22.66, "step": 28959, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.958088 }, { "epoch": 0.9407790013968749, "grad_norm": 0.3511010706424713, "learning_rate": 9.55830010304859e-08, "loss": 0.012798134237527847, "memory(GiB)": 22.66, "step": 28960, "token_acc": 0.996, "train_speed(iter/s)": 0.958095 }, { "epoch": 0.9408114868596303, "grad_norm": 0.3669225573539734, "learning_rate": 9.547850130908531e-08, "loss": 0.013835586607456207, "memory(GiB)": 22.66, "step": 28961, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.9408439723223857, "grad_norm": 0.3011663854122162, "learning_rate": 9.53740581923912e-08, "loss": 0.011974042281508446, "memory(GiB)": 22.66, "step": 28962, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.958108 }, { "epoch": 0.9408764577851412, "grad_norm": 0.3594306707382202, "learning_rate": 9.526967168160817e-08, "loss": 0.011637113988399506, "memory(GiB)": 22.66, "step": 28963, "token_acc": 1.0, "train_speed(iter/s)": 0.958115 }, { "epoch": 0.9409089432478965, "grad_norm": 0.5087999105453491, "learning_rate": 9.516534177794134e-08, "loss": 0.010529782623052597, "memory(GiB)": 22.66, "step": 28964, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.940941428710652, "grad_norm": 0.40808993577957153, "learning_rate": 9.506106848259477e-08, "loss": 0.01413333136588335, "memory(GiB)": 22.66, "step": 28965, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958128 }, { "epoch": 0.9409739141734074, "grad_norm": 0.37206894159317017, "learning_rate": 9.495685179677139e-08, "loss": 0.011191069148480892, "memory(GiB)": 22.66, "step": 28966, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.958136 }, { "epoch": 0.9410063996361628, "grad_norm": 0.3966522812843323, "learning_rate": 9.485269172167578e-08, "loss": 0.013418291695415974, "memory(GiB)": 22.66, "step": 28967, "token_acc": 1.0, "train_speed(iter/s)": 0.958142 }, { "epoch": 0.9410388850989182, "grad_norm": 0.24879583716392517, "learning_rate": 9.474858825850807e-08, "loss": 0.010270271450281143, "memory(GiB)": 22.66, "step": 28968, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.958149 }, { "epoch": 0.9410713705616737, "grad_norm": 0.40744441747665405, "learning_rate": 9.464454140847013e-08, "loss": 0.009973350912332535, "memory(GiB)": 22.66, "step": 28969, "token_acc": 1.0, "train_speed(iter/s)": 0.958156 }, { "epoch": 0.941103856024429, "grad_norm": 0.26351773738861084, "learning_rate": 9.454055117276372e-08, "loss": 0.007845016196370125, "memory(GiB)": 22.66, "step": 28970, "token_acc": 1.0, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.9411363414871845, "grad_norm": 0.3002000153064728, "learning_rate": 9.443661755258848e-08, "loss": 0.0071528139524161816, "memory(GiB)": 22.66, "step": 28971, "token_acc": 1.0, "train_speed(iter/s)": 0.95817 }, { "epoch": 0.9411688269499399, "grad_norm": 0.3488970994949341, "learning_rate": 9.433274054914399e-08, "loss": 0.008833449333906174, "memory(GiB)": 22.66, "step": 28972, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.9412013124126953, "grad_norm": 0.4223402738571167, "learning_rate": 9.422892016362927e-08, "loss": 0.014671172015368938, "memory(GiB)": 22.66, "step": 28973, "token_acc": 0.9935483870967742, "train_speed(iter/s)": 0.958184 }, { "epoch": 0.9412337978754507, "grad_norm": 0.4494447708129883, "learning_rate": 9.412515639724174e-08, "loss": 0.013246305286884308, "memory(GiB)": 22.66, "step": 28974, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.9412662833382062, "grad_norm": 0.24907374382019043, "learning_rate": 9.40214492511804e-08, "loss": 0.007765859365463257, "memory(GiB)": 22.66, "step": 28975, "token_acc": 0.992, "train_speed(iter/s)": 0.958198 }, { "epoch": 0.9412987688009615, "grad_norm": 0.2954525351524353, "learning_rate": 9.391779872664042e-08, "loss": 0.01113937608897686, "memory(GiB)": 22.66, "step": 28976, "token_acc": 1.0, "train_speed(iter/s)": 0.958204 }, { "epoch": 0.941331254263717, "grad_norm": 0.29072657227516174, "learning_rate": 9.381420482481973e-08, "loss": 0.010394725017249584, "memory(GiB)": 22.66, "step": 28977, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.9413637397264724, "grad_norm": 0.4012354910373688, "learning_rate": 9.37106675469135e-08, "loss": 0.012460202910006046, "memory(GiB)": 22.66, "step": 28978, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.958216 }, { "epoch": 0.9413962251892278, "grad_norm": 0.3011672794818878, "learning_rate": 9.360718689411519e-08, "loss": 0.009550623595714569, "memory(GiB)": 22.66, "step": 28979, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.958221 }, { "epoch": 0.9414287106519832, "grad_norm": 0.4376766085624695, "learning_rate": 9.350376286762164e-08, "loss": 0.008730387315154076, "memory(GiB)": 22.66, "step": 28980, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958227 }, { "epoch": 0.9414611961147387, "grad_norm": 0.39318448305130005, "learning_rate": 9.340039546862468e-08, "loss": 0.01268664002418518, "memory(GiB)": 22.66, "step": 28981, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.958232 }, { "epoch": 0.941493681577494, "grad_norm": 0.23570162057876587, "learning_rate": 9.329708469831778e-08, "loss": 0.005440386012196541, "memory(GiB)": 22.66, "step": 28982, "token_acc": 1.0, "train_speed(iter/s)": 0.958237 }, { "epoch": 0.9415261670402495, "grad_norm": 0.3507799506187439, "learning_rate": 9.319383055789333e-08, "loss": 0.010685930959880352, "memory(GiB)": 22.66, "step": 28983, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.958242 }, { "epoch": 0.9415586525030049, "grad_norm": 0.27081161737442017, "learning_rate": 9.309063304854316e-08, "loss": 0.009844748303294182, "memory(GiB)": 22.66, "step": 28984, "token_acc": 1.0, "train_speed(iter/s)": 0.958247 }, { "epoch": 0.9415911379657603, "grad_norm": 0.3180519640445709, "learning_rate": 9.298749217145797e-08, "loss": 0.011057235300540924, "memory(GiB)": 22.66, "step": 28985, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.9416236234285157, "grad_norm": 0.24103036522865295, "learning_rate": 9.288440792782904e-08, "loss": 0.011328869499266148, "memory(GiB)": 22.66, "step": 28986, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.958257 }, { "epoch": 0.9416561088912712, "grad_norm": 0.29307878017425537, "learning_rate": 9.278138031884432e-08, "loss": 0.011071345768868923, "memory(GiB)": 22.66, "step": 28987, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958262 }, { "epoch": 0.9416885943540265, "grad_norm": 1.3822946548461914, "learning_rate": 9.26784093456945e-08, "loss": 0.00969375018030405, "memory(GiB)": 22.66, "step": 28988, "token_acc": 0.991304347826087, "train_speed(iter/s)": 0.958268 }, { "epoch": 0.941721079816782, "grad_norm": 0.5036730170249939, "learning_rate": 9.257549500956753e-08, "loss": 0.01703840121626854, "memory(GiB)": 22.66, "step": 28989, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.9417535652795374, "grad_norm": 0.437425822019577, "learning_rate": 9.247263731165135e-08, "loss": 0.018251314759254456, "memory(GiB)": 22.66, "step": 28990, "token_acc": 1.0, "train_speed(iter/s)": 0.958278 }, { "epoch": 0.9417860507422928, "grad_norm": 0.5311468243598938, "learning_rate": 9.236983625313222e-08, "loss": 0.01662220060825348, "memory(GiB)": 22.66, "step": 28991, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958283 }, { "epoch": 0.9418185362050483, "grad_norm": 0.3643498718738556, "learning_rate": 9.226709183519756e-08, "loss": 0.008850375190377235, "memory(GiB)": 22.66, "step": 28992, "token_acc": 0.9921875, "train_speed(iter/s)": 0.958288 }, { "epoch": 0.9418510216678037, "grad_norm": 0.40909749269485474, "learning_rate": 9.216440405903249e-08, "loss": 0.008874550461769104, "memory(GiB)": 22.66, "step": 28993, "token_acc": 1.0, "train_speed(iter/s)": 0.958292 }, { "epoch": 0.9418835071305591, "grad_norm": 0.4746668338775635, "learning_rate": 9.206177292582275e-08, "loss": 0.008863246999680996, "memory(GiB)": 22.66, "step": 28994, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958296 }, { "epoch": 0.9419159925933145, "grad_norm": 0.3338053822517395, "learning_rate": 9.195919843675294e-08, "loss": 0.011264972388744354, "memory(GiB)": 22.66, "step": 28995, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958301 }, { "epoch": 0.94194847805607, "grad_norm": 0.43575960397720337, "learning_rate": 9.185668059300601e-08, "loss": 0.010949324816465378, "memory(GiB)": 22.66, "step": 28996, "token_acc": 1.0, "train_speed(iter/s)": 0.958305 }, { "epoch": 0.9419809635188253, "grad_norm": 0.42276903986930847, "learning_rate": 9.175421939576601e-08, "loss": 0.019377732649445534, "memory(GiB)": 22.66, "step": 28997, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.958309 }, { "epoch": 0.9420134489815808, "grad_norm": 0.4503127634525299, "learning_rate": 9.165181484621533e-08, "loss": 0.015521543100476265, "memory(GiB)": 22.66, "step": 28998, "token_acc": 1.0, "train_speed(iter/s)": 0.958315 }, { "epoch": 0.9420459344443362, "grad_norm": 0.2842247188091278, "learning_rate": 9.154946694553579e-08, "loss": 0.01277912501245737, "memory(GiB)": 22.66, "step": 28999, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.95832 }, { "epoch": 0.9420784199070916, "grad_norm": 0.42947468161582947, "learning_rate": 9.144717569490813e-08, "loss": 0.01255049929022789, "memory(GiB)": 22.66, "step": 29000, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.958325 }, { "epoch": 0.9420784199070916, "eval_loss": 0.01118981558829546, "eval_runtime": 81.2128, "eval_samples_per_second": 122.518, "eval_steps_per_second": 3.829, "eval_token_acc": 0.9954746472129696, "step": 29000 }, { "epoch": 0.942110905369847, "grad_norm": 0.27562299370765686, "learning_rate": 9.134494109551362e-08, "loss": 0.008446422405540943, "memory(GiB)": 22.66, "step": 29001, "token_acc": 0.9951242895295487, "train_speed(iter/s)": 0.955358 }, { "epoch": 0.9421433908326025, "grad_norm": 0.34353116154670715, "learning_rate": 9.124276314853131e-08, "loss": 0.010598821565508842, "memory(GiB)": 22.66, "step": 29002, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955362 }, { "epoch": 0.9421758762953578, "grad_norm": 0.2252991497516632, "learning_rate": 9.114064185514193e-08, "loss": 0.006534575019031763, "memory(GiB)": 22.66, "step": 29003, "token_acc": 1.0, "train_speed(iter/s)": 0.955367 }, { "epoch": 0.9422083617581133, "grad_norm": 0.3553597927093506, "learning_rate": 9.103857721652232e-08, "loss": 0.01126935612410307, "memory(GiB)": 22.66, "step": 29004, "token_acc": 1.0, "train_speed(iter/s)": 0.955373 }, { "epoch": 0.9422408472208686, "grad_norm": 0.5940036773681641, "learning_rate": 9.093656923385207e-08, "loss": 0.009632183238863945, "memory(GiB)": 22.66, "step": 29005, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955378 }, { "epoch": 0.9422733326836241, "grad_norm": 0.41426077485084534, "learning_rate": 9.08346179083075e-08, "loss": 0.011765331961214542, "memory(GiB)": 22.66, "step": 29006, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955383 }, { "epoch": 0.9423058181463795, "grad_norm": 0.3015417158603668, "learning_rate": 9.073272324106486e-08, "loss": 0.011385997757315636, "memory(GiB)": 22.66, "step": 29007, "token_acc": 1.0, "train_speed(iter/s)": 0.955387 }, { "epoch": 0.942338303609135, "grad_norm": 0.33644482493400574, "learning_rate": 9.06308852333021e-08, "loss": 0.014860501512885094, "memory(GiB)": 22.66, "step": 29008, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.955392 }, { "epoch": 0.9423707890718903, "grad_norm": 0.4551467001438141, "learning_rate": 9.052910388619219e-08, "loss": 0.0160776749253273, "memory(GiB)": 22.66, "step": 29009, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955397 }, { "epoch": 0.9424032745346458, "grad_norm": 0.2963193655014038, "learning_rate": 9.042737920091193e-08, "loss": 0.008101703599095345, "memory(GiB)": 22.66, "step": 29010, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955402 }, { "epoch": 0.9424357599974011, "grad_norm": 0.44881221652030945, "learning_rate": 9.032571117863375e-08, "loss": 0.014241212047636509, "memory(GiB)": 22.66, "step": 29011, "token_acc": 1.0, "train_speed(iter/s)": 0.955408 }, { "epoch": 0.9424682454601566, "grad_norm": 0.3121627867221832, "learning_rate": 9.022409982053171e-08, "loss": 0.008032452315092087, "memory(GiB)": 22.66, "step": 29012, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.955415 }, { "epoch": 0.942500730922912, "grad_norm": 0.2883869707584381, "learning_rate": 9.012254512777819e-08, "loss": 0.009065592661499977, "memory(GiB)": 22.66, "step": 29013, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955421 }, { "epoch": 0.9425332163856674, "grad_norm": 0.3263293504714966, "learning_rate": 9.002104710154613e-08, "loss": 0.007044203579425812, "memory(GiB)": 22.66, "step": 29014, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955428 }, { "epoch": 0.9425657018484228, "grad_norm": 0.4214455187320709, "learning_rate": 8.991960574300573e-08, "loss": 0.011419042944908142, "memory(GiB)": 22.66, "step": 29015, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955434 }, { "epoch": 0.9425981873111783, "grad_norm": 0.33867573738098145, "learning_rate": 8.981822105332827e-08, "loss": 0.012827470898628235, "memory(GiB)": 22.66, "step": 29016, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.9426306727739336, "grad_norm": 0.40961238741874695, "learning_rate": 8.971689303368447e-08, "loss": 0.014334912411868572, "memory(GiB)": 22.66, "step": 29017, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.955446 }, { "epoch": 0.9426631582366891, "grad_norm": 0.33210116624832153, "learning_rate": 8.961562168524341e-08, "loss": 0.009821772575378418, "memory(GiB)": 22.66, "step": 29018, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955452 }, { "epoch": 0.9426956436994445, "grad_norm": 0.25923600792884827, "learning_rate": 8.951440700917358e-08, "loss": 0.006821929477155209, "memory(GiB)": 22.66, "step": 29019, "token_acc": 1.0, "train_speed(iter/s)": 0.955457 }, { "epoch": 0.9427281291622, "grad_norm": 0.3559369444847107, "learning_rate": 8.941324900664295e-08, "loss": 0.011385651305317879, "memory(GiB)": 22.66, "step": 29020, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.955463 }, { "epoch": 0.9427606146249553, "grad_norm": 0.3946370780467987, "learning_rate": 8.931214767881946e-08, "loss": 0.013943242840468884, "memory(GiB)": 22.66, "step": 29021, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.95547 }, { "epoch": 0.9427931000877108, "grad_norm": 0.6370751857757568, "learning_rate": 8.921110302687053e-08, "loss": 0.01411996316164732, "memory(GiB)": 22.66, "step": 29022, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.9428255855504661, "grad_norm": 0.27527186274528503, "learning_rate": 8.911011505196132e-08, "loss": 0.009202349931001663, "memory(GiB)": 22.66, "step": 29023, "token_acc": 0.9824561403508771, "train_speed(iter/s)": 0.955482 }, { "epoch": 0.9428580710132216, "grad_norm": 0.4530394375324249, "learning_rate": 8.900918375525758e-08, "loss": 0.015590415336191654, "memory(GiB)": 22.66, "step": 29024, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955489 }, { "epoch": 0.942890556475977, "grad_norm": 0.29506102204322815, "learning_rate": 8.890830913792448e-08, "loss": 0.011039013043045998, "memory(GiB)": 22.66, "step": 29025, "token_acc": 1.0, "train_speed(iter/s)": 0.955496 }, { "epoch": 0.9429230419387324, "grad_norm": 0.3716736435890198, "learning_rate": 8.880749120112664e-08, "loss": 0.008135173469781876, "memory(GiB)": 22.66, "step": 29026, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955503 }, { "epoch": 0.9429555274014878, "grad_norm": 0.2390243411064148, "learning_rate": 8.870672994602702e-08, "loss": 0.006693452596664429, "memory(GiB)": 22.66, "step": 29027, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.9429880128642433, "grad_norm": 0.880638062953949, "learning_rate": 8.860602537378805e-08, "loss": 0.008509554900228977, "memory(GiB)": 22.66, "step": 29028, "token_acc": 1.0, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.9430204983269986, "grad_norm": 0.3899044692516327, "learning_rate": 8.850537748557319e-08, "loss": 0.010514404624700546, "memory(GiB)": 22.66, "step": 29029, "token_acc": 1.0, "train_speed(iter/s)": 0.955525 }, { "epoch": 0.9430529837897541, "grad_norm": 0.4779544174671173, "learning_rate": 8.840478628254267e-08, "loss": 0.020376110449433327, "memory(GiB)": 22.66, "step": 29030, "token_acc": 1.0, "train_speed(iter/s)": 0.955532 }, { "epoch": 0.9430854692525095, "grad_norm": 0.3455532193183899, "learning_rate": 8.830425176585889e-08, "loss": 0.012951691634953022, "memory(GiB)": 22.66, "step": 29031, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955539 }, { "epoch": 0.9431179547152649, "grad_norm": 0.4815480709075928, "learning_rate": 8.820377393668144e-08, "loss": 0.010957200080156326, "memory(GiB)": 22.66, "step": 29032, "token_acc": 0.996, "train_speed(iter/s)": 0.955546 }, { "epoch": 0.9431504401780203, "grad_norm": 0.44523200392723083, "learning_rate": 8.810335279617055e-08, "loss": 0.00948193110525608, "memory(GiB)": 22.66, "step": 29033, "token_acc": 0.992831541218638, "train_speed(iter/s)": 0.955554 }, { "epoch": 0.9431829256407758, "grad_norm": 0.4113514721393585, "learning_rate": 8.80029883454836e-08, "loss": 0.014719013124704361, "memory(GiB)": 22.66, "step": 29034, "token_acc": 0.9875, "train_speed(iter/s)": 0.955561 }, { "epoch": 0.9432154111035311, "grad_norm": 0.29702097177505493, "learning_rate": 8.790268058578077e-08, "loss": 0.011731287464499474, "memory(GiB)": 22.66, "step": 29035, "token_acc": 1.0, "train_speed(iter/s)": 0.955567 }, { "epoch": 0.9432478965662866, "grad_norm": 0.32587364315986633, "learning_rate": 8.780242951821839e-08, "loss": 0.007304128259420395, "memory(GiB)": 22.66, "step": 29036, "token_acc": 1.0, "train_speed(iter/s)": 0.955574 }, { "epoch": 0.943280382029042, "grad_norm": 0.39337676763534546, "learning_rate": 8.770223514395492e-08, "loss": 0.017337625846266747, "memory(GiB)": 22.66, "step": 29037, "token_acc": 0.986013986013986, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.9433128674917974, "grad_norm": 0.2878161072731018, "learning_rate": 8.760209746414506e-08, "loss": 0.010656815953552723, "memory(GiB)": 22.66, "step": 29038, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.9433453529545528, "grad_norm": 0.5296735763549805, "learning_rate": 8.75020164799456e-08, "loss": 0.011093879118561745, "memory(GiB)": 22.66, "step": 29039, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955596 }, { "epoch": 0.9433778384173083, "grad_norm": 0.5253976583480835, "learning_rate": 8.740199219251177e-08, "loss": 0.017572734504938126, "memory(GiB)": 22.66, "step": 29040, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.9434103238800636, "grad_norm": 0.534516453742981, "learning_rate": 8.730202460299708e-08, "loss": 0.014177758246660233, "memory(GiB)": 22.66, "step": 29041, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95561 }, { "epoch": 0.9434428093428191, "grad_norm": 0.4593289792537689, "learning_rate": 8.720211371255616e-08, "loss": 0.014549145475029945, "memory(GiB)": 22.66, "step": 29042, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.9434752948055745, "grad_norm": 0.5604687929153442, "learning_rate": 8.710225952234197e-08, "loss": 0.014985548332333565, "memory(GiB)": 22.66, "step": 29043, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.9435077802683299, "grad_norm": 0.39598947763442993, "learning_rate": 8.700246203350637e-08, "loss": 0.010975089855492115, "memory(GiB)": 22.66, "step": 29044, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95563 }, { "epoch": 0.9435402657310853, "grad_norm": 0.3593871295452118, "learning_rate": 8.690272124720123e-08, "loss": 0.01131180115044117, "memory(GiB)": 22.66, "step": 29045, "token_acc": 1.0, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.9435727511938408, "grad_norm": 0.4102326035499573, "learning_rate": 8.680303716457894e-08, "loss": 0.015109719708561897, "memory(GiB)": 22.66, "step": 29046, "token_acc": 1.0, "train_speed(iter/s)": 0.955636 }, { "epoch": 0.9436052366565961, "grad_norm": 0.3127395808696747, "learning_rate": 8.67034097867886e-08, "loss": 0.01035507582128048, "memory(GiB)": 22.66, "step": 29047, "token_acc": 0.996, "train_speed(iter/s)": 0.955644 }, { "epoch": 0.9436377221193516, "grad_norm": 0.3700977861881256, "learning_rate": 8.660383911498038e-08, "loss": 0.01323625072836876, "memory(GiB)": 22.66, "step": 29048, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955651 }, { "epoch": 0.943670207582107, "grad_norm": 0.45809364318847656, "learning_rate": 8.650432515030338e-08, "loss": 0.010340421460568905, "memory(GiB)": 22.66, "step": 29049, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955658 }, { "epoch": 0.9437026930448624, "grad_norm": 0.33797386288642883, "learning_rate": 8.640486789390667e-08, "loss": 0.008525941520929337, "memory(GiB)": 22.66, "step": 29050, "token_acc": 0.9966329966329966, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.9437351785076178, "grad_norm": 0.3474825620651245, "learning_rate": 8.630546734693767e-08, "loss": 0.007532959338277578, "memory(GiB)": 22.66, "step": 29051, "token_acc": 1.0, "train_speed(iter/s)": 0.955669 }, { "epoch": 0.9437676639703733, "grad_norm": 0.3234531581401825, "learning_rate": 8.620612351054436e-08, "loss": 0.01087981928139925, "memory(GiB)": 22.66, "step": 29052, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.955675 }, { "epoch": 0.9438001494331286, "grad_norm": 0.6943448185920715, "learning_rate": 8.610683638587136e-08, "loss": 0.009823191910982132, "memory(GiB)": 22.66, "step": 29053, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.955681 }, { "epoch": 0.9438326348958841, "grad_norm": 0.6345987915992737, "learning_rate": 8.60076059740661e-08, "loss": 0.012614685110747814, "memory(GiB)": 22.66, "step": 29054, "token_acc": 1.0, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.9438651203586395, "grad_norm": 0.43476513028144836, "learning_rate": 8.590843227627432e-08, "loss": 0.011346348561346531, "memory(GiB)": 22.66, "step": 29055, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.955693 }, { "epoch": 0.9438976058213949, "grad_norm": 0.36783120036125183, "learning_rate": 8.580931529363956e-08, "loss": 0.010286515578627586, "memory(GiB)": 22.66, "step": 29056, "token_acc": 0.992, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.9439300912841504, "grad_norm": 0.34287187457084656, "learning_rate": 8.57102550273059e-08, "loss": 0.013809384778141975, "memory(GiB)": 22.66, "step": 29057, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955704 }, { "epoch": 0.9439625767469058, "grad_norm": 0.20249632000923157, "learning_rate": 8.561125147841687e-08, "loss": 0.009372806176543236, "memory(GiB)": 22.66, "step": 29058, "token_acc": 1.0, "train_speed(iter/s)": 0.95571 }, { "epoch": 0.9439950622096612, "grad_norm": 0.2906985878944397, "learning_rate": 8.551230464811489e-08, "loss": 0.006914862431585789, "memory(GiB)": 22.66, "step": 29059, "token_acc": 1.0, "train_speed(iter/s)": 0.955715 }, { "epoch": 0.9440275476724166, "grad_norm": 0.30815377831459045, "learning_rate": 8.541341453754237e-08, "loss": 0.009458549320697784, "memory(GiB)": 22.66, "step": 29060, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.95572 }, { "epoch": 0.9440600331351721, "grad_norm": 0.2911970615386963, "learning_rate": 8.531458114784064e-08, "loss": 0.011213625781238079, "memory(GiB)": 22.66, "step": 29061, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.955726 }, { "epoch": 0.9440925185979274, "grad_norm": 1.1860460042953491, "learning_rate": 8.521580448014933e-08, "loss": 0.012651695869863033, "memory(GiB)": 22.66, "step": 29062, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.955731 }, { "epoch": 0.9441250040606829, "grad_norm": 0.24002128839492798, "learning_rate": 8.511708453560974e-08, "loss": 0.0067505366168916225, "memory(GiB)": 22.66, "step": 29063, "token_acc": 1.0, "train_speed(iter/s)": 0.955735 }, { "epoch": 0.9441574895234383, "grad_norm": 0.38053032755851746, "learning_rate": 8.501842131536042e-08, "loss": 0.015559861436486244, "memory(GiB)": 22.66, "step": 29064, "token_acc": 1.0, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.9441899749861937, "grad_norm": 0.41412296891212463, "learning_rate": 8.491981482054045e-08, "loss": 0.01100972481071949, "memory(GiB)": 22.66, "step": 29065, "token_acc": 1.0, "train_speed(iter/s)": 0.955745 }, { "epoch": 0.9442224604489491, "grad_norm": 0.30397123098373413, "learning_rate": 8.482126505228783e-08, "loss": 0.012658719904720783, "memory(GiB)": 22.66, "step": 29066, "token_acc": 1.0, "train_speed(iter/s)": 0.95575 }, { "epoch": 0.9442549459117046, "grad_norm": 0.27716848254203796, "learning_rate": 8.472277201173939e-08, "loss": 0.007132899947464466, "memory(GiB)": 22.66, "step": 29067, "token_acc": 1.0, "train_speed(iter/s)": 0.955755 }, { "epoch": 0.9442874313744599, "grad_norm": 0.3827551603317261, "learning_rate": 8.462433570003259e-08, "loss": 0.011388570070266724, "memory(GiB)": 22.66, "step": 29068, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.955761 }, { "epoch": 0.9443199168372154, "grad_norm": 0.4430631697177887, "learning_rate": 8.452595611830373e-08, "loss": 0.014838194474577904, "memory(GiB)": 22.66, "step": 29069, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955764 }, { "epoch": 0.9443524022999708, "grad_norm": 0.36349523067474365, "learning_rate": 8.442763326768799e-08, "loss": 0.015026547014713287, "memory(GiB)": 22.66, "step": 29070, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.95577 }, { "epoch": 0.9443848877627262, "grad_norm": 0.43391549587249756, "learning_rate": 8.43293671493195e-08, "loss": 0.0124660674482584, "memory(GiB)": 22.66, "step": 29071, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955776 }, { "epoch": 0.9444173732254816, "grad_norm": 0.3456514775753021, "learning_rate": 8.423115776433233e-08, "loss": 0.010744588449597359, "memory(GiB)": 22.66, "step": 29072, "token_acc": 1.0, "train_speed(iter/s)": 0.955782 }, { "epoch": 0.944449858688237, "grad_norm": 0.4067363739013672, "learning_rate": 8.413300511386113e-08, "loss": 0.011164280585944653, "memory(GiB)": 22.66, "step": 29073, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.955787 }, { "epoch": 0.9444823441509924, "grad_norm": 0.3332558870315552, "learning_rate": 8.403490919903834e-08, "loss": 0.010619265958666801, "memory(GiB)": 22.66, "step": 29074, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955793 }, { "epoch": 0.9445148296137479, "grad_norm": 0.3690628707408905, "learning_rate": 8.393687002099527e-08, "loss": 0.012780597433447838, "memory(GiB)": 22.66, "step": 29075, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.955798 }, { "epoch": 0.9445473150765032, "grad_norm": 0.3193793296813965, "learning_rate": 8.383888758086433e-08, "loss": 0.013302216306328773, "memory(GiB)": 22.66, "step": 29076, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.9445798005392587, "grad_norm": 0.2860895097255707, "learning_rate": 8.374096187977576e-08, "loss": 0.0064567686058580875, "memory(GiB)": 22.66, "step": 29077, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.955811 }, { "epoch": 0.9446122860020141, "grad_norm": 0.26526451110839844, "learning_rate": 8.36430929188603e-08, "loss": 0.0069838580675423145, "memory(GiB)": 22.66, "step": 29078, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.955817 }, { "epoch": 0.9446447714647696, "grad_norm": 0.3671470284461975, "learning_rate": 8.354528069924705e-08, "loss": 0.012829884886741638, "memory(GiB)": 22.66, "step": 29079, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955814 }, { "epoch": 0.9446772569275249, "grad_norm": 0.41367870569229126, "learning_rate": 8.344752522206512e-08, "loss": 0.0139012411236763, "memory(GiB)": 22.66, "step": 29080, "token_acc": 0.992, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.9447097423902804, "grad_norm": 0.4327322542667389, "learning_rate": 8.334982648844248e-08, "loss": 0.02215423248708248, "memory(GiB)": 22.66, "step": 29081, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.955825 }, { "epoch": 0.9447422278530357, "grad_norm": 0.49170851707458496, "learning_rate": 8.325218449950711e-08, "loss": 0.01675616018474102, "memory(GiB)": 22.66, "step": 29082, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.955831 }, { "epoch": 0.9447747133157912, "grad_norm": 0.2431056946516037, "learning_rate": 8.315459925638537e-08, "loss": 0.008934909477829933, "memory(GiB)": 22.66, "step": 29083, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955836 }, { "epoch": 0.9448071987785466, "grad_norm": 0.5795060396194458, "learning_rate": 8.305707076020408e-08, "loss": 0.01115361601114273, "memory(GiB)": 22.66, "step": 29084, "token_acc": 1.0, "train_speed(iter/s)": 0.955842 }, { "epoch": 0.944839684241302, "grad_norm": 0.3752666413784027, "learning_rate": 8.295959901208905e-08, "loss": 0.011282196268439293, "memory(GiB)": 22.66, "step": 29085, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955849 }, { "epoch": 0.9448721697040574, "grad_norm": 0.38442426919937134, "learning_rate": 8.286218401316382e-08, "loss": 0.011178969405591488, "memory(GiB)": 22.66, "step": 29086, "token_acc": 1.0, "train_speed(iter/s)": 0.955855 }, { "epoch": 0.9449046551668129, "grad_norm": 0.38623079657554626, "learning_rate": 8.276482576455469e-08, "loss": 0.01218328159302473, "memory(GiB)": 22.66, "step": 29087, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.9449371406295682, "grad_norm": 0.30510374903678894, "learning_rate": 8.266752426738356e-08, "loss": 0.012335195206105709, "memory(GiB)": 22.66, "step": 29088, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95587 }, { "epoch": 0.9449696260923237, "grad_norm": 0.3723902106285095, "learning_rate": 8.257027952277507e-08, "loss": 0.012056063860654831, "memory(GiB)": 22.66, "step": 29089, "token_acc": 1.0, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.9450021115550791, "grad_norm": 0.3452624976634979, "learning_rate": 8.247309153185001e-08, "loss": 0.012058050371706486, "memory(GiB)": 22.66, "step": 29090, "token_acc": 1.0, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.9450345970178345, "grad_norm": 0.39256560802459717, "learning_rate": 8.237596029573025e-08, "loss": 0.010594567283987999, "memory(GiB)": 22.66, "step": 29091, "token_acc": 1.0, "train_speed(iter/s)": 0.955891 }, { "epoch": 0.9450670824805899, "grad_norm": 0.2406473159790039, "learning_rate": 8.227888581553767e-08, "loss": 0.004656774457544088, "memory(GiB)": 22.66, "step": 29092, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.9450995679433454, "grad_norm": 0.3407095968723297, "learning_rate": 8.218186809239249e-08, "loss": 0.012317514047026634, "memory(GiB)": 22.66, "step": 29093, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955905 }, { "epoch": 0.9451320534061007, "grad_norm": 0.40399911999702454, "learning_rate": 8.208490712741435e-08, "loss": 0.011475339531898499, "memory(GiB)": 22.66, "step": 29094, "token_acc": 1.0, "train_speed(iter/s)": 0.955911 }, { "epoch": 0.9451645388688562, "grad_norm": 0.25514811277389526, "learning_rate": 8.198800292172127e-08, "loss": 0.006352968513965607, "memory(GiB)": 22.66, "step": 29095, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.9451970243316116, "grad_norm": 0.3715530335903168, "learning_rate": 8.189115547643289e-08, "loss": 0.009270118549466133, "memory(GiB)": 22.66, "step": 29096, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.955925 }, { "epoch": 0.945229509794367, "grad_norm": 0.3232578933238983, "learning_rate": 8.179436479266667e-08, "loss": 0.008442189544439316, "memory(GiB)": 22.66, "step": 29097, "token_acc": 1.0, "train_speed(iter/s)": 0.955932 }, { "epoch": 0.9452619952571224, "grad_norm": 0.2960885465145111, "learning_rate": 8.169763087153948e-08, "loss": 0.010494120419025421, "memory(GiB)": 22.66, "step": 29098, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955939 }, { "epoch": 0.9452944807198779, "grad_norm": 0.3259803056716919, "learning_rate": 8.160095371416821e-08, "loss": 0.006490098778158426, "memory(GiB)": 22.66, "step": 29099, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955946 }, { "epoch": 0.9453269661826332, "grad_norm": 0.32234644889831543, "learning_rate": 8.150433332166751e-08, "loss": 0.012439215555787086, "memory(GiB)": 22.66, "step": 29100, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.9453594516453887, "grad_norm": 0.4204871654510498, "learning_rate": 8.140776969515374e-08, "loss": 0.012269517406821251, "memory(GiB)": 22.66, "step": 29101, "token_acc": 1.0, "train_speed(iter/s)": 0.95596 }, { "epoch": 0.9453919371081441, "grad_norm": 0.3923800587654114, "learning_rate": 8.131126283574154e-08, "loss": 0.01261418592184782, "memory(GiB)": 22.66, "step": 29102, "token_acc": 1.0, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.9454244225708995, "grad_norm": 0.3762147128582001, "learning_rate": 8.121481274454335e-08, "loss": 0.013718420639634132, "memory(GiB)": 22.66, "step": 29103, "token_acc": 1.0, "train_speed(iter/s)": 0.955974 }, { "epoch": 0.9454569080336549, "grad_norm": 0.23510995507240295, "learning_rate": 8.111841942267329e-08, "loss": 0.007889415137469769, "memory(GiB)": 22.66, "step": 29104, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.9454893934964104, "grad_norm": 0.2056911736726761, "learning_rate": 8.10220828712438e-08, "loss": 0.006904867477715015, "memory(GiB)": 22.66, "step": 29105, "token_acc": 1.0, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.9455218789591657, "grad_norm": 0.4248986840248108, "learning_rate": 8.092580309136678e-08, "loss": 0.014922544360160828, "memory(GiB)": 22.66, "step": 29106, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955994 }, { "epoch": 0.9455543644219212, "grad_norm": 0.36902233958244324, "learning_rate": 8.082958008415298e-08, "loss": 0.010163863189518452, "memory(GiB)": 22.66, "step": 29107, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956 }, { "epoch": 0.9455868498846766, "grad_norm": 0.21990132331848145, "learning_rate": 8.073341385071376e-08, "loss": 0.009365024976432323, "memory(GiB)": 22.66, "step": 29108, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956007 }, { "epoch": 0.945619335347432, "grad_norm": 0.26752325892448425, "learning_rate": 8.063730439215712e-08, "loss": 0.007074439432471991, "memory(GiB)": 22.66, "step": 29109, "token_acc": 1.0, "train_speed(iter/s)": 0.956013 }, { "epoch": 0.9456518208101874, "grad_norm": 0.28304240107536316, "learning_rate": 8.054125170959492e-08, "loss": 0.007778579834848642, "memory(GiB)": 22.66, "step": 29110, "token_acc": 1.0, "train_speed(iter/s)": 0.956018 }, { "epoch": 0.9456843062729429, "grad_norm": 0.30631789565086365, "learning_rate": 8.044525580413409e-08, "loss": 0.01051116082817316, "memory(GiB)": 22.66, "step": 29111, "token_acc": 1.0, "train_speed(iter/s)": 0.956023 }, { "epoch": 0.9457167917356982, "grad_norm": 0.32866010069847107, "learning_rate": 8.034931667688317e-08, "loss": 0.0121434535831213, "memory(GiB)": 22.66, "step": 29112, "token_acc": 1.0, "train_speed(iter/s)": 0.956028 }, { "epoch": 0.9457492771984537, "grad_norm": 0.3302842080593109, "learning_rate": 8.025343432894905e-08, "loss": 0.013591698370873928, "memory(GiB)": 22.66, "step": 29113, "token_acc": 0.984375, "train_speed(iter/s)": 0.956033 }, { "epoch": 0.9457817626612091, "grad_norm": 0.2996610999107361, "learning_rate": 8.015760876143807e-08, "loss": 0.010939870029687881, "memory(GiB)": 22.66, "step": 29114, "token_acc": 1.0, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.9458142481239645, "grad_norm": 0.4093291759490967, "learning_rate": 8.006183997545658e-08, "loss": 0.01015429012477398, "memory(GiB)": 22.66, "step": 29115, "token_acc": 1.0, "train_speed(iter/s)": 0.956044 }, { "epoch": 0.9458467335867199, "grad_norm": 0.4202199876308441, "learning_rate": 7.996612797211035e-08, "loss": 0.011472542770206928, "memory(GiB)": 22.66, "step": 29116, "token_acc": 1.0, "train_speed(iter/s)": 0.956049 }, { "epoch": 0.9458792190494754, "grad_norm": 0.41992223262786865, "learning_rate": 7.987047275250348e-08, "loss": 0.009937869384884834, "memory(GiB)": 22.66, "step": 29117, "token_acc": 0.9952830188679245, "train_speed(iter/s)": 0.956054 }, { "epoch": 0.9459117045122307, "grad_norm": 0.38831910490989685, "learning_rate": 7.977487431773956e-08, "loss": 0.011775809340178967, "memory(GiB)": 22.66, "step": 29118, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95606 }, { "epoch": 0.9459441899749862, "grad_norm": 0.3565613627433777, "learning_rate": 7.967933266892269e-08, "loss": 0.007605655584484339, "memory(GiB)": 22.66, "step": 29119, "token_acc": 1.0, "train_speed(iter/s)": 0.956065 }, { "epoch": 0.9459766754377417, "grad_norm": 0.2827376425266266, "learning_rate": 7.958384780715533e-08, "loss": 0.006329354364424944, "memory(GiB)": 22.66, "step": 29120, "token_acc": 0.9962962962962963, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.946009160900497, "grad_norm": 0.40806978940963745, "learning_rate": 7.948841973353993e-08, "loss": 0.016903286799788475, "memory(GiB)": 22.66, "step": 29121, "token_acc": 0.9963768115942029, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.9460416463632525, "grad_norm": 0.34827443957328796, "learning_rate": 7.939304844917672e-08, "loss": 0.01428508386015892, "memory(GiB)": 22.66, "step": 29122, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.9460741318260079, "grad_norm": 0.2864679992198944, "learning_rate": 7.929773395516705e-08, "loss": 0.011174908839166164, "memory(GiB)": 22.66, "step": 29123, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.9461066172887633, "grad_norm": 0.19994892179965973, "learning_rate": 7.920247625261057e-08, "loss": 0.0065750619396567345, "memory(GiB)": 22.66, "step": 29124, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956089 }, { "epoch": 0.9461391027515187, "grad_norm": 0.2807050049304962, "learning_rate": 7.91072753426081e-08, "loss": 0.005205688066780567, "memory(GiB)": 22.66, "step": 29125, "token_acc": 1.0, "train_speed(iter/s)": 0.956094 }, { "epoch": 0.9461715882142742, "grad_norm": 0.38046032190322876, "learning_rate": 7.901213122625651e-08, "loss": 0.017007822170853615, "memory(GiB)": 22.66, "step": 29126, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956099 }, { "epoch": 0.9462040736770295, "grad_norm": 0.27191662788391113, "learning_rate": 7.891704390465493e-08, "loss": 0.008347326889634132, "memory(GiB)": 22.66, "step": 29127, "token_acc": 1.0, "train_speed(iter/s)": 0.956104 }, { "epoch": 0.946236559139785, "grad_norm": 0.3778756856918335, "learning_rate": 7.882201337890084e-08, "loss": 0.010366843082010746, "memory(GiB)": 22.66, "step": 29128, "token_acc": 1.0, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.9462690446025404, "grad_norm": 0.23125337064266205, "learning_rate": 7.87270396500911e-08, "loss": 0.005780323408544064, "memory(GiB)": 22.66, "step": 29129, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956114 }, { "epoch": 0.9463015300652958, "grad_norm": 0.3887196183204651, "learning_rate": 7.863212271932096e-08, "loss": 0.010525182820856571, "memory(GiB)": 22.66, "step": 29130, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.95612 }, { "epoch": 0.9463340155280512, "grad_norm": 0.3577762544155121, "learning_rate": 7.853726258768679e-08, "loss": 0.01125369593501091, "memory(GiB)": 22.66, "step": 29131, "token_acc": 1.0, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.9463665009908067, "grad_norm": 0.3309357762336731, "learning_rate": 7.844245925628269e-08, "loss": 0.007528829853981733, "memory(GiB)": 22.66, "step": 29132, "token_acc": 1.0, "train_speed(iter/s)": 0.956132 }, { "epoch": 0.946398986453562, "grad_norm": 0.6229130029678345, "learning_rate": 7.834771272620334e-08, "loss": 0.018091917037963867, "memory(GiB)": 22.66, "step": 29133, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.9464314719163175, "grad_norm": 0.34792521595954895, "learning_rate": 7.825302299854232e-08, "loss": 0.012807808816432953, "memory(GiB)": 22.66, "step": 29134, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.9464639573790729, "grad_norm": 0.35956674814224243, "learning_rate": 7.815839007439207e-08, "loss": 0.009896786883473396, "memory(GiB)": 22.66, "step": 29135, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.9464964428418283, "grad_norm": 0.38412603735923767, "learning_rate": 7.806381395484507e-08, "loss": 0.010888950899243355, "memory(GiB)": 22.66, "step": 29136, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.9465289283045837, "grad_norm": 0.31454968452453613, "learning_rate": 7.796929464099213e-08, "loss": 0.007852557115256786, "memory(GiB)": 22.66, "step": 29137, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.956161 }, { "epoch": 0.9465614137673392, "grad_norm": 0.419511079788208, "learning_rate": 7.787483213392566e-08, "loss": 0.016254102811217308, "memory(GiB)": 22.66, "step": 29138, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956167 }, { "epoch": 0.9465938992300945, "grad_norm": 0.3047787547111511, "learning_rate": 7.778042643473427e-08, "loss": 0.01124654896557331, "memory(GiB)": 22.66, "step": 29139, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956173 }, { "epoch": 0.94662638469285, "grad_norm": 0.39364001154899597, "learning_rate": 7.768607754450874e-08, "loss": 0.009430211037397385, "memory(GiB)": 22.66, "step": 29140, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956178 }, { "epoch": 0.9466588701556053, "grad_norm": 0.32517170906066895, "learning_rate": 7.75917854643371e-08, "loss": 0.01341219712048769, "memory(GiB)": 22.66, "step": 29141, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.9466913556183608, "grad_norm": 0.4094589948654175, "learning_rate": 7.749755019530791e-08, "loss": 0.01719418540596962, "memory(GiB)": 22.66, "step": 29142, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.9467238410811162, "grad_norm": 0.2949437201023102, "learning_rate": 7.740337173850864e-08, "loss": 0.012643889524042606, "memory(GiB)": 22.66, "step": 29143, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956196 }, { "epoch": 0.9467563265438717, "grad_norm": 0.34044671058654785, "learning_rate": 7.730925009502732e-08, "loss": 0.006760007701814175, "memory(GiB)": 22.66, "step": 29144, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956202 }, { "epoch": 0.946788812006627, "grad_norm": 0.40043067932128906, "learning_rate": 7.721518526594863e-08, "loss": 0.012749306857585907, "memory(GiB)": 22.66, "step": 29145, "token_acc": 1.0, "train_speed(iter/s)": 0.956207 }, { "epoch": 0.9468212974693825, "grad_norm": 0.282158762216568, "learning_rate": 7.712117725235891e-08, "loss": 0.012488292530179024, "memory(GiB)": 22.66, "step": 29146, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956214 }, { "epoch": 0.9468537829321378, "grad_norm": 0.2286987155675888, "learning_rate": 7.70272260553434e-08, "loss": 0.007805074565112591, "memory(GiB)": 22.66, "step": 29147, "token_acc": 1.0, "train_speed(iter/s)": 0.95622 }, { "epoch": 0.9468862683948933, "grad_norm": 0.23949062824249268, "learning_rate": 7.693333167598571e-08, "loss": 0.005655036773532629, "memory(GiB)": 22.66, "step": 29148, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.9469187538576487, "grad_norm": 0.33896538615226746, "learning_rate": 7.68394941153705e-08, "loss": 0.014041556045413017, "memory(GiB)": 22.66, "step": 29149, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.956232 }, { "epoch": 0.9469512393204041, "grad_norm": 0.35555994510650635, "learning_rate": 7.674571337458025e-08, "loss": 0.00893182959407568, "memory(GiB)": 22.66, "step": 29150, "token_acc": 1.0, "train_speed(iter/s)": 0.956238 }, { "epoch": 0.9469837247831595, "grad_norm": 0.38222163915634155, "learning_rate": 7.665198945469687e-08, "loss": 0.010982006788253784, "memory(GiB)": 22.66, "step": 29151, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956246 }, { "epoch": 0.947016210245915, "grad_norm": 0.2899170219898224, "learning_rate": 7.655832235680283e-08, "loss": 0.007154906168580055, "memory(GiB)": 22.66, "step": 29152, "token_acc": 0.9963898916967509, "train_speed(iter/s)": 0.956253 }, { "epoch": 0.9470486957086703, "grad_norm": 0.5371736884117126, "learning_rate": 7.646471208197892e-08, "loss": 0.015744203701615334, "memory(GiB)": 22.66, "step": 29153, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.95626 }, { "epoch": 0.9470811811714258, "grad_norm": 0.3325735926628113, "learning_rate": 7.637115863130485e-08, "loss": 0.010251898318529129, "memory(GiB)": 22.66, "step": 29154, "token_acc": 0.996, "train_speed(iter/s)": 0.956267 }, { "epoch": 0.9471136666341812, "grad_norm": 0.39850568771362305, "learning_rate": 7.62776620058614e-08, "loss": 0.012988809496164322, "memory(GiB)": 22.66, "step": 29155, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956274 }, { "epoch": 0.9471461520969366, "grad_norm": 0.35600796341896057, "learning_rate": 7.618422220672717e-08, "loss": 0.00864974595606327, "memory(GiB)": 22.66, "step": 29156, "token_acc": 1.0, "train_speed(iter/s)": 0.956281 }, { "epoch": 0.947178637559692, "grad_norm": 0.5635718703269958, "learning_rate": 7.609083923498073e-08, "loss": 0.009071782231330872, "memory(GiB)": 22.66, "step": 29157, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956288 }, { "epoch": 0.9472111230224475, "grad_norm": 0.32389551401138306, "learning_rate": 7.599751309169956e-08, "loss": 0.008203400298953056, "memory(GiB)": 22.66, "step": 29158, "token_acc": 1.0, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.9472436084852028, "grad_norm": 0.40317946672439575, "learning_rate": 7.590424377796113e-08, "loss": 0.013765234500169754, "memory(GiB)": 22.66, "step": 29159, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.956302 }, { "epoch": 0.9472760939479583, "grad_norm": 0.4111800491809845, "learning_rate": 7.581103129484125e-08, "loss": 0.010730598121881485, "memory(GiB)": 22.66, "step": 29160, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.956309 }, { "epoch": 0.9473085794107137, "grad_norm": 0.3657901883125305, "learning_rate": 7.571787564341626e-08, "loss": 0.009189736098051071, "memory(GiB)": 22.66, "step": 29161, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956316 }, { "epoch": 0.9473410648734691, "grad_norm": 0.3614537715911865, "learning_rate": 7.562477682476143e-08, "loss": 0.007078154012560844, "memory(GiB)": 22.66, "step": 29162, "token_acc": 1.0, "train_speed(iter/s)": 0.956323 }, { "epoch": 0.9473735503362245, "grad_norm": 0.2824322581291199, "learning_rate": 7.553173483995091e-08, "loss": 0.010454574599862099, "memory(GiB)": 22.66, "step": 29163, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.95633 }, { "epoch": 0.94740603579898, "grad_norm": 0.5867152810096741, "learning_rate": 7.543874969005882e-08, "loss": 0.019973283633589745, "memory(GiB)": 22.66, "step": 29164, "token_acc": 0.9840425531914894, "train_speed(iter/s)": 0.956335 }, { "epoch": 0.9474385212617353, "grad_norm": 0.23983195424079895, "learning_rate": 7.534582137615709e-08, "loss": 0.011475041508674622, "memory(GiB)": 22.66, "step": 29165, "token_acc": 1.0, "train_speed(iter/s)": 0.956342 }, { "epoch": 0.9474710067244908, "grad_norm": 0.20017847418785095, "learning_rate": 7.525294989932042e-08, "loss": 0.0070731136947870255, "memory(GiB)": 22.66, "step": 29166, "token_acc": 1.0, "train_speed(iter/s)": 0.956349 }, { "epoch": 0.9475034921872462, "grad_norm": 0.2703368067741394, "learning_rate": 7.516013526061905e-08, "loss": 0.010121611878275871, "memory(GiB)": 22.66, "step": 29167, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956356 }, { "epoch": 0.9475359776500016, "grad_norm": 0.2403598576784134, "learning_rate": 7.506737746112546e-08, "loss": 0.006100357510149479, "memory(GiB)": 22.66, "step": 29168, "token_acc": 1.0, "train_speed(iter/s)": 0.956363 }, { "epoch": 0.947568463112757, "grad_norm": 0.43491291999816895, "learning_rate": 7.497467650190826e-08, "loss": 0.010387522168457508, "memory(GiB)": 22.66, "step": 29169, "token_acc": 1.0, "train_speed(iter/s)": 0.956369 }, { "epoch": 0.9476009485755125, "grad_norm": 0.397379070520401, "learning_rate": 7.488203238403879e-08, "loss": 0.012342510744929314, "memory(GiB)": 22.66, "step": 29170, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.9476334340382678, "grad_norm": 0.37463945150375366, "learning_rate": 7.478944510858622e-08, "loss": 0.0086966035887599, "memory(GiB)": 22.66, "step": 29171, "token_acc": 1.0, "train_speed(iter/s)": 0.956379 }, { "epoch": 0.9476659195010233, "grad_norm": 0.421032577753067, "learning_rate": 7.46969146766191e-08, "loss": 0.020223800092935562, "memory(GiB)": 22.66, "step": 29172, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.956385 }, { "epoch": 0.9476984049637787, "grad_norm": 0.2592312693595886, "learning_rate": 7.460444108920495e-08, "loss": 0.008263538591563702, "memory(GiB)": 22.66, "step": 29173, "token_acc": 1.0, "train_speed(iter/s)": 0.95639 }, { "epoch": 0.9477308904265341, "grad_norm": 0.3306451439857483, "learning_rate": 7.451202434741123e-08, "loss": 0.009445914067327976, "memory(GiB)": 22.66, "step": 29174, "token_acc": 1.0, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.9477633758892895, "grad_norm": 0.5347924828529358, "learning_rate": 7.44196644523043e-08, "loss": 0.013588760048151016, "memory(GiB)": 22.66, "step": 29175, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.947795861352045, "grad_norm": 0.2662113904953003, "learning_rate": 7.432736140495056e-08, "loss": 0.007875525392591953, "memory(GiB)": 22.66, "step": 29176, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956406 }, { "epoch": 0.9478283468148003, "grad_norm": 0.2607949674129486, "learning_rate": 7.423511520641524e-08, "loss": 0.009313157759606838, "memory(GiB)": 22.66, "step": 29177, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.95641 }, { "epoch": 0.9478608322775558, "grad_norm": 0.2971371114253998, "learning_rate": 7.414292585776251e-08, "loss": 0.008409148082137108, "memory(GiB)": 22.66, "step": 29178, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.956415 }, { "epoch": 0.9478933177403112, "grad_norm": 0.266012579202652, "learning_rate": 7.405079336005705e-08, "loss": 0.00872662104666233, "memory(GiB)": 22.66, "step": 29179, "token_acc": 1.0, "train_speed(iter/s)": 0.956421 }, { "epoch": 0.9479258032030666, "grad_norm": 0.37279197573661804, "learning_rate": 7.395871771436191e-08, "loss": 0.016605306416749954, "memory(GiB)": 22.66, "step": 29180, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956426 }, { "epoch": 0.947958288665822, "grad_norm": 0.45503127574920654, "learning_rate": 7.386669892173958e-08, "loss": 0.017338445410132408, "memory(GiB)": 22.66, "step": 29181, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.9479907741285775, "grad_norm": 0.36629292368888855, "learning_rate": 7.377473698325255e-08, "loss": 0.013989045284688473, "memory(GiB)": 22.66, "step": 29182, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956435 }, { "epoch": 0.9480232595913328, "grad_norm": 0.26937970519065857, "learning_rate": 7.368283189996107e-08, "loss": 0.00374866696074605, "memory(GiB)": 22.66, "step": 29183, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.9480557450540883, "grad_norm": 0.2752704620361328, "learning_rate": 7.359098367292705e-08, "loss": 0.007484552450478077, "memory(GiB)": 22.66, "step": 29184, "token_acc": 1.0, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.9480882305168438, "grad_norm": 0.25858622789382935, "learning_rate": 7.349919230321079e-08, "loss": 0.009818707592785358, "memory(GiB)": 22.66, "step": 29185, "token_acc": 1.0, "train_speed(iter/s)": 0.956452 }, { "epoch": 0.9481207159795991, "grad_norm": 0.3855638802051544, "learning_rate": 7.340745779186975e-08, "loss": 0.01079610176384449, "memory(GiB)": 22.66, "step": 29186, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.9481532014423546, "grad_norm": 0.2723146677017212, "learning_rate": 7.331578013996531e-08, "loss": 0.00996251031756401, "memory(GiB)": 22.66, "step": 29187, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.94818568690511, "grad_norm": 0.3982604444026947, "learning_rate": 7.322415934855276e-08, "loss": 0.01681249774992466, "memory(GiB)": 22.66, "step": 29188, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956468 }, { "epoch": 0.9482181723678654, "grad_norm": 0.25210583209991455, "learning_rate": 7.313259541869177e-08, "loss": 0.008756663650274277, "memory(GiB)": 22.66, "step": 29189, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.9482506578306208, "grad_norm": 0.41010239720344543, "learning_rate": 7.304108835143819e-08, "loss": 0.008082550019025803, "memory(GiB)": 22.66, "step": 29190, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.9482831432933763, "grad_norm": 0.35315364599227905, "learning_rate": 7.29496381478484e-08, "loss": 0.01068693958222866, "memory(GiB)": 22.66, "step": 29191, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956485 }, { "epoch": 0.9483156287561316, "grad_norm": 0.3239648938179016, "learning_rate": 7.285824480897707e-08, "loss": 0.008035257458686829, "memory(GiB)": 22.66, "step": 29192, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.9483481142188871, "grad_norm": 0.3819340467453003, "learning_rate": 7.276690833587951e-08, "loss": 0.013768700882792473, "memory(GiB)": 22.66, "step": 29193, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.956497 }, { "epoch": 0.9483805996816425, "grad_norm": 0.38427984714508057, "learning_rate": 7.267562872961042e-08, "loss": 0.014603711664676666, "memory(GiB)": 22.66, "step": 29194, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.9484130851443979, "grad_norm": 0.32227301597595215, "learning_rate": 7.258440599122285e-08, "loss": 0.011061630211770535, "memory(GiB)": 22.66, "step": 29195, "token_acc": 1.0, "train_speed(iter/s)": 0.956509 }, { "epoch": 0.9484455706071533, "grad_norm": 0.2801451086997986, "learning_rate": 7.249324012176984e-08, "loss": 0.006165171042084694, "memory(GiB)": 22.66, "step": 29196, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956515 }, { "epoch": 0.9484780560699088, "grad_norm": 0.4210514724254608, "learning_rate": 7.240213112230277e-08, "loss": 0.011091341264545918, "memory(GiB)": 22.66, "step": 29197, "token_acc": 1.0, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.9485105415326641, "grad_norm": 0.2725956439971924, "learning_rate": 7.231107899387413e-08, "loss": 0.007532578893005848, "memory(GiB)": 22.66, "step": 29198, "token_acc": 1.0, "train_speed(iter/s)": 0.956526 }, { "epoch": 0.9485430269954196, "grad_norm": 0.33976083993911743, "learning_rate": 7.22200837375342e-08, "loss": 0.009180118329823017, "memory(GiB)": 22.66, "step": 29199, "token_acc": 1.0, "train_speed(iter/s)": 0.956532 }, { "epoch": 0.948575512458175, "grad_norm": 0.3406468331813812, "learning_rate": 7.21291453543338e-08, "loss": 0.009528698399662971, "memory(GiB)": 22.66, "step": 29200, "token_acc": 1.0, "train_speed(iter/s)": 0.956537 }, { "epoch": 0.9486079979209304, "grad_norm": 0.36987239122390747, "learning_rate": 7.203826384532153e-08, "loss": 0.007150566205382347, "memory(GiB)": 22.66, "step": 29201, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956543 }, { "epoch": 0.9486404833836858, "grad_norm": 0.3532586395740509, "learning_rate": 7.194743921154656e-08, "loss": 0.00944727472960949, "memory(GiB)": 22.66, "step": 29202, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956549 }, { "epoch": 0.9486729688464413, "grad_norm": 0.21960458159446716, "learning_rate": 7.185667145405751e-08, "loss": 0.007448169402778149, "memory(GiB)": 22.66, "step": 29203, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956555 }, { "epoch": 0.9487054543091966, "grad_norm": 0.517555296421051, "learning_rate": 7.176596057390239e-08, "loss": 0.01703200489282608, "memory(GiB)": 22.66, "step": 29204, "token_acc": 0.9807692307692307, "train_speed(iter/s)": 0.956561 }, { "epoch": 0.9487379397719521, "grad_norm": 0.4748122990131378, "learning_rate": 7.167530657212706e-08, "loss": 0.016594957560300827, "memory(GiB)": 22.66, "step": 29205, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.956567 }, { "epoch": 0.9487704252347074, "grad_norm": 0.2804437279701233, "learning_rate": 7.158470944977847e-08, "loss": 0.011381767690181732, "memory(GiB)": 22.66, "step": 29206, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956572 }, { "epoch": 0.9488029106974629, "grad_norm": 0.26007041335105896, "learning_rate": 7.149416920790186e-08, "loss": 0.004858959000557661, "memory(GiB)": 22.66, "step": 29207, "token_acc": 1.0, "train_speed(iter/s)": 0.956577 }, { "epoch": 0.9488353961602183, "grad_norm": 0.30155959725379944, "learning_rate": 7.140368584754253e-08, "loss": 0.009902730584144592, "memory(GiB)": 22.66, "step": 29208, "token_acc": 1.0, "train_speed(iter/s)": 0.956582 }, { "epoch": 0.9488678816229738, "grad_norm": 0.24712137877941132, "learning_rate": 7.131325936974465e-08, "loss": 0.007306791376322508, "memory(GiB)": 22.66, "step": 29209, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.9489003670857291, "grad_norm": 0.30028074979782104, "learning_rate": 7.122288977555181e-08, "loss": 0.010622807778418064, "memory(GiB)": 22.66, "step": 29210, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956594 }, { "epoch": 0.9489328525484846, "grad_norm": 0.3588886260986328, "learning_rate": 7.113257706600652e-08, "loss": 0.009208733215928078, "memory(GiB)": 22.66, "step": 29211, "token_acc": 1.0, "train_speed(iter/s)": 0.9566 }, { "epoch": 0.94896533801124, "grad_norm": 0.3389579951763153, "learning_rate": 7.104232124215183e-08, "loss": 0.011936334893107414, "memory(GiB)": 22.66, "step": 29212, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.956607 }, { "epoch": 0.9489978234739954, "grad_norm": 0.3376705050468445, "learning_rate": 7.095212230502968e-08, "loss": 0.014077226631343365, "memory(GiB)": 22.66, "step": 29213, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.956614 }, { "epoch": 0.9490303089367508, "grad_norm": 0.28109973669052124, "learning_rate": 7.086198025568036e-08, "loss": 0.007047167047858238, "memory(GiB)": 22.66, "step": 29214, "token_acc": 1.0, "train_speed(iter/s)": 0.956621 }, { "epoch": 0.9490627943995062, "grad_norm": 0.332124263048172, "learning_rate": 7.07718950951447e-08, "loss": 0.016573382541537285, "memory(GiB)": 22.66, "step": 29215, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956628 }, { "epoch": 0.9490952798622616, "grad_norm": 0.29208895564079285, "learning_rate": 7.068186682446132e-08, "loss": 0.008478965610265732, "memory(GiB)": 22.66, "step": 29216, "token_acc": 1.0, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.9491277653250171, "grad_norm": 0.3755490481853485, "learning_rate": 7.059189544467105e-08, "loss": 0.011346550658345222, "memory(GiB)": 22.66, "step": 29217, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.956642 }, { "epoch": 0.9491602507877724, "grad_norm": 0.6431478261947632, "learning_rate": 7.050198095681082e-08, "loss": 0.00670898612588644, "memory(GiB)": 22.66, "step": 29218, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956649 }, { "epoch": 0.9491927362505279, "grad_norm": 0.25761017203330994, "learning_rate": 7.041212336191872e-08, "loss": 0.00843889731913805, "memory(GiB)": 22.66, "step": 29219, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.9492252217132833, "grad_norm": 0.3232285976409912, "learning_rate": 7.032232266103223e-08, "loss": 0.00772374402731657, "memory(GiB)": 22.66, "step": 29220, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956663 }, { "epoch": 0.9492577071760387, "grad_norm": 0.34844163060188293, "learning_rate": 7.023257885518775e-08, "loss": 0.011247726157307625, "memory(GiB)": 22.66, "step": 29221, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95667 }, { "epoch": 0.9492901926387941, "grad_norm": 0.21481995284557343, "learning_rate": 7.014289194542002e-08, "loss": 0.009179874323308468, "memory(GiB)": 22.66, "step": 29222, "token_acc": 1.0, "train_speed(iter/s)": 0.956677 }, { "epoch": 0.9493226781015496, "grad_norm": 0.6218960881233215, "learning_rate": 7.00532619327654e-08, "loss": 0.014373255893588066, "memory(GiB)": 22.66, "step": 29223, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.956684 }, { "epoch": 0.9493551635643049, "grad_norm": 0.3686319589614868, "learning_rate": 6.996368881825865e-08, "loss": 0.00894167646765709, "memory(GiB)": 22.66, "step": 29224, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.956691 }, { "epoch": 0.9493876490270604, "grad_norm": 0.3641924262046814, "learning_rate": 6.987417260293172e-08, "loss": 0.012088578194379807, "memory(GiB)": 22.66, "step": 29225, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.956698 }, { "epoch": 0.9494201344898158, "grad_norm": 0.45507606863975525, "learning_rate": 6.978471328781932e-08, "loss": 0.01410604640841484, "memory(GiB)": 22.66, "step": 29226, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956705 }, { "epoch": 0.9494526199525712, "grad_norm": 0.36970072984695435, "learning_rate": 6.969531087395287e-08, "loss": 0.008654588833451271, "memory(GiB)": 22.66, "step": 29227, "token_acc": 1.0, "train_speed(iter/s)": 0.956712 }, { "epoch": 0.9494851054153266, "grad_norm": 0.33215585350990295, "learning_rate": 6.960596536236596e-08, "loss": 0.009030463173985481, "memory(GiB)": 22.66, "step": 29228, "token_acc": 1.0, "train_speed(iter/s)": 0.956719 }, { "epoch": 0.9495175908780821, "grad_norm": 0.21767883002758026, "learning_rate": 6.951667675408724e-08, "loss": 0.008101978339254856, "memory(GiB)": 22.66, "step": 29229, "token_acc": 1.0, "train_speed(iter/s)": 0.956725 }, { "epoch": 0.9495500763408374, "grad_norm": 0.3618553578853607, "learning_rate": 6.94274450501492e-08, "loss": 0.012840105220675468, "memory(GiB)": 22.66, "step": 29230, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95673 }, { "epoch": 0.9495825618035929, "grad_norm": 0.321571946144104, "learning_rate": 6.933827025158102e-08, "loss": 0.011618731543421745, "memory(GiB)": 22.66, "step": 29231, "token_acc": 0.989247311827957, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.9496150472663483, "grad_norm": 0.36576226353645325, "learning_rate": 6.924915235941188e-08, "loss": 0.010759241878986359, "memory(GiB)": 22.66, "step": 29232, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956741 }, { "epoch": 0.9496475327291037, "grad_norm": 0.41833749413490295, "learning_rate": 6.916009137467039e-08, "loss": 0.01347077265381813, "memory(GiB)": 22.66, "step": 29233, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956746 }, { "epoch": 0.9496800181918591, "grad_norm": 0.6414588689804077, "learning_rate": 6.907108729838408e-08, "loss": 0.011161316186189651, "memory(GiB)": 22.66, "step": 29234, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956752 }, { "epoch": 0.9497125036546146, "grad_norm": 0.3111119270324707, "learning_rate": 6.898214013158045e-08, "loss": 0.009249350056052208, "memory(GiB)": 22.66, "step": 29235, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.956757 }, { "epoch": 0.9497449891173699, "grad_norm": 0.43835482001304626, "learning_rate": 6.889324987528646e-08, "loss": 0.016135364770889282, "memory(GiB)": 22.66, "step": 29236, "token_acc": 1.0, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.9497774745801254, "grad_norm": 0.32972678542137146, "learning_rate": 6.880441653052739e-08, "loss": 0.010312462225556374, "memory(GiB)": 22.66, "step": 29237, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.956767 }, { "epoch": 0.9498099600428808, "grad_norm": 0.3350929915904999, "learning_rate": 6.87156400983291e-08, "loss": 0.008242849260568619, "memory(GiB)": 22.66, "step": 29238, "token_acc": 1.0, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.9498424455056362, "grad_norm": 0.3904121220111847, "learning_rate": 6.862692057971521e-08, "loss": 0.011411557905375957, "memory(GiB)": 22.66, "step": 29239, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.956777 }, { "epoch": 0.9498749309683916, "grad_norm": 0.4441383481025696, "learning_rate": 6.853825797571045e-08, "loss": 0.013632798567414284, "memory(GiB)": 22.66, "step": 29240, "token_acc": 0.9933774834437086, "train_speed(iter/s)": 0.956782 }, { "epoch": 0.9499074164311471, "grad_norm": 0.35241296887397766, "learning_rate": 6.844965228733846e-08, "loss": 0.011461482383310795, "memory(GiB)": 22.66, "step": 29241, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.956787 }, { "epoch": 0.9499399018939024, "grad_norm": 0.4150823652744293, "learning_rate": 6.83611035156212e-08, "loss": 0.011156396940350533, "memory(GiB)": 22.66, "step": 29242, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956793 }, { "epoch": 0.9499723873566579, "grad_norm": 0.47628626227378845, "learning_rate": 6.827261166158061e-08, "loss": 0.019996490329504013, "memory(GiB)": 22.66, "step": 29243, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.956798 }, { "epoch": 0.9500048728194133, "grad_norm": 0.978240430355072, "learning_rate": 6.818417672623812e-08, "loss": 0.008845699019730091, "memory(GiB)": 22.66, "step": 29244, "token_acc": 1.0, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.9500373582821687, "grad_norm": 0.29378819465637207, "learning_rate": 6.809579871061456e-08, "loss": 0.009790593758225441, "memory(GiB)": 22.66, "step": 29245, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.9500698437449241, "grad_norm": 0.41465699672698975, "learning_rate": 6.800747761572968e-08, "loss": 0.015578098595142365, "memory(GiB)": 22.66, "step": 29246, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.9501023292076796, "grad_norm": 0.6733373403549194, "learning_rate": 6.791921344260377e-08, "loss": 0.01822672411799431, "memory(GiB)": 22.66, "step": 29247, "token_acc": 0.9880239520958084, "train_speed(iter/s)": 0.956818 }, { "epoch": 0.950134814670435, "grad_norm": 0.2956160306930542, "learning_rate": 6.78310061922538e-08, "loss": 0.009732123464345932, "memory(GiB)": 22.66, "step": 29248, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.956824 }, { "epoch": 0.9501673001331904, "grad_norm": 0.24423879384994507, "learning_rate": 6.774285586569895e-08, "loss": 0.007140656933188438, "memory(GiB)": 22.66, "step": 29249, "token_acc": 1.0, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.9501997855959459, "grad_norm": 0.29152897000312805, "learning_rate": 6.765476246395674e-08, "loss": 0.00814690813422203, "memory(GiB)": 22.66, "step": 29250, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956836 }, { "epoch": 0.9502322710587012, "grad_norm": 0.2335047423839569, "learning_rate": 6.756672598804304e-08, "loss": 0.008755207061767578, "memory(GiB)": 22.66, "step": 29251, "token_acc": 0.995, "train_speed(iter/s)": 0.956841 }, { "epoch": 0.9502647565214567, "grad_norm": 0.33241474628448486, "learning_rate": 6.747874643897478e-08, "loss": 0.011218123137950897, "memory(GiB)": 22.66, "step": 29252, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.9502972419842121, "grad_norm": 0.30648839473724365, "learning_rate": 6.739082381776673e-08, "loss": 0.011350697837769985, "memory(GiB)": 22.66, "step": 29253, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.9503297274469675, "grad_norm": 0.21800671517848969, "learning_rate": 6.730295812543364e-08, "loss": 0.005530755966901779, "memory(GiB)": 22.66, "step": 29254, "token_acc": 1.0, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.9503622129097229, "grad_norm": 0.34840622544288635, "learning_rate": 6.721514936298968e-08, "loss": 0.014619557186961174, "memory(GiB)": 22.66, "step": 29255, "token_acc": 1.0, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.9503946983724784, "grad_norm": 0.41983455419540405, "learning_rate": 6.712739753144904e-08, "loss": 0.011792686767876148, "memory(GiB)": 22.66, "step": 29256, "token_acc": 1.0, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.9504271838352337, "grad_norm": 0.3056687116622925, "learning_rate": 6.703970263182369e-08, "loss": 0.009064532816410065, "memory(GiB)": 22.66, "step": 29257, "token_acc": 1.0, "train_speed(iter/s)": 0.956874 }, { "epoch": 0.9504596692979892, "grad_norm": 0.23930175602436066, "learning_rate": 6.69520646651256e-08, "loss": 0.006888374220579863, "memory(GiB)": 22.66, "step": 29258, "token_acc": 1.0, "train_speed(iter/s)": 0.956879 }, { "epoch": 0.9504921547607446, "grad_norm": 0.30345168709754944, "learning_rate": 6.686448363236675e-08, "loss": 0.008854042738676071, "memory(GiB)": 22.66, "step": 29259, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956884 }, { "epoch": 0.9505246402235, "grad_norm": 0.39359208941459656, "learning_rate": 6.677695953455798e-08, "loss": 0.014786532148718834, "memory(GiB)": 22.66, "step": 29260, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95689 }, { "epoch": 0.9505571256862554, "grad_norm": 0.2844642996788025, "learning_rate": 6.668949237270905e-08, "loss": 0.0059684524312615395, "memory(GiB)": 22.66, "step": 29261, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956895 }, { "epoch": 0.9505896111490109, "grad_norm": 0.24983704090118408, "learning_rate": 6.66020821478297e-08, "loss": 0.007440217304974794, "memory(GiB)": 22.66, "step": 29262, "token_acc": 1.0, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.9506220966117662, "grad_norm": 0.4395568072795868, "learning_rate": 6.651472886092858e-08, "loss": 0.01187010295689106, "memory(GiB)": 22.66, "step": 29263, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956906 }, { "epoch": 0.9506545820745217, "grad_norm": 0.3263123631477356, "learning_rate": 6.642743251301376e-08, "loss": 0.013320744037628174, "memory(GiB)": 22.66, "step": 29264, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956911 }, { "epoch": 0.950687067537277, "grad_norm": 0.37447038292884827, "learning_rate": 6.634019310509332e-08, "loss": 0.017431607469916344, "memory(GiB)": 22.66, "step": 29265, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956917 }, { "epoch": 0.9507195530000325, "grad_norm": 0.30522817373275757, "learning_rate": 6.625301063817368e-08, "loss": 0.015674106776714325, "memory(GiB)": 22.66, "step": 29266, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956923 }, { "epoch": 0.9507520384627879, "grad_norm": 0.37494155764579773, "learning_rate": 6.616588511326017e-08, "loss": 0.015623853541910648, "memory(GiB)": 22.66, "step": 29267, "token_acc": 1.0, "train_speed(iter/s)": 0.956929 }, { "epoch": 0.9507845239255434, "grad_norm": 0.3061322271823883, "learning_rate": 6.607881653136028e-08, "loss": 0.007193791214376688, "memory(GiB)": 22.66, "step": 29268, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.956933 }, { "epoch": 0.9508170093882987, "grad_norm": 0.2988418936729431, "learning_rate": 6.599180489347767e-08, "loss": 0.011288129724562168, "memory(GiB)": 22.66, "step": 29269, "token_acc": 0.995, "train_speed(iter/s)": 0.956938 }, { "epoch": 0.9508494948510542, "grad_norm": 0.4142707586288452, "learning_rate": 6.590485020061654e-08, "loss": 0.012568479403853416, "memory(GiB)": 22.66, "step": 29270, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956943 }, { "epoch": 0.9508819803138095, "grad_norm": 0.48346033692359924, "learning_rate": 6.581795245378109e-08, "loss": 0.009356294758617878, "memory(GiB)": 22.66, "step": 29271, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.956949 }, { "epoch": 0.950914465776565, "grad_norm": 0.3163701891899109, "learning_rate": 6.573111165397328e-08, "loss": 0.010805787518620491, "memory(GiB)": 22.66, "step": 29272, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956955 }, { "epoch": 0.9509469512393204, "grad_norm": 0.3400590419769287, "learning_rate": 6.564432780219676e-08, "loss": 0.007411235943436623, "memory(GiB)": 22.66, "step": 29273, "token_acc": 1.0, "train_speed(iter/s)": 0.95696 }, { "epoch": 0.9509794367020759, "grad_norm": 0.31113743782043457, "learning_rate": 6.555760089945185e-08, "loss": 0.010742133483290672, "memory(GiB)": 22.66, "step": 29274, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.9510119221648312, "grad_norm": 0.23325656354427338, "learning_rate": 6.547093094673995e-08, "loss": 0.008984782733023167, "memory(GiB)": 22.66, "step": 29275, "token_acc": 1.0, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.9510444076275867, "grad_norm": 0.35202541947364807, "learning_rate": 6.538431794506139e-08, "loss": 0.007177906110882759, "memory(GiB)": 22.66, "step": 29276, "token_acc": 1.0, "train_speed(iter/s)": 0.956978 }, { "epoch": 0.951076893090342, "grad_norm": 0.43418821692466736, "learning_rate": 6.529776189541592e-08, "loss": 0.011323470622301102, "memory(GiB)": 22.66, "step": 29277, "token_acc": 0.996, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.9511093785530975, "grad_norm": 0.22293823957443237, "learning_rate": 6.521126279880163e-08, "loss": 0.005458765663206577, "memory(GiB)": 22.66, "step": 29278, "token_acc": 1.0, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.9511418640158529, "grad_norm": 0.3536577820777893, "learning_rate": 6.512482065621884e-08, "loss": 0.00904841348528862, "memory(GiB)": 22.66, "step": 29279, "token_acc": 1.0, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.9511743494786083, "grad_norm": 0.4003370404243469, "learning_rate": 6.503843546866285e-08, "loss": 0.014921792782843113, "memory(GiB)": 22.66, "step": 29280, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957 }, { "epoch": 0.9512068349413637, "grad_norm": 0.3189050853252411, "learning_rate": 6.495210723713175e-08, "loss": 0.0117610152810812, "memory(GiB)": 22.66, "step": 29281, "token_acc": 1.0, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.9512393204041192, "grad_norm": 1.1493171453475952, "learning_rate": 6.486583596262252e-08, "loss": 0.018215417861938477, "memory(GiB)": 22.66, "step": 29282, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957011 }, { "epoch": 0.9512718058668745, "grad_norm": 0.34031030535697937, "learning_rate": 6.477962164612994e-08, "loss": 0.01706574112176895, "memory(GiB)": 22.66, "step": 29283, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.95130429132963, "grad_norm": 0.3880658745765686, "learning_rate": 6.469346428864875e-08, "loss": 0.015421233139932156, "memory(GiB)": 22.66, "step": 29284, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957022 }, { "epoch": 0.9513367767923854, "grad_norm": 0.4254341721534729, "learning_rate": 6.460736389117428e-08, "loss": 0.019648736342787743, "memory(GiB)": 22.66, "step": 29285, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957027 }, { "epoch": 0.9513692622551408, "grad_norm": 0.3419737219810486, "learning_rate": 6.452132045470016e-08, "loss": 0.008891387842595577, "memory(GiB)": 22.66, "step": 29286, "token_acc": 0.9947368421052631, "train_speed(iter/s)": 0.957033 }, { "epoch": 0.9514017477178962, "grad_norm": 0.3925080895423889, "learning_rate": 6.44353339802184e-08, "loss": 0.007567732594907284, "memory(GiB)": 22.66, "step": 29287, "token_acc": 1.0, "train_speed(iter/s)": 0.957038 }, { "epoch": 0.9514342331806517, "grad_norm": 0.7747835516929626, "learning_rate": 6.434940446872262e-08, "loss": 0.010200673714280128, "memory(GiB)": 22.66, "step": 29288, "token_acc": 1.0, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.951466718643407, "grad_norm": 0.2930789887905121, "learning_rate": 6.426353192120427e-08, "loss": 0.013414125889539719, "memory(GiB)": 22.66, "step": 29289, "token_acc": 1.0, "train_speed(iter/s)": 0.957047 }, { "epoch": 0.9514992041061625, "grad_norm": 0.34225407242774963, "learning_rate": 6.417771633865367e-08, "loss": 0.014210205525159836, "memory(GiB)": 22.66, "step": 29290, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.9515316895689179, "grad_norm": 0.5446131229400635, "learning_rate": 6.409195772206167e-08, "loss": 0.01805112138390541, "memory(GiB)": 22.66, "step": 29291, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957056 }, { "epoch": 0.9515641750316733, "grad_norm": 0.5604760050773621, "learning_rate": 6.400625607241917e-08, "loss": 0.013637160882353783, "memory(GiB)": 22.66, "step": 29292, "token_acc": 1.0, "train_speed(iter/s)": 0.957061 }, { "epoch": 0.9515966604944287, "grad_norm": 0.3858352601528168, "learning_rate": 6.392061139071315e-08, "loss": 0.008399798534810543, "memory(GiB)": 22.66, "step": 29293, "token_acc": 1.0, "train_speed(iter/s)": 0.957065 }, { "epoch": 0.9516291459571842, "grad_norm": 0.34651467204093933, "learning_rate": 6.383502367793393e-08, "loss": 0.012177655473351479, "memory(GiB)": 22.66, "step": 29294, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.95707 }, { "epoch": 0.9516616314199395, "grad_norm": 0.28074952960014343, "learning_rate": 6.374949293506794e-08, "loss": 0.010190749540925026, "memory(GiB)": 22.66, "step": 29295, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957074 }, { "epoch": 0.951694116882695, "grad_norm": 0.30326223373413086, "learning_rate": 6.366401916310327e-08, "loss": 0.01315758191049099, "memory(GiB)": 22.66, "step": 29296, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957078 }, { "epoch": 0.9517266023454504, "grad_norm": 0.2771832346916199, "learning_rate": 6.357860236302582e-08, "loss": 0.006886911578476429, "memory(GiB)": 22.66, "step": 29297, "token_acc": 1.0, "train_speed(iter/s)": 0.957083 }, { "epoch": 0.9517590878082058, "grad_norm": 0.19625310599803925, "learning_rate": 6.349324253582256e-08, "loss": 0.007335158064961433, "memory(GiB)": 22.66, "step": 29298, "token_acc": 1.0, "train_speed(iter/s)": 0.957088 }, { "epoch": 0.9517915732709612, "grad_norm": 0.3487919867038727, "learning_rate": 6.34079396824766e-08, "loss": 0.009011521935462952, "memory(GiB)": 22.66, "step": 29299, "token_acc": 1.0, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.9518240587337167, "grad_norm": 0.3338869512081146, "learning_rate": 6.332269380397382e-08, "loss": 0.009564102627336979, "memory(GiB)": 22.66, "step": 29300, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.951856544196472, "grad_norm": 0.4001823961734772, "learning_rate": 6.323750490129843e-08, "loss": 0.014448225498199463, "memory(GiB)": 22.66, "step": 29301, "token_acc": 1.0, "train_speed(iter/s)": 0.957102 }, { "epoch": 0.9518890296592275, "grad_norm": 0.4842917323112488, "learning_rate": 6.315237297543242e-08, "loss": 0.02033454179763794, "memory(GiB)": 22.66, "step": 29302, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.957107 }, { "epoch": 0.9519215151219829, "grad_norm": 0.26549988985061646, "learning_rate": 6.306729802735944e-08, "loss": 0.008722496218979359, "memory(GiB)": 22.66, "step": 29303, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957113 }, { "epoch": 0.9519540005847383, "grad_norm": 0.441887766122818, "learning_rate": 6.298228005806096e-08, "loss": 0.01433053333312273, "memory(GiB)": 22.66, "step": 29304, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957117 }, { "epoch": 0.9519864860474937, "grad_norm": 0.48783501982688904, "learning_rate": 6.289731906851781e-08, "loss": 0.01203547790646553, "memory(GiB)": 22.66, "step": 29305, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.957122 }, { "epoch": 0.9520189715102492, "grad_norm": 0.244105726480484, "learning_rate": 6.281241505971092e-08, "loss": 0.007081989198923111, "memory(GiB)": 22.66, "step": 29306, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957127 }, { "epoch": 0.9520514569730045, "grad_norm": 0.24942812323570251, "learning_rate": 6.272756803262003e-08, "loss": 0.00904780998826027, "memory(GiB)": 22.66, "step": 29307, "token_acc": 1.0, "train_speed(iter/s)": 0.957133 }, { "epoch": 0.95208394243576, "grad_norm": 0.2505960464477539, "learning_rate": 6.264277798822493e-08, "loss": 0.007397012319415808, "memory(GiB)": 22.66, "step": 29308, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.9521164278985154, "grad_norm": 0.4769079387187958, "learning_rate": 6.255804492750317e-08, "loss": 0.013169685378670692, "memory(GiB)": 22.66, "step": 29309, "token_acc": 1.0, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.9521489133612708, "grad_norm": 0.3822982609272003, "learning_rate": 6.247336885143341e-08, "loss": 0.012132201343774796, "memory(GiB)": 22.66, "step": 29310, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957153 }, { "epoch": 0.9521813988240262, "grad_norm": 0.3841097354888916, "learning_rate": 6.238874976099319e-08, "loss": 0.009945346042513847, "memory(GiB)": 22.66, "step": 29311, "token_acc": 1.0, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.9522138842867817, "grad_norm": 0.15864373743534088, "learning_rate": 6.23041876571584e-08, "loss": 0.004526956006884575, "memory(GiB)": 22.66, "step": 29312, "token_acc": 1.0, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.9522463697495371, "grad_norm": 0.3422599136829376, "learning_rate": 6.22196825409055e-08, "loss": 0.009918058291077614, "memory(GiB)": 22.66, "step": 29313, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957174 }, { "epoch": 0.9522788552122925, "grad_norm": 0.24647945165634155, "learning_rate": 6.213523441320979e-08, "loss": 0.007794711738824844, "memory(GiB)": 22.66, "step": 29314, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.952311340675048, "grad_norm": 0.30445268750190735, "learning_rate": 6.205084327504551e-08, "loss": 0.01196649856865406, "memory(GiB)": 22.66, "step": 29315, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957187 }, { "epoch": 0.9523438261378033, "grad_norm": 0.3238857388496399, "learning_rate": 6.196650912738688e-08, "loss": 0.007699314039200544, "memory(GiB)": 22.66, "step": 29316, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957195 }, { "epoch": 0.9523763116005588, "grad_norm": 0.36261630058288574, "learning_rate": 6.188223197120757e-08, "loss": 0.005212201736867428, "memory(GiB)": 22.66, "step": 29317, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957201 }, { "epoch": 0.9524087970633142, "grad_norm": 0.39373084902763367, "learning_rate": 6.179801180747957e-08, "loss": 0.010557052679359913, "memory(GiB)": 22.66, "step": 29318, "token_acc": 0.9966887417218543, "train_speed(iter/s)": 0.957208 }, { "epoch": 0.9524412825260696, "grad_norm": 0.35456329584121704, "learning_rate": 6.17138486371749e-08, "loss": 0.011231377720832825, "memory(GiB)": 22.66, "step": 29319, "token_acc": 1.0, "train_speed(iter/s)": 0.957215 }, { "epoch": 0.952473767988825, "grad_norm": 0.40692389011383057, "learning_rate": 6.162974246126608e-08, "loss": 0.013351911678910255, "memory(GiB)": 22.66, "step": 29320, "token_acc": 1.0, "train_speed(iter/s)": 0.957221 }, { "epoch": 0.9525062534515805, "grad_norm": 0.25808244943618774, "learning_rate": 6.154569328072235e-08, "loss": 0.006851489655673504, "memory(GiB)": 22.66, "step": 29321, "token_acc": 1.0, "train_speed(iter/s)": 0.957227 }, { "epoch": 0.9525387389143358, "grad_norm": 0.40633225440979004, "learning_rate": 6.146170109651517e-08, "loss": 0.010088855400681496, "memory(GiB)": 22.66, "step": 29322, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957234 }, { "epoch": 0.9525712243770913, "grad_norm": 0.37485048174858093, "learning_rate": 6.137776590961209e-08, "loss": 0.013436781242489815, "memory(GiB)": 22.66, "step": 29323, "token_acc": 0.9930555555555556, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.9526037098398467, "grad_norm": 0.4377378225326538, "learning_rate": 6.1293887720984e-08, "loss": 0.011943256482481956, "memory(GiB)": 22.66, "step": 29324, "token_acc": 1.0, "train_speed(iter/s)": 0.957248 }, { "epoch": 0.9526361953026021, "grad_norm": 0.2564903795719147, "learning_rate": 6.121006653159678e-08, "loss": 0.00794571079313755, "memory(GiB)": 22.66, "step": 29325, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957255 }, { "epoch": 0.9526686807653575, "grad_norm": 0.25773221254348755, "learning_rate": 6.112630234241967e-08, "loss": 0.01469082199037075, "memory(GiB)": 22.66, "step": 29326, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.957262 }, { "epoch": 0.952701166228113, "grad_norm": 0.2731724977493286, "learning_rate": 6.104259515441857e-08, "loss": 0.009769121184945107, "memory(GiB)": 22.66, "step": 29327, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.957269 }, { "epoch": 0.9527336516908683, "grad_norm": 0.2703778147697449, "learning_rate": 6.095894496855936e-08, "loss": 0.011096328496932983, "memory(GiB)": 22.66, "step": 29328, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.9527661371536238, "grad_norm": 0.3998807370662689, "learning_rate": 6.087535178580795e-08, "loss": 0.010116110555827618, "memory(GiB)": 22.66, "step": 29329, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957282 }, { "epoch": 0.9527986226163792, "grad_norm": 0.378885418176651, "learning_rate": 6.07918156071291e-08, "loss": 0.010852694511413574, "memory(GiB)": 22.66, "step": 29330, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957289 }, { "epoch": 0.9528311080791346, "grad_norm": 0.38693130016326904, "learning_rate": 6.07083364334865e-08, "loss": 0.01545384619385004, "memory(GiB)": 22.66, "step": 29331, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957296 }, { "epoch": 0.95286359354189, "grad_norm": 0.24484214186668396, "learning_rate": 6.062491426584327e-08, "loss": 0.008807454258203506, "memory(GiB)": 22.66, "step": 29332, "token_acc": 0.995, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.9528960790046455, "grad_norm": 0.22209084033966064, "learning_rate": 6.054154910516363e-08, "loss": 0.005812574177980423, "memory(GiB)": 22.66, "step": 29333, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957309 }, { "epoch": 0.9529285644674008, "grad_norm": 0.2781752645969391, "learning_rate": 6.045824095240904e-08, "loss": 0.010621365159749985, "memory(GiB)": 22.66, "step": 29334, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957316 }, { "epoch": 0.9529610499301563, "grad_norm": 0.516872763633728, "learning_rate": 6.037498980854095e-08, "loss": 0.01579192653298378, "memory(GiB)": 22.66, "step": 29335, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.9529935353929117, "grad_norm": 0.22747522592544556, "learning_rate": 6.029179567451916e-08, "loss": 0.004594774916768074, "memory(GiB)": 22.66, "step": 29336, "token_acc": 1.0, "train_speed(iter/s)": 0.95733 }, { "epoch": 0.9530260208556671, "grad_norm": 0.4214302599430084, "learning_rate": 6.020865855130564e-08, "loss": 0.013718641363084316, "memory(GiB)": 22.66, "step": 29337, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957337 }, { "epoch": 0.9530585063184225, "grad_norm": 0.2978963553905487, "learning_rate": 6.012557843985855e-08, "loss": 0.008961616083979607, "memory(GiB)": 22.66, "step": 29338, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.957342 }, { "epoch": 0.953090991781178, "grad_norm": 0.3578599691390991, "learning_rate": 6.004255534113768e-08, "loss": 0.015649035573005676, "memory(GiB)": 22.66, "step": 29339, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.957348 }, { "epoch": 0.9531234772439333, "grad_norm": 0.31604278087615967, "learning_rate": 5.995958925610056e-08, "loss": 0.00933514628559351, "memory(GiB)": 22.66, "step": 29340, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957353 }, { "epoch": 0.9531559627066888, "grad_norm": 0.7033827304840088, "learning_rate": 5.98766801857048e-08, "loss": 0.016411881893873215, "memory(GiB)": 22.66, "step": 29341, "token_acc": 0.984375, "train_speed(iter/s)": 0.957358 }, { "epoch": 0.9531884481694441, "grad_norm": 0.34081533551216125, "learning_rate": 5.979382813090739e-08, "loss": 0.009689349681138992, "memory(GiB)": 22.66, "step": 29342, "token_acc": 1.0, "train_speed(iter/s)": 0.957364 }, { "epoch": 0.9532209336321996, "grad_norm": 0.3767890930175781, "learning_rate": 5.971103309266535e-08, "loss": 0.012541000731289387, "memory(GiB)": 22.66, "step": 29343, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.957369 }, { "epoch": 0.953253419094955, "grad_norm": 0.3393263816833496, "learning_rate": 5.96282950719329e-08, "loss": 0.01179148256778717, "memory(GiB)": 22.66, "step": 29344, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.9532859045577105, "grad_norm": 1.0831586122512817, "learning_rate": 5.954561406966597e-08, "loss": 0.015894562005996704, "memory(GiB)": 22.66, "step": 29345, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95738 }, { "epoch": 0.9533183900204658, "grad_norm": 0.4262782335281372, "learning_rate": 5.9462990086817666e-08, "loss": 0.007921554148197174, "memory(GiB)": 22.66, "step": 29346, "token_acc": 1.0, "train_speed(iter/s)": 0.957386 }, { "epoch": 0.9533508754832213, "grad_norm": 0.24278385937213898, "learning_rate": 5.9380423124342225e-08, "loss": 0.006827057804912329, "memory(GiB)": 22.66, "step": 29347, "token_acc": 1.0, "train_speed(iter/s)": 0.957391 }, { "epoch": 0.9533833609459766, "grad_norm": 0.3346562683582306, "learning_rate": 5.9297913183193336e-08, "loss": 0.012015780434012413, "memory(GiB)": 22.66, "step": 29348, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.9534158464087321, "grad_norm": 0.6359005570411682, "learning_rate": 5.9215460264322453e-08, "loss": 0.01525246910750866, "memory(GiB)": 22.66, "step": 29349, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.9574 }, { "epoch": 0.9534483318714875, "grad_norm": 0.34931209683418274, "learning_rate": 5.913306436868105e-08, "loss": 0.01244056224822998, "memory(GiB)": 22.66, "step": 29350, "token_acc": 1.0, "train_speed(iter/s)": 0.957405 }, { "epoch": 0.953480817334243, "grad_norm": 0.35774704813957214, "learning_rate": 5.9050725497220575e-08, "loss": 0.008414149284362793, "memory(GiB)": 22.66, "step": 29351, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.9535133027969983, "grad_norm": 1.7500232458114624, "learning_rate": 5.8968443650891385e-08, "loss": 0.013112546876072884, "memory(GiB)": 22.66, "step": 29352, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957414 }, { "epoch": 0.9535457882597538, "grad_norm": 0.4458056688308716, "learning_rate": 5.888621883064272e-08, "loss": 0.011132226325571537, "memory(GiB)": 22.66, "step": 29353, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957418 }, { "epoch": 0.9535782737225091, "grad_norm": 0.30564725399017334, "learning_rate": 5.880405103742326e-08, "loss": 0.011783801950514317, "memory(GiB)": 22.66, "step": 29354, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.9536107591852646, "grad_norm": 0.25518178939819336, "learning_rate": 5.87219402721817e-08, "loss": 0.009094837121665478, "memory(GiB)": 22.66, "step": 29355, "token_acc": 1.0, "train_speed(iter/s)": 0.957427 }, { "epoch": 0.95364324464802, "grad_norm": 0.5995129942893982, "learning_rate": 5.8639886535866716e-08, "loss": 0.009157136082649231, "memory(GiB)": 22.66, "step": 29356, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957431 }, { "epoch": 0.9536757301107754, "grad_norm": 0.42535609006881714, "learning_rate": 5.8557889829423677e-08, "loss": 0.009208016097545624, "memory(GiB)": 22.66, "step": 29357, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957436 }, { "epoch": 0.9537082155735308, "grad_norm": 0.3851556181907654, "learning_rate": 5.847595015379959e-08, "loss": 0.009815978817641735, "memory(GiB)": 22.66, "step": 29358, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.95744 }, { "epoch": 0.9537407010362863, "grad_norm": 0.29192736744880676, "learning_rate": 5.839406750994037e-08, "loss": 0.010407942347228527, "memory(GiB)": 22.66, "step": 29359, "token_acc": 0.983739837398374, "train_speed(iter/s)": 0.957445 }, { "epoch": 0.9537731864990416, "grad_norm": 0.3274436593055725, "learning_rate": 5.831224189879081e-08, "loss": 0.009775741957128048, "memory(GiB)": 22.66, "step": 29360, "token_acc": 1.0, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.9538056719617971, "grad_norm": 0.3577702045440674, "learning_rate": 5.823047332129517e-08, "loss": 0.00906593818217516, "memory(GiB)": 22.66, "step": 29361, "token_acc": 1.0, "train_speed(iter/s)": 0.957454 }, { "epoch": 0.9538381574245525, "grad_norm": 0.18374522030353546, "learning_rate": 5.8148761778397676e-08, "loss": 0.004586589522659779, "memory(GiB)": 22.66, "step": 29362, "token_acc": 1.0, "train_speed(iter/s)": 0.957459 }, { "epoch": 0.9538706428873079, "grad_norm": 0.2793712317943573, "learning_rate": 5.8067107271040925e-08, "loss": 0.008706844411790371, "memory(GiB)": 22.66, "step": 29363, "token_acc": 1.0, "train_speed(iter/s)": 0.957464 }, { "epoch": 0.9539031283500633, "grad_norm": 0.3412427604198456, "learning_rate": 5.798550980016748e-08, "loss": 0.010341847315430641, "memory(GiB)": 22.66, "step": 29364, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.95747 }, { "epoch": 0.9539356138128188, "grad_norm": 0.3484269082546234, "learning_rate": 5.7903969366718825e-08, "loss": 0.016572393476963043, "memory(GiB)": 22.66, "step": 29365, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.9539680992755741, "grad_norm": 0.4127631187438965, "learning_rate": 5.782248597163642e-08, "loss": 0.01172583643347025, "memory(GiB)": 22.66, "step": 29366, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95748 }, { "epoch": 0.9540005847383296, "grad_norm": 0.2738721966743469, "learning_rate": 5.774105961586118e-08, "loss": 0.010218597948551178, "memory(GiB)": 22.66, "step": 29367, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.954033070201085, "grad_norm": 0.4265691637992859, "learning_rate": 5.7659690300331806e-08, "loss": 0.018088869750499725, "memory(GiB)": 22.66, "step": 29368, "token_acc": 1.0, "train_speed(iter/s)": 0.957494 }, { "epoch": 0.9540655556638404, "grad_norm": 0.5308858752250671, "learning_rate": 5.7578378025987536e-08, "loss": 0.0220707468688488, "memory(GiB)": 22.66, "step": 29369, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.9540980411265958, "grad_norm": 0.22849085927009583, "learning_rate": 5.749712279376707e-08, "loss": 0.007148034404963255, "memory(GiB)": 22.66, "step": 29370, "token_acc": 1.0, "train_speed(iter/s)": 0.957508 }, { "epoch": 0.9541305265893513, "grad_norm": 0.2822416424751282, "learning_rate": 5.741592460460854e-08, "loss": 0.009823406115174294, "memory(GiB)": 22.66, "step": 29371, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.9541630120521066, "grad_norm": 0.393964946269989, "learning_rate": 5.7334783459448985e-08, "loss": 0.007813668809831142, "memory(GiB)": 22.66, "step": 29372, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.9541954975148621, "grad_norm": 0.4254540205001831, "learning_rate": 5.7253699359224315e-08, "loss": 0.010733276605606079, "memory(GiB)": 22.66, "step": 29373, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957528 }, { "epoch": 0.9542279829776175, "grad_norm": 0.5810042023658752, "learning_rate": 5.7172672304871e-08, "loss": 0.015346135944128036, "memory(GiB)": 22.66, "step": 29374, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957535 }, { "epoch": 0.9542604684403729, "grad_norm": 0.38463708758354187, "learning_rate": 5.709170229732386e-08, "loss": 0.014093488454818726, "memory(GiB)": 22.66, "step": 29375, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.957542 }, { "epoch": 0.9542929539031284, "grad_norm": 0.3421558141708374, "learning_rate": 5.701078933751769e-08, "loss": 0.009880295023322105, "memory(GiB)": 22.66, "step": 29376, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.957548 }, { "epoch": 0.9543254393658838, "grad_norm": 0.34596821665763855, "learning_rate": 5.69299334263862e-08, "loss": 0.011778747662901878, "memory(GiB)": 22.66, "step": 29377, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957555 }, { "epoch": 0.9543579248286392, "grad_norm": 0.26511329412460327, "learning_rate": 5.684913456486141e-08, "loss": 0.009216544218361378, "memory(GiB)": 22.66, "step": 29378, "token_acc": 1.0, "train_speed(iter/s)": 0.957562 }, { "epoch": 0.9543904102913946, "grad_norm": 0.3913247883319855, "learning_rate": 5.6768392753877576e-08, "loss": 0.010213887318968773, "memory(GiB)": 22.66, "step": 29379, "token_acc": 1.0, "train_speed(iter/s)": 0.957569 }, { "epoch": 0.9544228957541501, "grad_norm": 0.2994181215763092, "learning_rate": 5.668770799436674e-08, "loss": 0.00915297120809555, "memory(GiB)": 22.66, "step": 29380, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.957576 }, { "epoch": 0.9544553812169054, "grad_norm": 0.7789552807807922, "learning_rate": 5.6607080287258146e-08, "loss": 0.016496650874614716, "memory(GiB)": 22.66, "step": 29381, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.957583 }, { "epoch": 0.9544878666796609, "grad_norm": 0.39030662178993225, "learning_rate": 5.652650963348438e-08, "loss": 0.015194128267467022, "memory(GiB)": 22.66, "step": 29382, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.95759 }, { "epoch": 0.9545203521424163, "grad_norm": 0.34410402178764343, "learning_rate": 5.6445996033973605e-08, "loss": 0.007844512350857258, "memory(GiB)": 22.66, "step": 29383, "token_acc": 1.0, "train_speed(iter/s)": 0.957596 }, { "epoch": 0.9545528376051717, "grad_norm": 0.3615399897098541, "learning_rate": 5.636553948965617e-08, "loss": 0.014491286128759384, "memory(GiB)": 22.66, "step": 29384, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.9545853230679271, "grad_norm": 0.3566492199897766, "learning_rate": 5.628514000146079e-08, "loss": 0.009021319448947906, "memory(GiB)": 22.66, "step": 29385, "token_acc": 1.0, "train_speed(iter/s)": 0.957611 }, { "epoch": 0.9546178085306826, "grad_norm": 0.6969236135482788, "learning_rate": 5.6204797570314494e-08, "loss": 0.0119716115295887, "memory(GiB)": 22.66, "step": 29386, "token_acc": 1.0, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.9546502939934379, "grad_norm": 0.3998132646083832, "learning_rate": 5.6124512197145434e-08, "loss": 0.016393570229411125, "memory(GiB)": 22.66, "step": 29387, "token_acc": 0.9775784753363229, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.9546827794561934, "grad_norm": 0.30583909153938293, "learning_rate": 5.6044283882879524e-08, "loss": 0.013128940016031265, "memory(GiB)": 22.66, "step": 29388, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.9547152649189488, "grad_norm": 0.2949603497982025, "learning_rate": 5.59641126284427e-08, "loss": 0.011463158763945103, "memory(GiB)": 22.66, "step": 29389, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957638 }, { "epoch": 0.9547477503817042, "grad_norm": 0.4622619152069092, "learning_rate": 5.588399843476089e-08, "loss": 0.015252063982188702, "memory(GiB)": 22.66, "step": 29390, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.9547802358444596, "grad_norm": 0.44352129101753235, "learning_rate": 5.580394130275835e-08, "loss": 0.013811830431222916, "memory(GiB)": 22.66, "step": 29391, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957651 }, { "epoch": 0.9548127213072151, "grad_norm": 0.2702294588088989, "learning_rate": 5.572394123335878e-08, "loss": 0.013304704800248146, "memory(GiB)": 22.66, "step": 29392, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957658 }, { "epoch": 0.9548452067699704, "grad_norm": 0.24719074368476868, "learning_rate": 5.5643998227485896e-08, "loss": 0.006310507655143738, "memory(GiB)": 22.66, "step": 29393, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.957665 }, { "epoch": 0.9548776922327259, "grad_norm": 0.330606609582901, "learning_rate": 5.556411228606173e-08, "loss": 0.010862931609153748, "memory(GiB)": 22.66, "step": 29394, "token_acc": 1.0, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.9549101776954813, "grad_norm": 0.40450575947761536, "learning_rate": 5.5484283410009445e-08, "loss": 0.015800103545188904, "memory(GiB)": 22.66, "step": 29395, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.9549426631582367, "grad_norm": 0.3529984652996063, "learning_rate": 5.54045116002494e-08, "loss": 0.009414201602339745, "memory(GiB)": 22.66, "step": 29396, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957686 }, { "epoch": 0.9549751486209921, "grad_norm": 0.2513839304447174, "learning_rate": 5.5324796857702533e-08, "loss": 0.007134915329515934, "memory(GiB)": 22.66, "step": 29397, "token_acc": 1.0, "train_speed(iter/s)": 0.957693 }, { "epoch": 0.9550076340837476, "grad_norm": 0.2207321673631668, "learning_rate": 5.524513918328866e-08, "loss": 0.008561298251152039, "memory(GiB)": 22.66, "step": 29398, "token_acc": 1.0, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.9550401195465029, "grad_norm": 0.4237916171550751, "learning_rate": 5.516553857792817e-08, "loss": 0.014554670080542564, "memory(GiB)": 22.66, "step": 29399, "token_acc": 1.0, "train_speed(iter/s)": 0.957707 }, { "epoch": 0.9550726050092584, "grad_norm": 0.3786129355430603, "learning_rate": 5.508599504253809e-08, "loss": 0.011100530624389648, "memory(GiB)": 22.66, "step": 29400, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957714 }, { "epoch": 0.9551050904720138, "grad_norm": 0.46222949028015137, "learning_rate": 5.5006508578037134e-08, "loss": 0.011955168098211288, "memory(GiB)": 22.66, "step": 29401, "token_acc": 1.0, "train_speed(iter/s)": 0.957719 }, { "epoch": 0.9551375759347692, "grad_norm": 0.36268994212150574, "learning_rate": 5.4927079185343456e-08, "loss": 0.008932540193200111, "memory(GiB)": 22.66, "step": 29402, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957725 }, { "epoch": 0.9551700613975246, "grad_norm": 0.36625853180885315, "learning_rate": 5.4847706865373e-08, "loss": 0.01328609324991703, "memory(GiB)": 22.66, "step": 29403, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.95773 }, { "epoch": 0.95520254686028, "grad_norm": 0.4048072099685669, "learning_rate": 5.476839161904168e-08, "loss": 0.012122994288802147, "memory(GiB)": 22.66, "step": 29404, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.957736 }, { "epoch": 0.9552350323230354, "grad_norm": 0.4028974771499634, "learning_rate": 5.468913344726601e-08, "loss": 0.01966594159603119, "memory(GiB)": 22.66, "step": 29405, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957741 }, { "epoch": 0.9552675177857909, "grad_norm": 0.2879488170146942, "learning_rate": 5.4609932350958575e-08, "loss": 0.011384193785488605, "memory(GiB)": 22.66, "step": 29406, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957747 }, { "epoch": 0.9553000032485462, "grad_norm": 0.3052608072757721, "learning_rate": 5.453078833103587e-08, "loss": 0.007569814566522837, "memory(GiB)": 22.66, "step": 29407, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957753 }, { "epoch": 0.9553324887113017, "grad_norm": 0.42237865924835205, "learning_rate": 5.445170138840994e-08, "loss": 0.016730897128582, "memory(GiB)": 22.66, "step": 29408, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957757 }, { "epoch": 0.9553649741740571, "grad_norm": 0.2308736890554428, "learning_rate": 5.437267152399395e-08, "loss": 0.005793095100671053, "memory(GiB)": 22.66, "step": 29409, "token_acc": 1.0, "train_speed(iter/s)": 0.957761 }, { "epoch": 0.9553974596368126, "grad_norm": 0.36070942878723145, "learning_rate": 5.4293698738700497e-08, "loss": 0.011026235297322273, "memory(GiB)": 22.66, "step": 29410, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.9554299450995679, "grad_norm": 0.3433989882469177, "learning_rate": 5.421478303343941e-08, "loss": 0.006058854516595602, "memory(GiB)": 22.66, "step": 29411, "token_acc": 1.0, "train_speed(iter/s)": 0.95777 }, { "epoch": 0.9554624305623234, "grad_norm": 0.2947690188884735, "learning_rate": 5.41359244091233e-08, "loss": 0.01151246391236782, "memory(GiB)": 22.66, "step": 29412, "token_acc": 0.9890510948905109, "train_speed(iter/s)": 0.957775 }, { "epoch": 0.9554949160250787, "grad_norm": 0.3262328803539276, "learning_rate": 5.405712286666087e-08, "loss": 0.00944683887064457, "memory(GiB)": 22.66, "step": 29413, "token_acc": 1.0, "train_speed(iter/s)": 0.95778 }, { "epoch": 0.9555274014878342, "grad_norm": 0.3403705358505249, "learning_rate": 5.397837840696307e-08, "loss": 0.015994329005479813, "memory(GiB)": 22.66, "step": 29414, "token_acc": 1.0, "train_speed(iter/s)": 0.957784 }, { "epoch": 0.9555598869505896, "grad_norm": 0.26491579413414, "learning_rate": 5.3899691030937505e-08, "loss": 0.009355831891298294, "memory(GiB)": 22.66, "step": 29415, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.955592372413345, "grad_norm": 0.3897307515144348, "learning_rate": 5.382106073949289e-08, "loss": 0.01574203558266163, "memory(GiB)": 22.66, "step": 29416, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957793 }, { "epoch": 0.9556248578761004, "grad_norm": 0.5794622898101807, "learning_rate": 5.374248753353628e-08, "loss": 0.01006692461669445, "memory(GiB)": 22.66, "step": 29417, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.9556573433388559, "grad_norm": 0.42611342668533325, "learning_rate": 5.366397141397528e-08, "loss": 0.012251514941453934, "memory(GiB)": 22.66, "step": 29418, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957802 }, { "epoch": 0.9556898288016112, "grad_norm": 0.22590461373329163, "learning_rate": 5.3585512381715276e-08, "loss": 0.004258151166141033, "memory(GiB)": 22.66, "step": 29419, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.957806 }, { "epoch": 0.9557223142643667, "grad_norm": 0.26713237166404724, "learning_rate": 5.3507110437662214e-08, "loss": 0.007418598048388958, "memory(GiB)": 22.66, "step": 29420, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95781 }, { "epoch": 0.9557547997271221, "grad_norm": 0.2059687077999115, "learning_rate": 5.342876558272092e-08, "loss": 0.007576110307127237, "memory(GiB)": 22.66, "step": 29421, "token_acc": 1.0, "train_speed(iter/s)": 0.957815 }, { "epoch": 0.9557872851898775, "grad_norm": 0.5438498854637146, "learning_rate": 5.3350477817795674e-08, "loss": 0.006694271694868803, "memory(GiB)": 22.66, "step": 29422, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.957819 }, { "epoch": 0.9558197706526329, "grad_norm": 0.3605581223964691, "learning_rate": 5.3272247143789644e-08, "loss": 0.011256711557507515, "memory(GiB)": 22.66, "step": 29423, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957824 }, { "epoch": 0.9558522561153884, "grad_norm": 0.3269636332988739, "learning_rate": 5.3194073561606e-08, "loss": 0.010296259075403214, "memory(GiB)": 22.66, "step": 29424, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957829 }, { "epoch": 0.9558847415781437, "grad_norm": 0.24846313893795013, "learning_rate": 5.3115957072146786e-08, "loss": 0.007597708143293858, "memory(GiB)": 22.66, "step": 29425, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957834 }, { "epoch": 0.9559172270408992, "grad_norm": 0.4838034510612488, "learning_rate": 5.3037897676314064e-08, "loss": 0.013841008767485619, "memory(GiB)": 22.66, "step": 29426, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.9559497125036546, "grad_norm": 0.3104338049888611, "learning_rate": 5.295989537500823e-08, "loss": 0.009673211723566055, "memory(GiB)": 22.66, "step": 29427, "token_acc": 1.0, "train_speed(iter/s)": 0.957847 }, { "epoch": 0.95598219796641, "grad_norm": 0.27792832255363464, "learning_rate": 5.288195016912967e-08, "loss": 0.009221244603395462, "memory(GiB)": 22.66, "step": 29428, "token_acc": 1.0, "train_speed(iter/s)": 0.957853 }, { "epoch": 0.9560146834291654, "grad_norm": 0.4044207036495209, "learning_rate": 5.280406205957822e-08, "loss": 0.008841611444950104, "memory(GiB)": 22.66, "step": 29429, "token_acc": 1.0, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.9560471688919209, "grad_norm": 0.4699242115020752, "learning_rate": 5.272623104725205e-08, "loss": 0.012890949845314026, "memory(GiB)": 22.66, "step": 29430, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957867 }, { "epoch": 0.9560796543546762, "grad_norm": 0.426096647977829, "learning_rate": 5.2648457133050446e-08, "loss": 0.01437612995505333, "memory(GiB)": 22.66, "step": 29431, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957874 }, { "epoch": 0.9561121398174317, "grad_norm": 0.24696645140647888, "learning_rate": 5.257074031787046e-08, "loss": 0.0070339678786695, "memory(GiB)": 22.66, "step": 29432, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95788 }, { "epoch": 0.9561446252801871, "grad_norm": 0.5034135580062866, "learning_rate": 5.2493080602609717e-08, "loss": 0.014767827466130257, "memory(GiB)": 22.66, "step": 29433, "token_acc": 0.9895104895104895, "train_speed(iter/s)": 0.957886 }, { "epoch": 0.9561771107429425, "grad_norm": 0.3960064649581909, "learning_rate": 5.2415477988162486e-08, "loss": 0.015687014907598495, "memory(GiB)": 22.66, "step": 29434, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957893 }, { "epoch": 0.9562095962056979, "grad_norm": 0.4315420687198639, "learning_rate": 5.2337932475426954e-08, "loss": 0.012426539324223995, "memory(GiB)": 22.66, "step": 29435, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.9579 }, { "epoch": 0.9562420816684534, "grad_norm": 0.4996715486049652, "learning_rate": 5.2260444065296845e-08, "loss": 0.014825256541371346, "memory(GiB)": 22.66, "step": 29436, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.957906 }, { "epoch": 0.9562745671312087, "grad_norm": 0.35418206453323364, "learning_rate": 5.2183012758667e-08, "loss": 0.010067232884466648, "memory(GiB)": 22.66, "step": 29437, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957913 }, { "epoch": 0.9563070525939642, "grad_norm": 0.4197428226470947, "learning_rate": 5.2105638556430026e-08, "loss": 0.018935810774564743, "memory(GiB)": 22.66, "step": 29438, "token_acc": 0.985981308411215, "train_speed(iter/s)": 0.95792 }, { "epoch": 0.9563395380567196, "grad_norm": 0.24764439463615417, "learning_rate": 5.202832145947967e-08, "loss": 0.008879560977220535, "memory(GiB)": 22.66, "step": 29439, "token_acc": 1.0, "train_speed(iter/s)": 0.957927 }, { "epoch": 0.956372023519475, "grad_norm": 0.3664761781692505, "learning_rate": 5.195106146870854e-08, "loss": 0.010650592856109142, "memory(GiB)": 22.66, "step": 29440, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957934 }, { "epoch": 0.9564045089822305, "grad_norm": 0.2530691921710968, "learning_rate": 5.18738585850076e-08, "loss": 0.010282632894814014, "memory(GiB)": 22.66, "step": 29441, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.95794 }, { "epoch": 0.9564369944449859, "grad_norm": 0.3836435377597809, "learning_rate": 5.17967128092689e-08, "loss": 0.012118369340896606, "memory(GiB)": 22.66, "step": 29442, "token_acc": 1.0, "train_speed(iter/s)": 0.957947 }, { "epoch": 0.9564694799077413, "grad_norm": 0.45642367005348206, "learning_rate": 5.171962414238174e-08, "loss": 0.011556295678019524, "memory(GiB)": 22.66, "step": 29443, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.957954 }, { "epoch": 0.9565019653704967, "grad_norm": 0.3273935317993164, "learning_rate": 5.164259258523652e-08, "loss": 0.00913683045655489, "memory(GiB)": 22.66, "step": 29444, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.957961 }, { "epoch": 0.9565344508332522, "grad_norm": 0.3685986399650574, "learning_rate": 5.1565618138721965e-08, "loss": 0.010880934074521065, "memory(GiB)": 22.66, "step": 29445, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.9565669362960075, "grad_norm": 0.2367803007364273, "learning_rate": 5.148870080372625e-08, "loss": 0.008519096300005913, "memory(GiB)": 22.66, "step": 29446, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957974 }, { "epoch": 0.956599421758763, "grad_norm": 0.3583446145057678, "learning_rate": 5.141184058113757e-08, "loss": 0.01231719832867384, "memory(GiB)": 22.66, "step": 29447, "token_acc": 1.0, "train_speed(iter/s)": 0.957981 }, { "epoch": 0.9566319072215184, "grad_norm": 0.3609967827796936, "learning_rate": 5.1335037471842984e-08, "loss": 0.011653097346425056, "memory(GiB)": 22.66, "step": 29448, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957988 }, { "epoch": 0.9566643926842738, "grad_norm": 0.28448915481567383, "learning_rate": 5.125829147672845e-08, "loss": 0.01080649346113205, "memory(GiB)": 22.66, "step": 29449, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.957995 }, { "epoch": 0.9566968781470292, "grad_norm": 0.490429550409317, "learning_rate": 5.1181602596679926e-08, "loss": 0.010305777192115784, "memory(GiB)": 22.66, "step": 29450, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958002 }, { "epoch": 0.9567293636097847, "grad_norm": 0.43746915459632874, "learning_rate": 5.1104970832582814e-08, "loss": 0.012192247435450554, "memory(GiB)": 22.66, "step": 29451, "token_acc": 1.0, "train_speed(iter/s)": 0.958008 }, { "epoch": 0.95676184907254, "grad_norm": 0.3109157681465149, "learning_rate": 5.1028396185321405e-08, "loss": 0.014059249311685562, "memory(GiB)": 22.66, "step": 29452, "token_acc": 0.9941176470588236, "train_speed(iter/s)": 0.958015 }, { "epoch": 0.9567943345352955, "grad_norm": 0.28831684589385986, "learning_rate": 5.095187865577888e-08, "loss": 0.008531701751053333, "memory(GiB)": 22.66, "step": 29453, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958022 }, { "epoch": 0.9568268199980509, "grad_norm": 0.36169034242630005, "learning_rate": 5.087541824483955e-08, "loss": 0.011043520644307137, "memory(GiB)": 22.66, "step": 29454, "token_acc": 1.0, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.9568593054608063, "grad_norm": 1.0091263055801392, "learning_rate": 5.0799014953384354e-08, "loss": 0.020563021302223206, "memory(GiB)": 22.66, "step": 29455, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958036 }, { "epoch": 0.9568917909235617, "grad_norm": 0.3593124449253082, "learning_rate": 5.072266878229648e-08, "loss": 0.009955947287380695, "memory(GiB)": 22.66, "step": 29456, "token_acc": 0.9938650306748467, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.9569242763863172, "grad_norm": 0.431779146194458, "learning_rate": 5.064637973245634e-08, "loss": 0.012979087419807911, "memory(GiB)": 22.66, "step": 29457, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958049 }, { "epoch": 0.9569567618490725, "grad_norm": 0.4818708598613739, "learning_rate": 5.0570147804744344e-08, "loss": 0.012674899771809578, "memory(GiB)": 22.66, "step": 29458, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958056 }, { "epoch": 0.956989247311828, "grad_norm": 0.399675190448761, "learning_rate": 5.0493973000040886e-08, "loss": 0.015336938202381134, "memory(GiB)": 22.66, "step": 29459, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958063 }, { "epoch": 0.9570217327745834, "grad_norm": 0.48049232363700867, "learning_rate": 5.041785531922472e-08, "loss": 0.013824020512402058, "memory(GiB)": 22.66, "step": 29460, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.958069 }, { "epoch": 0.9570542182373388, "grad_norm": 0.25161460041999817, "learning_rate": 5.034179476317402e-08, "loss": 0.008596040308475494, "memory(GiB)": 22.66, "step": 29461, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.958075 }, { "epoch": 0.9570867037000942, "grad_norm": 0.36407560110092163, "learning_rate": 5.026579133276699e-08, "loss": 0.008240500465035439, "memory(GiB)": 22.66, "step": 29462, "token_acc": 1.0, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.9571191891628497, "grad_norm": 0.38966479897499084, "learning_rate": 5.0189845028881244e-08, "loss": 0.012073272839188576, "memory(GiB)": 22.66, "step": 29463, "token_acc": 0.9926739926739927, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.957151674625605, "grad_norm": 0.4106777608394623, "learning_rate": 5.011395585239276e-08, "loss": 0.0128655219450593, "memory(GiB)": 22.66, "step": 29464, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.9571841600883605, "grad_norm": 0.36786404252052307, "learning_rate": 5.00381238041775e-08, "loss": 0.006922335363924503, "memory(GiB)": 22.66, "step": 29465, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.9572166455511159, "grad_norm": 0.3068099915981293, "learning_rate": 4.9962348885110314e-08, "loss": 0.012739483267068863, "memory(GiB)": 22.66, "step": 29466, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.958102 }, { "epoch": 0.9572491310138713, "grad_norm": 0.2268109917640686, "learning_rate": 4.988663109606662e-08, "loss": 0.0071746669709682465, "memory(GiB)": 22.66, "step": 29467, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.9572816164766267, "grad_norm": 0.5508272051811218, "learning_rate": 4.981097043791905e-08, "loss": 0.0128976721316576, "memory(GiB)": 22.66, "step": 29468, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.9573141019393822, "grad_norm": 0.3839898109436035, "learning_rate": 4.973536691154246e-08, "loss": 0.008861512877047062, "memory(GiB)": 22.66, "step": 29469, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.9573465874021375, "grad_norm": 0.24376949667930603, "learning_rate": 4.965982051780782e-08, "loss": 0.005691714584827423, "memory(GiB)": 22.66, "step": 29470, "token_acc": 1.0, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.957379072864893, "grad_norm": 0.2895646095275879, "learning_rate": 4.958433125758832e-08, "loss": 0.013787442818284035, "memory(GiB)": 22.66, "step": 29471, "token_acc": 1.0, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.9574115583276483, "grad_norm": 0.24007996916770935, "learning_rate": 4.950889913175383e-08, "loss": 0.00872781127691269, "memory(GiB)": 22.66, "step": 29472, "token_acc": 0.9964664310954063, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.9574440437904038, "grad_norm": 0.27904877066612244, "learning_rate": 4.9433524141176416e-08, "loss": 0.008747123181819916, "memory(GiB)": 22.66, "step": 29473, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.958135 }, { "epoch": 0.9574765292531592, "grad_norm": 0.3294191360473633, "learning_rate": 4.935820628672539e-08, "loss": 0.009849374182522297, "memory(GiB)": 22.66, "step": 29474, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.9575090147159147, "grad_norm": 0.28728196024894714, "learning_rate": 4.9282945569269494e-08, "loss": 0.009083257988095284, "memory(GiB)": 22.66, "step": 29475, "token_acc": 1.0, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.95754150017867, "grad_norm": 0.2712477147579193, "learning_rate": 4.920774198967804e-08, "loss": 0.006636711768805981, "memory(GiB)": 22.66, "step": 29476, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958149 }, { "epoch": 0.9575739856414255, "grad_norm": 0.34328439831733704, "learning_rate": 4.913259554881866e-08, "loss": 0.012431007809937, "memory(GiB)": 22.66, "step": 29477, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.958154 }, { "epoch": 0.9576064711041808, "grad_norm": 0.3791738450527191, "learning_rate": 4.905750624755901e-08, "loss": 0.013924935832619667, "memory(GiB)": 22.66, "step": 29478, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.9576389565669363, "grad_norm": 0.47289255261421204, "learning_rate": 4.898247408676504e-08, "loss": 0.01534947007894516, "memory(GiB)": 22.66, "step": 29479, "token_acc": 0.9939759036144579, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.9576714420296917, "grad_norm": 0.23841965198516846, "learning_rate": 4.89074990673033e-08, "loss": 0.007102844770997763, "memory(GiB)": 22.66, "step": 29480, "token_acc": 1.0, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.9577039274924471, "grad_norm": 0.39878788590431213, "learning_rate": 4.8832581190038644e-08, "loss": 0.014475845731794834, "memory(GiB)": 22.66, "step": 29481, "token_acc": 0.9765625, "train_speed(iter/s)": 0.958172 }, { "epoch": 0.9577364129552025, "grad_norm": 0.2580641210079193, "learning_rate": 4.875772045583649e-08, "loss": 0.009323597885668278, "memory(GiB)": 22.66, "step": 29482, "token_acc": 1.0, "train_speed(iter/s)": 0.958177 }, { "epoch": 0.957768898417958, "grad_norm": 1.828335165977478, "learning_rate": 4.868291686556004e-08, "loss": 0.016636120155453682, "memory(GiB)": 22.66, "step": 29483, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958182 }, { "epoch": 0.9578013838807133, "grad_norm": 0.2898293435573578, "learning_rate": 4.860817042007304e-08, "loss": 0.011842507869005203, "memory(GiB)": 22.66, "step": 29484, "token_acc": 1.0, "train_speed(iter/s)": 0.958186 }, { "epoch": 0.9578338693434688, "grad_norm": 0.44756072759628296, "learning_rate": 4.853348112023759e-08, "loss": 0.01498832181096077, "memory(GiB)": 22.66, "step": 29485, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.958191 }, { "epoch": 0.9578663548062242, "grad_norm": 0.27183422446250916, "learning_rate": 4.845884896691633e-08, "loss": 0.01007264293730259, "memory(GiB)": 22.66, "step": 29486, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.958196 }, { "epoch": 0.9578988402689796, "grad_norm": 0.270723819732666, "learning_rate": 4.8384273960970786e-08, "loss": 0.009500620886683464, "memory(GiB)": 22.66, "step": 29487, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.958203 }, { "epoch": 0.957931325731735, "grad_norm": 0.26053938269615173, "learning_rate": 4.830975610326083e-08, "loss": 0.008279666304588318, "memory(GiB)": 22.66, "step": 29488, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95821 }, { "epoch": 0.9579638111944905, "grad_norm": 0.43163374066352844, "learning_rate": 4.823529539464744e-08, "loss": 0.018507350236177444, "memory(GiB)": 22.66, "step": 29489, "token_acc": 0.9898477157360406, "train_speed(iter/s)": 0.958217 }, { "epoch": 0.9579962966572458, "grad_norm": 0.3584916293621063, "learning_rate": 4.8160891835989377e-08, "loss": 0.010562460869550705, "memory(GiB)": 22.66, "step": 29490, "token_acc": 1.0, "train_speed(iter/s)": 0.958224 }, { "epoch": 0.9580287821200013, "grad_norm": 0.27426108717918396, "learning_rate": 4.808654542814539e-08, "loss": 0.009172877296805382, "memory(GiB)": 22.66, "step": 29491, "token_acc": 1.0, "train_speed(iter/s)": 0.958231 }, { "epoch": 0.9580612675827567, "grad_norm": 0.30395403504371643, "learning_rate": 4.801225617197369e-08, "loss": 0.015504788607358932, "memory(GiB)": 22.66, "step": 29492, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.958238 }, { "epoch": 0.9580937530455121, "grad_norm": 0.5331217646598816, "learning_rate": 4.7938024068331925e-08, "loss": 0.007447416894137859, "memory(GiB)": 22.66, "step": 29493, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.958244 }, { "epoch": 0.9581262385082675, "grad_norm": 0.30776557326316833, "learning_rate": 4.786384911807607e-08, "loss": 0.010710534639656544, "memory(GiB)": 22.66, "step": 29494, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958252 }, { "epoch": 0.958158723971023, "grad_norm": 0.45743757486343384, "learning_rate": 4.778973132206322e-08, "loss": 0.009942613542079926, "memory(GiB)": 22.66, "step": 29495, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.958259 }, { "epoch": 0.9581912094337783, "grad_norm": 0.39472565054893494, "learning_rate": 4.771567068114769e-08, "loss": 0.014113407582044601, "memory(GiB)": 22.66, "step": 29496, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.958266 }, { "epoch": 0.9582236948965338, "grad_norm": 0.3044792413711548, "learning_rate": 4.764166719618546e-08, "loss": 0.00861554779112339, "memory(GiB)": 22.66, "step": 29497, "token_acc": 1.0, "train_speed(iter/s)": 0.958273 }, { "epoch": 0.9582561803592892, "grad_norm": 0.24385270476341248, "learning_rate": 4.756772086802919e-08, "loss": 0.009107464924454689, "memory(GiB)": 22.66, "step": 29498, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.95828 }, { "epoch": 0.9582886658220446, "grad_norm": 0.26702624559402466, "learning_rate": 4.749383169753319e-08, "loss": 0.007310403510928154, "memory(GiB)": 22.66, "step": 29499, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.958287 }, { "epoch": 0.9583211512848, "grad_norm": 0.2738857865333557, "learning_rate": 4.741999968555067e-08, "loss": 0.01090109720826149, "memory(GiB)": 22.66, "step": 29500, "token_acc": 1.0, "train_speed(iter/s)": 0.958294 }, { "epoch": 0.9583211512848, "eval_loss": 0.011118299327790737, "eval_runtime": 79.4655, "eval_samples_per_second": 125.212, "eval_steps_per_second": 3.914, "eval_token_acc": 0.9955175496853532, "step": 29500 }, { "epoch": 0.9583536367475555, "grad_norm": 0.4315597414970398, "learning_rate": 4.7346224832932606e-08, "loss": 0.013496069237589836, "memory(GiB)": 22.66, "step": 29501, "token_acc": 0.9949998630099455, "train_speed(iter/s)": 0.955458 }, { "epoch": 0.9583861222103108, "grad_norm": 0.4480857849121094, "learning_rate": 4.727250714053167e-08, "loss": 0.013755587860941887, "memory(GiB)": 22.66, "step": 29502, "token_acc": 0.9952380952380953, "train_speed(iter/s)": 0.955463 }, { "epoch": 0.9584186076730663, "grad_norm": 0.2951699495315552, "learning_rate": 4.719884660919771e-08, "loss": 0.010626100935041904, "memory(GiB)": 22.66, "step": 29503, "token_acc": 1.0, "train_speed(iter/s)": 0.955467 }, { "epoch": 0.9584510931358218, "grad_norm": 0.4338133633136749, "learning_rate": 4.7125243239781185e-08, "loss": 0.015572477132081985, "memory(GiB)": 22.66, "step": 29504, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.9584835785985771, "grad_norm": 0.41960257291793823, "learning_rate": 4.705169703313195e-08, "loss": 0.013687491416931152, "memory(GiB)": 22.66, "step": 29505, "token_acc": 1.0, "train_speed(iter/s)": 0.955476 }, { "epoch": 0.9585160640613326, "grad_norm": 0.24797478318214417, "learning_rate": 4.6978207990098224e-08, "loss": 0.007032185327261686, "memory(GiB)": 22.66, "step": 29506, "token_acc": 1.0, "train_speed(iter/s)": 0.955481 }, { "epoch": 0.958548549524088, "grad_norm": 0.3165810704231262, "learning_rate": 4.690477611152877e-08, "loss": 0.010972030460834503, "memory(GiB)": 22.66, "step": 29507, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955485 }, { "epoch": 0.9585810349868434, "grad_norm": 0.24907714128494263, "learning_rate": 4.683140139827014e-08, "loss": 0.012649355456233025, "memory(GiB)": 22.66, "step": 29508, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95549 }, { "epoch": 0.9586135204495988, "grad_norm": 0.3543699383735657, "learning_rate": 4.675808385116998e-08, "loss": 0.012250731699168682, "memory(GiB)": 22.66, "step": 29509, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.9586460059123543, "grad_norm": 0.30099642276763916, "learning_rate": 4.6684823471074836e-08, "loss": 0.007019108161330223, "memory(GiB)": 22.66, "step": 29510, "token_acc": 1.0, "train_speed(iter/s)": 0.955489 }, { "epoch": 0.9586784913751096, "grad_norm": 0.23018576204776764, "learning_rate": 4.661162025882904e-08, "loss": 0.007612451910972595, "memory(GiB)": 22.66, "step": 29511, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955494 }, { "epoch": 0.9587109768378651, "grad_norm": 0.332741379737854, "learning_rate": 4.6538474215278574e-08, "loss": 0.012509353458881378, "memory(GiB)": 22.66, "step": 29512, "token_acc": 1.0, "train_speed(iter/s)": 0.955499 }, { "epoch": 0.9587434623006205, "grad_norm": 0.26750439405441284, "learning_rate": 4.6465385341266656e-08, "loss": 0.006325067952275276, "memory(GiB)": 22.66, "step": 29513, "token_acc": 1.0, "train_speed(iter/s)": 0.955505 }, { "epoch": 0.9587759477633759, "grad_norm": 0.4189891517162323, "learning_rate": 4.639235363763761e-08, "loss": 0.015683740377426147, "memory(GiB)": 22.66, "step": 29514, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.9588084332261313, "grad_norm": 0.4833511710166931, "learning_rate": 4.6319379105234094e-08, "loss": 0.016288161277770996, "memory(GiB)": 22.66, "step": 29515, "token_acc": 1.0, "train_speed(iter/s)": 0.955516 }, { "epoch": 0.9588409186888868, "grad_norm": 0.36730891466140747, "learning_rate": 4.6246461744898774e-08, "loss": 0.010820399969816208, "memory(GiB)": 22.66, "step": 29516, "token_acc": 1.0, "train_speed(iter/s)": 0.955521 }, { "epoch": 0.9588734041516421, "grad_norm": 0.20865437388420105, "learning_rate": 4.617360155747208e-08, "loss": 0.008321326225996017, "memory(GiB)": 22.66, "step": 29517, "token_acc": 1.0, "train_speed(iter/s)": 0.955526 }, { "epoch": 0.9589058896143976, "grad_norm": 0.3028903603553772, "learning_rate": 4.610079854379501e-08, "loss": 0.0111131202429533, "memory(GiB)": 22.66, "step": 29518, "token_acc": 1.0, "train_speed(iter/s)": 0.95553 }, { "epoch": 0.958938375077153, "grad_norm": 0.9503821730613708, "learning_rate": 4.6028052704709115e-08, "loss": 0.01902163401246071, "memory(GiB)": 22.66, "step": 29519, "token_acc": 0.9728506787330317, "train_speed(iter/s)": 0.955535 }, { "epoch": 0.9589708605399084, "grad_norm": 0.2711530327796936, "learning_rate": 4.5955364041053166e-08, "loss": 0.007216949947178364, "memory(GiB)": 22.66, "step": 29520, "token_acc": 1.0, "train_speed(iter/s)": 0.955541 }, { "epoch": 0.9590033460026638, "grad_norm": 0.44003379344940186, "learning_rate": 4.5882732553665933e-08, "loss": 0.019509930163621902, "memory(GiB)": 22.66, "step": 29521, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955547 }, { "epoch": 0.9590358314654193, "grad_norm": 0.26816773414611816, "learning_rate": 4.581015824338619e-08, "loss": 0.009594345465302467, "memory(GiB)": 22.66, "step": 29522, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955552 }, { "epoch": 0.9590683169281746, "grad_norm": 0.35620537400245667, "learning_rate": 4.573764111105106e-08, "loss": 0.00948396883904934, "memory(GiB)": 22.66, "step": 29523, "token_acc": 1.0, "train_speed(iter/s)": 0.955559 }, { "epoch": 0.9591008023909301, "grad_norm": 0.25515198707580566, "learning_rate": 4.5665181157497075e-08, "loss": 0.006898974068462849, "memory(GiB)": 22.66, "step": 29524, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.9591332878536855, "grad_norm": 0.3268054723739624, "learning_rate": 4.559277838356191e-08, "loss": 0.012740744277834892, "memory(GiB)": 22.66, "step": 29525, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955573 }, { "epoch": 0.9591657733164409, "grad_norm": 0.2680032253265381, "learning_rate": 4.552043279008045e-08, "loss": 0.004462746437638998, "memory(GiB)": 22.66, "step": 29526, "token_acc": 1.0, "train_speed(iter/s)": 0.95558 }, { "epoch": 0.9591982587791963, "grad_norm": 0.2878662049770355, "learning_rate": 4.5448144377887024e-08, "loss": 0.013369828462600708, "memory(GiB)": 22.66, "step": 29527, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.955586 }, { "epoch": 0.9592307442419518, "grad_norm": 0.381634920835495, "learning_rate": 4.537591314781653e-08, "loss": 0.011174245737493038, "memory(GiB)": 22.66, "step": 29528, "token_acc": 1.0, "train_speed(iter/s)": 0.955593 }, { "epoch": 0.9592632297047071, "grad_norm": 0.3337627947330475, "learning_rate": 4.530373910070329e-08, "loss": 0.011573923751711845, "memory(GiB)": 22.66, "step": 29529, "token_acc": 1.0, "train_speed(iter/s)": 0.955601 }, { "epoch": 0.9592957151674626, "grad_norm": 0.36653947830200195, "learning_rate": 4.523162223737887e-08, "loss": 0.014440391212701797, "memory(GiB)": 22.66, "step": 29530, "token_acc": 1.0, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.959328200630218, "grad_norm": 0.38765013217926025, "learning_rate": 4.5159562558676483e-08, "loss": 0.013629731722176075, "memory(GiB)": 22.66, "step": 29531, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955615 }, { "epoch": 0.9593606860929734, "grad_norm": 0.2515914738178253, "learning_rate": 4.5087560065427695e-08, "loss": 0.006315208505839109, "memory(GiB)": 22.66, "step": 29532, "token_acc": 1.0, "train_speed(iter/s)": 0.955622 }, { "epoch": 0.9593931715557288, "grad_norm": 0.3839140236377716, "learning_rate": 4.5015614758463497e-08, "loss": 0.01171928271651268, "memory(GiB)": 22.66, "step": 29533, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.9594256570184843, "grad_norm": 0.34026235342025757, "learning_rate": 4.4943726638613794e-08, "loss": 0.012414418160915375, "memory(GiB)": 22.66, "step": 29534, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955635 }, { "epoch": 0.9594581424812396, "grad_norm": 0.40167757868766785, "learning_rate": 4.4871895706709025e-08, "loss": 0.009241372346878052, "memory(GiB)": 22.66, "step": 29535, "token_acc": 1.0, "train_speed(iter/s)": 0.955642 }, { "epoch": 0.9594906279439951, "grad_norm": 0.29502353072166443, "learning_rate": 4.480012196357797e-08, "loss": 0.010997025296092033, "memory(GiB)": 22.66, "step": 29536, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955649 }, { "epoch": 0.9595231134067504, "grad_norm": 0.3175913393497467, "learning_rate": 4.47284054100483e-08, "loss": 0.01093947421759367, "memory(GiB)": 22.66, "step": 29537, "token_acc": 1.0, "train_speed(iter/s)": 0.955655 }, { "epoch": 0.9595555988695059, "grad_norm": 0.2989027500152588, "learning_rate": 4.4656746046948805e-08, "loss": 0.009686857461929321, "memory(GiB)": 22.66, "step": 29538, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955663 }, { "epoch": 0.9595880843322613, "grad_norm": 0.40286433696746826, "learning_rate": 4.458514387510604e-08, "loss": 0.014405287802219391, "memory(GiB)": 22.66, "step": 29539, "token_acc": 1.0, "train_speed(iter/s)": 0.95567 }, { "epoch": 0.9596205697950168, "grad_norm": 0.32990148663520813, "learning_rate": 4.4513598895346563e-08, "loss": 0.01067243330180645, "memory(GiB)": 22.66, "step": 29540, "token_acc": 0.9880478087649402, "train_speed(iter/s)": 0.955677 }, { "epoch": 0.9596530552577721, "grad_norm": 0.24497897922992706, "learning_rate": 4.4442111108494724e-08, "loss": 0.010614508762955666, "memory(GiB)": 22.66, "step": 29541, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.955683 }, { "epoch": 0.9596855407205276, "grad_norm": 0.2651793658733368, "learning_rate": 4.437068051537763e-08, "loss": 0.005898621864616871, "memory(GiB)": 22.66, "step": 29542, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.95569 }, { "epoch": 0.959718026183283, "grad_norm": 0.2829843759536743, "learning_rate": 4.4299307116819065e-08, "loss": 0.008321892470121384, "memory(GiB)": 22.66, "step": 29543, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.955698 }, { "epoch": 0.9597505116460384, "grad_norm": 0.33286741375923157, "learning_rate": 4.422799091364227e-08, "loss": 0.008690135553479195, "memory(GiB)": 22.66, "step": 29544, "token_acc": 1.0, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.9597829971087938, "grad_norm": 0.32473501563072205, "learning_rate": 4.415673190667047e-08, "loss": 0.014025181531906128, "memory(GiB)": 22.66, "step": 29545, "token_acc": 1.0, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.9598154825715492, "grad_norm": 0.3575865924358368, "learning_rate": 4.408553009672578e-08, "loss": 0.014645576477050781, "memory(GiB)": 22.66, "step": 29546, "token_acc": 0.991701244813278, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.9598479680343046, "grad_norm": 0.3773026764392853, "learning_rate": 4.4014385484630887e-08, "loss": 0.007821723818778992, "memory(GiB)": 22.66, "step": 29547, "token_acc": 1.0, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.9598804534970601, "grad_norm": 0.34860092401504517, "learning_rate": 4.394329807120623e-08, "loss": 0.015203313902020454, "memory(GiB)": 22.66, "step": 29548, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955732 }, { "epoch": 0.9599129389598154, "grad_norm": 0.5276046991348267, "learning_rate": 4.3872267857272834e-08, "loss": 0.016437161713838577, "memory(GiB)": 22.66, "step": 29549, "token_acc": 1.0, "train_speed(iter/s)": 0.955738 }, { "epoch": 0.9599454244225709, "grad_norm": 0.4775412082672119, "learning_rate": 4.380129484364948e-08, "loss": 0.011661099269986153, "memory(GiB)": 22.66, "step": 29550, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.955743 }, { "epoch": 0.9599779098853263, "grad_norm": 0.4072597622871399, "learning_rate": 4.373037903115607e-08, "loss": 0.01515034306794405, "memory(GiB)": 22.66, "step": 29551, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.955748 }, { "epoch": 0.9600103953480817, "grad_norm": 0.38416457176208496, "learning_rate": 4.3659520420610834e-08, "loss": 0.011844013817608356, "memory(GiB)": 22.66, "step": 29552, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.955753 }, { "epoch": 0.9600428808108371, "grad_norm": 0.437549889087677, "learning_rate": 4.3588719012832014e-08, "loss": 0.011502319015562534, "memory(GiB)": 22.66, "step": 29553, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.9600753662735926, "grad_norm": 0.3390171527862549, "learning_rate": 4.351797480863562e-08, "loss": 0.013737771660089493, "memory(GiB)": 22.66, "step": 29554, "token_acc": 0.9809523809523809, "train_speed(iter/s)": 0.955763 }, { "epoch": 0.9601078517363479, "grad_norm": 0.4938003420829773, "learning_rate": 4.344728780883933e-08, "loss": 0.014398589730262756, "memory(GiB)": 22.66, "step": 29555, "token_acc": 1.0, "train_speed(iter/s)": 0.955768 }, { "epoch": 0.9601403371991034, "grad_norm": 0.24313080310821533, "learning_rate": 4.3376658014258055e-08, "loss": 0.008502652868628502, "memory(GiB)": 22.66, "step": 29556, "token_acc": 1.0, "train_speed(iter/s)": 0.955774 }, { "epoch": 0.9601728226618588, "grad_norm": 0.2990081310272217, "learning_rate": 4.330608542570836e-08, "loss": 0.011704150587320328, "memory(GiB)": 22.66, "step": 29557, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955778 }, { "epoch": 0.9602053081246142, "grad_norm": 0.38458800315856934, "learning_rate": 4.323557004400292e-08, "loss": 0.011051834560930729, "memory(GiB)": 22.66, "step": 29558, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.955784 }, { "epoch": 0.9602377935873696, "grad_norm": 0.27521616220474243, "learning_rate": 4.3165111869956645e-08, "loss": 0.010317161679267883, "memory(GiB)": 22.66, "step": 29559, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.955789 }, { "epoch": 0.9602702790501251, "grad_norm": 0.3130762279033661, "learning_rate": 4.309471090438222e-08, "loss": 0.010474126785993576, "memory(GiB)": 22.66, "step": 29560, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.955794 }, { "epoch": 0.9603027645128804, "grad_norm": 0.528946042060852, "learning_rate": 4.302436714809344e-08, "loss": 0.02807823196053505, "memory(GiB)": 22.66, "step": 29561, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.955799 }, { "epoch": 0.9603352499756359, "grad_norm": 0.3036172389984131, "learning_rate": 4.2954080601900204e-08, "loss": 0.010800482705235481, "memory(GiB)": 22.66, "step": 29562, "token_acc": 1.0, "train_speed(iter/s)": 0.955804 }, { "epoch": 0.9603677354383913, "grad_norm": 0.33868545293807983, "learning_rate": 4.288385126661465e-08, "loss": 0.011015230789780617, "memory(GiB)": 22.66, "step": 29563, "token_acc": 1.0, "train_speed(iter/s)": 0.955808 }, { "epoch": 0.9604002209011467, "grad_norm": 0.5136125683784485, "learning_rate": 4.2813679143047793e-08, "loss": 0.01366911455988884, "memory(GiB)": 22.66, "step": 29564, "token_acc": 0.9845559845559846, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.9604327063639021, "grad_norm": 0.3773714303970337, "learning_rate": 4.2743564232008983e-08, "loss": 0.011075002141296864, "memory(GiB)": 22.66, "step": 29565, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955818 }, { "epoch": 0.9604651918266576, "grad_norm": 0.25424182415008545, "learning_rate": 4.267350653430757e-08, "loss": 0.0069639962166547775, "memory(GiB)": 22.66, "step": 29566, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.955823 }, { "epoch": 0.9604976772894129, "grad_norm": 0.49393588304519653, "learning_rate": 4.2603506050752365e-08, "loss": 0.012061523273587227, "memory(GiB)": 22.66, "step": 29567, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.955827 }, { "epoch": 0.9605301627521684, "grad_norm": 0.3987717926502228, "learning_rate": 4.253356278215104e-08, "loss": 0.010764623060822487, "memory(GiB)": 22.66, "step": 29568, "token_acc": 1.0, "train_speed(iter/s)": 0.95583 }, { "epoch": 0.9605626482149239, "grad_norm": 0.4086519181728363, "learning_rate": 4.246367672930962e-08, "loss": 0.014540945179760456, "memory(GiB)": 22.66, "step": 29569, "token_acc": 1.0, "train_speed(iter/s)": 0.955835 }, { "epoch": 0.9605951336776792, "grad_norm": 0.30861592292785645, "learning_rate": 4.239384789303691e-08, "loss": 0.009606387466192245, "memory(GiB)": 22.66, "step": 29570, "token_acc": 1.0, "train_speed(iter/s)": 0.955841 }, { "epoch": 0.9606276191404347, "grad_norm": 0.4072684645652771, "learning_rate": 4.232407627413781e-08, "loss": 0.015306019224226475, "memory(GiB)": 22.66, "step": 29571, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.9606601046031901, "grad_norm": 0.35590896010398865, "learning_rate": 4.225436187341725e-08, "loss": 0.009942637756466866, "memory(GiB)": 22.66, "step": 29572, "token_acc": 1.0, "train_speed(iter/s)": 0.955851 }, { "epoch": 0.9606925900659455, "grad_norm": 0.258682519197464, "learning_rate": 4.2184704691680125e-08, "loss": 0.00867126788944006, "memory(GiB)": 22.66, "step": 29573, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.955856 }, { "epoch": 0.9607250755287009, "grad_norm": 0.7985286712646484, "learning_rate": 4.211510472972968e-08, "loss": 0.022143356502056122, "memory(GiB)": 22.66, "step": 29574, "token_acc": 0.9836734693877551, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.9607575609914564, "grad_norm": 0.6943411231040955, "learning_rate": 4.2045561988370844e-08, "loss": 0.009520616382360458, "memory(GiB)": 22.66, "step": 29575, "token_acc": 1.0, "train_speed(iter/s)": 0.955868 }, { "epoch": 0.9607900464542117, "grad_norm": 0.4026351869106293, "learning_rate": 4.197607646840462e-08, "loss": 0.011627549305558205, "memory(GiB)": 22.66, "step": 29576, "token_acc": 0.9823321554770318, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.9608225319169672, "grad_norm": 0.15878631174564362, "learning_rate": 4.190664817063372e-08, "loss": 0.003669285448268056, "memory(GiB)": 22.66, "step": 29577, "token_acc": 1.0, "train_speed(iter/s)": 0.955879 }, { "epoch": 0.9608550173797226, "grad_norm": 0.9351999759674072, "learning_rate": 4.1837277095859716e-08, "loss": 0.011665918864309788, "memory(GiB)": 22.66, "step": 29578, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.955884 }, { "epoch": 0.960887502842478, "grad_norm": 0.3028418719768524, "learning_rate": 4.176796324488197e-08, "loss": 0.007919864729046822, "memory(GiB)": 22.66, "step": 29579, "token_acc": 1.0, "train_speed(iter/s)": 0.955883 }, { "epoch": 0.9609199883052334, "grad_norm": 0.33594897389411926, "learning_rate": 4.169870661850206e-08, "loss": 0.01096787303686142, "memory(GiB)": 22.66, "step": 29580, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.955888 }, { "epoch": 0.9609524737679889, "grad_norm": 0.4756845235824585, "learning_rate": 4.1629507217518797e-08, "loss": 0.014611461199820042, "memory(GiB)": 22.66, "step": 29581, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955893 }, { "epoch": 0.9609849592307442, "grad_norm": 0.3581702411174774, "learning_rate": 4.1560365042729864e-08, "loss": 0.013139024376869202, "memory(GiB)": 22.66, "step": 29582, "token_acc": 1.0, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.9610174446934997, "grad_norm": 0.4505704939365387, "learning_rate": 4.1491280094934084e-08, "loss": 0.015139734372496605, "memory(GiB)": 22.66, "step": 29583, "token_acc": 0.9821428571428571, "train_speed(iter/s)": 0.955903 }, { "epoch": 0.9610499301562551, "grad_norm": 0.19649368524551392, "learning_rate": 4.142225237492914e-08, "loss": 0.005285586230456829, "memory(GiB)": 22.66, "step": 29584, "token_acc": 1.0, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.9610824156190105, "grad_norm": 0.4506218731403351, "learning_rate": 4.135328188351106e-08, "loss": 0.009899720549583435, "memory(GiB)": 22.66, "step": 29585, "token_acc": 1.0, "train_speed(iter/s)": 0.955915 }, { "epoch": 0.9611149010817659, "grad_norm": 0.36575376987457275, "learning_rate": 4.128436862147589e-08, "loss": 0.013471100479364395, "memory(GiB)": 22.66, "step": 29586, "token_acc": 0.9933333333333333, "train_speed(iter/s)": 0.95592 }, { "epoch": 0.9611473865445214, "grad_norm": 0.30999046564102173, "learning_rate": 4.121551258961909e-08, "loss": 0.010271292179822922, "memory(GiB)": 22.66, "step": 29587, "token_acc": 1.0, "train_speed(iter/s)": 0.955926 }, { "epoch": 0.9611798720072767, "grad_norm": 0.3551611006259918, "learning_rate": 4.114671378873558e-08, "loss": 0.011326245963573456, "memory(GiB)": 22.66, "step": 29588, "token_acc": 1.0, "train_speed(iter/s)": 0.955932 }, { "epoch": 0.9612123574700322, "grad_norm": 0.4011225402355194, "learning_rate": 4.1077972219619176e-08, "loss": 0.015695879235863686, "memory(GiB)": 22.66, "step": 29589, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955939 }, { "epoch": 0.9612448429327876, "grad_norm": 0.46095776557922363, "learning_rate": 4.1009287883063686e-08, "loss": 0.01681293174624443, "memory(GiB)": 22.66, "step": 29590, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955946 }, { "epoch": 0.961277328395543, "grad_norm": 0.332388699054718, "learning_rate": 4.09406607798607e-08, "loss": 0.01759534887969494, "memory(GiB)": 22.66, "step": 29591, "token_acc": 1.0, "train_speed(iter/s)": 0.955952 }, { "epoch": 0.9613098138582984, "grad_norm": 0.40775105357170105, "learning_rate": 4.087209091080346e-08, "loss": 0.010135270655155182, "memory(GiB)": 22.66, "step": 29592, "token_acc": 1.0, "train_speed(iter/s)": 0.955959 }, { "epoch": 0.9613422993210539, "grad_norm": 0.4185188412666321, "learning_rate": 4.080357827668246e-08, "loss": 0.012233978137373924, "memory(GiB)": 22.66, "step": 29593, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.9613747847838092, "grad_norm": 0.36555787920951843, "learning_rate": 4.073512287828929e-08, "loss": 0.014755995944142342, "memory(GiB)": 22.66, "step": 29594, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.9614072702465647, "grad_norm": 0.3162882328033447, "learning_rate": 4.066672471641331e-08, "loss": 0.011034386232495308, "memory(GiB)": 22.66, "step": 29595, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.96143975570932, "grad_norm": 0.28284090757369995, "learning_rate": 4.059838379184444e-08, "loss": 0.0089598149061203, "memory(GiB)": 22.66, "step": 29596, "token_acc": 1.0, "train_speed(iter/s)": 0.955987 }, { "epoch": 0.9614722411720755, "grad_norm": 0.2869265675544739, "learning_rate": 4.0530100105370394e-08, "loss": 0.010693080723285675, "memory(GiB)": 22.66, "step": 29597, "token_acc": 1.0, "train_speed(iter/s)": 0.955994 }, { "epoch": 0.9615047266348309, "grad_norm": 0.2214605212211609, "learning_rate": 4.0461873657781095e-08, "loss": 0.008512336760759354, "memory(GiB)": 22.66, "step": 29598, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956002 }, { "epoch": 0.9615372120975864, "grad_norm": 0.3389335572719574, "learning_rate": 4.039370444986257e-08, "loss": 0.007398006971925497, "memory(GiB)": 22.66, "step": 29599, "token_acc": 1.0, "train_speed(iter/s)": 0.956008 }, { "epoch": 0.9615696975603417, "grad_norm": 0.3372940123081207, "learning_rate": 4.0325592482401975e-08, "loss": 0.008859790861606598, "memory(GiB)": 22.66, "step": 29600, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956015 }, { "epoch": 0.9616021830230972, "grad_norm": 0.4706648290157318, "learning_rate": 4.0257537756184796e-08, "loss": 0.02072799950838089, "memory(GiB)": 22.66, "step": 29601, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.9616346684858526, "grad_norm": 0.26439473032951355, "learning_rate": 4.0189540271997615e-08, "loss": 0.008988925255835056, "memory(GiB)": 22.66, "step": 29602, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.961667153948608, "grad_norm": 0.5316608548164368, "learning_rate": 4.012160003062426e-08, "loss": 0.01299172081053257, "memory(GiB)": 22.66, "step": 29603, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956036 }, { "epoch": 0.9616996394113634, "grad_norm": 0.2823726236820221, "learning_rate": 4.005371703284966e-08, "loss": 0.010842963121831417, "memory(GiB)": 22.66, "step": 29604, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956043 }, { "epoch": 0.9617321248741189, "grad_norm": 0.4141598045825958, "learning_rate": 3.998589127945651e-08, "loss": 0.010991416871547699, "memory(GiB)": 22.66, "step": 29605, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.95605 }, { "epoch": 0.9617646103368742, "grad_norm": 0.4049094021320343, "learning_rate": 3.991812277122753e-08, "loss": 0.01202603429555893, "memory(GiB)": 22.66, "step": 29606, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.9617970957996297, "grad_norm": 0.2847476303577423, "learning_rate": 3.985041150894597e-08, "loss": 0.010303967632353306, "memory(GiB)": 22.66, "step": 29607, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956064 }, { "epoch": 0.961829581262385, "grad_norm": 0.3148459494113922, "learning_rate": 3.978275749339233e-08, "loss": 0.010314179584383965, "memory(GiB)": 22.66, "step": 29608, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.9618620667251405, "grad_norm": 0.3256799876689911, "learning_rate": 3.9715160725347645e-08, "loss": 0.011353341862559319, "memory(GiB)": 22.66, "step": 29609, "token_acc": 1.0, "train_speed(iter/s)": 0.956075 }, { "epoch": 0.9618945521878959, "grad_norm": 0.20665371417999268, "learning_rate": 3.96476212055924e-08, "loss": 0.00634723249822855, "memory(GiB)": 22.66, "step": 29610, "token_acc": 0.993421052631579, "train_speed(iter/s)": 0.956081 }, { "epoch": 0.9619270376506514, "grad_norm": 0.3880239725112915, "learning_rate": 3.958013893490542e-08, "loss": 0.009463883936405182, "memory(GiB)": 22.66, "step": 29611, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956087 }, { "epoch": 0.9619595231134067, "grad_norm": 0.29712483286857605, "learning_rate": 3.951271391406608e-08, "loss": 0.005812487564980984, "memory(GiB)": 22.66, "step": 29612, "token_acc": 1.0, "train_speed(iter/s)": 0.956092 }, { "epoch": 0.9619920085761622, "grad_norm": 0.2983141839504242, "learning_rate": 3.944534614385265e-08, "loss": 0.007607668172568083, "memory(GiB)": 22.66, "step": 29613, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956097 }, { "epoch": 0.9620244940389175, "grad_norm": 0.2518186867237091, "learning_rate": 3.937803562504228e-08, "loss": 0.005339461378753185, "memory(GiB)": 22.66, "step": 29614, "token_acc": 1.0, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.962056979501673, "grad_norm": 0.369806170463562, "learning_rate": 3.931078235841212e-08, "loss": 0.014718526974320412, "memory(GiB)": 22.66, "step": 29615, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956108 }, { "epoch": 0.9620894649644284, "grad_norm": 0.36207810044288635, "learning_rate": 3.924358634473768e-08, "loss": 0.0073953536339104176, "memory(GiB)": 22.66, "step": 29616, "token_acc": 1.0, "train_speed(iter/s)": 0.956113 }, { "epoch": 0.9621219504271838, "grad_norm": 0.4809867739677429, "learning_rate": 3.917644758479555e-08, "loss": 0.008391526527702808, "memory(GiB)": 22.66, "step": 29617, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956118 }, { "epoch": 0.9621544358899392, "grad_norm": 0.2797384560108185, "learning_rate": 3.9109366079359556e-08, "loss": 0.006739430595189333, "memory(GiB)": 22.66, "step": 29618, "token_acc": 0.9895833333333334, "train_speed(iter/s)": 0.956123 }, { "epoch": 0.9621869213526947, "grad_norm": 0.41199490427970886, "learning_rate": 3.9042341829205186e-08, "loss": 0.015291599556803703, "memory(GiB)": 22.66, "step": 29619, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956128 }, { "epoch": 0.96221940681545, "grad_norm": 0.3841497004032135, "learning_rate": 3.897537483510405e-08, "loss": 0.010027183219790459, "memory(GiB)": 22.66, "step": 29620, "token_acc": 1.0, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.9622518922782055, "grad_norm": 0.472393661737442, "learning_rate": 3.890846509783108e-08, "loss": 0.007584506645798683, "memory(GiB)": 22.66, "step": 29621, "token_acc": 1.0, "train_speed(iter/s)": 0.956138 }, { "epoch": 0.9622843777409609, "grad_norm": 0.37382209300994873, "learning_rate": 3.884161261815733e-08, "loss": 0.010489502921700478, "memory(GiB)": 22.66, "step": 29622, "token_acc": 1.0, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.9623168632037163, "grad_norm": 0.3686365485191345, "learning_rate": 3.877481739685496e-08, "loss": 0.010694400407373905, "memory(GiB)": 22.66, "step": 29623, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95615 }, { "epoch": 0.9623493486664717, "grad_norm": 0.3925243318080902, "learning_rate": 3.870807943469446e-08, "loss": 0.010691412724554539, "memory(GiB)": 22.66, "step": 29624, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.9623818341292272, "grad_norm": 0.4604581296443939, "learning_rate": 3.864139873244577e-08, "loss": 0.0093260258436203, "memory(GiB)": 22.66, "step": 29625, "token_acc": 1.0, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.9624143195919825, "grad_norm": 0.4458922743797302, "learning_rate": 3.8574775290879383e-08, "loss": 0.019462306052446365, "memory(GiB)": 22.66, "step": 29626, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956166 }, { "epoch": 0.962446805054738, "grad_norm": 0.2735140919685364, "learning_rate": 3.850820911076358e-08, "loss": 0.010066566057503223, "memory(GiB)": 22.66, "step": 29627, "token_acc": 0.9911504424778761, "train_speed(iter/s)": 0.95617 }, { "epoch": 0.9624792905174934, "grad_norm": 0.2844338119029999, "learning_rate": 3.844170019286664e-08, "loss": 0.009562505409121513, "memory(GiB)": 22.66, "step": 29628, "token_acc": 0.995, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.9625117759802488, "grad_norm": 0.37782901525497437, "learning_rate": 3.837524853795627e-08, "loss": 0.01291192602366209, "memory(GiB)": 22.66, "step": 29629, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.9625442614430042, "grad_norm": 0.3633713126182556, "learning_rate": 3.830885414679963e-08, "loss": 0.0053107887506484985, "memory(GiB)": 22.66, "step": 29630, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956186 }, { "epoch": 0.9625767469057597, "grad_norm": 0.315935343503952, "learning_rate": 3.8242517020162794e-08, "loss": 0.013529572635889053, "memory(GiB)": 22.66, "step": 29631, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956191 }, { "epoch": 0.9626092323685151, "grad_norm": 0.3347724676132202, "learning_rate": 3.817623715881125e-08, "loss": 0.008773826994001865, "memory(GiB)": 22.66, "step": 29632, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.956196 }, { "epoch": 0.9626417178312705, "grad_norm": 0.2803630232810974, "learning_rate": 3.811001456351049e-08, "loss": 0.008318116888403893, "memory(GiB)": 22.66, "step": 29633, "token_acc": 1.0, "train_speed(iter/s)": 0.956201 }, { "epoch": 0.962674203294026, "grad_norm": 0.2730863094329834, "learning_rate": 3.80438492350238e-08, "loss": 0.007539277896285057, "memory(GiB)": 22.66, "step": 29634, "token_acc": 1.0, "train_speed(iter/s)": 0.956206 }, { "epoch": 0.9627066887567813, "grad_norm": 0.3247336149215698, "learning_rate": 3.797774117411557e-08, "loss": 0.008668768219649792, "memory(GiB)": 22.66, "step": 29635, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.9627391742195368, "grad_norm": 0.10290627181529999, "learning_rate": 3.791169038154907e-08, "loss": 0.0018487486522644758, "memory(GiB)": 22.66, "step": 29636, "token_acc": 1.0, "train_speed(iter/s)": 0.956217 }, { "epoch": 0.9627716596822922, "grad_norm": 0.35685819387435913, "learning_rate": 3.784569685808592e-08, "loss": 0.009081033058464527, "memory(GiB)": 22.66, "step": 29637, "token_acc": 1.0, "train_speed(iter/s)": 0.956222 }, { "epoch": 0.9628041451450476, "grad_norm": 0.26208969950675964, "learning_rate": 3.777976060448829e-08, "loss": 0.007633680943399668, "memory(GiB)": 22.66, "step": 29638, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956227 }, { "epoch": 0.962836630607803, "grad_norm": 0.2010698765516281, "learning_rate": 3.771388162151668e-08, "loss": 0.004936474375426769, "memory(GiB)": 22.66, "step": 29639, "token_acc": 1.0, "train_speed(iter/s)": 0.956233 }, { "epoch": 0.9628691160705585, "grad_norm": 0.4742862582206726, "learning_rate": 3.764805990993159e-08, "loss": 0.01153123565018177, "memory(GiB)": 22.66, "step": 29640, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.956238 }, { "epoch": 0.9629016015333138, "grad_norm": 0.30056101083755493, "learning_rate": 3.758229547049297e-08, "loss": 0.01418921910226345, "memory(GiB)": 22.66, "step": 29641, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.956244 }, { "epoch": 0.9629340869960693, "grad_norm": 0.3441597521305084, "learning_rate": 3.75165883039591e-08, "loss": 0.013087639585137367, "memory(GiB)": 22.66, "step": 29642, "token_acc": 1.0, "train_speed(iter/s)": 0.956249 }, { "epoch": 0.9629665724588247, "grad_norm": 0.3250619173049927, "learning_rate": 3.745093841108938e-08, "loss": 0.012291046790778637, "memory(GiB)": 22.66, "step": 29643, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.9629990579215801, "grad_norm": 0.2989547550678253, "learning_rate": 3.738534579264097e-08, "loss": 0.0071833450347185135, "memory(GiB)": 22.66, "step": 29644, "token_acc": 1.0, "train_speed(iter/s)": 0.956261 }, { "epoch": 0.9630315433843355, "grad_norm": 0.20123551785945892, "learning_rate": 3.7319810449371054e-08, "loss": 0.005156803876161575, "memory(GiB)": 22.66, "step": 29645, "token_acc": 1.0, "train_speed(iter/s)": 0.956266 }, { "epoch": 0.963064028847091, "grad_norm": 0.31612199544906616, "learning_rate": 3.725433238203513e-08, "loss": 0.010663188062608242, "memory(GiB)": 22.66, "step": 29646, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956271 }, { "epoch": 0.9630965143098463, "grad_norm": 0.32548075914382935, "learning_rate": 3.7188911591390375e-08, "loss": 0.019710475578904152, "memory(GiB)": 22.66, "step": 29647, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.9631289997726018, "grad_norm": 0.29651081562042236, "learning_rate": 3.712354807819063e-08, "loss": 0.008047433570027351, "memory(GiB)": 22.66, "step": 29648, "token_acc": 1.0, "train_speed(iter/s)": 0.956282 }, { "epoch": 0.9631614852353572, "grad_norm": 0.3412770628929138, "learning_rate": 3.7058241843190846e-08, "loss": 0.010434826835989952, "memory(GiB)": 22.66, "step": 29649, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.956287 }, { "epoch": 0.9631939706981126, "grad_norm": 0.33049020171165466, "learning_rate": 3.699299288714431e-08, "loss": 0.009352057240903378, "memory(GiB)": 22.66, "step": 29650, "token_acc": 1.0, "train_speed(iter/s)": 0.956293 }, { "epoch": 0.963226456160868, "grad_norm": 0.4038583040237427, "learning_rate": 3.692780121080486e-08, "loss": 0.010349602438509464, "memory(GiB)": 22.66, "step": 29651, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.9632589416236235, "grad_norm": 0.2793677747249603, "learning_rate": 3.686266681492412e-08, "loss": 0.009347337298095226, "memory(GiB)": 22.66, "step": 29652, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.9632914270863788, "grad_norm": 0.34482625126838684, "learning_rate": 3.679758970025427e-08, "loss": 0.006819379050284624, "memory(GiB)": 22.66, "step": 29653, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956312 }, { "epoch": 0.9633239125491343, "grad_norm": 0.31246721744537354, "learning_rate": 3.6732569867546365e-08, "loss": 0.006699523888528347, "memory(GiB)": 22.66, "step": 29654, "token_acc": 1.0, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.9633563980118897, "grad_norm": 0.3465273082256317, "learning_rate": 3.666760731755037e-08, "loss": 0.012924819253385067, "memory(GiB)": 22.66, "step": 29655, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.9633888834746451, "grad_norm": 0.36367738246917725, "learning_rate": 3.660270205101679e-08, "loss": 0.01504366286098957, "memory(GiB)": 22.66, "step": 29656, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956333 }, { "epoch": 0.9634213689374005, "grad_norm": 0.7595889568328857, "learning_rate": 3.653785406869392e-08, "loss": 0.008632933720946312, "memory(GiB)": 22.66, "step": 29657, "token_acc": 1.0, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.963453854400156, "grad_norm": 0.44046732783317566, "learning_rate": 3.647306337133061e-08, "loss": 0.014381361193954945, "memory(GiB)": 22.66, "step": 29658, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.9634863398629113, "grad_norm": 0.44963550567626953, "learning_rate": 3.6408329959674585e-08, "loss": 0.011409884318709373, "memory(GiB)": 22.66, "step": 29659, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.9635188253256668, "grad_norm": 0.3442458212375641, "learning_rate": 3.6343653834473027e-08, "loss": 0.011598499491810799, "memory(GiB)": 22.66, "step": 29660, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.95636 }, { "epoch": 0.9635513107884222, "grad_norm": 0.3862062692642212, "learning_rate": 3.627903499647256e-08, "loss": 0.013360712677240372, "memory(GiB)": 22.66, "step": 29661, "token_acc": 0.9904306220095693, "train_speed(iter/s)": 0.956366 }, { "epoch": 0.9635837962511776, "grad_norm": 0.38309773802757263, "learning_rate": 3.621447344641815e-08, "loss": 0.01695593073964119, "memory(GiB)": 22.66, "step": 29662, "token_acc": 1.0, "train_speed(iter/s)": 0.956373 }, { "epoch": 0.963616281713933, "grad_norm": 0.4369587302207947, "learning_rate": 3.6149969185055863e-08, "loss": 0.013453077524900436, "memory(GiB)": 22.66, "step": 29663, "token_acc": 1.0, "train_speed(iter/s)": 0.95638 }, { "epoch": 0.9636487671766885, "grad_norm": 0.4081546366214752, "learning_rate": 3.608552221313011e-08, "loss": 0.014958653599023819, "memory(GiB)": 22.66, "step": 29664, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.9636812526394438, "grad_norm": 0.42072853446006775, "learning_rate": 3.602113253138417e-08, "loss": 0.009899911470711231, "memory(GiB)": 22.66, "step": 29665, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.9637137381021993, "grad_norm": 0.34717831015586853, "learning_rate": 3.59568001405608e-08, "loss": 0.010720775462687016, "memory(GiB)": 22.66, "step": 29666, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956401 }, { "epoch": 0.9637462235649547, "grad_norm": 0.3098641633987427, "learning_rate": 3.589252504140384e-08, "loss": 0.005495971534401178, "memory(GiB)": 22.66, "step": 29667, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956408 }, { "epoch": 0.9637787090277101, "grad_norm": 0.33947446942329407, "learning_rate": 3.5828307234653806e-08, "loss": 0.014012837782502174, "memory(GiB)": 22.66, "step": 29668, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956414 }, { "epoch": 0.9638111944904655, "grad_norm": 0.39324459433555603, "learning_rate": 3.5764146721052885e-08, "loss": 0.010222830809652805, "memory(GiB)": 22.66, "step": 29669, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956418 }, { "epoch": 0.963843679953221, "grad_norm": 0.4001288414001465, "learning_rate": 3.570004350134104e-08, "loss": 0.01190902665257454, "memory(GiB)": 22.66, "step": 29670, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.9638761654159763, "grad_norm": 0.3048796057701111, "learning_rate": 3.563599757625769e-08, "loss": 0.011797092854976654, "memory(GiB)": 22.66, "step": 29671, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956429 }, { "epoch": 0.9639086508787318, "grad_norm": 0.6877995133399963, "learning_rate": 3.557200894654278e-08, "loss": 0.01509047206491232, "memory(GiB)": 22.66, "step": 29672, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.956434 }, { "epoch": 0.9639411363414871, "grad_norm": 0.298962265253067, "learning_rate": 3.5508077612934624e-08, "loss": 0.01105319894850254, "memory(GiB)": 22.66, "step": 29673, "token_acc": 1.0, "train_speed(iter/s)": 0.95644 }, { "epoch": 0.9639736218042426, "grad_norm": 0.3906071186065674, "learning_rate": 3.544420357617095e-08, "loss": 0.01158853154629469, "memory(GiB)": 22.66, "step": 29674, "token_acc": 1.0, "train_speed(iter/s)": 0.956445 }, { "epoch": 0.964006107266998, "grad_norm": 0.41049453616142273, "learning_rate": 3.538038683698897e-08, "loss": 0.0156802237033844, "memory(GiB)": 22.66, "step": 29675, "token_acc": 0.9817518248175182, "train_speed(iter/s)": 0.95645 }, { "epoch": 0.9640385927297535, "grad_norm": 0.4260343611240387, "learning_rate": 3.531662739612473e-08, "loss": 0.014887452125549316, "memory(GiB)": 22.66, "step": 29676, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956455 }, { "epoch": 0.9640710781925088, "grad_norm": 0.25922125577926636, "learning_rate": 3.525292525431545e-08, "loss": 0.00872582197189331, "memory(GiB)": 22.66, "step": 29677, "token_acc": 1.0, "train_speed(iter/s)": 0.956461 }, { "epoch": 0.9641035636552643, "grad_norm": 0.39109644293785095, "learning_rate": 3.518928041229497e-08, "loss": 0.015391952358186245, "memory(GiB)": 22.66, "step": 29678, "token_acc": 1.0, "train_speed(iter/s)": 0.956466 }, { "epoch": 0.9641360491180196, "grad_norm": 0.1848306506872177, "learning_rate": 3.512569287079881e-08, "loss": 0.006947976537048817, "memory(GiB)": 22.66, "step": 29679, "token_acc": 1.0, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.9641685345807751, "grad_norm": 0.2812499701976776, "learning_rate": 3.5062162630559727e-08, "loss": 0.011986611410975456, "memory(GiB)": 22.66, "step": 29680, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.956477 }, { "epoch": 0.9642010200435305, "grad_norm": 0.26142147183418274, "learning_rate": 3.499868969231157e-08, "loss": 0.008124863728880882, "memory(GiB)": 22.66, "step": 29681, "token_acc": 1.0, "train_speed(iter/s)": 0.956482 }, { "epoch": 0.964233505506286, "grad_norm": 0.3779858350753784, "learning_rate": 3.4935274056787646e-08, "loss": 0.012403635308146477, "memory(GiB)": 22.66, "step": 29682, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956487 }, { "epoch": 0.9642659909690413, "grad_norm": 0.32932063937187195, "learning_rate": 3.4871915724718484e-08, "loss": 0.01342782098799944, "memory(GiB)": 22.66, "step": 29683, "token_acc": 1.0, "train_speed(iter/s)": 0.956492 }, { "epoch": 0.9642984764317968, "grad_norm": 0.36471089720726013, "learning_rate": 3.480861469683627e-08, "loss": 0.009811880066990852, "memory(GiB)": 22.66, "step": 29684, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956498 }, { "epoch": 0.9643309618945521, "grad_norm": 0.32694733142852783, "learning_rate": 3.474537097387154e-08, "loss": 0.008020097389817238, "memory(GiB)": 22.66, "step": 29685, "token_acc": 1.0, "train_speed(iter/s)": 0.956503 }, { "epoch": 0.9643634473573076, "grad_norm": 0.3235601782798767, "learning_rate": 3.4682184556553697e-08, "loss": 0.011923235841095448, "memory(GiB)": 22.66, "step": 29686, "token_acc": 1.0, "train_speed(iter/s)": 0.956508 }, { "epoch": 0.964395932820063, "grad_norm": 0.24394726753234863, "learning_rate": 3.461905544561217e-08, "loss": 0.009574446827173233, "memory(GiB)": 22.66, "step": 29687, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.956513 }, { "epoch": 0.9644284182828184, "grad_norm": 0.3528635501861572, "learning_rate": 3.4555983641775814e-08, "loss": 0.013038570061326027, "memory(GiB)": 22.66, "step": 29688, "token_acc": 0.9946236559139785, "train_speed(iter/s)": 0.956521 }, { "epoch": 0.9644609037455738, "grad_norm": 0.35555499792099, "learning_rate": 3.449296914577238e-08, "loss": 0.014301531948149204, "memory(GiB)": 22.66, "step": 29689, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.956528 }, { "epoch": 0.9644933892083293, "grad_norm": 0.25776827335357666, "learning_rate": 3.443001195832907e-08, "loss": 0.010240118950605392, "memory(GiB)": 22.66, "step": 29690, "token_acc": 0.9923954372623575, "train_speed(iter/s)": 0.956534 }, { "epoch": 0.9645258746710846, "grad_norm": 0.23377220332622528, "learning_rate": 3.436711208017252e-08, "loss": 0.008295003324747086, "memory(GiB)": 22.66, "step": 29691, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.9645583601338401, "grad_norm": 0.3540036678314209, "learning_rate": 3.4304269512028806e-08, "loss": 0.00635230029001832, "memory(GiB)": 22.66, "step": 29692, "token_acc": 1.0, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.9645908455965955, "grad_norm": 0.2548119127750397, "learning_rate": 3.424148425462348e-08, "loss": 0.014323350973427296, "memory(GiB)": 22.66, "step": 29693, "token_acc": 0.9902439024390244, "train_speed(iter/s)": 0.956553 }, { "epoch": 0.9646233310593509, "grad_norm": 0.23916032910346985, "learning_rate": 3.417875630868039e-08, "loss": 0.010489122942090034, "memory(GiB)": 22.66, "step": 29694, "token_acc": 1.0, "train_speed(iter/s)": 0.956558 }, { "epoch": 0.9646558165221063, "grad_norm": 0.4104132652282715, "learning_rate": 3.411608567492397e-08, "loss": 0.013947086408734322, "memory(GiB)": 22.66, "step": 29695, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.9646883019848618, "grad_norm": 0.36835479736328125, "learning_rate": 3.405347235407752e-08, "loss": 0.008477923460304737, "memory(GiB)": 22.66, "step": 29696, "token_acc": 1.0, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.9647207874476172, "grad_norm": 0.49075478315353394, "learning_rate": 3.39909163468638e-08, "loss": 0.013367535546422005, "memory(GiB)": 22.66, "step": 29697, "token_acc": 0.9849246231155779, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.9647532729103726, "grad_norm": 0.3888440430164337, "learning_rate": 3.3928417654005006e-08, "loss": 0.011193700134754181, "memory(GiB)": 22.66, "step": 29698, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.9647857583731281, "grad_norm": 0.341739296913147, "learning_rate": 3.386597627622112e-08, "loss": 0.010307795368134975, "memory(GiB)": 22.66, "step": 29699, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956587 }, { "epoch": 0.9648182438358834, "grad_norm": 0.2715093195438385, "learning_rate": 3.3803592214234907e-08, "loss": 0.008511293679475784, "memory(GiB)": 22.66, "step": 29700, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956592 }, { "epoch": 0.9648507292986389, "grad_norm": 0.3283877670764923, "learning_rate": 3.374126546876466e-08, "loss": 0.011650538071990013, "memory(GiB)": 22.66, "step": 29701, "token_acc": 1.0, "train_speed(iter/s)": 0.956598 }, { "epoch": 0.9648832147613943, "grad_norm": 0.4626712501049042, "learning_rate": 3.3678996040529824e-08, "loss": 0.01099206693470478, "memory(GiB)": 22.66, "step": 29702, "token_acc": 1.0, "train_speed(iter/s)": 0.956603 }, { "epoch": 0.9649157002241497, "grad_norm": 0.22132650017738342, "learning_rate": 3.3616783930250364e-08, "loss": 0.007100000977516174, "memory(GiB)": 22.66, "step": 29703, "token_acc": 1.0, "train_speed(iter/s)": 0.956608 }, { "epoch": 0.9649481856869051, "grad_norm": 0.5638545751571655, "learning_rate": 3.355462913864294e-08, "loss": 0.023468896746635437, "memory(GiB)": 22.66, "step": 29704, "token_acc": 1.0, "train_speed(iter/s)": 0.956613 }, { "epoch": 0.9649806711496606, "grad_norm": 0.2663033902645111, "learning_rate": 3.3492531666425856e-08, "loss": 0.009992120787501335, "memory(GiB)": 22.66, "step": 29705, "token_acc": 0.9948186528497409, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.9650131566124159, "grad_norm": 0.1449051797389984, "learning_rate": 3.343049151431466e-08, "loss": 0.005404267460107803, "memory(GiB)": 22.66, "step": 29706, "token_acc": 1.0, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.9650456420751714, "grad_norm": 0.3243349492549896, "learning_rate": 3.33685086830271e-08, "loss": 0.011987959034740925, "memory(GiB)": 22.66, "step": 29707, "token_acc": 1.0, "train_speed(iter/s)": 0.956629 }, { "epoch": 0.9650781275379268, "grad_norm": 0.27875372767448425, "learning_rate": 3.330658317327651e-08, "loss": 0.0056395987048745155, "memory(GiB)": 22.66, "step": 29708, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.956635 }, { "epoch": 0.9651106130006822, "grad_norm": 0.8306626081466675, "learning_rate": 3.324471498577897e-08, "loss": 0.012728441506624222, "memory(GiB)": 22.66, "step": 29709, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956641 }, { "epoch": 0.9651430984634376, "grad_norm": 0.33479464054107666, "learning_rate": 3.318290412124836e-08, "loss": 0.011746269650757313, "memory(GiB)": 22.66, "step": 29710, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.9651755839261931, "grad_norm": 0.2730293869972229, "learning_rate": 3.3121150580398e-08, "loss": 0.010044476017355919, "memory(GiB)": 22.66, "step": 29711, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956651 }, { "epoch": 0.9652080693889484, "grad_norm": 0.4359232783317566, "learning_rate": 3.305945436394009e-08, "loss": 0.011175436899065971, "memory(GiB)": 22.66, "step": 29712, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.956656 }, { "epoch": 0.9652405548517039, "grad_norm": 0.3251552879810333, "learning_rate": 3.2997815472586845e-08, "loss": 0.008171280845999718, "memory(GiB)": 22.66, "step": 29713, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956662 }, { "epoch": 0.9652730403144593, "grad_norm": 0.4746147692203522, "learning_rate": 3.2936233907050475e-08, "loss": 0.013425786048173904, "memory(GiB)": 22.66, "step": 29714, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.9653055257772147, "grad_norm": 0.3515494465827942, "learning_rate": 3.287470966804096e-08, "loss": 0.011122938245534897, "memory(GiB)": 22.66, "step": 29715, "token_acc": 1.0, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.9653380112399701, "grad_norm": 0.38606566190719604, "learning_rate": 3.281324275626829e-08, "loss": 0.01230740174651146, "memory(GiB)": 22.66, "step": 29716, "token_acc": 1.0, "train_speed(iter/s)": 0.956679 }, { "epoch": 0.9653704967027256, "grad_norm": 0.27462270855903625, "learning_rate": 3.27518331724419e-08, "loss": 0.007806078065186739, "memory(GiB)": 22.66, "step": 29717, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956685 }, { "epoch": 0.9654029821654809, "grad_norm": 0.15496036410331726, "learning_rate": 3.2690480917271226e-08, "loss": 0.007219497114419937, "memory(GiB)": 22.66, "step": 29718, "token_acc": 1.0, "train_speed(iter/s)": 0.956692 }, { "epoch": 0.9654354676282364, "grad_norm": 0.34338393807411194, "learning_rate": 3.2629185991463477e-08, "loss": 0.012592168524861336, "memory(GiB)": 22.66, "step": 29719, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.956699 }, { "epoch": 0.9654679530909918, "grad_norm": 0.3723781704902649, "learning_rate": 3.2567948395726965e-08, "loss": 0.011841067112982273, "memory(GiB)": 22.66, "step": 29720, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956706 }, { "epoch": 0.9655004385537472, "grad_norm": 0.21207047998905182, "learning_rate": 3.2506768130767805e-08, "loss": 0.005533565767109394, "memory(GiB)": 22.66, "step": 29721, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956713 }, { "epoch": 0.9655329240165026, "grad_norm": 0.32408323884010315, "learning_rate": 3.24456451972921e-08, "loss": 0.00971940066665411, "memory(GiB)": 22.66, "step": 29722, "token_acc": 0.9928057553956835, "train_speed(iter/s)": 0.956719 }, { "epoch": 0.9655654094792581, "grad_norm": 0.42065322399139404, "learning_rate": 3.2384579596005386e-08, "loss": 0.00910224299877882, "memory(GiB)": 22.66, "step": 29723, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956726 }, { "epoch": 0.9655978949420134, "grad_norm": 0.536932647228241, "learning_rate": 3.2323571327612655e-08, "loss": 0.011266965419054031, "memory(GiB)": 22.66, "step": 29724, "token_acc": 1.0, "train_speed(iter/s)": 0.956733 }, { "epoch": 0.9656303804047689, "grad_norm": 0.42494553327560425, "learning_rate": 3.22626203928178e-08, "loss": 0.01658014766871929, "memory(GiB)": 22.66, "step": 29725, "token_acc": 0.9958847736625515, "train_speed(iter/s)": 0.95674 }, { "epoch": 0.9656628658675243, "grad_norm": 0.26572197675704956, "learning_rate": 3.2201726792324695e-08, "loss": 0.007217718288302422, "memory(GiB)": 22.66, "step": 29726, "token_acc": 1.0, "train_speed(iter/s)": 0.956747 }, { "epoch": 0.9656953513302797, "grad_norm": 0.459773451089859, "learning_rate": 3.2140890526835e-08, "loss": 0.015596561133861542, "memory(GiB)": 22.66, "step": 29727, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956754 }, { "epoch": 0.9657278367930351, "grad_norm": 0.25087764859199524, "learning_rate": 3.2080111597052043e-08, "loss": 0.009659577161073685, "memory(GiB)": 22.66, "step": 29728, "token_acc": 0.9929328621908127, "train_speed(iter/s)": 0.956759 }, { "epoch": 0.9657603222557906, "grad_norm": 0.32794618606567383, "learning_rate": 3.201939000367693e-08, "loss": 0.011397412046790123, "memory(GiB)": 22.66, "step": 29729, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956764 }, { "epoch": 0.9657928077185459, "grad_norm": 0.7548697590827942, "learning_rate": 3.1958725747410765e-08, "loss": 0.008808707818388939, "memory(GiB)": 22.66, "step": 29730, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956769 }, { "epoch": 0.9658252931813014, "grad_norm": 0.5053635239601135, "learning_rate": 3.189811882895299e-08, "loss": 0.013808337971568108, "memory(GiB)": 22.66, "step": 29731, "token_acc": 1.0, "train_speed(iter/s)": 0.956775 }, { "epoch": 0.9658577786440568, "grad_norm": 0.4750264286994934, "learning_rate": 3.183756924900361e-08, "loss": 0.016896124929189682, "memory(GiB)": 22.66, "step": 29732, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.95678 }, { "epoch": 0.9658902641068122, "grad_norm": 0.44895127415657043, "learning_rate": 3.1777077008260936e-08, "loss": 0.01418232824653387, "memory(GiB)": 22.66, "step": 29733, "token_acc": 0.9930313588850174, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.9659227495695676, "grad_norm": 0.3406476676464081, "learning_rate": 3.1716642107423865e-08, "loss": 0.008470289409160614, "memory(GiB)": 22.66, "step": 29734, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956789 }, { "epoch": 0.9659552350323231, "grad_norm": 0.286631315946579, "learning_rate": 3.165626454718962e-08, "loss": 0.009888541884720325, "memory(GiB)": 22.66, "step": 29735, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.9659877204950784, "grad_norm": 0.2712765038013458, "learning_rate": 3.15959443282543e-08, "loss": 0.005555623210966587, "memory(GiB)": 22.66, "step": 29736, "token_acc": 1.0, "train_speed(iter/s)": 0.956799 }, { "epoch": 0.9660202059578339, "grad_norm": 0.3057474195957184, "learning_rate": 3.1535681451315135e-08, "loss": 0.010014694184064865, "memory(GiB)": 22.66, "step": 29737, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956804 }, { "epoch": 0.9660526914205892, "grad_norm": 0.3255285322666168, "learning_rate": 3.1475475917067125e-08, "loss": 0.016552681103348732, "memory(GiB)": 22.66, "step": 29738, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.95681 }, { "epoch": 0.9660851768833447, "grad_norm": 0.25043874979019165, "learning_rate": 3.141532772620581e-08, "loss": 0.005762338638305664, "memory(GiB)": 22.66, "step": 29739, "token_acc": 1.0, "train_speed(iter/s)": 0.956815 }, { "epoch": 0.9661176623461001, "grad_norm": 0.45117858052253723, "learning_rate": 3.135523687942399e-08, "loss": 0.011718334630131721, "memory(GiB)": 22.66, "step": 29740, "token_acc": 1.0, "train_speed(iter/s)": 0.95682 }, { "epoch": 0.9661501478088556, "grad_norm": 0.32304975390434265, "learning_rate": 3.1295203377416095e-08, "loss": 0.010684296488761902, "memory(GiB)": 22.66, "step": 29741, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956825 }, { "epoch": 0.9661826332716109, "grad_norm": 0.3328252136707306, "learning_rate": 3.123522722087491e-08, "loss": 0.011940483003854752, "memory(GiB)": 22.66, "step": 29742, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.95683 }, { "epoch": 0.9662151187343664, "grad_norm": 0.37901580333709717, "learning_rate": 3.117530841049321e-08, "loss": 0.010214455425739288, "memory(GiB)": 22.66, "step": 29743, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956835 }, { "epoch": 0.9662476041971217, "grad_norm": 0.3986775577068329, "learning_rate": 3.1115446946961005e-08, "loss": 0.011529709212481976, "memory(GiB)": 22.66, "step": 29744, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.9662800896598772, "grad_norm": 0.22561219334602356, "learning_rate": 3.105564283097051e-08, "loss": 0.008417569100856781, "memory(GiB)": 22.66, "step": 29745, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956846 }, { "epoch": 0.9663125751226326, "grad_norm": 0.42530375719070435, "learning_rate": 3.099589606321174e-08, "loss": 0.012537907809019089, "memory(GiB)": 22.66, "step": 29746, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.956851 }, { "epoch": 0.966345060585388, "grad_norm": 0.33483725786209106, "learning_rate": 3.093620664437413e-08, "loss": 0.010522371158003807, "memory(GiB)": 22.66, "step": 29747, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.9663775460481434, "grad_norm": 0.3508816361427307, "learning_rate": 3.0876574575146034e-08, "loss": 0.011010359972715378, "memory(GiB)": 22.66, "step": 29748, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.956864 }, { "epoch": 0.9664100315108989, "grad_norm": 0.36200791597366333, "learning_rate": 3.081699985621633e-08, "loss": 0.007856190204620361, "memory(GiB)": 22.66, "step": 29749, "token_acc": 1.0, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.9664425169736542, "grad_norm": 0.3556423485279083, "learning_rate": 3.0757482488272814e-08, "loss": 0.007207317277789116, "memory(GiB)": 22.66, "step": 29750, "token_acc": 0.995575221238938, "train_speed(iter/s)": 0.956878 }, { "epoch": 0.9664750024364097, "grad_norm": 0.4550231099128723, "learning_rate": 3.06980224720016e-08, "loss": 0.014010998420417309, "memory(GiB)": 22.66, "step": 29751, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.956885 }, { "epoch": 0.9665074878991651, "grad_norm": 0.2769465148448944, "learning_rate": 3.063861980808935e-08, "loss": 0.004401929676532745, "memory(GiB)": 22.66, "step": 29752, "token_acc": 1.0, "train_speed(iter/s)": 0.956892 }, { "epoch": 0.9665399733619205, "grad_norm": 0.28921663761138916, "learning_rate": 3.057927449722165e-08, "loss": 0.00740619283169508, "memory(GiB)": 22.66, "step": 29753, "token_acc": 1.0, "train_speed(iter/s)": 0.956899 }, { "epoch": 0.9665724588246759, "grad_norm": 0.25796186923980713, "learning_rate": 3.0519986540084036e-08, "loss": 0.009836533106863499, "memory(GiB)": 22.66, "step": 29754, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.9666049442874314, "grad_norm": 0.49094584584236145, "learning_rate": 3.046075593735931e-08, "loss": 0.014122486114501953, "memory(GiB)": 22.66, "step": 29755, "token_acc": 0.994535519125683, "train_speed(iter/s)": 0.95691 }, { "epoch": 0.9666374297501867, "grad_norm": 0.407713383436203, "learning_rate": 3.0401582689731923e-08, "loss": 0.016312377527356148, "memory(GiB)": 22.66, "step": 29756, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956915 }, { "epoch": 0.9666699152129422, "grad_norm": 0.47382599115371704, "learning_rate": 3.0342466797885215e-08, "loss": 0.010980091989040375, "memory(GiB)": 22.66, "step": 29757, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.95692 }, { "epoch": 0.9667024006756976, "grad_norm": 0.36527493596076965, "learning_rate": 3.028340826250087e-08, "loss": 0.010900203138589859, "memory(GiB)": 22.66, "step": 29758, "token_acc": 1.0, "train_speed(iter/s)": 0.956925 }, { "epoch": 0.966734886138453, "grad_norm": 0.45559537410736084, "learning_rate": 3.0224407084260555e-08, "loss": 0.016360178589820862, "memory(GiB)": 22.66, "step": 29759, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956931 }, { "epoch": 0.9667673716012085, "grad_norm": 0.30088338255882263, "learning_rate": 3.0165463263845394e-08, "loss": 0.007010858040302992, "memory(GiB)": 22.66, "step": 29760, "token_acc": 1.0, "train_speed(iter/s)": 0.956936 }, { "epoch": 0.9667998570639639, "grad_norm": 0.3643522262573242, "learning_rate": 3.0106576801935964e-08, "loss": 0.010390551760792732, "memory(GiB)": 22.66, "step": 29761, "token_acc": 1.0, "train_speed(iter/s)": 0.956942 }, { "epoch": 0.9668323425267193, "grad_norm": 0.40775740146636963, "learning_rate": 3.004774769921115e-08, "loss": 0.008080989122390747, "memory(GiB)": 22.66, "step": 29762, "token_acc": 1.0, "train_speed(iter/s)": 0.956947 }, { "epoch": 0.9668648279894747, "grad_norm": 0.5276029109954834, "learning_rate": 2.998897595635042e-08, "loss": 0.014776308089494705, "memory(GiB)": 22.66, "step": 29763, "token_acc": 1.0, "train_speed(iter/s)": 0.956953 }, { "epoch": 0.9668973134522302, "grad_norm": 0.4185199737548828, "learning_rate": 2.993026157403156e-08, "loss": 0.009317548014223576, "memory(GiB)": 22.66, "step": 29764, "token_acc": 1.0, "train_speed(iter/s)": 0.956958 }, { "epoch": 0.9669297989149855, "grad_norm": 0.31579214334487915, "learning_rate": 2.987160455293348e-08, "loss": 0.0146626653149724, "memory(GiB)": 22.66, "step": 29765, "token_acc": 1.0, "train_speed(iter/s)": 0.956963 }, { "epoch": 0.966962284377741, "grad_norm": 0.44084540009498596, "learning_rate": 2.981300489373173e-08, "loss": 0.011570506729185581, "memory(GiB)": 22.66, "step": 29766, "token_acc": 0.9903381642512077, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.9669947698404964, "grad_norm": 0.24336211383342743, "learning_rate": 2.9754462597103572e-08, "loss": 0.009136607870459557, "memory(GiB)": 22.66, "step": 29767, "token_acc": 1.0, "train_speed(iter/s)": 0.956974 }, { "epoch": 0.9670272553032518, "grad_norm": 0.351272314786911, "learning_rate": 2.969597766372401e-08, "loss": 0.009716102853417397, "memory(GiB)": 22.66, "step": 29768, "token_acc": 0.992, "train_speed(iter/s)": 0.956979 }, { "epoch": 0.9670597407660072, "grad_norm": 0.27621546387672424, "learning_rate": 2.963755009426861e-08, "loss": 0.009451789781451225, "memory(GiB)": 22.66, "step": 29769, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.956984 }, { "epoch": 0.9670922262287627, "grad_norm": 0.38001489639282227, "learning_rate": 2.9579179889410724e-08, "loss": 0.014243091456592083, "memory(GiB)": 22.66, "step": 29770, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95699 }, { "epoch": 0.967124711691518, "grad_norm": 0.40430572628974915, "learning_rate": 2.9520867049825374e-08, "loss": 0.015748612582683563, "memory(GiB)": 22.66, "step": 29771, "token_acc": 1.0, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.9671571971542735, "grad_norm": 0.5170897245407104, "learning_rate": 2.9462611576184798e-08, "loss": 0.014445476233959198, "memory(GiB)": 22.66, "step": 29772, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956999 }, { "epoch": 0.9671896826170289, "grad_norm": 0.2561272382736206, "learning_rate": 2.9404413469161232e-08, "loss": 0.009566998109221458, "memory(GiB)": 22.66, "step": 29773, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.957003 }, { "epoch": 0.9672221680797843, "grad_norm": 0.3477667570114136, "learning_rate": 2.9346272729426917e-08, "loss": 0.011248031631112099, "memory(GiB)": 22.66, "step": 29774, "token_acc": 0.988, "train_speed(iter/s)": 0.957008 }, { "epoch": 0.9672546535425397, "grad_norm": 0.3314678370952606, "learning_rate": 2.928818935765243e-08, "loss": 0.009092401713132858, "memory(GiB)": 22.66, "step": 29775, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.957014 }, { "epoch": 0.9672871390052952, "grad_norm": 0.23720969259738922, "learning_rate": 2.9230163354508345e-08, "loss": 0.005811051465570927, "memory(GiB)": 22.66, "step": 29776, "token_acc": 1.0, "train_speed(iter/s)": 0.95702 }, { "epoch": 0.9673196244680505, "grad_norm": 0.4099373519420624, "learning_rate": 2.9172194720664125e-08, "loss": 0.012927457690238953, "memory(GiB)": 22.66, "step": 29777, "token_acc": 0.988, "train_speed(iter/s)": 0.957025 }, { "epoch": 0.967352109930806, "grad_norm": 0.323472797870636, "learning_rate": 2.9114283456788685e-08, "loss": 0.007336888462305069, "memory(GiB)": 22.66, "step": 29778, "token_acc": 1.0, "train_speed(iter/s)": 0.957032 }, { "epoch": 0.9673845953935614, "grad_norm": 0.3175390958786011, "learning_rate": 2.9056429563550925e-08, "loss": 0.013680151663720608, "memory(GiB)": 22.66, "step": 29779, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957039 }, { "epoch": 0.9674170808563168, "grad_norm": 0.30972301959991455, "learning_rate": 2.8998633041618096e-08, "loss": 0.009756801649928093, "memory(GiB)": 22.66, "step": 29780, "token_acc": 1.0, "train_speed(iter/s)": 0.957046 }, { "epoch": 0.9674495663190722, "grad_norm": 0.8708516955375671, "learning_rate": 2.894089389165744e-08, "loss": 0.015006830915808678, "memory(GiB)": 22.66, "step": 29781, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957053 }, { "epoch": 0.9674820517818277, "grad_norm": 0.4755626320838928, "learning_rate": 2.8883212114335647e-08, "loss": 0.011111278086900711, "memory(GiB)": 22.66, "step": 29782, "token_acc": 1.0, "train_speed(iter/s)": 0.95706 }, { "epoch": 0.967514537244583, "grad_norm": 0.2884061634540558, "learning_rate": 2.8825587710317737e-08, "loss": 0.007215049117803574, "memory(GiB)": 22.66, "step": 29783, "token_acc": 1.0, "train_speed(iter/s)": 0.957067 }, { "epoch": 0.9675470227073385, "grad_norm": 0.43558168411254883, "learning_rate": 2.8768020680269848e-08, "loss": 0.013232327997684479, "memory(GiB)": 22.66, "step": 29784, "token_acc": 0.985239852398524, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.9675795081700939, "grad_norm": 0.34291544556617737, "learning_rate": 2.871051102485478e-08, "loss": 0.008593068458139896, "memory(GiB)": 22.66, "step": 29785, "token_acc": 1.0, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.9676119936328493, "grad_norm": 0.3146103620529175, "learning_rate": 2.8653058744738115e-08, "loss": 0.009943504817783833, "memory(GiB)": 22.66, "step": 29786, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.9676444790956047, "grad_norm": 0.3789980411529541, "learning_rate": 2.859566384058099e-08, "loss": 0.011299299076199532, "memory(GiB)": 22.66, "step": 29787, "token_acc": 1.0, "train_speed(iter/s)": 0.957092 }, { "epoch": 0.9676769645583602, "grad_norm": 0.27536433935165405, "learning_rate": 2.8538326313047315e-08, "loss": 0.01192531455308199, "memory(GiB)": 22.66, "step": 29788, "token_acc": 1.0, "train_speed(iter/s)": 0.957097 }, { "epoch": 0.9677094500211155, "grad_norm": 0.3689436912536621, "learning_rate": 2.848104616279823e-08, "loss": 0.013028048910200596, "memory(GiB)": 22.66, "step": 29789, "token_acc": 0.990990990990991, "train_speed(iter/s)": 0.957103 }, { "epoch": 0.967741935483871, "grad_norm": 0.791351318359375, "learning_rate": 2.842382339049543e-08, "loss": 0.014275968074798584, "memory(GiB)": 22.66, "step": 29790, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.957109 }, { "epoch": 0.9677744209466264, "grad_norm": 0.41179850697517395, "learning_rate": 2.8366657996798386e-08, "loss": 0.011245008558034897, "memory(GiB)": 22.66, "step": 29791, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.957114 }, { "epoch": 0.9678069064093818, "grad_norm": 0.26688575744628906, "learning_rate": 2.8309549982367678e-08, "loss": 0.00935388170182705, "memory(GiB)": 22.66, "step": 29792, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957119 }, { "epoch": 0.9678393918721372, "grad_norm": 0.378177672624588, "learning_rate": 2.8252499347861672e-08, "loss": 0.01144446525722742, "memory(GiB)": 22.66, "step": 29793, "token_acc": 1.0, "train_speed(iter/s)": 0.957124 }, { "epoch": 0.9678718773348927, "grad_norm": 0.33070898056030273, "learning_rate": 2.819550609393984e-08, "loss": 0.010570868849754333, "memory(GiB)": 22.66, "step": 29794, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95713 }, { "epoch": 0.967904362797648, "grad_norm": 0.5227820873260498, "learning_rate": 2.813857022125832e-08, "loss": 0.013193795457482338, "memory(GiB)": 22.66, "step": 29795, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957135 }, { "epoch": 0.9679368482604035, "grad_norm": 0.3847436010837555, "learning_rate": 2.8081691730476036e-08, "loss": 0.017768394201993942, "memory(GiB)": 22.66, "step": 29796, "token_acc": 0.9851851851851852, "train_speed(iter/s)": 0.95714 }, { "epoch": 0.9679693337231589, "grad_norm": 0.3955010771751404, "learning_rate": 2.8024870622248567e-08, "loss": 0.013289340771734715, "memory(GiB)": 22.66, "step": 29797, "token_acc": 0.9882352941176471, "train_speed(iter/s)": 0.957146 }, { "epoch": 0.9680018191859143, "grad_norm": 0.39200037717819214, "learning_rate": 2.7968106897231505e-08, "loss": 0.016319239512085915, "memory(GiB)": 22.66, "step": 29798, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957151 }, { "epoch": 0.9680343046486697, "grad_norm": 0.4775654971599579, "learning_rate": 2.7911400556080438e-08, "loss": 0.017062455415725708, "memory(GiB)": 22.66, "step": 29799, "token_acc": 1.0, "train_speed(iter/s)": 0.957156 }, { "epoch": 0.9680667901114252, "grad_norm": 0.290199339389801, "learning_rate": 2.785475159944928e-08, "loss": 0.008282732218503952, "memory(GiB)": 22.66, "step": 29800, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957161 }, { "epoch": 0.9680992755741805, "grad_norm": 0.25340208411216736, "learning_rate": 2.7798160027992516e-08, "loss": 0.009098293259739876, "memory(GiB)": 22.66, "step": 29801, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957167 }, { "epoch": 0.968131761036936, "grad_norm": 0.3429594337940216, "learning_rate": 2.774162584236295e-08, "loss": 0.015209035947918892, "memory(GiB)": 22.66, "step": 29802, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957172 }, { "epoch": 0.9681642464996913, "grad_norm": 0.25332051515579224, "learning_rate": 2.7685149043212846e-08, "loss": 0.00855079386383295, "memory(GiB)": 22.66, "step": 29803, "token_acc": 1.0, "train_speed(iter/s)": 0.957177 }, { "epoch": 0.9681967319624468, "grad_norm": 0.261579304933548, "learning_rate": 2.7628729631194452e-08, "loss": 0.008913081139326096, "memory(GiB)": 22.66, "step": 29804, "token_acc": 1.0, "train_speed(iter/s)": 0.957181 }, { "epoch": 0.9682292174252022, "grad_norm": 0.4488394856452942, "learning_rate": 2.7572367606958917e-08, "loss": 0.013214093632996082, "memory(GiB)": 22.66, "step": 29805, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957186 }, { "epoch": 0.9682617028879577, "grad_norm": 0.3796393573284149, "learning_rate": 2.7516062971156278e-08, "loss": 0.01426719594746828, "memory(GiB)": 22.66, "step": 29806, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957191 }, { "epoch": 0.968294188350713, "grad_norm": 0.3910568058490753, "learning_rate": 2.7459815724437124e-08, "loss": 0.008485882543027401, "memory(GiB)": 22.66, "step": 29807, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957197 }, { "epoch": 0.9683266738134685, "grad_norm": 0.3030744791030884, "learning_rate": 2.7403625867449268e-08, "loss": 0.015385310165584087, "memory(GiB)": 22.66, "step": 29808, "token_acc": 0.9859649122807017, "train_speed(iter/s)": 0.957203 }, { "epoch": 0.9683591592762238, "grad_norm": 0.2817150354385376, "learning_rate": 2.734749340084275e-08, "loss": 0.00919300690293312, "memory(GiB)": 22.66, "step": 29809, "token_acc": 1.0, "train_speed(iter/s)": 0.95721 }, { "epoch": 0.9683916447389793, "grad_norm": 0.3011944890022278, "learning_rate": 2.7291418325264275e-08, "loss": 0.007917041890323162, "memory(GiB)": 22.66, "step": 29810, "token_acc": 1.0, "train_speed(iter/s)": 0.957216 }, { "epoch": 0.9684241302017347, "grad_norm": 0.24190229177474976, "learning_rate": 2.7235400641361652e-08, "loss": 0.005754439625889063, "memory(GiB)": 22.66, "step": 29811, "token_acc": 1.0, "train_speed(iter/s)": 0.957223 }, { "epoch": 0.9684566156644902, "grad_norm": 0.34220778942108154, "learning_rate": 2.7179440349781593e-08, "loss": 0.012599285691976547, "memory(GiB)": 22.66, "step": 29812, "token_acc": 1.0, "train_speed(iter/s)": 0.957229 }, { "epoch": 0.9684891011272455, "grad_norm": 0.4277668297290802, "learning_rate": 2.712353745116969e-08, "loss": 0.010607728734612465, "memory(GiB)": 22.66, "step": 29813, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.957236 }, { "epoch": 0.968521586590001, "grad_norm": 0.40196868777275085, "learning_rate": 2.7067691946170427e-08, "loss": 0.013729838654398918, "memory(GiB)": 22.66, "step": 29814, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.957243 }, { "epoch": 0.9685540720527563, "grad_norm": 0.32766392827033997, "learning_rate": 2.70119038354294e-08, "loss": 0.013773663900792599, "memory(GiB)": 22.66, "step": 29815, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.95725 }, { "epoch": 0.9685865575155118, "grad_norm": 0.33315303921699524, "learning_rate": 2.695617311958998e-08, "loss": 0.013785546645522118, "memory(GiB)": 22.66, "step": 29816, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.957255 }, { "epoch": 0.9686190429782672, "grad_norm": 0.24360397458076477, "learning_rate": 2.6900499799294987e-08, "loss": 0.008357170969247818, "memory(GiB)": 22.66, "step": 29817, "token_acc": 1.0, "train_speed(iter/s)": 0.957261 }, { "epoch": 0.9686515284410226, "grad_norm": 0.4759414792060852, "learning_rate": 2.6844883875188354e-08, "loss": 0.008678125217556953, "memory(GiB)": 22.66, "step": 29818, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957266 }, { "epoch": 0.968684013903778, "grad_norm": 0.2910058796405792, "learning_rate": 2.678932534791012e-08, "loss": 0.0118692796677351, "memory(GiB)": 22.66, "step": 29819, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957271 }, { "epoch": 0.9687164993665335, "grad_norm": 0.40362420678138733, "learning_rate": 2.6733824218102556e-08, "loss": 0.013748303055763245, "memory(GiB)": 22.66, "step": 29820, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957276 }, { "epoch": 0.9687489848292888, "grad_norm": 0.3377358317375183, "learning_rate": 2.6678380486406252e-08, "loss": 0.008285761810839176, "memory(GiB)": 22.66, "step": 29821, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.9687814702920443, "grad_norm": 0.31719642877578735, "learning_rate": 2.6622994153461258e-08, "loss": 0.014007952064275742, "memory(GiB)": 22.66, "step": 29822, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957287 }, { "epoch": 0.9688139557547997, "grad_norm": 0.2725951075553894, "learning_rate": 2.656766521990595e-08, "loss": 0.006866994313895702, "memory(GiB)": 22.66, "step": 29823, "token_acc": 1.0, "train_speed(iter/s)": 0.957292 }, { "epoch": 0.9688464412175551, "grad_norm": 0.22140300273895264, "learning_rate": 2.6512393686379258e-08, "loss": 0.006810924969613552, "memory(GiB)": 22.66, "step": 29824, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.9688789266803106, "grad_norm": 0.2766557037830353, "learning_rate": 2.6457179553519564e-08, "loss": 0.010358589701354504, "memory(GiB)": 22.66, "step": 29825, "token_acc": 1.0, "train_speed(iter/s)": 0.957302 }, { "epoch": 0.968911412143066, "grad_norm": 0.26555055379867554, "learning_rate": 2.640202282196358e-08, "loss": 0.007357367780059576, "memory(GiB)": 22.66, "step": 29826, "token_acc": 1.0, "train_speed(iter/s)": 0.957307 }, { "epoch": 0.9689438976058214, "grad_norm": 0.3276035785675049, "learning_rate": 2.6346923492348574e-08, "loss": 0.01298130489885807, "memory(GiB)": 22.66, "step": 29827, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957312 }, { "epoch": 0.9689763830685768, "grad_norm": 0.2977650463581085, "learning_rate": 2.6291881565309597e-08, "loss": 0.011104952543973923, "memory(GiB)": 22.66, "step": 29828, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957318 }, { "epoch": 0.9690088685313323, "grad_norm": 0.32086068391799927, "learning_rate": 2.6236897041482246e-08, "loss": 0.010346729308366776, "memory(GiB)": 22.66, "step": 29829, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957322 }, { "epoch": 0.9690413539940876, "grad_norm": 0.2481926828622818, "learning_rate": 2.6181969921501016e-08, "loss": 0.008911876007914543, "memory(GiB)": 22.66, "step": 29830, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.957328 }, { "epoch": 0.9690738394568431, "grad_norm": 0.2768166959285736, "learning_rate": 2.61271002060004e-08, "loss": 0.006330009084194899, "memory(GiB)": 22.66, "step": 29831, "token_acc": 1.0, "train_speed(iter/s)": 0.957333 }, { "epoch": 0.9691063249195985, "grad_norm": 0.3243617117404938, "learning_rate": 2.6072287895613224e-08, "loss": 0.012905516661703587, "memory(GiB)": 22.66, "step": 29832, "token_acc": 0.9872340425531915, "train_speed(iter/s)": 0.957339 }, { "epoch": 0.9691388103823539, "grad_norm": 0.297023743391037, "learning_rate": 2.6017532990971762e-08, "loss": 0.0074625518172979355, "memory(GiB)": 22.66, "step": 29833, "token_acc": 1.0, "train_speed(iter/s)": 0.957344 }, { "epoch": 0.9691712958451093, "grad_norm": 0.3277655839920044, "learning_rate": 2.5962835492708837e-08, "loss": 0.009868263266980648, "memory(GiB)": 22.66, "step": 29834, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.957349 }, { "epoch": 0.9692037813078648, "grad_norm": 0.3533126413822174, "learning_rate": 2.590819540145506e-08, "loss": 0.011439153924584389, "memory(GiB)": 22.66, "step": 29835, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.957354 }, { "epoch": 0.9692362667706201, "grad_norm": 0.29410320520401, "learning_rate": 2.5853612717841037e-08, "loss": 0.005276298150420189, "memory(GiB)": 22.66, "step": 29836, "token_acc": 1.0, "train_speed(iter/s)": 0.957359 }, { "epoch": 0.9692687522333756, "grad_norm": 0.31995561718940735, "learning_rate": 2.5799087442496817e-08, "loss": 0.008560879155993462, "memory(GiB)": 22.66, "step": 29837, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957364 }, { "epoch": 0.969301237696131, "grad_norm": 0.39134880900382996, "learning_rate": 2.5744619576051898e-08, "loss": 0.013453628867864609, "memory(GiB)": 22.66, "step": 29838, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95737 }, { "epoch": 0.9693337231588864, "grad_norm": 0.3878149092197418, "learning_rate": 2.5690209119135225e-08, "loss": 0.015617523342370987, "memory(GiB)": 22.66, "step": 29839, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957375 }, { "epoch": 0.9693662086216418, "grad_norm": 0.33241006731987, "learning_rate": 2.5635856072373512e-08, "loss": 0.0061272019520401955, "memory(GiB)": 22.66, "step": 29840, "token_acc": 0.9949748743718593, "train_speed(iter/s)": 0.957382 }, { "epoch": 0.9693986940843973, "grad_norm": 0.3535715639591217, "learning_rate": 2.558156043639515e-08, "loss": 0.012487320229411125, "memory(GiB)": 22.66, "step": 29841, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957389 }, { "epoch": 0.9694311795471526, "grad_norm": 0.401994913816452, "learning_rate": 2.5527322211826854e-08, "loss": 0.013978821225464344, "memory(GiB)": 22.66, "step": 29842, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.957396 }, { "epoch": 0.9694636650099081, "grad_norm": 0.29958271980285645, "learning_rate": 2.5473141399294244e-08, "loss": 0.011098278686404228, "memory(GiB)": 22.66, "step": 29843, "token_acc": 1.0, "train_speed(iter/s)": 0.957402 }, { "epoch": 0.9694961504726635, "grad_norm": 0.36916232109069824, "learning_rate": 2.541901799942237e-08, "loss": 0.008141825906932354, "memory(GiB)": 22.66, "step": 29844, "token_acc": 1.0, "train_speed(iter/s)": 0.957409 }, { "epoch": 0.9695286359354189, "grad_norm": 0.5013949275016785, "learning_rate": 2.5364952012836285e-08, "loss": 0.012274142354726791, "memory(GiB)": 22.66, "step": 29845, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957416 }, { "epoch": 0.9695611213981743, "grad_norm": 0.33089613914489746, "learning_rate": 2.5310943440159942e-08, "loss": 0.014236364513635635, "memory(GiB)": 22.66, "step": 29846, "token_acc": 0.9741379310344828, "train_speed(iter/s)": 0.957422 }, { "epoch": 0.9695936068609298, "grad_norm": 0.3132131099700928, "learning_rate": 2.5256992282016725e-08, "loss": 0.012871945276856422, "memory(GiB)": 22.66, "step": 29847, "token_acc": 1.0, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.9696260923236851, "grad_norm": 0.35384780168533325, "learning_rate": 2.5203098539028913e-08, "loss": 0.011819977313280106, "memory(GiB)": 22.66, "step": 29848, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957433 }, { "epoch": 0.9696585777864406, "grad_norm": 0.41400375962257385, "learning_rate": 2.5149262211818793e-08, "loss": 0.013864429667592049, "memory(GiB)": 22.66, "step": 29849, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.957439 }, { "epoch": 0.969691063249196, "grad_norm": 0.19094692170619965, "learning_rate": 2.5095483301008083e-08, "loss": 0.010808839462697506, "memory(GiB)": 22.66, "step": 29850, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957444 }, { "epoch": 0.9697235487119514, "grad_norm": 0.30403199791908264, "learning_rate": 2.5041761807216847e-08, "loss": 0.005722401663661003, "memory(GiB)": 22.66, "step": 29851, "token_acc": 1.0, "train_speed(iter/s)": 0.957449 }, { "epoch": 0.9697560341747068, "grad_norm": 0.40536123514175415, "learning_rate": 2.4988097731065143e-08, "loss": 0.013992763124406338, "memory(GiB)": 22.66, "step": 29852, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957455 }, { "epoch": 0.9697885196374623, "grad_norm": 0.4182290732860565, "learning_rate": 2.4934491073172474e-08, "loss": 0.019809268414974213, "memory(GiB)": 22.66, "step": 29853, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95746 }, { "epoch": 0.9698210051002176, "grad_norm": 0.316144734621048, "learning_rate": 2.488094183415779e-08, "loss": 0.00947658158838749, "memory(GiB)": 22.66, "step": 29854, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.957466 }, { "epoch": 0.9698534905629731, "grad_norm": 0.37273329496383667, "learning_rate": 2.4827450014638932e-08, "loss": 0.011868134140968323, "memory(GiB)": 22.66, "step": 29855, "token_acc": 0.996415770609319, "train_speed(iter/s)": 0.957471 }, { "epoch": 0.9698859760257285, "grad_norm": 0.3784426152706146, "learning_rate": 2.4774015615233182e-08, "loss": 0.01454479806125164, "memory(GiB)": 22.66, "step": 29856, "token_acc": 1.0, "train_speed(iter/s)": 0.957475 }, { "epoch": 0.9699184614884839, "grad_norm": 0.4606819152832031, "learning_rate": 2.472063863655727e-08, "loss": 0.013239890336990356, "memory(GiB)": 22.66, "step": 29857, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957481 }, { "epoch": 0.9699509469512393, "grad_norm": 0.3713262677192688, "learning_rate": 2.466731907922737e-08, "loss": 0.009121105074882507, "memory(GiB)": 22.66, "step": 29858, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.957486 }, { "epoch": 0.9699834324139948, "grad_norm": 0.2669099271297455, "learning_rate": 2.4614056943858543e-08, "loss": 0.0077416179701685905, "memory(GiB)": 22.66, "step": 29859, "token_acc": 1.0, "train_speed(iter/s)": 0.957491 }, { "epoch": 0.9700159178767501, "grad_norm": 0.2966982424259186, "learning_rate": 2.4560852231065856e-08, "loss": 0.006950179114937782, "memory(GiB)": 22.66, "step": 29860, "token_acc": 1.0, "train_speed(iter/s)": 0.957496 }, { "epoch": 0.9700484033395056, "grad_norm": 0.24532479047775269, "learning_rate": 2.4507704941463262e-08, "loss": 0.006700878497213125, "memory(GiB)": 22.66, "step": 29861, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957501 }, { "epoch": 0.970080888802261, "grad_norm": 0.24751058220863342, "learning_rate": 2.4454615075664155e-08, "loss": 0.008094938471913338, "memory(GiB)": 22.66, "step": 29862, "token_acc": 1.0, "train_speed(iter/s)": 0.957506 }, { "epoch": 0.9701133742650164, "grad_norm": 0.5703722238540649, "learning_rate": 2.4401582634281384e-08, "loss": 0.01429130882024765, "memory(GiB)": 22.66, "step": 29863, "token_acc": 0.996, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.9701458597277718, "grad_norm": 0.2966557741165161, "learning_rate": 2.4348607617926677e-08, "loss": 0.011579718440771103, "memory(GiB)": 22.66, "step": 29864, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.9701783451905273, "grad_norm": 0.20340608060359955, "learning_rate": 2.4295690027211772e-08, "loss": 0.007681076880544424, "memory(GiB)": 22.66, "step": 29865, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.957509 }, { "epoch": 0.9702108306532826, "grad_norm": 0.5062676668167114, "learning_rate": 2.424282986274673e-08, "loss": 0.010083569213747978, "memory(GiB)": 22.66, "step": 29866, "token_acc": 1.0, "train_speed(iter/s)": 0.957515 }, { "epoch": 0.9702433161160381, "grad_norm": 0.24885901808738708, "learning_rate": 2.4190027125142734e-08, "loss": 0.013729868456721306, "memory(GiB)": 22.66, "step": 29867, "token_acc": 0.996, "train_speed(iter/s)": 0.957522 }, { "epoch": 0.9702758015787935, "grad_norm": 0.37114226818084717, "learning_rate": 2.4137281815008183e-08, "loss": 0.007832104340195656, "memory(GiB)": 22.66, "step": 29868, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.9703082870415489, "grad_norm": 0.4083039462566376, "learning_rate": 2.4084593932952594e-08, "loss": 0.01609315350651741, "memory(GiB)": 22.66, "step": 29869, "token_acc": 1.0, "train_speed(iter/s)": 0.957536 }, { "epoch": 0.9703407725043043, "grad_norm": 0.523041844367981, "learning_rate": 2.4031963479583253e-08, "loss": 0.01879897341132164, "memory(GiB)": 22.66, "step": 29870, "token_acc": 0.9881656804733728, "train_speed(iter/s)": 0.957543 }, { "epoch": 0.9703732579670598, "grad_norm": 0.2688756287097931, "learning_rate": 2.3979390455508565e-08, "loss": 0.004942365922033787, "memory(GiB)": 22.66, "step": 29871, "token_acc": 1.0, "train_speed(iter/s)": 0.95755 }, { "epoch": 0.9704057434298151, "grad_norm": 0.28330478072166443, "learning_rate": 2.3926874861334158e-08, "loss": 0.0054135797545313835, "memory(GiB)": 22.66, "step": 29872, "token_acc": 1.0, "train_speed(iter/s)": 0.957557 }, { "epoch": 0.9704382288925706, "grad_norm": 0.416182279586792, "learning_rate": 2.3874416697667326e-08, "loss": 0.011606279760599136, "memory(GiB)": 22.66, "step": 29873, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.970470714355326, "grad_norm": 0.37769243121147156, "learning_rate": 2.3822015965113133e-08, "loss": 0.012159999459981918, "memory(GiB)": 22.66, "step": 29874, "token_acc": 0.9888475836431226, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.9705031998180814, "grad_norm": 0.2236752063035965, "learning_rate": 2.3769672664275545e-08, "loss": 0.006543394178152084, "memory(GiB)": 22.66, "step": 29875, "token_acc": 0.9959016393442623, "train_speed(iter/s)": 0.957578 }, { "epoch": 0.9705356852808368, "grad_norm": 0.6008189916610718, "learning_rate": 2.3717386795759635e-08, "loss": 0.016691526398062706, "memory(GiB)": 22.66, "step": 29876, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.957585 }, { "epoch": 0.9705681707435923, "grad_norm": 0.45731908082962036, "learning_rate": 2.366515836016825e-08, "loss": 0.01343935914337635, "memory(GiB)": 22.66, "step": 29877, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.9706006562063476, "grad_norm": 0.25174859166145325, "learning_rate": 2.36129873581048e-08, "loss": 0.007717878557741642, "memory(GiB)": 22.66, "step": 29878, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957596 }, { "epoch": 0.9706331416691031, "grad_norm": 0.22415035963058472, "learning_rate": 2.3560873790170468e-08, "loss": 0.00690178107470274, "memory(GiB)": 22.66, "step": 29879, "token_acc": 1.0, "train_speed(iter/s)": 0.957601 }, { "epoch": 0.9706656271318584, "grad_norm": 0.44547033309936523, "learning_rate": 2.3508817656967552e-08, "loss": 0.01184774935245514, "memory(GiB)": 22.66, "step": 29880, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957607 }, { "epoch": 0.9706981125946139, "grad_norm": 0.21220062673091888, "learning_rate": 2.3456818959096684e-08, "loss": 0.008913576602935791, "memory(GiB)": 22.66, "step": 29881, "token_acc": 1.0, "train_speed(iter/s)": 0.957613 }, { "epoch": 0.9707305980573693, "grad_norm": 0.3042915165424347, "learning_rate": 2.3404877697157936e-08, "loss": 0.008604157716035843, "memory(GiB)": 22.66, "step": 29882, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.957618 }, { "epoch": 0.9707630835201247, "grad_norm": 0.8382711410522461, "learning_rate": 2.335299387175083e-08, "loss": 0.010028540156781673, "memory(GiB)": 22.66, "step": 29883, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.957624 }, { "epoch": 0.9707955689828801, "grad_norm": 0.4397757649421692, "learning_rate": 2.3301167483473775e-08, "loss": 0.01579374261200428, "memory(GiB)": 22.66, "step": 29884, "token_acc": 1.0, "train_speed(iter/s)": 0.95763 }, { "epoch": 0.9708280544456356, "grad_norm": 0.2715533971786499, "learning_rate": 2.324939853292518e-08, "loss": 0.009153002873063087, "memory(GiB)": 22.66, "step": 29885, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957635 }, { "epoch": 0.9708605399083909, "grad_norm": 0.2951170802116394, "learning_rate": 2.3197687020702908e-08, "loss": 0.012412762269377708, "memory(GiB)": 22.66, "step": 29886, "token_acc": 1.0, "train_speed(iter/s)": 0.95764 }, { "epoch": 0.9708930253711464, "grad_norm": 0.6452410221099854, "learning_rate": 2.314603294740314e-08, "loss": 0.009630194865167141, "memory(GiB)": 22.66, "step": 29887, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957645 }, { "epoch": 0.9709255108339019, "grad_norm": 0.2596614360809326, "learning_rate": 2.3094436313622626e-08, "loss": 0.007934974506497383, "memory(GiB)": 22.66, "step": 29888, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.95765 }, { "epoch": 0.9709579962966572, "grad_norm": 0.28620320558547974, "learning_rate": 2.3042897119956442e-08, "loss": 0.00832565501332283, "memory(GiB)": 22.66, "step": 29889, "token_acc": 1.0, "train_speed(iter/s)": 0.957655 }, { "epoch": 0.9709904817594127, "grad_norm": 0.3477286994457245, "learning_rate": 2.2991415366999114e-08, "loss": 0.008482519537210464, "memory(GiB)": 22.66, "step": 29890, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957661 }, { "epoch": 0.9710229672221681, "grad_norm": 0.2729186713695526, "learning_rate": 2.2939991055345724e-08, "loss": 0.009368712082505226, "memory(GiB)": 22.66, "step": 29891, "token_acc": 0.9870689655172413, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.9710554526849235, "grad_norm": 0.24733301997184753, "learning_rate": 2.2888624185589124e-08, "loss": 0.008985592052340508, "memory(GiB)": 22.66, "step": 29892, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.957673 }, { "epoch": 0.9710879381476789, "grad_norm": 0.83317631483078, "learning_rate": 2.283731475832274e-08, "loss": 0.017818789929151535, "memory(GiB)": 22.66, "step": 29893, "token_acc": 1.0, "train_speed(iter/s)": 0.957679 }, { "epoch": 0.9711204236104344, "grad_norm": 0.388453871011734, "learning_rate": 2.278606277413775e-08, "loss": 0.016733793541789055, "memory(GiB)": 22.66, "step": 29894, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957684 }, { "epoch": 0.9711529090731897, "grad_norm": 0.38537195324897766, "learning_rate": 2.2734868233626473e-08, "loss": 0.015452965162694454, "memory(GiB)": 22.66, "step": 29895, "token_acc": 1.0, "train_speed(iter/s)": 0.95769 }, { "epoch": 0.9711853945359452, "grad_norm": 0.4835648536682129, "learning_rate": 2.268373113737954e-08, "loss": 0.01170077919960022, "memory(GiB)": 22.66, "step": 29896, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.9712178799987006, "grad_norm": 0.22561942040920258, "learning_rate": 2.2632651485987032e-08, "loss": 0.005840142257511616, "memory(GiB)": 22.66, "step": 29897, "token_acc": 1.0, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.971250365461456, "grad_norm": 0.28859275579452515, "learning_rate": 2.2581629280038487e-08, "loss": 0.011997830122709274, "memory(GiB)": 22.66, "step": 29898, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957705 }, { "epoch": 0.9712828509242114, "grad_norm": 0.42430195212364197, "learning_rate": 2.2530664520123424e-08, "loss": 0.007500829175114632, "memory(GiB)": 22.66, "step": 29899, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.957711 }, { "epoch": 0.9713153363869669, "grad_norm": 0.3313208222389221, "learning_rate": 2.2479757206829156e-08, "loss": 0.00918484851717949, "memory(GiB)": 22.66, "step": 29900, "token_acc": 1.0, "train_speed(iter/s)": 0.957717 }, { "epoch": 0.9713478218497222, "grad_norm": 0.38826853036880493, "learning_rate": 2.2428907340743546e-08, "loss": 0.011940237134695053, "memory(GiB)": 22.66, "step": 29901, "token_acc": 1.0, "train_speed(iter/s)": 0.957723 }, { "epoch": 0.9713803073124777, "grad_norm": 0.3317423462867737, "learning_rate": 2.2378114922453343e-08, "loss": 0.009325744584202766, "memory(GiB)": 22.66, "step": 29902, "token_acc": 1.0, "train_speed(iter/s)": 0.957729 }, { "epoch": 0.9714127927752331, "grad_norm": 0.3633502721786499, "learning_rate": 2.2327379952544748e-08, "loss": 0.01275996956974268, "memory(GiB)": 22.66, "step": 29903, "token_acc": 1.0, "train_speed(iter/s)": 0.957736 }, { "epoch": 0.9714452782379885, "grad_norm": 0.2801934480667114, "learning_rate": 2.2276702431603957e-08, "loss": 0.010421130806207657, "memory(GiB)": 22.66, "step": 29904, "token_acc": 0.9907407407407407, "train_speed(iter/s)": 0.957743 }, { "epoch": 0.9714777637007439, "grad_norm": 0.29253077507019043, "learning_rate": 2.222608236021495e-08, "loss": 0.008096368983387947, "memory(GiB)": 22.66, "step": 29905, "token_acc": 1.0, "train_speed(iter/s)": 0.95775 }, { "epoch": 0.9715102491634994, "grad_norm": 0.379657506942749, "learning_rate": 2.2175519738962814e-08, "loss": 0.010851025581359863, "memory(GiB)": 22.66, "step": 29906, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957756 }, { "epoch": 0.9715427346262547, "grad_norm": 0.5329073667526245, "learning_rate": 2.212501456842986e-08, "loss": 0.013285062275826931, "memory(GiB)": 22.66, "step": 29907, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957761 }, { "epoch": 0.9715752200890102, "grad_norm": 0.32358449697494507, "learning_rate": 2.207456684920062e-08, "loss": 0.008406862616539001, "memory(GiB)": 22.66, "step": 29908, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.957766 }, { "epoch": 0.9716077055517656, "grad_norm": 0.46253177523612976, "learning_rate": 2.202417658185574e-08, "loss": 0.017195899039506912, "memory(GiB)": 22.66, "step": 29909, "token_acc": 0.992, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.971640191014521, "grad_norm": 0.3901321589946747, "learning_rate": 2.1973843766978088e-08, "loss": 0.012973133474588394, "memory(GiB)": 22.66, "step": 29910, "token_acc": 1.0, "train_speed(iter/s)": 0.957777 }, { "epoch": 0.9716726764772764, "grad_norm": 0.31869253516197205, "learning_rate": 2.1923568405147756e-08, "loss": 0.015620987862348557, "memory(GiB)": 22.66, "step": 29911, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957783 }, { "epoch": 0.9717051619400319, "grad_norm": 0.42615586519241333, "learning_rate": 2.187335049694539e-08, "loss": 0.007882515899837017, "memory(GiB)": 22.66, "step": 29912, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.957788 }, { "epoch": 0.9717376474027872, "grad_norm": 0.44041717052459717, "learning_rate": 2.1823190042950527e-08, "loss": 0.018800731748342514, "memory(GiB)": 22.66, "step": 29913, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.957792 }, { "epoch": 0.9717701328655427, "grad_norm": 0.2634650468826294, "learning_rate": 2.1773087043742148e-08, "loss": 0.010372846387326717, "memory(GiB)": 22.66, "step": 29914, "token_acc": 0.9947089947089947, "train_speed(iter/s)": 0.957797 }, { "epoch": 0.9718026183282981, "grad_norm": 0.580948531627655, "learning_rate": 2.172304149989757e-08, "loss": 0.012889106757938862, "memory(GiB)": 22.66, "step": 29915, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957803 }, { "epoch": 0.9718351037910535, "grad_norm": 0.34343698620796204, "learning_rate": 2.167305341199577e-08, "loss": 0.007799660786986351, "memory(GiB)": 22.66, "step": 29916, "token_acc": 1.0, "train_speed(iter/s)": 0.957808 }, { "epoch": 0.9718675892538089, "grad_norm": 0.4200635850429535, "learning_rate": 2.1623122780612958e-08, "loss": 0.009223639965057373, "memory(GiB)": 22.66, "step": 29917, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.957814 }, { "epoch": 0.9719000747165644, "grad_norm": 0.5369284749031067, "learning_rate": 2.1573249606325342e-08, "loss": 0.013955947011709213, "memory(GiB)": 22.66, "step": 29918, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.9719325601793197, "grad_norm": 0.37938904762268066, "learning_rate": 2.1523433889709123e-08, "loss": 0.010214269161224365, "memory(GiB)": 22.66, "step": 29919, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.9719650456420752, "grad_norm": 0.5044911503791809, "learning_rate": 2.1473675631338287e-08, "loss": 0.013484521768987179, "memory(GiB)": 22.66, "step": 29920, "token_acc": 1.0, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.9719975311048306, "grad_norm": 0.3203003406524658, "learning_rate": 2.1423974831788487e-08, "loss": 0.008902190253138542, "memory(GiB)": 22.66, "step": 29921, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.957836 }, { "epoch": 0.972030016567586, "grad_norm": 0.6905205249786377, "learning_rate": 2.1374331491631485e-08, "loss": 0.011960326693952084, "memory(GiB)": 22.66, "step": 29922, "token_acc": 1.0, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.9720625020303414, "grad_norm": 1.0836420059204102, "learning_rate": 2.1324745611441823e-08, "loss": 0.010325955227017403, "memory(GiB)": 22.66, "step": 29923, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957846 }, { "epoch": 0.9720949874930969, "grad_norm": 0.3968401551246643, "learning_rate": 2.127521719179071e-08, "loss": 0.012490173801779747, "memory(GiB)": 22.66, "step": 29924, "token_acc": 1.0, "train_speed(iter/s)": 0.957851 }, { "epoch": 0.9721274729558522, "grad_norm": 0.3408125042915344, "learning_rate": 2.1225746233251022e-08, "loss": 0.012044149450957775, "memory(GiB)": 22.66, "step": 29925, "token_acc": 1.0, "train_speed(iter/s)": 0.957857 }, { "epoch": 0.9721599584186077, "grad_norm": 0.29397761821746826, "learning_rate": 2.1176332736392303e-08, "loss": 0.011730905622243881, "memory(GiB)": 22.66, "step": 29926, "token_acc": 1.0, "train_speed(iter/s)": 0.957863 }, { "epoch": 0.972192443881363, "grad_norm": 0.33427920937538147, "learning_rate": 2.1126976701785763e-08, "loss": 0.009129907935857773, "memory(GiB)": 22.66, "step": 29927, "token_acc": 1.0, "train_speed(iter/s)": 0.957869 }, { "epoch": 0.9722249293441185, "grad_norm": 0.38213595747947693, "learning_rate": 2.107767813000039e-08, "loss": 0.011861812323331833, "memory(GiB)": 22.66, "step": 29928, "token_acc": 1.0, "train_speed(iter/s)": 0.957876 }, { "epoch": 0.9722574148068739, "grad_norm": 0.5279254913330078, "learning_rate": 2.1028437021606284e-08, "loss": 0.016084836795926094, "memory(GiB)": 22.66, "step": 29929, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.957883 }, { "epoch": 0.9722899002696294, "grad_norm": 0.277976393699646, "learning_rate": 2.0979253377170216e-08, "loss": 0.01163024827837944, "memory(GiB)": 22.66, "step": 29930, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957889 }, { "epoch": 0.9723223857323847, "grad_norm": 0.17851462960243225, "learning_rate": 2.0930127197261176e-08, "loss": 0.010350066237151623, "memory(GiB)": 22.66, "step": 29931, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957896 }, { "epoch": 0.9723548711951402, "grad_norm": 0.3621002733707428, "learning_rate": 2.0881058482445372e-08, "loss": 0.021450364962220192, "memory(GiB)": 22.66, "step": 29932, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.957902 }, { "epoch": 0.9723873566578956, "grad_norm": 0.3827182948589325, "learning_rate": 2.0832047233289575e-08, "loss": 0.009407993406057358, "memory(GiB)": 22.66, "step": 29933, "token_acc": 1.0, "train_speed(iter/s)": 0.957909 }, { "epoch": 0.972419842120651, "grad_norm": 0.4289965331554413, "learning_rate": 2.0783093450358893e-08, "loss": 0.008880943059921265, "memory(GiB)": 22.66, "step": 29934, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.957916 }, { "epoch": 0.9724523275834064, "grad_norm": 0.3913942873477936, "learning_rate": 2.073419713421898e-08, "loss": 0.013185524381697178, "memory(GiB)": 22.66, "step": 29935, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.957922 }, { "epoch": 0.9724848130461619, "grad_norm": 0.3316616117954254, "learning_rate": 2.0685358285433276e-08, "loss": 0.010146589018404484, "memory(GiB)": 22.66, "step": 29936, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.957929 }, { "epoch": 0.9725172985089172, "grad_norm": 0.2130119800567627, "learning_rate": 2.063657690456633e-08, "loss": 0.004893543664366007, "memory(GiB)": 22.66, "step": 29937, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957936 }, { "epoch": 0.9725497839716727, "grad_norm": 0.27753981947898865, "learning_rate": 2.0587852992181024e-08, "loss": 0.005262037273496389, "memory(GiB)": 22.66, "step": 29938, "token_acc": 0.9917695473251029, "train_speed(iter/s)": 0.957942 }, { "epoch": 0.972582269434428, "grad_norm": 0.3655698895454407, "learning_rate": 2.053918654883913e-08, "loss": 0.012608474120497704, "memory(GiB)": 22.66, "step": 29939, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.957947 }, { "epoch": 0.9726147548971835, "grad_norm": 0.35479673743247986, "learning_rate": 2.0490577575102978e-08, "loss": 0.010764023289084435, "memory(GiB)": 22.66, "step": 29940, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957953 }, { "epoch": 0.9726472403599389, "grad_norm": 0.2804698646068573, "learning_rate": 2.0442026071532674e-08, "loss": 0.00814332626760006, "memory(GiB)": 22.66, "step": 29941, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957958 }, { "epoch": 0.9726797258226944, "grad_norm": 0.3903229832649231, "learning_rate": 2.039353203868999e-08, "loss": 0.00906204804778099, "memory(GiB)": 22.66, "step": 29942, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.957963 }, { "epoch": 0.9727122112854497, "grad_norm": 0.3750452697277069, "learning_rate": 2.034509547713337e-08, "loss": 0.010866380296647549, "memory(GiB)": 22.66, "step": 29943, "token_acc": 1.0, "train_speed(iter/s)": 0.957968 }, { "epoch": 0.9727446967482052, "grad_norm": 0.2589944303035736, "learning_rate": 2.0296716387422367e-08, "loss": 0.010154541581869125, "memory(GiB)": 22.66, "step": 29944, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957974 }, { "epoch": 0.9727771822109605, "grad_norm": 0.28522780537605286, "learning_rate": 2.0248394770114865e-08, "loss": 0.006972596049308777, "memory(GiB)": 22.66, "step": 29945, "token_acc": 0.9900497512437811, "train_speed(iter/s)": 0.957979 }, { "epoch": 0.972809667673716, "grad_norm": 0.2995077073574066, "learning_rate": 2.020013062576931e-08, "loss": 0.009123132564127445, "memory(GiB)": 22.66, "step": 29946, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957985 }, { "epoch": 0.9728421531364714, "grad_norm": 0.5467513799667358, "learning_rate": 2.015192395494192e-08, "loss": 0.011870680376887321, "memory(GiB)": 22.66, "step": 29947, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.95799 }, { "epoch": 0.9728746385992268, "grad_norm": 0.37603089213371277, "learning_rate": 2.010377475819003e-08, "loss": 0.012643558904528618, "memory(GiB)": 22.66, "step": 29948, "token_acc": 1.0, "train_speed(iter/s)": 0.957995 }, { "epoch": 0.9729071240619822, "grad_norm": 0.43308356404304504, "learning_rate": 2.005568303606875e-08, "loss": 0.009640798904001713, "memory(GiB)": 22.66, "step": 29949, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.958 }, { "epoch": 0.9729396095247377, "grad_norm": 0.2553607225418091, "learning_rate": 2.000764878913264e-08, "loss": 0.010741003789007664, "memory(GiB)": 22.66, "step": 29950, "token_acc": 1.0, "train_speed(iter/s)": 0.958005 }, { "epoch": 0.972972094987493, "grad_norm": 0.2932044565677643, "learning_rate": 1.9959672017937358e-08, "loss": 0.009645042009651661, "memory(GiB)": 22.66, "step": 29951, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.95801 }, { "epoch": 0.9730045804502485, "grad_norm": 0.4364621043205261, "learning_rate": 1.991175272303525e-08, "loss": 0.010134035721421242, "memory(GiB)": 22.66, "step": 29952, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.958016 }, { "epoch": 0.973037065913004, "grad_norm": 0.43866053223609924, "learning_rate": 1.986389090498031e-08, "loss": 0.015816088765859604, "memory(GiB)": 22.66, "step": 29953, "token_acc": 0.9829059829059829, "train_speed(iter/s)": 0.958021 }, { "epoch": 0.9730695513757593, "grad_norm": 0.3035707175731659, "learning_rate": 1.981608656432432e-08, "loss": 0.011949043720960617, "memory(GiB)": 22.66, "step": 29954, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.958026 }, { "epoch": 0.9731020368385148, "grad_norm": 0.2620457410812378, "learning_rate": 1.9768339701619622e-08, "loss": 0.008684237487614155, "memory(GiB)": 22.66, "step": 29955, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.958032 }, { "epoch": 0.9731345223012702, "grad_norm": 0.2780194580554962, "learning_rate": 1.9720650317416878e-08, "loss": 0.006714577786624432, "memory(GiB)": 22.66, "step": 29956, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.958037 }, { "epoch": 0.9731670077640256, "grad_norm": 0.3550984263420105, "learning_rate": 1.9673018412266764e-08, "loss": 0.011672061868011951, "memory(GiB)": 22.66, "step": 29957, "token_acc": 1.0, "train_speed(iter/s)": 0.958042 }, { "epoch": 0.973199493226781, "grad_norm": 0.3986903131008148, "learning_rate": 1.9625443986718283e-08, "loss": 0.013294199481606483, "memory(GiB)": 22.66, "step": 29958, "token_acc": 1.0, "train_speed(iter/s)": 0.958047 }, { "epoch": 0.9732319786895365, "grad_norm": 0.3088392913341522, "learning_rate": 1.9577927041321554e-08, "loss": 0.008880902081727982, "memory(GiB)": 22.66, "step": 29959, "token_acc": 1.0, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.9732644641522918, "grad_norm": 0.45462334156036377, "learning_rate": 1.953046757662447e-08, "loss": 0.01190081238746643, "memory(GiB)": 22.66, "step": 29960, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958057 }, { "epoch": 0.9732969496150473, "grad_norm": 0.48140671849250793, "learning_rate": 1.9483065593174365e-08, "loss": 0.01711646094918251, "memory(GiB)": 22.66, "step": 29961, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.9733294350778027, "grad_norm": 0.2730712890625, "learning_rate": 1.9435721091519144e-08, "loss": 0.009929344989359379, "memory(GiB)": 22.66, "step": 29962, "token_acc": 1.0, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.9733619205405581, "grad_norm": 0.2520212233066559, "learning_rate": 1.9388434072204477e-08, "loss": 0.01650448516011238, "memory(GiB)": 22.66, "step": 29963, "token_acc": 0.9923371647509579, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.9733944060033135, "grad_norm": 0.4072115421295166, "learning_rate": 1.9341204535776592e-08, "loss": 0.008176569826900959, "memory(GiB)": 22.66, "step": 29964, "token_acc": 1.0, "train_speed(iter/s)": 0.95808 }, { "epoch": 0.973426891466069, "grad_norm": 0.42226389050483704, "learning_rate": 1.929403248278061e-08, "loss": 0.012177521362900734, "memory(GiB)": 22.66, "step": 29965, "token_acc": 1.0, "train_speed(iter/s)": 0.958086 }, { "epoch": 0.9734593769288243, "grad_norm": 0.3265238106250763, "learning_rate": 1.9246917913760542e-08, "loss": 0.009860867634415627, "memory(GiB)": 22.66, "step": 29966, "token_acc": 1.0, "train_speed(iter/s)": 0.958091 }, { "epoch": 0.9734918623915798, "grad_norm": 0.31642407178878784, "learning_rate": 1.9199860829260398e-08, "loss": 0.00874263234436512, "memory(GiB)": 22.66, "step": 29967, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.958096 }, { "epoch": 0.9735243478543352, "grad_norm": 0.3278580904006958, "learning_rate": 1.9152861229823628e-08, "loss": 0.012990412302315235, "memory(GiB)": 22.66, "step": 29968, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.958101 }, { "epoch": 0.9735568333170906, "grad_norm": 0.3493106961250305, "learning_rate": 1.9105919115992023e-08, "loss": 0.010293733328580856, "memory(GiB)": 22.66, "step": 29969, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958107 }, { "epoch": 0.973589318779846, "grad_norm": 0.3228035569190979, "learning_rate": 1.9059034488307926e-08, "loss": 0.011246902868151665, "memory(GiB)": 22.66, "step": 29970, "token_acc": 0.9912280701754386, "train_speed(iter/s)": 0.958112 }, { "epoch": 0.9736218042426015, "grad_norm": 0.3042132556438446, "learning_rate": 1.9012207347312018e-08, "loss": 0.007726582698523998, "memory(GiB)": 22.66, "step": 29971, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.9736542897053568, "grad_norm": 0.2771912217140198, "learning_rate": 1.8965437693544974e-08, "loss": 0.008972867392003536, "memory(GiB)": 22.66, "step": 29972, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.958121 }, { "epoch": 0.9736867751681123, "grad_norm": 0.26504307985305786, "learning_rate": 1.8918725527546367e-08, "loss": 0.007342160679399967, "memory(GiB)": 22.66, "step": 29973, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.958126 }, { "epoch": 0.9737192606308677, "grad_norm": 0.6151122450828552, "learning_rate": 1.887207084985576e-08, "loss": 0.013318509794771671, "memory(GiB)": 22.66, "step": 29974, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.958131 }, { "epoch": 0.9737517460936231, "grad_norm": 0.32023265957832336, "learning_rate": 1.8825473661011062e-08, "loss": 0.009127350524067879, "memory(GiB)": 22.66, "step": 29975, "token_acc": 1.0, "train_speed(iter/s)": 0.95813 }, { "epoch": 0.9737842315563785, "grad_norm": 0.2716422975063324, "learning_rate": 1.877893396155017e-08, "loss": 0.00907333754003048, "memory(GiB)": 22.66, "step": 29976, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.958135 }, { "epoch": 0.973816717019134, "grad_norm": 0.259092777967453, "learning_rate": 1.8732451752010995e-08, "loss": 0.008501555770635605, "memory(GiB)": 22.66, "step": 29977, "token_acc": 1.0, "train_speed(iter/s)": 0.95814 }, { "epoch": 0.9738492024818893, "grad_norm": 0.45881015062332153, "learning_rate": 1.8686027032928655e-08, "loss": 0.011663307435810566, "memory(GiB)": 22.66, "step": 29978, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958145 }, { "epoch": 0.9738816879446448, "grad_norm": 0.4416128993034363, "learning_rate": 1.863965980483995e-08, "loss": 0.010357079096138477, "memory(GiB)": 22.66, "step": 29979, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.95815 }, { "epoch": 0.9739141734074002, "grad_norm": 0.295284628868103, "learning_rate": 1.8593350068280003e-08, "loss": 0.00927705317735672, "memory(GiB)": 22.66, "step": 29980, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.958155 }, { "epoch": 0.9739466588701556, "grad_norm": 0.40016573667526245, "learning_rate": 1.8547097823782834e-08, "loss": 0.017134007066488266, "memory(GiB)": 22.66, "step": 29981, "token_acc": 0.9848484848484849, "train_speed(iter/s)": 0.958159 }, { "epoch": 0.973979144332911, "grad_norm": 0.35202401876449585, "learning_rate": 1.850090307188246e-08, "loss": 0.011238062754273415, "memory(GiB)": 22.66, "step": 29982, "token_acc": 1.0, "train_speed(iter/s)": 0.958164 }, { "epoch": 0.9740116297956665, "grad_norm": 0.3276729881763458, "learning_rate": 1.8454765813111786e-08, "loss": 0.010922839865088463, "memory(GiB)": 22.66, "step": 29983, "token_acc": 1.0, "train_speed(iter/s)": 0.958169 }, { "epoch": 0.9740441152584218, "grad_norm": 0.3398585915565491, "learning_rate": 1.840868604800372e-08, "loss": 0.009629967622458935, "memory(GiB)": 22.66, "step": 29984, "token_acc": 1.0, "train_speed(iter/s)": 0.958174 }, { "epoch": 0.9740766007211773, "grad_norm": 0.2563426196575165, "learning_rate": 1.8362663777090617e-08, "loss": 0.012905566021800041, "memory(GiB)": 22.66, "step": 29985, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.958181 }, { "epoch": 0.9741090861839327, "grad_norm": 0.47989922761917114, "learning_rate": 1.8316699000902048e-08, "loss": 0.01663355715572834, "memory(GiB)": 22.66, "step": 29986, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.958188 }, { "epoch": 0.9741415716466881, "grad_norm": 0.3761308789253235, "learning_rate": 1.8270791719969262e-08, "loss": 0.014230184257030487, "memory(GiB)": 22.66, "step": 29987, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.958194 }, { "epoch": 0.9741740571094435, "grad_norm": 0.3787883222103119, "learning_rate": 1.8224941934822936e-08, "loss": 0.008584227412939072, "memory(GiB)": 22.66, "step": 29988, "token_acc": 1.0, "train_speed(iter/s)": 0.958201 }, { "epoch": 0.974206542572199, "grad_norm": 0.34879830479621887, "learning_rate": 1.817914964599099e-08, "loss": 0.012804536148905754, "memory(GiB)": 22.66, "step": 29989, "token_acc": 1.0, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.9742390280349543, "grad_norm": 0.3033252954483032, "learning_rate": 1.813341485400244e-08, "loss": 0.00982588343322277, "memory(GiB)": 22.66, "step": 29990, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.958215 }, { "epoch": 0.9742715134977098, "grad_norm": 0.3322489261627197, "learning_rate": 1.80877375593852e-08, "loss": 0.011960407719016075, "memory(GiB)": 22.66, "step": 29991, "token_acc": 1.0, "train_speed(iter/s)": 0.958222 }, { "epoch": 0.9743039989604652, "grad_norm": 0.3842179477214813, "learning_rate": 1.804211776266662e-08, "loss": 0.011380326002836227, "memory(GiB)": 22.66, "step": 29992, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.958229 }, { "epoch": 0.9743364844232206, "grad_norm": 0.2936321496963501, "learning_rate": 1.7996555464372955e-08, "loss": 0.011735532432794571, "memory(GiB)": 22.66, "step": 29993, "token_acc": 1.0, "train_speed(iter/s)": 0.958235 }, { "epoch": 0.974368969885976, "grad_norm": 0.177494615316391, "learning_rate": 1.795105066502989e-08, "loss": 0.006143943872302771, "memory(GiB)": 22.66, "step": 29994, "token_acc": 1.0, "train_speed(iter/s)": 0.958242 }, { "epoch": 0.9744014553487315, "grad_norm": 0.2810991704463959, "learning_rate": 1.790560336516256e-08, "loss": 0.009027830325067043, "memory(GiB)": 22.66, "step": 29995, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.958249 }, { "epoch": 0.9744339408114868, "grad_norm": 0.29408568143844604, "learning_rate": 1.7860213565296102e-08, "loss": 0.005906352773308754, "memory(GiB)": 22.66, "step": 29996, "token_acc": 1.0, "train_speed(iter/s)": 0.958256 }, { "epoch": 0.9744664262742423, "grad_norm": 0.2800525724887848, "learning_rate": 1.7814881265953433e-08, "loss": 0.007951869629323483, "memory(GiB)": 22.66, "step": 29997, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.958262 }, { "epoch": 0.9744989117369977, "grad_norm": 0.2686308026313782, "learning_rate": 1.776960646765913e-08, "loss": 0.01038382202386856, "memory(GiB)": 22.66, "step": 29998, "token_acc": 1.0, "train_speed(iter/s)": 0.958269 }, { "epoch": 0.9745313971997531, "grad_norm": 0.5301390886306763, "learning_rate": 1.772438917093444e-08, "loss": 0.011580515652894974, "memory(GiB)": 22.66, "step": 29999, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958275 }, { "epoch": 0.9745638826625085, "grad_norm": 0.2745630443096161, "learning_rate": 1.767922937630173e-08, "loss": 0.008780662901699543, "memory(GiB)": 22.66, "step": 30000, "token_acc": 0.9967532467532467, "train_speed(iter/s)": 0.958281 }, { "epoch": 0.9745638826625085, "eval_loss": 0.011103212833404541, "eval_runtime": 81.1592, "eval_samples_per_second": 122.598, "eval_steps_per_second": 3.832, "eval_token_acc": 0.9955501555643649, "step": 30000 }, { "epoch": 0.974596368125264, "grad_norm": 0.29766222834587097, "learning_rate": 1.7634127084282248e-08, "loss": 0.009176626801490784, "memory(GiB)": 22.66, "step": 30001, "token_acc": 0.9951649796600419, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.9746288535880193, "grad_norm": 0.34085792303085327, "learning_rate": 1.7589082295396132e-08, "loss": 0.01184902898967266, "memory(GiB)": 22.66, "step": 30002, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.9746613390507748, "grad_norm": 0.2866804003715515, "learning_rate": 1.7544095010164076e-08, "loss": 0.009979608468711376, "memory(GiB)": 22.66, "step": 30003, "token_acc": 1.0, "train_speed(iter/s)": 0.955464 }, { "epoch": 0.9746938245135301, "grad_norm": 0.4873966574668884, "learning_rate": 1.7499165229104555e-08, "loss": 0.011306371539831161, "memory(GiB)": 22.66, "step": 30004, "token_acc": 1.0, "train_speed(iter/s)": 0.955469 }, { "epoch": 0.9747263099762856, "grad_norm": 0.3219059407711029, "learning_rate": 1.7454292952736597e-08, "loss": 0.010167403146624565, "memory(GiB)": 22.66, "step": 30005, "token_acc": 1.0, "train_speed(iter/s)": 0.955474 }, { "epoch": 0.974758795439041, "grad_norm": 0.34488776326179504, "learning_rate": 1.7409478181577567e-08, "loss": 0.010182520374655724, "memory(GiB)": 22.66, "step": 30006, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955479 }, { "epoch": 0.9747912809017965, "grad_norm": 0.43540289998054504, "learning_rate": 1.7364720916144828e-08, "loss": 0.012588558718562126, "memory(GiB)": 22.66, "step": 30007, "token_acc": 1.0, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.9748237663645518, "grad_norm": 0.36250075697898865, "learning_rate": 1.732002115695519e-08, "loss": 0.008216070011258125, "memory(GiB)": 22.66, "step": 30008, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955488 }, { "epoch": 0.9748562518273073, "grad_norm": 0.3008449375629425, "learning_rate": 1.7275378904524907e-08, "loss": 0.009496606886386871, "memory(GiB)": 22.66, "step": 30009, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.955493 }, { "epoch": 0.9748887372900626, "grad_norm": 0.34091871976852417, "learning_rate": 1.7230794159368013e-08, "loss": 0.00927108433097601, "memory(GiB)": 22.66, "step": 30010, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.9749212227528181, "grad_norm": 0.38501712679862976, "learning_rate": 1.7186266922000205e-08, "loss": 0.011310606263577938, "memory(GiB)": 22.66, "step": 30011, "token_acc": 1.0, "train_speed(iter/s)": 0.955502 }, { "epoch": 0.9749537082155735, "grad_norm": 0.39909008145332336, "learning_rate": 1.714179719293496e-08, "loss": 0.011504449881613255, "memory(GiB)": 22.66, "step": 30012, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955506 }, { "epoch": 0.974986193678329, "grad_norm": 0.39469823241233826, "learning_rate": 1.709738497268576e-08, "loss": 0.01264990121126175, "memory(GiB)": 22.66, "step": 30013, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.9750186791410843, "grad_norm": 0.3514942228794098, "learning_rate": 1.7053030261764415e-08, "loss": 0.00955876987427473, "memory(GiB)": 22.66, "step": 30014, "token_acc": 1.0, "train_speed(iter/s)": 0.955514 }, { "epoch": 0.9750511646038398, "grad_norm": 0.28499090671539307, "learning_rate": 1.7008733060683847e-08, "loss": 0.008776840753853321, "memory(GiB)": 22.66, "step": 30015, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955519 }, { "epoch": 0.9750836500665953, "grad_norm": 0.2594071924686432, "learning_rate": 1.696449336995476e-08, "loss": 0.006212931126356125, "memory(GiB)": 22.66, "step": 30016, "token_acc": 1.0, "train_speed(iter/s)": 0.955523 }, { "epoch": 0.9751161355293506, "grad_norm": 0.2678103446960449, "learning_rate": 1.6920311190087857e-08, "loss": 0.01004899200052023, "memory(GiB)": 22.66, "step": 30017, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.95552 }, { "epoch": 0.9751486209921061, "grad_norm": 0.45835304260253906, "learning_rate": 1.687618652159273e-08, "loss": 0.012572143226861954, "memory(GiB)": 22.66, "step": 30018, "token_acc": 1.0, "train_speed(iter/s)": 0.955524 }, { "epoch": 0.9751811064548614, "grad_norm": 0.30785858631134033, "learning_rate": 1.6832119364978973e-08, "loss": 0.010607858188450336, "memory(GiB)": 22.66, "step": 30019, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955529 }, { "epoch": 0.9752135919176169, "grad_norm": 0.42671266198158264, "learning_rate": 1.6788109720755617e-08, "loss": 0.01165239978581667, "memory(GiB)": 22.66, "step": 30020, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.955533 }, { "epoch": 0.9752460773803723, "grad_norm": 0.3232659697532654, "learning_rate": 1.6744157589430043e-08, "loss": 0.008402775973081589, "memory(GiB)": 22.66, "step": 30021, "token_acc": 0.9963898916967509, "train_speed(iter/s)": 0.955539 }, { "epoch": 0.9752785628431277, "grad_norm": 0.2829335033893585, "learning_rate": 1.6700262971509063e-08, "loss": 0.012042293325066566, "memory(GiB)": 22.66, "step": 30022, "token_acc": 1.0, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.9753110483058831, "grad_norm": 0.3842097818851471, "learning_rate": 1.665642586750005e-08, "loss": 0.011358840391039848, "memory(GiB)": 22.66, "step": 30023, "token_acc": 0.9851485148514851, "train_speed(iter/s)": 0.95555 }, { "epoch": 0.9753435337686386, "grad_norm": 0.34724515676498413, "learning_rate": 1.6612646277908705e-08, "loss": 0.008589770644903183, "memory(GiB)": 22.66, "step": 30024, "token_acc": 1.0, "train_speed(iter/s)": 0.955555 }, { "epoch": 0.9753760192313939, "grad_norm": 0.4820452332496643, "learning_rate": 1.6568924203240745e-08, "loss": 0.009388969279825687, "memory(GiB)": 22.66, "step": 30025, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.95556 }, { "epoch": 0.9754085046941494, "grad_norm": 0.27824220061302185, "learning_rate": 1.6525259643999647e-08, "loss": 0.007861146703362465, "memory(GiB)": 22.66, "step": 30026, "token_acc": 0.9946808510638298, "train_speed(iter/s)": 0.955566 }, { "epoch": 0.9754409901569048, "grad_norm": 0.3454623222351074, "learning_rate": 1.648165260069001e-08, "loss": 0.0075751496478915215, "memory(GiB)": 22.66, "step": 30027, "token_acc": 0.9966777408637874, "train_speed(iter/s)": 0.955572 }, { "epoch": 0.9754734756196602, "grad_norm": 0.47976091504096985, "learning_rate": 1.6438103073815882e-08, "loss": 0.013952568173408508, "memory(GiB)": 22.66, "step": 30028, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955577 }, { "epoch": 0.9755059610824156, "grad_norm": 0.38982245326042175, "learning_rate": 1.6394611063878517e-08, "loss": 0.017198314890265465, "memory(GiB)": 22.66, "step": 30029, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.955582 }, { "epoch": 0.9755384465451711, "grad_norm": 0.4087996780872345, "learning_rate": 1.6351176571380856e-08, "loss": 0.0184723399579525, "memory(GiB)": 22.66, "step": 30030, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.955587 }, { "epoch": 0.9755709320079264, "grad_norm": 0.26211902499198914, "learning_rate": 1.6307799596823605e-08, "loss": 0.008266087621450424, "memory(GiB)": 22.66, "step": 30031, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955592 }, { "epoch": 0.9756034174706819, "grad_norm": 0.32373878359794617, "learning_rate": 1.6264480140708028e-08, "loss": 0.008580727502703667, "memory(GiB)": 22.66, "step": 30032, "token_acc": 0.9964788732394366, "train_speed(iter/s)": 0.955597 }, { "epoch": 0.9756359029334373, "grad_norm": 0.3929744064807892, "learning_rate": 1.6221218203533173e-08, "loss": 0.012030771002173424, "memory(GiB)": 22.66, "step": 30033, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.955603 }, { "epoch": 0.9756683883961927, "grad_norm": 0.3394167423248291, "learning_rate": 1.6178013785799197e-08, "loss": 0.008181956596672535, "memory(GiB)": 22.66, "step": 30034, "token_acc": 0.9893617021276596, "train_speed(iter/s)": 0.955609 }, { "epoch": 0.9757008738589481, "grad_norm": 0.4138880968093872, "learning_rate": 1.613486688800403e-08, "loss": 0.010995179414749146, "memory(GiB)": 22.66, "step": 30035, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.955616 }, { "epoch": 0.9757333593217036, "grad_norm": 0.2374146282672882, "learning_rate": 1.6091777510646168e-08, "loss": 0.005327296443283558, "memory(GiB)": 22.66, "step": 30036, "token_acc": 1.0, "train_speed(iter/s)": 0.955623 }, { "epoch": 0.9757658447844589, "grad_norm": 0.3513893783092499, "learning_rate": 1.6048745654222985e-08, "loss": 0.009205836802721024, "memory(GiB)": 22.66, "step": 30037, "token_acc": 1.0, "train_speed(iter/s)": 0.955629 }, { "epoch": 0.9757983302472144, "grad_norm": 0.4489890933036804, "learning_rate": 1.6005771319230758e-08, "loss": 0.016914943233132362, "memory(GiB)": 22.66, "step": 30038, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.955637 }, { "epoch": 0.9758308157099698, "grad_norm": 0.37090790271759033, "learning_rate": 1.596285450616575e-08, "loss": 0.012940446846187115, "memory(GiB)": 22.66, "step": 30039, "token_acc": 0.984251968503937, "train_speed(iter/s)": 0.955643 }, { "epoch": 0.9758633011727252, "grad_norm": 0.31479886174201965, "learning_rate": 1.591999521552312e-08, "loss": 0.009030054323375225, "memory(GiB)": 22.66, "step": 30040, "token_acc": 0.9861111111111112, "train_speed(iter/s)": 0.95565 }, { "epoch": 0.9758957866354806, "grad_norm": 0.3419613242149353, "learning_rate": 1.5877193447796924e-08, "loss": 0.004970600828528404, "memory(GiB)": 22.66, "step": 30041, "token_acc": 1.0, "train_speed(iter/s)": 0.955657 }, { "epoch": 0.9759282720982361, "grad_norm": 0.3607215881347656, "learning_rate": 1.5834449203482317e-08, "loss": 0.011509924195706844, "memory(GiB)": 22.66, "step": 30042, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.955664 }, { "epoch": 0.9759607575609914, "grad_norm": 0.4431201219558716, "learning_rate": 1.5791762483072237e-08, "loss": 0.010738825425505638, "memory(GiB)": 22.66, "step": 30043, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.95567 }, { "epoch": 0.9759932430237469, "grad_norm": 0.2827049493789673, "learning_rate": 1.5749133287058517e-08, "loss": 0.008819391950964928, "memory(GiB)": 22.66, "step": 30044, "token_acc": 1.0, "train_speed(iter/s)": 0.955678 }, { "epoch": 0.9760257284865023, "grad_norm": 0.3962865173816681, "learning_rate": 1.5706561615934644e-08, "loss": 0.012137141078710556, "memory(GiB)": 22.66, "step": 30045, "token_acc": 1.0, "train_speed(iter/s)": 0.955684 }, { "epoch": 0.9760582139492577, "grad_norm": 0.39148256182670593, "learning_rate": 1.566404747019079e-08, "loss": 0.014378473162651062, "memory(GiB)": 22.66, "step": 30046, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.955691 }, { "epoch": 0.9760906994120131, "grad_norm": 0.25751444697380066, "learning_rate": 1.562159085031767e-08, "loss": 0.011601109057664871, "memory(GiB)": 22.66, "step": 30047, "token_acc": 1.0, "train_speed(iter/s)": 0.955699 }, { "epoch": 0.9761231848747686, "grad_norm": 1.6038001775741577, "learning_rate": 1.5579191756806e-08, "loss": 0.013155538588762283, "memory(GiB)": 22.66, "step": 30048, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.955705 }, { "epoch": 0.9761556703375239, "grad_norm": 0.3376538157463074, "learning_rate": 1.5536850190144283e-08, "loss": 0.012944681569933891, "memory(GiB)": 22.66, "step": 30049, "token_acc": 0.9885496183206107, "train_speed(iter/s)": 0.955712 }, { "epoch": 0.9761881558002794, "grad_norm": 0.42156144976615906, "learning_rate": 1.5494566150822122e-08, "loss": 0.013826870359480381, "memory(GiB)": 22.66, "step": 30050, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.955719 }, { "epoch": 0.9762206412630348, "grad_norm": 0.30150729417800903, "learning_rate": 1.5452339639326907e-08, "loss": 0.009390903636813164, "memory(GiB)": 22.66, "step": 30051, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.955726 }, { "epoch": 0.9762531267257902, "grad_norm": 0.29831939935684204, "learning_rate": 1.5410170656145475e-08, "loss": 0.010774275287985802, "memory(GiB)": 22.66, "step": 30052, "token_acc": 1.0, "train_speed(iter/s)": 0.955733 }, { "epoch": 0.9762856121885456, "grad_norm": 0.4483262002468109, "learning_rate": 1.5368059201766317e-08, "loss": 0.014100377447903156, "memory(GiB)": 22.66, "step": 30053, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95574 }, { "epoch": 0.9763180976513011, "grad_norm": 0.259504497051239, "learning_rate": 1.5326005276673495e-08, "loss": 0.012611367739737034, "memory(GiB)": 22.66, "step": 30054, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.955746 }, { "epoch": 0.9763505831140564, "grad_norm": 0.3018184304237366, "learning_rate": 1.5284008881353285e-08, "loss": 0.010546895675361156, "memory(GiB)": 22.66, "step": 30055, "token_acc": 1.0, "train_speed(iter/s)": 0.955753 }, { "epoch": 0.9763830685768119, "grad_norm": 0.3866576552391052, "learning_rate": 1.52420700162903e-08, "loss": 0.01114216260612011, "memory(GiB)": 22.66, "step": 30056, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.95576 }, { "epoch": 0.9764155540395673, "grad_norm": 0.23304933309555054, "learning_rate": 1.520018868196804e-08, "loss": 0.0072998604737222195, "memory(GiB)": 22.66, "step": 30057, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.9764480395023227, "grad_norm": 0.37261703610420227, "learning_rate": 1.5158364878871125e-08, "loss": 0.011292300187051296, "memory(GiB)": 22.66, "step": 30058, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.9764805249650781, "grad_norm": 0.37587350606918335, "learning_rate": 1.5116598607480826e-08, "loss": 0.010746153071522713, "memory(GiB)": 22.66, "step": 30059, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955781 }, { "epoch": 0.9765130104278336, "grad_norm": 0.41732150316238403, "learning_rate": 1.507488986828065e-08, "loss": 0.014687050133943558, "memory(GiB)": 22.66, "step": 30060, "token_acc": 1.0, "train_speed(iter/s)": 0.955786 }, { "epoch": 0.9765454958905889, "grad_norm": 0.3134777843952179, "learning_rate": 1.503323866175077e-08, "loss": 0.007333528250455856, "memory(GiB)": 22.66, "step": 30061, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.955792 }, { "epoch": 0.9765779813533444, "grad_norm": 0.36628198623657227, "learning_rate": 1.499164498837191e-08, "loss": 0.014079985208809376, "memory(GiB)": 22.66, "step": 30062, "token_acc": 1.0, "train_speed(iter/s)": 0.955797 }, { "epoch": 0.9766104668160998, "grad_norm": 0.36856600642204285, "learning_rate": 1.49501088486248e-08, "loss": 0.01337522268295288, "memory(GiB)": 22.66, "step": 30063, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955802 }, { "epoch": 0.9766429522788552, "grad_norm": 0.3751530647277832, "learning_rate": 1.4908630242988496e-08, "loss": 0.010310966521501541, "memory(GiB)": 22.66, "step": 30064, "token_acc": 0.9948979591836735, "train_speed(iter/s)": 0.955807 }, { "epoch": 0.9766754377416106, "grad_norm": 0.38610443472862244, "learning_rate": 1.4867209171942066e-08, "loss": 0.016102196648716927, "memory(GiB)": 22.66, "step": 30065, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.9767079232043661, "grad_norm": 0.42537444829940796, "learning_rate": 1.4825845635962898e-08, "loss": 0.008907221257686615, "memory(GiB)": 22.66, "step": 30066, "token_acc": 1.0, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.9767404086671214, "grad_norm": 0.26937586069107056, "learning_rate": 1.4784539635528395e-08, "loss": 0.008645998314023018, "memory(GiB)": 22.66, "step": 30067, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.955823 }, { "epoch": 0.9767728941298769, "grad_norm": 0.3734249174594879, "learning_rate": 1.4743291171116503e-08, "loss": 0.019014619290828705, "memory(GiB)": 22.66, "step": 30068, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955828 }, { "epoch": 0.9768053795926323, "grad_norm": 0.243567556142807, "learning_rate": 1.4702100243201844e-08, "loss": 0.005716770887374878, "memory(GiB)": 22.66, "step": 30069, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955833 }, { "epoch": 0.9768378650553877, "grad_norm": 0.2986193299293518, "learning_rate": 1.466096685226015e-08, "loss": 0.01247607171535492, "memory(GiB)": 22.66, "step": 30070, "token_acc": 0.9922178988326849, "train_speed(iter/s)": 0.955838 }, { "epoch": 0.9768703505181431, "grad_norm": 0.3019517660140991, "learning_rate": 1.4619890998766596e-08, "loss": 0.008429883047938347, "memory(GiB)": 22.66, "step": 30071, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.955843 }, { "epoch": 0.9769028359808986, "grad_norm": 0.39216378331184387, "learning_rate": 1.4578872683195244e-08, "loss": 0.014528412371873856, "memory(GiB)": 22.66, "step": 30072, "token_acc": 1.0, "train_speed(iter/s)": 0.955848 }, { "epoch": 0.9769353214436539, "grad_norm": 0.37018200755119324, "learning_rate": 1.4537911906019053e-08, "loss": 0.011190077289938927, "memory(GiB)": 22.66, "step": 30073, "token_acc": 0.9873949579831933, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.9769678069064094, "grad_norm": 0.4574962556362152, "learning_rate": 1.4497008667710977e-08, "loss": 0.012276380322873592, "memory(GiB)": 22.66, "step": 30074, "token_acc": 0.9965277777777778, "train_speed(iter/s)": 0.955858 }, { "epoch": 0.9770002923691647, "grad_norm": 0.3259139955043793, "learning_rate": 1.445616296874286e-08, "loss": 0.012681877240538597, "memory(GiB)": 22.66, "step": 30075, "token_acc": 1.0, "train_speed(iter/s)": 0.955862 }, { "epoch": 0.9770327778319202, "grad_norm": 0.27454325556755066, "learning_rate": 1.4415374809587102e-08, "loss": 0.007542305160313845, "memory(GiB)": 22.66, "step": 30076, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.955867 }, { "epoch": 0.9770652632946756, "grad_norm": 0.2780541479587555, "learning_rate": 1.4374644190713326e-08, "loss": 0.009285207837820053, "memory(GiB)": 22.66, "step": 30077, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955872 }, { "epoch": 0.977097748757431, "grad_norm": 0.32003170251846313, "learning_rate": 1.4333971112591716e-08, "loss": 0.008823683485388756, "memory(GiB)": 22.66, "step": 30078, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955876 }, { "epoch": 0.9771302342201864, "grad_norm": 0.3019410967826843, "learning_rate": 1.4293355575692448e-08, "loss": 0.009282202459871769, "memory(GiB)": 22.66, "step": 30079, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.955881 }, { "epoch": 0.9771627196829419, "grad_norm": 0.2994062006473541, "learning_rate": 1.425279758048348e-08, "loss": 0.010007205419242382, "memory(GiB)": 22.66, "step": 30080, "token_acc": 1.0, "train_speed(iter/s)": 0.955887 }, { "epoch": 0.9771952051456974, "grad_norm": 0.44309377670288086, "learning_rate": 1.4212297127433328e-08, "loss": 0.015679392963647842, "memory(GiB)": 22.66, "step": 30081, "token_acc": 0.991869918699187, "train_speed(iter/s)": 0.955893 }, { "epoch": 0.9772276906084527, "grad_norm": 0.40077659487724304, "learning_rate": 1.4171854217009395e-08, "loss": 0.009905161336064339, "memory(GiB)": 22.66, "step": 30082, "token_acc": 1.0, "train_speed(iter/s)": 0.955898 }, { "epoch": 0.9772601760712082, "grad_norm": 0.3339744806289673, "learning_rate": 1.4131468849678532e-08, "loss": 0.013343403115868568, "memory(GiB)": 22.66, "step": 30083, "token_acc": 0.9888888888888889, "train_speed(iter/s)": 0.955902 }, { "epoch": 0.9772926615339635, "grad_norm": 0.36225104331970215, "learning_rate": 1.4091141025906473e-08, "loss": 0.009968515485525131, "memory(GiB)": 22.66, "step": 30084, "token_acc": 1.0, "train_speed(iter/s)": 0.955908 }, { "epoch": 0.977325146996719, "grad_norm": 0.3843286633491516, "learning_rate": 1.4050870746158963e-08, "loss": 0.01129424199461937, "memory(GiB)": 22.66, "step": 30085, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.9773576324594744, "grad_norm": 0.3845386207103729, "learning_rate": 1.4010658010900625e-08, "loss": 0.012633264996111393, "memory(GiB)": 22.66, "step": 30086, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.955918 }, { "epoch": 0.9773901179222299, "grad_norm": 0.46490728855133057, "learning_rate": 1.3970502820595533e-08, "loss": 0.015109740197658539, "memory(GiB)": 22.66, "step": 30087, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.955923 }, { "epoch": 0.9774226033849852, "grad_norm": 0.2473745048046112, "learning_rate": 1.3930405175707207e-08, "loss": 0.006618412211537361, "memory(GiB)": 22.66, "step": 30088, "token_acc": 1.0, "train_speed(iter/s)": 0.955929 }, { "epoch": 0.9774550888477407, "grad_norm": 0.32814741134643555, "learning_rate": 1.3890365076698608e-08, "loss": 0.012575821951031685, "memory(GiB)": 22.66, "step": 30089, "token_acc": 0.9915611814345991, "train_speed(iter/s)": 0.955934 }, { "epoch": 0.977487574310496, "grad_norm": 0.37106096744537354, "learning_rate": 1.3850382524032147e-08, "loss": 0.012384368106722832, "memory(GiB)": 22.66, "step": 30090, "token_acc": 0.984313725490196, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.9775200597732515, "grad_norm": 0.3113654851913452, "learning_rate": 1.3810457518168008e-08, "loss": 0.01400048192590475, "memory(GiB)": 22.66, "step": 30091, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.955946 }, { "epoch": 0.9775525452360069, "grad_norm": 0.30577751994132996, "learning_rate": 1.3770590059568045e-08, "loss": 0.008496114984154701, "memory(GiB)": 22.66, "step": 30092, "token_acc": 1.0, "train_speed(iter/s)": 0.95595 }, { "epoch": 0.9775850306987623, "grad_norm": 0.3708048462867737, "learning_rate": 1.3730780148692447e-08, "loss": 0.009793358854949474, "memory(GiB)": 22.66, "step": 30093, "token_acc": 0.9884615384615385, "train_speed(iter/s)": 0.955955 }, { "epoch": 0.9776175161615177, "grad_norm": 0.3859648108482361, "learning_rate": 1.3691027786000288e-08, "loss": 0.010985728353261948, "memory(GiB)": 22.66, "step": 30094, "token_acc": 1.0, "train_speed(iter/s)": 0.95596 }, { "epoch": 0.9776500016242732, "grad_norm": 0.3610292673110962, "learning_rate": 1.3651332971950093e-08, "loss": 0.017937490716576576, "memory(GiB)": 22.66, "step": 30095, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955966 }, { "epoch": 0.9776824870870285, "grad_norm": 0.34966152906417847, "learning_rate": 1.3611695707000384e-08, "loss": 0.010999968275427818, "memory(GiB)": 22.66, "step": 30096, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.955971 }, { "epoch": 0.977714972549784, "grad_norm": 0.19514746963977814, "learning_rate": 1.3572115991609124e-08, "loss": 0.008575838059186935, "memory(GiB)": 22.66, "step": 30097, "token_acc": 1.0, "train_speed(iter/s)": 0.955976 }, { "epoch": 0.9777474580125394, "grad_norm": 0.32501471042633057, "learning_rate": 1.3532593826232065e-08, "loss": 0.012710633687675, "memory(GiB)": 22.66, "step": 30098, "token_acc": 0.98828125, "train_speed(iter/s)": 0.955982 }, { "epoch": 0.9777799434752948, "grad_norm": 0.3370949923992157, "learning_rate": 1.3493129211325506e-08, "loss": 0.008465240709483624, "memory(GiB)": 22.66, "step": 30099, "token_acc": 1.0, "train_speed(iter/s)": 0.955989 }, { "epoch": 0.9778124289380502, "grad_norm": 0.26702865958213806, "learning_rate": 1.3453722147345749e-08, "loss": 0.008016245439648628, "memory(GiB)": 22.66, "step": 30100, "token_acc": 1.0, "train_speed(iter/s)": 0.955995 }, { "epoch": 0.9778449144008057, "grad_norm": 0.39639419317245483, "learning_rate": 1.341437263474743e-08, "loss": 0.011699415743350983, "memory(GiB)": 22.66, "step": 30101, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956002 }, { "epoch": 0.977877399863561, "grad_norm": 0.2586263120174408, "learning_rate": 1.3375080673984076e-08, "loss": 0.009777417406439781, "memory(GiB)": 22.66, "step": 30102, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956009 }, { "epoch": 0.9779098853263165, "grad_norm": 0.22405891120433807, "learning_rate": 1.333584626550921e-08, "loss": 0.009269625879824162, "memory(GiB)": 22.66, "step": 30103, "token_acc": 1.0, "train_speed(iter/s)": 0.956015 }, { "epoch": 0.9779423707890719, "grad_norm": 0.40296098589897156, "learning_rate": 1.329666940977581e-08, "loss": 0.014973209239542484, "memory(GiB)": 22.66, "step": 30104, "token_acc": 1.0, "train_speed(iter/s)": 0.956022 }, { "epoch": 0.9779748562518273, "grad_norm": 0.18844231963157654, "learning_rate": 1.3257550107236839e-08, "loss": 0.007912006229162216, "memory(GiB)": 22.66, "step": 30105, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956029 }, { "epoch": 0.9780073417145827, "grad_norm": 0.4257669448852539, "learning_rate": 1.3218488358342496e-08, "loss": 0.010177301242947578, "memory(GiB)": 22.66, "step": 30106, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956035 }, { "epoch": 0.9780398271773382, "grad_norm": 0.2849406898021698, "learning_rate": 1.3179484163544642e-08, "loss": 0.009919054806232452, "memory(GiB)": 22.66, "step": 30107, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956042 }, { "epoch": 0.9780723126400935, "grad_norm": 0.33609357476234436, "learning_rate": 1.3140537523292362e-08, "loss": 0.008715824224054813, "memory(GiB)": 22.66, "step": 30108, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.956049 }, { "epoch": 0.978104798102849, "grad_norm": 0.3461393713951111, "learning_rate": 1.3101648438036407e-08, "loss": 0.010208919644355774, "memory(GiB)": 22.66, "step": 30109, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956056 }, { "epoch": 0.9781372835656044, "grad_norm": 0.3684793710708618, "learning_rate": 1.3062816908224196e-08, "loss": 0.012357390485703945, "memory(GiB)": 22.66, "step": 30110, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.9781697690283598, "grad_norm": 0.29212644696235657, "learning_rate": 1.302404293430537e-08, "loss": 0.008551619946956635, "memory(GiB)": 22.66, "step": 30111, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.95607 }, { "epoch": 0.9782022544911152, "grad_norm": 0.2504383325576782, "learning_rate": 1.2985326516726793e-08, "loss": 0.008522223681211472, "memory(GiB)": 22.66, "step": 30112, "token_acc": 0.976, "train_speed(iter/s)": 0.956076 }, { "epoch": 0.9782347399538707, "grad_norm": 0.24366994202136993, "learning_rate": 1.2946667655934775e-08, "loss": 0.009629027917981148, "memory(GiB)": 22.66, "step": 30113, "token_acc": 1.0, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.978267225416626, "grad_norm": 0.21055090427398682, "learning_rate": 1.290806635237618e-08, "loss": 0.009175914339721203, "memory(GiB)": 22.66, "step": 30114, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.95609 }, { "epoch": 0.9782997108793815, "grad_norm": 0.488207072019577, "learning_rate": 1.286952260649621e-08, "loss": 0.023283522576093674, "memory(GiB)": 22.66, "step": 30115, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956096 }, { "epoch": 0.9783321963421369, "grad_norm": 0.31893882155418396, "learning_rate": 1.2831036418740062e-08, "loss": 0.00820893794298172, "memory(GiB)": 22.66, "step": 30116, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956103 }, { "epoch": 0.9783646818048923, "grad_norm": 0.33746767044067383, "learning_rate": 1.279260778955127e-08, "loss": 0.011185871437191963, "memory(GiB)": 22.66, "step": 30117, "token_acc": 1.0, "train_speed(iter/s)": 0.95611 }, { "epoch": 0.9783971672676477, "grad_norm": 0.3026444613933563, "learning_rate": 1.2754236719373925e-08, "loss": 0.010880311951041222, "memory(GiB)": 22.66, "step": 30118, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.956117 }, { "epoch": 0.9784296527304032, "grad_norm": 0.2710079252719879, "learning_rate": 1.2715923208651004e-08, "loss": 0.009910864755511284, "memory(GiB)": 22.66, "step": 30119, "token_acc": 1.0, "train_speed(iter/s)": 0.956122 }, { "epoch": 0.9784621381931585, "grad_norm": 0.38278523087501526, "learning_rate": 1.2677667257824378e-08, "loss": 0.00727749103680253, "memory(GiB)": 22.66, "step": 30120, "token_acc": 1.0, "train_speed(iter/s)": 0.956127 }, { "epoch": 0.978494623655914, "grad_norm": 0.3508603572845459, "learning_rate": 1.2639468867334804e-08, "loss": 0.012448608875274658, "memory(GiB)": 22.66, "step": 30121, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.956133 }, { "epoch": 0.9785271091186694, "grad_norm": 0.35493728518486023, "learning_rate": 1.2601328037624706e-08, "loss": 0.011913176625967026, "memory(GiB)": 22.66, "step": 30122, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.956139 }, { "epoch": 0.9785595945814248, "grad_norm": 0.3352578580379486, "learning_rate": 1.2563244769133176e-08, "loss": 0.014355899766087532, "memory(GiB)": 22.66, "step": 30123, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956144 }, { "epoch": 0.9785920800441802, "grad_norm": 0.26290246844291687, "learning_rate": 1.252521906230042e-08, "loss": 0.009854534640908241, "memory(GiB)": 22.66, "step": 30124, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95615 }, { "epoch": 0.9786245655069357, "grad_norm": 0.3133825957775116, "learning_rate": 1.248725091756442e-08, "loss": 0.009704167023301125, "memory(GiB)": 22.66, "step": 30125, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956155 }, { "epoch": 0.978657050969691, "grad_norm": 0.26848405599594116, "learning_rate": 1.2449340335364268e-08, "loss": 0.010363489389419556, "memory(GiB)": 22.66, "step": 30126, "token_acc": 1.0, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.9786895364324465, "grad_norm": 0.3956701159477234, "learning_rate": 1.2411487316136839e-08, "loss": 0.013396572321653366, "memory(GiB)": 22.66, "step": 30127, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.9787220218952019, "grad_norm": 0.28701484203338623, "learning_rate": 1.2373691860319559e-08, "loss": 0.012230205349624157, "memory(GiB)": 22.66, "step": 30128, "token_acc": 1.0, "train_speed(iter/s)": 0.95617 }, { "epoch": 0.9787545073579573, "grad_norm": 0.3852117955684662, "learning_rate": 1.2335953968348191e-08, "loss": 0.0150233693420887, "memory(GiB)": 22.66, "step": 30129, "token_acc": 1.0, "train_speed(iter/s)": 0.956176 }, { "epoch": 0.9787869928207127, "grad_norm": 0.572040319442749, "learning_rate": 1.2298273640659054e-08, "loss": 0.020855098962783813, "memory(GiB)": 22.66, "step": 30130, "token_acc": 1.0, "train_speed(iter/s)": 0.95618 }, { "epoch": 0.9788194782834682, "grad_norm": 0.39105871319770813, "learning_rate": 1.2260650877685687e-08, "loss": 0.015166673809289932, "memory(GiB)": 22.66, "step": 30131, "token_acc": 0.9890710382513661, "train_speed(iter/s)": 0.956185 }, { "epoch": 0.9788519637462235, "grad_norm": 0.17003659904003143, "learning_rate": 1.2223085679863301e-08, "loss": 0.008731557987630367, "memory(GiB)": 22.66, "step": 30132, "token_acc": 1.0, "train_speed(iter/s)": 0.95619 }, { "epoch": 0.978884449208979, "grad_norm": 0.3124311864376068, "learning_rate": 1.2185578047625435e-08, "loss": 0.016286088153719902, "memory(GiB)": 22.66, "step": 30133, "token_acc": 0.995, "train_speed(iter/s)": 0.956195 }, { "epoch": 0.9789169346717344, "grad_norm": 0.3264818787574768, "learning_rate": 1.2148127981405078e-08, "loss": 0.012294556945562363, "memory(GiB)": 22.66, "step": 30134, "token_acc": 1.0, "train_speed(iter/s)": 0.9562 }, { "epoch": 0.9789494201344898, "grad_norm": 0.48255643248558044, "learning_rate": 1.2110735481633551e-08, "loss": 0.013417502865195274, "memory(GiB)": 22.66, "step": 30135, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.9789819055972452, "grad_norm": 0.438355028629303, "learning_rate": 1.2073400548742731e-08, "loss": 0.0096660777926445, "memory(GiB)": 22.66, "step": 30136, "token_acc": 1.0, "train_speed(iter/s)": 0.95621 }, { "epoch": 0.9790143910600007, "grad_norm": 0.3690905272960663, "learning_rate": 1.2036123183164495e-08, "loss": 0.01641058549284935, "memory(GiB)": 22.66, "step": 30137, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.956216 }, { "epoch": 0.979046876522756, "grad_norm": 0.24940520524978638, "learning_rate": 1.1998903385327943e-08, "loss": 0.00900481827557087, "memory(GiB)": 22.66, "step": 30138, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.9790793619855115, "grad_norm": 0.46864184737205505, "learning_rate": 1.1961741155663286e-08, "loss": 0.014844216406345367, "memory(GiB)": 22.66, "step": 30139, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.956225 }, { "epoch": 0.9791118474482668, "grad_norm": 0.3458632528781891, "learning_rate": 1.1924636494599074e-08, "loss": 0.012394560500979424, "memory(GiB)": 22.66, "step": 30140, "token_acc": 0.9832635983263598, "train_speed(iter/s)": 0.956231 }, { "epoch": 0.9791443329110223, "grad_norm": 0.5483750104904175, "learning_rate": 1.1887589402563848e-08, "loss": 0.01079106330871582, "memory(GiB)": 22.66, "step": 30141, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956236 }, { "epoch": 0.9791768183737777, "grad_norm": 0.3863825500011444, "learning_rate": 1.1850599879984492e-08, "loss": 0.01090297196060419, "memory(GiB)": 22.66, "step": 30142, "token_acc": 0.9887640449438202, "train_speed(iter/s)": 0.956242 }, { "epoch": 0.9792093038365332, "grad_norm": 0.3602805435657501, "learning_rate": 1.1813667927288997e-08, "loss": 0.014420108869671822, "memory(GiB)": 22.66, "step": 30143, "token_acc": 0.98828125, "train_speed(iter/s)": 0.956247 }, { "epoch": 0.9792417892992886, "grad_norm": 0.36892586946487427, "learning_rate": 1.1776793544903131e-08, "loss": 0.009063875302672386, "memory(GiB)": 22.66, "step": 30144, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.956252 }, { "epoch": 0.979274274762044, "grad_norm": 0.21218541264533997, "learning_rate": 1.1739976733252112e-08, "loss": 0.00481827650219202, "memory(GiB)": 22.66, "step": 30145, "token_acc": 1.0, "train_speed(iter/s)": 0.956258 }, { "epoch": 0.9793067602247995, "grad_norm": 0.41054973006248474, "learning_rate": 1.1703217492760598e-08, "loss": 0.012496561743319035, "memory(GiB)": 22.66, "step": 30146, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956263 }, { "epoch": 0.9793392456875548, "grad_norm": 0.40377092361450195, "learning_rate": 1.1666515823854363e-08, "loss": 0.012246929109096527, "memory(GiB)": 22.66, "step": 30147, "token_acc": 1.0, "train_speed(iter/s)": 0.956268 }, { "epoch": 0.9793717311503103, "grad_norm": 0.4295181334018707, "learning_rate": 1.162987172695529e-08, "loss": 0.01846381463110447, "memory(GiB)": 22.66, "step": 30148, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956273 }, { "epoch": 0.9794042166130656, "grad_norm": 0.43262240290641785, "learning_rate": 1.1593285202486925e-08, "loss": 0.010098909959197044, "memory(GiB)": 22.66, "step": 30149, "token_acc": 0.99, "train_speed(iter/s)": 0.956279 }, { "epoch": 0.9794367020758211, "grad_norm": 0.44232451915740967, "learning_rate": 1.155675625087227e-08, "loss": 0.008496259339153767, "memory(GiB)": 22.66, "step": 30150, "token_acc": 1.0, "train_speed(iter/s)": 0.956284 }, { "epoch": 0.9794691875385765, "grad_norm": 0.4107700288295746, "learning_rate": 1.152028487253154e-08, "loss": 0.012124395929276943, "memory(GiB)": 22.66, "step": 30151, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.979501673001332, "grad_norm": 0.4864497780799866, "learning_rate": 1.1483871067887175e-08, "loss": 0.013387756422162056, "memory(GiB)": 22.66, "step": 30152, "token_acc": 1.0, "train_speed(iter/s)": 0.956295 }, { "epoch": 0.9795341584640873, "grad_norm": 0.30391937494277954, "learning_rate": 1.1447514837357732e-08, "loss": 0.014902614988386631, "memory(GiB)": 22.66, "step": 30153, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956299 }, { "epoch": 0.9795666439268428, "grad_norm": 0.37148359417915344, "learning_rate": 1.1411216181363983e-08, "loss": 0.012067705392837524, "memory(GiB)": 22.66, "step": 30154, "token_acc": 1.0, "train_speed(iter/s)": 0.956305 }, { "epoch": 0.9795991293895981, "grad_norm": 0.34060192108154297, "learning_rate": 1.1374975100325037e-08, "loss": 0.012831438332796097, "memory(GiB)": 22.66, "step": 30155, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.95631 }, { "epoch": 0.9796316148523536, "grad_norm": 0.6868036985397339, "learning_rate": 1.1338791594658338e-08, "loss": 0.0081765316426754, "memory(GiB)": 22.66, "step": 30156, "token_acc": 0.9955947136563876, "train_speed(iter/s)": 0.956315 }, { "epoch": 0.979664100315109, "grad_norm": 0.4302605390548706, "learning_rate": 1.1302665664781887e-08, "loss": 0.013043506070971489, "memory(GiB)": 22.66, "step": 30157, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.956319 }, { "epoch": 0.9796965857778644, "grad_norm": 0.22890695929527283, "learning_rate": 1.1266597311113125e-08, "loss": 0.0057437242940068245, "memory(GiB)": 22.66, "step": 30158, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956324 }, { "epoch": 0.9797290712406198, "grad_norm": 0.45773857831954956, "learning_rate": 1.1230586534067278e-08, "loss": 0.012201260775327682, "memory(GiB)": 22.66, "step": 30159, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.956329 }, { "epoch": 0.9797615567033753, "grad_norm": 0.3917281925678253, "learning_rate": 1.1194633334060678e-08, "loss": 0.015894634649157524, "memory(GiB)": 22.66, "step": 30160, "token_acc": 1.0, "train_speed(iter/s)": 0.956335 }, { "epoch": 0.9797940421661306, "grad_norm": 0.301354318857193, "learning_rate": 1.1158737711507995e-08, "loss": 0.01185409165918827, "memory(GiB)": 22.66, "step": 30161, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.956341 }, { "epoch": 0.9798265276288861, "grad_norm": 0.4813040792942047, "learning_rate": 1.1122899666823894e-08, "loss": 0.015541961416602135, "memory(GiB)": 22.66, "step": 30162, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956348 }, { "epoch": 0.9798590130916415, "grad_norm": 0.22077713906764984, "learning_rate": 1.1087119200421381e-08, "loss": 0.008048197254538536, "memory(GiB)": 22.66, "step": 30163, "token_acc": 0.9962121212121212, "train_speed(iter/s)": 0.956354 }, { "epoch": 0.979891498554397, "grad_norm": 0.29125791788101196, "learning_rate": 1.1051396312713458e-08, "loss": 0.010168977081775665, "memory(GiB)": 22.66, "step": 30164, "token_acc": 1.0, "train_speed(iter/s)": 0.956361 }, { "epoch": 0.9799239840171523, "grad_norm": 0.38739365339279175, "learning_rate": 1.1015731004113129e-08, "loss": 0.015317456796765327, "memory(GiB)": 22.66, "step": 30165, "token_acc": 0.9867256637168141, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.9799564694799078, "grad_norm": 0.31698909401893616, "learning_rate": 1.098012327503173e-08, "loss": 0.007979417219758034, "memory(GiB)": 22.66, "step": 30166, "token_acc": 1.0, "train_speed(iter/s)": 0.956374 }, { "epoch": 0.9799889549426631, "grad_norm": 0.31112390756607056, "learning_rate": 1.0944573125880042e-08, "loss": 0.006785622797906399, "memory(GiB)": 22.66, "step": 30167, "token_acc": 1.0, "train_speed(iter/s)": 0.95638 }, { "epoch": 0.9800214404054186, "grad_norm": 0.5389814376831055, "learning_rate": 1.0909080557067741e-08, "loss": 0.016590118408203125, "memory(GiB)": 22.66, "step": 30168, "token_acc": 0.9931972789115646, "train_speed(iter/s)": 0.956387 }, { "epoch": 0.980053925868174, "grad_norm": 0.4363572299480438, "learning_rate": 1.0873645569005608e-08, "loss": 0.013955879956483841, "memory(GiB)": 22.66, "step": 30169, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956394 }, { "epoch": 0.9800864113309294, "grad_norm": 0.397549033164978, "learning_rate": 1.0838268162102206e-08, "loss": 0.011045150458812714, "memory(GiB)": 22.66, "step": 30170, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.9564 }, { "epoch": 0.9801188967936848, "grad_norm": 0.3504795432090759, "learning_rate": 1.0802948336766095e-08, "loss": 0.010797275230288506, "memory(GiB)": 22.66, "step": 30171, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.956407 }, { "epoch": 0.9801513822564403, "grad_norm": 0.3913958668708801, "learning_rate": 1.0767686093404173e-08, "loss": 0.013365089893341064, "memory(GiB)": 22.66, "step": 30172, "token_acc": 1.0, "train_speed(iter/s)": 0.956413 }, { "epoch": 0.9801838677191956, "grad_norm": 0.3996409773826599, "learning_rate": 1.0732481432423337e-08, "loss": 0.012593820691108704, "memory(GiB)": 22.66, "step": 30173, "token_acc": 1.0, "train_speed(iter/s)": 0.95642 }, { "epoch": 0.9802163531819511, "grad_norm": 0.29549577832221985, "learning_rate": 1.0697334354231037e-08, "loss": 0.011308432556688786, "memory(GiB)": 22.66, "step": 30174, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956427 }, { "epoch": 0.9802488386447065, "grad_norm": 0.451648086309433, "learning_rate": 1.0662244859231952e-08, "loss": 0.010294304229319096, "memory(GiB)": 22.66, "step": 30175, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956433 }, { "epoch": 0.9802813241074619, "grad_norm": 0.5155032873153687, "learning_rate": 1.0627212947831312e-08, "loss": 0.012661602348089218, "memory(GiB)": 22.66, "step": 30176, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956439 }, { "epoch": 0.9803138095702173, "grad_norm": 0.3162137567996979, "learning_rate": 1.0592238620433793e-08, "loss": 0.006158923264592886, "memory(GiB)": 22.66, "step": 30177, "token_acc": 1.0, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.9803462950329728, "grad_norm": 0.3312963545322418, "learning_rate": 1.0557321877442406e-08, "loss": 0.00867875013500452, "memory(GiB)": 22.66, "step": 30178, "token_acc": 1.0, "train_speed(iter/s)": 0.956453 }, { "epoch": 0.9803787804957281, "grad_norm": 0.30732646584510803, "learning_rate": 1.0522462719260717e-08, "loss": 0.010495912283658981, "memory(GiB)": 22.66, "step": 30179, "token_acc": 1.0, "train_speed(iter/s)": 0.956458 }, { "epoch": 0.9804112659584836, "grad_norm": 0.276994526386261, "learning_rate": 1.0487661146290629e-08, "loss": 0.011214103549718857, "memory(GiB)": 22.66, "step": 30180, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956463 }, { "epoch": 0.980443751421239, "grad_norm": 0.4273820221424103, "learning_rate": 1.0452917158933485e-08, "loss": 0.013325837440788746, "memory(GiB)": 22.66, "step": 30181, "token_acc": 0.9886363636363636, "train_speed(iter/s)": 0.956469 }, { "epoch": 0.9804762368839944, "grad_norm": 0.34521713852882385, "learning_rate": 1.0418230757591186e-08, "loss": 0.011966976337134838, "memory(GiB)": 22.66, "step": 30182, "token_acc": 0.9854368932038835, "train_speed(iter/s)": 0.956474 }, { "epoch": 0.9805087223467498, "grad_norm": 0.2847886085510254, "learning_rate": 1.0383601942663969e-08, "loss": 0.008359977975487709, "memory(GiB)": 22.66, "step": 30183, "token_acc": 1.0, "train_speed(iter/s)": 0.956479 }, { "epoch": 0.9805412078095053, "grad_norm": 0.387661874294281, "learning_rate": 1.0349030714550956e-08, "loss": 0.015122147276997566, "memory(GiB)": 22.66, "step": 30184, "token_acc": 0.9873417721518988, "train_speed(iter/s)": 0.956484 }, { "epoch": 0.9805736932722606, "grad_norm": 0.26607659459114075, "learning_rate": 1.0314517073651275e-08, "loss": 0.00878187082707882, "memory(GiB)": 22.66, "step": 30185, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.95649 }, { "epoch": 0.9806061787350161, "grad_norm": 0.20723029971122742, "learning_rate": 1.0280061020362941e-08, "loss": 0.008374511264264584, "memory(GiB)": 22.66, "step": 30186, "token_acc": 1.0, "train_speed(iter/s)": 0.956495 }, { "epoch": 0.9806386641977715, "grad_norm": 0.4763525128364563, "learning_rate": 1.0245662555084524e-08, "loss": 0.01634654402732849, "memory(GiB)": 22.66, "step": 30187, "token_acc": 0.9852941176470589, "train_speed(iter/s)": 0.9565 }, { "epoch": 0.9806711496605269, "grad_norm": 0.40525686740875244, "learning_rate": 1.0211321678212372e-08, "loss": 0.011435313150286674, "memory(GiB)": 22.66, "step": 30188, "token_acc": 1.0, "train_speed(iter/s)": 0.956505 }, { "epoch": 0.9807036351232823, "grad_norm": 0.29928791522979736, "learning_rate": 1.0177038390142834e-08, "loss": 0.004479381255805492, "memory(GiB)": 22.66, "step": 30189, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95651 }, { "epoch": 0.9807361205860378, "grad_norm": 0.2947661280632019, "learning_rate": 1.0142812691271708e-08, "loss": 0.006762963719666004, "memory(GiB)": 22.66, "step": 30190, "token_acc": 1.0, "train_speed(iter/s)": 0.956514 }, { "epoch": 0.9807686060487931, "grad_norm": 0.3762393593788147, "learning_rate": 1.010864458199423e-08, "loss": 0.00846296176314354, "memory(GiB)": 22.66, "step": 30191, "token_acc": 0.9963369963369964, "train_speed(iter/s)": 0.95652 }, { "epoch": 0.9808010915115486, "grad_norm": 0.34979408979415894, "learning_rate": 1.0074534062704533e-08, "loss": 0.010355014353990555, "memory(GiB)": 22.66, "step": 30192, "token_acc": 1.0, "train_speed(iter/s)": 0.956525 }, { "epoch": 0.980833576974304, "grad_norm": 0.27576717734336853, "learning_rate": 1.0040481133795632e-08, "loss": 0.011255549266934395, "memory(GiB)": 22.66, "step": 30193, "token_acc": 1.0, "train_speed(iter/s)": 0.95653 }, { "epoch": 0.9808660624370594, "grad_norm": 0.23548801243305206, "learning_rate": 1.0006485795662213e-08, "loss": 0.011173632927238941, "memory(GiB)": 22.66, "step": 30194, "token_acc": 1.0, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.9808985478998148, "grad_norm": 0.4472869038581848, "learning_rate": 9.972548048695075e-09, "loss": 0.012302890419960022, "memory(GiB)": 22.66, "step": 30195, "token_acc": 1.0, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.9809310333625703, "grad_norm": 0.8554612398147583, "learning_rate": 9.938667893286125e-09, "loss": 0.012377648614346981, "memory(GiB)": 22.66, "step": 30196, "token_acc": 0.9939024390243902, "train_speed(iter/s)": 0.956546 }, { "epoch": 0.9809635188253256, "grad_norm": 0.30861547589302063, "learning_rate": 9.904845329827273e-09, "loss": 0.008694624528288841, "memory(GiB)": 22.66, "step": 30197, "token_acc": 0.9924812030075187, "train_speed(iter/s)": 0.956551 }, { "epoch": 0.9809960042880811, "grad_norm": 0.35392531752586365, "learning_rate": 9.871080358708208e-09, "loss": 0.010753628797829151, "memory(GiB)": 22.66, "step": 30198, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.9810284897508365, "grad_norm": 0.28873756527900696, "learning_rate": 9.837372980318616e-09, "loss": 0.009980736300349236, "memory(GiB)": 22.66, "step": 30199, "token_acc": 1.0, "train_speed(iter/s)": 0.956564 }, { "epoch": 0.9810609752135919, "grad_norm": 0.3161904215812683, "learning_rate": 9.803723195047632e-09, "loss": 0.008619440719485283, "memory(GiB)": 22.66, "step": 30200, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.9810934606763473, "grad_norm": 0.287563681602478, "learning_rate": 9.770131003283278e-09, "loss": 0.007162655703723431, "memory(GiB)": 22.66, "step": 30201, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.9811259461391028, "grad_norm": 0.39500027894973755, "learning_rate": 9.736596405414134e-09, "loss": 0.01164545863866806, "memory(GiB)": 22.66, "step": 30202, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.95658 }, { "epoch": 0.9811584316018581, "grad_norm": 0.39828595519065857, "learning_rate": 9.703119401826555e-09, "loss": 0.012252163141965866, "memory(GiB)": 22.66, "step": 30203, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956586 }, { "epoch": 0.9811909170646136, "grad_norm": 0.2452595829963684, "learning_rate": 9.6696999929069e-09, "loss": 0.007802865467965603, "memory(GiB)": 22.66, "step": 30204, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.956591 }, { "epoch": 0.981223402527369, "grad_norm": 0.30267155170440674, "learning_rate": 9.636338179040972e-09, "loss": 0.009179828688502312, "memory(GiB)": 22.66, "step": 30205, "token_acc": 0.9866071428571429, "train_speed(iter/s)": 0.956596 }, { "epoch": 0.9812558879901244, "grad_norm": 0.4091889262199402, "learning_rate": 9.603033960614016e-09, "loss": 0.012993612326681614, "memory(GiB)": 22.66, "step": 30206, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.956601 }, { "epoch": 0.9812883734528798, "grad_norm": 0.2839149534702301, "learning_rate": 9.569787338010172e-09, "loss": 0.007589611224830151, "memory(GiB)": 22.66, "step": 30207, "token_acc": 1.0, "train_speed(iter/s)": 0.956606 }, { "epoch": 0.9813208589156353, "grad_norm": 0.30696386098861694, "learning_rate": 9.536598311613021e-09, "loss": 0.009429940953850746, "memory(GiB)": 22.66, "step": 30208, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956611 }, { "epoch": 0.9813533443783907, "grad_norm": 0.3144226372241974, "learning_rate": 9.503466881806145e-09, "loss": 0.008642255328595638, "memory(GiB)": 22.66, "step": 30209, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956617 }, { "epoch": 0.9813858298411461, "grad_norm": 0.31711921095848083, "learning_rate": 9.47039304897146e-09, "loss": 0.009989961981773376, "memory(GiB)": 22.66, "step": 30210, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.956622 }, { "epoch": 0.9814183153039016, "grad_norm": 0.2791878879070282, "learning_rate": 9.43737681349144e-09, "loss": 0.007271387614309788, "memory(GiB)": 22.66, "step": 30211, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.956627 }, { "epoch": 0.9814508007666569, "grad_norm": 0.4518859088420868, "learning_rate": 9.404418175745778e-09, "loss": 0.01922617480158806, "memory(GiB)": 22.66, "step": 30212, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.956632 }, { "epoch": 0.9814832862294124, "grad_norm": 0.3488229215145111, "learning_rate": 9.371517136115838e-09, "loss": 0.01134241372346878, "memory(GiB)": 22.66, "step": 30213, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956637 }, { "epoch": 0.9815157716921677, "grad_norm": 0.35349512100219727, "learning_rate": 9.33867369498076e-09, "loss": 0.011252306401729584, "memory(GiB)": 22.66, "step": 30214, "token_acc": 0.9819004524886877, "train_speed(iter/s)": 0.956642 }, { "epoch": 0.9815482571549232, "grad_norm": 0.6122331619262695, "learning_rate": 9.305887852720242e-09, "loss": 0.01721654087305069, "memory(GiB)": 22.66, "step": 30215, "token_acc": 1.0, "train_speed(iter/s)": 0.956648 }, { "epoch": 0.9815807426176786, "grad_norm": 0.42846620082855225, "learning_rate": 9.273159609712312e-09, "loss": 0.010507169179618359, "memory(GiB)": 22.66, "step": 30216, "token_acc": 1.0, "train_speed(iter/s)": 0.956653 }, { "epoch": 0.981613228080434, "grad_norm": 0.31647738814353943, "learning_rate": 9.240488966335004e-09, "loss": 0.009795378893613815, "memory(GiB)": 22.66, "step": 30217, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956658 }, { "epoch": 0.9816457135431894, "grad_norm": 0.22233819961547852, "learning_rate": 9.207875922965238e-09, "loss": 0.005745544098317623, "memory(GiB)": 22.66, "step": 30218, "token_acc": 1.0, "train_speed(iter/s)": 0.956663 }, { "epoch": 0.9816781990059449, "grad_norm": 0.38234880566596985, "learning_rate": 9.175320479978822e-09, "loss": 0.00924188457429409, "memory(GiB)": 22.66, "step": 30219, "token_acc": 0.9904761904761905, "train_speed(iter/s)": 0.956667 }, { "epoch": 0.9817106844687002, "grad_norm": 0.29036054015159607, "learning_rate": 9.142822637752124e-09, "loss": 0.007963915355503559, "memory(GiB)": 22.66, "step": 30220, "token_acc": 1.0, "train_speed(iter/s)": 0.956672 }, { "epoch": 0.9817431699314557, "grad_norm": 0.3598582148551941, "learning_rate": 9.110382396659844e-09, "loss": 0.013309916481375694, "memory(GiB)": 22.66, "step": 30221, "token_acc": 0.9945945945945946, "train_speed(iter/s)": 0.956677 }, { "epoch": 0.9817756553942111, "grad_norm": 0.3315112292766571, "learning_rate": 9.077999757077238e-09, "loss": 0.009822643361985683, "memory(GiB)": 22.66, "step": 30222, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956682 }, { "epoch": 0.9818081408569665, "grad_norm": 0.28657981753349304, "learning_rate": 9.045674719376785e-09, "loss": 0.007393042556941509, "memory(GiB)": 22.66, "step": 30223, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956688 }, { "epoch": 0.9818406263197219, "grad_norm": 0.3195161521434784, "learning_rate": 9.013407283932629e-09, "loss": 0.012491922825574875, "memory(GiB)": 22.66, "step": 30224, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.9818731117824774, "grad_norm": 0.8352528214454651, "learning_rate": 8.981197451116696e-09, "loss": 0.009283322840929031, "memory(GiB)": 22.66, "step": 30225, "token_acc": 0.996, "train_speed(iter/s)": 0.956702 }, { "epoch": 0.9819055972452327, "grad_norm": 0.25761452317237854, "learning_rate": 8.949045221300356e-09, "loss": 0.007395624183118343, "memory(GiB)": 22.66, "step": 30226, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956708 }, { "epoch": 0.9819380827079882, "grad_norm": 0.28337934613227844, "learning_rate": 8.916950594855533e-09, "loss": 0.009072843007743359, "memory(GiB)": 22.66, "step": 30227, "token_acc": 1.0, "train_speed(iter/s)": 0.956715 }, { "epoch": 0.9819705681707436, "grad_norm": 0.3765023648738861, "learning_rate": 8.884913572151933e-09, "loss": 0.009478006511926651, "memory(GiB)": 22.66, "step": 30228, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956721 }, { "epoch": 0.982003053633499, "grad_norm": 0.23473387956619263, "learning_rate": 8.852934153559811e-09, "loss": 0.005077390931546688, "memory(GiB)": 22.66, "step": 30229, "token_acc": 1.0, "train_speed(iter/s)": 0.956728 }, { "epoch": 0.9820355390962544, "grad_norm": 0.5578125715255737, "learning_rate": 8.82101233944832e-09, "loss": 0.009845936670899391, "memory(GiB)": 22.66, "step": 30230, "token_acc": 1.0, "train_speed(iter/s)": 0.956735 }, { "epoch": 0.9820680245590099, "grad_norm": 0.3479018211364746, "learning_rate": 8.789148130184943e-09, "loss": 0.01163674145936966, "memory(GiB)": 22.66, "step": 30231, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956742 }, { "epoch": 0.9821005100217652, "grad_norm": 0.3527728021144867, "learning_rate": 8.757341526138274e-09, "loss": 0.010438124649226665, "memory(GiB)": 22.66, "step": 30232, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.956749 }, { "epoch": 0.9821329954845207, "grad_norm": 0.18250015377998352, "learning_rate": 8.725592527675796e-09, "loss": 0.0057169534265995026, "memory(GiB)": 22.66, "step": 30233, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956755 }, { "epoch": 0.9821654809472761, "grad_norm": 0.3101986050605774, "learning_rate": 8.693901135162774e-09, "loss": 0.010173502378165722, "memory(GiB)": 22.66, "step": 30234, "token_acc": 1.0, "train_speed(iter/s)": 0.956762 }, { "epoch": 0.9821979664100315, "grad_norm": 0.23103296756744385, "learning_rate": 8.662267348965581e-09, "loss": 0.005106568802148104, "memory(GiB)": 22.66, "step": 30235, "token_acc": 1.0, "train_speed(iter/s)": 0.956769 }, { "epoch": 0.9822304518727869, "grad_norm": 0.3464778959751129, "learning_rate": 8.63069116944948e-09, "loss": 0.014136160723865032, "memory(GiB)": 22.66, "step": 30236, "token_acc": 0.992, "train_speed(iter/s)": 0.956776 }, { "epoch": 0.9822629373355424, "grad_norm": 0.45052066445350647, "learning_rate": 8.599172596978622e-09, "loss": 0.017755333334207535, "memory(GiB)": 22.66, "step": 30237, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.956783 }, { "epoch": 0.9822954227982977, "grad_norm": 0.34000760316848755, "learning_rate": 8.567711631916608e-09, "loss": 0.01036705169826746, "memory(GiB)": 22.66, "step": 30238, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.956788 }, { "epoch": 0.9823279082610532, "grad_norm": 0.34145089983940125, "learning_rate": 8.536308274627037e-09, "loss": 0.010401523672044277, "memory(GiB)": 22.66, "step": 30239, "token_acc": 1.0, "train_speed(iter/s)": 0.956794 }, { "epoch": 0.9823603937238086, "grad_norm": 0.2544953227043152, "learning_rate": 8.504962525471838e-09, "loss": 0.00975795742124319, "memory(GiB)": 22.66, "step": 30240, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956797 }, { "epoch": 0.982392879186564, "grad_norm": 0.3520515561103821, "learning_rate": 8.473674384813502e-09, "loss": 0.013266474939882755, "memory(GiB)": 22.66, "step": 30241, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956803 }, { "epoch": 0.9824253646493194, "grad_norm": 0.48077771067619324, "learning_rate": 8.442443853012294e-09, "loss": 0.011142091825604439, "memory(GiB)": 22.66, "step": 30242, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.956808 }, { "epoch": 0.9824578501120749, "grad_norm": 0.31780222058296204, "learning_rate": 8.411270930429039e-09, "loss": 0.006858181208372116, "memory(GiB)": 22.66, "step": 30243, "token_acc": 1.0, "train_speed(iter/s)": 0.956813 }, { "epoch": 0.9824903355748302, "grad_norm": 0.3254697918891907, "learning_rate": 8.380155617423447e-09, "loss": 0.010993897914886475, "memory(GiB)": 22.66, "step": 30244, "token_acc": 0.9945652173913043, "train_speed(iter/s)": 0.956818 }, { "epoch": 0.9825228210375857, "grad_norm": 0.2637912333011627, "learning_rate": 8.34909791435523e-09, "loss": 0.008417169563472271, "memory(GiB)": 22.66, "step": 30245, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956823 }, { "epoch": 0.9825553065003411, "grad_norm": 0.23868520557880402, "learning_rate": 8.318097821581884e-09, "loss": 0.007773615885525942, "memory(GiB)": 22.66, "step": 30246, "token_acc": 0.9962264150943396, "train_speed(iter/s)": 0.956828 }, { "epoch": 0.9825877919630965, "grad_norm": 0.4026999771595001, "learning_rate": 8.28715533946145e-09, "loss": 0.010943281464278698, "memory(GiB)": 22.66, "step": 30247, "token_acc": 1.0, "train_speed(iter/s)": 0.956832 }, { "epoch": 0.9826202774258519, "grad_norm": 0.3914153277873993, "learning_rate": 8.256270468351424e-09, "loss": 0.013788641430437565, "memory(GiB)": 22.66, "step": 30248, "token_acc": 0.989010989010989, "train_speed(iter/s)": 0.956837 }, { "epoch": 0.9826527628886074, "grad_norm": 0.44390255212783813, "learning_rate": 8.225443208608186e-09, "loss": 0.00952342338860035, "memory(GiB)": 22.66, "step": 30249, "token_acc": 1.0, "train_speed(iter/s)": 0.956842 }, { "epoch": 0.9826852483513627, "grad_norm": 0.37050190567970276, "learning_rate": 8.194673560587008e-09, "loss": 0.00905567780137062, "memory(GiB)": 22.66, "step": 30250, "token_acc": 1.0, "train_speed(iter/s)": 0.956847 }, { "epoch": 0.9827177338141182, "grad_norm": 0.4448067247867584, "learning_rate": 8.163961524643716e-09, "loss": 0.013119462877511978, "memory(GiB)": 22.66, "step": 30251, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956853 }, { "epoch": 0.9827502192768736, "grad_norm": 0.4522126019001007, "learning_rate": 8.133307101131915e-09, "loss": 0.014303650707006454, "memory(GiB)": 22.66, "step": 30252, "token_acc": 0.9865470852017937, "train_speed(iter/s)": 0.956857 }, { "epoch": 0.982782704739629, "grad_norm": 0.23716963827610016, "learning_rate": 8.102710290406323e-09, "loss": 0.007781720720231533, "memory(GiB)": 22.66, "step": 30253, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.956862 }, { "epoch": 0.9828151902023844, "grad_norm": 0.3537171185016632, "learning_rate": 8.072171092819436e-09, "loss": 0.014864947646856308, "memory(GiB)": 22.66, "step": 30254, "token_acc": 1.0, "train_speed(iter/s)": 0.956866 }, { "epoch": 0.9828476756651399, "grad_norm": 0.31418928503990173, "learning_rate": 8.041689508724304e-09, "loss": 0.011092081665992737, "memory(GiB)": 22.66, "step": 30255, "token_acc": 1.0, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.9828801611278952, "grad_norm": 0.27096617221832275, "learning_rate": 8.011265538472313e-09, "loss": 0.014528082683682442, "memory(GiB)": 22.66, "step": 30256, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956876 }, { "epoch": 0.9829126465906507, "grad_norm": 0.5495583415031433, "learning_rate": 7.980899182414292e-09, "loss": 0.020623603835701942, "memory(GiB)": 22.66, "step": 30257, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.9829451320534061, "grad_norm": 0.2712274193763733, "learning_rate": 7.950590440901073e-09, "loss": 0.006210021674633026, "memory(GiB)": 22.66, "step": 30258, "token_acc": 1.0, "train_speed(iter/s)": 0.956888 }, { "epoch": 0.9829776175161615, "grad_norm": 0.26454970240592957, "learning_rate": 7.920339314282932e-09, "loss": 0.007431456819176674, "memory(GiB)": 22.66, "step": 30259, "token_acc": 1.0, "train_speed(iter/s)": 0.956894 }, { "epoch": 0.9830101029789169, "grad_norm": 0.34849274158477783, "learning_rate": 7.890145802907922e-09, "loss": 0.015852969139814377, "memory(GiB)": 22.66, "step": 30260, "token_acc": 1.0, "train_speed(iter/s)": 0.956901 }, { "epoch": 0.9830425884416724, "grad_norm": 0.3153153657913208, "learning_rate": 7.86000990712521e-09, "loss": 0.009691370651125908, "memory(GiB)": 22.66, "step": 30261, "token_acc": 1.0, "train_speed(iter/s)": 0.956908 }, { "epoch": 0.9830750739044277, "grad_norm": 0.3647806644439697, "learning_rate": 7.82993162728285e-09, "loss": 0.013220036402344704, "memory(GiB)": 22.66, "step": 30262, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956913 }, { "epoch": 0.9831075593671832, "grad_norm": 0.3871566355228424, "learning_rate": 7.79991096372723e-09, "loss": 0.013394766487181187, "memory(GiB)": 22.66, "step": 30263, "token_acc": 0.9858657243816255, "train_speed(iter/s)": 0.956919 }, { "epoch": 0.9831400448299386, "grad_norm": 0.3279612362384796, "learning_rate": 7.769947916805853e-09, "loss": 0.010994626209139824, "memory(GiB)": 22.66, "step": 30264, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956924 }, { "epoch": 0.983172530292694, "grad_norm": 0.4738936722278595, "learning_rate": 7.740042486863441e-09, "loss": 0.012030407786369324, "memory(GiB)": 22.66, "step": 30265, "token_acc": 1.0, "train_speed(iter/s)": 0.956929 }, { "epoch": 0.9832050157554494, "grad_norm": 0.4083094596862793, "learning_rate": 7.71019467424583e-09, "loss": 0.012949270196259022, "memory(GiB)": 22.66, "step": 30266, "token_acc": 1.0, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.9832375012182049, "grad_norm": 0.38864633440971375, "learning_rate": 7.680404479297188e-09, "loss": 0.013087179511785507, "memory(GiB)": 22.66, "step": 30267, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.9832699866809602, "grad_norm": 0.2677088677883148, "learning_rate": 7.650671902361129e-09, "loss": 0.0072315591387450695, "memory(GiB)": 22.66, "step": 30268, "token_acc": 1.0, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.9833024721437157, "grad_norm": 0.315048485994339, "learning_rate": 7.620996943781823e-09, "loss": 0.010106519795954227, "memory(GiB)": 22.66, "step": 30269, "token_acc": 1.0, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.983334957606471, "grad_norm": 0.2826845347881317, "learning_rate": 7.591379603900107e-09, "loss": 0.006590018980205059, "memory(GiB)": 22.66, "step": 30270, "token_acc": 1.0, "train_speed(iter/s)": 0.956956 }, { "epoch": 0.9833674430692265, "grad_norm": 0.30226805806159973, "learning_rate": 7.561819883059596e-09, "loss": 0.015184395015239716, "memory(GiB)": 22.66, "step": 30271, "token_acc": 0.9875, "train_speed(iter/s)": 0.956961 }, { "epoch": 0.983399928531982, "grad_norm": 0.4070742130279541, "learning_rate": 7.532317781600019e-09, "loss": 0.007432159036397934, "memory(GiB)": 22.66, "step": 30272, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956967 }, { "epoch": 0.9834324139947374, "grad_norm": 0.7553274035453796, "learning_rate": 7.502873299863322e-09, "loss": 0.019766703248023987, "memory(GiB)": 22.66, "step": 30273, "token_acc": 1.0, "train_speed(iter/s)": 0.956972 }, { "epoch": 0.9834648994574928, "grad_norm": 0.27127087116241455, "learning_rate": 7.473486438187572e-09, "loss": 0.0077452827244997025, "memory(GiB)": 22.66, "step": 30274, "token_acc": 1.0, "train_speed(iter/s)": 0.956977 }, { "epoch": 0.9834973849202482, "grad_norm": 0.21861274540424347, "learning_rate": 7.444157196913049e-09, "loss": 0.005649098195135593, "memory(GiB)": 22.66, "step": 30275, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956983 }, { "epoch": 0.9835298703830037, "grad_norm": 0.2626355290412903, "learning_rate": 7.4148855763783725e-09, "loss": 0.007160333916544914, "memory(GiB)": 22.66, "step": 30276, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.983562355845759, "grad_norm": 0.36110788583755493, "learning_rate": 7.385671576920495e-09, "loss": 0.011872431263327599, "memory(GiB)": 22.66, "step": 30277, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.956994 }, { "epoch": 0.9835948413085145, "grad_norm": 0.21645495295524597, "learning_rate": 7.356515198877479e-09, "loss": 0.006404777988791466, "memory(GiB)": 22.66, "step": 30278, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.957 }, { "epoch": 0.9836273267712698, "grad_norm": 0.29512202739715576, "learning_rate": 7.327416442585722e-09, "loss": 0.011558598838746548, "memory(GiB)": 22.66, "step": 30279, "token_acc": 1.0, "train_speed(iter/s)": 0.957005 }, { "epoch": 0.9836598122340253, "grad_norm": 0.32206296920776367, "learning_rate": 7.298375308379957e-09, "loss": 0.012105471454560757, "memory(GiB)": 22.66, "step": 30280, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.9836922976967807, "grad_norm": 0.28767773509025574, "learning_rate": 7.269391796597136e-09, "loss": 0.008731256239116192, "memory(GiB)": 22.66, "step": 30281, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957014 }, { "epoch": 0.9837247831595362, "grad_norm": 0.2788383960723877, "learning_rate": 7.240465907570327e-09, "loss": 0.009717612527310848, "memory(GiB)": 22.66, "step": 30282, "token_acc": 1.0, "train_speed(iter/s)": 0.957019 }, { "epoch": 0.9837572686222915, "grad_norm": 0.47051697969436646, "learning_rate": 7.21159764163426e-09, "loss": 0.012361420318484306, "memory(GiB)": 22.66, "step": 30283, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.957025 }, { "epoch": 0.983789754085047, "grad_norm": 0.4372557997703552, "learning_rate": 7.1827869991214494e-09, "loss": 0.018990734592080116, "memory(GiB)": 22.66, "step": 30284, "token_acc": 1.0, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.9838222395478023, "grad_norm": 0.4825497269630432, "learning_rate": 7.154033980364406e-09, "loss": 0.010766878724098206, "memory(GiB)": 22.66, "step": 30285, "token_acc": 1.0, "train_speed(iter/s)": 0.957036 }, { "epoch": 0.9838547250105578, "grad_norm": 0.35843998193740845, "learning_rate": 7.125338585695641e-09, "loss": 0.012997422367334366, "memory(GiB)": 22.66, "step": 30286, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.957042 }, { "epoch": 0.9838872104733132, "grad_norm": 0.3780832886695862, "learning_rate": 7.096700815446e-09, "loss": 0.012497217394411564, "memory(GiB)": 22.66, "step": 30287, "token_acc": 1.0, "train_speed(iter/s)": 0.957048 }, { "epoch": 0.9839196959360687, "grad_norm": 0.319681316614151, "learning_rate": 7.0681206699457776e-09, "loss": 0.008170516230165958, "memory(GiB)": 22.66, "step": 30288, "token_acc": 0.9897959183673469, "train_speed(iter/s)": 0.957055 }, { "epoch": 0.983952181398824, "grad_norm": 0.3584934175014496, "learning_rate": 7.039598149525262e-09, "loss": 0.009258383885025978, "memory(GiB)": 22.66, "step": 30289, "token_acc": 1.0, "train_speed(iter/s)": 0.957062 }, { "epoch": 0.9839846668615795, "grad_norm": 0.38131478428840637, "learning_rate": 7.01113325451308e-09, "loss": 0.012625638395547867, "memory(GiB)": 22.66, "step": 30290, "token_acc": 0.9863636363636363, "train_speed(iter/s)": 0.957068 }, { "epoch": 0.9840171523243348, "grad_norm": 0.3218298852443695, "learning_rate": 6.982725985238414e-09, "loss": 0.009457015432417393, "memory(GiB)": 22.66, "step": 30291, "token_acc": 1.0, "train_speed(iter/s)": 0.957075 }, { "epoch": 0.9840496377870903, "grad_norm": 0.3918535113334656, "learning_rate": 6.954376342028779e-09, "loss": 0.012349866330623627, "memory(GiB)": 22.66, "step": 30292, "token_acc": 1.0, "train_speed(iter/s)": 0.957081 }, { "epoch": 0.9840821232498457, "grad_norm": 0.318941205739975, "learning_rate": 6.926084325211136e-09, "loss": 0.011111635714769363, "memory(GiB)": 22.66, "step": 30293, "token_acc": 1.0, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.9841146087126011, "grad_norm": 0.40198561549186707, "learning_rate": 6.897849935112444e-09, "loss": 0.011354543268680573, "memory(GiB)": 22.66, "step": 30294, "token_acc": 1.0, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.9841470941753565, "grad_norm": 0.23214790225028992, "learning_rate": 6.869673172058555e-09, "loss": 0.006507996469736099, "memory(GiB)": 22.66, "step": 30295, "token_acc": 1.0, "train_speed(iter/s)": 0.957101 }, { "epoch": 0.984179579638112, "grad_norm": 0.4468008875846863, "learning_rate": 6.84155403637421e-09, "loss": 0.012767985463142395, "memory(GiB)": 22.66, "step": 30296, "token_acc": 1.0, "train_speed(iter/s)": 0.957107 }, { "epoch": 0.9842120651008673, "grad_norm": 0.5041795969009399, "learning_rate": 6.813492528384147e-09, "loss": 0.018726851791143417, "memory(GiB)": 22.66, "step": 30297, "token_acc": 0.9913419913419913, "train_speed(iter/s)": 0.957113 }, { "epoch": 0.9842445505636228, "grad_norm": 0.3080090880393982, "learning_rate": 6.785488648411997e-09, "loss": 0.015939027070999146, "memory(GiB)": 22.66, "step": 30298, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957118 }, { "epoch": 0.9842770360263782, "grad_norm": 0.2683364748954773, "learning_rate": 6.757542396781946e-09, "loss": 0.006129665300250053, "memory(GiB)": 22.66, "step": 30299, "token_acc": 1.0, "train_speed(iter/s)": 0.957123 }, { "epoch": 0.9843095214891336, "grad_norm": 0.27301520109176636, "learning_rate": 6.7296537738154035e-09, "loss": 0.007150782272219658, "memory(GiB)": 22.66, "step": 30300, "token_acc": 1.0, "train_speed(iter/s)": 0.957128 }, { "epoch": 0.984342006951889, "grad_norm": 0.2854650318622589, "learning_rate": 6.701822779834888e-09, "loss": 0.008269097656011581, "memory(GiB)": 22.66, "step": 30301, "token_acc": 1.0, "train_speed(iter/s)": 0.957134 }, { "epoch": 0.9843744924146445, "grad_norm": 0.41191497445106506, "learning_rate": 6.6740494151612544e-09, "loss": 0.011465702205896378, "memory(GiB)": 22.66, "step": 30302, "token_acc": 1.0, "train_speed(iter/s)": 0.957139 }, { "epoch": 0.9844069778773998, "grad_norm": 0.323729932308197, "learning_rate": 6.646333680115357e-09, "loss": 0.006336622405797243, "memory(GiB)": 22.66, "step": 30303, "token_acc": 1.0, "train_speed(iter/s)": 0.957144 }, { "epoch": 0.9844394633401553, "grad_norm": 0.36578482389450073, "learning_rate": 6.6186755750163864e-09, "loss": 0.014337604865431786, "memory(GiB)": 22.66, "step": 30304, "token_acc": 0.9812734082397003, "train_speed(iter/s)": 0.957149 }, { "epoch": 0.9844719488029107, "grad_norm": 0.41287142038345337, "learning_rate": 6.591075100184641e-09, "loss": 0.008458651602268219, "memory(GiB)": 22.66, "step": 30305, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.957154 }, { "epoch": 0.9845044342656661, "grad_norm": 0.3508400321006775, "learning_rate": 6.563532255938199e-09, "loss": 0.00941995158791542, "memory(GiB)": 22.66, "step": 30306, "token_acc": 1.0, "train_speed(iter/s)": 0.957159 }, { "epoch": 0.9845369197284215, "grad_norm": 0.2572227716445923, "learning_rate": 6.536047042594584e-09, "loss": 0.00868668407201767, "memory(GiB)": 22.66, "step": 30307, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.957163 }, { "epoch": 0.984569405191177, "grad_norm": 0.27925851941108704, "learning_rate": 6.508619460471322e-09, "loss": 0.009663421660661697, "memory(GiB)": 22.66, "step": 30308, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957168 }, { "epoch": 0.9846018906539323, "grad_norm": 0.3002697229385376, "learning_rate": 6.481249509884824e-09, "loss": 0.007361285388469696, "memory(GiB)": 22.66, "step": 30309, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957173 }, { "epoch": 0.9846343761166878, "grad_norm": 0.3194234371185303, "learning_rate": 6.453937191150949e-09, "loss": 0.01157435029745102, "memory(GiB)": 22.66, "step": 30310, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957178 }, { "epoch": 0.9846668615794432, "grad_norm": 0.32090499997138977, "learning_rate": 6.4266825045850025e-09, "loss": 0.0092872753739357, "memory(GiB)": 22.66, "step": 30311, "token_acc": 0.9910313901345291, "train_speed(iter/s)": 0.957183 }, { "epoch": 0.9846993470421986, "grad_norm": 0.3454444110393524, "learning_rate": 6.39948545050173e-09, "loss": 0.014958919957280159, "memory(GiB)": 22.66, "step": 30312, "token_acc": 1.0, "train_speed(iter/s)": 0.957188 }, { "epoch": 0.984731832504954, "grad_norm": 0.4141390025615692, "learning_rate": 6.372346029215326e-09, "loss": 0.01430345419794321, "memory(GiB)": 22.66, "step": 30313, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957194 }, { "epoch": 0.9847643179677095, "grad_norm": 0.4686822295188904, "learning_rate": 6.345264241037763e-09, "loss": 0.015031008049845695, "memory(GiB)": 22.66, "step": 30314, "token_acc": 0.987603305785124, "train_speed(iter/s)": 0.957198 }, { "epoch": 0.9847968034304648, "grad_norm": 0.3303888738155365, "learning_rate": 6.318240086283234e-09, "loss": 0.007514352444559336, "memory(GiB)": 22.66, "step": 30315, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.957203 }, { "epoch": 0.9848292888932203, "grad_norm": 0.31696707010269165, "learning_rate": 6.291273565262046e-09, "loss": 0.005947993602603674, "memory(GiB)": 22.66, "step": 30316, "token_acc": 1.0, "train_speed(iter/s)": 0.957209 }, { "epoch": 0.9848617743559757, "grad_norm": 0.3654242157936096, "learning_rate": 6.2643646782861726e-09, "loss": 0.00976611115038395, "memory(GiB)": 22.66, "step": 30317, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957214 }, { "epoch": 0.9848942598187311, "grad_norm": 0.3503986895084381, "learning_rate": 6.237513425666475e-09, "loss": 0.008991430513560772, "memory(GiB)": 22.66, "step": 30318, "token_acc": 0.9773755656108597, "train_speed(iter/s)": 0.957221 }, { "epoch": 0.9849267452814865, "grad_norm": 0.28248345851898193, "learning_rate": 6.210719807712706e-09, "loss": 0.008037666790187359, "memory(GiB)": 22.66, "step": 30319, "token_acc": 1.0, "train_speed(iter/s)": 0.957228 }, { "epoch": 0.984959230744242, "grad_norm": 0.452838659286499, "learning_rate": 6.1839838247335084e-09, "loss": 0.014978112652897835, "memory(GiB)": 22.66, "step": 30320, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.957235 }, { "epoch": 0.9849917162069973, "grad_norm": 0.41546666622161865, "learning_rate": 6.157305477038633e-09, "loss": 0.016050968319177628, "memory(GiB)": 22.66, "step": 30321, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957241 }, { "epoch": 0.9850242016697528, "grad_norm": 0.22190506756305695, "learning_rate": 6.130684764934503e-09, "loss": 0.009553102776408195, "memory(GiB)": 22.66, "step": 30322, "token_acc": 0.9894736842105263, "train_speed(iter/s)": 0.957248 }, { "epoch": 0.9850566871325082, "grad_norm": 0.3040325343608856, "learning_rate": 6.104121688729203e-09, "loss": 0.010110666044056416, "memory(GiB)": 22.66, "step": 30323, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957254 }, { "epoch": 0.9850891725952636, "grad_norm": 0.4051488935947418, "learning_rate": 6.077616248729712e-09, "loss": 0.010964644141495228, "memory(GiB)": 22.66, "step": 30324, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.957259 }, { "epoch": 0.985121658058019, "grad_norm": 0.25910428166389465, "learning_rate": 6.051168445240785e-09, "loss": 0.007610561326146126, "memory(GiB)": 22.66, "step": 30325, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.957264 }, { "epoch": 0.9851541435207745, "grad_norm": 0.4040583074092865, "learning_rate": 6.024778278568844e-09, "loss": 0.013055284507572651, "memory(GiB)": 22.66, "step": 30326, "token_acc": 0.9795918367346939, "train_speed(iter/s)": 0.95727 }, { "epoch": 0.9851866289835298, "grad_norm": 0.5233426094055176, "learning_rate": 5.9984457490175344e-09, "loss": 0.015589429996907711, "memory(GiB)": 22.66, "step": 30327, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.957275 }, { "epoch": 0.9852191144462853, "grad_norm": 0.2622871696949005, "learning_rate": 5.9721708568910576e-09, "loss": 0.007123190443962812, "memory(GiB)": 22.66, "step": 30328, "token_acc": 1.0, "train_speed(iter/s)": 0.957281 }, { "epoch": 0.9852515999090407, "grad_norm": 0.3458741307258606, "learning_rate": 5.9459536024930595e-09, "loss": 0.011308923363685608, "memory(GiB)": 22.66, "step": 30329, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957286 }, { "epoch": 0.9852840853717961, "grad_norm": 0.34134790301322937, "learning_rate": 5.919793986126077e-09, "loss": 0.008982726372778416, "memory(GiB)": 22.66, "step": 30330, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.957291 }, { "epoch": 0.9853165708345515, "grad_norm": 0.31753501296043396, "learning_rate": 5.893692008091534e-09, "loss": 0.007239790633320808, "memory(GiB)": 22.66, "step": 30331, "token_acc": 1.0, "train_speed(iter/s)": 0.957297 }, { "epoch": 0.985349056297307, "grad_norm": 0.29439952969551086, "learning_rate": 5.867647668690857e-09, "loss": 0.009288495406508446, "memory(GiB)": 22.66, "step": 30332, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957303 }, { "epoch": 0.9853815417600623, "grad_norm": 0.27802035212516785, "learning_rate": 5.841660968224361e-09, "loss": 0.00505534652620554, "memory(GiB)": 22.66, "step": 30333, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957308 }, { "epoch": 0.9854140272228178, "grad_norm": 0.4355582892894745, "learning_rate": 5.8157319069929166e-09, "loss": 0.010679705068469048, "memory(GiB)": 22.66, "step": 30334, "token_acc": 1.0, "train_speed(iter/s)": 0.957313 }, { "epoch": 0.9854465126855732, "grad_norm": 0.24363823235034943, "learning_rate": 5.789860485294618e-09, "loss": 0.007182367146015167, "memory(GiB)": 22.66, "step": 30335, "token_acc": 0.9961832061068703, "train_speed(iter/s)": 0.957318 }, { "epoch": 0.9854789981483286, "grad_norm": 0.24005268514156342, "learning_rate": 5.764046703429227e-09, "loss": 0.004586946219205856, "memory(GiB)": 22.66, "step": 30336, "token_acc": 1.0, "train_speed(iter/s)": 0.957323 }, { "epoch": 0.9855114836110841, "grad_norm": 0.3399936854839325, "learning_rate": 5.738290561693726e-09, "loss": 0.005544146057218313, "memory(GiB)": 22.66, "step": 30337, "token_acc": 0.992, "train_speed(iter/s)": 0.957329 }, { "epoch": 0.9855439690738395, "grad_norm": 0.4989281892776489, "learning_rate": 5.712592060385657e-09, "loss": 0.012294645421206951, "memory(GiB)": 22.66, "step": 30338, "token_acc": 0.9911111111111112, "train_speed(iter/s)": 0.957335 }, { "epoch": 0.9855764545365949, "grad_norm": 0.47542014718055725, "learning_rate": 5.6869511998014494e-09, "loss": 0.014753066003322601, "memory(GiB)": 22.66, "step": 30339, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.95734 }, { "epoch": 0.9856089399993503, "grad_norm": 0.347998708486557, "learning_rate": 5.661367980237531e-09, "loss": 0.011108187958598137, "memory(GiB)": 22.66, "step": 30340, "token_acc": 1.0, "train_speed(iter/s)": 0.957346 }, { "epoch": 0.9856414254621058, "grad_norm": 0.41957977414131165, "learning_rate": 5.635842401988667e-09, "loss": 0.011959009803831577, "memory(GiB)": 22.66, "step": 30341, "token_acc": 0.9916666666666667, "train_speed(iter/s)": 0.957351 }, { "epoch": 0.9856739109248611, "grad_norm": 0.29226309061050415, "learning_rate": 5.6103744653501764e-09, "loss": 0.007744426839053631, "memory(GiB)": 22.66, "step": 30342, "token_acc": 1.0, "train_speed(iter/s)": 0.957356 }, { "epoch": 0.9857063963876166, "grad_norm": 0.3129335343837738, "learning_rate": 5.584964170615159e-09, "loss": 0.009108097292482853, "memory(GiB)": 22.66, "step": 30343, "token_acc": 1.0, "train_speed(iter/s)": 0.957361 }, { "epoch": 0.985738881850372, "grad_norm": 0.2802477777004242, "learning_rate": 5.559611518076713e-09, "loss": 0.010057363659143448, "memory(GiB)": 22.66, "step": 30344, "token_acc": 0.9953703703703703, "train_speed(iter/s)": 0.957366 }, { "epoch": 0.9857713673131274, "grad_norm": 0.28838762640953064, "learning_rate": 5.534316508028492e-09, "loss": 0.004643995314836502, "memory(GiB)": 22.66, "step": 30345, "token_acc": 1.0, "train_speed(iter/s)": 0.957371 }, { "epoch": 0.9858038527758828, "grad_norm": 0.24464374780654907, "learning_rate": 5.509079140761931e-09, "loss": 0.00663158530369401, "memory(GiB)": 22.66, "step": 30346, "token_acc": 1.0, "train_speed(iter/s)": 0.957376 }, { "epoch": 0.9858363382386383, "grad_norm": 0.34470707178115845, "learning_rate": 5.483899416567906e-09, "loss": 0.012502018362283707, "memory(GiB)": 22.66, "step": 30347, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957381 }, { "epoch": 0.9858688237013936, "grad_norm": 0.3876594007015228, "learning_rate": 5.458777335737852e-09, "loss": 0.013514257967472076, "memory(GiB)": 22.66, "step": 30348, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957388 }, { "epoch": 0.9859013091641491, "grad_norm": 0.22329817712306976, "learning_rate": 5.433712898560983e-09, "loss": 0.010845374315977097, "memory(GiB)": 22.66, "step": 30349, "token_acc": 0.9881422924901185, "train_speed(iter/s)": 0.957395 }, { "epoch": 0.9859337946269044, "grad_norm": 0.2929134964942932, "learning_rate": 5.408706105326511e-09, "loss": 0.009560422040522099, "memory(GiB)": 22.66, "step": 30350, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.957401 }, { "epoch": 0.9859662800896599, "grad_norm": 0.3502848148345947, "learning_rate": 5.383756956323649e-09, "loss": 0.012410384602844715, "memory(GiB)": 22.66, "step": 30351, "token_acc": 0.9880952380952381, "train_speed(iter/s)": 0.957408 }, { "epoch": 0.9859987655524153, "grad_norm": 0.35623568296432495, "learning_rate": 5.358865451839945e-09, "loss": 0.007449849043041468, "memory(GiB)": 22.66, "step": 30352, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.957415 }, { "epoch": 0.9860312510151708, "grad_norm": 0.21413922309875488, "learning_rate": 5.334031592162947e-09, "loss": 0.007399579975754023, "memory(GiB)": 22.66, "step": 30353, "token_acc": 1.0, "train_speed(iter/s)": 0.957421 }, { "epoch": 0.9860637364779261, "grad_norm": 0.33595532178878784, "learning_rate": 5.309255377579093e-09, "loss": 0.012451006099581718, "memory(GiB)": 22.66, "step": 30354, "token_acc": 1.0, "train_speed(iter/s)": 0.957428 }, { "epoch": 0.9860962219406816, "grad_norm": 0.29749757051467896, "learning_rate": 5.2845368083742635e-09, "loss": 0.010904823429882526, "memory(GiB)": 22.66, "step": 30355, "token_acc": 1.0, "train_speed(iter/s)": 0.957435 }, { "epoch": 0.9861287074034369, "grad_norm": 0.41537079215049744, "learning_rate": 5.259875884833787e-09, "loss": 0.014811648987233639, "memory(GiB)": 22.66, "step": 30356, "token_acc": 0.9913793103448276, "train_speed(iter/s)": 0.957442 }, { "epoch": 0.9861611928661924, "grad_norm": 0.3766452372074127, "learning_rate": 5.235272607242436e-09, "loss": 0.014992360025644302, "memory(GiB)": 22.66, "step": 30357, "token_acc": 1.0, "train_speed(iter/s)": 0.957447 }, { "epoch": 0.9861936783289478, "grad_norm": 0.31811457872390747, "learning_rate": 5.210726975883873e-09, "loss": 0.008509401232004166, "memory(GiB)": 22.66, "step": 30358, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.957453 }, { "epoch": 0.9862261637917032, "grad_norm": 0.36286821961402893, "learning_rate": 5.186238991041759e-09, "loss": 0.013596959412097931, "memory(GiB)": 22.66, "step": 30359, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.957458 }, { "epoch": 0.9862586492544586, "grad_norm": 0.3771436810493469, "learning_rate": 5.161808652998646e-09, "loss": 0.01010860688984394, "memory(GiB)": 22.66, "step": 30360, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.957463 }, { "epoch": 0.9862911347172141, "grad_norm": 0.20548826456069946, "learning_rate": 5.13743596203653e-09, "loss": 0.007012138143181801, "memory(GiB)": 22.66, "step": 30361, "token_acc": 1.0, "train_speed(iter/s)": 0.957468 }, { "epoch": 0.9863236201799694, "grad_norm": 0.3384905159473419, "learning_rate": 5.113120918435743e-09, "loss": 0.011399151757359505, "memory(GiB)": 22.66, "step": 30362, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957472 }, { "epoch": 0.9863561056427249, "grad_norm": 0.36573874950408936, "learning_rate": 5.088863522478282e-09, "loss": 0.013401487842202187, "memory(GiB)": 22.66, "step": 30363, "token_acc": 0.9854545454545455, "train_speed(iter/s)": 0.957477 }, { "epoch": 0.9863885911054803, "grad_norm": 0.393350213766098, "learning_rate": 5.064663774443368e-09, "loss": 0.010863937437534332, "memory(GiB)": 22.66, "step": 30364, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.957482 }, { "epoch": 0.9864210765682357, "grad_norm": 0.3250143229961395, "learning_rate": 5.040521674610776e-09, "loss": 0.01254298072308302, "memory(GiB)": 22.66, "step": 30365, "token_acc": 0.9919678714859438, "train_speed(iter/s)": 0.957487 }, { "epoch": 0.9864535620309911, "grad_norm": 0.3219878375530243, "learning_rate": 5.016437223258619e-09, "loss": 0.008234865963459015, "memory(GiB)": 22.66, "step": 30366, "token_acc": 1.0, "train_speed(iter/s)": 0.957492 }, { "epoch": 0.9864860474937466, "grad_norm": 0.405971497297287, "learning_rate": 4.992410420665006e-09, "loss": 0.011841587722301483, "memory(GiB)": 22.66, "step": 30367, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.957498 }, { "epoch": 0.9865185329565019, "grad_norm": 0.40992116928100586, "learning_rate": 4.968441267107493e-09, "loss": 0.009364360943436623, "memory(GiB)": 22.66, "step": 30368, "token_acc": 1.0, "train_speed(iter/s)": 0.957503 }, { "epoch": 0.9865510184192574, "grad_norm": 0.3201653063297272, "learning_rate": 4.944529762862527e-09, "loss": 0.008101512677967548, "memory(GiB)": 22.66, "step": 30369, "token_acc": 1.0, "train_speed(iter/s)": 0.957508 }, { "epoch": 0.9865835038820128, "grad_norm": 0.47662195563316345, "learning_rate": 4.920675908205996e-09, "loss": 0.011149322614073753, "memory(GiB)": 22.66, "step": 30370, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.957513 }, { "epoch": 0.9866159893447682, "grad_norm": 0.43243110179901123, "learning_rate": 4.896879703413237e-09, "loss": 0.013119369745254517, "memory(GiB)": 22.66, "step": 30371, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957518 }, { "epoch": 0.9866484748075236, "grad_norm": 0.4535799026489258, "learning_rate": 4.873141148759031e-09, "loss": 0.014650378376245499, "memory(GiB)": 22.66, "step": 30372, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.957524 }, { "epoch": 0.9866809602702791, "grad_norm": 0.30364900827407837, "learning_rate": 4.849460244517601e-09, "loss": 0.00969104003161192, "memory(GiB)": 22.66, "step": 30373, "token_acc": 0.9872881355932204, "train_speed(iter/s)": 0.957529 }, { "epoch": 0.9867134457330344, "grad_norm": 0.2468843162059784, "learning_rate": 4.825836990961508e-09, "loss": 0.006488766986876726, "memory(GiB)": 22.66, "step": 30374, "token_acc": 1.0, "train_speed(iter/s)": 0.957534 }, { "epoch": 0.9867459311957899, "grad_norm": 0.377398282289505, "learning_rate": 4.802271388363866e-09, "loss": 0.014693137258291245, "memory(GiB)": 22.66, "step": 30375, "token_acc": 1.0, "train_speed(iter/s)": 0.957539 }, { "epoch": 0.9867784166585453, "grad_norm": 0.4142920970916748, "learning_rate": 4.77876343699668e-09, "loss": 0.014792162925004959, "memory(GiB)": 22.66, "step": 30376, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957544 }, { "epoch": 0.9868109021213007, "grad_norm": 0.35273608565330505, "learning_rate": 4.755313137131401e-09, "loss": 0.009541385807096958, "memory(GiB)": 22.66, "step": 30377, "token_acc": 1.0, "train_speed(iter/s)": 0.957551 }, { "epoch": 0.9868433875840561, "grad_norm": 0.3902616798877716, "learning_rate": 4.731920489037811e-09, "loss": 0.010504797101020813, "memory(GiB)": 22.66, "step": 30378, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.957558 }, { "epoch": 0.9868758730468116, "grad_norm": 0.4060397446155548, "learning_rate": 4.708585492987361e-09, "loss": 0.01226167194545269, "memory(GiB)": 22.66, "step": 30379, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957564 }, { "epoch": 0.9869083585095669, "grad_norm": 0.2577609121799469, "learning_rate": 4.685308149248169e-09, "loss": 0.007956785149872303, "memory(GiB)": 22.66, "step": 30380, "token_acc": 1.0, "train_speed(iter/s)": 0.957571 }, { "epoch": 0.9869408439723224, "grad_norm": 0.2612711489200592, "learning_rate": 4.662088458089464e-09, "loss": 0.006323328707367182, "memory(GiB)": 22.66, "step": 30381, "token_acc": 1.0, "train_speed(iter/s)": 0.957578 }, { "epoch": 0.9869733294350778, "grad_norm": 0.42982807755470276, "learning_rate": 4.638926419778256e-09, "loss": 0.01572156883776188, "memory(GiB)": 22.66, "step": 30382, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957584 }, { "epoch": 0.9870058148978332, "grad_norm": 0.21354393661022186, "learning_rate": 4.615822034583772e-09, "loss": 0.009996360167860985, "memory(GiB)": 22.66, "step": 30383, "token_acc": 1.0, "train_speed(iter/s)": 0.957591 }, { "epoch": 0.9870383003605886, "grad_norm": 0.3437383472919464, "learning_rate": 4.5927753027708024e-09, "loss": 0.009997894987463951, "memory(GiB)": 22.66, "step": 30384, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.957598 }, { "epoch": 0.9870707858233441, "grad_norm": 0.2809036076068878, "learning_rate": 4.569786224606354e-09, "loss": 0.010097945109009743, "memory(GiB)": 22.66, "step": 30385, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957604 }, { "epoch": 0.9871032712860994, "grad_norm": 0.3759424686431885, "learning_rate": 4.546854800355216e-09, "loss": 0.01320636086165905, "memory(GiB)": 22.66, "step": 30386, "token_acc": 0.9918032786885246, "train_speed(iter/s)": 0.95761 }, { "epoch": 0.9871357567488549, "grad_norm": 0.2875106632709503, "learning_rate": 4.523981030282732e-09, "loss": 0.009330063126981258, "memory(GiB)": 22.66, "step": 30387, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957615 }, { "epoch": 0.9871682422116103, "grad_norm": 0.295574814081192, "learning_rate": 4.501164914652023e-09, "loss": 0.0090645682066679, "memory(GiB)": 22.66, "step": 30388, "token_acc": 1.0, "train_speed(iter/s)": 0.95762 }, { "epoch": 0.9872007276743657, "grad_norm": 0.4216651916503906, "learning_rate": 4.478406453727324e-09, "loss": 0.01180737279355526, "memory(GiB)": 22.66, "step": 30389, "token_acc": 0.9949494949494949, "train_speed(iter/s)": 0.957625 }, { "epoch": 0.9872332131371211, "grad_norm": 0.28212037682533264, "learning_rate": 4.455705647770647e-09, "loss": 0.014373651705682278, "memory(GiB)": 22.66, "step": 30390, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.957631 }, { "epoch": 0.9872656985998766, "grad_norm": 0.30401697754859924, "learning_rate": 4.433062497044005e-09, "loss": 0.01428441982716322, "memory(GiB)": 22.66, "step": 30391, "token_acc": 1.0, "train_speed(iter/s)": 0.957636 }, { "epoch": 0.9872981840626319, "grad_norm": 0.508236289024353, "learning_rate": 4.410477001808855e-09, "loss": 0.0150680523365736, "memory(GiB)": 22.66, "step": 30392, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.957641 }, { "epoch": 0.9873306695253874, "grad_norm": 0.3166719079017639, "learning_rate": 4.387949162326099e-09, "loss": 0.009902635589241982, "memory(GiB)": 22.66, "step": 30393, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957646 }, { "epoch": 0.9873631549881428, "grad_norm": 0.3245457410812378, "learning_rate": 4.365478978855531e-09, "loss": 0.007215573918074369, "memory(GiB)": 22.66, "step": 30394, "token_acc": 1.0, "train_speed(iter/s)": 0.957652 }, { "epoch": 0.9873956404508982, "grad_norm": 0.2922990918159485, "learning_rate": 4.343066451656386e-09, "loss": 0.012647924944758415, "memory(GiB)": 22.66, "step": 30395, "token_acc": 0.9901960784313726, "train_speed(iter/s)": 0.957657 }, { "epoch": 0.9874281259136536, "grad_norm": 0.31328266859054565, "learning_rate": 4.320711580987902e-09, "loss": 0.010342778638005257, "memory(GiB)": 22.66, "step": 30396, "token_acc": 1.0, "train_speed(iter/s)": 0.957662 }, { "epoch": 0.9874606113764091, "grad_norm": 0.40368905663490295, "learning_rate": 4.2984143671070955e-09, "loss": 0.012506606057286263, "memory(GiB)": 22.66, "step": 30397, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957667 }, { "epoch": 0.9874930968391644, "grad_norm": 0.33718839287757874, "learning_rate": 4.2761748102720935e-09, "loss": 0.011407814919948578, "memory(GiB)": 22.66, "step": 30398, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957672 }, { "epoch": 0.9875255823019199, "grad_norm": 0.6946812868118286, "learning_rate": 4.253992910739358e-09, "loss": 0.009499717503786087, "memory(GiB)": 22.66, "step": 30399, "token_acc": 1.0, "train_speed(iter/s)": 0.957677 }, { "epoch": 0.9875580677646754, "grad_norm": 0.5990253686904907, "learning_rate": 4.231868668764794e-09, "loss": 0.01242711953818798, "memory(GiB)": 22.66, "step": 30400, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.957683 }, { "epoch": 0.9875905532274307, "grad_norm": 0.2531229853630066, "learning_rate": 4.209802084603753e-09, "loss": 0.006382693536579609, "memory(GiB)": 22.66, "step": 30401, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.957689 }, { "epoch": 0.9876230386901862, "grad_norm": 0.2789452373981476, "learning_rate": 4.187793158511589e-09, "loss": 0.00935730617493391, "memory(GiB)": 22.66, "step": 30402, "token_acc": 1.0, "train_speed(iter/s)": 0.957694 }, { "epoch": 0.9876555241529416, "grad_norm": 0.37480628490448, "learning_rate": 4.1658418907408735e-09, "loss": 0.015178891830146313, "memory(GiB)": 22.66, "step": 30403, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.9577 }, { "epoch": 0.987688009615697, "grad_norm": 0.33891770243644714, "learning_rate": 4.143948281546406e-09, "loss": 0.01039675809442997, "memory(GiB)": 22.66, "step": 30404, "token_acc": 1.0, "train_speed(iter/s)": 0.957704 }, { "epoch": 0.9877204950784524, "grad_norm": 0.37944546341896057, "learning_rate": 4.122112331179651e-09, "loss": 0.013544470071792603, "memory(GiB)": 22.66, "step": 30405, "token_acc": 1.0, "train_speed(iter/s)": 0.957709 }, { "epoch": 0.9877529805412079, "grad_norm": 0.3096488118171692, "learning_rate": 4.100334039893739e-09, "loss": 0.007993079721927643, "memory(GiB)": 22.66, "step": 30406, "token_acc": 0.9959677419354839, "train_speed(iter/s)": 0.957715 }, { "epoch": 0.9877854660039632, "grad_norm": 0.5058179497718811, "learning_rate": 4.078613407939025e-09, "loss": 0.019132345914840698, "memory(GiB)": 22.66, "step": 30407, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.957721 }, { "epoch": 0.9878179514667187, "grad_norm": 0.38659724593162537, "learning_rate": 4.056950435566975e-09, "loss": 0.01430603675544262, "memory(GiB)": 22.66, "step": 30408, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957726 }, { "epoch": 0.987850436929474, "grad_norm": 0.27453315258026123, "learning_rate": 4.03534512302739e-09, "loss": 0.008273645304143429, "memory(GiB)": 22.66, "step": 30409, "token_acc": 0.99609375, "train_speed(iter/s)": 0.957732 }, { "epoch": 0.9878829223922295, "grad_norm": 0.42233806848526, "learning_rate": 4.013797470568959e-09, "loss": 0.015408966690301895, "memory(GiB)": 22.66, "step": 30410, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957739 }, { "epoch": 0.9879154078549849, "grad_norm": 0.3496015965938568, "learning_rate": 3.992307478441482e-09, "loss": 0.012513944879174232, "memory(GiB)": 22.66, "step": 30411, "token_acc": 0.9956709956709957, "train_speed(iter/s)": 0.957745 }, { "epoch": 0.9879478933177404, "grad_norm": 0.40477195382118225, "learning_rate": 3.970875146892539e-09, "loss": 0.010856321081519127, "memory(GiB)": 22.66, "step": 30412, "token_acc": 1.0, "train_speed(iter/s)": 0.957752 }, { "epoch": 0.9879803787804957, "grad_norm": 0.30273494124412537, "learning_rate": 3.949500476169155e-09, "loss": 0.011635331436991692, "memory(GiB)": 22.66, "step": 30413, "token_acc": 0.9919354838709677, "train_speed(iter/s)": 0.957759 }, { "epoch": 0.9880128642432512, "grad_norm": 0.2961116433143616, "learning_rate": 3.928183466518354e-09, "loss": 0.012717061676084995, "memory(GiB)": 22.66, "step": 30414, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.957765 }, { "epoch": 0.9880453497060065, "grad_norm": 0.30429625511169434, "learning_rate": 3.906924118186051e-09, "loss": 0.01219354011118412, "memory(GiB)": 22.66, "step": 30415, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.957772 }, { "epoch": 0.988077835168762, "grad_norm": 0.38190728425979614, "learning_rate": 3.885722431417604e-09, "loss": 0.013893596827983856, "memory(GiB)": 22.66, "step": 30416, "token_acc": 0.9947643979057592, "train_speed(iter/s)": 0.957778 }, { "epoch": 0.9881103206315174, "grad_norm": 0.3261687755584717, "learning_rate": 3.8645784064578195e-09, "loss": 0.007916273549199104, "memory(GiB)": 22.66, "step": 30417, "token_acc": 1.0, "train_speed(iter/s)": 0.957783 }, { "epoch": 0.9881428060942729, "grad_norm": 0.19796574115753174, "learning_rate": 3.84349204355039e-09, "loss": 0.005490199662744999, "memory(GiB)": 22.66, "step": 30418, "token_acc": 1.0, "train_speed(iter/s)": 0.957789 }, { "epoch": 0.9881752915570282, "grad_norm": 0.35061129927635193, "learning_rate": 3.822463342939009e-09, "loss": 0.0067708552815020084, "memory(GiB)": 22.66, "step": 30419, "token_acc": 1.0, "train_speed(iter/s)": 0.957794 }, { "epoch": 0.9882077770197837, "grad_norm": 0.3503643572330475, "learning_rate": 3.801492304866261e-09, "loss": 0.009508904069662094, "memory(GiB)": 22.66, "step": 30420, "token_acc": 0.9921259842519685, "train_speed(iter/s)": 0.957799 }, { "epoch": 0.988240262482539, "grad_norm": 0.3978111445903778, "learning_rate": 3.78057892957473e-09, "loss": 0.015826696529984474, "memory(GiB)": 22.66, "step": 30421, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.957804 }, { "epoch": 0.9882727479452945, "grad_norm": 0.29785215854644775, "learning_rate": 3.7597232173047786e-09, "loss": 0.009583793580532074, "memory(GiB)": 22.66, "step": 30422, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.957809 }, { "epoch": 0.9883052334080499, "grad_norm": 0.40767061710357666, "learning_rate": 3.73892516829788e-09, "loss": 0.013903331942856312, "memory(GiB)": 22.66, "step": 30423, "token_acc": 0.9887218045112782, "train_speed(iter/s)": 0.957815 }, { "epoch": 0.9883377188708053, "grad_norm": 0.35239607095718384, "learning_rate": 3.7181847827932883e-09, "loss": 0.010278533212840557, "memory(GiB)": 22.66, "step": 30424, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.95782 }, { "epoch": 0.9883702043335607, "grad_norm": 0.5104233026504517, "learning_rate": 3.697502061031366e-09, "loss": 0.014346946030855179, "memory(GiB)": 22.66, "step": 30425, "token_acc": 0.9921875, "train_speed(iter/s)": 0.957825 }, { "epoch": 0.9884026897963162, "grad_norm": 0.322724848985672, "learning_rate": 3.6768770032502566e-09, "loss": 0.009323881939053535, "memory(GiB)": 22.66, "step": 30426, "token_acc": 1.0, "train_speed(iter/s)": 0.95783 }, { "epoch": 0.9884351752590715, "grad_norm": 0.450711190700531, "learning_rate": 3.6563096096881023e-09, "loss": 0.012142951600253582, "memory(GiB)": 22.66, "step": 30427, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.957835 }, { "epoch": 0.988467660721827, "grad_norm": 0.31782135367393494, "learning_rate": 3.635799880581936e-09, "loss": 0.005939147435128689, "memory(GiB)": 22.66, "step": 30428, "token_acc": 1.0, "train_speed(iter/s)": 0.95784 }, { "epoch": 0.9885001461845824, "grad_norm": 0.2910952866077423, "learning_rate": 3.6153478161687903e-09, "loss": 0.009196741506457329, "memory(GiB)": 22.66, "step": 30429, "token_acc": 0.9925650557620818, "train_speed(iter/s)": 0.957845 }, { "epoch": 0.9885326316473378, "grad_norm": 0.2747861444950104, "learning_rate": 3.594953416685143e-09, "loss": 0.009362302720546722, "memory(GiB)": 22.66, "step": 30430, "token_acc": 0.9925373134328358, "train_speed(iter/s)": 0.95785 }, { "epoch": 0.9885651171100932, "grad_norm": 0.49902185797691345, "learning_rate": 3.5746166823658055e-09, "loss": 0.012301364913582802, "memory(GiB)": 22.66, "step": 30431, "token_acc": 0.9905660377358491, "train_speed(iter/s)": 0.957856 }, { "epoch": 0.9885976025728487, "grad_norm": 0.3587677776813507, "learning_rate": 3.554337613445591e-09, "loss": 0.01250732783228159, "memory(GiB)": 22.66, "step": 30432, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95786 }, { "epoch": 0.988630088035604, "grad_norm": 0.37118807435035706, "learning_rate": 3.5341162101582004e-09, "loss": 0.009501954540610313, "memory(GiB)": 22.66, "step": 30433, "token_acc": 1.0, "train_speed(iter/s)": 0.957865 }, { "epoch": 0.9886625734983595, "grad_norm": 0.427995890378952, "learning_rate": 3.5139524727378916e-09, "loss": 0.010484744794666767, "memory(GiB)": 22.66, "step": 30434, "token_acc": 0.9961685823754789, "train_speed(iter/s)": 0.95787 }, { "epoch": 0.9886950589611149, "grad_norm": 0.16349327564239502, "learning_rate": 3.4938464014167006e-09, "loss": 0.006331855431199074, "memory(GiB)": 22.66, "step": 30435, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.957875 }, { "epoch": 0.9887275444238703, "grad_norm": 0.3154071271419525, "learning_rate": 3.4737979964266645e-09, "loss": 0.00823214277625084, "memory(GiB)": 22.66, "step": 30436, "token_acc": 1.0, "train_speed(iter/s)": 0.957881 }, { "epoch": 0.9887600298866257, "grad_norm": 0.3065325915813446, "learning_rate": 3.4538072579998193e-09, "loss": 0.006394106429070234, "memory(GiB)": 22.66, "step": 30437, "token_acc": 1.0, "train_speed(iter/s)": 0.957888 }, { "epoch": 0.9887925153493812, "grad_norm": 0.42095839977264404, "learning_rate": 3.433874186365982e-09, "loss": 0.010734127834439278, "memory(GiB)": 22.66, "step": 30438, "token_acc": 1.0, "train_speed(iter/s)": 0.957894 }, { "epoch": 0.9888250008121365, "grad_norm": 0.31018587946891785, "learning_rate": 3.4139987817555233e-09, "loss": 0.01148323342204094, "memory(GiB)": 22.66, "step": 30439, "token_acc": 0.9924528301886792, "train_speed(iter/s)": 0.957901 }, { "epoch": 0.988857486274892, "grad_norm": 0.3121275007724762, "learning_rate": 3.394181044398259e-09, "loss": 0.00937812402844429, "memory(GiB)": 22.66, "step": 30440, "token_acc": 0.988, "train_speed(iter/s)": 0.957908 }, { "epoch": 0.9888899717376474, "grad_norm": 0.3196130692958832, "learning_rate": 3.374420974522896e-09, "loss": 0.01140783540904522, "memory(GiB)": 22.66, "step": 30441, "token_acc": 0.9946524064171123, "train_speed(iter/s)": 0.957914 }, { "epoch": 0.9889224572004028, "grad_norm": 0.45614299178123474, "learning_rate": 3.35471857235703e-09, "loss": 0.013182688504457474, "memory(GiB)": 22.66, "step": 30442, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957921 }, { "epoch": 0.9889549426631582, "grad_norm": 0.33288300037384033, "learning_rate": 3.3350738381277006e-09, "loss": 0.008544697426259518, "memory(GiB)": 22.66, "step": 30443, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957928 }, { "epoch": 0.9889874281259137, "grad_norm": 0.26012343168258667, "learning_rate": 3.315486772062504e-09, "loss": 0.010132179595530033, "memory(GiB)": 22.66, "step": 30444, "token_acc": 1.0, "train_speed(iter/s)": 0.957935 }, { "epoch": 0.989019913588669, "grad_norm": 0.3389449119567871, "learning_rate": 3.295957374387371e-09, "loss": 0.013438031077384949, "memory(GiB)": 22.66, "step": 30445, "token_acc": 0.9952153110047847, "train_speed(iter/s)": 0.957941 }, { "epoch": 0.9890523990514245, "grad_norm": 0.25549811124801636, "learning_rate": 3.276485645327676e-09, "loss": 0.010561997070908546, "memory(GiB)": 22.66, "step": 30446, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.957948 }, { "epoch": 0.9890848845141799, "grad_norm": 0.40800344944000244, "learning_rate": 3.25707158510713e-09, "loss": 0.011872314848005772, "memory(GiB)": 22.66, "step": 30447, "token_acc": 1.0, "train_speed(iter/s)": 0.957955 }, { "epoch": 0.9891173699769353, "grad_norm": 0.24800671637058258, "learning_rate": 3.2377151939516628e-09, "loss": 0.008170276880264282, "memory(GiB)": 22.66, "step": 30448, "token_acc": 1.0, "train_speed(iter/s)": 0.957961 }, { "epoch": 0.9891498554396907, "grad_norm": 0.3242754638195038, "learning_rate": 3.2184164720827638e-09, "loss": 0.006892026402056217, "memory(GiB)": 22.66, "step": 30449, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.957966 }, { "epoch": 0.9891823409024462, "grad_norm": 0.3960956037044525, "learning_rate": 3.199175419724698e-09, "loss": 0.012120503932237625, "memory(GiB)": 22.66, "step": 30450, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957971 }, { "epoch": 0.9892148263652015, "grad_norm": 0.2683347761631012, "learning_rate": 3.1799920370984004e-09, "loss": 0.007956412620842457, "memory(GiB)": 22.66, "step": 30451, "token_acc": 0.996268656716418, "train_speed(iter/s)": 0.957977 }, { "epoch": 0.989247311827957, "grad_norm": 0.2466781735420227, "learning_rate": 3.1608663244259153e-09, "loss": 0.0074616242200136185, "memory(GiB)": 22.66, "step": 30452, "token_acc": 0.9949238578680203, "train_speed(iter/s)": 0.957982 }, { "epoch": 0.9892797972907124, "grad_norm": 0.2895495891571045, "learning_rate": 3.1417982819276214e-09, "loss": 0.009337019175291061, "memory(GiB)": 22.66, "step": 30453, "token_acc": 1.0, "train_speed(iter/s)": 0.957988 }, { "epoch": 0.9893122827534678, "grad_norm": 0.5082962512969971, "learning_rate": 3.1227879098238988e-09, "loss": 0.01378743164241314, "memory(GiB)": 22.66, "step": 30454, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.957993 }, { "epoch": 0.9893447682162232, "grad_norm": 0.3472079932689667, "learning_rate": 3.1038352083345714e-09, "loss": 0.009337479248642921, "memory(GiB)": 22.66, "step": 30455, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.957998 }, { "epoch": 0.9893772536789787, "grad_norm": 0.4103642404079437, "learning_rate": 3.0849401776772424e-09, "loss": 0.01411384716629982, "memory(GiB)": 22.66, "step": 30456, "token_acc": 0.9864253393665159, "train_speed(iter/s)": 0.958003 }, { "epoch": 0.989409739141734, "grad_norm": 0.3378198444843292, "learning_rate": 3.0661028180706266e-09, "loss": 0.008212706074118614, "memory(GiB)": 22.66, "step": 30457, "token_acc": 1.0, "train_speed(iter/s)": 0.958009 }, { "epoch": 0.9894422246044895, "grad_norm": 0.32778599858283997, "learning_rate": 3.047323129732327e-09, "loss": 0.010946052148938179, "memory(GiB)": 22.66, "step": 30458, "token_acc": 1.0, "train_speed(iter/s)": 0.958014 }, { "epoch": 0.9894747100672449, "grad_norm": 1.2076719999313354, "learning_rate": 3.0286011128782823e-09, "loss": 0.012755917385220528, "memory(GiB)": 22.66, "step": 30459, "token_acc": 1.0, "train_speed(iter/s)": 0.958019 }, { "epoch": 0.9895071955300003, "grad_norm": 0.5333963632583618, "learning_rate": 3.0099367677255407e-09, "loss": 0.02184389717876911, "memory(GiB)": 22.66, "step": 30460, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.958024 }, { "epoch": 0.9895396809927557, "grad_norm": 0.32038697600364685, "learning_rate": 2.9913300944889314e-09, "loss": 0.008976353332400322, "memory(GiB)": 22.66, "step": 30461, "token_acc": 1.0, "train_speed(iter/s)": 0.958029 }, { "epoch": 0.9895721664555112, "grad_norm": 0.40454769134521484, "learning_rate": 2.9727810933838365e-09, "loss": 0.011297278106212616, "memory(GiB)": 22.66, "step": 30462, "token_acc": 0.9889705882352942, "train_speed(iter/s)": 0.958035 }, { "epoch": 0.9896046519182665, "grad_norm": 0.3078480362892151, "learning_rate": 2.9542897646234192e-09, "loss": 0.013025381602346897, "memory(GiB)": 22.66, "step": 30463, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.958041 }, { "epoch": 0.989637137381022, "grad_norm": 0.3628990948200226, "learning_rate": 2.935856108421953e-09, "loss": 0.012464253231883049, "memory(GiB)": 22.66, "step": 30464, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.958047 }, { "epoch": 0.9896696228437775, "grad_norm": 0.34115466475486755, "learning_rate": 2.9174801249914897e-09, "loss": 0.008472234010696411, "memory(GiB)": 22.66, "step": 30465, "token_acc": 1.0, "train_speed(iter/s)": 0.958052 }, { "epoch": 0.9897021083065328, "grad_norm": 0.38135528564453125, "learning_rate": 2.8991618145446376e-09, "loss": 0.01159330178052187, "memory(GiB)": 22.66, "step": 30466, "token_acc": 1.0, "train_speed(iter/s)": 0.958058 }, { "epoch": 0.9897345937692883, "grad_norm": 0.23127016425132751, "learning_rate": 2.880901177292339e-09, "loss": 0.006329802796244621, "memory(GiB)": 22.66, "step": 30467, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.958062 }, { "epoch": 0.9897670792320437, "grad_norm": 0.3483629822731018, "learning_rate": 2.862698213445536e-09, "loss": 0.014257512986660004, "memory(GiB)": 22.66, "step": 30468, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.958067 }, { "epoch": 0.9897995646947991, "grad_norm": 0.34584447741508484, "learning_rate": 2.8445529232146164e-09, "loss": 0.00874759629368782, "memory(GiB)": 22.66, "step": 30469, "token_acc": 0.99644128113879, "train_speed(iter/s)": 0.958073 }, { "epoch": 0.9898320501575545, "grad_norm": 0.4284692406654358, "learning_rate": 2.8264653068088564e-09, "loss": 0.01636303961277008, "memory(GiB)": 22.66, "step": 30470, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.958078 }, { "epoch": 0.98986453562031, "grad_norm": 0.5863123536109924, "learning_rate": 2.808435364436424e-09, "loss": 0.013533465564250946, "memory(GiB)": 22.66, "step": 30471, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958084 }, { "epoch": 0.9898970210830653, "grad_norm": 0.23576276004314423, "learning_rate": 2.7904630963060397e-09, "loss": 0.007637157570570707, "memory(GiB)": 22.66, "step": 30472, "token_acc": 1.0, "train_speed(iter/s)": 0.95809 }, { "epoch": 0.9899295065458208, "grad_norm": 1.0580037832260132, "learning_rate": 2.7725485026253164e-09, "loss": 0.010354814119637012, "memory(GiB)": 22.66, "step": 30473, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.958097 }, { "epoch": 0.9899619920085762, "grad_norm": 0.2742493450641632, "learning_rate": 2.7546915836001995e-09, "loss": 0.007770500145852566, "memory(GiB)": 22.66, "step": 30474, "token_acc": 0.9881889763779528, "train_speed(iter/s)": 0.958103 }, { "epoch": 0.9899944774713316, "grad_norm": 0.41461458802223206, "learning_rate": 2.736892339437747e-09, "loss": 0.010156587697565556, "memory(GiB)": 22.66, "step": 30475, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.95811 }, { "epoch": 0.990026962934087, "grad_norm": 0.2900962829589844, "learning_rate": 2.719150770342793e-09, "loss": 0.0061994194984436035, "memory(GiB)": 22.66, "step": 30476, "token_acc": 1.0, "train_speed(iter/s)": 0.958117 }, { "epoch": 0.9900594483968425, "grad_norm": 0.4917232394218445, "learning_rate": 2.7014668765201756e-09, "loss": 0.010902335867285728, "memory(GiB)": 22.66, "step": 30477, "token_acc": 1.0, "train_speed(iter/s)": 0.958122 }, { "epoch": 0.9900919338595978, "grad_norm": 0.3610142469406128, "learning_rate": 2.6838406581736197e-09, "loss": 0.010855378583073616, "memory(GiB)": 22.66, "step": 30478, "token_acc": 0.9820627802690582, "train_speed(iter/s)": 0.958127 }, { "epoch": 0.9901244193223533, "grad_norm": 0.3678860068321228, "learning_rate": 2.666272115507407e-09, "loss": 0.0110268983989954, "memory(GiB)": 22.66, "step": 30479, "token_acc": 1.0, "train_speed(iter/s)": 0.958132 }, { "epoch": 0.9901569047851086, "grad_norm": 0.34462523460388184, "learning_rate": 2.648761248723597e-09, "loss": 0.007236634846776724, "memory(GiB)": 22.66, "step": 30480, "token_acc": 1.0, "train_speed(iter/s)": 0.958138 }, { "epoch": 0.9901893902478641, "grad_norm": 0.31571200489997864, "learning_rate": 2.631308058024806e-09, "loss": 0.008162477985024452, "memory(GiB)": 22.66, "step": 30481, "token_acc": 0.9929824561403509, "train_speed(iter/s)": 0.958143 }, { "epoch": 0.9902218757106195, "grad_norm": 0.27360936999320984, "learning_rate": 2.613912543611985e-09, "loss": 0.0095262061804533, "memory(GiB)": 22.66, "step": 30482, "token_acc": 1.0, "train_speed(iter/s)": 0.958148 }, { "epoch": 0.990254361173375, "grad_norm": 0.31047168374061584, "learning_rate": 2.5965747056860835e-09, "loss": 0.012954611331224442, "memory(GiB)": 22.66, "step": 30483, "token_acc": 0.9908256880733946, "train_speed(iter/s)": 0.958153 }, { "epoch": 0.9902868466361303, "grad_norm": 0.2715163230895996, "learning_rate": 2.579294544447497e-09, "loss": 0.007670421153306961, "memory(GiB)": 22.66, "step": 30484, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.958158 }, { "epoch": 0.9903193320988858, "grad_norm": 0.26367947459220886, "learning_rate": 2.5620720600955105e-09, "loss": 0.008883515372872353, "memory(GiB)": 22.66, "step": 30485, "token_acc": 0.995475113122172, "train_speed(iter/s)": 0.958163 }, { "epoch": 0.9903518175616411, "grad_norm": 0.3078192174434662, "learning_rate": 2.5449072528282993e-09, "loss": 0.012762360274791718, "memory(GiB)": 22.66, "step": 30486, "token_acc": 1.0, "train_speed(iter/s)": 0.958168 }, { "epoch": 0.9903843030243966, "grad_norm": 0.40046489238739014, "learning_rate": 2.5278001228445927e-09, "loss": 0.021983427926898003, "memory(GiB)": 22.66, "step": 30487, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.958173 }, { "epoch": 0.990416788487152, "grad_norm": 0.3379955589771271, "learning_rate": 2.5107506703420104e-09, "loss": 0.012022588402032852, "memory(GiB)": 22.66, "step": 30488, "token_acc": 1.0, "train_speed(iter/s)": 0.958178 }, { "epoch": 0.9904492739499074, "grad_norm": 0.373405396938324, "learning_rate": 2.493758895516507e-09, "loss": 0.014283446595072746, "memory(GiB)": 22.66, "step": 30489, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.958183 }, { "epoch": 0.9904817594126628, "grad_norm": 0.28216853737831116, "learning_rate": 2.4768247985651473e-09, "loss": 0.010842774994671345, "memory(GiB)": 22.66, "step": 30490, "token_acc": 1.0, "train_speed(iter/s)": 0.958187 }, { "epoch": 0.9905142448754183, "grad_norm": 0.21439622342586517, "learning_rate": 2.459948379682775e-09, "loss": 0.007484124973416328, "memory(GiB)": 22.66, "step": 30491, "token_acc": 1.0, "train_speed(iter/s)": 0.958192 }, { "epoch": 0.9905467303381736, "grad_norm": 0.38896608352661133, "learning_rate": 2.4431296390642346e-09, "loss": 0.008579222485423088, "memory(GiB)": 22.66, "step": 30492, "token_acc": 1.0, "train_speed(iter/s)": 0.958197 }, { "epoch": 0.9905792158009291, "grad_norm": 0.345532089471817, "learning_rate": 2.4263685769038146e-09, "loss": 0.010852683335542679, "memory(GiB)": 22.66, "step": 30493, "token_acc": 1.0, "train_speed(iter/s)": 0.958202 }, { "epoch": 0.9906117012636845, "grad_norm": 0.27219510078430176, "learning_rate": 2.4096651933946945e-09, "loss": 0.009748117998242378, "memory(GiB)": 22.66, "step": 30494, "token_acc": 1.0, "train_speed(iter/s)": 0.958208 }, { "epoch": 0.99064418672644, "grad_norm": 0.4017806351184845, "learning_rate": 2.393019488730053e-09, "loss": 0.013509616255760193, "memory(GiB)": 22.66, "step": 30495, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.958212 }, { "epoch": 0.9906766721891953, "grad_norm": 0.34212565422058105, "learning_rate": 2.376431463101403e-09, "loss": 0.01163622085005045, "memory(GiB)": 22.66, "step": 30496, "token_acc": 1.0, "train_speed(iter/s)": 0.958218 }, { "epoch": 0.9907091576519508, "grad_norm": 0.4175620377063751, "learning_rate": 2.3599011167008134e-09, "loss": 0.011650758795440197, "memory(GiB)": 22.66, "step": 30497, "token_acc": 0.9943820224719101, "train_speed(iter/s)": 0.958225 }, { "epoch": 0.9907416431147061, "grad_norm": 0.6982474327087402, "learning_rate": 2.343428449718688e-09, "loss": 0.007893888279795647, "memory(GiB)": 22.66, "step": 30498, "token_acc": 1.0, "train_speed(iter/s)": 0.958231 }, { "epoch": 0.9907741285774616, "grad_norm": 0.4138402044773102, "learning_rate": 2.3270134623454288e-09, "loss": 0.01953805983066559, "memory(GiB)": 22.66, "step": 30499, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.958238 }, { "epoch": 0.990806614040217, "grad_norm": 0.46863338351249695, "learning_rate": 2.31065615477033e-09, "loss": 0.012131446972489357, "memory(GiB)": 22.66, "step": 30500, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.958245 }, { "epoch": 0.990806614040217, "eval_loss": 0.01110515370965004, "eval_runtime": 80.1699, "eval_samples_per_second": 124.111, "eval_steps_per_second": 3.879, "eval_token_acc": 0.9955158335864579, "step": 30500 }, { "epoch": 0.9908390995029724, "grad_norm": 0.714604914188385, "learning_rate": 2.294356527182129e-09, "loss": 0.020915497094392776, "memory(GiB)": 22.66, "step": 30501, "token_acc": 0.9950819222971121, "train_speed(iter/s)": 0.955433 }, { "epoch": 0.9908715849657278, "grad_norm": 0.41952043771743774, "learning_rate": 2.2781145797690084e-09, "loss": 0.013699159026145935, "memory(GiB)": 22.66, "step": 30502, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955436 }, { "epoch": 0.9909040704284833, "grad_norm": 0.5156562328338623, "learning_rate": 2.2619303127180413e-09, "loss": 0.013839991763234138, "memory(GiB)": 22.66, "step": 30503, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.95544 }, { "epoch": 0.9909365558912386, "grad_norm": 0.33060839772224426, "learning_rate": 2.2458037262168553e-09, "loss": 0.011665789410471916, "memory(GiB)": 22.66, "step": 30504, "token_acc": 1.0, "train_speed(iter/s)": 0.955444 }, { "epoch": 0.9909690413539941, "grad_norm": 0.43400517106056213, "learning_rate": 2.229734820450857e-09, "loss": 0.011966823600232601, "memory(GiB)": 22.66, "step": 30505, "token_acc": 0.9963636363636363, "train_speed(iter/s)": 0.955449 }, { "epoch": 0.9910015268167495, "grad_norm": 0.449455589056015, "learning_rate": 2.21372359560601e-09, "loss": 0.012460557743906975, "memory(GiB)": 22.66, "step": 30506, "token_acc": 0.9895470383275261, "train_speed(iter/s)": 0.955454 }, { "epoch": 0.9910340122795049, "grad_norm": 0.34391748905181885, "learning_rate": 2.19777005186661e-09, "loss": 0.009172679856419563, "memory(GiB)": 22.66, "step": 30507, "token_acc": 1.0, "train_speed(iter/s)": 0.955459 }, { "epoch": 0.9910664977422603, "grad_norm": 0.43814173340797424, "learning_rate": 2.181874189416955e-09, "loss": 0.021020615473389626, "memory(GiB)": 22.66, "step": 30508, "token_acc": 0.9928825622775801, "train_speed(iter/s)": 0.955465 }, { "epoch": 0.9910989832050158, "grad_norm": 0.3435944616794586, "learning_rate": 2.166036008440231e-09, "loss": 0.013260302133858204, "memory(GiB)": 22.66, "step": 30509, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955471 }, { "epoch": 0.9911314686677711, "grad_norm": 0.2694540023803711, "learning_rate": 2.1502555091201805e-09, "loss": 0.01042386144399643, "memory(GiB)": 22.66, "step": 30510, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955478 }, { "epoch": 0.9911639541305266, "grad_norm": 0.3421345055103302, "learning_rate": 2.1345326916377695e-09, "loss": 0.009843908250331879, "memory(GiB)": 22.66, "step": 30511, "token_acc": 0.9962406015037594, "train_speed(iter/s)": 0.955484 }, { "epoch": 0.991196439593282, "grad_norm": 0.2863451838493347, "learning_rate": 2.11886755617563e-09, "loss": 0.010703472420573235, "memory(GiB)": 22.66, "step": 30512, "token_acc": 1.0, "train_speed(iter/s)": 0.955491 }, { "epoch": 0.9912289250560374, "grad_norm": 0.29420551657676697, "learning_rate": 2.103260102913618e-09, "loss": 0.009011533111333847, "memory(GiB)": 22.66, "step": 30513, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.955497 }, { "epoch": 0.9912614105187928, "grad_norm": 0.3321518301963806, "learning_rate": 2.0877103320315895e-09, "loss": 0.012748523615300655, "memory(GiB)": 22.66, "step": 30514, "token_acc": 0.9915966386554622, "train_speed(iter/s)": 0.955504 }, { "epoch": 0.9912938959815483, "grad_norm": 0.23521734774112701, "learning_rate": 2.072218243710511e-09, "loss": 0.0070342132821679115, "memory(GiB)": 22.66, "step": 30515, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.95551 }, { "epoch": 0.9913263814443036, "grad_norm": 0.5593221783638, "learning_rate": 2.056783838127463e-09, "loss": 0.022029832005500793, "memory(GiB)": 22.66, "step": 30516, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.955517 }, { "epoch": 0.9913588669070591, "grad_norm": 0.3458034098148346, "learning_rate": 2.0414071154617466e-09, "loss": 0.008325262926518917, "memory(GiB)": 22.66, "step": 30517, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955524 }, { "epoch": 0.9913913523698145, "grad_norm": 0.411040723323822, "learning_rate": 2.0260880758909974e-09, "loss": 0.010502812452614307, "memory(GiB)": 22.66, "step": 30518, "token_acc": 0.9965156794425087, "train_speed(iter/s)": 0.95553 }, { "epoch": 0.9914238378325699, "grad_norm": 0.28435829281806946, "learning_rate": 2.0108267195906304e-09, "loss": 0.00894103478640318, "memory(GiB)": 22.66, "step": 30519, "token_acc": 1.0, "train_speed(iter/s)": 0.955537 }, { "epoch": 0.9914563232953253, "grad_norm": 0.31778883934020996, "learning_rate": 1.995623046738282e-09, "loss": 0.006932188291102648, "memory(GiB)": 22.66, "step": 30520, "token_acc": 1.0, "train_speed(iter/s)": 0.955544 }, { "epoch": 0.9914888087580808, "grad_norm": 0.36511915922164917, "learning_rate": 1.9804770575082564e-09, "loss": 0.0083696898072958, "memory(GiB)": 22.66, "step": 30521, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955551 }, { "epoch": 0.9915212942208361, "grad_norm": 0.24185681343078613, "learning_rate": 1.9653887520759694e-09, "loss": 0.008174259215593338, "memory(GiB)": 22.66, "step": 30522, "token_acc": 1.0, "train_speed(iter/s)": 0.955558 }, { "epoch": 0.9915537796835916, "grad_norm": 0.3097277879714966, "learning_rate": 1.950358130615726e-09, "loss": 0.008739159442484379, "memory(GiB)": 22.66, "step": 30523, "token_acc": 1.0, "train_speed(iter/s)": 0.955564 }, { "epoch": 0.991586265146347, "grad_norm": 0.28589925169944763, "learning_rate": 1.9353851933007205e-09, "loss": 0.008861344307661057, "memory(GiB)": 22.66, "step": 30524, "token_acc": 1.0, "train_speed(iter/s)": 0.955563 }, { "epoch": 0.9916187506091024, "grad_norm": 0.34605535864830017, "learning_rate": 1.9204699403035933e-09, "loss": 0.01138528436422348, "memory(GiB)": 22.66, "step": 30525, "token_acc": 0.9917355371900827, "train_speed(iter/s)": 0.955569 }, { "epoch": 0.9916512360718578, "grad_norm": 0.39831599593162537, "learning_rate": 1.9056123717964283e-09, "loss": 0.013182077556848526, "memory(GiB)": 22.66, "step": 30526, "token_acc": 1.0, "train_speed(iter/s)": 0.955576 }, { "epoch": 0.9916837215346133, "grad_norm": 0.34753742814064026, "learning_rate": 1.89081248795131e-09, "loss": 0.012813583016395569, "memory(GiB)": 22.66, "step": 30527, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.955581 }, { "epoch": 0.9917162069973687, "grad_norm": 0.45633456110954285, "learning_rate": 1.8760702889386586e-09, "loss": 0.010113496333360672, "memory(GiB)": 22.66, "step": 30528, "token_acc": 1.0, "train_speed(iter/s)": 0.955588 }, { "epoch": 0.9917486924601241, "grad_norm": 0.43135306239128113, "learning_rate": 1.861385774928337e-09, "loss": 0.01584744080901146, "memory(GiB)": 22.66, "step": 30529, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.955595 }, { "epoch": 0.9917811779228796, "grad_norm": 0.3268637955188751, "learning_rate": 1.8467589460907653e-09, "loss": 0.008649954572319984, "memory(GiB)": 22.66, "step": 30530, "token_acc": 0.9959183673469387, "train_speed(iter/s)": 0.955602 }, { "epoch": 0.9918136633856349, "grad_norm": 0.40138497948646545, "learning_rate": 1.832189802593587e-09, "loss": 0.01342870108783245, "memory(GiB)": 22.66, "step": 30531, "token_acc": 0.9957446808510638, "train_speed(iter/s)": 0.955608 }, { "epoch": 0.9918461488483904, "grad_norm": 0.28348657488822937, "learning_rate": 1.8176783446055557e-09, "loss": 0.008416136726737022, "memory(GiB)": 22.66, "step": 30532, "token_acc": 1.0, "train_speed(iter/s)": 0.955613 }, { "epoch": 0.9918786343111458, "grad_norm": 0.2776528596878052, "learning_rate": 1.8032245722943153e-09, "loss": 0.009730091318488121, "memory(GiB)": 22.66, "step": 30533, "token_acc": 0.9791666666666666, "train_speed(iter/s)": 0.955619 }, { "epoch": 0.9919111197739012, "grad_norm": 0.23937025666236877, "learning_rate": 1.7888284858263994e-09, "loss": 0.008340897038578987, "memory(GiB)": 22.66, "step": 30534, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.955626 }, { "epoch": 0.9919436052366566, "grad_norm": 0.386306494474411, "learning_rate": 1.774490085367786e-09, "loss": 0.010924161411821842, "memory(GiB)": 22.66, "step": 30535, "token_acc": 0.9899328859060402, "train_speed(iter/s)": 0.955632 }, { "epoch": 0.9919760906994121, "grad_norm": 0.3757113814353943, "learning_rate": 1.7602093710844536e-09, "loss": 0.010213619098067284, "memory(GiB)": 22.66, "step": 30536, "token_acc": 1.0, "train_speed(iter/s)": 0.95562 }, { "epoch": 0.9920085761621674, "grad_norm": 0.5057774782180786, "learning_rate": 1.7459863431407154e-09, "loss": 0.009143495932221413, "memory(GiB)": 22.66, "step": 30537, "token_acc": 1.0, "train_speed(iter/s)": 0.955627 }, { "epoch": 0.9920410616249229, "grad_norm": 0.3251889944076538, "learning_rate": 1.7318210017008842e-09, "loss": 0.0070141954347491264, "memory(GiB)": 22.66, "step": 30538, "token_acc": 1.0, "train_speed(iter/s)": 0.955634 }, { "epoch": 0.9920735470876783, "grad_norm": 0.3218841552734375, "learning_rate": 1.7177133469287178e-09, "loss": 0.016416184604167938, "memory(GiB)": 22.66, "step": 30539, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955641 }, { "epoch": 0.9921060325504337, "grad_norm": 0.25403374433517456, "learning_rate": 1.7036633789868639e-09, "loss": 0.008097030222415924, "memory(GiB)": 22.66, "step": 30540, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955648 }, { "epoch": 0.9921385180131891, "grad_norm": 0.3371076285839081, "learning_rate": 1.6896710980374153e-09, "loss": 0.008560380898416042, "memory(GiB)": 22.66, "step": 30541, "token_acc": 0.9956140350877193, "train_speed(iter/s)": 0.955654 }, { "epoch": 0.9921710034759446, "grad_norm": 0.3196217715740204, "learning_rate": 1.6757365042419093e-09, "loss": 0.013943193480372429, "memory(GiB)": 22.66, "step": 30542, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.955661 }, { "epoch": 0.9922034889386999, "grad_norm": 0.2829236686229706, "learning_rate": 1.6618595977613283e-09, "loss": 0.00851704366505146, "memory(GiB)": 22.66, "step": 30543, "token_acc": 1.0, "train_speed(iter/s)": 0.955668 }, { "epoch": 0.9922359744014554, "grad_norm": 0.19198431074619293, "learning_rate": 1.6480403787549892e-09, "loss": 0.006313582882285118, "memory(GiB)": 22.66, "step": 30544, "token_acc": 1.0, "train_speed(iter/s)": 0.955675 }, { "epoch": 0.9922684598642108, "grad_norm": 0.3309101462364197, "learning_rate": 1.6342788473838743e-09, "loss": 0.011634496040642262, "memory(GiB)": 22.66, "step": 30545, "token_acc": 1.0, "train_speed(iter/s)": 0.955682 }, { "epoch": 0.9923009453269662, "grad_norm": 0.281063437461853, "learning_rate": 1.6205750038056355e-09, "loss": 0.0065617989748716354, "memory(GiB)": 22.66, "step": 30546, "token_acc": 1.0, "train_speed(iter/s)": 0.955687 }, { "epoch": 0.9923334307897216, "grad_norm": 0.43065309524536133, "learning_rate": 1.6069288481784794e-09, "loss": 0.015288014896214008, "memory(GiB)": 22.66, "step": 30547, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.955692 }, { "epoch": 0.992365916252477, "grad_norm": 0.33539891242980957, "learning_rate": 1.593340380660613e-09, "loss": 0.010615188628435135, "memory(GiB)": 22.66, "step": 30548, "token_acc": 0.9954954954954955, "train_speed(iter/s)": 0.955697 }, { "epoch": 0.9923984017152324, "grad_norm": 0.2130841314792633, "learning_rate": 1.5798096014085774e-09, "loss": 0.010980835184454918, "memory(GiB)": 22.66, "step": 30549, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.955702 }, { "epoch": 0.9924308871779879, "grad_norm": 0.47390472888946533, "learning_rate": 1.5663365105778038e-09, "loss": 0.011227298527956009, "memory(GiB)": 22.66, "step": 30550, "token_acc": 1.0, "train_speed(iter/s)": 0.955707 }, { "epoch": 0.9924633726407432, "grad_norm": 0.26894089579582214, "learning_rate": 1.5529211083248342e-09, "loss": 0.006822842173278332, "memory(GiB)": 22.66, "step": 30551, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.955711 }, { "epoch": 0.9924958581034987, "grad_norm": 0.3438473641872406, "learning_rate": 1.5395633948045442e-09, "loss": 0.010808032006025314, "memory(GiB)": 22.66, "step": 30552, "token_acc": 1.0, "train_speed(iter/s)": 0.955716 }, { "epoch": 0.9925283435662541, "grad_norm": 0.38090112805366516, "learning_rate": 1.5262633701701446e-09, "loss": 0.018009981140494347, "memory(GiB)": 22.66, "step": 30553, "token_acc": 0.996, "train_speed(iter/s)": 0.955721 }, { "epoch": 0.9925608290290096, "grad_norm": 0.3635082542896271, "learning_rate": 1.5130210345754015e-09, "loss": 0.01113869808614254, "memory(GiB)": 22.66, "step": 30554, "token_acc": 0.9945054945054945, "train_speed(iter/s)": 0.955725 }, { "epoch": 0.9925933144917649, "grad_norm": 0.32703933119773865, "learning_rate": 1.4998363881735256e-09, "loss": 0.011086337268352509, "memory(GiB)": 22.66, "step": 30555, "token_acc": 1.0, "train_speed(iter/s)": 0.955729 }, { "epoch": 0.9926257999545204, "grad_norm": 0.45200300216674805, "learning_rate": 1.4867094311166174e-09, "loss": 0.013785479590296745, "memory(GiB)": 22.66, "step": 30556, "token_acc": 0.9953271028037384, "train_speed(iter/s)": 0.955734 }, { "epoch": 0.9926582854172757, "grad_norm": 0.3633839190006256, "learning_rate": 1.4736401635562225e-09, "loss": 0.011437865905463696, "memory(GiB)": 22.66, "step": 30557, "token_acc": 0.99609375, "train_speed(iter/s)": 0.955739 }, { "epoch": 0.9926907708800312, "grad_norm": 0.3301876485347748, "learning_rate": 1.460628585642776e-09, "loss": 0.012666050344705582, "memory(GiB)": 22.66, "step": 30558, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.955744 }, { "epoch": 0.9927232563427866, "grad_norm": 0.2743205726146698, "learning_rate": 1.447674697526713e-09, "loss": 0.00780496234074235, "memory(GiB)": 22.66, "step": 30559, "token_acc": 1.0, "train_speed(iter/s)": 0.955749 }, { "epoch": 0.992755741805542, "grad_norm": 0.43259498476982117, "learning_rate": 1.4347784993573589e-09, "loss": 0.01183427032083273, "memory(GiB)": 22.66, "step": 30560, "token_acc": 1.0, "train_speed(iter/s)": 0.955753 }, { "epoch": 0.9927882272682974, "grad_norm": 0.35204017162323, "learning_rate": 1.4219399912840382e-09, "loss": 0.007436682935804129, "memory(GiB)": 22.66, "step": 30561, "token_acc": 1.0, "train_speed(iter/s)": 0.955758 }, { "epoch": 0.9928207127310529, "grad_norm": 0.4886321723461151, "learning_rate": 1.4091591734549659e-09, "loss": 0.01526094414293766, "memory(GiB)": 22.66, "step": 30562, "token_acc": 0.9879032258064516, "train_speed(iter/s)": 0.955762 }, { "epoch": 0.9928531981938082, "grad_norm": 0.3897804915904999, "learning_rate": 1.3964360460166915e-09, "loss": 0.010261226445436478, "memory(GiB)": 22.66, "step": 30563, "token_acc": 0.9900990099009901, "train_speed(iter/s)": 0.955767 }, { "epoch": 0.9928856836565637, "grad_norm": 0.27906519174575806, "learning_rate": 1.3837706091168746e-09, "loss": 0.008385589346289635, "memory(GiB)": 22.66, "step": 30564, "token_acc": 1.0, "train_speed(iter/s)": 0.955771 }, { "epoch": 0.9929181691193191, "grad_norm": 0.37716689705848694, "learning_rate": 1.3711628629015095e-09, "loss": 0.011294782161712646, "memory(GiB)": 22.66, "step": 30565, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.955775 }, { "epoch": 0.9929506545820745, "grad_norm": 0.6423786282539368, "learning_rate": 1.3586128075165906e-09, "loss": 0.015397809445858002, "memory(GiB)": 22.66, "step": 30566, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.95578 }, { "epoch": 0.9929831400448299, "grad_norm": 0.2355332225561142, "learning_rate": 1.3461204431058916e-09, "loss": 0.008614764548838139, "memory(GiB)": 22.66, "step": 30567, "token_acc": 0.9902912621359223, "train_speed(iter/s)": 0.955784 }, { "epoch": 0.9930156255075854, "grad_norm": 0.2870340943336487, "learning_rate": 1.3336857698137418e-09, "loss": 0.009952500462532043, "memory(GiB)": 22.66, "step": 30568, "token_acc": 0.9955555555555555, "train_speed(iter/s)": 0.95579 }, { "epoch": 0.9930481109703407, "grad_norm": 0.33418992161750793, "learning_rate": 1.321308787785025e-09, "loss": 0.013921349309384823, "memory(GiB)": 22.66, "step": 30569, "token_acc": 0.9921568627450981, "train_speed(iter/s)": 0.955795 }, { "epoch": 0.9930805964330962, "grad_norm": 0.23021386563777924, "learning_rate": 1.3089894971607397e-09, "loss": 0.00527089461684227, "memory(GiB)": 22.66, "step": 30570, "token_acc": 0.9961089494163424, "train_speed(iter/s)": 0.955801 }, { "epoch": 0.9931130818958516, "grad_norm": 0.31048092246055603, "learning_rate": 1.2967278980846599e-09, "loss": 0.018925528973340988, "memory(GiB)": 22.66, "step": 30571, "token_acc": 1.0, "train_speed(iter/s)": 0.955806 }, { "epoch": 0.993145567358607, "grad_norm": 0.24652059376239777, "learning_rate": 1.2845239906972285e-09, "loss": 0.007083413656800985, "memory(GiB)": 22.66, "step": 30572, "token_acc": 1.0, "train_speed(iter/s)": 0.955813 }, { "epoch": 0.9931780528213624, "grad_norm": 0.38368067145347595, "learning_rate": 1.272377775139999e-09, "loss": 0.006195173133164644, "memory(GiB)": 22.66, "step": 30573, "token_acc": 0.9920318725099602, "train_speed(iter/s)": 0.955819 }, { "epoch": 0.9932105382841179, "grad_norm": 0.29890456795692444, "learning_rate": 1.2602892515523047e-09, "loss": 0.01402580738067627, "memory(GiB)": 22.66, "step": 30574, "token_acc": 1.0, "train_speed(iter/s)": 0.955826 }, { "epoch": 0.9932430237468732, "grad_norm": 0.3065639138221741, "learning_rate": 1.2482584200745885e-09, "loss": 0.008437594398856163, "memory(GiB)": 22.66, "step": 30575, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.955832 }, { "epoch": 0.9932755092096287, "grad_norm": 0.3563913404941559, "learning_rate": 1.236285280845073e-09, "loss": 0.009717343375086784, "memory(GiB)": 22.66, "step": 30576, "token_acc": 0.9863013698630136, "train_speed(iter/s)": 0.955839 }, { "epoch": 0.9933079946723841, "grad_norm": 0.4009377658367157, "learning_rate": 1.2243698340025368e-09, "loss": 0.012988153845071793, "memory(GiB)": 22.66, "step": 30577, "token_acc": 0.9963235294117647, "train_speed(iter/s)": 0.955846 }, { "epoch": 0.9933404801351395, "grad_norm": 0.35459768772125244, "learning_rate": 1.2125120796835366e-09, "loss": 0.01311800442636013, "memory(GiB)": 22.66, "step": 30578, "token_acc": 0.9961389961389961, "train_speed(iter/s)": 0.955853 }, { "epoch": 0.9933729655978949, "grad_norm": 0.3563769459724426, "learning_rate": 1.2007120180262953e-09, "loss": 0.009949171915650368, "memory(GiB)": 22.66, "step": 30579, "token_acc": 1.0, "train_speed(iter/s)": 0.955859 }, { "epoch": 0.9934054510606504, "grad_norm": 0.3134116530418396, "learning_rate": 1.1889696491657054e-09, "loss": 0.010432318784296513, "memory(GiB)": 22.66, "step": 30580, "token_acc": 0.9844559585492227, "train_speed(iter/s)": 0.955866 }, { "epoch": 0.9934379365234057, "grad_norm": 0.389082670211792, "learning_rate": 1.1772849732377689e-09, "loss": 0.013873044401407242, "memory(GiB)": 22.66, "step": 30581, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.955873 }, { "epoch": 0.9934704219861612, "grad_norm": 0.35085442662239075, "learning_rate": 1.165657990377378e-09, "loss": 0.009534039534628391, "memory(GiB)": 22.66, "step": 30582, "token_acc": 0.9906976744186047, "train_speed(iter/s)": 0.95588 }, { "epoch": 0.9935029074489166, "grad_norm": 0.6148058176040649, "learning_rate": 1.1540887007188695e-09, "loss": 0.017259199172258377, "memory(GiB)": 22.66, "step": 30583, "token_acc": 0.9888059701492538, "train_speed(iter/s)": 0.955886 }, { "epoch": 0.993535392911672, "grad_norm": 0.450623095035553, "learning_rate": 1.1425771043954704e-09, "loss": 0.01888236403465271, "memory(GiB)": 22.66, "step": 30584, "token_acc": 0.9956331877729258, "train_speed(iter/s)": 0.955893 }, { "epoch": 0.9935678783744274, "grad_norm": 0.28139013051986694, "learning_rate": 1.1311232015404073e-09, "loss": 0.010933265089988708, "memory(GiB)": 22.66, "step": 30585, "token_acc": 0.9959349593495935, "train_speed(iter/s)": 0.9559 }, { "epoch": 0.9936003638371829, "grad_norm": 0.2911897897720337, "learning_rate": 1.1197269922852416e-09, "loss": 0.009053697809576988, "memory(GiB)": 22.66, "step": 30586, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.955906 }, { "epoch": 0.9936328492999382, "grad_norm": 0.2673814296722412, "learning_rate": 1.1083884767626452e-09, "loss": 0.006949096918106079, "memory(GiB)": 22.66, "step": 30587, "token_acc": 1.0, "train_speed(iter/s)": 0.955913 }, { "epoch": 0.9936653347626937, "grad_norm": 0.45615389943122864, "learning_rate": 1.0971076551025139e-09, "loss": 0.01551777683198452, "memory(GiB)": 22.66, "step": 30588, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.95592 }, { "epoch": 0.9936978202254491, "grad_norm": 0.3712289333343506, "learning_rate": 1.085884527434744e-09, "loss": 0.006879338063299656, "memory(GiB)": 22.66, "step": 30589, "token_acc": 1.0, "train_speed(iter/s)": 0.955927 }, { "epoch": 0.9937303056882045, "grad_norm": 0.3089694380760193, "learning_rate": 1.0747190938897867e-09, "loss": 0.00935774389654398, "memory(GiB)": 22.66, "step": 30590, "token_acc": 1.0, "train_speed(iter/s)": 0.955933 }, { "epoch": 0.9937627911509599, "grad_norm": 0.37464457750320435, "learning_rate": 1.0636113545964276e-09, "loss": 0.014420812018215656, "memory(GiB)": 22.66, "step": 30591, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.95594 }, { "epoch": 0.9937952766137154, "grad_norm": 0.4235692620277405, "learning_rate": 1.0525613096823428e-09, "loss": 0.009931588545441628, "memory(GiB)": 22.66, "step": 30592, "token_acc": 1.0, "train_speed(iter/s)": 0.955946 }, { "epoch": 0.9938277620764708, "grad_norm": 0.3848651349544525, "learning_rate": 1.0415689592746524e-09, "loss": 0.013544866815209389, "memory(GiB)": 22.66, "step": 30593, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.955953 }, { "epoch": 0.9938602475392262, "grad_norm": 0.40089255571365356, "learning_rate": 1.0306343035015876e-09, "loss": 0.0101042240858078, "memory(GiB)": 22.66, "step": 30594, "token_acc": 1.0, "train_speed(iter/s)": 0.95596 }, { "epoch": 0.9938927330019817, "grad_norm": 0.33384791016578674, "learning_rate": 1.0197573424886032e-09, "loss": 0.012020101770758629, "memory(GiB)": 22.66, "step": 30595, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955967 }, { "epoch": 0.993925218464737, "grad_norm": 0.31064271926879883, "learning_rate": 1.0089380763611545e-09, "loss": 0.011489501222968102, "memory(GiB)": 22.66, "step": 30596, "token_acc": 0.9955156950672646, "train_speed(iter/s)": 0.955973 }, { "epoch": 0.9939577039274925, "grad_norm": 0.2576155364513397, "learning_rate": 9.981765052441417e-10, "loss": 0.01255855057388544, "memory(GiB)": 22.66, "step": 30597, "token_acc": 0.9920948616600791, "train_speed(iter/s)": 0.95598 }, { "epoch": 0.9939901893902479, "grad_norm": 0.2147456854581833, "learning_rate": 9.874726292619097e-10, "loss": 0.005452902056276798, "memory(GiB)": 22.66, "step": 30598, "token_acc": 0.9921875, "train_speed(iter/s)": 0.955986 }, { "epoch": 0.9940226748530033, "grad_norm": 0.4504801034927368, "learning_rate": 9.768264485376933e-10, "loss": 0.01623084396123886, "memory(GiB)": 22.66, "step": 30599, "token_acc": 1.0, "train_speed(iter/s)": 0.955993 }, { "epoch": 0.9940551603157587, "grad_norm": 0.4590611457824707, "learning_rate": 9.662379631947272e-10, "loss": 0.014457684941589832, "memory(GiB)": 22.66, "step": 30600, "token_acc": 0.9961240310077519, "train_speed(iter/s)": 0.955999 }, { "epoch": 0.9940876457785142, "grad_norm": 0.3455497622489929, "learning_rate": 9.55707173355136e-10, "loss": 0.009411177597939968, "memory(GiB)": 22.66, "step": 30601, "token_acc": 0.9914529914529915, "train_speed(iter/s)": 0.956006 }, { "epoch": 0.9941201312412695, "grad_norm": 0.33631178736686707, "learning_rate": 9.45234079139934e-10, "loss": 0.007178572937846184, "memory(GiB)": 22.66, "step": 30602, "token_acc": 0.9877049180327869, "train_speed(iter/s)": 0.956012 }, { "epoch": 0.994152616704025, "grad_norm": 0.2736864387989044, "learning_rate": 9.348186806712457e-10, "loss": 0.010287613607943058, "memory(GiB)": 22.66, "step": 30603, "token_acc": 0.9951923076923077, "train_speed(iter/s)": 0.956018 }, { "epoch": 0.9941851021667804, "grad_norm": 0.3640085756778717, "learning_rate": 9.244609780678649e-10, "loss": 0.013272685930132866, "memory(GiB)": 22.66, "step": 30604, "token_acc": 1.0, "train_speed(iter/s)": 0.956025 }, { "epoch": 0.9942175876295358, "grad_norm": 0.36774319410324097, "learning_rate": 9.141609714502508e-10, "loss": 0.011499624699354172, "memory(GiB)": 22.66, "step": 30605, "token_acc": 1.0, "train_speed(iter/s)": 0.956032 }, { "epoch": 0.9942500730922912, "grad_norm": 0.21667765080928802, "learning_rate": 9.039186609366424e-10, "loss": 0.00570644112303853, "memory(GiB)": 22.66, "step": 30606, "token_acc": 1.0, "train_speed(iter/s)": 0.956038 }, { "epoch": 0.9942825585550467, "grad_norm": 0.322270005941391, "learning_rate": 8.937340466458333e-10, "loss": 0.009597270749509335, "memory(GiB)": 22.66, "step": 30607, "token_acc": 1.0, "train_speed(iter/s)": 0.956042 }, { "epoch": 0.994315044017802, "grad_norm": 0.31556734442710876, "learning_rate": 8.836071286955073e-10, "loss": 0.00847749225795269, "memory(GiB)": 22.66, "step": 30608, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956048 }, { "epoch": 0.9943475294805575, "grad_norm": 0.35094040632247925, "learning_rate": 8.735379072016825e-10, "loss": 0.008858196437358856, "memory(GiB)": 22.66, "step": 30609, "token_acc": 1.0, "train_speed(iter/s)": 0.956053 }, { "epoch": 0.9943800149433129, "grad_norm": 0.38404253125190735, "learning_rate": 8.635263822809326e-10, "loss": 0.010256506502628326, "memory(GiB)": 22.66, "step": 30610, "token_acc": 0.993006993006993, "train_speed(iter/s)": 0.956058 }, { "epoch": 0.9944125004060683, "grad_norm": 0.37249258160591125, "learning_rate": 8.535725540492756e-10, "loss": 0.012862654402852058, "memory(GiB)": 22.66, "step": 30611, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956063 }, { "epoch": 0.9944449858688237, "grad_norm": 0.39113494753837585, "learning_rate": 8.436764226210648e-10, "loss": 0.013950095511972904, "memory(GiB)": 22.66, "step": 30612, "token_acc": 1.0, "train_speed(iter/s)": 0.956069 }, { "epoch": 0.9944774713315792, "grad_norm": 0.37015479803085327, "learning_rate": 8.33837988111208e-10, "loss": 0.012186634354293346, "memory(GiB)": 22.66, "step": 30613, "token_acc": 0.9965034965034965, "train_speed(iter/s)": 0.956073 }, { "epoch": 0.9945099567943345, "grad_norm": 0.34953638911247253, "learning_rate": 8.240572506318379e-10, "loss": 0.011060552671551704, "memory(GiB)": 22.66, "step": 30614, "token_acc": 0.9922779922779923, "train_speed(iter/s)": 0.956078 }, { "epoch": 0.99454244225709, "grad_norm": 0.3296789824962616, "learning_rate": 8.143342102973073e-10, "loss": 0.01124042272567749, "memory(GiB)": 22.66, "step": 30615, "token_acc": 0.9901477832512315, "train_speed(iter/s)": 0.956083 }, { "epoch": 0.9945749277198453, "grad_norm": 0.3303717374801636, "learning_rate": 8.046688672191938e-10, "loss": 0.008673559874296188, "memory(GiB)": 22.66, "step": 30616, "token_acc": 0.9925093632958801, "train_speed(iter/s)": 0.956088 }, { "epoch": 0.9946074131826008, "grad_norm": 0.3756653964519501, "learning_rate": 7.950612215090747e-10, "loss": 0.01244976744055748, "memory(GiB)": 22.66, "step": 30617, "token_acc": 1.0, "train_speed(iter/s)": 0.956093 }, { "epoch": 0.9946398986453562, "grad_norm": 0.28468114137649536, "learning_rate": 7.855112732779724e-10, "loss": 0.010629270225763321, "memory(GiB)": 22.66, "step": 30618, "token_acc": 1.0, "train_speed(iter/s)": 0.956098 }, { "epoch": 0.9946723841081117, "grad_norm": 0.345476359128952, "learning_rate": 7.760190226357989e-10, "loss": 0.012181250378489494, "memory(GiB)": 22.66, "step": 30619, "token_acc": 0.9953917050691244, "train_speed(iter/s)": 0.956102 }, { "epoch": 0.994704869570867, "grad_norm": 2.763309955596924, "learning_rate": 7.665844696924663e-10, "loss": 0.018058471381664276, "memory(GiB)": 22.66, "step": 30620, "token_acc": 1.0, "train_speed(iter/s)": 0.956107 }, { "epoch": 0.9947373550336225, "grad_norm": 0.44728177785873413, "learning_rate": 7.572076145567764e-10, "loss": 0.015377644449472427, "memory(GiB)": 22.66, "step": 30621, "token_acc": 0.9906103286384976, "train_speed(iter/s)": 0.956112 }, { "epoch": 0.9947698404963778, "grad_norm": 0.31163325905799866, "learning_rate": 7.47888457336976e-10, "loss": 0.010644098743796349, "memory(GiB)": 22.66, "step": 30622, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956117 }, { "epoch": 0.9948023259591333, "grad_norm": 0.3136308193206787, "learning_rate": 7.386269981402017e-10, "loss": 0.007365237455815077, "memory(GiB)": 22.66, "step": 30623, "token_acc": 1.0, "train_speed(iter/s)": 0.956121 }, { "epoch": 0.9948348114218887, "grad_norm": 0.2671522796154022, "learning_rate": 7.294232370735899e-10, "loss": 0.009038882330060005, "memory(GiB)": 22.66, "step": 30624, "token_acc": 0.99609375, "train_speed(iter/s)": 0.956126 }, { "epoch": 0.9948672968846441, "grad_norm": 0.3260963559150696, "learning_rate": 7.20277174243722e-10, "loss": 0.010027427226305008, "memory(GiB)": 22.66, "step": 30625, "token_acc": 1.0, "train_speed(iter/s)": 0.956131 }, { "epoch": 0.9948997823473995, "grad_norm": 0.377605140209198, "learning_rate": 7.111888097560693e-10, "loss": 0.012707054615020752, "memory(GiB)": 22.66, "step": 30626, "token_acc": 1.0, "train_speed(iter/s)": 0.956137 }, { "epoch": 0.994932267810155, "grad_norm": 0.2429821938276291, "learning_rate": 7.021581437149927e-10, "loss": 0.011268730275332928, "memory(GiB)": 22.66, "step": 30627, "token_acc": 0.995850622406639, "train_speed(iter/s)": 0.956143 }, { "epoch": 0.9949647532729103, "grad_norm": 0.49677324295043945, "learning_rate": 6.931851762254083e-10, "loss": 0.01114906370639801, "memory(GiB)": 22.66, "step": 30628, "token_acc": 0.9953051643192489, "train_speed(iter/s)": 0.956149 }, { "epoch": 0.9949972387356658, "grad_norm": 0.4144580066204071, "learning_rate": 6.842699073900117e-10, "loss": 0.013020774349570274, "memory(GiB)": 22.66, "step": 30629, "token_acc": 1.0, "train_speed(iter/s)": 0.956154 }, { "epoch": 0.9950297241984212, "grad_norm": 0.21523045003414154, "learning_rate": 6.754123373126087e-10, "loss": 0.008069684728980064, "memory(GiB)": 22.66, "step": 30630, "token_acc": 1.0, "train_speed(iter/s)": 0.95616 }, { "epoch": 0.9950622096611766, "grad_norm": 0.22726427018642426, "learning_rate": 6.666124660947848e-10, "loss": 0.007626904174685478, "memory(GiB)": 22.66, "step": 30631, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.956165 }, { "epoch": 0.995094695123932, "grad_norm": 0.24838750064373016, "learning_rate": 6.578702938386805e-10, "loss": 0.009742752648890018, "memory(GiB)": 22.66, "step": 30632, "token_acc": 1.0, "train_speed(iter/s)": 0.95617 }, { "epoch": 0.9951271805866875, "grad_norm": 0.4272421598434448, "learning_rate": 6.49185820644771e-10, "loss": 0.014632843434810638, "memory(GiB)": 22.66, "step": 30633, "token_acc": 0.99581589958159, "train_speed(iter/s)": 0.956175 }, { "epoch": 0.9951596660494428, "grad_norm": 0.38252002000808716, "learning_rate": 6.405590466129763e-10, "loss": 0.01280146837234497, "memory(GiB)": 22.66, "step": 30634, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956181 }, { "epoch": 0.9951921515121983, "grad_norm": 0.584208071231842, "learning_rate": 6.319899718437716e-10, "loss": 0.017050717025995255, "memory(GiB)": 22.66, "step": 30635, "token_acc": 0.9861751152073732, "train_speed(iter/s)": 0.956187 }, { "epoch": 0.9952246369749537, "grad_norm": 0.5791705250740051, "learning_rate": 6.234785964354117e-10, "loss": 0.013608437031507492, "memory(GiB)": 22.66, "step": 30636, "token_acc": 1.0, "train_speed(iter/s)": 0.956193 }, { "epoch": 0.9952571224377091, "grad_norm": 0.2832498252391815, "learning_rate": 6.150249204861513e-10, "loss": 0.006477737799286842, "memory(GiB)": 22.66, "step": 30637, "token_acc": 0.995260663507109, "train_speed(iter/s)": 0.956199 }, { "epoch": 0.9952896079004645, "grad_norm": 0.3726467788219452, "learning_rate": 6.066289440936901e-10, "loss": 0.017281033098697662, "memory(GiB)": 22.66, "step": 30638, "token_acc": 0.9779005524861878, "train_speed(iter/s)": 0.956205 }, { "epoch": 0.99532209336322, "grad_norm": 0.3068341910839081, "learning_rate": 5.982906673546174e-10, "loss": 0.005158234387636185, "memory(GiB)": 22.66, "step": 30639, "token_acc": 1.0, "train_speed(iter/s)": 0.956212 }, { "epoch": 0.9953545788259753, "grad_norm": 0.2873780429363251, "learning_rate": 5.900100903660777e-10, "loss": 0.008847247809171677, "memory(GiB)": 22.66, "step": 30640, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956219 }, { "epoch": 0.9953870642887308, "grad_norm": 0.30639588832855225, "learning_rate": 5.817872132224401e-10, "loss": 0.00990556925535202, "memory(GiB)": 22.66, "step": 30641, "token_acc": 0.9916317991631799, "train_speed(iter/s)": 0.956226 }, { "epoch": 0.9954195497514862, "grad_norm": 0.3297611176967621, "learning_rate": 5.736220360197386e-10, "loss": 0.0083464365452528, "memory(GiB)": 22.66, "step": 30642, "token_acc": 1.0, "train_speed(iter/s)": 0.956234 }, { "epoch": 0.9954520352142416, "grad_norm": 0.22199797630310059, "learning_rate": 5.655145588512323e-10, "loss": 0.007145323790609837, "memory(GiB)": 22.66, "step": 30643, "token_acc": 0.995, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.995484520676997, "grad_norm": 0.2688053548336029, "learning_rate": 5.5746478181129e-10, "loss": 0.008367660455405712, "memory(GiB)": 22.66, "step": 30644, "token_acc": 1.0, "train_speed(iter/s)": 0.956248 }, { "epoch": 0.9955170061397525, "grad_norm": 0.29763224720954895, "learning_rate": 5.4947270499206e-10, "loss": 0.006696470081806183, "memory(GiB)": 22.66, "step": 30645, "token_acc": 1.0, "train_speed(iter/s)": 0.956255 }, { "epoch": 0.9955494916025078, "grad_norm": 0.31097277998924255, "learning_rate": 5.415383284862463e-10, "loss": 0.014172421768307686, "memory(GiB)": 22.66, "step": 30646, "token_acc": 0.9871794871794872, "train_speed(iter/s)": 0.956262 }, { "epoch": 0.9955819770652633, "grad_norm": 0.22625450789928436, "learning_rate": 5.336616523854421e-10, "loss": 0.009759007953107357, "memory(GiB)": 22.66, "step": 30647, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956269 }, { "epoch": 0.9956144625280187, "grad_norm": 0.33779555559158325, "learning_rate": 5.258426767806857e-10, "loss": 0.009219909086823463, "memory(GiB)": 22.66, "step": 30648, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956276 }, { "epoch": 0.9956469479907741, "grad_norm": 0.5949245691299438, "learning_rate": 5.180814017613501e-10, "loss": 0.012926407158374786, "memory(GiB)": 22.66, "step": 30649, "token_acc": 1.0, "train_speed(iter/s)": 0.956283 }, { "epoch": 0.9956794334535295, "grad_norm": 0.3234291672706604, "learning_rate": 5.103778274179183e-10, "loss": 0.0082589415833354, "memory(GiB)": 22.66, "step": 30650, "token_acc": 0.99609375, "train_speed(iter/s)": 0.95629 }, { "epoch": 0.995711918916285, "grad_norm": 0.3212921619415283, "learning_rate": 5.027319538392084e-10, "loss": 0.008090518414974213, "memory(GiB)": 22.66, "step": 30651, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956297 }, { "epoch": 0.9957444043790403, "grad_norm": 0.3742302656173706, "learning_rate": 4.951437811134829e-10, "loss": 0.012804869562387466, "memory(GiB)": 22.66, "step": 30652, "token_acc": 0.9907834101382489, "train_speed(iter/s)": 0.956304 }, { "epoch": 0.9957768898417958, "grad_norm": 0.38056060671806335, "learning_rate": 4.876133093278946e-10, "loss": 0.008846789598464966, "memory(GiB)": 22.66, "step": 30653, "token_acc": 1.0, "train_speed(iter/s)": 0.956311 }, { "epoch": 0.9958093753045512, "grad_norm": 0.25690212845802307, "learning_rate": 4.801405385695956e-10, "loss": 0.011324102990329266, "memory(GiB)": 22.66, "step": 30654, "token_acc": 1.0, "train_speed(iter/s)": 0.956318 }, { "epoch": 0.9958418607673066, "grad_norm": 0.43212243914604187, "learning_rate": 4.727254689246286e-10, "loss": 0.012965230271220207, "memory(GiB)": 22.66, "step": 30655, "token_acc": 0.9922480620155039, "train_speed(iter/s)": 0.956326 }, { "epoch": 0.9958743462300621, "grad_norm": 0.32559359073638916, "learning_rate": 4.653681004790356e-10, "loss": 0.011078432202339172, "memory(GiB)": 22.66, "step": 30656, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956332 }, { "epoch": 0.9959068316928175, "grad_norm": 0.2880564332008362, "learning_rate": 4.5806843331719365e-10, "loss": 0.008539527654647827, "memory(GiB)": 22.66, "step": 30657, "token_acc": 0.9883720930232558, "train_speed(iter/s)": 0.956339 }, { "epoch": 0.9959393171555729, "grad_norm": 0.35766685009002686, "learning_rate": 4.508264675240348e-10, "loss": 0.014102338813245296, "memory(GiB)": 22.66, "step": 30658, "token_acc": 1.0, "train_speed(iter/s)": 0.956346 }, { "epoch": 0.9959718026183283, "grad_norm": 0.5199665427207947, "learning_rate": 4.436422031822707e-10, "loss": 0.013298963196575642, "memory(GiB)": 22.66, "step": 30659, "token_acc": 0.9951219512195122, "train_speed(iter/s)": 0.956353 }, { "epoch": 0.9960042880810838, "grad_norm": 0.4374484717845917, "learning_rate": 4.3651564037572314e-10, "loss": 0.009995114989578724, "memory(GiB)": 22.66, "step": 30660, "token_acc": 1.0, "train_speed(iter/s)": 0.95636 }, { "epoch": 0.9960367735438391, "grad_norm": 0.35800105333328247, "learning_rate": 4.294467791859935e-10, "loss": 0.009114649146795273, "memory(GiB)": 22.66, "step": 30661, "token_acc": 1.0, "train_speed(iter/s)": 0.956367 }, { "epoch": 0.9960692590065946, "grad_norm": 0.39345237612724304, "learning_rate": 4.2243561969468326e-10, "loss": 0.011191098019480705, "memory(GiB)": 22.66, "step": 30662, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956375 }, { "epoch": 0.99610174446935, "grad_norm": 0.32793092727661133, "learning_rate": 4.1548216198283863e-10, "loss": 0.008736778050661087, "memory(GiB)": 22.66, "step": 30663, "token_acc": 1.0, "train_speed(iter/s)": 0.956382 }, { "epoch": 0.9961342299321054, "grad_norm": 0.4140249788761139, "learning_rate": 4.085864061309508e-10, "loss": 0.014733913354575634, "memory(GiB)": 22.66, "step": 30664, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.956389 }, { "epoch": 0.9961667153948608, "grad_norm": 0.31059613823890686, "learning_rate": 4.017483522184007e-10, "loss": 0.011522852815687656, "memory(GiB)": 22.66, "step": 30665, "token_acc": 1.0, "train_speed(iter/s)": 0.956396 }, { "epoch": 0.9961992008576163, "grad_norm": 0.26398229598999023, "learning_rate": 3.9496800032401416e-10, "loss": 0.010209780186414719, "memory(GiB)": 22.66, "step": 30666, "token_acc": 1.0, "train_speed(iter/s)": 0.956403 }, { "epoch": 0.9962316863203716, "grad_norm": 0.40721395611763, "learning_rate": 3.8824535052661704e-10, "loss": 0.015959393233060837, "memory(GiB)": 22.66, "step": 30667, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956409 }, { "epoch": 0.9962641717831271, "grad_norm": 0.2501468062400818, "learning_rate": 3.8158040290281475e-10, "loss": 0.0069342125207185745, "memory(GiB)": 22.66, "step": 30668, "token_acc": 0.9950248756218906, "train_speed(iter/s)": 0.956413 }, { "epoch": 0.9962966572458825, "grad_norm": 0.2878941595554352, "learning_rate": 3.7497315753032284e-10, "loss": 0.007473780773580074, "memory(GiB)": 22.66, "step": 30669, "token_acc": 1.0, "train_speed(iter/s)": 0.956419 }, { "epoch": 0.9963291427086379, "grad_norm": 0.5252863168716431, "learning_rate": 3.684236144851916e-10, "loss": 0.019251137971878052, "memory(GiB)": 22.66, "step": 30670, "token_acc": 0.9954128440366973, "train_speed(iter/s)": 0.956424 }, { "epoch": 0.9963616281713933, "grad_norm": 0.35949259996414185, "learning_rate": 3.619317738423611e-10, "loss": 0.009658136405050755, "memory(GiB)": 22.66, "step": 30671, "token_acc": 1.0, "train_speed(iter/s)": 0.95643 }, { "epoch": 0.9963941136341488, "grad_norm": 1.7774142026901245, "learning_rate": 3.5549763567788167e-10, "loss": 0.00962049514055252, "memory(GiB)": 22.66, "step": 30672, "token_acc": 1.0, "train_speed(iter/s)": 0.956436 }, { "epoch": 0.9964265990969041, "grad_norm": 0.47563955187797546, "learning_rate": 3.49121200065583e-10, "loss": 0.017448242753744125, "memory(GiB)": 22.66, "step": 30673, "token_acc": 0.9957983193277311, "train_speed(iter/s)": 0.956441 }, { "epoch": 0.9964590845596596, "grad_norm": 0.2894445061683655, "learning_rate": 3.4280246707874e-10, "loss": 0.00745840510353446, "memory(GiB)": 22.66, "step": 30674, "token_acc": 1.0, "train_speed(iter/s)": 0.956446 }, { "epoch": 0.996491570022415, "grad_norm": 0.3844943046569824, "learning_rate": 3.3654143679007214e-10, "loss": 0.011876202188432217, "memory(GiB)": 22.66, "step": 30675, "token_acc": 1.0, "train_speed(iter/s)": 0.956451 }, { "epoch": 0.9965240554851704, "grad_norm": 0.3425709903240204, "learning_rate": 3.3033810927285414e-10, "loss": 0.006978640798479319, "memory(GiB)": 22.66, "step": 30676, "token_acc": 1.0, "train_speed(iter/s)": 0.956457 }, { "epoch": 0.9965565409479258, "grad_norm": 0.39006906747817993, "learning_rate": 3.2419248459814036e-10, "loss": 0.015023596584796906, "memory(GiB)": 22.66, "step": 30677, "token_acc": 1.0, "train_speed(iter/s)": 0.956462 }, { "epoch": 0.9965890264106813, "grad_norm": 0.31217291951179504, "learning_rate": 3.181045628364299e-10, "loss": 0.009160559624433517, "memory(GiB)": 22.66, "step": 30678, "token_acc": 0.9964539007092199, "train_speed(iter/s)": 0.956467 }, { "epoch": 0.9966215118734366, "grad_norm": 0.28806859254837036, "learning_rate": 3.120743440582219e-10, "loss": 0.011396365240216255, "memory(GiB)": 22.66, "step": 30679, "token_acc": 0.9957081545064378, "train_speed(iter/s)": 0.956471 }, { "epoch": 0.9966539973361921, "grad_norm": 0.2875577509403229, "learning_rate": 3.061018283334605e-10, "loss": 0.012353090569376945, "memory(GiB)": 22.66, "step": 30680, "token_acc": 0.9875518672199171, "train_speed(iter/s)": 0.956476 }, { "epoch": 0.9966864827989474, "grad_norm": 0.28232690691947937, "learning_rate": 3.001870157309794e-10, "loss": 0.006572936661541462, "memory(GiB)": 22.66, "step": 30681, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.956481 }, { "epoch": 0.9967189682617029, "grad_norm": 0.4481545090675354, "learning_rate": 2.9432990631905746e-10, "loss": 0.013388470746576786, "memory(GiB)": 22.66, "step": 30682, "token_acc": 0.9950980392156863, "train_speed(iter/s)": 0.956486 }, { "epoch": 0.9967514537244583, "grad_norm": 0.2324949949979782, "learning_rate": 2.885305001648631e-10, "loss": 0.009369375184178352, "memory(GiB)": 22.66, "step": 30683, "token_acc": 0.9963503649635036, "train_speed(iter/s)": 0.956491 }, { "epoch": 0.9967839391872138, "grad_norm": 0.4700776934623718, "learning_rate": 2.8278879733556475e-10, "loss": 0.014979311265051365, "memory(GiB)": 22.66, "step": 30684, "token_acc": 0.9915254237288136, "train_speed(iter/s)": 0.956496 }, { "epoch": 0.9968164246499691, "grad_norm": 0.32695674896240234, "learning_rate": 2.771047978977759e-10, "loss": 0.009285712614655495, "memory(GiB)": 22.66, "step": 30685, "token_acc": 0.990909090909091, "train_speed(iter/s)": 0.956501 }, { "epoch": 0.9968489101127246, "grad_norm": 0.2632991075515747, "learning_rate": 2.714785019164445e-10, "loss": 0.008836779743432999, "memory(GiB)": 22.66, "step": 30686, "token_acc": 0.9962825278810409, "train_speed(iter/s)": 0.956506 }, { "epoch": 0.99688139557548, "grad_norm": 0.4249269664287567, "learning_rate": 2.6590990945707383e-10, "loss": 0.013575900346040726, "memory(GiB)": 22.66, "step": 30687, "token_acc": 0.9908675799086758, "train_speed(iter/s)": 0.956512 }, { "epoch": 0.9969138810382354, "grad_norm": 0.27275747060775757, "learning_rate": 2.6039902058350166e-10, "loss": 0.009923149831593037, "memory(GiB)": 22.66, "step": 30688, "token_acc": 0.9956896551724138, "train_speed(iter/s)": 0.956517 }, { "epoch": 0.9969463665009908, "grad_norm": 0.3092481195926666, "learning_rate": 2.549458353601209e-10, "loss": 0.00835362821817398, "memory(GiB)": 22.66, "step": 30689, "token_acc": 1.0, "train_speed(iter/s)": 0.956523 }, { "epoch": 0.9969788519637462, "grad_norm": 0.2385471910238266, "learning_rate": 2.4955035384910397e-10, "loss": 0.008788017556071281, "memory(GiB)": 22.66, "step": 30690, "token_acc": 0.9960629921259843, "train_speed(iter/s)": 0.956529 }, { "epoch": 0.9970113374265016, "grad_norm": 0.4663311839103699, "learning_rate": 2.442125761126235e-10, "loss": 0.01816406659781933, "memory(GiB)": 22.66, "step": 30691, "token_acc": 0.986046511627907, "train_speed(iter/s)": 0.956535 }, { "epoch": 0.9970438228892571, "grad_norm": 0.4195515215396881, "learning_rate": 2.389325022128519e-10, "loss": 0.009516390040516853, "memory(GiB)": 22.66, "step": 30692, "token_acc": 1.0, "train_speed(iter/s)": 0.956541 }, { "epoch": 0.9970763083520124, "grad_norm": 0.3132445216178894, "learning_rate": 2.3371013221029636e-10, "loss": 0.011206691153347492, "memory(GiB)": 22.66, "step": 30693, "token_acc": 1.0, "train_speed(iter/s)": 0.956547 }, { "epoch": 0.9971087938147679, "grad_norm": 0.18238066136837006, "learning_rate": 2.2854546616546402e-10, "loss": 0.009167449548840523, "memory(GiB)": 22.66, "step": 30694, "token_acc": 1.0, "train_speed(iter/s)": 0.956552 }, { "epoch": 0.9971412792775233, "grad_norm": 0.40684881806373596, "learning_rate": 2.2343850413830692e-10, "loss": 0.011107760481536388, "memory(GiB)": 22.66, "step": 30695, "token_acc": 0.9891891891891892, "train_speed(iter/s)": 0.956557 }, { "epoch": 0.9971737647402787, "grad_norm": 0.4055425226688385, "learning_rate": 2.1838924618655665e-10, "loss": 0.009993486106395721, "memory(GiB)": 22.66, "step": 30696, "token_acc": 0.98989898989899, "train_speed(iter/s)": 0.956563 }, { "epoch": 0.9972062502030341, "grad_norm": 0.3626596927642822, "learning_rate": 2.1339769236961017e-10, "loss": 0.012653980404138565, "memory(GiB)": 22.66, "step": 30697, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.956569 }, { "epoch": 0.9972387356657896, "grad_norm": 0.3464375138282776, "learning_rate": 2.0846384274519903e-10, "loss": 0.013287758454680443, "memory(GiB)": 22.66, "step": 30698, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956575 }, { "epoch": 0.9972712211285449, "grad_norm": 0.26456719636917114, "learning_rate": 2.0358769736938955e-10, "loss": 0.007739505264908075, "memory(GiB)": 22.66, "step": 30699, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.956581 }, { "epoch": 0.9973037065913004, "grad_norm": 0.3996630012989044, "learning_rate": 1.9876925629880306e-10, "loss": 0.013044748455286026, "memory(GiB)": 22.66, "step": 30700, "token_acc": 0.995, "train_speed(iter/s)": 0.956588 }, { "epoch": 0.9973361920540558, "grad_norm": 0.32203301787376404, "learning_rate": 1.9400851958950584e-10, "loss": 0.00816553458571434, "memory(GiB)": 22.66, "step": 30701, "token_acc": 1.0, "train_speed(iter/s)": 0.956595 }, { "epoch": 0.9973686775168112, "grad_norm": 3.356898546218872, "learning_rate": 1.8930548729589882e-10, "loss": 0.015351107344031334, "memory(GiB)": 22.66, "step": 30702, "token_acc": 1.0, "train_speed(iter/s)": 0.956602 }, { "epoch": 0.9974011629795666, "grad_norm": 0.23526433110237122, "learning_rate": 1.8466015947238292e-10, "loss": 0.007767094764858484, "memory(GiB)": 22.66, "step": 30703, "token_acc": 0.9954337899543378, "train_speed(iter/s)": 0.956609 }, { "epoch": 0.9974336484423221, "grad_norm": 0.28742775321006775, "learning_rate": 1.8007253617280397e-10, "loss": 0.007005071733146906, "memory(GiB)": 22.66, "step": 30704, "token_acc": 0.9966666666666667, "train_speed(iter/s)": 0.956616 }, { "epoch": 0.9974661339050774, "grad_norm": 0.4496851861476898, "learning_rate": 1.7554261744989754e-10, "loss": 0.017787322402000427, "memory(GiB)": 22.66, "step": 30705, "token_acc": 0.9959839357429718, "train_speed(iter/s)": 0.956623 }, { "epoch": 0.9974986193678329, "grad_norm": 0.2609005868434906, "learning_rate": 1.7107040335584412e-10, "loss": 0.008648022077977657, "memory(GiB)": 22.66, "step": 30706, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.95663 }, { "epoch": 0.9975311048305883, "grad_norm": 0.39213088154792786, "learning_rate": 1.6665589394226912e-10, "loss": 0.01090298779308796, "memory(GiB)": 22.66, "step": 30707, "token_acc": 0.9929078014184397, "train_speed(iter/s)": 0.956638 }, { "epoch": 0.9975635902933437, "grad_norm": 0.28544872999191284, "learning_rate": 1.6229908926079785e-10, "loss": 0.013532312586903572, "memory(GiB)": 22.66, "step": 30708, "token_acc": 0.9957627118644068, "train_speed(iter/s)": 0.956645 }, { "epoch": 0.9975960757560991, "grad_norm": 0.35452163219451904, "learning_rate": 1.5799998936083527e-10, "loss": 0.011749526485800743, "memory(GiB)": 22.66, "step": 30709, "token_acc": 0.9956521739130435, "train_speed(iter/s)": 0.956652 }, { "epoch": 0.9976285612188546, "grad_norm": 0.4298083186149597, "learning_rate": 1.537585942917863e-10, "loss": 0.011002965271472931, "memory(GiB)": 22.66, "step": 30710, "token_acc": 0.9906542056074766, "train_speed(iter/s)": 0.956659 }, { "epoch": 0.9976610466816099, "grad_norm": 0.3734707236289978, "learning_rate": 1.4957490410361098e-10, "loss": 0.00968787632882595, "memory(GiB)": 22.66, "step": 30711, "token_acc": 0.9919028340080972, "train_speed(iter/s)": 0.956665 }, { "epoch": 0.9976935321443654, "grad_norm": 0.23551635444164276, "learning_rate": 1.45448918844604e-10, "loss": 0.006610640324652195, "memory(GiB)": 22.66, "step": 30712, "token_acc": 1.0, "train_speed(iter/s)": 0.956673 }, { "epoch": 0.9977260176071208, "grad_norm": 0.3948880434036255, "learning_rate": 1.413806385613947e-10, "loss": 0.010700742714107037, "memory(GiB)": 22.66, "step": 30713, "token_acc": 0.992619926199262, "train_speed(iter/s)": 0.95668 }, { "epoch": 0.9977585030698762, "grad_norm": 0.3476055860519409, "learning_rate": 1.3737006330116765e-10, "loss": 0.010377483442425728, "memory(GiB)": 22.66, "step": 30714, "token_acc": 0.9957264957264957, "train_speed(iter/s)": 0.956687 }, { "epoch": 0.9977909885326316, "grad_norm": 0.5125776529312134, "learning_rate": 1.3341719311110723e-10, "loss": 0.015266967937350273, "memory(GiB)": 22.66, "step": 30715, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956694 }, { "epoch": 0.9978234739953871, "grad_norm": 0.34355875849723816, "learning_rate": 1.2952202803617754e-10, "loss": 0.009228292852640152, "memory(GiB)": 22.66, "step": 30716, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956701 }, { "epoch": 0.9978559594581424, "grad_norm": 0.4678346812725067, "learning_rate": 1.2568456812078745e-10, "loss": 0.013486728072166443, "memory(GiB)": 22.66, "step": 30717, "token_acc": 0.9964912280701754, "train_speed(iter/s)": 0.956709 }, { "epoch": 0.9978884449208979, "grad_norm": 0.31084948778152466, "learning_rate": 1.219048134104561e-10, "loss": 0.006697993725538254, "memory(GiB)": 22.66, "step": 30718, "token_acc": 0.9955357142857143, "train_speed(iter/s)": 0.956716 }, { "epoch": 0.9979209303836533, "grad_norm": 0.6257913112640381, "learning_rate": 1.1818276394792716e-10, "loss": 0.009717019274830818, "memory(GiB)": 22.66, "step": 30719, "token_acc": 0.9960474308300395, "train_speed(iter/s)": 0.956723 }, { "epoch": 0.9979534158464087, "grad_norm": 0.390535831451416, "learning_rate": 1.1451841977594414e-10, "loss": 0.015257769264280796, "memory(GiB)": 22.66, "step": 30720, "token_acc": 1.0, "train_speed(iter/s)": 0.95673 }, { "epoch": 0.9979859013091642, "grad_norm": 0.38430479168891907, "learning_rate": 1.1091178093780575e-10, "loss": 0.014315489679574966, "memory(GiB)": 22.66, "step": 30721, "token_acc": 1.0, "train_speed(iter/s)": 0.956737 }, { "epoch": 0.9980183867719196, "grad_norm": 0.4937524199485779, "learning_rate": 1.0736284747459025e-10, "loss": 0.011118900030851364, "memory(GiB)": 22.66, "step": 30722, "token_acc": 0.9950738916256158, "train_speed(iter/s)": 0.956744 }, { "epoch": 0.998050872234675, "grad_norm": 0.28492385149002075, "learning_rate": 1.0387161942682079e-10, "loss": 0.0098994430154562, "memory(GiB)": 22.66, "step": 30723, "token_acc": 0.9959514170040485, "train_speed(iter/s)": 0.956751 }, { "epoch": 0.9980833576974304, "grad_norm": 0.3249557614326477, "learning_rate": 1.004380968355756e-10, "loss": 0.007112803868949413, "memory(GiB)": 22.66, "step": 30724, "token_acc": 1.0, "train_speed(iter/s)": 0.956758 }, { "epoch": 0.9981158431601859, "grad_norm": 0.44701695442199707, "learning_rate": 9.706227974026761e-11, "loss": 0.009980956092476845, "memory(GiB)": 22.66, "step": 30725, "token_acc": 1.0, "train_speed(iter/s)": 0.956765 }, { "epoch": 0.9981483286229412, "grad_norm": 1.2785972356796265, "learning_rate": 9.374416817919951e-11, "loss": 0.0118692796677351, "memory(GiB)": 22.66, "step": 30726, "token_acc": 1.0, "train_speed(iter/s)": 0.956772 }, { "epoch": 0.9981808140856967, "grad_norm": 0.3303375840187073, "learning_rate": 9.048376219178424e-11, "loss": 0.014238111674785614, "memory(GiB)": 22.66, "step": 30727, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.956779 }, { "epoch": 0.9982132995484521, "grad_norm": 0.31276360154151917, "learning_rate": 8.728106181465912e-11, "loss": 0.009446343407034874, "memory(GiB)": 22.66, "step": 30728, "token_acc": 1.0, "train_speed(iter/s)": 0.956785 }, { "epoch": 0.9982457850112075, "grad_norm": 0.4793412685394287, "learning_rate": 8.413606708557176e-11, "loss": 0.013275202363729477, "memory(GiB)": 22.66, "step": 30729, "token_acc": 0.9920634920634921, "train_speed(iter/s)": 0.95679 }, { "epoch": 0.9982782704739629, "grad_norm": 0.36038345098495483, "learning_rate": 8.104877804004929e-11, "loss": 0.019225047901272774, "memory(GiB)": 22.66, "step": 30730, "token_acc": 0.995049504950495, "train_speed(iter/s)": 0.956796 }, { "epoch": 0.9983107559367184, "grad_norm": 0.2562107443809509, "learning_rate": 7.801919471417397e-11, "loss": 0.009878336451947689, "memory(GiB)": 22.66, "step": 30731, "token_acc": 1.0, "train_speed(iter/s)": 0.956801 }, { "epoch": 0.9983432413994737, "grad_norm": 0.43325886130332947, "learning_rate": 7.50473171429178e-11, "loss": 0.014536159113049507, "memory(GiB)": 22.66, "step": 30732, "token_acc": 0.9958333333333333, "train_speed(iter/s)": 0.956806 }, { "epoch": 0.9983757268622292, "grad_norm": 0.20580564439296722, "learning_rate": 7.21331453601426e-11, "loss": 0.00583602674305439, "memory(GiB)": 22.66, "step": 30733, "token_acc": 1.0, "train_speed(iter/s)": 0.956811 }, { "epoch": 0.9984082123249846, "grad_norm": 0.31717148423194885, "learning_rate": 6.927667940026527e-11, "loss": 0.009317119605839252, "memory(GiB)": 22.66, "step": 30734, "token_acc": 0.9954545454545455, "train_speed(iter/s)": 0.956817 }, { "epoch": 0.99844069778774, "grad_norm": 0.4754085838794708, "learning_rate": 6.647791929548231e-11, "loss": 0.013561745174229145, "memory(GiB)": 22.66, "step": 30735, "token_acc": 0.9957805907172996, "train_speed(iter/s)": 0.956822 }, { "epoch": 0.9984731832504954, "grad_norm": 0.3499637842178345, "learning_rate": 6.373686507854527e-11, "loss": 0.00840237457305193, "memory(GiB)": 22.66, "step": 30736, "token_acc": 1.0, "train_speed(iter/s)": 0.956828 }, { "epoch": 0.9985056687132509, "grad_norm": 0.2838682234287262, "learning_rate": 6.10535167810955e-11, "loss": 0.0072471387684345245, "memory(GiB)": 22.66, "step": 30737, "token_acc": 0.9964285714285714, "train_speed(iter/s)": 0.956834 }, { "epoch": 0.9985381541760062, "grad_norm": 0.4539641737937927, "learning_rate": 5.842787443421927e-11, "loss": 0.009983748197555542, "memory(GiB)": 22.66, "step": 30738, "token_acc": 0.9862385321100917, "train_speed(iter/s)": 0.95684 }, { "epoch": 0.9985706396387617, "grad_norm": 0.37101197242736816, "learning_rate": 5.5859938067892585e-11, "loss": 0.01139853335916996, "memory(GiB)": 22.66, "step": 30739, "token_acc": 0.9953488372093023, "train_speed(iter/s)": 0.956844 }, { "epoch": 0.998603125101517, "grad_norm": 0.25746580958366394, "learning_rate": 5.3349707711536356e-11, "loss": 0.006013743579387665, "memory(GiB)": 22.66, "step": 30740, "token_acc": 1.0, "train_speed(iter/s)": 0.956849 }, { "epoch": 0.9986356105642725, "grad_norm": 0.5428087711334229, "learning_rate": 5.08971833945715e-11, "loss": 0.010865996591746807, "memory(GiB)": 22.66, "step": 30741, "token_acc": 0.996078431372549, "train_speed(iter/s)": 0.956854 }, { "epoch": 0.9986680960270279, "grad_norm": 0.4283323287963867, "learning_rate": 4.850236514530871e-11, "loss": 0.013755464926362038, "memory(GiB)": 22.66, "step": 30742, "token_acc": 1.0, "train_speed(iter/s)": 0.956859 }, { "epoch": 0.9987005814897834, "grad_norm": 0.4339843988418579, "learning_rate": 4.616525299094843e-11, "loss": 0.011391950771212578, "memory(GiB)": 22.66, "step": 30743, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 0.956863 }, { "epoch": 0.9987330669525387, "grad_norm": 0.2593638598918915, "learning_rate": 4.388584695924625e-11, "loss": 0.006266272626817226, "memory(GiB)": 22.66, "step": 30744, "token_acc": 1.0, "train_speed(iter/s)": 0.956868 }, { "epoch": 0.9987655524152942, "grad_norm": 0.3801734149456024, "learning_rate": 4.1664147075737294e-11, "loss": 0.01358889788389206, "memory(GiB)": 22.66, "step": 30745, "token_acc": 1.0, "train_speed(iter/s)": 0.956871 }, { "epoch": 0.9987980378780495, "grad_norm": 0.4250258207321167, "learning_rate": 3.9500153366511804e-11, "loss": 0.013249024748802185, "memory(GiB)": 22.66, "step": 30746, "token_acc": 1.0, "train_speed(iter/s)": 0.956877 }, { "epoch": 0.998830523340805, "grad_norm": 0.42001456022262573, "learning_rate": 3.7393865855994694e-11, "loss": 0.010977540165185928, "memory(GiB)": 22.66, "step": 30747, "token_acc": 1.0, "train_speed(iter/s)": 0.956881 }, { "epoch": 0.9988630088035604, "grad_norm": 0.3344929814338684, "learning_rate": 3.5345284569165975e-11, "loss": 0.010501030832529068, "memory(GiB)": 22.66, "step": 30748, "token_acc": 1.0, "train_speed(iter/s)": 0.956887 }, { "epoch": 0.9988954942663159, "grad_norm": 0.25547462701797485, "learning_rate": 3.3354409529340325e-11, "loss": 0.013514474034309387, "memory(GiB)": 22.66, "step": 30749, "token_acc": 0.9951456310679612, "train_speed(iter/s)": 0.956893 }, { "epoch": 0.9989279797290712, "grad_norm": 0.21529096364974976, "learning_rate": 3.142124075983244e-11, "loss": 0.008237680420279503, "memory(GiB)": 22.66, "step": 30750, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956898 }, { "epoch": 0.9989604651918267, "grad_norm": 0.3632243275642395, "learning_rate": 2.954577828229166e-11, "loss": 0.011509435251355171, "memory(GiB)": 22.66, "step": 30751, "token_acc": 1.0, "train_speed(iter/s)": 0.956904 }, { "epoch": 0.998992950654582, "grad_norm": 0.3463436961174011, "learning_rate": 2.7728022118922448e-11, "loss": 0.009157950058579445, "memory(GiB)": 22.66, "step": 30752, "token_acc": 1.0, "train_speed(iter/s)": 0.95691 }, { "epoch": 0.9990254361173375, "grad_norm": 0.8020269274711609, "learning_rate": 2.5967972290263933e-11, "loss": 0.011659445241093636, "memory(GiB)": 22.66, "step": 30753, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.956916 }, { "epoch": 0.9990579215800929, "grad_norm": 0.2986462712287903, "learning_rate": 2.4265628816855237e-11, "loss": 0.01488853245973587, "memory(GiB)": 22.66, "step": 30754, "token_acc": 0.9910714285714286, "train_speed(iter/s)": 0.956922 }, { "epoch": 0.9990904070428483, "grad_norm": 0.373689740896225, "learning_rate": 2.2620991718680374e-11, "loss": 0.010135388001799583, "memory(GiB)": 22.66, "step": 30755, "token_acc": 0.996309963099631, "train_speed(iter/s)": 0.956928 }, { "epoch": 0.9991228925056037, "grad_norm": 0.4202406704425812, "learning_rate": 2.103406101461314e-11, "loss": 0.01238178089261055, "memory(GiB)": 22.66, "step": 30756, "token_acc": 0.9925925925925926, "train_speed(iter/s)": 0.956934 }, { "epoch": 0.9991553779683592, "grad_norm": 0.3204509913921356, "learning_rate": 1.9504836722417096e-11, "loss": 0.009123380295932293, "memory(GiB)": 22.66, "step": 30757, "token_acc": 0.9914163090128756, "train_speed(iter/s)": 0.95694 }, { "epoch": 0.9991878634311145, "grad_norm": 0.29747259616851807, "learning_rate": 1.803331885985582e-11, "loss": 0.009722873568534851, "memory(GiB)": 22.66, "step": 30758, "token_acc": 0.9961538461538462, "train_speed(iter/s)": 0.956946 }, { "epoch": 0.99922034889387, "grad_norm": 0.2008071094751358, "learning_rate": 1.6619507444692872e-11, "loss": 0.007823271676898003, "memory(GiB)": 22.66, "step": 30759, "token_acc": 0.9951690821256038, "train_speed(iter/s)": 0.956951 }, { "epoch": 0.9992528343566254, "grad_norm": 0.46565020084381104, "learning_rate": 1.526340249247138e-11, "loss": 0.015548638999462128, "memory(GiB)": 22.66, "step": 30760, "token_acc": 0.9864864864864865, "train_speed(iter/s)": 0.956957 }, { "epoch": 0.9992853198193808, "grad_norm": 0.38102781772613525, "learning_rate": 1.3965004019289573e-11, "loss": 0.01059191208332777, "memory(GiB)": 22.66, "step": 30761, "token_acc": 0.9909502262443439, "train_speed(iter/s)": 0.956962 }, { "epoch": 0.9993178052821362, "grad_norm": 0.3340960741043091, "learning_rate": 1.2724312039580355e-11, "loss": 0.010521497577428818, "memory(GiB)": 22.66, "step": 30762, "token_acc": 0.9879518072289156, "train_speed(iter/s)": 0.956968 }, { "epoch": 0.9993502907448917, "grad_norm": 0.20134833455085754, "learning_rate": 1.1541326567776623e-11, "loss": 0.007150572724640369, "memory(GiB)": 22.66, "step": 30763, "token_acc": 1.0, "train_speed(iter/s)": 0.956974 }, { "epoch": 0.999382776207647, "grad_norm": 0.21521887183189392, "learning_rate": 1.0416047618311275e-11, "loss": 0.008354559540748596, "memory(GiB)": 22.66, "step": 30764, "token_acc": 1.0, "train_speed(iter/s)": 0.956981 }, { "epoch": 0.9994152616704025, "grad_norm": 0.28914833068847656, "learning_rate": 9.348475203396768e-12, "loss": 0.007163546979427338, "memory(GiB)": 22.66, "step": 30765, "token_acc": 0.9958677685950413, "train_speed(iter/s)": 0.956988 }, { "epoch": 0.9994477471331579, "grad_norm": 0.4176745116710663, "learning_rate": 8.338609335800662e-12, "loss": 0.012521302327513695, "memory(GiB)": 22.66, "step": 30766, "token_acc": 0.988, "train_speed(iter/s)": 0.956995 }, { "epoch": 0.9994802325959133, "grad_norm": 0.5274630188941956, "learning_rate": 7.386450026625192e-12, "loss": 0.007168459240347147, "memory(GiB)": 22.66, "step": 30767, "token_acc": 1.0, "train_speed(iter/s)": 0.957002 }, { "epoch": 0.9995127180586687, "grad_norm": 0.3633677363395691, "learning_rate": 6.491997286972584e-12, "loss": 0.0072964453138411045, "memory(GiB)": 22.66, "step": 30768, "token_acc": 1.0, "train_speed(iter/s)": 0.957009 }, { "epoch": 0.9995452035214242, "grad_norm": 0.36868685483932495, "learning_rate": 5.6552511279450716e-12, "loss": 0.01379459723830223, "memory(GiB)": 22.66, "step": 30769, "token_acc": 0.988, "train_speed(iter/s)": 0.957016 }, { "epoch": 0.9995776889841795, "grad_norm": 0.36927372217178345, "learning_rate": 4.876211558424437e-12, "loss": 0.01221967488527298, "memory(GiB)": 22.66, "step": 30770, "token_acc": 0.9962546816479401, "train_speed(iter/s)": 0.957023 }, { "epoch": 0.999610174446935, "grad_norm": 0.3844362199306488, "learning_rate": 4.1548785872924655e-12, "loss": 0.009130388498306274, "memory(GiB)": 22.66, "step": 30771, "token_acc": 1.0, "train_speed(iter/s)": 0.95703 }, { "epoch": 0.9996426599096904, "grad_norm": 0.4261927604675293, "learning_rate": 3.491252223430941e-12, "loss": 0.01536897849291563, "memory(GiB)": 22.66, "step": 30772, "token_acc": 1.0, "train_speed(iter/s)": 0.957037 }, { "epoch": 0.9996751453724458, "grad_norm": 0.2873091697692871, "learning_rate": 2.8853324746114242e-12, "loss": 0.01019647903740406, "memory(GiB)": 22.66, "step": 30773, "token_acc": 0.9912663755458515, "train_speed(iter/s)": 0.957044 }, { "epoch": 0.9997076308352012, "grad_norm": 0.42197275161743164, "learning_rate": 2.3371193469401423e-12, "loss": 0.012834717519581318, "memory(GiB)": 22.66, "step": 30774, "token_acc": 0.9893048128342246, "train_speed(iter/s)": 0.957051 }, { "epoch": 0.9997401162979567, "grad_norm": 0.3476264774799347, "learning_rate": 1.8466128476335445e-12, "loss": 0.0071222479455173016, "memory(GiB)": 22.66, "step": 30775, "token_acc": 0.9914893617021276, "train_speed(iter/s)": 0.957058 }, { "epoch": 0.999772601760712, "grad_norm": 0.3684327006340027, "learning_rate": 1.4138129816876344e-12, "loss": 0.01100651640444994, "memory(GiB)": 22.66, "step": 30776, "token_acc": 0.9961977186311787, "train_speed(iter/s)": 0.957066 }, { "epoch": 0.9998050872234675, "grad_norm": 0.2596253752708435, "learning_rate": 1.0387197546535278e-12, "loss": 0.011838800273835659, "memory(GiB)": 22.66, "step": 30777, "token_acc": 0.9960159362549801, "train_speed(iter/s)": 0.957073 }, { "epoch": 0.9998375726862229, "grad_norm": 0.30252575874328613, "learning_rate": 7.213331704170045e-13, "loss": 0.0118678268045187, "memory(GiB)": 22.66, "step": 30778, "token_acc": 0.996031746031746, "train_speed(iter/s)": 0.95708 }, { "epoch": 0.9998700581489783, "grad_norm": 0.3823351263999939, "learning_rate": 4.616532334189571e-13, "loss": 0.010158486664295197, "memory(GiB)": 22.66, "step": 30779, "token_acc": 1.0, "train_speed(iter/s)": 0.957087 }, { "epoch": 0.9999025436117337, "grad_norm": 0.269962877035141, "learning_rate": 2.596799453247201e-13, "loss": 0.00913385208696127, "memory(GiB)": 22.66, "step": 30780, "token_acc": 1.0, "train_speed(iter/s)": 0.957094 }, { "epoch": 0.9999350290744892, "grad_norm": 0.43602311611175537, "learning_rate": 1.1541330946496233e-13, "loss": 0.013277817517518997, "memory(GiB)": 22.66, "step": 30781, "token_acc": 0.9930795847750865, "train_speed(iter/s)": 0.9571 }, { "epoch": 0.9999675145372445, "grad_norm": 0.22620439529418945, "learning_rate": 2.885332750501846e-14, "loss": 0.0068231383338570595, "memory(GiB)": 22.66, "step": 30782, "token_acc": 1.0, "train_speed(iter/s)": 0.957107 }, { "epoch": 1.0, "grad_norm": 0.36173006892204285, "learning_rate": 0.0, "loss": 0.014988595619797707, "memory(GiB)": 22.66, "step": 30783, "token_acc": 0.9818181818181818, "train_speed(iter/s)": 0.957117 }, { "epoch": 1.0, "eval_loss": 0.011084680445492268, "eval_runtime": 80.2616, "eval_samples_per_second": 123.97, "eval_steps_per_second": 3.875, "eval_token_acc": 0.9955192657842487, "step": 30783 } ], "logging_steps": 1, "max_steps": 30783, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.539669836407243e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }