{"loss": 0.67619711, "token_acc": 0.83606557, "grad_norm": 64.5, "learning_rate": 5.2e-07, "memory(GiB)": 82.76, "train_speed(iter/s)": 0.019472, "epoch": 0.00259572, "global_step/max_steps": "1/1925", "percentage": "0.05%", "elapsed_time": "48s", "remaining_time": "1d 1h 58m 9s"}
{"loss": 0.66505557, "token_acc": 0.82316443, "grad_norm": 66.0, "learning_rate": 2.59e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021208, "epoch": 0.01297859, "global_step/max_steps": "5/1925", "percentage": "0.26%", "elapsed_time": "3m 52s", "remaining_time": "1d 0h 51m 10s"}
{"loss": 0.4840095, "token_acc": 0.84672742, "grad_norm": 48.5, "learning_rate": 5.18e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020876, "epoch": 0.02595717, "global_step/max_steps": "10/1925", "percentage": "0.52%", "elapsed_time": "7m 56s", "remaining_time": "1d 1h 20m 0s"}
{"loss": 0.3320765, "token_acc": 0.86963563, "grad_norm": 137.0, "learning_rate": 7.77e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021309, "epoch": 0.03893576, "global_step/max_steps": "15/1925", "percentage": "0.78%", "elapsed_time": "11m 41s", "remaining_time": "1d 0h 48m 0s"}
{"loss": 0.31110988, "token_acc": 0.86650083, "grad_norm": 21.25, "learning_rate": 1.036e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02131, "epoch": 0.05191434, "global_step/max_steps": "20/1925", "percentage": "1.04%", "elapsed_time": "15m 35s", "remaining_time": "1d 0h 45m 32s"}
{"loss": 0.24997964, "token_acc": 0.8876128, "grad_norm": 21.5, "learning_rate": 1.295e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021488, "epoch": 0.06489293, "global_step/max_steps": "25/1925", "percentage": "1.30%", "elapsed_time": "19m 20s", "remaining_time": "1d 0h 30m 9s"}
{"loss": 0.21027086, "token_acc": 0.91719745, "grad_norm": 25.5, "learning_rate": 1.554e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021184, "epoch": 0.07787151, "global_step/max_steps": "30/1925", "percentage": "1.56%", "elapsed_time": "23m 33s", "remaining_time": "1d 0h 47m 58s"}
{"loss": 0.18175366, "token_acc": 0.93227425, "grad_norm": 19.25, "learning_rate": 1.813e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021197, "epoch": 0.0908501, "global_step/max_steps": "35/1925", "percentage": "1.82%", "elapsed_time": "27m 28s", "remaining_time": "1d 0h 43m 32s"}
{"loss": 0.20702987, "token_acc": 0.9324547, "grad_norm": 42.0, "learning_rate": 2.073e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021199, "epoch": 0.10382868, "global_step/max_steps": "40/1925", "percentage": "2.08%", "elapsed_time": "31m 24s", "remaining_time": "1d 0h 39m 48s"}
{"loss": 0.14612391, "token_acc": 0.94313566, "grad_norm": 31.0, "learning_rate": 2.332e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021162, "epoch": 0.11680727, "global_step/max_steps": "45/1925", "percentage": "2.34%", "elapsed_time": "35m 23s", "remaining_time": "1d 0h 38m 43s"}
{"loss": 0.12520741, "token_acc": 0.95541401, "grad_norm": 33.5, "learning_rate": 2.591e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02116, "epoch": 0.12978585, "global_step/max_steps": "50/1925", "percentage": "2.60%", "elapsed_time": "39m 20s", "remaining_time": "1d 0h 35m 5s"}
{"loss": 0.08470201, "token_acc": 0.97323601, "grad_norm": 14.6875, "learning_rate": 2.85e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021224, "epoch": 0.14276444, "global_step/max_steps": "55/1925", "percentage": "2.86%", "elapsed_time": "43m 8s", "remaining_time": "1d 0h 26m 54s"}
{"loss": 0.14687307, "token_acc": 0.94895397, "grad_norm": 24.75, "learning_rate": 3.109e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021111, "epoch": 0.15574302, "global_step/max_steps": "60/1925", "percentage": "3.12%", "elapsed_time": "47m 19s", "remaining_time": "1d 0h 30m 58s"}
{"loss": 0.09898703, "token_acc": 0.96260163, "grad_norm": 13.375, "learning_rate": 3.368e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021069, "epoch": 0.16872161, "global_step/max_steps": "65/1925", "percentage": "3.38%", "elapsed_time": "51m 22s", "remaining_time": "1d 0h 30m 3s"}
{"loss": 0.15612952, "token_acc": 0.95714286, "grad_norm": 29.25, "learning_rate": 3.627e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021002, "epoch": 0.18170019, "global_step/max_steps": "70/1925", "percentage": "3.64%", "elapsed_time": "55m 30s", "remaining_time": "1d 0h 30m 51s"}
{"loss": 0.10940381, "token_acc": 0.96906355, "grad_norm": 16.0, "learning_rate": 3.886e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02104, "epoch": 0.19467878, "global_step/max_steps": "75/1925", "percentage": "3.90%", "elapsed_time": "59m 21s", "remaining_time": "1d 0h 24m 18s"}
{"loss": 0.13801569, "token_acc": 0.95733788, "grad_norm": 11.5, "learning_rate": 4.145e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020886, "epoch": 0.20765737, "global_step/max_steps": "80/1925", "percentage": "4.16%", "elapsed_time": "1h 3m 47s", "remaining_time": "1d 0h 31m 14s"}
{"loss": 0.1114679, "token_acc": 0.96624473, "grad_norm": 28.375, "learning_rate": 4.404e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020723, "epoch": 0.22063595, "global_step/max_steps": "85/1925", "percentage": "4.42%", "elapsed_time": "1h 8m 19s", "remaining_time": "1d 0h 38m 52s"}
{"loss": 0.13793828, "token_acc": 0.95662848, "grad_norm": 8.125, "learning_rate": 4.663e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020619, "epoch": 0.23361454, "global_step/max_steps": "90/1925", "percentage": "4.68%", "elapsed_time": "1h 12m 42s", "remaining_time": "1d 0h 42m 21s"}
{"loss": 0.132942, "token_acc": 0.96169858, "grad_norm": 11.9375, "learning_rate": 4.922e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020597, "epoch": 0.24659312, "global_step/max_steps": "95/1925", "percentage": "4.94%", "elapsed_time": "1h 16m 49s", "remaining_time": "1d 0h 39m 53s"}
{"loss": 0.15107508, "token_acc": 0.96494157, "grad_norm": 78.5, "learning_rate": 5.181e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020542, "epoch": 0.25957171, "global_step/max_steps": "100/1925", "percentage": "5.19%", "elapsed_time": "1h 21m 5s", "remaining_time": "1d 0h 39m 51s"}
{"loss": 0.27871945, "token_acc": 0.92410342, "grad_norm": 13.75, "learning_rate": 5.44e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02047, "epoch": 0.27255029, "global_step/max_steps": "105/1925", "percentage": "5.45%", "elapsed_time": "1h 25m 26s", "remaining_time": "1d 0h 41m 1s"}
{"loss": 0.14314145, "token_acc": 0.95670275, "grad_norm": 7.1875, "learning_rate": 5.699e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020365, "epoch": 0.28552888, "global_step/max_steps": "110/1925", "percentage": "5.71%", "elapsed_time": "1h 29m 58s", "remaining_time": "1d 0h 44m 39s"}
{"loss": 0.14128265, "token_acc": 0.9611249, "grad_norm": 7.40625, "learning_rate": 5.959e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02028, "epoch": 0.29850746, "global_step/max_steps": "115/1925", "percentage": "5.97%", "elapsed_time": "1h 34m 27s", "remaining_time": "1d 0h 46m 49s"}
{"loss": 0.15114243, "token_acc": 0.95281457, "grad_norm": 7.625, "learning_rate": 6.218e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02023, "epoch": 0.31148605, "global_step/max_steps": "120/1925", "percentage": "6.23%", "elapsed_time": "1h 38m 48s", "remaining_time": "1d 0h 46m 21s"}
{"loss": 0.14014119, "token_acc": 0.95687551, "grad_norm": 15.875, "learning_rate": 6.477e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020211, "epoch": 0.32446463, "global_step/max_steps": "125/1925", "percentage": "6.49%", "elapsed_time": "1h 43m 2s", "remaining_time": "1d 0h 43m 40s"}
{"loss": 0.14676701, "token_acc": 0.9623431, "grad_norm": 13.0625, "learning_rate": 6.736e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020168, "epoch": 0.33744322, "global_step/max_steps": "130/1925", "percentage": "6.75%", "elapsed_time": "1h 47m 23s", "remaining_time": "1d 0h 42m 45s"}
{"loss": 0.15908314, "token_acc": 0.9551495, "grad_norm": 11.5625, "learning_rate": 6.995e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020164, "epoch": 0.3504218, "global_step/max_steps": "135/1925", "percentage": "7.01%", "elapsed_time": "1h 51m 32s", "remaining_time": "1d 0h 38m 54s"}
{"loss": 0.14115771, "token_acc": 0.95375723, "grad_norm": 8.375, "learning_rate": 7.254e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02013, "epoch": 0.36340039, "global_step/max_steps": "140/1925", "percentage": "7.27%", "elapsed_time": "1h 55m 52s", "remaining_time": "1d 0h 37m 19s"}
{"loss": 0.17947659, "token_acc": 0.94798658, "grad_norm": 12.5, "learning_rate": 7.513e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020045, "epoch": 0.37637897, "global_step/max_steps": "145/1925", "percentage": "7.53%", "elapsed_time": "2h 0m 30s", "remaining_time": "1d 0h 39m 24s"}
{"loss": 0.13564353, "token_acc": 0.95454545, "grad_norm": 12.6875, "learning_rate": 7.772e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020024, "epoch": 0.38935756, "global_step/max_steps": "150/1925", "percentage": "7.79%", "elapsed_time": "2h 4m 48s", "remaining_time": "1d 0h 36m 48s"}
{"loss": 0.14768316, "token_acc": 0.95424837, "grad_norm": 9.375, "learning_rate": 8.031e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02003, "epoch": 0.40233615, "global_step/max_steps": "155/1925", "percentage": "8.05%", "elapsed_time": "2h 8m 55s", "remaining_time": "1d 0h 32m 17s"}
{"loss": 0.15876762, "token_acc": 0.95570139, "grad_norm": 8.5625, "learning_rate": 8.29e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020035, "epoch": 0.41531473, "global_step/max_steps": "160/1925", "percentage": "8.31%", "elapsed_time": "2h 13m 3s", "remaining_time": "1d 0h 27m 44s"}
{"loss": 0.20421419, "token_acc": 0.93634841, "grad_norm": 5.6875, "learning_rate": 8.549e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020018, "epoch": 0.42829332, "global_step/max_steps": "165/1925", "percentage": "8.57%", "elapsed_time": "2h 17m 19s", "remaining_time": "1d 0h 24m 51s"}
{"loss": 0.13153988, "token_acc": 0.96378601, "grad_norm": 8.4375, "learning_rate": 8.808e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02002, "epoch": 0.4412719, "global_step/max_steps": "170/1925", "percentage": "8.83%", "elapsed_time": "2h 21m 28s", "remaining_time": "1d 0h 20m 34s"}
{"loss": 0.32761114, "token_acc": 0.93582453, "grad_norm": 8.375, "learning_rate": 9.067e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020018, "epoch": 0.45425049, "global_step/max_steps": "175/1925", "percentage": "9.09%", "elapsed_time": "2h 25m 39s", "remaining_time": "1d 0h 16m 34s"}
{"loss": 0.13705795, "token_acc": 0.96460905, "grad_norm": 12.1875, "learning_rate": 9.326e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020066, "epoch": 0.46722907, "global_step/max_steps": "180/1925", "percentage": "9.35%", "elapsed_time": "2h 29m 27s", "remaining_time": "1d 0h 8m 55s"}
{"loss": 0.17206544, "token_acc": 0.94859038, "grad_norm": 10.0, "learning_rate": 9.585e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020072, "epoch": 0.48020766, "global_step/max_steps": "185/1925", "percentage": "9.61%", "elapsed_time": "2h 33m 34s", "remaining_time": "1d 0h 4m 23s"}
{"loss": 0.15847107, "token_acc": 0.95101351, "grad_norm": 65.5, "learning_rate": 9.845e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020079, "epoch": 0.49318624, "global_step/max_steps": "190/1925", "percentage": "9.87%", "elapsed_time": "2h 37m 39s", "remaining_time": "23h 59m 42s"}
{"loss": 0.12832046, "token_acc": 0.96084829, "grad_norm": 6.75, "learning_rate": 0.0001, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020091, "epoch": 0.50616483, "global_step/max_steps": "195/1925", "percentage": "10.13%", "elapsed_time": "2h 41m 42s", "remaining_time": "23h 54m 42s"}
{"loss": 0.22241216, "token_acc": 0.94673123, "grad_norm": 21.625, "learning_rate": 0.0001, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020136, "epoch": 0.51914341, "global_step/max_steps": "200/1925", "percentage": "10.39%", "elapsed_time": "2h 45m 29s", "remaining_time": "23h 47m 22s"}
{"loss": 0.31893365, "token_acc": 0.91421569, "grad_norm": 105.5, "learning_rate": 9.999e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020143, "epoch": 0.532122, "global_step/max_steps": "205/1925", "percentage": "10.65%", "elapsed_time": "2h 49m 34s", "remaining_time": "23h 42m 46s"}
{"loss": 0.35053172, "token_acc": 0.9171452, "grad_norm": 14.0, "learning_rate": 9.998e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020174, "epoch": 0.54510058, "global_step/max_steps": "210/1925", "percentage": "10.91%", "elapsed_time": "2h 53m 26s", "remaining_time": "23h 36m 26s"}
{"loss": 0.17262498, "token_acc": 0.94381107, "grad_norm": 7.8125, "learning_rate": 9.996e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020194, "epoch": 0.55807917, "global_step/max_steps": "215/1925", "percentage": "11.17%", "elapsed_time": "2h 57m 24s", "remaining_time": "23h 30m 57s"}
{"loss": 0.1965905, "token_acc": 0.94338052, "grad_norm": 9.6875, "learning_rate": 9.994e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020193, "epoch": 0.57105775, "global_step/max_steps": "220/1925", "percentage": "11.43%", "elapsed_time": "3h 1m 31s", "remaining_time": "23h 26m 52s"}
{"loss": 0.16929464, "token_acc": 0.94641385, "grad_norm": 5.78125, "learning_rate": 9.992e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020175, "epoch": 0.58403634, "global_step/max_steps": "225/1925", "percentage": "11.69%", "elapsed_time": "3h 5m 49s", "remaining_time": "23h 24m 3s"}
{"loss": 0.15234674, "token_acc": 0.95390947, "grad_norm": 8.0625, "learning_rate": 9.989e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020195, "epoch": 0.59701493, "global_step/max_steps": "230/1925", "percentage": "11.95%", "elapsed_time": "3h 9m 46s", "remaining_time": "23h 18m 30s"}
{"loss": 0.1638909, "token_acc": 0.95333333, "grad_norm": 3.796875, "learning_rate": 9.985e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020211, "epoch": 0.60999351, "global_step/max_steps": "235/1925", "percentage": "12.21%", "elapsed_time": "3h 13m 44s", "remaining_time": "23h 13m 19s"}
{"loss": 0.12358022, "token_acc": 0.96379726, "grad_norm": 5.65625, "learning_rate": 9.982e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020238, "epoch": 0.6229721, "global_step/max_steps": "240/1925", "percentage": "12.47%", "elapsed_time": "3h 17m 35s", "remaining_time": "23h 7m 18s"}
{"loss": 0.21105986, "token_acc": 0.94040816, "grad_norm": 9.5, "learning_rate": 9.978e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020262, "epoch": 0.63595068, "global_step/max_steps": "245/1925", "percentage": "12.73%", "elapsed_time": "3h 21m 28s", "remaining_time": "23h 1m 34s"}
{"loss": 0.12423412, "token_acc": 0.95598007, "grad_norm": 4.34375, "learning_rate": 9.973e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020264, "epoch": 0.64892927, "global_step/max_steps": "250/1925", "percentage": "12.99%", "elapsed_time": "3h 25m 34s", "remaining_time": "22h 57m 22s"}
{"loss": 1.12987251, "token_acc": 0.85561936, "grad_norm": 32.25, "learning_rate": 9.968e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020271, "epoch": 0.66190785, "global_step/max_steps": "255/1925", "percentage": "13.25%", "elapsed_time": "3h 29m 36s", "remaining_time": "22h 52m 46s"}
{"loss": 0.62461486, "token_acc": 0.84471545, "grad_norm": 20.5, "learning_rate": 9.963e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020304, "epoch": 0.67488644, "global_step/max_steps": "260/1925", "percentage": "13.51%", "elapsed_time": "3h 33m 22s", "remaining_time": "22h 46m 25s"}
{"loss": 0.54928823, "token_acc": 0.91524042, "grad_norm": 596.0, "learning_rate": 9.957e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020343, "epoch": 0.68786502, "global_step/max_steps": "265/1925", "percentage": "13.77%", "elapsed_time": "3h 37m 4s", "remaining_time": "22h 39m 44s"}
{"loss": 0.15117992, "token_acc": 0.9506689, "grad_norm": 5.8125, "learning_rate": 9.951e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02037, "epoch": 0.70084361, "global_step/max_steps": "270/1925", "percentage": "14.03%", "elapsed_time": "3h 40m 51s", "remaining_time": "22h 33m 48s"}
{"loss": 1.64415321, "token_acc": 0.82741536, "grad_norm": 154.0, "learning_rate": 9.945e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020369, "epoch": 0.71382219, "global_step/max_steps": "275/1925", "percentage": "14.29%", "elapsed_time": "3h 44m 57s", "remaining_time": "22h 29m 47s"}
{"loss": 1.14118338, "token_acc": 0.85433715, "grad_norm": 7.75, "learning_rate": 9.938e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020393, "epoch": 0.72680078, "global_step/max_steps": "280/1925", "percentage": "14.55%", "elapsed_time": "3h 48m 47s", "remaining_time": "22h 24m 10s"}
{"loss": 0.25474195, "token_acc": 0.94947025, "grad_norm": 13.6875, "learning_rate": 9.931e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020415, "epoch": 0.73977936, "global_step/max_steps": "285/1925", "percentage": "14.81%", "elapsed_time": "3h 52m 37s", "remaining_time": "22h 18m 38s"}
{"loss": 0.21210158, "token_acc": 0.94285714, "grad_norm": 27.75, "learning_rate": 9.923e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020426, "epoch": 0.75275795, "global_step/max_steps": "290/1925", "percentage": "15.06%", "elapsed_time": "3h 56m 35s", "remaining_time": "22h 13m 50s"}
{"loss": 0.17410607, "token_acc": 0.94741167, "grad_norm": 220.0, "learning_rate": 9.915e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020432, "epoch": 0.76573653, "global_step/max_steps": "295/1925", "percentage": "15.32%", "elapsed_time": "4h 0m 35s", "remaining_time": "22h 9m 23s"}
{"loss": 0.19344786, "token_acc": 0.92845395, "grad_norm": 11.5, "learning_rate": 9.906e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020448, "epoch": 0.77871512, "global_step/max_steps": "300/1925", "percentage": "15.58%", "elapsed_time": "4h 4m 28s", "remaining_time": "22h 4m 14s"}
{"loss": 0.2023138, "token_acc": 0.92868988, "grad_norm": 3.109375, "learning_rate": 9.897e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020452, "epoch": 0.79169371, "global_step/max_steps": "305/1925", "percentage": "15.84%", "elapsed_time": "4h 8m 30s", "remaining_time": "21h 59m 55s"}
{"loss": 0.1684238, "token_acc": 0.94444444, "grad_norm": 4.59375, "learning_rate": 9.888e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020467, "epoch": 0.80467229, "global_step/max_steps": "310/1925", "percentage": "16.10%", "elapsed_time": "4h 12m 23s", "remaining_time": "21h 54m 51s"}
{"loss": 0.16689838, "token_acc": 0.94690265, "grad_norm": 2.875, "learning_rate": 9.878e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020482, "epoch": 0.81765088, "global_step/max_steps": "315/1925", "percentage": "16.36%", "elapsed_time": "4h 16m 16s", "remaining_time": "21h 49m 52s"}
{"loss": 0.15300171, "token_acc": 0.95253955, "grad_norm": 4.40625, "learning_rate": 9.868e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.83062946, "global_step/max_steps": "320/1925", "percentage": "16.62%", "elapsed_time": "4h 20m 0s", "remaining_time": "21h 44m 7s"}
{"loss": 0.16267442, "token_acc": 0.95390947, "grad_norm": 3.3125, "learning_rate": 9.857e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020505, "epoch": 0.84360805, "global_step/max_steps": "325/1925", "percentage": "16.88%", "elapsed_time": "4h 24m 6s", "remaining_time": "21h 40m 14s"}
{"loss": 0.18282036, "token_acc": 0.94176373, "grad_norm": 7.21875, "learning_rate": 9.846e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.85658663, "global_step/max_steps": "330/1925", "percentage": "17.14%", "elapsed_time": "4h 28m 8s", "remaining_time": "21h 36m 2s"}
{"loss": 0.1513697, "token_acc": 0.95778146, "grad_norm": 4.53125, "learning_rate": 9.835e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.86956522, "global_step/max_steps": "335/1925", "percentage": "17.40%", "elapsed_time": "4h 32m 12s", "remaining_time": "21h 31m 57s"}
{"loss": 0.12393852, "token_acc": 0.96305419, "grad_norm": 2.515625, "learning_rate": 9.823e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020525, "epoch": 0.8825438, "global_step/max_steps": "340/1925", "percentage": "17.66%", "elapsed_time": "4h 36m 2s", "remaining_time": "21h 26m 48s"}
{"loss": 0.12096944, "token_acc": 0.95507488, "grad_norm": 5.65625, "learning_rate": 9.811e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020516, "epoch": 0.89552239, "global_step/max_steps": "345/1925", "percentage": "17.92%", "elapsed_time": "4h 40m 13s", "remaining_time": "21h 23m 21s"}
{"loss": 0.14237204, "token_acc": 0.95795548, "grad_norm": 5.03125, "learning_rate": 9.799e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020518, "epoch": 0.90850097, "global_step/max_steps": "350/1925", "percentage": "18.18%", "elapsed_time": "4h 44m 15s", "remaining_time": "21h 19m 8s"}
{"loss": 0.15217357, "token_acc": 0.95676998, "grad_norm": 3.609375, "learning_rate": 9.786e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020531, "epoch": 0.92147956, "global_step/max_steps": "355/1925", "percentage": "18.44%", "elapsed_time": "4h 48m 8s", "remaining_time": "21h 14m 18s"}
{"loss": 0.14129272, "token_acc": 0.95016611, "grad_norm": 2.078125, "learning_rate": 9.772e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020532, "epoch": 0.93445814, "global_step/max_steps": "360/1925", "percentage": "18.70%", "elapsed_time": "4h 52m 10s", "remaining_time": "21h 10m 8s"}
{"loss": 0.14151201, "token_acc": 0.9601626, "grad_norm": 3.828125, "learning_rate": 9.759e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020539, "epoch": 0.94743673, "global_step/max_steps": "365/1925", "percentage": "18.96%", "elapsed_time": "4h 56m 8s", "remaining_time": "21h 5m 40s"}
{"loss": 0.1126719, "token_acc": 0.95960429, "grad_norm": 2.0625, "learning_rate": 9.745e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020569, "epoch": 0.96041531, "global_step/max_steps": "370/1925", "percentage": "19.22%", "elapsed_time": "4h 59m 45s", "remaining_time": "20h 59m 48s"}
{"loss": 0.13800282, "token_acc": 0.95321637, "grad_norm": 2.796875, "learning_rate": 9.73e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02059, "epoch": 0.9733939, "global_step/max_steps": "375/1925", "percentage": "19.48%", "elapsed_time": "5h 3m 30s", "remaining_time": "20h 54m 28s"}
{"loss": 0.1334955, "token_acc": 0.95539335, "grad_norm": 4.21875, "learning_rate": 9.715e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020592, "epoch": 0.98637249, "global_step/max_steps": "380/1925", "percentage": "19.74%", "elapsed_time": "5h 7m 30s", "remaining_time": "20h 50m 17s"}
{"loss": 0.11987956, "token_acc": 0.96043771, "grad_norm": 4.25, "learning_rate": 9.7e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020608, "epoch": 0.99935107, "global_step/max_steps": "385/1925", "percentage": "20.00%", "elapsed_time": "5h 11m 19s", "remaining_time": "20h 45m 18s"}
{"loss": 0.07847664, "token_acc": 0.97016362, "grad_norm": 2.890625, "learning_rate": 9.684e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020645, "epoch": 1.01038287, "global_step/max_steps": "390/1925", "percentage": "20.26%", "elapsed_time": "5h 14m 47s", "remaining_time": "20h 38m 59s"}
{"loss": 0.28727553, "token_acc": 0.95206612, "grad_norm": 2.578125, "learning_rate": 9.668e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020648, "epoch": 1.02336145, "global_step/max_steps": "395/1925", "percentage": "20.52%", "elapsed_time": "5h 18m 47s", "remaining_time": "20h 34m 48s"}
{"loss": 0.10458629, "token_acc": 0.96109272, "grad_norm": 2.71875, "learning_rate": 9.652e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020649, "epoch": 1.03634004, "global_step/max_steps": "400/1925", "percentage": "20.78%", "elapsed_time": "5h 22m 48s", "remaining_time": "20h 30m 42s"}
{"loss": 0.07832456, "token_acc": 0.96940419, "grad_norm": 4.0625, "learning_rate": 9.635e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020671, "epoch": 1.04931862, "global_step/max_steps": "405/1925", "percentage": "21.04%", "elapsed_time": "5h 26m 30s", "remaining_time": "20h 25m 23s"}
{"loss": 0.09629099, "token_acc": 0.96989422, "grad_norm": 9.375, "learning_rate": 9.618e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02067, "epoch": 1.06229721, "global_step/max_steps": "410/1925", "percentage": "21.30%", "elapsed_time": "5h 30m 32s", "remaining_time": "20h 21m 24s"}
{"loss": 0.08838108, "token_acc": 0.97622951, "grad_norm": 1.5625, "learning_rate": 9.6e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020675, "epoch": 1.07527579, "global_step/max_steps": "415/1925", "percentage": "21.56%", "elapsed_time": "5h 34m 29s", "remaining_time": "20h 17m 5s"}
{"loss": 0.09818625, "token_acc": 0.96236099, "grad_norm": 2.203125, "learning_rate": 9.582e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020676, "epoch": 1.08825438, "global_step/max_steps": "420/1925", "percentage": "21.82%", "elapsed_time": "5h 38m 30s", "remaining_time": "20h 12m 59s"}
{"loss": 0.09333943, "token_acc": 0.97264682, "grad_norm": 3.734375, "learning_rate": 9.564e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020684, "epoch": 1.10123297, "global_step/max_steps": "425/1925", "percentage": "22.08%", "elapsed_time": "5h 42m 24s", "remaining_time": "20h 8m 29s"}
{"loss": 0.06768833, "token_acc": 0.97898141, "grad_norm": 1.6953125, "learning_rate": 9.545e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.11421155, "global_step/max_steps": "430/1925", "percentage": "22.34%", "elapsed_time": "5h 46m 18s", "remaining_time": "20h 4m 0s"}
{"loss": 0.09929285, "token_acc": 0.97603834, "grad_norm": 2.390625, "learning_rate": 9.526e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020704, "epoch": 1.12719014, "global_step/max_steps": "435/1925", "percentage": "22.60%", "elapsed_time": "5h 50m 8s", "remaining_time": "19h 59m 18s"}
{"loss": 0.11887519, "token_acc": 0.95698925, "grad_norm": 1.7890625, "learning_rate": 9.507e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020705, "epoch": 1.14016872, "global_step/max_steps": "440/1925", "percentage": "22.86%", "elapsed_time": "5h 54m 7s", "remaining_time": "19h 55m 11s"}
{"loss": 0.09447285, "token_acc": 0.96600332, "grad_norm": 1.6484375, "learning_rate": 9.487e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.15314731, "global_step/max_steps": "445/1925", "percentage": "23.12%", "elapsed_time": "5h 58m 23s", "remaining_time": "19h 51m 56s"}
{"loss": 0.11140995, "token_acc": 0.96522477, "grad_norm": 2.796875, "learning_rate": 9.467e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020698, "epoch": 1.16612589, "global_step/max_steps": "450/1925", "percentage": "23.38%", "elapsed_time": "6h 2m 18s", "remaining_time": "19h 47m 34s"}
{"loss": 0.10376272, "token_acc": 0.96495518, "grad_norm": 4.9375, "learning_rate": 9.446e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020701, "epoch": 1.17910448, "global_step/max_steps": "455/1925", "percentage": "23.64%", "elapsed_time": "6h 6m 17s", "remaining_time": "19h 43m 23s"}
{"loss": 0.10616806, "token_acc": 0.96710526, "grad_norm": 5.6875, "learning_rate": 9.425e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020708, "epoch": 1.19208306, "global_step/max_steps": "460/1925", "percentage": "23.90%", "elapsed_time": "6h 10m 10s", "remaining_time": "19h 38m 55s"}
{"loss": 0.10312039, "token_acc": 0.97004049, "grad_norm": 2.046875, "learning_rate": 9.404e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02071, "epoch": 1.20506165, "global_step/max_steps": "465/1925", "percentage": "24.16%", "elapsed_time": "6h 14m 10s", "remaining_time": "19h 34m 49s"}
{"loss": 0.11268963, "token_acc": 0.96431468, "grad_norm": 1.6796875, "learning_rate": 9.382e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020713, "epoch": 1.21804023, "global_step/max_steps": "470/1925", "percentage": "24.42%", "elapsed_time": "6h 18m 8s", "remaining_time": "19h 30m 36s"}
{"loss": 0.11029823, "token_acc": 0.96143959, "grad_norm": 2.125, "learning_rate": 9.36e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02071, "epoch": 1.23101882, "global_step/max_steps": "475/1925", "percentage": "24.68%", "elapsed_time": "6h 22m 13s", "remaining_time": "19h 26m 47s"}
{"loss": 0.08803771, "token_acc": 0.97379197, "grad_norm": 2.734375, "learning_rate": 9.338e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020707, "epoch": 1.2439974, "global_step/max_steps": "480/1925", "percentage": "24.94%", "elapsed_time": "6h 26m 18s", "remaining_time": "19h 22m 55s"}
{"loss": 0.12014461, "token_acc": 0.96006656, "grad_norm": 2.46875, "learning_rate": 9.315e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020714, "epoch": 1.25697599, "global_step/max_steps": "485/1925", "percentage": "25.19%", "elapsed_time": "6h 30m 11s", "remaining_time": "19h 18m 29s"}
{"loss": 0.06196777, "token_acc": 0.98192276, "grad_norm": 1.2890625, "learning_rate": 9.292e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020719, "epoch": 1.26995457, "global_step/max_steps": "490/1925", "percentage": "25.45%", "elapsed_time": "6h 34m 7s", "remaining_time": "19h 14m 13s"}
{"loss": 0.08717445, "token_acc": 0.97124076, "grad_norm": 3.328125, "learning_rate": 9.268e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020714, "epoch": 1.28293316, "global_step/max_steps": "495/1925", "percentage": "25.71%", "elapsed_time": "6h 38m 14s", "remaining_time": "19h 10m 28s"}
{"loss": 0.11612108, "token_acc": 0.96360629, "grad_norm": 3.53125, "learning_rate": 9.245e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020724, "epoch": 1.29591175, "global_step/max_steps": "500/1925", "percentage": "25.97%", "elapsed_time": "6h 42m 3s", "remaining_time": "19h 5m 52s"}
{"loss": 0.09390705, "token_acc": 0.96994992, "grad_norm": 2.453125, "learning_rate": 9.22e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020673, "epoch": 1.30889033, "global_step/max_steps": "505/1925", "percentage": "26.23%", "elapsed_time": "6h 47m 4s", "remaining_time": "19h 4m 39s"}
{"loss": 0.09252657, "token_acc": 0.9639866, "grad_norm": 3.546875, "learning_rate": 9.196e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020679, "epoch": 1.32186892, "global_step/max_steps": "510/1925", "percentage": "26.49%", "elapsed_time": "6h 51m 0s", "remaining_time": "19h 0m 20s"}
{"loss": 0.08763872, "token_acc": 0.97364086, "grad_norm": 1.4765625, "learning_rate": 9.171e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020688, "epoch": 1.3348475, "global_step/max_steps": "515/1925", "percentage": "26.75%", "elapsed_time": "6h 54m 50s", "remaining_time": "18h 55m 47s"}
{"loss": 0.08335199, "token_acc": 0.96964725, "grad_norm": 1.9609375, "learning_rate": 9.146e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020699, "epoch": 1.34782609, "global_step/max_steps": "520/1925", "percentage": "27.01%", "elapsed_time": "6h 58m 38s", "remaining_time": "18h 51m 9s"}
{"loss": 0.08465139, "token_acc": 0.97540288, "grad_norm": 1.734375, "learning_rate": 9.12e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.36080467, "global_step/max_steps": "525/1925", "percentage": "27.27%", "elapsed_time": "7h 2m 48s", "remaining_time": "18h 47m 30s"}
{"loss": 0.08926347, "token_acc": 0.96908939, "grad_norm": 2.21875, "learning_rate": 9.095e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020699, "epoch": 1.37378326, "global_step/max_steps": "530/1925", "percentage": "27.53%", "elapsed_time": "7h 6m 42s", "remaining_time": "18h 43m 8s"}
{"loss": 0.07581965, "token_acc": 0.97261411, "grad_norm": 2.28125, "learning_rate": 9.068e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020705, "epoch": 1.38676184, "global_step/max_steps": "535/1925", "percentage": "27.79%", "elapsed_time": "7h 10m 35s", "remaining_time": "18h 38m 44s"}
{"loss": 0.08940108, "token_acc": 0.97306397, "grad_norm": 1.5, "learning_rate": 9.042e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020709, "epoch": 1.39974043, "global_step/max_steps": "540/1925", "percentage": "28.05%", "elapsed_time": "7h 14m 32s", "remaining_time": "18h 34m 31s"}
{"loss": 0.1053141, "token_acc": 0.96568627, "grad_norm": 3.90625, "learning_rate": 9.015e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020718, "epoch": 1.41271901, "global_step/max_steps": "545/1925", "percentage": "28.31%", "elapsed_time": "7h 18m 22s", "remaining_time": "18h 30m 1s"}
{"loss": 0.10772655, "token_acc": 0.96706081, "grad_norm": 1.09375, "learning_rate": 8.988e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020719, "epoch": 1.4256976, "global_step/max_steps": "550/1925", "percentage": "28.57%", "elapsed_time": "7h 22m 22s", "remaining_time": "18h 25m 56s"}
{"loss": 0.09822122, "token_acc": 0.96982055, "grad_norm": 0.9921875, "learning_rate": 8.96e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020733, "epoch": 1.43867618, "global_step/max_steps": "555/1925", "percentage": "28.83%", "elapsed_time": "7h 26m 6s", "remaining_time": "18h 21m 11s"}
{"loss": 0.08714378, "token_acc": 0.96698873, "grad_norm": 3.953125, "learning_rate": 8.932e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020744, "epoch": 1.45165477, "global_step/max_steps": "560/1925", "percentage": "29.09%", "elapsed_time": "7h 29m 52s", "remaining_time": "18h 16m 34s"}
{"loss": 0.1327032, "token_acc": 0.95986896, "grad_norm": 3.6875, "learning_rate": 8.904e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020746, "epoch": 1.46463335, "global_step/max_steps": "565/1925", "percentage": "29.35%", "elapsed_time": "7h 33m 51s", "remaining_time": "18h 12m 27s"}
{"loss": 0.08868689, "token_acc": 0.96795399, "grad_norm": 2.4375, "learning_rate": 8.876e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020754, "epoch": 1.47761194, "global_step/max_steps": "570/1925", "percentage": "29.61%", "elapsed_time": "7h 37m 41s", "remaining_time": "18h 8m 2s"}
{"loss": 0.07921121, "token_acc": 0.97471452, "grad_norm": 2.296875, "learning_rate": 8.847e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020753, "epoch": 1.49059053, "global_step/max_steps": "575/1925", "percentage": "29.87%", "elapsed_time": "7h 41m 44s", "remaining_time": "18h 4m 5s"}
{"loss": 0.0657312, "token_acc": 0.97983871, "grad_norm": 0.96875, "learning_rate": 8.818e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020759, "epoch": 1.50356911, "global_step/max_steps": "580/1925", "percentage": "30.13%", "elapsed_time": "7h 45m 36s", "remaining_time": "17h 59m 44s"}
{"loss": 0.0668935, "token_acc": 0.9744856, "grad_norm": 1.2265625, "learning_rate": 8.788e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020758, "epoch": 1.5165477, "global_step/max_steps": "585/1925", "percentage": "30.39%", "elapsed_time": "7h 49m 39s", "remaining_time": "17h 55m 47s"}
{"loss": 0.05711462, "token_acc": 0.97848606, "grad_norm": 2.03125, "learning_rate": 8.759e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02076, "epoch": 1.52952628, "global_step/max_steps": "590/1925", "percentage": "30.65%", "elapsed_time": "7h 53m 37s", "remaining_time": "17h 51m 40s"}
{"loss": 0.09383676, "token_acc": 0.97125951, "grad_norm": 1.546875, "learning_rate": 8.729e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.54250487, "global_step/max_steps": "595/1925", "percentage": "30.91%", "elapsed_time": "7h 57m 43s", "remaining_time": "17h 47m 52s"}
{"loss": 0.08457305, "token_acc": 0.97004049, "grad_norm": 1.140625, "learning_rate": 8.698e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020755, "epoch": 1.55548345, "global_step/max_steps": "600/1925", "percentage": "31.17%", "elapsed_time": "8h 1m 45s", "remaining_time": "17h 43m 53s"}
{"loss": 0.05290308, "token_acc": 0.98300971, "grad_norm": 1.171875, "learning_rate": 8.668e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.56846204, "global_step/max_steps": "605/1925", "percentage": "31.43%", "elapsed_time": "8h 5m 46s", "remaining_time": "17h 39m 51s"}
{"loss": 0.08796421, "token_acc": 0.97235772, "grad_norm": 3.734375, "learning_rate": 8.637e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020761, "epoch": 1.58144062, "global_step/max_steps": "610/1925", "percentage": "31.69%", "elapsed_time": "8h 9m 39s", "remaining_time": "17h 35m 34s"}
{"loss": 0.05958071, "token_acc": 0.98122449, "grad_norm": 1.1875, "learning_rate": 8.605e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020757, "epoch": 1.59441921, "global_step/max_steps": "615/1925", "percentage": "31.95%", "elapsed_time": "8h 13m 45s", "remaining_time": "17h 31m 45s"}
{"loss": 0.15476706, "token_acc": 0.95899582, "grad_norm": 14.3125, "learning_rate": 8.574e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020758, "epoch": 1.60739779, "global_step/max_steps": "620/1925", "percentage": "32.21%", "elapsed_time": "8h 17m 45s", "remaining_time": "17h 27m 42s"}
{"loss": 0.08144988, "token_acc": 0.97779605, "grad_norm": 1.484375, "learning_rate": 8.542e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020753, "epoch": 1.62037638, "global_step/max_steps": "625/1925", "percentage": "32.47%", "elapsed_time": "8h 21m 53s", "remaining_time": "17h 23m 55s"}
{"loss": 0.1207248, "token_acc": 0.96491228, "grad_norm": 2.375, "learning_rate": 8.51e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.63335496, "global_step/max_steps": "630/1925", "percentage": "32.73%", "elapsed_time": "8h 25m 49s", "remaining_time": "17h 19m 44s"}
{"loss": 0.05741075, "token_acc": 0.98055105, "grad_norm": 1.171875, "learning_rate": 8.477e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020755, "epoch": 1.64633355, "global_step/max_steps": "635/1925", "percentage": "32.99%", "elapsed_time": "8h 29m 51s", "remaining_time": "17h 15m 47s"}
{"loss": 0.08364247, "token_acc": 0.97545008, "grad_norm": 2.890625, "learning_rate": 8.445e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020763, "epoch": 1.65931213, "global_step/max_steps": "640/1925", "percentage": "33.25%", "elapsed_time": "8h 33m 41s", "remaining_time": "17h 11m 23s"}
{"loss": 0.07338098, "token_acc": 0.97634584, "grad_norm": 1.4296875, "learning_rate": 8.412e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020764, "epoch": 1.67229072, "global_step/max_steps": "645/1925", "percentage": "33.51%", "elapsed_time": "8h 37m 39s", "remaining_time": "17h 7m 18s"}
{"loss": 0.06728041, "token_acc": 0.98004988, "grad_norm": 2.375, "learning_rate": 8.378e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020765, "epoch": 1.68526931, "global_step/max_steps": "650/1925", "percentage": "33.77%", "elapsed_time": "8h 41m 40s", "remaining_time": "17h 3m 16s"}
{"loss": 0.09220197, "token_acc": 0.97190083, "grad_norm": 3.328125, "learning_rate": 8.345e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020763, "epoch": 1.69824789, "global_step/max_steps": "655/1925", "percentage": "34.03%", "elapsed_time": "8h 45m 43s", "remaining_time": "16h 59m 19s"}
{"loss": 0.09654331, "token_acc": 0.96903015, "grad_norm": 2.109375, "learning_rate": 8.311e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020766, "epoch": 1.71122648, "global_step/max_steps": "660/1925", "percentage": "34.29%", "elapsed_time": "8h 49m 40s", "remaining_time": "16h 55m 12s"}
{"loss": 0.09199823, "token_acc": 0.97440273, "grad_norm": 1.609375, "learning_rate": 8.277e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020764, "epoch": 1.72420506, "global_step/max_steps": "665/1925", "percentage": "34.55%", "elapsed_time": "8h 53m 43s", "remaining_time": "16h 51m 15s"}
{"loss": 0.08355655, "token_acc": 0.9717608, "grad_norm": 1.9609375, "learning_rate": 8.242e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020765, "epoch": 1.73718365, "global_step/max_steps": "670/1925", "percentage": "34.81%", "elapsed_time": "8h 57m 42s", "remaining_time": "16h 47m 12s"}
{"loss": 0.07680344, "token_acc": 0.97847682, "grad_norm": 1.3671875, "learning_rate": 8.208e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020769, "epoch": 1.75016223, "global_step/max_steps": "675/1925", "percentage": "35.06%", "elapsed_time": "9h 1m 38s", "remaining_time": "16h 43m 2s"}
{"loss": 0.05557376, "token_acc": 0.9831508, "grad_norm": 1.1328125, "learning_rate": 8.173e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02077, "epoch": 1.76314082, "global_step/max_steps": "680/1925", "percentage": "35.32%", "elapsed_time": "9h 5m 36s", "remaining_time": "16h 38m 57s"}
{"loss": 0.08825137, "token_acc": 0.97527707, "grad_norm": 3.46875, "learning_rate": 8.138e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020769, "epoch": 1.7761194, "global_step/max_steps": "685/1925", "percentage": "35.58%", "elapsed_time": "9h 9m 38s", "remaining_time": "16h 34m 58s"}
{"loss": 0.10723648, "token_acc": 0.97135843, "grad_norm": 2.96875, "learning_rate": 8.102e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020778, "epoch": 1.78909799, "global_step/max_steps": "690/1925", "percentage": "35.84%", "elapsed_time": "9h 13m 25s", "remaining_time": "16h 30m 33s"}
{"loss": 0.07498295, "token_acc": 0.97770438, "grad_norm": 1.5078125, "learning_rate": 8.067e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020784, "epoch": 1.80207657, "global_step/max_steps": "695/1925", "percentage": "36.10%", "elapsed_time": "9h 17m 17s", "remaining_time": "16h 26m 16s"}
{"loss": 0.06255127, "token_acc": 0.97886179, "grad_norm": 2.84375, "learning_rate": 8.031e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020786, "epoch": 1.81505516, "global_step/max_steps": "700/1925", "percentage": "36.36%", "elapsed_time": "9h 21m 13s", "remaining_time": "16h 22m 8s"}
{"loss": 0.09357862, "token_acc": 0.97197032, "grad_norm": 2.65625, "learning_rate": 7.994e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02079, "epoch": 1.82803374, "global_step/max_steps": "705/1925", "percentage": "36.62%", "elapsed_time": "9h 25m 7s", "remaining_time": "16h 17m 57s"}
{"loss": 0.04933436, "token_acc": 0.984375, "grad_norm": 2.078125, "learning_rate": 7.958e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020784, "epoch": 1.84101233, "global_step/max_steps": "710/1925", "percentage": "36.88%", "elapsed_time": "9h 29m 17s", "remaining_time": "16h 14m 13s"}
{"loss": 0.04773304, "token_acc": 0.98353909, "grad_norm": 1.2421875, "learning_rate": 7.921e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020777, "epoch": 1.85399091, "global_step/max_steps": "715/1925", "percentage": "37.14%", "elapsed_time": "9h 33m 30s", "remaining_time": "16h 10m 32s"}
{"loss": 0.10059412, "token_acc": 0.96901173, "grad_norm": 2.40625, "learning_rate": 7.884e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020775, "epoch": 1.8669695, "global_step/max_steps": "720/1925", "percentage": "37.40%", "elapsed_time": "9h 37m 34s", "remaining_time": "16h 6m 37s"}
{"loss": 0.07531643, "token_acc": 0.97727273, "grad_norm": 2.65625, "learning_rate": 7.847e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020777, "epoch": 1.87994809, "global_step/max_steps": "725/1925", "percentage": "37.66%", "elapsed_time": "9h 41m 32s", "remaining_time": "16h 2m 32s"}
{"loss": 0.06520426, "token_acc": 0.97863599, "grad_norm": 3.03125, "learning_rate": 7.81e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020783, "epoch": 1.89292667, "global_step/max_steps": "730/1925", "percentage": "37.92%", "elapsed_time": "9h 45m 21s", "remaining_time": "15h 58m 14s"}
{"loss": 0.08142446, "token_acc": 0.96934548, "grad_norm": 1.9609375, "learning_rate": 7.772e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020794, "epoch": 1.90590526, "global_step/max_steps": "735/1925", "percentage": "38.18%", "elapsed_time": "9h 49m 4s", "remaining_time": "15h 53m 44s"}
{"loss": 0.07234653, "token_acc": 0.97886179, "grad_norm": 1.03125, "learning_rate": 7.734e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020798, "epoch": 1.91888384, "global_step/max_steps": "740/1925", "percentage": "38.44%", "elapsed_time": "9h 52m 57s", "remaining_time": "15h 49m 32s"}
{"loss": 0.07108656, "token_acc": 0.97510373, "grad_norm": 2.03125, "learning_rate": 7.696e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020799, "epoch": 1.93186243, "global_step/max_steps": "745/1925", "percentage": "38.70%", "elapsed_time": "9h 56m 56s", "remaining_time": "15h 45m 29s"}
{"loss": 0.06093554, "token_acc": 0.97783251, "grad_norm": 2.125, "learning_rate": 7.658e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020808, "epoch": 1.94484101, "global_step/max_steps": "750/1925", "percentage": "38.96%", "elapsed_time": "10h 0m 40s", "remaining_time": "15h 41m 3s"}
{"loss": 0.0692849, "token_acc": 0.97359736, "grad_norm": 1.28125, "learning_rate": 7.619e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020817, "epoch": 1.9578196, "global_step/max_steps": "755/1925", "percentage": "39.22%", "elapsed_time": "10h 4m 26s", "remaining_time": "15h 36m 41s"}
{"loss": 0.07247485, "token_acc": 0.97133221, "grad_norm": 1.7734375, "learning_rate": 7.581e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020819, "epoch": 1.97079818, "global_step/max_steps": "760/1925", "percentage": "39.48%", "elapsed_time": "10h 8m 21s", "remaining_time": "15h 32m 33s"}
{"loss": 0.07819902, "token_acc": 0.97591362, "grad_norm": 0.9375, "learning_rate": 7.542e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02082, "epoch": 1.98377677, "global_step/max_steps": "765/1925", "percentage": "39.74%", "elapsed_time": "10h 12m 20s", "remaining_time": "15h 28m 30s"}
{"loss": 0.06084737, "token_acc": 0.97836938, "grad_norm": 1.5859375, "learning_rate": 7.503e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020823, "epoch": 1.99675535, "global_step/max_steps": "770/1925", "percentage": "40.00%", "elapsed_time": "10h 16m 16s", "remaining_time": "15h 24m 24s"}
{"loss": 0.0481132, "token_acc": 0.98098098, "grad_norm": 1.625, "learning_rate": 7.463e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020841, "epoch": 2.00778715, "global_step/max_steps": "775/1925", "percentage": "40.26%", "elapsed_time": "10h 19m 43s", "remaining_time": "15h 19m 36s"}
{"loss": 0.04549453, "token_acc": 0.9852338, "grad_norm": 0.42773438, "learning_rate": 7.424e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020848, "epoch": 2.02076574, "global_step/max_steps": "780/1925", "percentage": "40.52%", "elapsed_time": "10h 23m 31s", "remaining_time": "15h 15m 18s"}
{"loss": 0.0581861, "token_acc": 0.97941681, "grad_norm": 5.78125, "learning_rate": 7.384e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020848, "epoch": 2.03374432, "global_step/max_steps": "785/1925", "percentage": "40.78%", "elapsed_time": "10h 27m 29s", "remaining_time": "15h 11m 16s"}
{"loss": 0.03766953, "token_acc": 0.99000833, "grad_norm": 0.890625, "learning_rate": 7.344e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020846, "epoch": 2.04672291, "global_step/max_steps": "790/1925", "percentage": "41.04%", "elapsed_time": "10h 31m 33s", "remaining_time": "15h 7m 21s"}
{"loss": 0.09385979, "token_acc": 0.97359736, "grad_norm": 2.9375, "learning_rate": 7.304e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020851, "epoch": 2.05970149, "global_step/max_steps": "795/1925", "percentage": "41.30%", "elapsed_time": "10h 35m 24s", "remaining_time": "15h 3m 9s"}
{"loss": 0.05340881, "token_acc": 0.98535395, "grad_norm": 1.3359375, "learning_rate": 7.263e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020854, "epoch": 2.07268008, "global_step/max_steps": "800/1925", "percentage": "41.56%", "elapsed_time": "10h 39m 18s", "remaining_time": "14h 59m 1s"}
{"loss": 0.05104266, "token_acc": 0.9834574, "grad_norm": 1.0625, "learning_rate": 7.223e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020863, "epoch": 2.08565866, "global_step/max_steps": "805/1925", "percentage": "41.82%", "elapsed_time": "10h 43m 2s", "remaining_time": "14h 54m 39s"}
{"loss": 0.05211966, "token_acc": 0.98547215, "grad_norm": 1.828125, "learning_rate": 7.182e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020866, "epoch": 2.09863725, "global_step/max_steps": "810/1925", "percentage": "42.08%", "elapsed_time": "10h 46m 55s", "remaining_time": "14h 50m 31s"}
{"loss": 0.0611392, "token_acc": 0.98163606, "grad_norm": 1.96875, "learning_rate": 7.141e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02087, "epoch": 2.11161583, "global_step/max_steps": "815/1925", "percentage": "42.34%", "elapsed_time": "10h 50m 48s", "remaining_time": "14h 46m 23s"}
{"loss": 0.06530061, "token_acc": 0.98088113, "grad_norm": 1.3125, "learning_rate": 7.1e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020872, "epoch": 2.12459442, "global_step/max_steps": "820/1925", "percentage": "42.60%", "elapsed_time": "10h 54m 44s", "remaining_time": "14h 42m 17s"}
{"loss": 0.07137448, "token_acc": 0.97718011, "grad_norm": 2.0, "learning_rate": 7.059e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020874, "epoch": 2.137573, "global_step/max_steps": "825/1925", "percentage": "42.86%", "elapsed_time": "10h 58m 39s", "remaining_time": "14h 38m 12s"}
{"loss": 0.0458478, "token_acc": 0.98242678, "grad_norm": 1.4609375, "learning_rate": 7.018e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.15055159, "global_step/max_steps": "830/1925", "percentage": "43.12%", "elapsed_time": "11h 2m 37s", "remaining_time": "14h 34m 10s"}
{"loss": 0.04792321, "token_acc": 0.98754153, "grad_norm": 1.2265625, "learning_rate": 6.976e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.16353018, "global_step/max_steps": "835/1925", "percentage": "43.38%", "elapsed_time": "11h 6m 37s", "remaining_time": "14h 30m 12s"}
{"loss": 0.04856709, "token_acc": 0.9852459, "grad_norm": 1.125, "learning_rate": 6.934e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020879, "epoch": 2.17650876, "global_step/max_steps": "840/1925", "percentage": "43.64%", "elapsed_time": "11h 10m 29s", "remaining_time": "14h 26m 2s"}
{"loss": 0.05035427, "token_acc": 0.9832636, "grad_norm": 1.4453125, "learning_rate": 6.892e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020881, "epoch": 2.18948735, "global_step/max_steps": "845/1925", "percentage": "43.90%", "elapsed_time": "11h 14m 24s", "remaining_time": "14h 21m 57s"}
{"loss": 0.03542698, "token_acc": 0.98844884, "grad_norm": 1.5703125, "learning_rate": 6.85e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02088, "epoch": 2.20246593, "global_step/max_steps": "850/1925", "percentage": "44.16%", "elapsed_time": "11h 18m 25s", "remaining_time": "14h 18m 0s"}
{"loss": 0.03674183, "token_acc": 0.98592715, "grad_norm": 1.8515625, "learning_rate": 6.808e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020878, "epoch": 2.21544452, "global_step/max_steps": "855/1925", "percentage": "44.42%", "elapsed_time": "11h 22m 28s", "remaining_time": "14h 14m 5s"}
{"loss": 0.0341266, "token_acc": 0.98595041, "grad_norm": 0.40820312, "learning_rate": 6.766e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020878, "epoch": 2.2284231, "global_step/max_steps": "860/1925", "percentage": "44.68%", "elapsed_time": "11h 26m 28s", "remaining_time": "14h 10m 6s"}
{"loss": 0.04752223, "token_acc": 0.98583333, "grad_norm": 0.8828125, "learning_rate": 6.723e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.24140169, "global_step/max_steps": "865/1925", "percentage": "44.94%", "elapsed_time": "11h 30m 34s", "remaining_time": "14h 6m 15s"}
{"loss": 0.04341579, "token_acc": 0.98870056, "grad_norm": 0.6953125, "learning_rate": 6.681e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020876, "epoch": 2.25438027, "global_step/max_steps": "870/1925", "percentage": "45.19%", "elapsed_time": "11h 34m 31s", "remaining_time": "14h 2m 12s"}
{"loss": 0.04864708, "token_acc": 0.98370008, "grad_norm": 1.25, "learning_rate": 6.638e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020885, "epoch": 2.26735886, "global_step/max_steps": "875/1925", "percentage": "45.45%", "elapsed_time": "11h 38m 13s", "remaining_time": "13h 57m 51s"}
{"loss": 0.02881385, "token_acc": 0.99286846, "grad_norm": 0.7421875, "learning_rate": 6.595e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02089, "epoch": 2.28033744, "global_step/max_steps": "880/1925", "percentage": "45.71%", "elapsed_time": "11h 42m 3s", "remaining_time": "13h 53m 41s"}
{"loss": 0.03752846, "token_acc": 0.98926507, "grad_norm": 1.0625, "learning_rate": 6.552e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020893, "epoch": 2.29331603, "global_step/max_steps": "885/1925", "percentage": "45.97%", "elapsed_time": "11h 45m 56s", "remaining_time": "13h 49m 34s"}
{"loss": 0.04644352, "token_acc": 0.98771499, "grad_norm": 0.70703125, "learning_rate": 6.509e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.30629461, "global_step/max_steps": "890/1925", "percentage": "46.23%", "elapsed_time": "11h 49m 45s", "remaining_time": "13h 45m 23s"}
{"loss": 0.04913321, "token_acc": 0.97952218, "grad_norm": 2.21875, "learning_rate": 6.466e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.3192732, "global_step/max_steps": "895/1925", "percentage": "46.49%", "elapsed_time": "11h 53m 44s", "remaining_time": "13h 41m 24s"}
{"loss": 0.03812339, "token_acc": 0.98693878, "grad_norm": 1.59375, "learning_rate": 6.422e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020893, "epoch": 2.33225178, "global_step/max_steps": "900/1925", "percentage": "46.75%", "elapsed_time": "11h 57m 54s", "remaining_time": "13h 37m 36s"}
{"loss": 0.04143717, "token_acc": 0.98507463, "grad_norm": 1.5625, "learning_rate": 6.379e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020891, "epoch": 2.34523037, "global_step/max_steps": "905/1925", "percentage": "47.01%", "elapsed_time": "12h 1m 57s", "remaining_time": "13h 33m 41s"}
{"loss": 0.05560383, "token_acc": 0.98270181, "grad_norm": 1.6875, "learning_rate": 6.335e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.35820896, "global_step/max_steps": "910/1925", "percentage": "47.27%", "elapsed_time": "12h 5m 40s", "remaining_time": "13h 29m 24s"}
{"loss": 0.04442576, "token_acc": 0.98336106, "grad_norm": 1.7890625, "learning_rate": 6.291e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020901, "epoch": 2.37118754, "global_step/max_steps": "915/1925", "percentage": "47.53%", "elapsed_time": "12h 9m 34s", "remaining_time": "13h 25m 19s"}
{"loss": 0.05044782, "token_acc": 0.98527005, "grad_norm": 0.8359375, "learning_rate": 6.247e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020905, "epoch": 2.38416613, "global_step/max_steps": "920/1925", "percentage": "47.79%", "elapsed_time": "12h 13m 25s", "remaining_time": "13h 21m 10s"}
{"loss": 0.02263379, "token_acc": 0.99333888, "grad_norm": 0.5, "learning_rate": 6.203e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.39714471, "global_step/max_steps": "925/1925", "percentage": "48.05%", "elapsed_time": "12h 17m 28s", "remaining_time": "13h 17m 15s"}
{"loss": 0.04340067, "token_acc": 0.98450245, "grad_norm": 1.2109375, "learning_rate": 6.159e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.4101233, "global_step/max_steps": "930/1925", "percentage": "48.31%", "elapsed_time": "12h 21m 21s", "remaining_time": "13h 13m 9s"}
{"loss": 0.05528547, "token_acc": 0.97860082, "grad_norm": 1.2890625, "learning_rate": 6.115e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020909, "epoch": 2.42310188, "global_step/max_steps": "935/1925", "percentage": "48.57%", "elapsed_time": "12h 25m 14s", "remaining_time": "13h 9m 5s"}
{"loss": 0.04441152, "token_acc": 0.98830409, "grad_norm": 1.25, "learning_rate": 6.071e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.43608047, "global_step/max_steps": "940/1925", "percentage": "48.83%", "elapsed_time": "12h 29m 10s", "remaining_time": "13h 5m 2s"}
{"loss": 0.05162731, "token_acc": 0.98116298, "grad_norm": 1.6015625, "learning_rate": 6.027e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020914, "epoch": 2.44905905, "global_step/max_steps": "945/1925", "percentage": "49.09%", "elapsed_time": "12h 33m 1s", "remaining_time": "13h 0m 54s"}
{"loss": 0.05563767, "token_acc": 0.98361998, "grad_norm": 1.0546875, "learning_rate": 5.982e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.46203764, "global_step/max_steps": "950/1925", "percentage": "49.35%", "elapsed_time": "12h 37m 8s", "remaining_time": "12h 57m 3s"}
{"loss": 0.04207773, "token_acc": 0.98694943, "grad_norm": 5.96875, "learning_rate": 5.938e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.47501622, "global_step/max_steps": "955/1925", "percentage": "49.61%", "elapsed_time": "12h 41m 6s", "remaining_time": "12h 53m 3s"}
{"loss": 0.04367494, "token_acc": 0.98591549, "grad_norm": 1.1875, "learning_rate": 5.893e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020917, "epoch": 2.48799481, "global_step/max_steps": "960/1925", "percentage": "49.87%", "elapsed_time": "12h 44m 52s", "remaining_time": "12h 48m 51s"}
{"loss": 0.03488151, "token_acc": 0.98993289, "grad_norm": 0.41601562, "learning_rate": 5.848e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.50097339, "global_step/max_steps": "965/1925", "percentage": "50.13%", "elapsed_time": "12h 48m 44s", "remaining_time": "12h 44m 45s"}
{"loss": 0.02391402, "token_acc": 0.98954143, "grad_norm": 0.73828125, "learning_rate": 5.804e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.51395198, "global_step/max_steps": "970/1925", "percentage": "50.39%", "elapsed_time": "12h 52m 43s", "remaining_time": "12h 40m 46s"}
{"loss": 0.05280606, "token_acc": 0.98503741, "grad_norm": 1.359375, "learning_rate": 5.759e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020918, "epoch": 2.52693056, "global_step/max_steps": "975/1925", "percentage": "50.65%", "elapsed_time": "12h 56m 47s", "remaining_time": "12h 36m 52s"}
{"loss": 0.04982931, "token_acc": 0.98083333, "grad_norm": 1.1796875, "learning_rate": 5.714e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020914, "epoch": 2.53990915, "global_step/max_steps": "980/1925", "percentage": "50.91%", "elapsed_time": "13h 0m 56s", "remaining_time": "12h 33m 2s"}
{"loss": 0.03910037, "token_acc": 0.98690671, "grad_norm": 1.234375, "learning_rate": 5.669e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.55288774, "global_step/max_steps": "985/1925", "percentage": "51.17%", "elapsed_time": "13h 4m 42s", "remaining_time": "12h 28m 51s"}
{"loss": 0.03974202, "token_acc": 0.98740554, "grad_norm": 0.5234375, "learning_rate": 5.624e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020921, "epoch": 2.56586632, "global_step/max_steps": "990/1925", "percentage": "51.43%", "elapsed_time": "13h 8m 37s", "remaining_time": "12h 24m 49s"}
{"loss": 0.04704872, "token_acc": 0.98544867, "grad_norm": 1.625, "learning_rate": 5.579e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020919, "epoch": 2.57884491, "global_step/max_steps": "995/1925", "percentage": "51.69%", "elapsed_time": "13h 12m 42s", "remaining_time": "12h 20m 54s"}
{"loss": 0.02566139, "token_acc": 0.989318, "grad_norm": 0.640625, "learning_rate": 5.534e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020923, "epoch": 2.59182349, "global_step/max_steps": "1000/1925", "percentage": "51.95%", "elapsed_time": "13h 16m 32s", "remaining_time": "12h 16m 47s"}
{"loss": 0.04738644, "token_acc": 0.98817568, "grad_norm": 1.453125, "learning_rate": 5.489e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020894, "epoch": 2.60480208, "global_step/max_steps": "1005/1925", "percentage": "52.21%", "elapsed_time": "13h 21m 38s", "remaining_time": "12h 13m 50s"}
{"loss": 0.03394437, "token_acc": 0.98906644, "grad_norm": 1.7421875, "learning_rate": 5.444e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.61778066, "global_step/max_steps": "1010/1925", "percentage": "52.47%", "elapsed_time": "13h 25m 27s", "remaining_time": "12h 9m 42s"}
{"loss": 0.04944002, "token_acc": 0.98469388, "grad_norm": 2.09375, "learning_rate": 5.399e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020897, "epoch": 2.63075925, "global_step/max_steps": "1015/1925", "percentage": "52.73%", "elapsed_time": "13h 29m 29s", "remaining_time": "12h 5m 44s"}
{"loss": 0.03849258, "token_acc": 0.9902439, "grad_norm": 0.8046875, "learning_rate": 5.353e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.0209, "epoch": 2.64373783, "global_step/max_steps": "1020/1925", "percentage": "52.99%", "elapsed_time": "13h 33m 21s", "remaining_time": "12h 1m 39s"}
{"loss": 0.02624518, "token_acc": 0.98796148, "grad_norm": 0.734375, "learning_rate": 5.308e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.0209, "epoch": 2.65671642, "global_step/max_steps": "1025/1925", "percentage": "53.25%", "elapsed_time": "13h 37m 20s", "remaining_time": "11h 57m 40s"}
{"loss": 0.03164054, "token_acc": 0.98792271, "grad_norm": 1.2421875, "learning_rate": 5.263e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020902, "epoch": 2.669695, "global_step/max_steps": "1030/1925", "percentage": "53.51%", "elapsed_time": "13h 41m 15s", "remaining_time": "11h 53m 36s"}
{"loss": 0.03785187, "token_acc": 0.98830409, "grad_norm": 1.09375, "learning_rate": 5.218e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.68267359, "global_step/max_steps": "1035/1925", "percentage": "53.77%", "elapsed_time": "13h 45m 10s", "remaining_time": "11h 49m 34s"}
{"loss": 0.02233656, "token_acc": 0.98949919, "grad_norm": 1.1328125, "learning_rate": 5.172e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.69565217, "global_step/max_steps": "1040/1925", "percentage": "54.03%", "elapsed_time": "13h 49m 8s", "remaining_time": "11h 45m 34s"}
{"loss": 0.02416084, "token_acc": 0.99183007, "grad_norm": 0.8671875, "learning_rate": 5.127e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020901, "epoch": 2.70863076, "global_step/max_steps": "1045/1925", "percentage": "54.29%", "elapsed_time": "13h 53m 13s", "remaining_time": "11h 41m 40s"}
{"loss": 0.03095039, "token_acc": 0.99185004, "grad_norm": 1.3125, "learning_rate": 5.082e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.72160934, "global_step/max_steps": "1050/1925", "percentage": "54.55%", "elapsed_time": "13h 57m 1s", "remaining_time": "11h 37m 31s"}
{"loss": 0.03349081, "token_acc": 0.9894907, "grad_norm": 1.59375, "learning_rate": 5.036e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020909, "epoch": 2.73458793, "global_step/max_steps": "1055/1925", "percentage": "54.81%", "elapsed_time": "14h 0m 54s", "remaining_time": "11h 33m 27s"}
{"loss": 0.03164245, "token_acc": 0.9892916, "grad_norm": 1.2421875, "learning_rate": 4.991e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.74756652, "global_step/max_steps": "1060/1925", "percentage": "55.06%", "elapsed_time": "14h 5m 0s", "remaining_time": "11h 29m 33s"}
{"loss": 0.03462515, "token_acc": 0.98940505, "grad_norm": 0.46679688, "learning_rate": 4.946e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020908, "epoch": 2.7605451, "global_step/max_steps": "1065/1925", "percentage": "55.32%", "elapsed_time": "14h 8m 54s", "remaining_time": "11h 25m 30s"}
{"loss": 0.02611087, "token_acc": 0.98860862, "grad_norm": 0.640625, "learning_rate": 4.9e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.77352369, "global_step/max_steps": "1070/1925", "percentage": "55.58%", "elapsed_time": "14h 12m 58s", "remaining_time": "11h 21m 34s"}
{"loss": 0.02528858, "token_acc": 0.99254967, "grad_norm": 0.47070312, "learning_rate": 4.855e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.78650227, "global_step/max_steps": "1075/1925", "percentage": "55.84%", "elapsed_time": "14h 17m 1s", "remaining_time": "11h 17m 39s"}
{"loss": 0.01707041, "token_acc": 0.99248748, "grad_norm": 0.63671875, "learning_rate": 4.81e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.79948086, "global_step/max_steps": "1080/1925", "percentage": "56.10%", "elapsed_time": "14h 21m 2s", "remaining_time": "11h 13m 41s"}
{"loss": 0.03686348, "token_acc": 0.98757249, "grad_norm": 2.03125, "learning_rate": 4.764e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020902, "epoch": 2.81245944, "global_step/max_steps": "1085/1925", "percentage": "56.36%", "elapsed_time": "14h 25m 6s", "remaining_time": "11h 9m 45s"}
{"loss": 0.04762331, "token_acc": 0.98738436, "grad_norm": 1.5546875, "learning_rate": 4.719e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.82543803, "global_step/max_steps": "1090/1925", "percentage": "56.62%", "elapsed_time": "14h 29m 12s", "remaining_time": "11h 5m 51s"}
{"loss": 0.03311816, "token_acc": 0.99013969, "grad_norm": 1.28125, "learning_rate": 4.674e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.83841661, "global_step/max_steps": "1095/1925", "percentage": "56.88%", "elapsed_time": "14h 33m 10s", "remaining_time": "11h 1m 51s"}
{"loss": 0.06551987, "token_acc": 0.98265896, "grad_norm": 1.078125, "learning_rate": 4.629e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020901, "epoch": 2.8513952, "global_step/max_steps": "1100/1925", "percentage": "57.14%", "elapsed_time": "14h 37m 5s", "remaining_time": "10h 57m 48s"}
{"loss": 0.04220965, "token_acc": 0.98765432, "grad_norm": 1.203125, "learning_rate": 4.583e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020908, "epoch": 2.86437378, "global_step/max_steps": "1105/1925", "percentage": "57.40%", "elapsed_time": "14h 40m 48s", "remaining_time": "10h 53m 37s"}
{"loss": 0.03140734, "token_acc": 0.98825503, "grad_norm": 0.41210938, "learning_rate": 4.538e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02091, "epoch": 2.87735237, "global_step/max_steps": "1110/1925", "percentage": "57.66%", "elapsed_time": "14h 44m 42s", "remaining_time": "10h 49m 34s"}
{"loss": 0.05650691, "token_acc": 0.97607261, "grad_norm": 1.46875, "learning_rate": 4.493e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020913, "epoch": 2.89033095, "global_step/max_steps": "1115/1925", "percentage": "57.92%", "elapsed_time": "14h 48m 32s", "remaining_time": "10h 45m 29s"}
{"loss": 0.03302388, "token_acc": 0.98866397, "grad_norm": 0.875, "learning_rate": 4.448e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020919, "epoch": 2.90330954, "global_step/max_steps": "1120/1925", "percentage": "58.18%", "elapsed_time": "14h 52m 16s", "remaining_time": "10h 41m 19s"}
{"loss": 0.0436511, "token_acc": 0.98696007, "grad_norm": 1.0546875, "learning_rate": 4.403e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020924, "epoch": 2.91628812, "global_step/max_steps": "1125/1925", "percentage": "58.44%", "elapsed_time": "14h 56m 4s", "remaining_time": "10h 37m 12s"}
{"loss": 0.03166374, "token_acc": 0.99070161, "grad_norm": 0.49023438, "learning_rate": 4.358e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020924, "epoch": 2.92926671, "global_step/max_steps": "1130/1925", "percentage": "58.70%", "elapsed_time": "15h 0m 1s", "remaining_time": "10h 33m 12s"}
{"loss": 0.03244772, "token_acc": 0.98742666, "grad_norm": 0.79296875, "learning_rate": 4.313e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020925, "epoch": 2.9422453, "global_step/max_steps": "1135/1925", "percentage": "58.96%", "elapsed_time": "15h 3m 58s", "remaining_time": "10h 29m 12s"}
{"loss": 0.02605614, "token_acc": 0.98934426, "grad_norm": 1.296875, "learning_rate": 4.268e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020921, "epoch": 2.95522388, "global_step/max_steps": "1140/1925", "percentage": "59.22%", "elapsed_time": "15h 8m 8s", "remaining_time": "10h 25m 20s"}
{"loss": 0.0244123, "token_acc": 0.99077955, "grad_norm": 1.1015625, "learning_rate": 4.223e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020915, "epoch": 2.96820247, "global_step/max_steps": "1145/1925", "percentage": "59.48%", "elapsed_time": "15h 12m 21s", "remaining_time": "10h 21m 31s"}
{"loss": 0.01829095, "token_acc": 0.99187652, "grad_norm": 1.546875, "learning_rate": 4.178e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020917, "epoch": 2.98118105, "global_step/max_steps": "1150/1925", "percentage": "59.74%", "elapsed_time": "15h 16m 16s", "remaining_time": "10h 17m 29s"}
{"loss": 0.02461191, "token_acc": 0.99090157, "grad_norm": 0.3984375, "learning_rate": 4.134e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020918, "epoch": 2.99415964, "global_step/max_steps": "1155/1925", "percentage": "60.00%", "elapsed_time": "15h 20m 14s", "remaining_time": "10h 13m 29s"}
{"loss": 0.02061373, "token_acc": 0.99332061, "grad_norm": 0.4765625, "learning_rate": 4.089e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 3.00519143, "global_step/max_steps": "1160/1925", "percentage": "60.26%", "elapsed_time": "15h 23m 23s", "remaining_time": "10h 8m 57s"}
{"loss": 0.02193851, "token_acc": 0.99256813, "grad_norm": 1.0703125, "learning_rate": 4.045e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 3.01817002, "global_step/max_steps": "1165/1925", "percentage": "60.52%", "elapsed_time": "15h 27m 23s", "remaining_time": "10h 4m 59s"}
{"loss": 0.0101957, "token_acc": 0.99668325, "grad_norm": 0.56640625, "learning_rate": 4e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.0311486, "global_step/max_steps": "1170/1925", "percentage": "60.78%", "elapsed_time": "15h 31m 29s", "remaining_time": "10h 1m 5s"}
{"loss": 0.01060466, "token_acc": 0.99662162, "grad_norm": 0.22460938, "learning_rate": 3.956e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.04412719, "global_step/max_steps": "1175/1925", "percentage": "61.04%", "elapsed_time": "15h 35m 29s", "remaining_time": "9h 57m 7s"}
{"loss": 0.02132482, "token_acc": 0.996724, "grad_norm": 0.38671875, "learning_rate": 3.911e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.05710578, "global_step/max_steps": "1180/1925", "percentage": "61.30%", "elapsed_time": "15h 39m 21s", "remaining_time": "9h 53m 4s"}
{"loss": 0.01656489, "token_acc": 0.99671862, "grad_norm": 0.09814453, "learning_rate": 3.867e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.07008436, "global_step/max_steps": "1185/1925", "percentage": "61.56%", "elapsed_time": "15h 43m 22s", "remaining_time": "9h 49m 6s"}
{"loss": 0.01909568, "token_acc": 0.99235344, "grad_norm": 0.51171875, "learning_rate": 3.823e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.08306295, "global_step/max_steps": "1190/1925", "percentage": "61.82%", "elapsed_time": "15h 47m 25s", "remaining_time": "9h 45m 10s"}
{"loss": 0.00921143, "token_acc": 0.99589491, "grad_norm": 0.61328125, "learning_rate": 3.779e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 3.09604153, "global_step/max_steps": "1195/1925", "percentage": "62.08%", "elapsed_time": "15h 51m 31s", "remaining_time": "9h 41m 16s"}
{"loss": 0.02649273, "token_acc": 0.9933665, "grad_norm": 1.578125, "learning_rate": 3.735e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020932, "epoch": 3.10902012, "global_step/max_steps": "1200/1925", "percentage": "62.34%", "elapsed_time": "15h 55m 26s", "remaining_time": "9h 37m 14s"}
{"loss": 0.01079259, "token_acc": 0.99583333, "grad_norm": 0.66015625, "learning_rate": 3.691e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.1219987, "global_step/max_steps": "1205/1925", "percentage": "62.60%", "elapsed_time": "15h 59m 9s", "remaining_time": "9h 33m 6s"}
{"loss": 0.01512955, "token_acc": 0.99427637, "grad_norm": 1.3515625, "learning_rate": 3.648e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.13497729, "global_step/max_steps": "1210/1925", "percentage": "62.86%", "elapsed_time": "16h 3m 9s", "remaining_time": "9h 29m 8s"}
{"loss": 0.01460842, "token_acc": 0.99497487, "grad_norm": 0.921875, "learning_rate": 3.604e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.14795587, "global_step/max_steps": "1215/1925", "percentage": "63.12%", "elapsed_time": "16h 7m 9s", "remaining_time": "9h 25m 10s"}
{"loss": 0.0109226, "token_acc": 0.99669421, "grad_norm": 0.6875, "learning_rate": 3.561e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020931, "epoch": 3.16093446, "global_step/max_steps": "1220/1925", "percentage": "63.38%", "elapsed_time": "16h 11m 22s", "remaining_time": "9h 21m 19s"}
{"loss": 0.02576659, "token_acc": 0.99267101, "grad_norm": 0.828125, "learning_rate": 3.517e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.17391304, "global_step/max_steps": "1225/1925", "percentage": "63.64%", "elapsed_time": "16h 15m 18s", "remaining_time": "9h 17m 19s"}
{"loss": 0.01006721, "token_acc": 0.99592834, "grad_norm": 0.48632812, "learning_rate": 3.474e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.18689163, "global_step/max_steps": "1230/1925", "percentage": "63.90%", "elapsed_time": "16h 19m 4s", "remaining_time": "9h 13m 13s"}
{"loss": 0.0241868, "token_acc": 0.99342646, "grad_norm": 1.3203125, "learning_rate": 3.431e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.19987021, "global_step/max_steps": "1235/1925", "percentage": "64.16%", "elapsed_time": "16h 23m 3s", "remaining_time": "9h 9m 14s"}
{"loss": 0.02058341, "token_acc": 0.99248748, "grad_norm": 0.33984375, "learning_rate": 3.388e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020941, "epoch": 3.2128488, "global_step/max_steps": "1240/1925", "percentage": "64.42%", "elapsed_time": "16h 26m 50s", "remaining_time": "9h 5m 9s"}
{"loss": 0.03402402, "token_acc": 0.9892916, "grad_norm": 1.359375, "learning_rate": 3.345e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02094, "epoch": 3.22582738, "global_step/max_steps": "1245/1925", "percentage": "64.68%", "elapsed_time": "16h 30m 53s", "remaining_time": "9h 1m 12s"}
{"loss": 0.01100642, "token_acc": 0.99587118, "grad_norm": 0.29296875, "learning_rate": 3.302e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020942, "epoch": 3.23880597, "global_step/max_steps": "1250/1925", "percentage": "64.94%", "elapsed_time": "16h 34m 46s", "remaining_time": "8h 57m 10s"}
{"loss": 0.0154213, "token_acc": 0.99510604, "grad_norm": 0.64453125, "learning_rate": 3.26e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020945, "epoch": 3.25178456, "global_step/max_steps": "1255/1925", "percentage": "65.19%", "elapsed_time": "16h 38m 37s", "remaining_time": "8h 53m 7s"}
{"loss": 0.00781999, "token_acc": 0.99752679, "grad_norm": 0.34960938, "learning_rate": 3.217e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020948, "epoch": 3.26476314, "global_step/max_steps": "1260/1925", "percentage": "65.45%", "elapsed_time": "16h 42m 26s", "remaining_time": "8h 49m 3s"}
{"loss": 0.01458774, "token_acc": 0.99587799, "grad_norm": 0.50390625, "learning_rate": 3.175e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.27774173, "global_step/max_steps": "1265/1925", "percentage": "65.71%", "elapsed_time": "16h 46m 8s", "remaining_time": "8h 44m 56s"}
{"loss": 0.01909294, "token_acc": 0.99246231, "grad_norm": 0.84375, "learning_rate": 3.133e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.29072031, "global_step/max_steps": "1270/1925", "percentage": "65.97%", "elapsed_time": "16h 50m 10s", "remaining_time": "8h 40m 59s"}
{"loss": 0.01978545, "token_acc": 0.99260477, "grad_norm": 0.90625, "learning_rate": 3.091e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020952, "epoch": 3.3036989, "global_step/max_steps": "1275/1925", "percentage": "66.23%", "elapsed_time": "16h 54m 12s", "remaining_time": "8h 37m 2s"}
{"loss": 0.00995972, "token_acc": 0.99589828, "grad_norm": 0.26953125, "learning_rate": 3.049e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020951, "epoch": 3.31667748, "global_step/max_steps": "1280/1925", "percentage": "66.49%", "elapsed_time": "16h 58m 11s", "remaining_time": "8h 33m 4s"}
{"loss": 0.0074767, "token_acc": 0.99915683, "grad_norm": 0.703125, "learning_rate": 3.007e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.32965607, "global_step/max_steps": "1285/1925", "percentage": "66.75%", "elapsed_time": "17h 2m 5s", "remaining_time": "8h 29m 3s"}
{"loss": 0.00489792, "token_acc": 0.99753086, "grad_norm": 0.515625, "learning_rate": 2.966e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.34263465, "global_step/max_steps": "1290/1925", "percentage": "67.01%", "elapsed_time": "17h 6m 0s", "remaining_time": "8h 25m 2s"}
{"loss": 0.01626905, "token_acc": 0.9958159, "grad_norm": 0.65234375, "learning_rate": 2.924e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.35561324, "global_step/max_steps": "1295/1925", "percentage": "67.27%", "elapsed_time": "17h 10m 0s", "remaining_time": "8h 21m 5s"}
{"loss": 0.01682026, "token_acc": 0.99165972, "grad_norm": 1.421875, "learning_rate": 2.883e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.36859182, "global_step/max_steps": "1300/1925", "percentage": "67.53%", "elapsed_time": "17h 14m 1s", "remaining_time": "8h 17m 7s"}
{"loss": 0.01683239, "token_acc": 0.99422918, "grad_norm": 0.45898438, "learning_rate": 2.842e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.38157041, "global_step/max_steps": "1305/1925", "percentage": "67.79%", "elapsed_time": "17h 17m 55s", "remaining_time": "8h 13m 6s"}
{"loss": 0.00947859, "token_acc": 0.99590835, "grad_norm": 0.49609375, "learning_rate": 2.801e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.39454899, "global_step/max_steps": "1310/1925", "percentage": "68.05%", "elapsed_time": "17h 21m 44s", "remaining_time": "8h 9m 3s"}
{"loss": 0.01408139, "token_acc": 0.99587118, "grad_norm": 0.68359375, "learning_rate": 2.761e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.40752758, "global_step/max_steps": "1315/1925", "percentage": "68.31%", "elapsed_time": "17h 25m 47s", "remaining_time": "8h 5m 7s"}
{"loss": 0.02275892, "token_acc": 0.99165972, "grad_norm": 0.79296875, "learning_rate": 2.72e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.42050616, "global_step/max_steps": "1320/1925", "percentage": "68.57%", "elapsed_time": "17h 29m 45s", "remaining_time": "8h 1m 8s"}
{"loss": 0.0119983, "token_acc": 0.99594156, "grad_norm": 1.0546875, "learning_rate": 2.68e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.43348475, "global_step/max_steps": "1325/1925", "percentage": "68.83%", "elapsed_time": "17h 33m 38s", "remaining_time": "7h 57m 7s"}
{"loss": 0.01625697, "token_acc": 0.99415693, "grad_norm": 0.6015625, "learning_rate": 2.64e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020955, "epoch": 3.44646334, "global_step/max_steps": "1330/1925", "percentage": "69.09%", "elapsed_time": "17h 37m 46s", "remaining_time": "7h 53m 12s"}
{"loss": 0.00799477, "token_acc": 0.99833611, "grad_norm": 0.13085938, "learning_rate": 2.6e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020952, "epoch": 3.45944192, "global_step/max_steps": "1335/1925", "percentage": "69.35%", "elapsed_time": "17h 41m 53s", "remaining_time": "7h 49m 18s"}
{"loss": 0.01807798, "token_acc": 0.99424342, "grad_norm": 0.8203125, "learning_rate": 2.56e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.47242051, "global_step/max_steps": "1340/1925", "percentage": "69.61%", "elapsed_time": "17h 45m 40s", "remaining_time": "7h 45m 14s"}
{"loss": 0.00846443, "token_acc": 0.99754702, "grad_norm": 0.43554688, "learning_rate": 2.521e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.48539909, "global_step/max_steps": "1345/1925", "percentage": "69.87%", "elapsed_time": "17h 49m 33s", "remaining_time": "7h 41m 13s"}
{"loss": 0.01159798, "token_acc": 0.99590164, "grad_norm": 0.39257812, "learning_rate": 2.482e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02096, "epoch": 3.49837768, "global_step/max_steps": "1350/1925", "percentage": "70.13%", "elapsed_time": "17h 53m 25s", "remaining_time": "7h 37m 11s"}
{"loss": 0.01330241, "token_acc": 0.99579125, "grad_norm": 0.32421875, "learning_rate": 2.443e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.51135626, "global_step/max_steps": "1355/1925", "percentage": "70.39%", "elapsed_time": "17h 57m 19s", "remaining_time": "7h 33m 11s"}
{"loss": 0.01054449, "token_acc": 0.99752679, "grad_norm": 0.58984375, "learning_rate": 2.404e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02096, "epoch": 3.52433485, "global_step/max_steps": "1360/1925", "percentage": "70.65%", "elapsed_time": "18h 1m 21s", "remaining_time": "7h 29m 14s"}
{"loss": 0.01150445, "token_acc": 0.99589154, "grad_norm": 0.7109375, "learning_rate": 2.365e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.53731343, "global_step/max_steps": "1365/1925", "percentage": "70.91%", "elapsed_time": "18h 5m 15s", "remaining_time": "7h 25m 14s"}
{"loss": 0.01624694, "token_acc": 0.9942623, "grad_norm": 0.7109375, "learning_rate": 2.327e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020961, "epoch": 3.55029202, "global_step/max_steps": "1370/1925", "percentage": "71.17%", "elapsed_time": "18h 9m 17s", "remaining_time": "7h 21m 17s"}
{"loss": 0.00581565, "token_acc": 0.99837662, "grad_norm": 0.6640625, "learning_rate": 2.288e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.5632706, "global_step/max_steps": "1375/1925", "percentage": "71.43%", "elapsed_time": "18h 13m 6s", "remaining_time": "7h 17m 14s"}
{"loss": 0.00923417, "token_acc": 0.99757282, "grad_norm": 0.45117188, "learning_rate": 2.25e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.57624919, "global_step/max_steps": "1380/1925", "percentage": "71.69%", "elapsed_time": "18h 17m 3s", "remaining_time": "7h 13m 15s"}
{"loss": 0.00487949, "token_acc": 0.99835255, "grad_norm": 0.265625, "learning_rate": 2.213e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.58922777, "global_step/max_steps": "1385/1925", "percentage": "71.95%", "elapsed_time": "18h 20m 59s", "remaining_time": "7h 9m 16s"}
{"loss": 0.00725206, "token_acc": 0.99678715, "grad_norm": 0.00704956, "learning_rate": 2.175e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.60220636, "global_step/max_steps": "1390/1925", "percentage": "72.21%", "elapsed_time": "18h 25m 5s", "remaining_time": "7h 5m 20s"}
{"loss": 0.00617726, "token_acc": 0.99756098, "grad_norm": 0.08349609, "learning_rate": 2.138e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020959, "epoch": 3.61518494, "global_step/max_steps": "1395/1925", "percentage": "72.47%", "elapsed_time": "18h 29m 15s", "remaining_time": "7h 1m 26s"}
{"loss": 0.03482025, "token_acc": 0.99246231, "grad_norm": 1.1796875, "learning_rate": 2.101e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020959, "epoch": 3.62816353, "global_step/max_steps": "1400/1925", "percentage": "72.73%", "elapsed_time": "18h 33m 13s", "remaining_time": "6h 57m 27s"}
{"loss": 0.02210452, "token_acc": 0.99425287, "grad_norm": 0.83984375, "learning_rate": 2.064e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.64114212, "global_step/max_steps": "1405/1925", "percentage": "72.99%", "elapsed_time": "18h 37m 1s", "remaining_time": "6h 53m 25s"}
{"loss": 0.00946889, "token_acc": 0.99500416, "grad_norm": 0.89453125, "learning_rate": 2.027e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.6541207, "global_step/max_steps": "1410/1925", "percentage": "73.25%", "elapsed_time": "18h 40m 58s", "remaining_time": "6h 49m 25s"}
{"loss": 0.01620859, "token_acc": 0.99500832, "grad_norm": 0.484375, "learning_rate": 1.991e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.66709929, "global_step/max_steps": "1415/1925", "percentage": "73.51%", "elapsed_time": "18h 45m 0s", "remaining_time": "6h 45m 28s"}
{"loss": 0.01302923, "token_acc": 0.99341021, "grad_norm": 0.671875, "learning_rate": 1.955e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020967, "epoch": 3.68007787, "global_step/max_steps": "1420/1925", "percentage": "73.77%", "elapsed_time": "18h 48m 41s", "remaining_time": "6h 41m 24s"}
{"loss": 0.01667369, "token_acc": 0.99506579, "grad_norm": 0.08447266, "learning_rate": 1.919e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.69305646, "global_step/max_steps": "1425/1925", "percentage": "74.03%", "elapsed_time": "18h 52m 44s", "remaining_time": "6h 37m 27s"}
{"loss": 0.0103557, "token_acc": 0.99509403, "grad_norm": 0.20703125, "learning_rate": 1.884e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.70603504, "global_step/max_steps": "1430/1925", "percentage": "74.29%", "elapsed_time": "18h 56m 43s", "remaining_time": "6h 33m 28s"}
{"loss": 0.011119, "token_acc": 0.99349593, "grad_norm": 1.8125, "learning_rate": 1.848e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020968, "epoch": 3.71901363, "global_step/max_steps": "1435/1925", "percentage": "74.55%", "elapsed_time": "19h 0m 34s", "remaining_time": "6h 29m 27s"}
{"loss": 0.00731016, "token_acc": 0.9958368, "grad_norm": 0.08642578, "learning_rate": 1.813e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.73199221, "global_step/max_steps": "1440/1925", "percentage": "74.81%", "elapsed_time": "19h 4m 42s", "remaining_time": "6h 25m 32s"}
{"loss": 0.00580742, "token_acc": 0.99831791, "grad_norm": 0.24707031, "learning_rate": 1.778e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.7449708, "global_step/max_steps": "1445/1925", "percentage": "75.06%", "elapsed_time": "19h 8m 41s", "remaining_time": "6h 21m 34s"}
{"loss": 0.01226257, "token_acc": 0.99589491, "grad_norm": 0.890625, "learning_rate": 1.744e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.75794938, "global_step/max_steps": "1450/1925", "percentage": "75.32%", "elapsed_time": "19h 12m 42s", "remaining_time": "6h 17m 36s"}
{"loss": 0.00421866, "token_acc": 0.99918434, "grad_norm": 0.55859375, "learning_rate": 1.71e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.77092797, "global_step/max_steps": "1455/1925", "percentage": "75.58%", "elapsed_time": "19h 16m 36s", "remaining_time": "6h 13m 36s"}
{"loss": 0.00747007, "token_acc": 0.99917355, "grad_norm": 0.21875, "learning_rate": 1.676e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.78390655, "global_step/max_steps": "1460/1925", "percentage": "75.84%", "elapsed_time": "19h 20m 41s", "remaining_time": "6h 9m 40s"}
{"loss": 0.01331443, "token_acc": 0.99665272, "grad_norm": 0.27148438, "learning_rate": 1.642e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.79688514, "global_step/max_steps": "1465/1925", "percentage": "76.10%", "elapsed_time": "19h 24m 42s", "remaining_time": "6h 5m 42s"}
{"loss": 0.0081139, "token_acc": 0.99831508, "grad_norm": 0.54296875, "learning_rate": 1.608e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.80986372, "global_step/max_steps": "1470/1925", "percentage": "76.36%", "elapsed_time": "19h 28m 44s", "remaining_time": "6h 1m 45s"}
{"loss": 0.00888101, "token_acc": 0.995029, "grad_norm": 0.3828125, "learning_rate": 1.575e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.82284231, "global_step/max_steps": "1475/1925", "percentage": "76.62%", "elapsed_time": "19h 32m 42s", "remaining_time": "5h 57m 46s"}
{"loss": 0.00938113, "token_acc": 0.99676113, "grad_norm": 0.76171875, "learning_rate": 1.542e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.8358209, "global_step/max_steps": "1480/1925", "percentage": "76.88%", "elapsed_time": "19h 36m 33s", "remaining_time": "5h 53m 45s"}
{"loss": 0.01805245, "token_acc": 0.99357945, "grad_norm": 0.6015625, "learning_rate": 1.51e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.84879948, "global_step/max_steps": "1485/1925", "percentage": "77.14%", "elapsed_time": "19h 40m 35s", "remaining_time": "5h 49m 48s"}
{"loss": 0.00908992, "token_acc": 0.99750416, "grad_norm": 0.58984375, "learning_rate": 1.477e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.86177807, "global_step/max_steps": "1490/1925", "percentage": "77.40%", "elapsed_time": "19h 44m 31s", "remaining_time": "5h 45m 49s"}
{"loss": 0.01190311, "token_acc": 0.99749791, "grad_norm": 0.14941406, "learning_rate": 1.445e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.87475665, "global_step/max_steps": "1495/1925", "percentage": "77.66%", "elapsed_time": "19h 48m 36s", "remaining_time": "5h 41m 52s"}
{"loss": 0.00848971, "token_acc": 0.99672668, "grad_norm": 1.28125, "learning_rate": 1.414e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.88773524, "global_step/max_steps": "1500/1925", "percentage": "77.92%", "elapsed_time": "19h 52m 27s", "remaining_time": "5h 37m 51s"}
{"loss": 0.00586327, "token_acc": 0.99838188, "grad_norm": 0.07421875, "learning_rate": 1.382e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020943, "epoch": 3.90071382, "global_step/max_steps": "1505/1925", "percentage": "78.18%", "elapsed_time": "19h 57m 38s", "remaining_time": "5h 34m 13s"}
{"loss": 0.01391073, "token_acc": 0.991715, "grad_norm": 0.05029297, "learning_rate": 1.351e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020943, "epoch": 3.91369241, "global_step/max_steps": "1510/1925", "percentage": "78.44%", "elapsed_time": "20h 1m 38s", "remaining_time": "5h 30m 15s"}
{"loss": 0.00785744, "token_acc": 0.99754098, "grad_norm": 0.6640625, "learning_rate": 1.32e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.92667099, "global_step/max_steps": "1515/1925", "percentage": "78.70%", "elapsed_time": "20h 6m 3s", "remaining_time": "5h 26m 23s"}
{"loss": 0.01427224, "token_acc": 0.99597747, "grad_norm": 0.296875, "learning_rate": 1.29e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.93964958, "global_step/max_steps": "1520/1925", "percentage": "78.96%", "elapsed_time": "20h 9m 55s", "remaining_time": "5h 22m 22s"}
{"loss": 0.01153183, "token_acc": 0.99748111, "grad_norm": 0.10888672, "learning_rate": 1.259e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.95262816, "global_step/max_steps": "1525/1925", "percentage": "79.22%", "elapsed_time": "20h 14m 1s", "remaining_time": "5h 18m 25s"}
{"loss": 0.00549206, "token_acc": 0.99837134, "grad_norm": 0.41210938, "learning_rate": 1.229e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 3.96560675, "global_step/max_steps": "1530/1925", "percentage": "79.48%", "elapsed_time": "20h 18m 17s", "remaining_time": "5h 14m 31s"}
{"loss": 0.0118423, "token_acc": 0.99754501, "grad_norm": 0.81640625, "learning_rate": 1.2e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020928, "epoch": 3.97858533, "global_step/max_steps": "1535/1925", "percentage": "79.74%", "elapsed_time": "20h 22m 22s", "remaining_time": "5h 10m 34s"}
{"loss": 0.00989996, "token_acc": 0.99586093, "grad_norm": 0.30273438, "learning_rate": 1.17e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020926, "epoch": 3.99156392, "global_step/max_steps": "1540/1925", "percentage": "80.00%", "elapsed_time": "20h 26m 30s", "remaining_time": "5h 6m 37s"}
{"loss": 0.00593512, "token_acc": 0.99903288, "grad_norm": 0.13085938, "learning_rate": 1.141e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 4.00259572, "global_step/max_steps": "1545/1925", "percentage": "80.26%", "elapsed_time": "20h 29m 55s", "remaining_time": "5h 2m 30s"}
{"loss": 0.00166136, "token_acc": 1.0, "grad_norm": 0.05517578, "learning_rate": 1.113e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 4.0155743, "global_step/max_steps": "1550/1925", "percentage": "80.52%", "elapsed_time": "20h 34m 14s", "remaining_time": "4h 58m 36s"}
{"loss": 0.00122228, "token_acc": 1.0, "grad_norm": 0.09423828, "learning_rate": 1.084e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 4.02855289, "global_step/max_steps": "1555/1925", "percentage": "80.78%", "elapsed_time": "20h 38m 13s", "remaining_time": "4h 54m 37s"}
{"loss": 0.00207209, "token_acc": 1.0, "grad_norm": 0.14941406, "learning_rate": 1.056e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020927, "epoch": 4.04153147, "global_step/max_steps": "1560/1925", "percentage": "81.04%", "elapsed_time": "20h 42m 22s", "remaining_time": "4h 50m 41s"}
{"loss": 0.00325605, "token_acc": 0.9984, "grad_norm": 0.40429688, "learning_rate": 1.029e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020925, "epoch": 4.05451006, "global_step/max_steps": "1565/1925", "percentage": "81.30%", "elapsed_time": "20h 46m 28s", "remaining_time": "4h 46m 43s"}
{"loss": 0.00114265, "token_acc": 1.0, "grad_norm": 0.37304688, "learning_rate": 1.001e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020924, "epoch": 4.06748864, "global_step/max_steps": "1570/1925", "percentage": "81.56%", "elapsed_time": "20h 50m 30s", "remaining_time": "4h 42m 45s"}
{"loss": 0.00458365, "token_acc": 0.999181, "grad_norm": 0.59375, "learning_rate": 9.74e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020917, "epoch": 4.08046723, "global_step/max_steps": "1575/1925", "percentage": "81.82%", "elapsed_time": "20h 54m 54s", "remaining_time": "4h 38m 52s"}
{"loss": 0.00344792, "token_acc": 0.99917492, "grad_norm": 0.46484375, "learning_rate": 9.47e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020909, "epoch": 4.09344581, "global_step/max_steps": "1580/1925", "percentage": "82.08%", "elapsed_time": "20h 59m 24s", "remaining_time": "4h 34m 59s"}
{"loss": 0.00743032, "token_acc": 0.99832355, "grad_norm": 0.875, "learning_rate": 9.21e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020896, "epoch": 4.1064244, "global_step/max_steps": "1585/1925", "percentage": "82.34%", "elapsed_time": "21h 4m 8s", "remaining_time": "4h 31m 10s"}
{"loss": 0.00263328, "token_acc": 0.99919225, "grad_norm": 0.08398438, "learning_rate": 8.95e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020886, "epoch": 4.11940299, "global_step/max_steps": "1590/1925", "percentage": "82.60%", "elapsed_time": "21h 8m 44s", "remaining_time": "4h 27m 18s"}
{"loss": 0.00627142, "token_acc": 0.99754098, "grad_norm": 0.73828125, "learning_rate": 8.69e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020881, "epoch": 4.13238157, "global_step/max_steps": "1595/1925", "percentage": "82.86%", "elapsed_time": "21h 13m 2s", "remaining_time": "4h 23m 23s"}
{"loss": 0.00123384, "token_acc": 1.0, "grad_norm": 0.05322266, "learning_rate": 8.44e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.14536016, "global_step/max_steps": "1600/1925", "percentage": "83.12%", "elapsed_time": "21h 17m 43s", "remaining_time": "4h 19m 32s"}
{"loss": 0.00354467, "token_acc": 0.99918897, "grad_norm": 0.05371094, "learning_rate": 8.19e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.15833874, "global_step/max_steps": "1605/1925", "percentage": "83.38%", "elapsed_time": "21h 21m 46s", "remaining_time": "4h 15m 33s"}
{"loss": 0.00352853, "token_acc": 0.99919614, "grad_norm": 0.19433594, "learning_rate": 7.94e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020868, "epoch": 4.17131733, "global_step/max_steps": "1610/1925", "percentage": "83.64%", "elapsed_time": "21h 25m 48s", "remaining_time": "4h 11m 34s"}
{"loss": 0.00357976, "token_acc": 0.99832636, "grad_norm": 0.07568359, "learning_rate": 7.7e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020864, "epoch": 4.18429591, "global_step/max_steps": "1615/1925", "percentage": "83.90%", "elapsed_time": "21h 30m 4s", "remaining_time": "4h 7m 37s"}
{"loss": 0.00439212, "token_acc": 0.99917424, "grad_norm": 0.05175781, "learning_rate": 7.46e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.1972745, "global_step/max_steps": "1620/1925", "percentage": "84.16%", "elapsed_time": "21h 34m 16s", "remaining_time": "4h 3m 40s"}
{"loss": 0.00388254, "token_acc": 0.99915326, "grad_norm": 0.40625, "learning_rate": 7.22e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.21025308, "global_step/max_steps": "1625/1925", "percentage": "84.42%", "elapsed_time": "21h 38m 18s", "remaining_time": "3h 59m 41s"}
{"loss": 0.00089738, "token_acc": 1.0, "grad_norm": 0.16308594, "learning_rate": 6.99e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, "epoch": 4.22323167, "global_step/max_steps": "1630/1925", "percentage": "84.68%", "elapsed_time": "21h 41m 56s", "remaining_time": "3h 55m 37s"}
{"loss": 0.006377, "token_acc": 0.99917355, "grad_norm": 0.17480469, "learning_rate": 6.76e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.23621025, "global_step/max_steps": "1635/1925", "percentage": "84.94%", "elapsed_time": "21h 45m 39s", "remaining_time": "3h 51m 35s"}
{"loss": 0.00409768, "token_acc": 0.99917219, "grad_norm": 0.22460938, "learning_rate": 6.53e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.24918884, "global_step/max_steps": "1640/1925", "percentage": "85.19%", "elapsed_time": "21h 49m 42s", "remaining_time": "3h 47m 36s"}
{"loss": 0.00224936, "token_acc": 1.0, "grad_norm": 0.29492188, "learning_rate": 6.31e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, "epoch": 4.26216742, "global_step/max_steps": "1645/1925", "percentage": "85.45%", "elapsed_time": "21h 53m 54s", "remaining_time": "3h 43m 38s"}
{"loss": 0.00122885, "token_acc": 1.0, "grad_norm": 0.01721191, "learning_rate": 6.09e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.27514601, "global_step/max_steps": "1650/1925", "percentage": "85.71%", "elapsed_time": "21h 57m 57s", "remaining_time": "3h 39m 39s"}
{"loss": 0.00142137, "token_acc": 0.999185, "grad_norm": 0.14257812, "learning_rate": 5.88e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, "epoch": 4.28812459, "global_step/max_steps": "1655/1925", "percentage": "85.97%", "elapsed_time": "22h 1m 51s", "remaining_time": "3h 35m 39s"}
{"loss": 0.00079734, "token_acc": 1.0, "grad_norm": 0.02246094, "learning_rate": 5.67e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.30110318, "global_step/max_steps": "1660/1925", "percentage": "86.23%", "elapsed_time": "22h 5m 54s", "remaining_time": "3h 31m 39s"}
{"loss": 0.00324922, "token_acc": 0.99917898, "grad_norm": 0.28125, "learning_rate": 5.46e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020867, "epoch": 4.31408177, "global_step/max_steps": "1665/1925", "percentage": "86.49%", "elapsed_time": "22h 9m 49s", "remaining_time": "3h 27m 39s"}
{"loss": 0.00195887, "token_acc": 1.0, "grad_norm": 0.30859375, "learning_rate": 5.25e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020868, "epoch": 4.32706035, "global_step/max_steps": "1670/1925", "percentage": "86.75%", "elapsed_time": "22h 13m 44s", "remaining_time": "3h 23m 39s"}
{"loss": 0.00554874, "token_acc": 0.9991756, "grad_norm": 0.24902344, "learning_rate": 5.05e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.34003894, "global_step/max_steps": "1675/1925", "percentage": "87.01%", "elapsed_time": "22h 17m 37s", "remaining_time": "3h 19m 38s"}
{"loss": 0.00663116, "token_acc": 0.99835661, "grad_norm": 0.22851562, "learning_rate": 4.86e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.35301752, "global_step/max_steps": "1680/1925", "percentage": "87.27%", "elapsed_time": "22h 21m 40s", "remaining_time": "3h 15m 39s"}
{"loss": 0.00100116, "token_acc": 1.0, "grad_norm": 0.29101562, "learning_rate": 4.66e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020871, "epoch": 4.36599611, "global_step/max_steps": "1685/1925", "percentage": "87.53%", "elapsed_time": "22h 25m 32s", "remaining_time": "3h 11m 38s"}
{"loss": 0.00117063, "token_acc": 1.0, "grad_norm": 0.04003906, "learning_rate": 4.47e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020872, "epoch": 4.37897469, "global_step/max_steps": "1690/1925", "percentage": "87.79%", "elapsed_time": "22h 29m 27s", "remaining_time": "3h 7m 38s"}
{"loss": 0.00051829, "token_acc": 1.0, "grad_norm": 0.04248047, "learning_rate": 4.29e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020876, "epoch": 4.39195328, "global_step/max_steps": "1695/1925", "percentage": "88.05%", "elapsed_time": "22h 33m 10s", "remaining_time": "3h 3m 37s"}
{"loss": 0.00520869, "token_acc": 0.99917012, "grad_norm": 0.49414062, "learning_rate": 4.11e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.40493186, "global_step/max_steps": "1700/1925", "percentage": "88.31%", "elapsed_time": "22h 37m 2s", "remaining_time": "2h 59m 36s"}
{"loss": 0.00087819, "token_acc": 1.0, "grad_norm": 0.1796875, "learning_rate": 3.93e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020877, "epoch": 4.41791045, "global_step/max_steps": "1705/1925", "percentage": "88.57%", "elapsed_time": "22h 41m 7s", "remaining_time": "2h 55m 37s"}
{"loss": 0.01016, "token_acc": 0.99752475, "grad_norm": 0.67578125, "learning_rate": 3.75e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.43088903, "global_step/max_steps": "1710/1925", "percentage": "88.83%", "elapsed_time": "22h 45m 3s", "remaining_time": "2h 51m 37s"}
{"loss": 0.00050979, "token_acc": 1.0, "grad_norm": 0.01696777, "learning_rate": 3.58e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020882, "epoch": 4.44386762, "global_step/max_steps": "1715/1925", "percentage": "89.09%", "elapsed_time": "22h 48m 45s", "remaining_time": "2h 47m 36s"}
{"loss": 0.00185301, "token_acc": 1.0, "grad_norm": 0.51953125, "learning_rate": 3.42e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.4568462, "global_step/max_steps": "1720/1925", "percentage": "89.35%", "elapsed_time": "22h 52m 59s", "remaining_time": "2h 43m 38s"}
{"loss": 0.00179509, "token_acc": 1.0, "grad_norm": 0.36328125, "learning_rate": 3.25e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020874, "epoch": 4.46982479, "global_step/max_steps": "1725/1925", "percentage": "89.61%", "elapsed_time": "22h 57m 14s", "remaining_time": "2h 39m 40s"}
{"loss": 0.00163544, "token_acc": 0.99918033, "grad_norm": 0.08398438, "learning_rate": 3.1e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020874, "epoch": 4.48280337, "global_step/max_steps": "1730/1925", "percentage": "89.87%", "elapsed_time": "23h 1m 13s", "remaining_time": "2h 35m 41s"}
{"loss": 0.00533278, "token_acc": 0.99833055, "grad_norm": 0.28515625, "learning_rate": 2.94e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.49578196, "global_step/max_steps": "1735/1925", "percentage": "90.13%", "elapsed_time": "23h 5m 30s", "remaining_time": "2h 31m 43s"}
{"loss": 0.00884347, "token_acc": 0.99752271, "grad_norm": 0.12695312, "learning_rate": 2.79e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.50876055, "global_step/max_steps": "1740/1925", "percentage": "90.39%", "elapsed_time": "23h 9m 50s", "remaining_time": "2h 27m 46s"}
{"loss": 0.00133485, "token_acc": 1.0, "grad_norm": 0.10693359, "learning_rate": 2.64e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.52173913, "global_step/max_steps": "1745/1925", "percentage": "90.65%", "elapsed_time": "23h 14m 10s", "remaining_time": "2h 23m 48s"}
{"loss": 0.00182302, "token_acc": 0.99916736, "grad_norm": 0.02758789, "learning_rate": 2.5e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.53471772, "global_step/max_steps": "1750/1925", "percentage": "90.91%", "elapsed_time": "23h 18m 9s", "remaining_time": "2h 19m 48s"}
{"loss": 0.00334373, "token_acc": 0.99916107, "grad_norm": 0.28515625, "learning_rate": 2.36e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020857, "epoch": 4.5476963, "global_step/max_steps": "1755/1925", "percentage": "91.17%", "elapsed_time": "23h 22m 21s", "remaining_time": "2h 15m 50s"}
{"loss": 0.00152796, "token_acc": 1.0, "grad_norm": 0.1796875, "learning_rate": 2.22e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020854, "epoch": 4.56067489, "global_step/max_steps": "1760/1925", "percentage": "91.43%", "elapsed_time": "23h 26m 32s", "remaining_time": "2h 11m 51s"}
{"loss": 0.00666914, "token_acc": 0.99917831, "grad_norm": 0.00540161, "learning_rate": 2.09e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020851, "epoch": 4.57365347, "global_step/max_steps": "1765/1925", "percentage": "91.69%", "elapsed_time": "23h 30m 46s", "remaining_time": "2h 7m 53s"}
{"loss": 0.00494981, "token_acc": 0.99834711, "grad_norm": 0.00946045, "learning_rate": 1.96e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020843, "epoch": 4.58663206, "global_step/max_steps": "1770/1925", "percentage": "91.95%", "elapsed_time": "23h 35m 18s", "remaining_time": "2h 3m 56s"}
{"loss": 0.00329267, "token_acc": 0.99834163, "grad_norm": 0.06933594, "learning_rate": 1.84e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020848, "epoch": 4.59961064, "global_step/max_steps": "1775/1925", "percentage": "92.21%", "elapsed_time": "23h 38m 59s", "remaining_time": "1h 59m 54s"}
{"loss": 0.00151995, "token_acc": 0.99918301, "grad_norm": 0.02624512, "learning_rate": 1.72e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020849, "epoch": 4.61258923, "global_step/max_steps": "1780/1925", "percentage": "92.47%", "elapsed_time": "23h 42m 54s", "remaining_time": "1h 55m 54s"}
{"loss": 0.00504421, "token_acc": 0.99836468, "grad_norm": 0.42578125, "learning_rate": 1.6e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020846, "epoch": 4.62556781, "global_step/max_steps": "1785/1925", "percentage": "92.73%", "elapsed_time": "23h 47m 7s", "remaining_time": "1h 51m 55s"}
{"loss": 0.00268856, "token_acc": 0.9983753, "grad_norm": 0.04516602, "learning_rate": 1.49e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020847, "epoch": 4.6385464, "global_step/max_steps": "1790/1925", "percentage": "92.99%", "elapsed_time": "23h 51m 1s", "remaining_time": "1h 47m 55s"}
{"loss": 0.00148416, "token_acc": 1.0, "grad_norm": 0.06884766, "learning_rate": 1.38e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020847, "epoch": 4.65152498, "global_step/max_steps": "1795/1925", "percentage": "93.25%", "elapsed_time": "23h 54m 59s", "remaining_time": "1h 43m 55s"}
{"loss": 0.001921, "token_acc": 0.99917081, "grad_norm": 0.29882812, "learning_rate": 1.28e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020842, "epoch": 4.66450357, "global_step/max_steps": "1800/1925", "percentage": "93.51%", "elapsed_time": "23h 59m 23s", "remaining_time": "1h 39m 57s"}
{"loss": 0.00392412, "token_acc": 0.99834983, "grad_norm": 0.07861328, "learning_rate": 1.18e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020835, "epoch": 4.67748215, "global_step/max_steps": "1805/1925", "percentage": "93.77%", "elapsed_time": "1d 0h 3m 51s", "remaining_time": "1h 35m 59s"}
{"loss": 0.00390602, "token_acc": 0.99917219, "grad_norm": 0.24023438, "learning_rate": 1.08e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02083, "epoch": 4.69046074, "global_step/max_steps": "1810/1925", "percentage": "94.03%", "elapsed_time": "1d 0h 8m 11s", "remaining_time": "1h 32m 0s"}
{"loss": 0.00224975, "token_acc": 0.99835931, "grad_norm": 0.03613281, "learning_rate": 9.9e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020827, "epoch": 4.70343933, "global_step/max_steps": "1815/1925", "percentage": "94.29%", "elapsed_time": "1d 0h 12m 22s", "remaining_time": "1h 28m 1s"}
{"loss": 0.00053148, "token_acc": 1.0, "grad_norm": 0.02880859, "learning_rate": 9e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020824, "epoch": 4.71641791, "global_step/max_steps": "1820/1925", "percentage": "94.55%", "elapsed_time": "1d 0h 16m 36s", "remaining_time": "1h 24m 2s"}
{"loss": 0.00086746, "token_acc": 1.0, "grad_norm": 0.06494141, "learning_rate": 8.2e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02082, "epoch": 4.7293965, "global_step/max_steps": "1825/1925", "percentage": "94.81%", "elapsed_time": "1d 0h 20m 53s", "remaining_time": "1h 20m 2s"}
{"loss": 0.00250605, "token_acc": 0.9991342, "grad_norm": 0.04150391, "learning_rate": 7.4e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020813, "epoch": 4.74237508, "global_step/max_steps": "1830/1925", "percentage": "95.06%", "elapsed_time": "1d 0h 25m 20s", "remaining_time": "1h 16m 4s"}
{"loss": 0.00181947, "token_acc": 0.99919028, "grad_norm": 0.0072937, "learning_rate": 6.6e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020817, "epoch": 4.75535367, "global_step/max_steps": "1835/1925", "percentage": "95.32%", "elapsed_time": "1d 0h 29m 8s", "remaining_time": "1h 12m 3s"}
{"loss": 0.00069599, "token_acc": 1.0, "grad_norm": 0.03710938, "learning_rate": 5.9e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.76833225, "global_step/max_steps": "1840/1925", "percentage": "95.58%", "elapsed_time": "1d 0h 33m 26s", "remaining_time": "1h 8m 3s"}
{"loss": 0.0052086, "token_acc": 0.99836066, "grad_norm": 0.5078125, "learning_rate": 5.3e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.78131084, "global_step/max_steps": "1845/1925", "percentage": "95.84%", "elapsed_time": "1d 0h 37m 37s", "remaining_time": "1h 4m 4s"}
{"loss": 0.00509922, "token_acc": 0.99916667, "grad_norm": 0.5625, "learning_rate": 4.6e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.79428942, "global_step/max_steps": "1850/1925", "percentage": "96.10%", "elapsed_time": "1d 0h 41m 31s", "remaining_time": "1h 0m 3s"}
{"loss": 0.00712334, "token_acc": 0.99835931, "grad_norm": 0.04150391, "learning_rate": 4e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020807, "epoch": 4.80726801, "global_step/max_steps": "1855/1925", "percentage": "96.36%", "elapsed_time": "1d 0h 45m 51s", "remaining_time": "56m 4s"}
{"loss": 0.00386918, "token_acc": 0.99835391, "grad_norm": 0.06347656, "learning_rate": 3.5e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.82024659, "global_step/max_steps": "1860/1925", "percentage": "96.62%", "elapsed_time": "1d 0h 49m 29s", "remaining_time": "52m 3s"}
{"loss": 0.00132384, "token_acc": 1.0, "grad_norm": 0.34570312, "learning_rate": 3e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.83322518, "global_step/max_steps": "1865/1925", "percentage": "96.88%", "elapsed_time": "1d 0h 53m 29s", "remaining_time": "48m 2s"}
{"loss": 0.00251984, "token_acc": 1.0, "grad_norm": 0.11914062, "learning_rate": 2.5e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.84620376, "global_step/max_steps": "1870/1925", "percentage": "97.14%", "elapsed_time": "1d 0h 57m 33s", "remaining_time": "44m 2s"}
{"loss": 0.00115552, "token_acc": 1.0, "grad_norm": 0.09375, "learning_rate": 2.1e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.85918235, "global_step/max_steps": "1875/1925", "percentage": "97.40%", "elapsed_time": "1d 1h 1m 38s", "remaining_time": "40m 2s"}
{"loss": 0.00208023, "token_acc": 0.99916805, "grad_norm": 0.14355469, "learning_rate": 1.7e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020807, "epoch": 4.87216093, "global_step/max_steps": "1880/1925", "percentage": "97.66%", "elapsed_time": "1d 1h 5m 53s", "remaining_time": "36m 2s"}
{"loss": 0.00272243, "token_acc": 0.9983512, "grad_norm": 0.50390625, "learning_rate": 1.3e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020808, "epoch": 4.88513952, "global_step/max_steps": "1885/1925", "percentage": "97.92%", "elapsed_time": "1d 1h 9m 45s", "remaining_time": "32m 2s"}
{"loss": 0.00205016, "token_acc": 0.99915398, "grad_norm": 0.32617188, "learning_rate": 1e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020809, "epoch": 4.89811811, "global_step/max_steps": "1890/1925", "percentage": "98.18%", "elapsed_time": "1d 1h 13m 43s", "remaining_time": "28m 1s"}
{"loss": 0.00116063, "token_acc": 1.0, "grad_norm": 0.03515625, "learning_rate": 7e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.91109669, "global_step/max_steps": "1895/1925", "percentage": "98.44%", "elapsed_time": "1d 1h 17m 34s", "remaining_time": "24m 1s"}
{"loss": 0.00063409, "token_acc": 1.0, "grad_norm": 0.08935547, "learning_rate": 5e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.92407528, "global_step/max_steps": "1900/1925", "percentage": "98.70%", "elapsed_time": "1d 1h 21m 41s", "remaining_time": "20m 1s"}
{"loss": 0.00033662, "token_acc": 1.0, "grad_norm": 0.13574219, "learning_rate": 3e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.93705386, "global_step/max_steps": "1905/1925", "percentage": "98.96%", "elapsed_time": "1d 1h 25m 31s", "remaining_time": "16m 0s"}
{"loss": 0.0060442, "token_acc": 0.9984051, "grad_norm": 0.40625, "learning_rate": 2e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020814, "epoch": 4.95003245, "global_step/max_steps": "1910/1925", "percentage": "99.22%", "elapsed_time": "1d 1h 29m 21s", "remaining_time": "12m 0s"}
{"loss": 0.00470068, "token_acc": 0.99835255, "grad_norm": 0.00793457, "learning_rate": 1e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020815, "epoch": 4.96301103, "global_step/max_steps": "1915/1925", "percentage": "99.48%", "elapsed_time": "1d 1h 33m 17s", "remaining_time": "8m 0s"}
{"loss": 0.00759491, "token_acc": 0.99751861, "grad_norm": 0.76171875, "learning_rate": 0.0, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020814, "epoch": 4.97598962, "global_step/max_steps": "1920/1925", "percentage": "99.74%", "elapsed_time": "1d 1h 37m 21s", "remaining_time": "4m 0s"}
{"loss": 0.00792349, "token_acc": 0.99750623, "grad_norm": 0.00866699, "learning_rate": 0.0, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020815, "epoch": 4.9889682, "global_step/max_steps": "1925/1925", "percentage": "100.00%", "elapsed_time": "1d 1h 41m 20s", "remaining_time": "0s"}
{"train_runtime": 92570.8058, "train_samples_per_second": 1.332, "train_steps_per_second": 0.021, "total_flos": 3.70560890425983e+18, "train_loss": 0.07658965, "epoch": 4.9889682, "global_step/max_steps": "1925/1925", "percentage": "100.00%", "elapsed_time": "1d 1h 42m 48s", "remaining_time": "0s"}
{"model_parameter_info": "Qwen2_5_VLForConditionalGeneration: 8292.1667M Params (8247.5922M Trainable [99.4625%]), 0.0019M Buffers.", "last_model_checkpoint": "/data/LLM-SFT/SFT_Output/test/Qwen2.5-VL-7B-Instruct/v0-20250716-145652/checkpoint-1925", "best_model_checkpoint": null, "best_metric": null, "global_step": 1925, "log_history": [{"loss": 0.6761971116065979, "token_acc": 0.8360655737704918, "grad_norm": 64.5, "learning_rate": 5.181347150259068e-07, "memory(GiB)": 82.76, "train_speed(iter/s)": 0.019472, "epoch": 0.0025957170668397143, "step": 1}, {"loss": 0.6650555729866028, "token_acc": 0.8231644260599793, "grad_norm": 66.0, "learning_rate": 2.5906735751295338e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021208, "epoch": 0.012978585334198572, "step": 5}, {"loss": 0.4840095043182373, "token_acc": 0.8467274233637117, "grad_norm": 48.5, "learning_rate": 5.1813471502590676e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020876, "epoch": 0.025957170668397145, "step": 10}, {"loss": 0.3320765018463135, "token_acc": 0.8696356275303644, "grad_norm": 137.0, "learning_rate": 7.772020725388602e-06, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021309, "epoch": 0.03893575600259572, "step": 15}, {"loss": 0.3111098766326904, "token_acc": 0.8665008291873963, "grad_norm": 21.25, "learning_rate": 1.0362694300518135e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02131, "epoch": 0.05191434133679429, "step": 20}, {"loss": 0.24997963905334472, "token_acc": 0.8876127973748975, "grad_norm": 21.5, "learning_rate": 1.2953367875647668e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021488, "epoch": 0.06489292667099286, "step": 25}, {"loss": 0.21027085781097413, "token_acc": 0.9171974522292994, "grad_norm": 25.5, "learning_rate": 1.5544041450777204e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021184, "epoch": 0.07787151200519143, "step": 30}, {"loss": 0.18175365924835205, "token_acc": 0.9322742474916388, "grad_norm": 19.25, "learning_rate": 1.813471502590674e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021197, "epoch": 0.09085009733939, "step": 35}, {"loss": 0.2070298671722412, "token_acc": 0.9324546952224053, "grad_norm": 42.0, "learning_rate": 2.072538860103627e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021199, "epoch": 0.10382868267358858, "step": 40}, {"loss": 0.14612390995025634, "token_acc": 0.9431356620633631, "grad_norm": 31.0, "learning_rate": 2.3316062176165805e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021162, "epoch": 0.11680726800778715, "step": 45}, {"loss": 0.1252074122428894, "token_acc": 0.9554140127388535, "grad_norm": 33.5, "learning_rate": 2.5906735751295337e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02116, "epoch": 0.12978585334198572, "step": 50}, {"loss": 0.08470200896263122, "token_acc": 0.9732360097323601, "grad_norm": 14.6875, "learning_rate": 2.8497409326424872e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021224, "epoch": 0.1427644386761843, "step": 55}, {"loss": 0.1468730688095093, "token_acc": 0.9489539748953975, "grad_norm": 24.75, "learning_rate": 3.108808290155441e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021111, "epoch": 0.15574302401038287, "step": 60}, {"loss": 0.0989870309829712, "token_acc": 0.9626016260162602, "grad_norm": 13.375, "learning_rate": 3.367875647668394e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021069, "epoch": 0.16872160934458144, "step": 65}, {"loss": 0.15612951517105103, "token_acc": 0.9571428571428572, "grad_norm": 29.25, "learning_rate": 
3.626943005181348e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.021002, "epoch": 0.18170019467878, "step": 70}, {"loss": 0.10940381288528442, "token_acc": 0.9690635451505016, "grad_norm": 16.0, "learning_rate": 3.886010362694301e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02104, "epoch": 0.1946787800129786, "step": 75}, {"loss": 0.13801568746566772, "token_acc": 0.9573378839590444, "grad_norm": 11.5, "learning_rate": 4.145077720207254e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020886, "epoch": 0.20765736534717716, "step": 80}, {"loss": 0.11146789789199829, "token_acc": 0.9662447257383966, "grad_norm": 28.375, "learning_rate": 4.404145077720208e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020723, "epoch": 0.22063595068137573, "step": 85}, {"loss": 0.1379382848739624, "token_acc": 0.9566284779050737, "grad_norm": 8.125, "learning_rate": 4.663212435233161e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020619, "epoch": 0.2336145360155743, "step": 90}, {"loss": 0.13294199705123902, "token_acc": 0.9616985845129059, "grad_norm": 11.9375, "learning_rate": 4.922279792746114e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020597, "epoch": 0.24659312134977288, "step": 95}, {"loss": 0.15107507705688478, "token_acc": 0.9649415692821369, "grad_norm": 78.5, "learning_rate": 5.1813471502590674e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020542, "epoch": 0.25957170668397145, "step": 100}, {"loss": 0.27871944904327395, "token_acc": 0.9241034195162635, "grad_norm": 13.75, "learning_rate": 5.440414507772021e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02047, "epoch": 0.27255029201817, "step": 105}, {"loss": 0.14314144849777222, "token_acc": 0.9567027477102414, "grad_norm": 7.1875, "learning_rate": 5.6994818652849744e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020365, "epoch": 0.2855288773523686, "step": 110}, {"loss": 0.14128265380859376, "token_acc": 0.9611248966087675, "grad_norm": 7.40625, "learning_rate": 5.9585492227979276e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02028, "epoch": 0.29850746268656714, "step": 115}, {"loss": 0.15114243030548097, "token_acc": 0.9528145695364238, "grad_norm": 7.625, "learning_rate": 6.217616580310881e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02023, "epoch": 0.31148604802076574, "step": 120}, {"loss": 0.14014118909835815, "token_acc": 0.9568755085435313, "grad_norm": 15.875, "learning_rate": 6.476683937823834e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020211, "epoch": 0.3244646333549643, "step": 125}, {"loss": 0.14676700830459594, "token_acc": 0.9623430962343096, "grad_norm": 13.0625, "learning_rate": 6.735751295336788e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020168, "epoch": 0.3374432186891629, "step": 130}, {"loss": 0.15908313989639283, "token_acc": 0.9551495016611296, "grad_norm": 11.5625, "learning_rate": 6.994818652849742e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020164, "epoch": 0.3504218040233614, "step": 135}, {"loss": 0.14115771055221557, "token_acc": 0.953757225433526, "grad_norm": 8.375, "learning_rate": 7.253886010362695e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02013, "epoch": 0.36340038935756, "step": 140}, {"loss": 0.17947659492492676, "token_acc": 0.947986577181208, "grad_norm": 12.5, "learning_rate": 7.512953367875648e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020045, "epoch": 0.37637897469175857, "step": 145}, {"loss": 0.13564352989196776, "token_acc": 0.9545454545454546, "grad_norm": 12.6875, "learning_rate": 
7.772020725388602e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020024, "epoch": 0.3893575600259572, "step": 150}, {"loss": 0.14768315553665162, "token_acc": 0.954248366013072, "grad_norm": 9.375, "learning_rate": 8.031088082901554e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02003, "epoch": 0.4023361453601557, "step": 155}, {"loss": 0.15876761674880982, "token_acc": 0.955701394585726, "grad_norm": 8.5625, "learning_rate": 8.290155440414508e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020035, "epoch": 0.4153147306943543, "step": 160}, {"loss": 0.20421419143676758, "token_acc": 0.9363484087102177, "grad_norm": 5.6875, "learning_rate": 8.549222797927462e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020018, "epoch": 0.42829331602855286, "step": 165}, {"loss": 0.13153988122940063, "token_acc": 0.9637860082304527, "grad_norm": 8.4375, "learning_rate": 8.808290155440416e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02002, "epoch": 0.44127190136275146, "step": 170}, {"loss": 0.3276111364364624, "token_acc": 0.9358245329000813, "grad_norm": 8.375, "learning_rate": 9.067357512953368e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020018, "epoch": 0.45425048669695, "step": 175}, {"loss": 0.13705794811248778, "token_acc": 0.9646090534979423, "grad_norm": 12.1875, "learning_rate": 9.326424870466322e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020066, "epoch": 0.4672290720311486, "step": 180}, {"loss": 0.17206543684005737, "token_acc": 0.9485903814262023, "grad_norm": 10.0, "learning_rate": 9.585492227979275e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020072, "epoch": 0.48020765736534715, "step": 185}, {"loss": 0.15847107172012329, "token_acc": 0.9510135135135135, "grad_norm": 65.5, "learning_rate": 9.844559585492228e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020079, "epoch": 0.49318624269954575, "step": 190}, {"loss": 0.12832045555114746, "token_acc": 0.9608482871125612, "grad_norm": 6.75, "learning_rate": 9.999967099424576e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020091, "epoch": 0.5061648280337443, "step": 195}, {"loss": 0.2224121570587158, "token_acc": 0.9467312348668281, "grad_norm": 21.625, "learning_rate": 9.999596972923527e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020136, "epoch": 0.5191434133679429, "step": 200}, {"loss": 0.3189336538314819, "token_acc": 0.9142156862745098, "grad_norm": 105.5, "learning_rate": 9.99881562474689e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020143, "epoch": 0.5321219987021415, "step": 205}, {"loss": 0.3505317211151123, "token_acc": 0.9171452009844134, "grad_norm": 14.0, "learning_rate": 9.997623119161306e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020174, "epoch": 0.54510058403634, "step": 210}, {"loss": 0.17262498140335084, "token_acc": 0.9438110749185668, "grad_norm": 7.8125, "learning_rate": 9.996019554251512e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020194, "epoch": 0.5580791693705386, "step": 215}, {"loss": 0.19659049510955812, "token_acc": 0.9433805162364696, "grad_norm": 9.6875, "learning_rate": 9.994005061912266e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020193, "epoch": 0.5710577547047372, "step": 220}, {"loss": 0.1692946434020996, "token_acc": 0.9464138499587799, "grad_norm": 5.78125, "learning_rate": 9.99157980783751e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020175, "epoch": 0.5840363400389358, "step": 225}, {"loss": 0.15234674215316774, "token_acc": 0.9539094650205762, "grad_norm": 8.0625, "learning_rate": 
9.98874399150673e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020195, "epoch": 0.5970149253731343, "step": 230}, {"loss": 0.16389089822769165, "token_acc": 0.9533333333333334, "grad_norm": 3.796875, "learning_rate": 9.985497846168557e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020211, "epoch": 0.6099935107073329, "step": 235}, {"loss": 0.1235802173614502, "token_acc": 0.9637972646822205, "grad_norm": 5.65625, "learning_rate": 9.981841638821576e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020238, "epoch": 0.6229720960415315, "step": 240}, {"loss": 0.2110598564147949, "token_acc": 0.9404081632653061, "grad_norm": 9.5, "learning_rate": 9.977775670192371e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020262, "epoch": 0.6359506813757301, "step": 245}, {"loss": 0.1242341160774231, "token_acc": 0.9559800664451827, "grad_norm": 4.34375, "learning_rate": 9.973300274710787e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020264, "epoch": 0.6489292667099286, "step": 250}, {"loss": 1.1298725128173828, "token_acc": 0.8556193601312552, "grad_norm": 32.25, "learning_rate": 9.968415820482415e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020271, "epoch": 0.6619078520441272, "step": 255}, {"loss": 0.6246148586273194, "token_acc": 0.8447154471544716, "grad_norm": 20.5, "learning_rate": 9.963122709258335e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020304, "epoch": 0.6748864373783258, "step": 260}, {"loss": 0.5492882251739502, "token_acc": 0.9152404237978811, "grad_norm": 596.0, "learning_rate": 9.957421376402051e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020343, "epoch": 0.6878650227125244, "step": 265}, {"loss": 0.1511799216270447, "token_acc": 0.9506688963210702, "grad_norm": 5.8125, "learning_rate": 9.951312290853691e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02037, "epoch": 0.7008436080467229, "step": 270}, {"loss": 1.6441532135009767, "token_acc": 0.8274153592072667, "grad_norm": 154.0, "learning_rate": 9.944795955091438e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020369, "epoch": 0.7138221933809215, "step": 275}, {"loss": 1.1411833763122559, "token_acc": 0.8543371522094927, "grad_norm": 7.75, "learning_rate": 9.937872905090196e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020393, "epoch": 0.72680077871512, "step": 280}, {"loss": 0.2547419548034668, "token_acc": 0.9494702526487367, "grad_norm": 13.6875, "learning_rate": 9.930543710277509e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020415, "epoch": 0.7397793640493187, "step": 285}, {"loss": 0.21210157871246338, "token_acc": 0.9428571428571428, "grad_norm": 27.75, "learning_rate": 9.922808973486721e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020426, "epoch": 0.7527579493835171, "step": 290}, {"loss": 0.1741060733795166, "token_acc": 0.9474116680361545, "grad_norm": 220.0, "learning_rate": 9.914669330907399e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020432, "epoch": 0.7657365347177157, "step": 295}, {"loss": 0.19344786405563355, "token_acc": 0.928453947368421, "grad_norm": 11.5, "learning_rate": 9.906125452032999e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020448, "epoch": 0.7787151200519143, "step": 300}, {"loss": 0.20231380462646484, "token_acc": 0.9286898839137645, "grad_norm": 3.109375, "learning_rate": 9.897178039605802e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020452, "epoch": 0.791693705386113, "step": 305}, {"loss": 0.16842379570007324, "token_acc": 0.9444444444444444, "grad_norm": 4.59375, "learning_rate": 
9.887827829559117e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020467, "epoch": 0.8046722907203114, "step": 310}, {"loss": 0.1668983817100525, "token_acc": 0.9469026548672567, "grad_norm": 2.875, "learning_rate": 9.878075590956742e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020482, "epoch": 0.81765087605451, "step": 315}, {"loss": 0.15300171375274657, "token_acc": 0.9525395503746877, "grad_norm": 4.40625, "learning_rate": 9.867922125929712e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.8306294613887086, "step": 320}, {"loss": 0.16267441511154174, "token_acc": 0.9539094650205762, "grad_norm": 3.3125, "learning_rate": 9.857368269610324e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020505, "epoch": 0.8436080467229072, "step": 325}, {"loss": 0.1828203558921814, "token_acc": 0.9417637271214643, "grad_norm": 7.21875, "learning_rate": 9.846414890063446e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.8565866320571057, "step": 330}, {"loss": 0.15136970281600953, "token_acc": 0.9577814569536424, "grad_norm": 4.53125, "learning_rate": 9.835062888215114e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020508, "epoch": 0.8695652173913043, "step": 335}, {"loss": 0.12393852472305297, "token_acc": 0.9630541871921182, "grad_norm": 2.515625, "learning_rate": 9.823313197778434e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020525, "epoch": 0.8825438027255029, "step": 340}, {"loss": 0.12096943855285644, "token_acc": 0.9550748752079867, "grad_norm": 5.65625, "learning_rate": 9.811166785176784e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020516, "epoch": 0.8955223880597015, "step": 345}, {"loss": 0.1423720359802246, "token_acc": 0.9579554822753503, "grad_norm": 5.03125, "learning_rate": 9.798624649464323e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020518, "epoch": 0.9085009733939, "step": 350}, {"loss": 0.1521735668182373, "token_acc": 0.9567699836867863, "grad_norm": 3.609375, "learning_rate": 9.785687822243812e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020531, "epoch": 0.9214795587280986, "step": 355}, {"loss": 0.14129271507263183, "token_acc": 0.9501661129568106, "grad_norm": 2.078125, "learning_rate": 9.772357367581779e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020532, "epoch": 0.9344581440622972, "step": 360}, {"loss": 0.14151201248168946, "token_acc": 0.9601626016260163, "grad_norm": 3.828125, "learning_rate": 9.758634381920981e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020539, "epoch": 0.9474367293964958, "step": 365}, {"loss": 0.11267189979553223, "token_acc": 0.9596042868920033, "grad_norm": 2.0625, "learning_rate": 9.744519993990232e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020569, "epoch": 0.9604153147306943, "step": 370}, {"loss": 0.1380028247833252, "token_acc": 0.9532163742690059, "grad_norm": 2.796875, "learning_rate": 9.730015364711557e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02059, "epoch": 0.9733939000648929, "step": 375}, {"loss": 0.13349549770355223, "token_acc": 0.9553933495539335, "grad_norm": 4.21875, "learning_rate": 9.715121687104715e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020592, "epoch": 0.9863724853990915, "step": 380}, {"loss": 0.11987955570220947, "token_acc": 0.9604377104377104, "grad_norm": 4.25, "learning_rate": 9.69984018618906e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020608, "epoch": 0.9993510707332901, "step": 385}, {"loss": 0.07847663760185242, "token_acc": 0.9701636188642926, "grad_norm": 2.890625, 
"learning_rate": 9.684172118882788e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020645, "epoch": 1.0103828682673588, "step": 390}, {"loss": 0.28727552890777586, "token_acc": 0.9520661157024793, "grad_norm": 2.578125, "learning_rate": 9.668118773899554e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020648, "epoch": 1.0233614536015574, "step": 395}, {"loss": 0.10458629131317139, "token_acc": 0.9610927152317881, "grad_norm": 2.71875, "learning_rate": 9.651681471642476e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020649, "epoch": 1.036340038935756, "step": 400}, {"loss": 0.07832455635070801, "token_acc": 0.9694041867954911, "grad_norm": 4.0625, "learning_rate": 9.634861564095524e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020671, "epoch": 1.0493186242699546, "step": 405}, {"loss": 0.09629099369049073, "token_acc": 0.9698942229454841, "grad_norm": 9.375, "learning_rate": 9.617660434712328e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02067, "epoch": 1.0622972096041532, "step": 410}, {"loss": 0.0883810818195343, "token_acc": 0.9762295081967213, "grad_norm": 1.5625, "learning_rate": 9.600079498302376e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020675, "epoch": 1.0752757949383518, "step": 415}, {"loss": 0.09818625450134277, "token_acc": 0.962360992301112, "grad_norm": 2.203125, "learning_rate": 9.582120200914654e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020676, "epoch": 1.0882543802725504, "step": 420}, {"loss": 0.09333943128585816, "token_acc": 0.9726468222043443, "grad_norm": 3.734375, "learning_rate": 9.563784019718703e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020684, "epoch": 1.1012329656067488, "step": 425}, {"loss": 0.0676883339881897, "token_acc": 0.978981406628941, "grad_norm": 1.6953125, "learning_rate": 9.54507246288312e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.1142115509409474, "step": 430}, {"loss": 0.09929285049438477, "token_acc": 0.9760383386581469, "grad_norm": 2.390625, "learning_rate": 9.525987069451513e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020704, "epoch": 1.127190136275146, "step": 435}, {"loss": 0.11887519359588623, "token_acc": 0.956989247311828, "grad_norm": 1.7890625, "learning_rate": 9.506529409215907e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020705, "epoch": 1.1401687216093446, "step": 440}, {"loss": 0.09447284936904907, "token_acc": 0.9660033167495854, "grad_norm": 1.6484375, "learning_rate": 9.486701082587634e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.1531473069435432, "step": 445}, {"loss": 0.11140995025634766, "token_acc": 0.9652247667514843, "grad_norm": 2.796875, "learning_rate": 9.466503720465692e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020698, "epoch": 1.1661258922777418, "step": 450}, {"loss": 0.10376272201538086, "token_acc": 0.9649551752241239, "grad_norm": 4.9375, "learning_rate": 9.445938984102606e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020701, "epoch": 1.1791044776119404, "step": 455}, {"loss": 0.10616805553436279, "token_acc": 0.9671052631578947, "grad_norm": 5.6875, "learning_rate": 9.42500856496779e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020708, "epoch": 1.1920830629461387, "step": 460}, {"loss": 0.10312038660049438, "token_acc": 0.9700404858299595, "grad_norm": 2.046875, "learning_rate": 9.403714184608411e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02071, "epoch": 1.2050616482803373, "step": 465}, {"loss": 0.11268962621688842, "token_acc": 
0.9643146796431468, "grad_norm": 1.6796875, "learning_rate": 9.382057594507805e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020713, "epoch": 1.218040233614536, "step": 470}, {"loss": 0.11029822826385498, "token_acc": 0.961439588688946, "grad_norm": 2.125, "learning_rate": 9.360040575941405e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02071, "epoch": 1.2310188189487346, "step": 475}, {"loss": 0.08803771138191223, "token_acc": 0.9737919737919738, "grad_norm": 2.734375, "learning_rate": 9.337664939830234e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020707, "epoch": 1.2439974042829332, "step": 480}, {"loss": 0.12014460563659668, "token_acc": 0.9600665557404326, "grad_norm": 2.46875, "learning_rate": 9.314932526591956e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020714, "epoch": 1.2569759896171318, "step": 485}, {"loss": 0.061967766284942626, "token_acc": 0.9819227608874281, "grad_norm": 1.2890625, "learning_rate": 9.291845205989495e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020719, "epoch": 1.2699545749513304, "step": 490}, {"loss": 0.0871744453907013, "token_acc": 0.971240755957272, "grad_norm": 3.328125, "learning_rate": 9.26840487697725e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020714, "epoch": 1.282933160285529, "step": 495}, {"loss": 0.11612107753753662, "token_acc": 0.9636062861869313, "grad_norm": 3.53125, "learning_rate": 9.244613467544903e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020724, "epoch": 1.2959117456197276, "step": 500}, {"loss": 0.09390705227851867, "token_acc": 0.9699499165275459, "grad_norm": 2.453125, "learning_rate": 9.220472934558837e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020673, "epoch": 1.308890330953926, "step": 505}, {"loss": 0.09252657294273377, "token_acc": 0.9639865996649917, "grad_norm": 3.546875, "learning_rate": 9.195985263601186e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020679, "epoch": 1.3218689162881245, "step": 510}, {"loss": 0.08763872385025025, "token_acc": 0.9736408566721582, "grad_norm": 1.4765625, "learning_rate": 9.171152468806516e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020688, "epoch": 1.3348475016223231, "step": 515}, {"loss": 0.08335199356079101, "token_acc": 0.9696472518457753, "grad_norm": 1.9609375, "learning_rate": 9.145976592696163e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020699, "epoch": 1.3478260869565217, "step": 520}, {"loss": 0.08465138673782349, "token_acc": 0.9754028837998303, "grad_norm": 1.734375, "learning_rate": 9.120459706010233e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020692, "epoch": 1.3608046722907203, "step": 525}, {"loss": 0.08926347494125367, "token_acc": 0.9690893901420217, "grad_norm": 2.21875, "learning_rate": 9.094603907537276e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020699, "epoch": 1.373783257624919, "step": 530}, {"loss": 0.07581965327262878, "token_acc": 0.9726141078838174, "grad_norm": 2.28125, "learning_rate": 9.06841132394167e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020705, "epoch": 1.3867618429591175, "step": 535}, {"loss": 0.08940108418464661, "token_acc": 0.9730639730639731, "grad_norm": 1.5, "learning_rate": 9.04188410958869e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020709, "epoch": 1.399740428293316, "step": 540}, {"loss": 0.10531409978866577, "token_acc": 0.9656862745098039, "grad_norm": 3.90625, "learning_rate": 9.015024446367315e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020718, "epoch": 1.4127190136275147, "step": 545}, {"loss": 
0.10772655010223389, "token_acc": 0.9670608108108109, "grad_norm": 1.09375, "learning_rate": 8.987834543510765e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020719, "epoch": 1.425697598961713, "step": 550}, {"loss": 0.09822122454643249, "token_acc": 0.9698205546492659, "grad_norm": 0.9921875, "learning_rate": 8.960316637414787e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020733, "epoch": 1.4386761842959117, "step": 555}, {"loss": 0.08714378476142884, "token_acc": 0.966988727858293, "grad_norm": 3.953125, "learning_rate": 8.932472991453712e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020744, "epoch": 1.4516547696301103, "step": 560}, {"loss": 0.1327031970024109, "token_acc": 0.9598689598689598, "grad_norm": 3.6875, "learning_rate": 8.904305895794292e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020746, "epoch": 1.464633354964309, "step": 565}, {"loss": 0.08868688941001893, "token_acc": 0.9679539852095317, "grad_norm": 2.4375, "learning_rate": 8.87581766720732e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020754, "epoch": 1.4776119402985075, "step": 570}, {"loss": 0.07921120524406433, "token_acc": 0.9747145187601958, "grad_norm": 2.296875, "learning_rate": 8.847010648877094e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020753, "epoch": 1.490590525632706, "step": 575}, {"loss": 0.0657312035560608, "token_acc": 0.9798387096774194, "grad_norm": 0.96875, "learning_rate": 8.817887210208668e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020759, "epoch": 1.5035691109669047, "step": 580}, {"loss": 0.06689350008964538, "token_acc": 0.974485596707819, "grad_norm": 1.2265625, "learning_rate": 8.788449746632977e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020758, "epoch": 1.516547696301103, "step": 585}, {"loss": 0.05711461901664734, "token_acc": 0.9784860557768924, "grad_norm": 2.03125, "learning_rate": 8.758700679409804e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02076, "epoch": 1.529526281635302, "step": 590}, {"loss": 0.09383676052093506, "token_acc": 0.9712595097210481, "grad_norm": 1.546875, "learning_rate": 8.72864245542864e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.5425048669695003, "step": 595}, {"loss": 0.08457305431365966, "token_acc": 0.9700404858299595, "grad_norm": 1.140625, "learning_rate": 8.69827754700741e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020755, "epoch": 1.5554834523036989, "step": 600}, {"loss": 0.05290307998657227, "token_acc": 0.9830097087378641, "grad_norm": 1.171875, "learning_rate": 8.667608451689134e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.5684620376378975, "step": 605}, {"loss": 0.0879642128944397, "token_acc": 0.9723577235772358, "grad_norm": 3.734375, "learning_rate": 8.6366376920365e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020761, "epoch": 1.581440622972096, "step": 610}, {"loss": 0.05958070755004883, "token_acc": 0.9812244897959184, "grad_norm": 1.1875, "learning_rate": 8.605367815424372e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020757, "epoch": 1.5944192083062947, "step": 615}, {"loss": 0.15476706027984619, "token_acc": 0.9589958158995816, "grad_norm": 14.3125, "learning_rate": 8.573801393830281e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020758, "epoch": 1.607397793640493, "step": 620}, {"loss": 0.08144987821578979, "token_acc": 0.977796052631579, "grad_norm": 1.484375, "learning_rate": 8.54194102362286e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020753, "epoch": 1.6203763789746919, 
"step": 625}, {"loss": 0.12072479724884033, "token_acc": 0.9649122807017544, "grad_norm": 2.375, "learning_rate": 8.509789325348306e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020756, "epoch": 1.6333549643088903, "step": 630}, {"loss": 0.057410746812820435, "token_acc": 0.9805510534846029, "grad_norm": 1.171875, "learning_rate": 8.477348943514828e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020755, "epoch": 1.6463335496430889, "step": 635}, {"loss": 0.08364247083663941, "token_acc": 0.9754500818330606, "grad_norm": 2.890625, "learning_rate": 8.444622546375136e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020763, "epoch": 1.6593121349772875, "step": 640}, {"loss": 0.0733809769153595, "token_acc": 0.9763458401305057, "grad_norm": 1.4296875, "learning_rate": 8.411612825706977e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020764, "epoch": 1.672290720311486, "step": 645}, {"loss": 0.0672804057598114, "token_acc": 0.9800498753117207, "grad_norm": 2.375, "learning_rate": 8.378322496591727e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020765, "epoch": 1.6852693056456847, "step": 650}, {"loss": 0.09220197200775146, "token_acc": 0.971900826446281, "grad_norm": 3.328125, "learning_rate": 8.34475429719108e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020763, "epoch": 1.698247890979883, "step": 655}, {"loss": 0.09654330611228942, "token_acc": 0.9690301548492257, "grad_norm": 2.109375, "learning_rate": 8.310910988521836e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020766, "epoch": 1.7112264763140819, "step": 660}, {"loss": 0.09199822545051575, "token_acc": 0.9744027303754266, "grad_norm": 1.609375, "learning_rate": 8.276795354228786e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020764, "epoch": 1.7242050616482802, "step": 665}, {"loss": 0.08355655074119568, "token_acc": 0.9717607973421927, "grad_norm": 1.9609375, "learning_rate": 8.242410200355772e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020765, "epoch": 1.737183646982479, "step": 670}, {"loss": 0.07680343985557556, "token_acc": 0.9784768211920529, "grad_norm": 1.3671875, "learning_rate": 8.207758355114883e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020769, "epoch": 1.7501622323166774, "step": 675}, {"loss": 0.05557375550270081, "token_acc": 0.983150800336984, "grad_norm": 1.1328125, "learning_rate": 8.172842668653829e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02077, "epoch": 1.763140817650876, "step": 680}, {"loss": 0.0882513701915741, "token_acc": 0.9752770673486786, "grad_norm": 3.46875, "learning_rate": 8.137666012821514e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020769, "epoch": 1.7761194029850746, "step": 685}, {"loss": 0.10723648071289063, "token_acc": 0.9713584288052373, "grad_norm": 2.96875, "learning_rate": 8.102231280931825e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020778, "epoch": 1.7890979883192732, "step": 690}, {"loss": 0.07498295307159424, "token_acc": 0.9777043765483072, "grad_norm": 1.5078125, "learning_rate": 8.066541387525656e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020784, "epoch": 1.8020765736534718, "step": 695}, {"loss": 0.06255126595497132, "token_acc": 0.9788617886178862, "grad_norm": 2.84375, "learning_rate": 8.030599268131178e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020786, "epoch": 1.8150551589876702, "step": 700}, {"loss": 0.09357861876487732, "token_acc": 0.9719703215169002, "grad_norm": 2.65625, "learning_rate": 7.994407879022396e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02079, 
"epoch": 1.828033744321869, "step": 705}, {"loss": 0.04933435618877411, "token_acc": 0.984375, "grad_norm": 2.078125, "learning_rate": 7.957970196975989e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020784, "epoch": 1.8410123296560674, "step": 710}, {"loss": 0.04773304164409638, "token_acc": 0.9835390946502057, "grad_norm": 1.2421875, "learning_rate": 7.921289219026465e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020777, "epoch": 1.853990914990266, "step": 715}, {"loss": 0.10059411525726318, "token_acc": 0.9690117252931323, "grad_norm": 2.40625, "learning_rate": 7.884367962219659e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020775, "epoch": 1.8669695003244646, "step": 720}, {"loss": 0.0753164291381836, "token_acc": 0.9772727272727273, "grad_norm": 2.65625, "learning_rate": 7.847209463364573e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020777, "epoch": 1.8799480856586632, "step": 725}, {"loss": 0.06520425677299499, "token_acc": 0.9786359901396877, "grad_norm": 3.03125, "learning_rate": 7.809816778783594e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020783, "epoch": 1.8929266709928618, "step": 730}, {"loss": 0.08142446279525757, "token_acc": 0.9693454846727423, "grad_norm": 1.9609375, "learning_rate": 7.772192984061113e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020794, "epoch": 1.9059052563270602, "step": 735}, {"loss": 0.07234652638435364, "token_acc": 0.9788617886178862, "grad_norm": 1.03125, "learning_rate": 7.734341173790551e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020798, "epoch": 1.918883841661259, "step": 740}, {"loss": 0.07108655571937561, "token_acc": 0.975103734439834, "grad_norm": 2.03125, "learning_rate": 7.696264461319831e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020799, "epoch": 1.9318624269954574, "step": 745}, {"loss": 0.06093553900718689, "token_acc": 0.9778325123152709, "grad_norm": 2.125, "learning_rate": 7.657965978495299e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020808, "epoch": 1.9448410123296562, "step": 750}, {"loss": 0.06928490400314331, "token_acc": 0.9735973597359736, "grad_norm": 1.28125, "learning_rate": 7.619448875404121e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020817, "epoch": 1.9578195976638546, "step": 755}, {"loss": 0.07247484922409057, "token_acc": 0.9713322091062394, "grad_norm": 1.7734375, "learning_rate": 7.580716320115195e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020819, "epoch": 1.9707981829980532, "step": 760}, {"loss": 0.07819901704788208, "token_acc": 0.9759136212624585, "grad_norm": 0.9375, "learning_rate": 7.541771498418575e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02082, "epoch": 1.9837767683322518, "step": 765}, {"loss": 0.06084737181663513, "token_acc": 0.978369384359401, "grad_norm": 1.5859375, "learning_rate": 7.502617613563423e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020823, "epoch": 1.9967553536664504, "step": 770}, {"loss": 0.04811320304870605, "token_acc": 0.980980980980981, "grad_norm": 1.625, "learning_rate": 7.463257885994552e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020841, "epoch": 2.007787151200519, "step": 775}, {"loss": 0.045494526624679565, "token_acc": 0.985233798195242, "grad_norm": 0.427734375, "learning_rate": 7.423695553087537e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020848, "epoch": 2.0207657365347176, "step": 780}, {"loss": 0.05818610191345215, "token_acc": 0.9794168096054888, "grad_norm": 5.78125, "learning_rate": 7.383933868882438e-05, "memory(GiB)": 118.47, 
"train_speed(iter/s)": 0.020848, "epoch": 2.0337443218689164, "step": 785}, {"loss": 0.037669533491134645, "token_acc": 0.9900083263946711, "grad_norm": 0.890625, "learning_rate": 7.343976103816148e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020846, "epoch": 2.046722907203115, "step": 790}, {"loss": 0.09385978579521179, "token_acc": 0.9735973597359736, "grad_norm": 2.9375, "learning_rate": 7.303825544453404e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020851, "epoch": 2.0597014925373136, "step": 795}, {"loss": 0.05340880751609802, "token_acc": 0.985353946297803, "grad_norm": 1.3359375, "learning_rate": 7.263485493216458e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020854, "epoch": 2.072680077871512, "step": 800}, {"loss": 0.05104266405105591, "token_acc": 0.9834574028122415, "grad_norm": 1.0625, "learning_rate": 7.222959268113452e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020863, "epoch": 2.0856586632057104, "step": 805}, {"loss": 0.052119660377502444, "token_acc": 0.9854721549636803, "grad_norm": 1.828125, "learning_rate": 7.182250202465509e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020866, "epoch": 2.098637248539909, "step": 810}, {"loss": 0.06113920211791992, "token_acc": 0.9816360601001669, "grad_norm": 1.96875, "learning_rate": 7.14136164463256e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02087, "epoch": 2.1116158338741076, "step": 815}, {"loss": 0.06530060768127441, "token_acc": 0.9808811305070657, "grad_norm": 1.3125, "learning_rate": 7.100296957737948e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020872, "epoch": 2.1245944192083064, "step": 820}, {"loss": 0.07137447595596313, "token_acc": 0.9771801140994295, "grad_norm": 2.0, "learning_rate": 7.059059519391794e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020874, "epoch": 2.1375730045425048, "step": 825}, {"loss": 0.045847800374031064, "token_acc": 0.9824267782426779, "grad_norm": 1.4609375, "learning_rate": 7.017652721413196e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.1505515898767036, "step": 830}, {"loss": 0.04792321026325226, "token_acc": 0.9875415282392026, "grad_norm": 1.2265625, "learning_rate": 6.976079969551246e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.163530175210902, "step": 835}, {"loss": 0.04856709241867065, "token_acc": 0.9852459016393442, "grad_norm": 1.125, "learning_rate": 6.934344683204901e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020879, "epoch": 2.176508760545101, "step": 840}, {"loss": 0.05035427212715149, "token_acc": 0.9832635983263598, "grad_norm": 1.4453125, "learning_rate": 6.892450295141736e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020881, "epoch": 2.189487345879299, "step": 845}, {"loss": 0.03542698323726654, "token_acc": 0.9884488448844885, "grad_norm": 1.5703125, "learning_rate": 6.8504002512156e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02088, "epoch": 2.2024659312134975, "step": 850}, {"loss": 0.03674182891845703, "token_acc": 0.9859271523178808, "grad_norm": 1.8515625, "learning_rate": 6.808198010083185e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020878, "epoch": 2.2154445165476964, "step": 855}, {"loss": 0.0341266006231308, "token_acc": 0.9859504132231405, "grad_norm": 0.408203125, "learning_rate": 6.765847042919542e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020878, "epoch": 2.2284231018818947, "step": 860}, {"loss": 0.047522234916687014, "token_acc": 0.9858333333333333, "grad_norm": 0.8828125, "learning_rate": 
6.723350833132595e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020875, "epoch": 2.2414016872160936, "step": 865}, {"loss": 0.04341578781604767, "token_acc": 0.9887005649717514, "grad_norm": 0.6953125, "learning_rate": 6.680712876076605e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020876, "epoch": 2.254380272550292, "step": 870}, {"loss": 0.04864707887172699, "token_acc": 0.9837000814995925, "grad_norm": 1.25, "learning_rate": 6.637936678764684e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020885, "epoch": 2.2673588578844908, "step": 875}, {"loss": 0.02881385087966919, "token_acc": 0.9928684627575277, "grad_norm": 0.7421875, "learning_rate": 6.595025759580342e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02089, "epoch": 2.280337443218689, "step": 880}, {"loss": 0.03752845525741577, "token_acc": 0.9892650701899257, "grad_norm": 1.0625, "learning_rate": 6.551983647988089e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020893, "epoch": 2.293316028552888, "step": 885}, {"loss": 0.04644352495670319, "token_acc": 0.9877149877149877, "grad_norm": 0.70703125, "learning_rate": 6.508813884243138e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.3062946138870863, "step": 890}, {"loss": 0.04913321435451508, "token_acc": 0.9795221843003413, "grad_norm": 2.21875, "learning_rate": 6.465520019100217e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.3192731992212847, "step": 895}, {"loss": 0.03812339305877686, "token_acc": 0.986938775510204, "grad_norm": 1.59375, "learning_rate": 6.422105613521512e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020893, "epoch": 2.3322517845554835, "step": 900}, {"loss": 0.04143716990947723, "token_acc": 0.9850746268656716, "grad_norm": 1.5625, "learning_rate": 6.378574238383777e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020891, "epoch": 2.345230369889682, "step": 905}, {"loss": 0.05560382604598999, "token_acc": 0.9827018121911038, "grad_norm": 1.6875, "learning_rate": 6.334929474184621e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.3582089552238807, "step": 910}, {"loss": 0.044425755739212036, "token_acc": 0.9833610648918469, "grad_norm": 1.7890625, "learning_rate": 6.29117491074802e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020901, "epoch": 2.371187540558079, "step": 915}, {"loss": 0.050447821617126465, "token_acc": 0.9852700490998363, "grad_norm": 0.8359375, "learning_rate": 6.247314146929038e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020905, "epoch": 2.3841661258922775, "step": 920}, {"loss": 0.022633786499500274, "token_acc": 0.9933388842631141, "grad_norm": 0.5, "learning_rate": 6.203350790317825e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.3971447112264763, "step": 925}, {"loss": 0.04340067207813263, "token_acc": 0.9845024469820555, "grad_norm": 1.2109375, "learning_rate": 6.159288456942884e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.4101232965606747, "step": 930}, {"loss": 0.055285471677780154, "token_acc": 0.9786008230452675, "grad_norm": 1.2890625, "learning_rate": 6.115130770973658e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020909, "epoch": 2.4231018818948735, "step": 935}, {"loss": 0.04441152215003967, "token_acc": 0.9883040935672515, "grad_norm": 1.25, "learning_rate": 6.0708813644224304e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.436080467229072, "step": 940}, {"loss": 0.05162730813026428, "token_acc": 0.9811629811629812, 
"grad_norm": 1.6015625, "learning_rate": 6.026543876845586e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020914, "epoch": 2.4490590525632707, "step": 945}, {"loss": 0.055637669563293454, "token_acc": 0.9836199836199836, "grad_norm": 1.0546875, "learning_rate": 5.9821219550442654e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.462037637897469, "step": 950}, {"loss": 0.0420777291059494, "token_acc": 0.9869494290375204, "grad_norm": 5.96875, "learning_rate": 5.937619252764405e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020911, "epoch": 2.475016223231668, "step": 955}, {"loss": 0.04367494285106659, "token_acc": 0.9859154929577465, "grad_norm": 1.1875, "learning_rate": 5.89303943039621e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020917, "epoch": 2.4879948085658663, "step": 960}, {"loss": 0.03488150537014008, "token_acc": 0.9899328859060402, "grad_norm": 0.416015625, "learning_rate": 5.848386154673091e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.5009733939000647, "step": 965}, {"loss": 0.023914022743701933, "token_acc": 0.9895414320193081, "grad_norm": 0.73828125, "learning_rate": 5.8036630983700644e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.5139519792342635, "step": 970}, {"loss": 0.05280606150627136, "token_acc": 0.9850374064837906, "grad_norm": 1.359375, "learning_rate": 5.7588739400016676e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020918, "epoch": 2.526930564568462, "step": 975}, {"loss": 0.04982930719852448, "token_acc": 0.9808333333333333, "grad_norm": 1.1796875, "learning_rate": 5.7140223635194034e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020914, "epoch": 2.5399091499026607, "step": 980}, {"loss": 0.03910036683082581, "token_acc": 0.9869067103109657, "grad_norm": 1.234375, "learning_rate": 5.669112058008712e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02092, "epoch": 2.552887735236859, "step": 985}, {"loss": 0.0397420197725296, "token_acc": 0.9874055415617129, "grad_norm": 0.5234375, "learning_rate": 5.624146717385563e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020921, "epoch": 2.565866320571058, "step": 990}, {"loss": 0.04704872071743012, "token_acc": 0.9854486661277284, "grad_norm": 1.625, "learning_rate": 5.579130040092613e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020919, "epoch": 2.5788449059052563, "step": 995}, {"loss": 0.025661391019821168, "token_acc": 0.9893179950698439, "grad_norm": 0.640625, "learning_rate": 5.53406572879501e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020923, "epoch": 2.591823491239455, "step": 1000}, {"loss": 0.04738643765449524, "token_acc": 0.9881756756756757, "grad_norm": 1.453125, "learning_rate": 5.488957490075846e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020894, "epoch": 2.6048020765736535, "step": 1005}, {"loss": 0.033944371342659, "token_acc": 0.9890664423885618, "grad_norm": 1.7421875, "learning_rate": 5.443809034131283e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020898, "epoch": 2.617780661907852, "step": 1010}, {"loss": 0.049440020322799684, "token_acc": 0.9846938775510204, "grad_norm": 2.09375, "learning_rate": 5.3986240744653916e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020897, "epoch": 2.6307592472420507, "step": 1015}, {"loss": 0.03849257528781891, "token_acc": 0.9902439024390244, "grad_norm": 0.8046875, "learning_rate": 5.3534063275847065e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.0209, "epoch": 2.643737832576249, "step": 1020}, {"loss": 
0.026245176792144775, "token_acc": 0.9879614767255217, "grad_norm": 0.734375, "learning_rate": 5.3081595126925444e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.0209, "epoch": 2.656716417910448, "step": 1025}, {"loss": 0.031640535593032836, "token_acc": 0.9879227053140096, "grad_norm": 1.2421875, "learning_rate": 5.2628873513830914e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020902, "epoch": 2.6696950032446463, "step": 1030}, {"loss": 0.0378518670797348, "token_acc": 0.9883040935672515, "grad_norm": 1.09375, "learning_rate": 5.2175935673353035e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.6826735885788446, "step": 1035}, {"loss": 0.022336561977863312, "token_acc": 0.9894991922455574, "grad_norm": 1.1328125, "learning_rate": 5.1722818860066245e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.6956521739130435, "step": 1040}, {"loss": 0.02416083961725235, "token_acc": 0.9918300653594772, "grad_norm": 0.8671875, "learning_rate": 5.126956034326573e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020901, "epoch": 2.7086307592472423, "step": 1045}, {"loss": 0.03095039427280426, "token_acc": 0.9918500407497962, "grad_norm": 1.3125, "learning_rate": 5.0816197403901876e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.7216093445814407, "step": 1050}, {"loss": 0.033490809798240664, "token_acc": 0.9894907033144705, "grad_norm": 1.59375, "learning_rate": 5.036276733151392e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020909, "epoch": 2.734587929915639, "step": 1055}, {"loss": 0.03164244592189789, "token_acc": 0.9892915980230642, "grad_norm": 1.2421875, "learning_rate": 4.990930742116287e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.747566515249838, "step": 1060}, {"loss": 0.03462514877319336, "token_acc": 0.9894050529747351, "grad_norm": 0.466796875, "learning_rate": 4.945585497036396e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020908, "epoch": 2.7605451005840362, "step": 1065}, {"loss": 0.026110872626304626, "token_acc": 0.9886086248982913, "grad_norm": 0.640625, "learning_rate": 4.900244727601882e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020906, "epoch": 2.773523685918235, "step": 1070}, {"loss": 0.025288578867912293, "token_acc": 0.9925496688741722, "grad_norm": 0.470703125, "learning_rate": 4.854912163134788e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.7865022712524334, "step": 1075}, {"loss": 0.017070406675338747, "token_acc": 0.9924874791318865, "grad_norm": 0.63671875, "learning_rate": 4.809591532282282e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020904, "epoch": 2.799480856586632, "step": 1080}, {"loss": 0.0368634819984436, "token_acc": 0.9875724937862469, "grad_norm": 2.03125, "learning_rate": 4.76428656270999e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020902, "epoch": 2.8124594419208306, "step": 1085}, {"loss": 0.04762330949306488, "token_acc": 0.9873843566021867, "grad_norm": 1.5546875, "learning_rate": 4.719000980795375e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.8254380272550295, "step": 1090}, {"loss": 0.03311816155910492, "token_acc": 0.990139687756779, "grad_norm": 1.28125, "learning_rate": 4.6737385113212516e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020899, "epoch": 2.838416612589228, "step": 1095}, {"loss": 0.06551986932754517, "token_acc": 0.9826589595375722, "grad_norm": 1.078125, "learning_rate": 4.628502877169413e-05, "memory(GiB)": 118.47, 
"train_speed(iter/s)": 0.020901, "epoch": 2.851395197923426, "step": 1100}, {"loss": 0.042209646105766295, "token_acc": 0.9876543209876543, "grad_norm": 1.203125, "learning_rate": 4.583297799014416e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020908, "epoch": 2.864373783257625, "step": 1105}, {"loss": 0.031407338380813596, "token_acc": 0.988255033557047, "grad_norm": 0.412109375, "learning_rate": 4.538126995017566e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.02091, "epoch": 2.8773523685918234, "step": 1110}, {"loss": 0.05650691390037536, "token_acc": 0.9760726072607261, "grad_norm": 1.46875, "learning_rate": 4.492994180521077e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020913, "epoch": 2.8903309539260222, "step": 1115}, {"loss": 0.03302388489246368, "token_acc": 0.988663967611336, "grad_norm": 0.875, "learning_rate": 4.447903067742496e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020919, "epoch": 2.9033095392602206, "step": 1120}, {"loss": 0.043651098012924196, "token_acc": 0.986960065199674, "grad_norm": 1.0546875, "learning_rate": 4.402857365469364e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020924, "epoch": 2.916288124594419, "step": 1125}, {"loss": 0.0316637396812439, "token_acc": 0.9907016060862215, "grad_norm": 0.490234375, "learning_rate": 4.357860778754157e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020924, "epoch": 2.929266709928618, "step": 1130}, {"loss": 0.032447722554206845, "token_acc": 0.9874266554903605, "grad_norm": 0.79296875, "learning_rate": 4.3129170086095564e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020925, "epoch": 2.942245295262816, "step": 1135}, {"loss": 0.026056140661239624, "token_acc": 0.989344262295082, "grad_norm": 1.296875, "learning_rate": 4.2680297517040285e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020921, "epoch": 2.955223880597015, "step": 1140}, {"loss": 0.024412302672863005, "token_acc": 0.9907795473595976, "grad_norm": 1.1015625, "learning_rate": 4.223202700057765e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020915, "epoch": 2.9682024659312134, "step": 1145}, {"loss": 0.01829095035791397, "token_acc": 0.991876523151909, "grad_norm": 1.546875, "learning_rate": 4.178439540739026e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020917, "epoch": 2.981181051265412, "step": 1150}, {"loss": 0.024611912667751312, "token_acc": 0.9909015715467329, "grad_norm": 0.3984375, "learning_rate": 4.133743955560857e-05, "memory(GiB)": 118.47, "train_speed(iter/s)": 0.020918, "epoch": 2.9941596365996106, "step": 1155}, {"loss": 0.020613734424114228, "token_acc": 0.9933206106870229, "grad_norm": 0.4765625, "learning_rate": 4.0891196207782726e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 3.0051914341336796, "step": 1160}, {"loss": 0.021938510239124298, "token_acc": 0.9925681255161024, "grad_norm": 1.0703125, "learning_rate": 4.0445702067858734e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 3.018170019467878, "step": 1165}, {"loss": 0.010195696353912353, "token_acc": 0.9966832504145937, "grad_norm": 0.56640625, "learning_rate": 4.0000993778159444e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.0311486048020764, "step": 1170}, {"loss": 0.010604656487703323, "token_acc": 0.9966216216216216, "grad_norm": 0.224609375, "learning_rate": 3.955710791637085e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.044127190136275, "step": 1175}, {"loss": 0.021324822306632997, "token_acc": 0.9967239967239967, 
"grad_norm": 0.38671875, "learning_rate": 3.9114080992533445e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.0571057754704736, "step": 1180}, {"loss": 0.01656488925218582, "token_acc": 0.9967186218211649, "grad_norm": 0.09814453125, "learning_rate": 3.86719494460392e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.0700843608046724, "step": 1185}, {"loss": 0.019095677137374877, "token_acc": 0.9923534409515717, "grad_norm": 0.51171875, "learning_rate": 3.82307496426345e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.0830629461388708, "step": 1190}, {"loss": 0.009211428463459015, "token_acc": 0.9958949096880131, "grad_norm": 0.61328125, "learning_rate": 3.7790517871428895e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 3.0960415314730696, "step": 1195}, {"loss": 0.026492729783058167, "token_acc": 0.9933665008291874, "grad_norm": 1.578125, "learning_rate": 3.7351290341910394e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020932, "epoch": 3.109020116807268, "step": 1200}, {"loss": 0.010792587697505952, "token_acc": 0.9958333333333333, "grad_norm": 0.66015625, "learning_rate": 3.691310318096719e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.1219987021414664, "step": 1205}, {"loss": 0.01512954980134964, "token_acc": 0.9942763695829926, "grad_norm": 1.3515625, "learning_rate": 3.647599242991607e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.134977287475665, "step": 1210}, {"loss": 0.014608421921730041, "token_acc": 0.9949748743718593, "grad_norm": 0.921875, "learning_rate": 3.603999404153814e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.1479558728098636, "step": 1215}, {"loss": 0.010922598093748093, "token_acc": 0.996694214876033, "grad_norm": 0.6875, "learning_rate": 3.560514387712156e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020931, "epoch": 3.1609344581440624, "step": 1220}, {"loss": 0.025766593217849732, "token_acc": 0.992671009771987, "grad_norm": 0.828125, "learning_rate": 3.517147770351199e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020933, "epoch": 3.1739130434782608, "step": 1225}, {"loss": 0.01006721258163452, "token_acc": 0.995928338762215, "grad_norm": 0.486328125, "learning_rate": 3.473903119017071e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.1868916288124596, "step": 1230}, {"loss": 0.02418680191040039, "token_acc": 0.9934264585045193, "grad_norm": 1.3203125, "learning_rate": 3.430783990624071e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.199870214146658, "step": 1235}, {"loss": 0.020583410561084748, "token_acc": 0.9924874791318865, "grad_norm": 0.33984375, "learning_rate": 3.3877939317621224e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020941, "epoch": 3.2128487994808568, "step": 1240}, {"loss": 0.03402402400970459, "token_acc": 0.9892915980230642, "grad_norm": 1.359375, "learning_rate": 3.344936478405051e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02094, "epoch": 3.225827384815055, "step": 1245}, {"loss": 0.011006417870521545, "token_acc": 0.9958711808422791, "grad_norm": 0.29296875, "learning_rate": 3.302215155619752e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020942, "epoch": 3.2388059701492535, "step": 1250}, {"loss": 0.015421295166015625, "token_acc": 0.9951060358890701, "grad_norm": 0.64453125, "learning_rate": 3.259633477276251e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020945, "epoch": 
3.2517845554834524, "step": 1255}, {"loss": 0.007819992303848267, "token_acc": 0.9975267930750206, "grad_norm": 0.349609375, "learning_rate": 3.217194945758678e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020948, "epoch": 3.2647631408176507, "step": 1260}, {"loss": 0.0145877406001091, "token_acc": 0.9958779884583677, "grad_norm": 0.50390625, "learning_rate": 3.174903051677208e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.2777417261518496, "step": 1265}, {"loss": 0.019092942774295806, "token_acc": 0.992462311557789, "grad_norm": 0.84375, "learning_rate": 3.132761273580942e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.290720311486048, "step": 1270}, {"loss": 0.019785448908805847, "token_acc": 0.9926047658175843, "grad_norm": 0.90625, "learning_rate": 3.0907730776717894e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020952, "epoch": 3.3036988968202468, "step": 1275}, {"loss": 0.009959718585014344, "token_acc": 0.9958982772764561, "grad_norm": 0.26953125, "learning_rate": 3.0489419175193866e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020951, "epoch": 3.316677482154445, "step": 1280}, {"loss": 0.007476700097322464, "token_acc": 0.9991568296795953, "grad_norm": 0.703125, "learning_rate": 3.0072712337770197e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.329656067488644, "step": 1285}, {"loss": 0.004897921904921531, "token_acc": 0.9975308641975309, "grad_norm": 0.515625, "learning_rate": 2.96576445389864e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.3426346528228423, "step": 1290}, {"loss": 0.016269049048423766, "token_acc": 0.99581589958159, "grad_norm": 0.65234375, "learning_rate": 2.924424991856947e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.3556132381570407, "step": 1295}, {"loss": 0.016820259392261505, "token_acc": 0.9916597164303587, "grad_norm": 1.421875, "learning_rate": 2.8832562478625825e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020953, "epoch": 3.3685918234912395, "step": 1300}, {"loss": 0.016832391917705535, "token_acc": 0.9942291838417148, "grad_norm": 0.458984375, "learning_rate": 2.84226160808447e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020954, "epoch": 3.381570408825438, "step": 1305}, {"loss": 0.009478585422039032, "token_acc": 0.9959083469721768, "grad_norm": 0.49609375, "learning_rate": 2.8014444443712918e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.3945489941596367, "step": 1310}, {"loss": 0.014081387221813202, "token_acc": 0.9958711808422791, "grad_norm": 0.68359375, "learning_rate": 2.7608081139741482e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.407527579493835, "step": 1315}, {"loss": 0.02275892049074173, "token_acc": 0.9916597164303587, "grad_norm": 0.79296875, "learning_rate": 2.7203559592704315e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.420506164828034, "step": 1320}, {"loss": 0.011998300999403, "token_acc": 0.9959415584415584, "grad_norm": 1.0546875, "learning_rate": 2.6800913074888983e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.4334847501622323, "step": 1325}, {"loss": 0.016256968677043914, "token_acc": 0.9941569282136895, "grad_norm": 0.6015625, "learning_rate": 2.6400174704360148e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020955, "epoch": 3.446463335496431, "step": 1330}, {"loss": 0.007994767278432846, "token_acc": 0.9983361064891847, "grad_norm": 0.130859375, 
"learning_rate": 2.600137744223552e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020952, "epoch": 3.4594419208306295, "step": 1335}, {"loss": 0.018077975511550902, "token_acc": 0.9942434210526315, "grad_norm": 0.8203125, "learning_rate": 2.5604554089974692e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020956, "epoch": 3.472420506164828, "step": 1340}, {"loss": 0.008464429527521133, "token_acc": 0.9975470155355682, "grad_norm": 0.435546875, "learning_rate": 2.520973728668137e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020958, "epoch": 3.4853990914990267, "step": 1345}, {"loss": 0.011597975343465804, "token_acc": 0.9959016393442623, "grad_norm": 0.392578125, "learning_rate": 2.4816959506418584e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02096, "epoch": 3.498377676833225, "step": 1350}, {"loss": 0.013302412629127503, "token_acc": 0.9957912457912458, "grad_norm": 0.32421875, "learning_rate": 2.442625305553779e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.511356262167424, "step": 1355}, {"loss": 0.010544488579034806, "token_acc": 0.9975267930750206, "grad_norm": 0.58984375, "learning_rate": 2.4037650070021624e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02096, "epoch": 3.5243348475016223, "step": 1360}, {"loss": 0.011504454165697097, "token_acc": 0.9958915365653246, "grad_norm": 0.7109375, "learning_rate": 2.3651182512840603e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.5373134328358207, "step": 1365}, {"loss": 0.016246941685676575, "token_acc": 0.9942622950819672, "grad_norm": 0.7109375, "learning_rate": 2.3266882171324272e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020961, "epoch": 3.5502920181700195, "step": 1370}, {"loss": 0.005815648287534714, "token_acc": 0.9983766233766234, "grad_norm": 0.6640625, "learning_rate": 2.2884780654546584e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.5632706035042183, "step": 1375}, {"loss": 0.009234168380498887, "token_acc": 0.9975728155339806, "grad_norm": 0.451171875, "learning_rate": 2.2504909390725975e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.5762491888384167, "step": 1380}, {"loss": 0.004879490658640862, "token_acc": 0.9983525535420099, "grad_norm": 0.265625, "learning_rate": 2.2127299624640508e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.589227774172615, "step": 1385}, {"loss": 0.007252056151628494, "token_acc": 0.9967871485943776, "grad_norm": 0.007049560546875, "learning_rate": 2.1751982415057798e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.602206359506814, "step": 1390}, {"loss": 0.006177261471748352, "token_acc": 0.9975609756097561, "grad_norm": 0.08349609375, "learning_rate": 2.1378988632180523e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020959, "epoch": 3.6151849448410123, "step": 1395}, {"loss": 0.034820249676704405, "token_acc": 0.992462311557789, "grad_norm": 1.1796875, "learning_rate": 2.100834895510729e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020959, "epoch": 3.628163530175211, "step": 1400}, {"loss": 0.022104519605636596, "token_acc": 0.9942528735632183, "grad_norm": 0.83984375, "learning_rate": 2.064009386930915e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.6411421155094095, "step": 1405}, {"loss": 0.009468887746334077, "token_acc": 0.9950041631973355, "grad_norm": 0.89453125, "learning_rate": 2.0274253664122316e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 
3.654120700843608, "step": 1410}, {"loss": 0.01620858609676361, "token_acc": 0.9950083194675541, "grad_norm": 0.484375, "learning_rate": 1.991085843025667e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.6670992861778067, "step": 1415}, {"loss": 0.013029226660728454, "token_acc": 0.9934102141680395, "grad_norm": 0.671875, "learning_rate": 1.9549938057320893e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020967, "epoch": 3.680077871512005, "step": 1420}, {"loss": 0.01667369157075882, "token_acc": 0.9950657894736842, "grad_norm": 0.08447265625, "learning_rate": 1.9191522231363913e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.693056456846204, "step": 1425}, {"loss": 0.010355699062347411, "token_acc": 0.9950940310711366, "grad_norm": 0.20703125, "learning_rate": 1.883564043243332e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.7060350421804023, "step": 1430}, {"loss": 0.011119000613689423, "token_acc": 0.9934959349593496, "grad_norm": 1.8125, "learning_rate": 1.8482321932150467e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020968, "epoch": 3.719013627514601, "step": 1435}, {"loss": 0.007310156524181366, "token_acc": 0.9958368026644463, "grad_norm": 0.08642578125, "learning_rate": 1.8131595791302974e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.7319922128487995, "step": 1440}, {"loss": 0.005807420611381531, "token_acc": 0.9983179142136249, "grad_norm": 0.2470703125, "learning_rate": 1.7783490857454356e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020965, "epoch": 3.7449707981829983, "step": 1445}, {"loss": 0.012262566387653351, "token_acc": 0.9958949096880131, "grad_norm": 0.890625, "learning_rate": 1.743803576257136e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.7579493835171967, "step": 1450}, {"loss": 0.004218662902712822, "token_acc": 0.9991843393148451, "grad_norm": 0.55859375, "learning_rate": 1.7095258920668866e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020966, "epoch": 3.770927968851395, "step": 1455}, {"loss": 0.0074700690805912014, "token_acc": 0.9991735537190083, "grad_norm": 0.21875, "learning_rate": 1.6755188525472927e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.783906554185594, "step": 1460}, {"loss": 0.013314425945281982, "token_acc": 0.9966527196652719, "grad_norm": 0.271484375, "learning_rate": 1.641785254810172e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.7968851395197922, "step": 1465}, {"loss": 0.008113903552293777, "token_acc": 0.9983150800336984, "grad_norm": 0.54296875, "learning_rate": 1.608327873476488e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.809863724853991, "step": 1470}, {"loss": 0.008881012350320816, "token_acc": 0.9950289975144988, "grad_norm": 0.3828125, "learning_rate": 1.5751494604481427e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.8228423101881894, "step": 1475}, {"loss": 0.00938112884759903, "token_acc": 0.9967611336032388, "grad_norm": 0.76171875, "learning_rate": 1.542252744681627e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.835820895522388, "step": 1480}, {"loss": 0.01805244982242584, "token_acc": 0.9935794542536116, "grad_norm": 0.6015625, "learning_rate": 1.5096404319635532e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020963, "epoch": 3.8487994808565866, "step": 1485}, {"loss": 0.009089920669794083, "token_acc": 0.997504159733777, "grad_norm": 0.58984375, 
"learning_rate": 1.4773152046881178e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.8617780661907855, "step": 1490}, {"loss": 0.011903107911348344, "token_acc": 0.9974979149291076, "grad_norm": 0.1494140625, "learning_rate": 1.4452797216364527e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020962, "epoch": 3.874756651524984, "step": 1495}, {"loss": 0.008489710092544556, "token_acc": 0.9967266775777414, "grad_norm": 1.28125, "learning_rate": 1.4135366177579579e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020964, "epoch": 3.887735236859182, "step": 1500}, {"loss": 0.00586327463388443, "token_acc": 0.9983818770226537, "grad_norm": 0.07421875, "learning_rate": 1.3820885039535563e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020943, "epoch": 3.900713822193381, "step": 1505}, {"loss": 0.0139107346534729, "token_acc": 0.9917149958574979, "grad_norm": 0.05029296875, "learning_rate": 1.3509379668609545e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020943, "epoch": 3.9136924075275794, "step": 1510}, {"loss": 0.007857438921928406, "token_acc": 0.9975409836065574, "grad_norm": 0.6640625, "learning_rate": 1.3200875686418906e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.9266709928617782, "step": 1515}, {"loss": 0.014272244274616241, "token_acc": 0.99597747385358, "grad_norm": 0.296875, "learning_rate": 1.289539846771387e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020937, "epoch": 3.9396495781959766, "step": 1520}, {"loss": 0.011531826108694077, "token_acc": 0.9974811083123426, "grad_norm": 0.10888671875, "learning_rate": 1.2592973138290459e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020935, "epoch": 3.952628163530175, "step": 1525}, {"loss": 0.005492057651281357, "token_acc": 0.998371335504886, "grad_norm": 0.412109375, "learning_rate": 1.229362457292388e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 3.965606748864374, "step": 1530}, {"loss": 0.011842301487922669, "token_acc": 0.997545008183306, "grad_norm": 0.81640625, "learning_rate": 1.1997377393322461e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020928, "epoch": 3.9785853341985726, "step": 1535}, {"loss": 0.009899959713220597, "token_acc": 0.9958609271523179, "grad_norm": 0.302734375, "learning_rate": 1.1704255966102628e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020926, "epoch": 3.991563919532771, "step": 1540}, {"loss": 0.005935117974877358, "token_acc": 0.9990328820116054, "grad_norm": 0.130859375, "learning_rate": 1.1414284400784642e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020936, "epoch": 4.00259571706684, "step": 1545}, {"loss": 0.001661362312734127, "token_acc": 1.0, "grad_norm": 0.05517578125, "learning_rate": 1.1127486547809534e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 4.015574302401038, "step": 1550}, {"loss": 0.0012222844175994395, "token_acc": 1.0, "grad_norm": 0.09423828125, "learning_rate": 1.0843885996577502e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02093, "epoch": 4.028552887735237, "step": 1555}, {"loss": 0.0020720928907394407, "token_acc": 1.0, "grad_norm": 0.1494140625, "learning_rate": 1.0563506073507523e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020927, "epoch": 4.041531473069435, "step": 1560}, {"loss": 0.003256053477525711, "token_acc": 0.9984, "grad_norm": 0.404296875, "learning_rate": 1.0286369840118858e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020925, "epoch": 4.054510058403634, "step": 1565}, {"loss": 
0.0011426467448472978, "token_acc": 1.0, "grad_norm": 0.373046875, "learning_rate": 1.0012500091134148e-05, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020924, "epoch": 4.067488643737833, "step": 1570}, {"loss": 0.0045836478471755985, "token_acc": 0.9991809991809992, "grad_norm": 0.59375, "learning_rate": 9.741919352604506e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020917, "epoch": 4.080467229072031, "step": 1575}, {"loss": 0.0034479245543479918, "token_acc": 0.9991749174917491, "grad_norm": 0.46484375, "learning_rate": 9.47464988005678e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020909, "epoch": 4.09344581440623, "step": 1580}, {"loss": 0.007430323213338852, "token_acc": 0.9983235540653814, "grad_norm": 0.875, "learning_rate": 9.210713656663023e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020896, "epoch": 4.106424399740428, "step": 1585}, {"loss": 0.002633281424641609, "token_acc": 0.9991922455573505, "grad_norm": 0.083984375, "learning_rate": 8.950132391432275e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020886, "epoch": 4.119402985074627, "step": 1590}, {"loss": 0.006271419674158096, "token_acc": 0.9975409836065574, "grad_norm": 0.73828125, "learning_rate": 8.692927517425092e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020881, "epoch": 4.132381570408826, "step": 1595}, {"loss": 0.0012338412925601006, "token_acc": 1.0, "grad_norm": 0.05322265625, "learning_rate": 8.439120189990535e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.145360155743024, "step": 1600}, {"loss": 0.003544669598340988, "token_acc": 0.9991889699918897, "grad_norm": 0.0537109375, "learning_rate": 8.188731285026219e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.158338741077222, "step": 1605}, {"loss": 0.003528529778122902, "token_acc": 0.9991961414790996, "grad_norm": 0.1943359375, "learning_rate": 7.94178139726121e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020868, "epoch": 4.171317326411421, "step": 1610}, {"loss": 0.003579762205481529, "token_acc": 0.998326359832636, "grad_norm": 0.07568359375, "learning_rate": 7.698290838562062e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020864, "epoch": 4.18429591174562, "step": 1615}, {"loss": 0.004392120614647865, "token_acc": 0.9991742361684558, "grad_norm": 0.0517578125, "learning_rate": 7.4582796362622105e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.197274497079818, "step": 1620}, {"loss": 0.0038825396448373796, "token_acc": 0.9991532599491956, "grad_norm": 0.40625, "learning_rate": 7.2217675315146395e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.210253082414017, "step": 1625}, {"loss": 0.0008973809890449047, "token_acc": 1.0, "grad_norm": 0.1630859375, "learning_rate": 6.9887739776681945e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, "epoch": 4.223231667748215, "step": 1630}, {"loss": 0.006377002596855164, "token_acc": 0.9991735537190083, "grad_norm": 0.1748046875, "learning_rate": 6.759318138667542e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.236210253082414, "step": 1635}, {"loss": 0.004097684845328331, "token_acc": 0.9991721854304636, "grad_norm": 0.224609375, "learning_rate": 6.533418887476833e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.249188838416613, "step": 1640}, {"loss": 0.0022493613883852958, "token_acc": 1.0, "grad_norm": 0.294921875, "learning_rate": 6.3110948045274896e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, 
"epoch": 4.262167423750811, "step": 1645}, {"loss": 0.0012288546189665795, "token_acc": 1.0, "grad_norm": 0.0172119140625, "learning_rate": 6.092364176189857e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.2751460090850095, "step": 1650}, {"loss": 0.0014213725924491882, "token_acc": 0.9991850040749797, "grad_norm": 0.142578125, "learning_rate": 5.877244993269209e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020866, "epoch": 4.288124594419208, "step": 1655}, {"loss": 0.000797341950237751, "token_acc": 1.0, "grad_norm": 0.0224609375, "learning_rate": 5.665754949525914e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.301103179753407, "step": 1660}, {"loss": 0.003249218687415123, "token_acc": 0.9991789819376026, "grad_norm": 0.28125, "learning_rate": 5.457911440220154e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020867, "epoch": 4.3140817650876055, "step": 1665}, {"loss": 0.001958874985575676, "token_acc": 1.0, "grad_norm": 0.30859375, "learning_rate": 5.253731560681141e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020868, "epoch": 4.327060350421804, "step": 1670}, {"loss": 0.0055487435311079025, "token_acc": 0.9991755976916735, "grad_norm": 0.2490234375, "learning_rate": 5.0532321049010065e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.340038935756002, "step": 1675}, {"loss": 0.006631163507699966, "token_acc": 0.9983566146261298, "grad_norm": 0.228515625, "learning_rate": 4.856429564153453e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020869, "epoch": 4.353017521090202, "step": 1680}, {"loss": 0.0010011550970375539, "token_acc": 1.0, "grad_norm": 0.291015625, "learning_rate": 4.6633401256373886e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020871, "epoch": 4.3659961064244, "step": 1685}, {"loss": 0.0011706288903951644, "token_acc": 1.0, "grad_norm": 0.0400390625, "learning_rate": 4.473979671145445e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020872, "epoch": 4.378974691758598, "step": 1690}, {"loss": 0.0005182858556509018, "token_acc": 1.0, "grad_norm": 0.04248046875, "learning_rate": 4.288363775757737e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020876, "epoch": 4.391953277092797, "step": 1695}, {"loss": 0.005208690091967583, "token_acc": 0.9991701244813278, "grad_norm": 0.494140625, "learning_rate": 4.106507706560792e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.404931862426995, "step": 1700}, {"loss": 0.0008781940676271915, "token_acc": 1.0, "grad_norm": 0.1796875, "learning_rate": 3.9284264213917735e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020877, "epoch": 4.417910447761194, "step": 1705}, {"loss": 0.010159996896982193, "token_acc": 0.9975247524752475, "grad_norm": 0.67578125, "learning_rate": 3.7541345676082475e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.430889033095393, "step": 1710}, {"loss": 0.000509789539501071, "token_acc": 1.0, "grad_norm": 0.0169677734375, "learning_rate": 3.5836464808833625e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020882, "epoch": 4.443867618429591, "step": 1715}, {"loss": 0.0018530094996094704, "token_acc": 1.0, "grad_norm": 0.51953125, "learning_rate": 3.4169761840267824e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020878, "epoch": 4.4568462037637895, "step": 1720}, {"loss": 0.0017950931563973428, "token_acc": 1.0, "grad_norm": 0.36328125, "learning_rate": 3.2541373858312628e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020874, "epoch": 
4.469824789097988, "step": 1725}, {"loss": 0.001635444164276123, "token_acc": 0.9991803278688525, "grad_norm": 0.083984375, "learning_rate": 3.0951434799450784e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020874, "epoch": 4.482803374432187, "step": 1730}, {"loss": 0.005332779884338379, "token_acc": 0.998330550918197, "grad_norm": 0.28515625, "learning_rate": 2.9400075437704287e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02087, "epoch": 4.4957819597663855, "step": 1735}, {"loss": 0.008843471109867097, "token_acc": 0.9975227085053675, "grad_norm": 0.126953125, "learning_rate": 2.7887423373877673e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020865, "epoch": 4.508760545100584, "step": 1740}, {"loss": 0.0013348528183996678, "token_acc": 1.0, "grad_norm": 0.10693359375, "learning_rate": 2.6413603025062792e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.521739130434782, "step": 1745}, {"loss": 0.0018230179324746133, "token_acc": 0.9991673605328892, "grad_norm": 0.027587890625, "learning_rate": 2.4978735614405668e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02086, "epoch": 4.5347177157689815, "step": 1750}, {"loss": 0.0033437285572290422, "token_acc": 0.9991610738255033, "grad_norm": 0.28515625, "learning_rate": 2.3582939161135353e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020857, "epoch": 4.54769630110318, "step": 1755}, {"loss": 0.001527964323759079, "token_acc": 1.0, "grad_norm": 0.1796875, "learning_rate": 2.222632847085715e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020854, "epoch": 4.560674886437378, "step": 1760}, {"loss": 0.006669136136770249, "token_acc": 0.9991783073130649, "grad_norm": 0.005401611328125, "learning_rate": 2.090901512610949e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020851, "epoch": 4.573653471771577, "step": 1765}, {"loss": 0.004949812591075897, "token_acc": 0.9983471074380166, "grad_norm": 0.00946044921875, "learning_rate": 1.9631107477185995e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020843, "epoch": 4.586632057105776, "step": 1770}, {"loss": 0.0032926712185144423, "token_acc": 0.9983416252072969, "grad_norm": 0.0693359375, "learning_rate": 1.8392710633224064e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020848, "epoch": 4.599610642439974, "step": 1775}, {"loss": 0.0015199462883174419, "token_acc": 0.9991830065359477, "grad_norm": 0.0262451171875, "learning_rate": 1.7193926453559272e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020849, "epoch": 4.612589227774173, "step": 1780}, {"loss": 0.005044208467006683, "token_acc": 0.9983646770237122, "grad_norm": 0.42578125, "learning_rate": 1.6034853539347029e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020846, "epoch": 4.625567813108371, "step": 1785}, {"loss": 0.0026885632425546644, "token_acc": 0.9983753046303818, "grad_norm": 0.045166015625, "learning_rate": 1.491558722545311e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020847, "epoch": 4.638546398442569, "step": 1790}, {"loss": 0.001484157331287861, "token_acc": 1.0, "grad_norm": 0.06884765625, "learning_rate": 1.3836219572611886e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020847, "epoch": 4.651524983776769, "step": 1795}, {"loss": 0.0019209956750273704, "token_acc": 0.9991708126036484, "grad_norm": 0.298828125, "learning_rate": 1.2796839359854374e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020842, "epoch": 4.664503569110967, "step": 1800}, {"loss": 0.003924115002155304, "token_acc": 0.9983498349834984, "grad_norm": 0.07861328125, 
"learning_rate": 1.1797532077206187e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020835, "epoch": 4.6774821544451655, "step": 1805}, {"loss": 0.003906024992465973, "token_acc": 0.9991721854304636, "grad_norm": 0.240234375, "learning_rate": 1.0838379918655495e-06, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02083, "epoch": 4.690460739779364, "step": 1810}, {"loss": 0.002249754220247269, "token_acc": 0.9983593109105825, "grad_norm": 0.0361328125, "learning_rate": 9.919461775392935e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020827, "epoch": 4.703439325113562, "step": 1815}, {"loss": 0.0005314781330525876, "token_acc": 1.0, "grad_norm": 0.02880859375, "learning_rate": 9.040853229322577e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020824, "epoch": 4.7164179104477615, "step": 1820}, {"loss": 0.0008674643933773041, "token_acc": 1.0, "grad_norm": 0.06494140625, "learning_rate": 8.202626546845171e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02082, "epoch": 4.72939649578196, "step": 1825}, {"loss": 0.0025060476735234262, "token_acc": 0.9991341991341991, "grad_norm": 0.04150390625, "learning_rate": 7.404850672914243e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020813, "epoch": 4.742375081116158, "step": 1830}, {"loss": 0.0018194744363427163, "token_acc": 0.9991902834008097, "grad_norm": 0.007293701171875, "learning_rate": 6.647591225365235e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020817, "epoch": 4.755353666450357, "step": 1835}, {"loss": 0.0006959903985261917, "token_acc": 1.0, "grad_norm": 0.037109375, "learning_rate": 5.930910489518493e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.768332251784555, "step": 1840}, {"loss": 0.0052085991948843, "token_acc": 0.9983606557377049, "grad_norm": 0.5078125, "learning_rate": 5.254867413056197e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.781310837118754, "step": 1845}, {"loss": 0.005099215358495712, "token_acc": 0.9991666666666666, "grad_norm": 0.5625, "learning_rate": 4.619517601173573e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.794289422452953, "step": 1850}, {"loss": 0.007123344391584396, "token_acc": 0.9983593109105825, "grad_norm": 0.04150390625, "learning_rate": 4.0249133120059404e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020807, "epoch": 4.807268007787151, "step": 1855}, {"loss": 0.0038691792637109757, "token_acc": 0.9983539094650206, "grad_norm": 0.0634765625, "learning_rate": 3.47110345232976e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.820246593121349, "step": 1860}, {"loss": 0.001323844026774168, "token_acc": 1.0, "grad_norm": 0.345703125, "learning_rate": 2.958133573540467e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.833225178455549, "step": 1865}, {"loss": 0.00251984428614378, "token_acc": 1.0, "grad_norm": 0.119140625, "learning_rate": 2.486045867905573e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.846203763789747, "step": 1870}, {"loss": 0.0011555158533155918, "token_acc": 1.0, "grad_norm": 0.09375, "learning_rate": 2.0548791650944477e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.859182349123945, "step": 1875}, {"loss": 0.002080231159925461, "token_acc": 0.9991680532445923, "grad_norm": 0.1435546875, "learning_rate": 1.6646689289844254e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020807, "epoch": 4.872160934458144, "step": 1880}, {"loss": 0.0027224332094192504, "token_acc": 
0.998351195383347, "grad_norm": 0.50390625, "learning_rate": 1.315447254744029e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020808, "epoch": 4.885139519792343, "step": 1885}, {"loss": 0.0020501598715782165, "token_acc": 0.9991539763113367, "grad_norm": 0.326171875, "learning_rate": 1.0072428661929701e-07, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020809, "epoch": 4.898118105126541, "step": 1890}, {"loss": 0.0011606317013502121, "token_acc": 1.0, "grad_norm": 0.03515625, "learning_rate": 7.400811134395946e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020811, "epoch": 4.91109669046074, "step": 1895}, {"loss": 0.0006340864114463329, "token_acc": 1.0, "grad_norm": 0.08935546875, "learning_rate": 5.139839707958283e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.02081, "epoch": 4.924075275794938, "step": 1900}, {"loss": 0.00033661627676337955, "token_acc": 1.0, "grad_norm": 0.1357421875, "learning_rate": 3.289700349698999e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020812, "epoch": 4.937053861129137, "step": 1905}, {"loss": 0.006044203415513039, "token_acc": 0.9984051036682615, "grad_norm": 0.40625, "learning_rate": 1.8505452353656526e-08, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020814, "epoch": 4.950032446463336, "step": 1910}, {"loss": 0.00470067672431469, "token_acc": 0.9983525535420099, "grad_norm": 0.0079345703125, "learning_rate": 8.22492736854974e-09, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020815, "epoch": 4.963011031797534, "step": 1915}, {"loss": 0.007594909518957138, "token_acc": 0.9975186104218362, "grad_norm": 0.76171875, "learning_rate": 2.056274124773161e-09, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020814, "epoch": 4.975989617131733, "step": 1920}, {"loss": 0.007923491299152374, "token_acc": 0.9975062344139651, "grad_norm": 0.0086669921875, "learning_rate": 0.0, "memory(GiB)": 118.49, "train_speed(iter/s)": 0.020815, "epoch": 4.988968202465931, "step": 1925}, {"train_runtime": 92570.8058, "train_samples_per_second": 1.332, "train_steps_per_second": 0.021, "total_flos": 3.70560890425983e+18, "train_loss": 0.07658964898806161, "epoch": 4.988968202465931, "step": 1925}], "memory": 118.4921875}
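A minimal sketch for working with the log above, assuming it is saved as a JSON file in this folder (the path "trainer_state.json" and the output file name are placeholders, not confirmed by the upload): it loads the per-step records, skips the final run-summary entry, and plots loss and learning rate against the training step. Only the fields visible in the records above ("loss", "learning_rate", "step") are relied on; the top-level key holding the record list is located generically rather than assumed.

import json
import matplotlib.pyplot as plt

# Hypothetical path; point this at the JSON log file from this upload.
LOG_PATH = "trainer_state.json"

with open(LOG_PATH) as f:
    state = json.load(f)

# The per-step records sit in a list under some top-level key; find it
# without assuming the key name (a sibling scalar like "memory" is skipped).
history = next(
    v for v in state.values()
    if isinstance(v, list) and v and isinstance(v[0], dict)
)

# Per-step records carry "loss" and "step"; the last entry is a run summary
# ("train_runtime", "train_loss", ...) and is filtered out here.
records = [r for r in history if "loss" in r and "step" in r]

steps = [r["step"] for r in records]
loss = [r["loss"] for r in records]
lr = [r["learning_rate"] for r in records]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax1.plot(steps, loss)
ax1.set_ylabel("loss")
ax2.plot(steps, lr)
ax2.set_ylabel("learning_rate")
ax2.set_xlabel("step")
fig.tight_layout()
fig.savefig("training_curves.png")  # placeholder output name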