{ "best_metric": 3.1630301475524902, "best_model_checkpoint": "checkpoints/it5-large/checkpoint-78926", "epoch": 14.251715420729505, "eval_steps": 4154, "global_step": 78926, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14987360057782592, "eval_g2l_cer": 67.2645, "eval_g2l_gen_len": 4.5733, "eval_g2l_rouge1": 27.1595, "eval_g2l_rouge2": 15.5941, "eval_g2l_rougeL": 26.9535, "eval_g2l_rougeLsum": 26.9576, "eval_l2ex_cer": 130.3597, "eval_l2ex_gen_len": 47.8171, "eval_l2ex_rouge1": 22.1003, "eval_l2ex_rouge2": 9.5437, "eval_l2ex_rougeL": 20.2017, "eval_l2ex_rougeLsum": 19.2847, "eval_l2g_cer": 106.9099, "eval_l2g_gen_len": 30.346, "eval_l2g_rouge1": 27.2135, "eval_l2g_rouge2": 14.1149, "eval_l2g_rougeL": 25.3922, "eval_l2g_rougeLsum": 25.3986, "eval_loss": 3.804034948348999, "eval_runtime": 310.7233, "eval_samples_per_second": 31.929, "eval_steps_per_second": 0.502, "step": 830 }, { "epoch": 0.15005417118093176, "grad_norm": 129.91856384277344, "learning_rate": 6.004335260115606e-05, "loss": 4.6945, "step": 831 }, { "epoch": 0.3001083423618635, "grad_norm": 190.0506591796875, "learning_rate": 0.00012008670520231212, "loss": 3.8417, "step": 1662 }, { "epoch": 0.45016251354279524, "grad_norm": 208.42779541015625, "learning_rate": 0.0001801300578034682, "loss": 3.712, "step": 2493 }, { "epoch": 0.600216684723727, "grad_norm": 293.423583984375, "learning_rate": 0.00024017341040462423, "loss": 3.6763, "step": 3324 }, { "epoch": 0.7500902853015529, "eval_g2l_cer": 53.5059, "eval_g2l_gen_len": 3.5087, "eval_g2l_rouge1": 37.5417, "eval_g2l_rouge2": 29.1384, "eval_g2l_rougeL": 37.463, "eval_g2l_rougeLsum": 37.4022, "eval_l2ex_cer": 102.0708, "eval_l2ex_gen_len": 25.9866, "eval_l2ex_rouge1": 26.7853, "eval_l2ex_rouge2": 12.9071, "eval_l2ex_rougeL": 24.0724, "eval_l2ex_rougeLsum": 24.0445, "eval_l2g_cer": 86.4648, "eval_l2g_gen_len": 15.0081, "eval_l2g_rouge1": 30.7776, "eval_l2g_rouge2": 18.1789, "eval_l2g_rougeL": 29.1675, "eval_l2g_rougeLsum": 29.2136, "eval_loss": 3.5662293434143066, "eval_runtime": 296.355, "eval_samples_per_second": 33.477, "eval_steps_per_second": 0.526, "step": 4154 }, { "epoch": 0.7502708559046587, "grad_norm": 500.86932373046875, "learning_rate": 0.0002999999989317841, "loss": 3.6694, "step": 4155 }, { "epoch": 0.9003250270855905, "grad_norm": 378.1305236816406, "learning_rate": 0.00029991745158829114, "loss": 3.641, "step": 4986 }, { "epoch": 1.0503791982665223, "grad_norm": 309.59503173828125, "learning_rate": 0.0002996710832786393, "loss": 3.6169, "step": 5817 }, { "epoch": 1.200433369447454, "grad_norm": 364.1076965332031, "learning_rate": 0.00029926116366930635, "loss": 3.5732, "step": 6648 }, { "epoch": 1.3504875406283858, "grad_norm": 402.1815490722656, "learning_rate": 0.00029868814144453027, "loss": 3.5547, "step": 7479 }, { "epoch": 1.500180570603106, "eval_g2l_cer": 49.6927, "eval_g2l_gen_len": 4.4371, "eval_g2l_rouge1": 42.6629, "eval_g2l_rouge2": 32.7133, "eval_g2l_rougeL": 42.5078, "eval_g2l_rougeLsum": 42.487, "eval_l2ex_cer": 85.9069, "eval_l2ex_gen_len": 27.3155, "eval_l2ex_rouge1": 31.0018, "eval_l2ex_rouge2": 14.7792, "eval_l2ex_rougeL": 27.5259, "eval_l2ex_rougeLsum": 27.5817, "eval_l2g_cer": 76.6936, "eval_l2g_gen_len": 19.6286, "eval_l2g_rouge1": 38.3213, "eval_l2g_rouge2": 24.5167, "eval_l2g_rougeL": 36.1971, "eval_l2g_rougeLsum": 36.2764, "eval_loss": 3.4418885707855225, "eval_runtime": 302.3378, "eval_samples_per_second": 32.814, "eval_steps_per_second": 0.516, "step": 8308 }, { "epoch": 1.5005417118093174, "grad_norm": 380.1324157714844, "learning_rate": 0.0002979526438151941, "loss": 3.533, "step": 8310 }, { "epoch": 1.6505958829902492, "grad_norm": 358.8219909667969, "learning_rate": 0.0002970554758323025, "loss": 3.5167, "step": 9141 }, { "epoch": 1.800650054171181, "grad_norm": 323.29583740234375, "learning_rate": 0.0002959976195057994, "loss": 3.5114, "step": 9972 }, { "epoch": 1.9507042253521125, "grad_norm": 283.41790771484375, "learning_rate": 0.00029478023272969345, "loss": 3.4955, "step": 10803 }, { "epoch": 2.1007583965330445, "grad_norm": 326.611572265625, "learning_rate": 0.0002934046480146657, "loss": 3.4415, "step": 11634 }, { "epoch": 2.2502708559046587, "eval_g2l_cer": 49.446, "eval_g2l_gen_len": 4.5047, "eval_g2l_rouge1": 43.91, "eval_g2l_rouge2": 33.662, "eval_g2l_rougeL": 43.778, "eval_g2l_rougeLsum": 43.7883, "eval_l2ex_cer": 86.4808, "eval_l2ex_gen_len": 30.4358, "eval_l2ex_rouge1": 30.7974, "eval_l2ex_rouge2": 14.4266, "eval_l2ex_rougeL": 27.2278, "eval_l2ex_rougeLsum": 27.3219, "eval_l2g_cer": 84.1694, "eval_l2g_gen_len": 24.5493, "eval_l2g_rouge1": 38.1858, "eval_l2g_rouge2": 25.1392, "eval_l2g_rougeL": 36.1473, "eval_l2g_rougeLsum": 36.1987, "eval_loss": 3.374830961227417, "eval_runtime": 306.8842, "eval_samples_per_second": 32.328, "eval_steps_per_second": 0.508, "step": 12462 }, { "epoch": 2.250812567713976, "grad_norm": 368.7247314453125, "learning_rate": 0.0002918723710295482, "loss": 3.424, "step": 12465 }, { "epoch": 2.400866738894908, "grad_norm": 468.8291320800781, "learning_rate": 0.00029018507895326985, "loss": 3.4228, "step": 13296 }, { "epoch": 2.5509209100758397, "grad_norm": 266.9640808105469, "learning_rate": 0.00028834461863907226, "loss": 3.4152, "step": 14127 }, { "epoch": 2.7009750812567717, "grad_norm": 237.8248748779297, "learning_rate": 0.0002863530045930063, "loss": 3.4187, "step": 14958 }, { "epoch": 2.851029252437703, "grad_norm": 371.8949890136719, "learning_rate": 0.00028421241676892145, "loss": 3.3965, "step": 15789 }, { "epoch": 3.0003611412062114, "eval_g2l_cer": 48.3881, "eval_g2l_gen_len": 4.4987, "eval_g2l_rouge1": 44.8641, "eval_g2l_rouge2": 34.4032, "eval_g2l_rougeL": 44.6885, "eval_g2l_rougeLsum": 44.711, "eval_l2ex_cer": 89.7331, "eval_l2ex_gen_len": 30.1634, "eval_l2ex_rouge1": 30.5096, "eval_l2ex_rouge2": 14.191, "eval_l2ex_rougeL": 26.9741, "eval_l2ex_rougeLsum": 27.0965, "eval_l2g_cer": 81.1389, "eval_l2g_gen_len": 23.1439, "eval_l2g_rouge1": 39.3934, "eval_l2g_rouge2": 25.9597, "eval_l2g_rougeL": 37.0903, "eval_l2g_rougeLsum": 37.1641, "eval_loss": 3.325451612472534, "eval_runtime": 304.8315, "eval_samples_per_second": 32.546, "eval_steps_per_second": 0.512, "step": 16616 }, { "epoch": 3.001083423618635, "grad_norm": 298.2984619140625, "learning_rate": 0.0002819251981823618, "loss": 3.3917, "step": 16620 }, { "epoch": 3.151137594799567, "grad_norm": 318.9201354980469, "learning_rate": 0.00027949385234597935, "loss": 3.3406, "step": 17451 }, { "epoch": 3.3011917659804983, "grad_norm": 286.3103942871094, "learning_rate": 0.0002769210405292737, "loss": 3.3328, "step": 18282 }, { "epoch": 3.4512459371614304, "grad_norm": 352.0337829589844, "learning_rate": 0.0002742095788456554, "loss": 3.3333, "step": 19113 }, { "epoch": 3.601300108342362, "grad_norm": 210.31089782714844, "learning_rate": 0.0002713624351700232, "loss": 3.3251, "step": 19944 }, { "epoch": 3.7504514265077646, "eval_g2l_cer": 47.0881, "eval_g2l_gen_len": 4.2162, "eval_g2l_rouge1": 45.6068, "eval_g2l_rouge2": 34.9617, "eval_g2l_rougeL": 45.369, "eval_g2l_rougeLsum": 45.3992, "eval_l2ex_cer": 87.2057, "eval_l2ex_gen_len": 25.9743, "eval_l2ex_rouge1": 32.044, "eval_l2ex_rouge2": 15.4907, "eval_l2ex_rougeL": 28.2386, "eval_l2ex_rougeLsum": 28.3364, "eval_l2g_cer": 81.5351, "eval_l2g_gen_len": 20.6293, "eval_l2g_rouge1": 39.7177, "eval_l2g_rouge2": 26.6455, "eval_l2g_rougeL": 37.5652, "eval_l2g_rougeLsum": 37.5978, "eval_loss": 3.267240047454834, "eval_runtime": 303.1609, "eval_samples_per_second": 32.725, "eval_steps_per_second": 0.515, "step": 20770 }, { "epoch": 3.7513542795232935, "grad_norm": 240.68614196777344, "learning_rate": 0.0002683827258902275, "loss": 3.3215, "step": 20775 }, { "epoch": 3.9014084507042255, "grad_norm": 203.9857177734375, "learning_rate": 0.0002652737124959771, "loss": 3.311, "step": 21606 }, { "epoch": 4.0514626218851575, "grad_norm": 182.24288940429688, "learning_rate": 0.00026203879800892194, "loss": 3.2913, "step": 22437 }, { "epoch": 4.201516793066089, "grad_norm": 300.8165283203125, "learning_rate": 0.00025868152325781986, "loss": 3.2554, "step": 23268 }, { "epoch": 4.351570964247021, "grad_norm": 230.65304565429688, "learning_rate": 0.00025520556300286454, "loss": 3.2636, "step": 24099 }, { "epoch": 4.500541711809317, "eval_g2l_cer": 46.1491, "eval_g2l_gen_len": 3.8852, "eval_g2l_rouge1": 45.7526, "eval_g2l_rouge2": 35.7656, "eval_g2l_rougeL": 45.6115, "eval_g2l_rougeLsum": 45.6146, "eval_l2ex_cer": 82.9457, "eval_l2ex_gen_len": 17.2662, "eval_l2ex_rouge1": 31.8116, "eval_l2ex_rouge2": 16.1098, "eval_l2ex_rougeL": 28.581, "eval_l2ex_rougeLsum": 28.6511, "eval_l2g_cer": 69.3136, "eval_l2g_gen_len": 12.4397, "eval_l2g_rouge1": 39.1199, "eval_l2g_rouge2": 26.5659, "eval_l2g_rougeL": 37.2837, "eval_l2g_rougeLsum": 37.3241, "eval_loss": 3.275228261947632, "eval_runtime": 264.7397, "eval_samples_per_second": 37.475, "eval_steps_per_second": 0.589, "step": 24924 }, { "epoch": 4.501625135427952, "grad_norm": 278.4391174316406, "learning_rate": 0.00025161472191341646, "loss": 3.2605, "step": 24930 }, { "epoch": 4.651679306608884, "grad_norm": 185.57086181640625, "learning_rate": 0.00024791293040353913, "loss": 3.2372, "step": 25761 }, { "epoch": 4.801733477789816, "grad_norm": 199.41229248046875, "learning_rate": 0.0002441042403299005, "loss": 3.2549, "step": 26592 }, { "epoch": 4.951787648970748, "grad_norm": 111.84984588623047, "learning_rate": 0.000240192820556746, "loss": 3.2505, "step": 27423 }, { "epoch": 5.101841820151679, "grad_norm": 231.06040954589844, "learning_rate": 0.0002361829523928005, "loss": 3.2162, "step": 28254 }, { "epoch": 5.250631997110871, "eval_g2l_cer": 45.3929, "eval_g2l_gen_len": 4.2856, "eval_g2l_rouge1": 47.7028, "eval_g2l_rouge2": 36.7159, "eval_g2l_rougeL": 47.5076, "eval_g2l_rougeLsum": 47.5342, "eval_l2ex_cer": 84.2916, "eval_l2ex_gen_len": 27.5293, "eval_l2ex_rouge1": 32.3354, "eval_l2ex_rouge2": 15.6055, "eval_l2ex_rougeL": 28.4133, "eval_l2ex_rougeLsum": 28.5758, "eval_l2g_cer": 74.8295, "eval_l2g_gen_len": 19.749, "eval_l2g_rouge1": 40.6449, "eval_l2g_rouge2": 27.1184, "eval_l2g_rougeL": 38.3335, "eval_l2g_rougeLsum": 38.3945, "eval_loss": 3.2382800579071045, "eval_runtime": 300.872, "eval_samples_per_second": 32.974, "eval_steps_per_second": 0.518, "step": 29078 }, { "epoch": 5.251895991332611, "grad_norm": 298.8398132324219, "learning_rate": 0.00023207902490509098, "loss": 3.187, "step": 29085 }, { "epoch": 5.401950162513542, "grad_norm": 126.86690521240234, "learning_rate": 0.0002278855301148215, "loss": 3.2012, "step": 29916 }, { "epoch": 5.552004333694475, "grad_norm": 221.00885009765625, "learning_rate": 0.0002236070580805574, "loss": 3.1999, "step": 30747 }, { "epoch": 5.702058504875406, "grad_norm": 193.86273193359375, "learning_rate": 0.00021924829187410153, "loss": 3.1942, "step": 31578 }, { "epoch": 5.852112676056338, "grad_norm": 126.05673217773438, "learning_rate": 0.00021481400245456104, "loss": 3.1947, "step": 32409 }, { "epoch": 6.000722282412423, "eval_g2l_cer": 45.9412, "eval_g2l_gen_len": 4.4229, "eval_g2l_rouge1": 47.5003, "eval_g2l_rouge2": 36.6595, "eval_g2l_rougeL": 47.3175, "eval_g2l_rougeLsum": 47.3017, "eval_l2ex_cer": 82.4504, "eval_l2ex_gen_len": 23.3741, "eval_l2ex_rouge1": 32.8857, "eval_l2ex_rouge2": 15.6166, "eval_l2ex_rougeL": 28.7672, "eval_l2ex_rougeLsum": 28.8746, "eval_l2g_cer": 73.7451, "eval_l2g_gen_len": 18.5067, "eval_l2g_rouge1": 40.8866, "eval_l2g_rouge2": 27.3687, "eval_l2g_rougeL": 38.5521, "eval_l2g_rougeLsum": 38.621, "eval_loss": 3.2279489040374756, "eval_runtime": 300.5167, "eval_samples_per_second": 33.013, "eval_steps_per_second": 0.519, "step": 33232 }, { "epoch": 6.00216684723727, "grad_norm": 254.65907287597656, "learning_rate": 0.00021030904344621589, "loss": 3.1923, "step": 33240 }, { "epoch": 6.152221018418201, "grad_norm": 228.19200134277344, "learning_rate": 0.0002057383458259045, "loss": 3.1351, "step": 34071 }, { "epoch": 6.302275189599134, "grad_norm": 305.9356689453125, "learning_rate": 0.00020110691252574222, "loss": 3.1421, "step": 34902 }, { "epoch": 6.452329360780065, "grad_norm": 190.717041015625, "learning_rate": 0.00019641981295707994, "loss": 3.1515, "step": 35733 }, { "epoch": 6.602383531960997, "grad_norm": 241.10513305664062, "learning_rate": 0.00019168217746169658, "loss": 3.1506, "step": 36564 }, { "epoch": 6.750812567713976, "eval_g2l_cer": 45.0204, "eval_g2l_gen_len": 4.0829, "eval_g2l_rouge1": 47.6328, "eval_g2l_rouge2": 37.0338, "eval_g2l_rougeL": 47.5319, "eval_g2l_rougeLsum": 47.5196, "eval_l2ex_cer": 84.416, "eval_l2ex_gen_len": 23.2662, "eval_l2ex_rouge1": 33.1718, "eval_l2ex_rouge2": 16.167, "eval_l2ex_rougeL": 29.1263, "eval_l2ex_rougeLsum": 29.2504, "eval_l2g_cer": 75.3622, "eval_l2g_gen_len": 17.9792, "eval_l2g_rouge1": 40.4989, "eval_l2g_rouge2": 27.2808, "eval_l2g_rougeL": 38.3025, "eval_l2g_rougeLsum": 38.3215, "eval_loss": 3.1947431564331055, "eval_runtime": 297.6683, "eval_samples_per_second": 33.329, "eval_steps_per_second": 0.524, "step": 37386 }, { "epoch": 6.752437703141928, "grad_norm": 194.801513671875, "learning_rate": 0.0001868991916962991, "loss": 3.1481, "step": 37395 }, { "epoch": 6.902491874322861, "grad_norm": 133.14971923828125, "learning_rate": 0.00018207609095647728, "loss": 3.1368, "step": 38226 }, { "epoch": 7.052546045503792, "grad_norm": 179.32647705078125, "learning_rate": 0.00017721815444632445, "loss": 3.1199, "step": 39057 }, { "epoch": 7.202600216684724, "grad_norm": 218.9005889892578, "learning_rate": 0.00017233069949999837, "loss": 3.094, "step": 39888 }, { "epoch": 7.352654387865655, "grad_norm": 215.17083740234375, "learning_rate": 0.00016741907576154572, "loss": 3.0896, "step": 40719 }, { "epoch": 7.500902853015529, "eval_g2l_cer": 44.5361, "eval_g2l_gen_len": 4.0809, "eval_g2l_rouge1": 48.0404, "eval_g2l_rouge2": 37.3411, "eval_g2l_rougeL": 47.8907, "eval_g2l_rougeLsum": 47.867, "eval_l2ex_cer": 83.3722, "eval_l2ex_gen_len": 21.9188, "eval_l2ex_rouge1": 33.2159, "eval_l2ex_rouge2": 16.5159, "eval_l2ex_rougeL": 29.1348, "eval_l2ex_rougeLsum": 29.2304, "eval_l2g_cer": 72.9959, "eval_l2g_gen_len": 15.519, "eval_l2g_rouge1": 40.681, "eval_l2g_rouge2": 27.6769, "eval_l2g_rougeL": 38.6264, "eval_l2g_rougeLsum": 38.6627, "eval_loss": 3.1981189250946045, "eval_runtime": 291.8194, "eval_samples_per_second": 33.997, "eval_steps_per_second": 0.535, "step": 41540 }, { "epoch": 7.502708559046587, "grad_norm": 153.0230712890625, "learning_rate": 0.00016248865932936134, "loss": 3.0927, "step": 41550 }, { "epoch": 7.6527627302275185, "grad_norm": 212.92391967773438, "learning_rate": 0.0001575448468716914, "loss": 3.0974, "step": 42381 }, { "epoch": 7.802816901408451, "grad_norm": 186.11282348632812, "learning_rate": 0.00015259304971962191, "loss": 3.09, "step": 43212 }, { "epoch": 7.9528710725893825, "grad_norm": 107.77149200439453, "learning_rate": 0.00014763868794401698, "loss": 3.0957, "step": 44043 }, { "epoch": 8.102925243770315, "grad_norm": 111.45164489746094, "learning_rate": 0.00014268718442289166, "loss": 3.0703, "step": 44874 }, { "epoch": 8.250993138317082, "eval_g2l_cer": 44.1066, "eval_g2l_gen_len": 3.9695, "eval_g2l_rouge1": 48.1237, "eval_g2l_rouge2": 37.5462, "eval_g2l_rougeL": 48.0143, "eval_g2l_rougeLsum": 48.0057, "eval_l2ex_cer": 83.1439, "eval_l2ex_gen_len": 22.295, "eval_l2ex_rouge1": 33.8654, "eval_l2ex_rouge2": 16.5697, "eval_l2ex_rougeL": 29.7053, "eval_l2ex_rougeLsum": 29.8195, "eval_l2g_cer": 71.4647, "eval_l2g_gen_len": 15.6419, "eval_l2g_rouge1": 41.0845, "eval_l2g_rouge2": 27.5338, "eval_l2g_rougeL": 38.8182, "eval_l2g_rougeLsum": 38.8839, "eval_loss": 3.19246506690979, "eval_runtime": 290.3246, "eval_samples_per_second": 34.172, "eval_steps_per_second": 0.537, "step": 45694 }, { "epoch": 8.252979414951247, "grad_norm": 117.11378479003906, "learning_rate": 0.0001377439589057116, "loss": 3.0554, "step": 45705 }, { "epoch": 8.403033586132178, "grad_norm": 98.44864654541016, "learning_rate": 0.00013281442208111732, "loss": 3.0581, "step": 46536 }, { "epoch": 8.55308775731311, "grad_norm": 110.35213470458984, "learning_rate": 0.00012790396965456613, "loss": 3.0478, "step": 47367 }, { "epoch": 8.703141928494041, "grad_norm": 56.789737701416016, "learning_rate": 0.00012301797644237423, "loss": 3.0599, "step": 48198 }, { "epoch": 8.853196099674973, "grad_norm": 111.45304107666016, "learning_rate": 0.00011816179048862318, "loss": 3.0381, "step": 49029 }, { "epoch": 9.001083423618635, "eval_g2l_cer": 44.1774, "eval_g2l_gen_len": 4.1188, "eval_g2l_rouge1": 48.6114, "eval_g2l_rouge2": 37.8262, "eval_g2l_rougeL": 48.5072, "eval_g2l_rougeLsum": 48.4844, "eval_l2ex_cer": 83.6477, "eval_l2ex_gen_len": 22.4625, "eval_l2ex_rouge1": 33.2375, "eval_l2ex_rouge2": 16.4943, "eval_l2ex_rougeL": 29.1757, "eval_l2ex_rougeLsum": 29.2794, "eval_l2g_cer": 72.9254, "eval_l2g_gen_len": 17.2116, "eval_l2g_rouge1": 41.0375, "eval_l2g_rouge2": 27.5603, "eval_l2g_rougeL": 38.745, "eval_l2g_rougeLsum": 38.787, "eval_loss": 3.180062770843506, "eval_runtime": 297.1459, "eval_samples_per_second": 33.388, "eval_steps_per_second": 0.525, "step": 49848 }, { "epoch": 9.003250270855904, "grad_norm": 78.80842590332031, "learning_rate": 0.00011334072721137046, "loss": 3.0595, "step": 49860 }, { "epoch": 9.153304442036836, "grad_norm": 138.7894287109375, "learning_rate": 0.00010856006358457137, "loss": 3.0096, "step": 50691 }, { "epoch": 9.303358613217767, "grad_norm": 132.17127990722656, "learning_rate": 0.00010382503236208064, "loss": 3.0273, "step": 51522 }, { "epoch": 9.453412784398699, "grad_norm": 130.38265991210938, "learning_rate": 9.914081635005574e-05, "loss": 3.0237, "step": 52353 }, { "epoch": 9.603466955579632, "grad_norm": 100.80162811279297, "learning_rate": 9.451254273403124e-05, "loss": 3.0167, "step": 53184 }, { "epoch": 9.751173708920188, "eval_g2l_cer": 44.0723, "eval_g2l_gen_len": 4.1516, "eval_g2l_rouge1": 48.7144, "eval_g2l_rouge2": 37.9052, "eval_g2l_rougeL": 48.5889, "eval_g2l_rougeLsum": 48.5704, "eval_l2ex_cer": 82.0577, "eval_l2ex_gen_len": 22.3731, "eval_l2ex_rouge1": 33.8214, "eval_l2ex_rouge2": 17.2047, "eval_l2ex_rougeL": 29.9782, "eval_l2ex_rougeLsum": 30.0546, "eval_l2g_cer": 72.335, "eval_l2g_gen_len": 17.0699, "eval_l2g_rouge1": 41.6605, "eval_l2g_rouge2": 28.2593, "eval_l2g_rougeL": 39.3968, "eval_l2g_rougeLsum": 39.4309, "eval_loss": 3.1734836101531982, "eval_runtime": 297.1857, "eval_samples_per_second": 33.383, "eval_steps_per_second": 0.525, "step": 54002 }, { "epoch": 9.753521126760564, "grad_norm": 98.55856323242188, "learning_rate": 8.994527746687389e-05, "loss": 3.0202, "step": 54015 }, { "epoch": 9.903575297941495, "grad_norm": 110.90308380126953, "learning_rate": 8.544401972376058e-05, "loss": 3.0123, "step": 54846 }, { "epoch": 10.053629469122427, "grad_norm": 103.5262451171875, "learning_rate": 8.10136964302491e-05, "loss": 3.0112, "step": 55677 }, { "epoch": 10.203683640303359, "grad_norm": 73.44245147705078, "learning_rate": 7.665915686943095e-05, "loss": 2.9824, "step": 56508 }, { "epoch": 10.35373781148429, "grad_norm": 77.93965148925781, "learning_rate": 7.238516737406908e-05, "loss": 2.9999, "step": 57339 }, { "epoch": 10.501263994221741, "eval_g2l_cer": 44.1363, "eval_g2l_gen_len": 4.1471, "eval_g2l_rouge1": 48.6933, "eval_g2l_rouge2": 38.0423, "eval_g2l_rougeL": 48.565, "eval_g2l_rougeLsum": 48.5648, "eval_l2ex_cer": 81.2579, "eval_l2ex_gen_len": 21.4666, "eval_l2ex_rouge1": 33.958, "eval_l2ex_rouge2": 16.8411, "eval_l2ex_rougeL": 29.5656, "eval_l2ex_rougeLsum": 29.6795, "eval_l2g_cer": 71.0675, "eval_l2g_gen_len": 16.2517, "eval_l2g_rouge1": 41.5203, "eval_l2g_rouge2": 28.0296, "eval_l2g_rougeL": 39.1863, "eval_l2g_rougeLsum": 39.2508, "eval_loss": 3.1717426776885986, "eval_runtime": 290.2937, "eval_samples_per_second": 34.176, "eval_steps_per_second": 0.537, "step": 58156 }, { "epoch": 10.503791982665222, "grad_norm": 129.3556365966797, "learning_rate": 6.81964061095297e-05, "loss": 2.9888, "step": 58170 }, { "epoch": 10.653846153846153, "grad_norm": 112.28192138671875, "learning_rate": 6.409745795321991e-05, "loss": 2.9878, "step": 59001 }, { "epoch": 10.803900325027085, "grad_norm": 76.26856231689453, "learning_rate": 6.009280947613472e-05, "loss": 2.9817, "step": 59832 }, { "epoch": 10.953954496208016, "grad_norm": 58.20437240600586, "learning_rate": 5.618684403200737e-05, "loss": 2.9851, "step": 60663 }, { "epoch": 11.10400866738895, "grad_norm": 108.53790283203125, "learning_rate": 5.238383695943713e-05, "loss": 2.9823, "step": 61494 }, { "epoch": 11.251354279523294, "eval_g2l_cer": 44.0289, "eval_g2l_gen_len": 4.1275, "eval_g2l_rouge1": 48.9057, "eval_g2l_rouge2": 38.3159, "eval_g2l_rougeL": 48.7647, "eval_g2l_rougeLsum": 48.766, "eval_l2ex_cer": 82.4492, "eval_l2ex_gen_len": 22.9445, "eval_l2ex_rouge1": 33.8799, "eval_l2ex_rouge2": 16.7295, "eval_l2ex_rougeL": 29.4575, "eval_l2ex_rougeLsum": 29.6104, "eval_l2g_cer": 71.7288, "eval_l2g_gen_len": 16.7564, "eval_l2g_rouge1": 41.5535, "eval_l2g_rouge2": 28.1997, "eval_l2g_rougeL": 39.2564, "eval_l2g_rougeLsum": 39.323, "eval_loss": 3.1673169136047363, "eval_runtime": 295.6043, "eval_samples_per_second": 33.562, "eval_steps_per_second": 0.528, "step": 62310 }, { "epoch": 11.254062838569881, "grad_norm": 78.87760162353516, "learning_rate": 4.868795090224752e-05, "loss": 2.9644, "step": 62325 }, { "epoch": 11.404117009750813, "grad_norm": 63.00550079345703, "learning_rate": 4.510323125319609e-05, "loss": 2.9714, "step": 63156 }, { "epoch": 11.554171180931744, "grad_norm": 107.51451110839844, "learning_rate": 4.1633601726023533e-05, "loss": 2.972, "step": 63987 }, { "epoch": 11.704225352112676, "grad_norm": 66.15150451660156, "learning_rate": 3.82828600606881e-05, "loss": 2.9604, "step": 64818 }, { "epoch": 11.854279523293608, "grad_norm": 65.40077209472656, "learning_rate": 3.505467386648718e-05, "loss": 2.9667, "step": 65649 }, { "epoch": 12.001444564824846, "eval_g2l_cer": 43.9512, "eval_g2l_gen_len": 4.0624, "eval_g2l_rouge1": 48.889, "eval_g2l_rouge2": 38.1288, "eval_g2l_rougeL": 48.7444, "eval_g2l_rougeLsum": 48.751, "eval_l2ex_cer": 83.3432, "eval_l2ex_gen_len": 22.5889, "eval_l2ex_rouge1": 33.672, "eval_l2ex_rouge2": 16.682, "eval_l2ex_rougeL": 29.3383, "eval_l2ex_rougeLsum": 29.4381, "eval_l2g_cer": 72.969, "eval_l2g_gen_len": 17.2907, "eval_l2g_rouge1": 41.384, "eval_l2g_rouge2": 28.0121, "eval_l2g_rougeL": 39.0564, "eval_l2g_rougeLsum": 39.1247, "eval_loss": 3.163238286972046, "eval_runtime": 296.9381, "eval_samples_per_second": 33.411, "eval_steps_per_second": 0.525, "step": 66464 }, { "epoch": 12.00433369447454, "grad_norm": 80.00790405273438, "learning_rate": 3.195257660761534e-05, "loss": 2.9548, "step": 66480 }, { "epoch": 12.15438786565547, "grad_norm": 50.12080001831055, "learning_rate": 2.897996373555297e-05, "loss": 2.9599, "step": 67311 }, { "epoch": 12.304442036836402, "grad_norm": 137.98057556152344, "learning_rate": 2.6140088972519277e-05, "loss": 2.9426, "step": 68142 }, { "epoch": 12.454496208017336, "grad_norm": 74.86833190917969, "learning_rate": 2.343606075005708e-05, "loss": 2.9445, "step": 68973 }, { "epoch": 12.604550379198267, "grad_norm": 85.49880981445312, "learning_rate": 2.0870838806648037e-05, "loss": 2.9445, "step": 69804 }, { "epoch": 12.751534850126399, "eval_g2l_cer": 44.0472, "eval_g2l_gen_len": 4.109, "eval_g2l_rouge1": 48.9965, "eval_g2l_rouge2": 38.1664, "eval_g2l_rougeL": 48.8442, "eval_g2l_rougeLsum": 48.8419, "eval_l2ex_cer": 81.2857, "eval_l2ex_gen_len": 21.2364, "eval_l2ex_rouge1": 34.1658, "eval_l2ex_rouge2": 17.3387, "eval_l2ex_rougeL": 29.9082, "eval_l2ex_rougeLsum": 30.0362, "eval_l2g_cer": 70.6762, "eval_l2g_gen_len": 15.9381, "eval_l2g_rouge1": 41.6215, "eval_l2g_rouge2": 28.1386, "eval_l2g_rougeL": 39.3091, "eval_l2g_rougeLsum": 39.3715, "eval_loss": 3.163139581680298, "eval_runtime": 291.4682, "eval_samples_per_second": 34.038, "eval_steps_per_second": 0.535, "step": 70618 }, { "epoch": 12.754604550379199, "grad_norm": 60.18415451049805, "learning_rate": 1.844723094808244e-05, "loss": 2.9515, "step": 70635 }, { "epoch": 12.90465872156013, "grad_norm": 39.721649169921875, "learning_rate": 1.6167889974129134e-05, "loss": 2.9545, "step": 71466 }, { "epoch": 13.054712892741062, "grad_norm": 49.22962188720703, "learning_rate": 1.4035310774870041e-05, "loss": 2.9433, "step": 72297 }, { "epoch": 13.204767063921993, "grad_norm": 55.78800964355469, "learning_rate": 1.205182759987737e-05, "loss": 2.9241, "step": 73128 }, { "epoch": 13.354821235102925, "grad_norm": 69.98873138427734, "learning_rate": 1.0219611503222213e-05, "loss": 2.939, "step": 73959 }, { "epoch": 13.501625135427952, "eval_g2l_cer": 43.9078, "eval_g2l_gen_len": 4.0981, "eval_g2l_rouge1": 48.969, "eval_g2l_rouge2": 38.1559, "eval_g2l_rougeL": 48.8152, "eval_g2l_rougeLsum": 48.8193, "eval_l2ex_cer": 81.5515, "eval_l2ex_gen_len": 21.7205, "eval_l2ex_rouge1": 33.9427, "eval_l2ex_rouge2": 17.0266, "eval_l2ex_rougeL": 29.5977, "eval_l2ex_rougeLsum": 29.7301, "eval_l2g_cer": 70.7346, "eval_l2g_gen_len": 16.1531, "eval_l2g_rouge1": 41.7374, "eval_l2g_rouge2": 28.1793, "eval_l2g_rougeL": 39.3779, "eval_l2g_rougeLsum": 39.4426, "eval_loss": 3.1640655994415283, "eval_runtime": 292.5347, "eval_samples_per_second": 33.914, "eval_steps_per_second": 0.533, "step": 74772 }, { "epoch": 13.504875406283857, "grad_norm": 69.49555206298828, "learning_rate": 8.54066796711184e-06, "loss": 2.9465, "step": 74790 }, { "epoch": 13.654929577464788, "grad_norm": 73.59809112548828, "learning_rate": 7.016834706756168e-06, "loss": 2.9391, "step": 75621 }, { "epoch": 13.804983748645721, "grad_norm": 64.26115417480469, "learning_rate": 5.649779658866368e-06, "loss": 2.9356, "step": 76452 }, { "epoch": 13.955037919826653, "grad_norm": 37.47693634033203, "learning_rate": 4.440999155987467e-06, "loss": 2.9523, "step": 77283 }, { "epoch": 14.105092091007585, "grad_norm": 97.73049926757812, "learning_rate": 3.391816288662864e-06, "loss": 2.9394, "step": 78114 }, { "epoch": 14.251715420729505, "eval_g2l_cer": 44.1271, "eval_g2l_gen_len": 4.1016, "eval_g2l_rouge1": 48.8563, "eval_g2l_rouge2": 38.0804, "eval_g2l_rougeL": 48.7034, "eval_g2l_rougeLsum": 48.679, "eval_l2ex_cer": 81.746, "eval_l2ex_gen_len": 21.9486, "eval_l2ex_rouge1": 34.1479, "eval_l2ex_rouge2": 17.1381, "eval_l2ex_rougeL": 29.7996, "eval_l2ex_rougeLsum": 29.9184, "eval_l2g_cer": 71.184, "eval_l2g_gen_len": 16.3747, "eval_l2g_rouge1": 41.7919, "eval_l2g_rouge2": 28.2088, "eval_l2g_rougeL": 39.4284, "eval_l2g_rougeLsum": 39.4987, "eval_loss": 3.1630301475524902, "eval_runtime": 294.6002, "eval_samples_per_second": 33.676, "eval_steps_per_second": 0.53, "step": 78926 } ], "logging_steps": 831, "max_steps": 83070, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 4154, "total_flos": 1.4551992475225948e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }