{ "best_metric": 3.5637874603271484, "best_model_checkpoint": "checkpoints/mt5-base/checkpoint-37386", "epoch": 13.501625135427952, "eval_steps": 2077, "global_step": 37386, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14987360057782592, "eval_g2l_cer": 286.0084, "eval_g2l_gen_len": 8.1826, "eval_g2l_rouge1": 5.5622, "eval_g2l_rouge2": 1.1913, "eval_g2l_rougeL": 5.4996, "eval_g2l_rougeLsum": 5.5114, "eval_l2ex_cer": 86.6247, "eval_l2ex_gen_len": 7.9383, "eval_l2ex_rouge1": 16.8571, "eval_l2ex_rouge2": 5.0595, "eval_l2ex_rougeL": 15.4605, "eval_l2ex_rougeLsum": 15.4778, "eval_l2g_cer": 87.9265, "eval_l2g_gen_len": 5.5834, "eval_l2g_rouge1": 12.3664, "eval_l2g_rouge2": 1.6791, "eval_l2g_rougeL": 11.57, "eval_l2g_rougeLsum": 11.5871, "eval_loss": 8.359071731567383, "eval_runtime": 145.7254, "eval_samples_per_second": 68.08, "eval_steps_per_second": 2.134, "step": 415 }, { "epoch": 0.15023474178403756, "grad_norm": 1659.9119873046875, "learning_rate": 2.0038535645472063e-05, "loss": 15.088, "step": 416 }, { "epoch": 0.3004694835680751, "grad_norm": 8.369742393493652, "learning_rate": 4.0077071290944125e-05, "loss": 7.1403, "step": 832 }, { "epoch": 0.4507042253521127, "grad_norm": 1.8832136392593384, "learning_rate": 6.0115606936416195e-05, "loss": 5.3841, "step": 1248 }, { "epoch": 0.6009389671361502, "grad_norm": 23.656179428100586, "learning_rate": 8.015414258188825e-05, "loss": 4.9928, "step": 1664 }, { "epoch": 0.7500902853015529, "eval_g2l_cer": 59.4485, "eval_g2l_gen_len": 2.9556, "eval_g2l_rouge1": 27.1975, "eval_g2l_rouge2": 17.2281, "eval_g2l_rougeL": 27.1916, "eval_g2l_rougeLsum": 27.2117, "eval_l2ex_cer": 95.5123, "eval_l2ex_gen_len": 23.6742, "eval_l2ex_rouge1": 25.7497, "eval_l2ex_rouge2": 11.1192, "eval_l2ex_rougeL": 22.9461, "eval_l2ex_rougeLsum": 22.9441, "eval_l2g_cer": 83.2502, "eval_l2g_gen_len": 15.9711, "eval_l2g_rouge1": 27.2934, "eval_l2g_rouge2": 14.8195, "eval_l2g_rougeL": 25.9582, "eval_l2g_rougeLsum": 25.9617, "eval_loss": 4.2178425788879395, "eval_runtime": 203.8137, "eval_samples_per_second": 48.677, "eval_steps_per_second": 1.526, "step": 2077 }, { "epoch": 0.7511737089201878, "grad_norm": 1.5714240074157715, "learning_rate": 9.999999747638704e-05, "loss": 4.7614, "step": 2080 }, { "epoch": 0.9014084507042254, "grad_norm": 2.348027467727661, "learning_rate": 9.997217976013284e-05, "loss": 4.6037, "step": 2496 }, { "epoch": 1.051643192488263, "grad_norm": 2.6275577545166016, "learning_rate": 9.98898067640237e-05, "loss": 4.5136, "step": 2912 }, { "epoch": 1.2018779342723005, "grad_norm": 1.1122652292251587, "learning_rate": 9.975296886788363e-05, "loss": 4.4057, "step": 3328 }, { "epoch": 1.352112676056338, "grad_norm": 1.2248876094818115, "learning_rate": 9.956181621053908e-05, "loss": 4.3513, "step": 3744 }, { "epoch": 1.500180570603106, "eval_g2l_cer": 56.9353, "eval_g2l_gen_len": 3.093, "eval_g2l_rouge1": 32.8366, "eval_g2l_rouge2": 24.3793, "eval_g2l_rougeL": 32.7964, "eval_g2l_rougeLsum": 32.7681, "eval_l2ex_cer": 84.2827, "eval_l2ex_gen_len": 20.4604, "eval_l2ex_rouge1": 28.5353, "eval_l2ex_rouge2": 12.5551, "eval_l2ex_rougeL": 25.5058, "eval_l2ex_rougeLsum": 25.5427, "eval_l2g_cer": 81.1104, "eval_l2g_gen_len": 18.0072, "eval_l2g_rouge1": 32.688, "eval_l2g_rouge2": 18.9467, "eval_l2g_rougeL": 30.7295, "eval_l2g_rougeLsum": 30.7676, "eval_loss": 3.949233293533325, "eval_runtime": 197.691, "eval_samples_per_second": 50.184, "eval_steps_per_second": 1.573, "step": 4154 }, { "epoch": 1.5023474178403755, "grad_norm": 1.4981008768081665, "learning_rate": 9.931655852508637e-05, "loss": 4.3061, "step": 4160 }, { "epoch": 1.652582159624413, "grad_norm": 1.050997018814087, "learning_rate": 9.901746490877203e-05, "loss": 4.2525, "step": 4576 }, { "epoch": 1.8028169014084507, "grad_norm": 0.8377422094345093, "learning_rate": 9.866486352773886e-05, "loss": 4.2289, "step": 4992 }, { "epoch": 1.9530516431924883, "grad_norm": 0.968815267086029, "learning_rate": 9.82591412569612e-05, "loss": 4.1958, "step": 5408 }, { "epoch": 2.103286384976526, "grad_norm": 0.9952152967453003, "learning_rate": 9.780074325576496e-05, "loss": 4.1187, "step": 5824 }, { "epoch": 2.2502708559046587, "eval_g2l_cer": 53.8806, "eval_g2l_gen_len": 3.0548, "eval_g2l_rouge1": 35.439, "eval_g2l_rouge2": 27.2433, "eval_g2l_rougeL": 35.4384, "eval_g2l_rougeLsum": 35.3985, "eval_l2ex_cer": 89.3431, "eval_l2ex_gen_len": 23.4573, "eval_l2ex_rouge1": 27.8815, "eval_l2ex_rouge2": 12.1568, "eval_l2ex_rougeL": 24.5796, "eval_l2ex_rougeLsum": 24.6286, "eval_l2g_cer": 78.589, "eval_l2g_gen_len": 17.4946, "eval_l2g_rouge1": 35.6236, "eval_l2g_rouge2": 22.8027, "eval_l2g_rougeL": 33.8966, "eval_l2g_rougeLsum": 33.9001, "eval_loss": 3.8344309329986572, "eval_runtime": 202.9852, "eval_samples_per_second": 48.875, "eval_steps_per_second": 1.532, "step": 6231 }, { "epoch": 2.2535211267605635, "grad_norm": 24.567110061645508, "learning_rate": 9.72901724793979e-05, "loss": 4.0993, "step": 6240 }, { "epoch": 2.403755868544601, "grad_norm": 0.9726364612579346, "learning_rate": 9.672798912718604e-05, "loss": 4.0734, "step": 6656 }, { "epoch": 2.5539906103286385, "grad_norm": 0.9216151833534241, "learning_rate": 9.611481002788184e-05, "loss": 4.0584, "step": 7072 }, { "epoch": 2.704225352112676, "grad_norm": 0.7880883812904358, "learning_rate": 9.545130796287832e-05, "loss": 4.0312, "step": 7488 }, { "epoch": 2.8544600938967135, "grad_norm": 0.9635422229766846, "learning_rate": 9.473821092803199e-05, "loss": 4.0046, "step": 7904 }, { "epoch": 3.0003611412062114, "eval_g2l_cer": 52.062, "eval_g2l_gen_len": 3.0702, "eval_g2l_rouge1": 36.8811, "eval_g2l_rouge2": 28.8156, "eval_g2l_rougeL": 36.8925, "eval_g2l_rougeLsum": 36.8317, "eval_l2ex_cer": 90.1083, "eval_l2ex_gen_len": 22.4645, "eval_l2ex_rouge1": 27.5056, "eval_l2ex_rouge2": 12.5248, "eval_l2ex_rougeL": 24.4085, "eval_l2ex_rougeLsum": 24.4463, "eval_l2g_cer": 78.1779, "eval_l2g_gen_len": 17.8095, "eval_l2g_rouge1": 36.8332, "eval_l2g_rouge2": 23.9422, "eval_l2g_rougeL": 34.9672, "eval_l2g_rougeLsum": 34.995, "eval_loss": 3.7595808506011963, "eval_runtime": 202.3409, "eval_samples_per_second": 49.031, "eval_steps_per_second": 1.537, "step": 8308 }, { "epoch": 3.004694835680751, "grad_norm": 0.900855541229248, "learning_rate": 9.397630133490413e-05, "loss": 3.992, "step": 8320 }, { "epoch": 3.1549295774647885, "grad_norm": 0.8881470561027527, "learning_rate": 9.316641515229741e-05, "loss": 3.9362, "step": 8736 }, { "epoch": 3.3051643192488265, "grad_norm": 0.7969784140586853, "learning_rate": 9.230944098902894e-05, "loss": 3.9143, "step": 9152 }, { "epoch": 3.455399061032864, "grad_norm": 0.8603357672691345, "learning_rate": 9.1406319118947e-05, "loss": 3.9162, "step": 9568 }, { "epoch": 3.6056338028169015, "grad_norm": 0.9974511861801147, "learning_rate": 9.045804044926044e-05, "loss": 3.8987, "step": 9984 }, { "epoch": 3.7504514265077646, "eval_g2l_cer": 50.7917, "eval_g2l_gen_len": 3.0031, "eval_g2l_rouge1": 37.7135, "eval_g2l_rouge2": 29.9526, "eval_g2l_rougeL": 37.7649, "eval_g2l_rougeLsum": 37.7041, "eval_l2ex_cer": 86.8671, "eval_l2ex_gen_len": 22.2271, "eval_l2ex_rouge1": 28.7692, "eval_l2ex_rouge2": 12.8536, "eval_l2ex_rougeL": 25.3768, "eval_l2ex_rougeLsum": 25.4158, "eval_l2g_cer": 73.3411, "eval_l2g_gen_len": 15.6692, "eval_l2g_rouge1": 37.5152, "eval_l2g_rouge2": 24.5536, "eval_l2g_rougeL": 35.5225, "eval_l2g_rougeLsum": 35.5437, "eval_loss": 3.7121169567108154, "eval_runtime": 192.3527, "eval_samples_per_second": 51.577, "eval_steps_per_second": 1.617, "step": 10385 }, { "epoch": 3.755868544600939, "grad_norm": 0.8458616733551025, "learning_rate": 8.94656454333133e-05, "loss": 3.8883, "step": 10400 }, { "epoch": 3.9061032863849765, "grad_norm": 3.1263327598571777, "learning_rate": 8.843022292899726e-05, "loss": 3.8775, "step": 10816 }, { "epoch": 4.056338028169014, "grad_norm": 1.013489842414856, "learning_rate": 8.735290900405437e-05, "loss": 3.8514, "step": 11232 }, { "epoch": 4.206572769953052, "grad_norm": 0.9674685001373291, "learning_rate": 8.623488568958123e-05, "loss": 3.7962, "step": 11648 }, { "epoch": 4.356807511737089, "grad_norm": 1.0607421398162842, "learning_rate": 8.507737968310197e-05, "loss": 3.8043, "step": 12064 }, { "epoch": 4.500541711809317, "eval_g2l_cer": 50.088, "eval_g2l_gen_len": 3.0488, "eval_g2l_rouge1": 38.7702, "eval_g2l_rouge2": 30.6004, "eval_g2l_rougeL": 38.7959, "eval_g2l_rougeLsum": 38.7454, "eval_l2ex_cer": 84.5143, "eval_l2ex_gen_len": 20.52, "eval_l2ex_rouge1": 28.9181, "eval_l2ex_rouge2": 13.2853, "eval_l2ex_rougeL": 25.6409, "eval_l2ex_rougeLsum": 25.6588, "eval_l2g_cer": 72.4949, "eval_l2g_gen_len": 15.2432, "eval_l2g_rouge1": 37.6479, "eval_l2g_rouge2": 24.833, "eval_l2g_rougeL": 35.7678, "eval_l2g_rougeLsum": 35.776, "eval_loss": 3.674677848815918, "eval_runtime": 190.2532, "eval_samples_per_second": 52.146, "eval_steps_per_second": 1.635, "step": 12462 }, { "epoch": 4.507042253521127, "grad_norm": 0.9242987632751465, "learning_rate": 8.388166100263313e-05, "loss": 3.804, "step": 12480 }, { "epoch": 4.657276995305164, "grad_norm": 0.8233311772346497, "learning_rate": 8.264904159321721e-05, "loss": 3.7844, "step": 12896 }, { "epoch": 4.807511737089202, "grad_norm": 1.918661117553711, "learning_rate": 8.138087388745395e-05, "loss": 3.7948, "step": 13312 }, { "epoch": 4.957746478873239, "grad_norm": 0.8277648091316223, "learning_rate": 8.00785493216083e-05, "loss": 3.7951, "step": 13728 }, { "epoch": 5.107981220657277, "grad_norm": 1.0518523454666138, "learning_rate": 7.874349680892367e-05, "loss": 3.7423, "step": 14144 }, { "epoch": 5.250631997110871, "eval_g2l_cer": 49.743, "eval_g2l_gen_len": 3.0201, "eval_g2l_rouge1": 38.8263, "eval_g2l_rouge2": 31.1673, "eval_g2l_rougeL": 38.8286, "eval_g2l_rougeLsum": 38.7898, "eval_l2ex_cer": 86.565, "eval_l2ex_gen_len": 21.7523, "eval_l2ex_rouge1": 28.4984, "eval_l2ex_rouge2": 13.072, "eval_l2ex_rougeL": 25.2667, "eval_l2ex_rougeLsum": 25.2757, "eval_l2g_cer": 73.2917, "eval_l2g_gen_len": 16.0011, "eval_l2g_rouge1": 38.0438, "eval_l2g_rouge2": 25.3209, "eval_l2g_rougeL": 36.1091, "eval_l2g_rougeLsum": 36.1243, "eval_loss": 3.649608850479126, "eval_runtime": 197.4229, "eval_samples_per_second": 50.253, "eval_steps_per_second": 1.575, "step": 14539 }, { "epoch": 5.258215962441315, "grad_norm": 0.8540360331535339, "learning_rate": 7.737718117181538e-05, "loss": 3.7126, "step": 14560 }, { "epoch": 5.408450704225352, "grad_norm": 0.9189392328262329, "learning_rate": 7.598110153466441e-05, "loss": 3.7223, "step": 14976 }, { "epoch": 5.55868544600939, "grad_norm": 0.92618727684021, "learning_rate": 7.45567896789749e-05, "loss": 3.7139, "step": 15392 }, { "epoch": 5.708920187793427, "grad_norm": 0.7882264852523804, "learning_rate": 7.310580836270044e-05, "loss": 3.7179, "step": 15808 }, { "epoch": 5.859154929577465, "grad_norm": 0.8529959321022034, "learning_rate": 7.162974960558259e-05, "loss": 3.7121, "step": 16224 }, { "epoch": 6.000722282412423, "eval_g2l_cer": 49.3934, "eval_g2l_gen_len": 3.0096, "eval_g2l_rouge1": 39.4408, "eval_g2l_rouge2": 31.7057, "eval_g2l_rougeL": 39.4639, "eval_g2l_rougeLsum": 39.4161, "eval_l2ex_cer": 86.119, "eval_l2ex_gen_len": 20.7112, "eval_l2ex_rouge1": 28.8739, "eval_l2ex_rouge2": 13.2661, "eval_l2ex_rougeL": 25.7042, "eval_l2ex_rougeLsum": 25.7118, "eval_l2g_cer": 73.625, "eval_l2g_gen_len": 15.9897, "eval_l2g_rouge1": 38.1171, "eval_l2g_rouge2": 25.6405, "eval_l2g_rougeL": 36.2592, "eval_l2g_rougeLsum": 36.2666, "eval_loss": 3.6273715496063232, "eval_runtime": 193.9276, "eval_samples_per_second": 51.158, "eval_steps_per_second": 1.604, "step": 16616 }, { "epoch": 6.009389671361502, "grad_norm": 0.7976297736167908, "learning_rate": 7.013023294238368e-05, "loss": 3.7191, "step": 16640 }, { "epoch": 6.15962441314554, "grad_norm": 0.8516309261322021, "learning_rate": 6.860890364592963e-05, "loss": 3.6428, "step": 17056 }, { "epoch": 6.309859154929577, "grad_norm": 0.9273515343666077, "learning_rate": 6.706743092191335e-05, "loss": 3.6566, "step": 17472 }, { "epoch": 6.460093896713615, "grad_norm": 0.932829737663269, "learning_rate": 6.550750607743873e-05, "loss": 3.6627, "step": 17888 }, { "epoch": 6.610328638497653, "grad_norm": 0.9968202114105225, "learning_rate": 6.393084066531485e-05, "loss": 3.6652, "step": 18304 }, { "epoch": 6.750812567713976, "eval_g2l_cer": 49.5579, "eval_g2l_gen_len": 2.9938, "eval_g2l_rouge1": 39.6581, "eval_g2l_rouge2": 32.026, "eval_g2l_rougeL": 39.6932, "eval_g2l_rougeLsum": 39.6518, "eval_l2ex_cer": 88.4427, "eval_l2ex_gen_len": 23.11, "eval_l2ex_rouge1": 28.1485, "eval_l2ex_rouge2": 12.4558, "eval_l2ex_rougeL": 24.9414, "eval_l2ex_rougeLsum": 24.9605, "eval_l2g_cer": 73.3296, "eval_l2g_gen_len": 16.3263, "eval_l2g_rouge1": 38.4506, "eval_l2g_rouge2": 25.7696, "eval_l2g_rougeL": 36.5748, "eval_l2g_rougeLsum": 36.6091, "eval_loss": 3.6120047569274902, "eval_runtime": 197.9501, "eval_samples_per_second": 50.119, "eval_steps_per_second": 1.571, "step": 18693 }, { "epoch": 6.76056338028169, "grad_norm": 0.7791869640350342, "learning_rate": 6.233916460613673e-05, "loss": 3.6614, "step": 18720 }, { "epoch": 6.910798122065728, "grad_norm": 0.9385781288146973, "learning_rate": 6.0734224290212784e-05, "loss": 3.6471, "step": 19136 }, { "epoch": 7.061032863849765, "grad_norm": 0.8267916440963745, "learning_rate": 5.9117780661421754e-05, "loss": 3.6264, "step": 19552 }, { "epoch": 7.211267605633803, "grad_norm": 0.794131875038147, "learning_rate": 5.7491607285101345e-05, "loss": 3.6015, "step": 19968 }, { "epoch": 7.36150234741784, "grad_norm": 0.8748852610588074, "learning_rate": 5.585748840208869e-05, "loss": 3.5993, "step": 20384 }, { "epoch": 7.500902853015529, "eval_g2l_cer": 50.088, "eval_g2l_gen_len": 3.0582, "eval_g2l_rouge1": 39.9874, "eval_g2l_rouge2": 32.4432, "eval_g2l_rougeL": 40.0195, "eval_g2l_rougeLsum": 39.9365, "eval_l2ex_cer": 87.6165, "eval_l2ex_gen_len": 22.7133, "eval_l2ex_rouge1": 28.1937, "eval_l2ex_rouge2": 12.5673, "eval_l2ex_rougeL": 24.9397, "eval_l2ex_rougeLsum": 24.921, "eval_l2g_cer": 72.7284, "eval_l2g_gen_len": 15.6759, "eval_l2g_rouge1": 38.4813, "eval_l2g_rouge2": 25.936, "eval_l2g_rougeL": 36.5693, "eval_l2g_rougeLsum": 36.5729, "eval_loss": 3.6013987064361572, "eval_runtime": 195.438, "eval_samples_per_second": 50.763, "eval_steps_per_second": 1.591, "step": 20770 }, { "epoch": 7.511737089201878, "grad_norm": 0.9019631743431091, "learning_rate": 5.4217216971047445e-05, "loss": 3.5978, "step": 20800 }, { "epoch": 7.661971830985916, "grad_norm": 0.8872570395469666, "learning_rate": 5.257259270122993e-05, "loss": 3.6113, "step": 21216 }, { "epoch": 7.812206572769953, "grad_norm": 0.7394893169403076, "learning_rate": 5.0925420077832285e-05, "loss": 3.593, "step": 21632 }, { "epoch": 7.962441314553991, "grad_norm": 0.8534842133522034, "learning_rate": 4.927750638210947e-05, "loss": 3.5963, "step": 22048 }, { "epoch": 8.112676056338028, "grad_norm": 0.9047814607620239, "learning_rate": 4.7630659708422666e-05, "loss": 3.5722, "step": 22464 }, { "epoch": 8.250993138317082, "eval_g2l_cer": 49.5716, "eval_g2l_gen_len": 3.0388, "eval_g2l_rouge1": 40.4088, "eval_g2l_rouge2": 32.7272, "eval_g2l_rougeL": 40.4374, "eval_g2l_rougeLsum": 40.3677, "eval_l2ex_cer": 83.5858, "eval_l2ex_gen_len": 20.4851, "eval_l2ex_rouge1": 29.084, "eval_l2ex_rouge2": 12.9208, "eval_l2ex_rougeL": 25.6832, "eval_l2ex_rougeLsum": 25.7033, "eval_l2g_cer": 72.1741, "eval_l2g_gen_len": 15.6461, "eval_l2g_rouge1": 38.8628, "eval_l2g_rouge2": 26.1912, "eval_l2g_rougeL": 36.9072, "eval_l2g_rougeLsum": 36.9086, "eval_loss": 3.5901942253112793, "eval_runtime": 190.412, "eval_samples_per_second": 52.103, "eval_steps_per_second": 1.633, "step": 22847 }, { "epoch": 8.262910798122066, "grad_norm": 0.8366677761077881, "learning_rate": 4.598668698039414e-05, "loss": 3.5641, "step": 22880 }, { "epoch": 8.413145539906104, "grad_norm": 0.8628195524215698, "learning_rate": 4.4347391968347015e-05, "loss": 3.5702, "step": 23296 }, { "epoch": 8.56338028169014, "grad_norm": 0.9060849547386169, "learning_rate": 4.27145733102046e-05, "loss": 3.5508, "step": 23712 }, { "epoch": 8.713615023474178, "grad_norm": 0.8726539015769958, "learning_rate": 4.109002253802116e-05, "loss": 3.5637, "step": 24128 }, { "epoch": 8.863849765258216, "grad_norm": 0.9154978394508362, "learning_rate": 3.947552211230913e-05, "loss": 3.5435, "step": 24544 }, { "epoch": 9.001083423618635, "eval_g2l_cer": 48.6326, "eval_g2l_gen_len": 3.008, "eval_g2l_rouge1": 40.6427, "eval_g2l_rouge2": 33.0447, "eval_g2l_rougeL": 40.6651, "eval_g2l_rougeLsum": 40.6197, "eval_l2ex_cer": 85.6816, "eval_l2ex_gen_len": 20.9753, "eval_l2ex_rouge1": 28.5827, "eval_l2ex_rouge2": 12.8213, "eval_l2ex_rougeL": 25.352, "eval_l2ex_rougeLsum": 25.3642, "eval_l2g_cer": 72.7802, "eval_l2g_gen_len": 15.8102, "eval_l2g_rouge1": 38.814, "eval_l2g_rouge2": 26.1373, "eval_l2g_rougeL": 36.8943, "eval_l2g_rougeLsum": 36.9272, "eval_loss": 3.5814104080200195, "eval_runtime": 193.5202, "eval_samples_per_second": 51.266, "eval_steps_per_second": 1.607, "step": 24924 }, { "epoch": 9.014084507042254, "grad_norm": 0.9910312294960022, "learning_rate": 3.7872843466319744e-05, "loss": 3.5601, "step": 24960 }, { "epoch": 9.164319248826292, "grad_norm": 0.913223922252655, "learning_rate": 3.6283745062422726e-05, "loss": 3.5156, "step": 25376 }, { "epoch": 9.314553990610328, "grad_norm": 0.9026065468788147, "learning_rate": 3.470997046271774e-05, "loss": 3.5337, "step": 25792 }, { "epoch": 9.464788732394366, "grad_norm": 0.9726517796516418, "learning_rate": 3.315324641599434e-05, "loss": 3.5294, "step": 26208 }, { "epoch": 9.615023474178404, "grad_norm": 0.954593300819397, "learning_rate": 3.161528096313964e-05, "loss": 3.5242, "step": 26624 }, { "epoch": 9.751173708920188, "eval_g2l_cer": 48.3196, "eval_g2l_gen_len": 3.0196, "eval_g2l_rouge1": 41.1733, "eval_g2l_rouge2": 33.4761, "eval_g2l_rougeL": 41.172, "eval_g2l_rougeLsum": 41.1111, "eval_l2ex_cer": 86.3469, "eval_l2ex_gen_len": 21.333, "eval_l2ex_rouge1": 28.6196, "eval_l2ex_rouge2": 12.797, "eval_l2ex_rougeL": 25.331, "eval_l2ex_rougeLsum": 25.3251, "eval_l2g_cer": 71.8519, "eval_l2g_gen_len": 15.5771, "eval_l2g_rouge1": 38.9877, "eval_l2g_rouge2": 26.3016, "eval_l2g_rougeL": 36.97, "eval_l2g_rougeLsum": 37.0109, "eval_loss": 3.5751187801361084, "eval_runtime": 190.5769, "eval_samples_per_second": 52.058, "eval_steps_per_second": 1.632, "step": 27001 }, { "epoch": 9.765258215962442, "grad_norm": 0.7817335724830627, "learning_rate": 3.00977615630722e-05, "loss": 3.5332, "step": 27040 }, { "epoch": 9.915492957746478, "grad_norm": 0.8576836585998535, "learning_rate": 2.8602353241258667e-05, "loss": 3.5247, "step": 27456 }, { "epoch": 10.065727699530516, "grad_norm": 0.924045741558075, "learning_rate": 2.7130696762844198e-05, "loss": 3.5171, "step": 27872 }, { "epoch": 10.215962441314554, "grad_norm": 0.9701129198074341, "learning_rate": 2.568440683240166e-05, "loss": 3.4886, "step": 28288 }, { "epoch": 10.366197183098592, "grad_norm": 0.8473976850509644, "learning_rate": 2.426507032227427e-05, "loss": 3.5134, "step": 28704 }, { "epoch": 10.501263994221741, "eval_g2l_cer": 48.8336, "eval_g2l_gen_len": 3.0502, "eval_g2l_rouge1": 41.0241, "eval_g2l_rouge2": 33.2994, "eval_g2l_rougeL": 41.0374, "eval_g2l_rougeLsum": 40.9554, "eval_l2ex_cer": 85.2795, "eval_l2ex_gen_len": 21.6999, "eval_l2ex_rouge1": 28.6576, "eval_l2ex_rouge2": 12.5848, "eval_l2ex_rougeL": 25.1057, "eval_l2ex_rougeLsum": 25.1478, "eval_l2g_cer": 71.5555, "eval_l2g_gen_len": 15.5923, "eval_l2g_rouge1": 39.111, "eval_l2g_rouge2": 26.3632, "eval_l2g_rougeL": 37.134, "eval_l2g_rougeLsum": 37.1562, "eval_loss": 3.5716097354888916, "eval_runtime": 190.1354, "eval_samples_per_second": 52.179, "eval_steps_per_second": 1.636, "step": 29078 }, { "epoch": 10.51643192488263, "grad_norm": 0.9222161769866943, "learning_rate": 2.2874244531456016e-05, "loss": 3.4995, "step": 29120 }, { "epoch": 10.666666666666666, "grad_norm": 0.8834406137466431, "learning_rate": 2.1513455476919875e-05, "loss": 3.5005, "step": 29536 }, { "epoch": 10.816901408450704, "grad_norm": 1.2534151077270508, "learning_rate": 2.0184196219268805e-05, "loss": 3.4956, "step": 29952 }, { "epoch": 10.967136150234742, "grad_norm": 1.0579476356506348, "learning_rate": 1.8887925224546575e-05, "loss": 3.4984, "step": 30368 }, { "epoch": 11.11737089201878, "grad_norm": 0.9352797269821167, "learning_rate": 1.7626064764005655e-05, "loss": 3.4891, "step": 30784 }, { "epoch": 11.251354279523294, "eval_g2l_cer": 48.1779, "eval_g2l_gen_len": 3.0241, "eval_g2l_rouge1": 41.3076, "eval_g2l_rouge2": 33.5874, "eval_g2l_rougeL": 41.3381, "eval_g2l_rougeLsum": 41.2834, "eval_l2ex_cer": 86.303, "eval_l2ex_gen_len": 21.6927, "eval_l2ex_rouge1": 28.5306, "eval_l2ex_rouge2": 12.66, "eval_l2ex_rougeL": 25.107, "eval_l2ex_rougeLsum": 25.1229, "eval_l2g_cer": 71.7607, "eval_l2g_gen_len": 15.6002, "eval_l2g_rouge1": 39.1998, "eval_l2g_rouge2": 26.5146, "eval_l2g_rougeL": 37.2299, "eval_l2g_rougeLsum": 37.2583, "eval_loss": 3.5692920684814453, "eval_runtime": 191.2935, "eval_samples_per_second": 51.863, "eval_steps_per_second": 1.626, "step": 31155 }, { "epoch": 11.267605633802816, "grad_norm": 0.8403520584106445, "learning_rate": 1.6399999353588347e-05, "loss": 3.4762, "step": 31200 }, { "epoch": 11.417840375586854, "grad_norm": 0.8685266375541687, "learning_rate": 1.5211074234832911e-05, "loss": 3.491, "step": 31616 }, { "epoch": 11.568075117370892, "grad_norm": 0.8662200570106506, "learning_rate": 1.4060593898871712e-05, "loss": 3.4818, "step": 32032 }, { "epoch": 11.71830985915493, "grad_norm": 0.915972888469696, "learning_rate": 1.2949820655140888e-05, "loss": 3.4729, "step": 32448 }, { "epoch": 11.868544600938968, "grad_norm": 0.9427916407585144, "learning_rate": 1.187997324637174e-05, "loss": 3.4837, "step": 32864 }, { "epoch": 12.001444564824846, "eval_g2l_cer": 48.4635, "eval_g2l_gen_len": 3.0374, "eval_g2l_rouge1": 41.42, "eval_g2l_rouge2": 33.7871, "eval_g2l_rougeL": 41.41, "eval_g2l_rougeLsum": 41.3653, "eval_l2ex_cer": 84.6873, "eval_l2ex_gen_len": 21.5406, "eval_l2ex_rouge1": 28.7533, "eval_l2ex_rouge2": 12.7721, "eval_l2ex_rougeL": 25.3715, "eval_l2ex_rougeLsum": 25.3817, "eval_l2g_cer": 71.4847, "eval_l2g_gen_len": 15.5437, "eval_l2g_rouge1": 39.2147, "eval_l2g_rouge2": 26.5099, "eval_l2g_rougeL": 37.2362, "eval_l2g_rougeLsum": 37.2641, "eval_loss": 3.5653076171875, "eval_runtime": 189.8727, "eval_samples_per_second": 52.251, "eval_steps_per_second": 1.638, "step": 33232 }, { "epoch": 12.018779342723004, "grad_norm": 0.8259687423706055, "learning_rate": 1.0852225511383663e-05, "loss": 3.4764, "step": 33280 }, { "epoch": 12.169014084507042, "grad_norm": 0.904097855091095, "learning_rate": 9.86770509714574e-06, "loss": 3.4791, "step": 33696 }, { "epoch": 12.31924882629108, "grad_norm": 0.9662612080574036, "learning_rate": 8.927492221520133e-06, "loss": 3.4593, "step": 34112 }, { "epoch": 12.469483568075118, "grad_norm": 0.9324942231178284, "learning_rate": 8.032618488044715e-06, "loss": 3.4564, "step": 34528 }, { "epoch": 12.619718309859154, "grad_norm": 0.9966897964477539, "learning_rate": 7.184065754055608e-06, "loss": 3.4576, "step": 34944 }, { "epoch": 12.751534850126399, "eval_g2l_cer": 47.8718, "eval_g2l_gen_len": 3.0243, "eval_g2l_rouge1": 41.399, "eval_g2l_rouge2": 33.8189, "eval_g2l_rougeL": 41.4105, "eval_g2l_rougeLsum": 41.3515, "eval_l2ex_cer": 84.0524, "eval_l2ex_gen_len": 21.0206, "eval_l2ex_rouge1": 28.7814, "eval_l2ex_rouge2": 12.7663, "eval_l2ex_rougeL": 25.3724, "eval_l2ex_rougeLsum": 25.3895, "eval_l2g_cer": 71.6622, "eval_l2g_gen_len": 15.563, "eval_l2g_rouge1": 39.1666, "eval_l2g_rouge2": 26.5275, "eval_l2g_rougeL": 37.1881, "eval_l2g_rougeLsum": 37.2249, "eval_loss": 3.564103841781616, "eval_runtime": 190.2806, "eval_samples_per_second": 52.139, "eval_steps_per_second": 1.634, "step": 35309 }, { "epoch": 12.769953051643192, "grad_norm": 1.0099953413009644, "learning_rate": 6.382765053391182e-06, "loss": 3.4757, "step": 35360 }, { "epoch": 12.92018779342723, "grad_norm": 0.8347458243370056, "learning_rate": 5.629595574859816e-06, "loss": 3.4814, "step": 35776 }, { "epoch": 13.070422535211268, "grad_norm": 0.8532468676567078, "learning_rate": 4.925383697592043e-06, "loss": 3.4667, "step": 36192 }, { "epoch": 13.220657276995306, "grad_norm": 0.8852038383483887, "learning_rate": 4.2709020843357075e-06, "loss": 3.4512, "step": 36608 }, { "epoch": 13.370892018779342, "grad_norm": 1.058424472808838, "learning_rate": 3.666868833688726e-06, "loss": 3.4616, "step": 37024 }, { "epoch": 13.501625135427952, "eval_g2l_cer": 47.8581, "eval_g2l_gen_len": 3.0221, "eval_g2l_rouge1": 41.4693, "eval_g2l_rouge2": 33.7773, "eval_g2l_rougeL": 41.4822, "eval_g2l_rougeLsum": 41.4356, "eval_l2ex_cer": 84.3083, "eval_l2ex_gen_len": 21.0319, "eval_l2ex_rouge1": 28.654, "eval_l2ex_rouge2": 12.8413, "eval_l2ex_rougeL": 25.3941, "eval_l2ex_rougeLsum": 25.4326, "eval_l2g_cer": 71.0018, "eval_l2g_gen_len": 15.3407, "eval_l2g_rouge1": 39.2009, "eval_l2g_rouge2": 26.5422, "eval_l2g_rougeL": 37.2433, "eval_l2g_rougeLsum": 37.2693, "eval_loss": 3.5637874603271484, "eval_runtime": 187.7571, "eval_samples_per_second": 52.84, "eval_steps_per_second": 1.656, "step": 37386 } ], "logging_steps": 416, "max_steps": 41535, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 2077, "total_flos": 7.17240637379838e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }