{ "best_global_step": 26000, "best_metric": 74.62046528623556, "best_model_checkpoint": "./aramaic_diacritization_model_deep/checkpoint-26000", "epoch": 36.32760898282695, "eval_steps": 500, "global_step": 27500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13210039630118892, "grad_norm": 49.535423278808594, "learning_rate": 9.400000000000001e-07, "loss": 13.2291, "step": 100 }, { "epoch": 0.26420079260237783, "grad_norm": 8.345534324645996, "learning_rate": 1.94e-06, "loss": 4.9005, "step": 200 }, { "epoch": 0.3963011889035667, "grad_norm": 6.039630889892578, "learning_rate": 2.9400000000000002e-06, "loss": 3.1719, "step": 300 }, { "epoch": 0.5284015852047557, "grad_norm": 5.1691412925720215, "learning_rate": 3.94e-06, "loss": 2.6108, "step": 400 }, { "epoch": 0.6605019815059445, "grad_norm": 4.929126262664795, "learning_rate": 4.94e-06, "loss": 2.2213, "step": 500 }, { "epoch": 0.6605019815059445, "eval_bleu": 0.19172411762066338, "eval_char_accuracy": 10.030946701760158, "eval_loss": 1.71331787109375, "eval_runtime": 308.7303, "eval_samples_per_second": 4.904, "eval_steps_per_second": 0.615, "step": 500 }, { "epoch": 0.7926023778071334, "grad_norm": 5.598935604095459, "learning_rate": 5.94e-06, "loss": 1.9171, "step": 600 }, { "epoch": 0.9247027741083224, "grad_norm": 5.103219032287598, "learning_rate": 6.9400000000000005e-06, "loss": 1.6624, "step": 700 }, { "epoch": 1.0568031704095113, "grad_norm": 4.852541923522949, "learning_rate": 7.94e-06, "loss": 1.473, "step": 800 }, { "epoch": 1.1889035667107002, "grad_norm": 4.4410624504089355, "learning_rate": 8.94e-06, "loss": 1.3081, "step": 900 }, { "epoch": 1.321003963011889, "grad_norm": 3.9946470260620117, "learning_rate": 9.940000000000001e-06, "loss": 1.1709, "step": 1000 }, { "epoch": 1.321003963011889, "eval_bleu": 5.2707215480174545, "eval_char_accuracy": 18.391696825135714, "eval_loss": 0.8503363132476807, "eval_runtime": 326.5655, "eval_samples_per_second": 4.636, "eval_steps_per_second": 0.582, "step": 1000 }, { "epoch": 1.453104359313078, "grad_norm": 5.828360557556152, "learning_rate": 9.99996092907511e-06, "loss": 1.0774, "step": 1100 }, { "epoch": 1.5852047556142668, "grad_norm": 3.975123643875122, "learning_rate": 9.999833582267183e-06, "loss": 0.9803, "step": 1200 }, { "epoch": 1.7173051519154559, "grad_norm": 4.110162258148193, "learning_rate": 9.999617802644021e-06, "loss": 0.9023, "step": 1300 }, { "epoch": 1.8494055482166445, "grad_norm": 4.341949462890625, "learning_rate": 9.999313594022158e-06, "loss": 0.8494, "step": 1400 }, { "epoch": 1.9815059445178336, "grad_norm": 3.7582991123199463, "learning_rate": 9.99892096178217e-06, "loss": 0.7841, "step": 1500 }, { "epoch": 1.9815059445178336, "eval_bleu": 13.790865530076871, "eval_char_accuracy": 25.990602895213026, "eval_loss": 0.5515339374542236, "eval_runtime": 310.3546, "eval_samples_per_second": 4.878, "eval_steps_per_second": 0.612, "step": 1500 }, { "epoch": 2.1136063408190227, "grad_norm": 4.066399574279785, "learning_rate": 9.998439912868608e-06, "loss": 0.7379, "step": 1600 }, { "epoch": 2.2457067371202113, "grad_norm": 3.739553928375244, "learning_rate": 9.997870455789855e-06, "loss": 0.6859, "step": 1700 }, { "epoch": 2.3778071334214004, "grad_norm": 4.019917964935303, "learning_rate": 9.997212600617986e-06, "loss": 0.6547, "step": 1800 }, { "epoch": 2.509907529722589, "grad_norm": 3.1273276805877686, "learning_rate": 9.99646635898858e-06, "loss": 0.6313, "step": 1900 }, { "epoch": 2.642007926023778, "grad_norm": 3.0856070518493652, "learning_rate": 9.995631744100536e-06, "loss": 0.6058, "step": 2000 }, { "epoch": 2.642007926023778, "eval_bleu": 22.141127198903924, "eval_char_accuracy": 30.873190491857212, "eval_loss": 0.4170660674571991, "eval_runtime": 310.921, "eval_samples_per_second": 4.869, "eval_steps_per_second": 0.611, "step": 2000 }, { "epoch": 2.7741083223249667, "grad_norm": 3.660649299621582, "learning_rate": 9.994708770715807e-06, "loss": 0.5758, "step": 2100 }, { "epoch": 2.906208718626156, "grad_norm": 3.3534188270568848, "learning_rate": 9.993697455159165e-06, "loss": 0.5507, "step": 2200 }, { "epoch": 3.038309114927345, "grad_norm": 2.831392526626587, "learning_rate": 9.992597815317901e-06, "loss": 0.5334, "step": 2300 }, { "epoch": 3.1704095112285335, "grad_norm": 3.2069804668426514, "learning_rate": 9.991409870641512e-06, "loss": 0.508, "step": 2400 }, { "epoch": 3.3025099075297226, "grad_norm": 3.3302793502807617, "learning_rate": 9.990133642141359e-06, "loss": 0.4816, "step": 2500 }, { "epoch": 3.3025099075297226, "eval_bleu": 28.8217714543364, "eval_char_accuracy": 34.30457312057904, "eval_loss": 0.34086647629737854, "eval_runtime": 312.1625, "eval_samples_per_second": 4.85, "eval_steps_per_second": 0.609, "step": 2500 }, { "epoch": 3.4346103038309117, "grad_norm": 3.0527114868164062, "learning_rate": 9.988769152390284e-06, "loss": 0.4779, "step": 2600 }, { "epoch": 3.5667107001321003, "grad_norm": 2.557722568511963, "learning_rate": 9.987316425522226e-06, "loss": 0.4626, "step": 2700 }, { "epoch": 3.6988110964332894, "grad_norm": 2.993014097213745, "learning_rate": 9.985775487231788e-06, "loss": 0.4452, "step": 2800 }, { "epoch": 3.830911492734478, "grad_norm": 2.7321043014526367, "learning_rate": 9.984146364773777e-06, "loss": 0.4408, "step": 2900 }, { "epoch": 3.963011889035667, "grad_norm": 2.8790836334228516, "learning_rate": 9.982429086962729e-06, "loss": 0.4108, "step": 3000 }, { "epoch": 3.963011889035667, "eval_bleu": 33.52667574407562, "eval_char_accuracy": 37.42289027800625, "eval_loss": 0.29226553440093994, "eval_runtime": 316.4732, "eval_samples_per_second": 4.784, "eval_steps_per_second": 0.6, "step": 3000 }, { "epoch": 4.095112285336856, "grad_norm": 2.8862805366516113, "learning_rate": 9.980623684172396e-06, "loss": 0.4134, "step": 3100 }, { "epoch": 4.227212681638045, "grad_norm": 2.4621527194976807, "learning_rate": 9.978730188335215e-06, "loss": 0.3919, "step": 3200 }, { "epoch": 4.359313077939234, "grad_norm": 2.822957992553711, "learning_rate": 9.976748632941733e-06, "loss": 0.384, "step": 3300 }, { "epoch": 4.491413474240423, "grad_norm": 2.448110818862915, "learning_rate": 9.974679053040018e-06, "loss": 0.3735, "step": 3400 }, { "epoch": 4.623513870541611, "grad_norm": 2.2914109230041504, "learning_rate": 9.972521485235045e-06, "loss": 0.3604, "step": 3500 }, { "epoch": 4.623513870541611, "eval_bleu": 37.45556816331904, "eval_char_accuracy": 40.02354416844876, "eval_loss": 0.25823718309402466, "eval_runtime": 320.7507, "eval_samples_per_second": 4.72, "eval_steps_per_second": 0.592, "step": 3500 }, { "epoch": 4.755614266842801, "grad_norm": 2.5924477577209473, "learning_rate": 9.970275967688047e-06, "loss": 0.3624, "step": 3600 }, { "epoch": 4.887714663143989, "grad_norm": 2.49125075340271, "learning_rate": 9.967942540115829e-06, "loss": 0.3508, "step": 3700 }, { "epoch": 5.019815059445178, "grad_norm": 2.464569330215454, "learning_rate": 9.965521243790079e-06, "loss": 0.3355, "step": 3800 }, { "epoch": 5.1519154557463676, "grad_norm": 2.6026785373687744, "learning_rate": 9.963012121536635e-06, "loss": 0.3284, "step": 3900 }, { "epoch": 5.284015852047556, "grad_norm": 2.351313591003418, "learning_rate": 9.96041521773472e-06, "loss": 0.328, "step": 4000 }, { "epoch": 5.284015852047556, "eval_bleu": 40.737027833366824, "eval_char_accuracy": 42.49156933706202, "eval_loss": 0.23189863562583923, "eval_runtime": 320.7889, "eval_samples_per_second": 4.72, "eval_steps_per_second": 0.592, "step": 4000 }, { "epoch": 5.416116248348745, "grad_norm": 3.0100066661834717, "learning_rate": 9.95773057831617e-06, "loss": 0.311, "step": 4100 }, { "epoch": 5.5482166446499335, "grad_norm": 2.049722671508789, "learning_rate": 9.954958250764604e-06, "loss": 0.3136, "step": 4200 }, { "epoch": 5.680317040951123, "grad_norm": 2.1132702827453613, "learning_rate": 9.952098284114604e-06, "loss": 0.3, "step": 4300 }, { "epoch": 5.812417437252312, "grad_norm": 2.174574613571167, "learning_rate": 9.949150728950833e-06, "loss": 0.3093, "step": 4400 }, { "epoch": 5.9445178335535, "grad_norm": 2.8350446224212646, "learning_rate": 9.946115637407145e-06, "loss": 0.2988, "step": 4500 }, { "epoch": 5.9445178335535, "eval_bleu": 43.8559984358133, "eval_char_accuracy": 44.56273646981411, "eval_loss": 0.21132107079029083, "eval_runtime": 319.4845, "eval_samples_per_second": 4.739, "eval_steps_per_second": 0.595, "step": 4500 }, { "epoch": 6.07661822985469, "grad_norm": 2.289716958999634, "learning_rate": 9.94299306316567e-06, "loss": 0.2938, "step": 4600 }, { "epoch": 6.208718626155878, "grad_norm": 2.9307034015655518, "learning_rate": 9.939783061455845e-06, "loss": 0.2814, "step": 4700 }, { "epoch": 6.340819022457067, "grad_norm": 2.3431613445281982, "learning_rate": 9.936485689053462e-06, "loss": 0.2782, "step": 4800 }, { "epoch": 6.472919418758257, "grad_norm": 2.2339768409729004, "learning_rate": 9.933101004279647e-06, "loss": 0.2752, "step": 4900 }, { "epoch": 6.605019815059445, "grad_norm": 2.076145887374878, "learning_rate": 9.92962906699983e-06, "loss": 0.265, "step": 5000 }, { "epoch": 6.605019815059445, "eval_bleu": 47.09645003475602, "eval_char_accuracy": 46.29205050172725, "eval_loss": 0.1954895406961441, "eval_runtime": 322.9917, "eval_samples_per_second": 4.687, "eval_steps_per_second": 0.588, "step": 5000 }, { "epoch": 6.737120211360634, "grad_norm": 1.715720295906067, "learning_rate": 9.926069938622698e-06, "loss": 0.266, "step": 5100 }, { "epoch": 6.869220607661823, "grad_norm": 2.501234531402588, "learning_rate": 9.922423682099088e-06, "loss": 0.2633, "step": 5200 }, { "epoch": 7.001321003963012, "grad_norm": 1.929123044013977, "learning_rate": 9.918690361920898e-06, "loss": 0.2584, "step": 5300 }, { "epoch": 7.133421400264201, "grad_norm": 2.0370264053344727, "learning_rate": 9.914870044119924e-06, "loss": 0.2451, "step": 5400 }, { "epoch": 7.265521796565389, "grad_norm": 2.3562278747558594, "learning_rate": 9.91096279626671e-06, "loss": 0.2476, "step": 5500 }, { "epoch": 7.265521796565389, "eval_bleu": 48.90758910970442, "eval_char_accuracy": 46.93308932390195, "eval_loss": 0.18339309096336365, "eval_runtime": 328.302, "eval_samples_per_second": 4.612, "eval_steps_per_second": 0.579, "step": 5500 }, { "epoch": 7.397622192866579, "grad_norm": 2.410529613494873, "learning_rate": 9.90696868746934e-06, "loss": 0.2419, "step": 5600 }, { "epoch": 7.5297225891677675, "grad_norm": 1.685939908027649, "learning_rate": 9.902887788372223e-06, "loss": 0.2448, "step": 5700 }, { "epoch": 7.661822985468956, "grad_norm": 2.3180549144744873, "learning_rate": 9.89872017115484e-06, "loss": 0.2379, "step": 5800 }, { "epoch": 7.793923381770146, "grad_norm": 2.4159021377563477, "learning_rate": 9.894465909530471e-06, "loss": 0.2339, "step": 5900 }, { "epoch": 7.926023778071334, "grad_norm": 2.0757477283477783, "learning_rate": 9.890125078744884e-06, "loss": 0.2356, "step": 6000 }, { "epoch": 7.926023778071334, "eval_bleu": 51.273473767746786, "eval_char_accuracy": 49.088563086033886, "eval_loss": 0.1728673279285431, "eval_runtime": 317.183, "eval_samples_per_second": 4.773, "eval_steps_per_second": 0.599, "step": 6000 }, { "epoch": 8.058124174372523, "grad_norm": 1.9405860900878906, "learning_rate": 9.885697755575015e-06, "loss": 0.2251, "step": 6100 }, { "epoch": 8.190224570673712, "grad_norm": 1.7473342418670654, "learning_rate": 9.881184018327597e-06, "loss": 0.2195, "step": 6200 }, { "epoch": 8.3223249669749, "grad_norm": 1.7633724212646484, "learning_rate": 9.876583946837787e-06, "loss": 0.219, "step": 6300 }, { "epoch": 8.45442536327609, "grad_norm": 2.1117053031921387, "learning_rate": 9.871897622467748e-06, "loss": 0.2148, "step": 6400 }, { "epoch": 8.58652575957728, "grad_norm": 2.114854574203491, "learning_rate": 9.867125128105211e-06, "loss": 0.2222, "step": 6500 }, { "epoch": 8.58652575957728, "eval_bleu": 53.15738681385049, "eval_char_accuracy": 49.680251686132586, "eval_loss": 0.1632871925830841, "eval_runtime": 325.9103, "eval_samples_per_second": 4.645, "eval_steps_per_second": 0.583, "step": 6500 }, { "epoch": 8.718626155878468, "grad_norm": 2.6163322925567627, "learning_rate": 9.862266548162008e-06, "loss": 0.2141, "step": 6600 }, { "epoch": 8.850726552179657, "grad_norm": 2.1705501079559326, "learning_rate": 9.857321968572577e-06, "loss": 0.2126, "step": 6700 }, { "epoch": 8.982826948480845, "grad_norm": 2.4352259635925293, "learning_rate": 9.85229147679245e-06, "loss": 0.2124, "step": 6800 }, { "epoch": 9.114927344782034, "grad_norm": 1.912975788116455, "learning_rate": 9.847175161796696e-06, "loss": 0.2032, "step": 6900 }, { "epoch": 9.247027741083222, "grad_norm": 1.7575359344482422, "learning_rate": 9.841973114078358e-06, "loss": 0.2005, "step": 7000 }, { "epoch": 9.247027741083222, "eval_bleu": 54.81436388056976, "eval_char_accuracy": 50.20562592531667, "eval_loss": 0.1546466052532196, "eval_runtime": 319.5902, "eval_samples_per_second": 4.737, "eval_steps_per_second": 0.595, "step": 7000 }, { "epoch": 9.379128137384413, "grad_norm": 1.6994798183441162, "learning_rate": 9.836685425646842e-06, "loss": 0.1929, "step": 7100 }, { "epoch": 9.511228533685602, "grad_norm": 1.8375500440597534, "learning_rate": 9.831312190026295e-06, "loss": 0.1954, "step": 7200 }, { "epoch": 9.64332892998679, "grad_norm": 2.735320568084717, "learning_rate": 9.825853502253951e-06, "loss": 0.1949, "step": 7300 }, { "epoch": 9.775429326287979, "grad_norm": 1.9880143404006958, "learning_rate": 9.820309458878447e-06, "loss": 0.196, "step": 7400 }, { "epoch": 9.907529722589167, "grad_norm": 3.1160881519317627, "learning_rate": 9.814680157958122e-06, "loss": 0.1957, "step": 7500 }, { "epoch": 9.907529722589167, "eval_bleu": 56.684180257446236, "eval_char_accuracy": 51.796656522454356, "eval_loss": 0.14744216203689575, "eval_runtime": 318.7305, "eval_samples_per_second": 4.75, "eval_steps_per_second": 0.596, "step": 7500 }, { "epoch": 10.039630118890356, "grad_norm": 1.975994348526001, "learning_rate": 9.808965699059276e-06, "loss": 0.1964, "step": 7600 }, { "epoch": 10.171730515191545, "grad_norm": 1.6857510805130005, "learning_rate": 9.80316618325441e-06, "loss": 0.1832, "step": 7700 }, { "epoch": 10.303830911492735, "grad_norm": 1.6473827362060547, "learning_rate": 9.797281713120438e-06, "loss": 0.1846, "step": 7800 }, { "epoch": 10.435931307793924, "grad_norm": 2.1330363750457764, "learning_rate": 9.79131239273688e-06, "loss": 0.1783, "step": 7900 }, { "epoch": 10.568031704095112, "grad_norm": 2.0598771572113037, "learning_rate": 9.785258327684007e-06, "loss": 0.183, "step": 8000 }, { "epoch": 10.568031704095112, "eval_bleu": 57.92928420103779, "eval_char_accuracy": 51.580235236058556, "eval_loss": 0.1413801610469818, "eval_runtime": 318.3694, "eval_samples_per_second": 4.755, "eval_steps_per_second": 0.597, "step": 8000 }, { "epoch": 10.700132100396301, "grad_norm": 1.8669029474258423, "learning_rate": 9.779119625040988e-06, "loss": 0.1801, "step": 8100 }, { "epoch": 10.83223249669749, "grad_norm": 1.967623233795166, "learning_rate": 9.772896393383991e-06, "loss": 0.1772, "step": 8200 }, { "epoch": 10.964332892998678, "grad_norm": 2.0888118743896484, "learning_rate": 9.766588742784255e-06, "loss": 0.1741, "step": 8300 }, { "epoch": 11.096433289299869, "grad_norm": 1.8615264892578125, "learning_rate": 9.760196784806155e-06, "loss": 0.1733, "step": 8400 }, { "epoch": 11.228533685601057, "grad_norm": 1.786023736000061, "learning_rate": 9.753720632505219e-06, "loss": 0.171, "step": 8500 }, { "epoch": 11.228533685601057, "eval_bleu": 58.97794662773473, "eval_char_accuracy": 53.53111120250041, "eval_loss": 0.13721999526023865, "eval_runtime": 310.3442, "eval_samples_per_second": 4.878, "eval_steps_per_second": 0.612, "step": 8500 }, { "epoch": 11.360634081902246, "grad_norm": 2.22037935256958, "learning_rate": 9.74716040042614e-06, "loss": 0.169, "step": 8600 }, { "epoch": 11.492734478203435, "grad_norm": 2.2769389152526855, "learning_rate": 9.740516204600734e-06, "loss": 0.1631, "step": 8700 }, { "epoch": 11.624834874504623, "grad_norm": 2.1513566970825195, "learning_rate": 9.733788162545902e-06, "loss": 0.1669, "step": 8800 }, { "epoch": 11.756935270805812, "grad_norm": 1.640358328819275, "learning_rate": 9.726976393261547e-06, "loss": 0.1674, "step": 8900 }, { "epoch": 11.889035667107, "grad_norm": 1.6976934671401978, "learning_rate": 9.720081017228462e-06, "loss": 0.1646, "step": 9000 }, { "epoch": 11.889035667107, "eval_bleu": 60.38328284328657, "eval_char_accuracy": 54.70215084717881, "eval_loss": 0.13088105618953705, "eval_runtime": 316.5706, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.6, "step": 9000 }, { "epoch": 12.021136063408191, "grad_norm": 1.7420779466629028, "learning_rate": 9.713102156406213e-06, "loss": 0.1629, "step": 9100 }, { "epoch": 12.15323645970938, "grad_norm": 1.9723796844482422, "learning_rate": 9.706039934230967e-06, "loss": 0.1578, "step": 9200 }, { "epoch": 12.285336856010568, "grad_norm": 2.3517324924468994, "learning_rate": 9.698894475613323e-06, "loss": 0.1561, "step": 9300 }, { "epoch": 12.417437252311757, "grad_norm": 1.5132865905761719, "learning_rate": 9.691665906936088e-06, "loss": 0.157, "step": 9400 }, { "epoch": 12.549537648612946, "grad_norm": 2.435624599456787, "learning_rate": 9.684354356052055e-06, "loss": 0.1538, "step": 9500 }, { "epoch": 12.549537648612946, "eval_bleu": 61.08870144349316, "eval_char_accuracy": 55.10055107747986, "eval_loss": 0.12675440311431885, "eval_runtime": 314.3152, "eval_samples_per_second": 4.817, "eval_steps_per_second": 0.604, "step": 9500 }, { "epoch": 12.681638044914134, "grad_norm": 2.092256784439087, "learning_rate": 9.676959952281733e-06, "loss": 0.1518, "step": 9600 }, { "epoch": 12.813738441215325, "grad_norm": 1.7985179424285889, "learning_rate": 9.669482826411065e-06, "loss": 0.158, "step": 9700 }, { "epoch": 12.945838837516513, "grad_norm": 1.7571889162063599, "learning_rate": 9.66192311068911e-06, "loss": 0.152, "step": 9800 }, { "epoch": 13.077939233817702, "grad_norm": 1.3526843786239624, "learning_rate": 9.654280938825705e-06, "loss": 0.1426, "step": 9900 }, { "epoch": 13.21003963011889, "grad_norm": 1.6651784181594849, "learning_rate": 9.646556445989106e-06, "loss": 0.1476, "step": 10000 }, { "epoch": 13.21003963011889, "eval_bleu": 61.91608356246592, "eval_char_accuracy": 55.006477216647475, "eval_loss": 0.12389995902776718, "eval_runtime": 315.2745, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.603, "step": 10000 }, { "epoch": 13.34214002642008, "grad_norm": 1.5739635229110718, "learning_rate": 9.63874976880359e-06, "loss": 0.148, "step": 10100 }, { "epoch": 13.474240422721268, "grad_norm": 1.779477834701538, "learning_rate": 9.63086104534704e-06, "loss": 0.1469, "step": 10200 }, { "epoch": 13.606340819022456, "grad_norm": 1.5422449111938477, "learning_rate": 9.622890415148505e-06, "loss": 0.143, "step": 10300 }, { "epoch": 13.738441215323647, "grad_norm": 1.80446457862854, "learning_rate": 9.61483801918573e-06, "loss": 0.1424, "step": 10400 }, { "epoch": 13.870541611624835, "grad_norm": 1.7641818523406982, "learning_rate": 9.606703999882667e-06, "loss": 0.1406, "step": 10500 }, { "epoch": 13.870541611624835, "eval_bleu": 62.97532147351367, "eval_char_accuracy": 56.364636453364035, "eval_loss": 0.12048687040805817, "eval_runtime": 315.4459, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.602, "step": 10500 }, { "epoch": 14.002642007926024, "grad_norm": 1.5506337881088257, "learning_rate": 9.598488501106947e-06, "loss": 0.1436, "step": 10600 }, { "epoch": 14.134742404227213, "grad_norm": 1.338537335395813, "learning_rate": 9.590191668167343e-06, "loss": 0.1396, "step": 10700 }, { "epoch": 14.266842800528401, "grad_norm": 1.9432475566864014, "learning_rate": 9.581813647811199e-06, "loss": 0.1427, "step": 10800 }, { "epoch": 14.39894319682959, "grad_norm": 1.8222265243530273, "learning_rate": 9.573354588221833e-06, "loss": 0.1352, "step": 10900 }, { "epoch": 14.531043593130779, "grad_norm": 1.7141766548156738, "learning_rate": 9.564814639015915e-06, "loss": 0.1361, "step": 11000 }, { "epoch": 14.531043593130779, "eval_bleu": 63.45481969831487, "eval_char_accuracy": 56.61755634150354, "eval_loss": 0.11702162027359009, "eval_runtime": 318.0259, "eval_samples_per_second": 4.761, "eval_steps_per_second": 0.597, "step": 11000 }, { "epoch": 14.663143989431969, "grad_norm": 1.473866581916809, "learning_rate": 9.556193951240821e-06, "loss": 0.1302, "step": 11100 }, { "epoch": 14.795244385733158, "grad_norm": 1.3840774297714233, "learning_rate": 9.547492677371968e-06, "loss": 0.1355, "step": 11200 }, { "epoch": 14.927344782034346, "grad_norm": 1.8060030937194824, "learning_rate": 9.538710971310104e-06, "loss": 0.1332, "step": 11300 }, { "epoch": 15.059445178335535, "grad_norm": 1.8242031335830688, "learning_rate": 9.529848988378597e-06, "loss": 0.1247, "step": 11400 }, { "epoch": 15.191545574636724, "grad_norm": 1.3062950372695923, "learning_rate": 9.520906885320682e-06, "loss": 0.1295, "step": 11500 }, { "epoch": 15.191545574636724, "eval_bleu": 64.22910948855701, "eval_char_accuracy": 57.40099111696003, "eval_loss": 0.11363621801137924, "eval_runtime": 315.2122, "eval_samples_per_second": 4.803, "eval_steps_per_second": 0.603, "step": 11500 }, { "epoch": 15.323645970937912, "grad_norm": 1.5021114349365234, "learning_rate": 9.511884820296695e-06, "loss": 0.1292, "step": 11600 }, { "epoch": 15.455746367239101, "grad_norm": 1.8947999477386475, "learning_rate": 9.502782952881268e-06, "loss": 0.128, "step": 11700 }, { "epoch": 15.587846763540291, "grad_norm": 1.5116643905639648, "learning_rate": 9.493601444060514e-06, "loss": 0.1276, "step": 11800 }, { "epoch": 15.71994715984148, "grad_norm": 1.5134871006011963, "learning_rate": 9.48434045622917e-06, "loss": 0.132, "step": 11900 }, { "epoch": 15.852047556142669, "grad_norm": 1.7438267469406128, "learning_rate": 9.475000153187733e-06, "loss": 0.1243, "step": 12000 }, { "epoch": 15.852047556142669, "eval_bleu": 65.05289431113529, "eval_char_accuracy": 57.989081263365684, "eval_loss": 0.1115257665514946, "eval_runtime": 309.8437, "eval_samples_per_second": 4.886, "eval_steps_per_second": 0.613, "step": 12000 }, { "epoch": 15.984147952443857, "grad_norm": 1.913855791091919, "learning_rate": 9.46558070013956e-06, "loss": 0.1261, "step": 12100 }, { "epoch": 16.116248348745046, "grad_norm": 1.4257742166519165, "learning_rate": 9.456082263687946e-06, "loss": 0.117, "step": 12200 }, { "epoch": 16.248348745046236, "grad_norm": 1.49489164352417, "learning_rate": 9.44650501183318e-06, "loss": 0.1232, "step": 12300 }, { "epoch": 16.380449141347423, "grad_norm": 1.7977708578109741, "learning_rate": 9.436849113969567e-06, "loss": 0.1212, "step": 12400 }, { "epoch": 16.512549537648614, "grad_norm": 1.2978798151016235, "learning_rate": 9.427212472501483e-06, "loss": 0.122, "step": 12500 }, { "epoch": 16.512549537648614, "eval_bleu": 65.59844085659223, "eval_char_accuracy": 58.32784997532489, "eval_loss": 0.10947112739086151, "eval_runtime": 332.8226, "eval_samples_per_second": 4.549, "eval_steps_per_second": 0.571, "step": 12500 }, { "epoch": 16.6446499339498, "grad_norm": 1.5628472566604614, "learning_rate": 9.417400578537868e-06, "loss": 0.1219, "step": 12600 }, { "epoch": 16.77675033025099, "grad_norm": 1.5304769277572632, "learning_rate": 9.407510553339931e-06, "loss": 0.1192, "step": 12700 }, { "epoch": 16.90885072655218, "grad_norm": 1.6370787620544434, "learning_rate": 9.397542571834054e-06, "loss": 0.1181, "step": 12800 }, { "epoch": 17.040951122853368, "grad_norm": 1.5388678312301636, "learning_rate": 9.387496810325436e-06, "loss": 0.1137, "step": 12900 }, { "epoch": 17.17305151915456, "grad_norm": 1.8251888751983643, "learning_rate": 9.377373446494984e-06, "loss": 0.1122, "step": 13000 }, { "epoch": 17.17305151915456, "eval_bleu": 66.79852257149867, "eval_char_accuracy": 58.47692877117947, "eval_loss": 0.1069813147187233, "eval_runtime": 350.9901, "eval_samples_per_second": 4.314, "eval_steps_per_second": 0.541, "step": 13000 }, { "epoch": 17.305151915455745, "grad_norm": 1.6730031967163086, "learning_rate": 9.367172659396172e-06, "loss": 0.1123, "step": 13100 }, { "epoch": 17.437252311756936, "grad_norm": 1.2402830123901367, "learning_rate": 9.35689462945187e-06, "loss": 0.1125, "step": 13200 }, { "epoch": 17.569352708058123, "grad_norm": 1.9425466060638428, "learning_rate": 9.34653953845115e-06, "loss": 0.1109, "step": 13300 }, { "epoch": 17.701453104359313, "grad_norm": 3.105457067489624, "learning_rate": 9.33610756954608e-06, "loss": 0.1161, "step": 13400 }, { "epoch": 17.833553500660503, "grad_norm": 1.679442048072815, "learning_rate": 9.325598907248478e-06, "loss": 0.1131, "step": 13500 }, { "epoch": 17.833553500660503, "eval_bleu": 66.72469802657548, "eval_char_accuracy": 59.26910264846191, "eval_loss": 0.10426344722509384, "eval_runtime": 353.0152, "eval_samples_per_second": 4.289, "eval_steps_per_second": 0.538, "step": 13500 }, { "epoch": 17.96565389696169, "grad_norm": 1.1806349754333496, "learning_rate": 9.315013737426645e-06, "loss": 0.115, "step": 13600 }, { "epoch": 18.09775429326288, "grad_norm": 1.5437259674072266, "learning_rate": 9.304352247302091e-06, "loss": 0.1071, "step": 13700 }, { "epoch": 18.229854689564068, "grad_norm": 3.9584083557128906, "learning_rate": 9.293614625446205e-06, "loss": 0.11, "step": 13800 }, { "epoch": 18.361955085865258, "grad_norm": 1.4071292877197266, "learning_rate": 9.282801061776937e-06, "loss": 0.1093, "step": 13900 }, { "epoch": 18.494055482166445, "grad_norm": 1.599787950515747, "learning_rate": 9.271911747555425e-06, "loss": 0.1057, "step": 14000 }, { "epoch": 18.494055482166445, "eval_bleu": 67.26763967365643, "eval_char_accuracy": 58.966318473433134, "eval_loss": 0.10266197472810745, "eval_runtime": 318.6046, "eval_samples_per_second": 4.752, "eval_steps_per_second": 0.596, "step": 14000 }, { "epoch": 18.626155878467635, "grad_norm": 1.3243365287780762, "learning_rate": 9.260946875382624e-06, "loss": 0.1054, "step": 14100 }, { "epoch": 18.758256274768826, "grad_norm": 1.8453656435012817, "learning_rate": 9.249906639195894e-06, "loss": 0.1096, "step": 14200 }, { "epoch": 18.890356671070013, "grad_norm": 1.2308754920959473, "learning_rate": 9.238791234265565e-06, "loss": 0.1045, "step": 14300 }, { "epoch": 19.022457067371203, "grad_norm": 1.3015666007995605, "learning_rate": 9.22760085719149e-06, "loss": 0.1065, "step": 14400 }, { "epoch": 19.15455746367239, "grad_norm": 1.3568576574325562, "learning_rate": 9.21633570589957e-06, "loss": 0.1024, "step": 14500 }, { "epoch": 19.15455746367239, "eval_bleu": 67.65790078495372, "eval_char_accuracy": 59.99547622964303, "eval_loss": 0.10274580866098404, "eval_runtime": 317.6719, "eval_samples_per_second": 4.766, "eval_steps_per_second": 0.598, "step": 14500 }, { "epoch": 19.28665785997358, "grad_norm": 1.6260790824890137, "learning_rate": 9.204995979638241e-06, "loss": 0.1025, "step": 14600 }, { "epoch": 19.418758256274767, "grad_norm": 1.1852062940597534, "learning_rate": 9.193581878974964e-06, "loss": 0.101, "step": 14700 }, { "epoch": 19.550858652575958, "grad_norm": 2.5521767139434814, "learning_rate": 9.18209360579267e-06, "loss": 0.0999, "step": 14800 }, { "epoch": 19.682959048877148, "grad_norm": 1.0231155157089233, "learning_rate": 9.17053136328619e-06, "loss": 0.1055, "step": 14900 }, { "epoch": 19.815059445178335, "grad_norm": 1.3518396615982056, "learning_rate": 9.15889535595866e-06, "loss": 0.1014, "step": 15000 }, { "epoch": 19.815059445178335, "eval_bleu": 68.43528395628573, "eval_char_accuracy": 61.64870866918901, "eval_loss": 0.09970895200967789, "eval_runtime": 314.0763, "eval_samples_per_second": 4.82, "eval_steps_per_second": 0.605, "step": 15000 }, { "epoch": 19.947159841479525, "grad_norm": 1.2678903341293335, "learning_rate": 9.147185789617907e-06, "loss": 0.1005, "step": 15100 }, { "epoch": 20.079260237780712, "grad_norm": 1.2439242601394653, "learning_rate": 9.13540287137281e-06, "loss": 0.0989, "step": 15200 }, { "epoch": 20.211360634081903, "grad_norm": 1.3169798851013184, "learning_rate": 9.123546809629632e-06, "loss": 0.1006, "step": 15300 }, { "epoch": 20.34346103038309, "grad_norm": 1.3063526153564453, "learning_rate": 9.111617814088332e-06, "loss": 0.0966, "step": 15400 }, { "epoch": 20.47556142668428, "grad_norm": 1.8397212028503418, "learning_rate": 9.099616095738867e-06, "loss": 0.0965, "step": 15500 }, { "epoch": 20.47556142668428, "eval_bleu": 68.40618966106221, "eval_char_accuracy": 60.30751357131107, "eval_loss": 0.09965521842241287, "eval_runtime": 315.776, "eval_samples_per_second": 4.795, "eval_steps_per_second": 0.602, "step": 15500 }, { "epoch": 20.60766182298547, "grad_norm": 1.544118046760559, "learning_rate": 9.087541866857453e-06, "loss": 0.0954, "step": 15600 }, { "epoch": 20.739762219286657, "grad_norm": 1.5378237962722778, "learning_rate": 9.075395341002804e-06, "loss": 0.0975, "step": 15700 }, { "epoch": 20.871862615587848, "grad_norm": 1.0197510719299316, "learning_rate": 9.06317673301237e-06, "loss": 0.0964, "step": 15800 }, { "epoch": 21.003963011889034, "grad_norm": 0.9621543884277344, "learning_rate": 9.05088625899852e-06, "loss": 0.0925, "step": 15900 }, { "epoch": 21.136063408190225, "grad_norm": 1.356550931930542, "learning_rate": 9.038524136344736e-06, "loss": 0.0917, "step": 16000 }, { "epoch": 21.136063408190225, "eval_bleu": 69.05043210174456, "eval_char_accuracy": 60.879153643691396, "eval_loss": 0.0960288867354393, "eval_runtime": 317.6216, "eval_samples_per_second": 4.767, "eval_steps_per_second": 0.598, "step": 16000 }, { "epoch": 21.268163804491415, "grad_norm": 2.0014472007751465, "learning_rate": 9.026090583701755e-06, "loss": 0.0962, "step": 16100 }, { "epoch": 21.400264200792602, "grad_norm": 1.4762715101242065, "learning_rate": 9.013585820983713e-06, "loss": 0.0917, "step": 16200 }, { "epoch": 21.532364597093792, "grad_norm": 1.242245078086853, "learning_rate": 9.001010069364241e-06, "loss": 0.0907, "step": 16300 }, { "epoch": 21.66446499339498, "grad_norm": 1.9390721321105957, "learning_rate": 8.98836355127257e-06, "loss": 0.0918, "step": 16400 }, { "epoch": 21.79656538969617, "grad_norm": 1.070357322692871, "learning_rate": 8.975646490389581e-06, "loss": 0.0903, "step": 16500 }, { "epoch": 21.79656538969617, "eval_bleu": 69.68531857632678, "eval_char_accuracy": 62.48920463892087, "eval_loss": 0.09554192423820496, "eval_runtime": 335.0637, "eval_samples_per_second": 4.519, "eval_steps_per_second": 0.567, "step": 16500 }, { "epoch": 21.928665785997357, "grad_norm": 1.8990777730941772, "learning_rate": 8.962859111643862e-06, "loss": 0.0946, "step": 16600 }, { "epoch": 22.060766182298547, "grad_norm": 1.6244333982467651, "learning_rate": 8.950001641207719e-06, "loss": 0.0895, "step": 16700 }, { "epoch": 22.192866578599737, "grad_norm": 1.6511205434799194, "learning_rate": 8.937074306493187e-06, "loss": 0.0907, "step": 16800 }, { "epoch": 22.324966974900924, "grad_norm": 1.421342372894287, "learning_rate": 8.924077336147992e-06, "loss": 0.0864, "step": 16900 }, { "epoch": 22.457067371202115, "grad_norm": 1.572800874710083, "learning_rate": 8.911010960051522e-06, "loss": 0.088, "step": 17000 }, { "epoch": 22.457067371202115, "eval_bleu": 69.94746089002493, "eval_char_accuracy": 61.5572051324231, "eval_loss": 0.09443064033985138, "eval_runtime": 345.5738, "eval_samples_per_second": 4.381, "eval_steps_per_second": 0.55, "step": 17000 }, { "epoch": 22.5891677675033, "grad_norm": 1.2412383556365967, "learning_rate": 8.897875409310755e-06, "loss": 0.085, "step": 17100 }, { "epoch": 22.721268163804492, "grad_norm": 1.4429903030395508, "learning_rate": 8.884803301685314e-06, "loss": 0.0908, "step": 17200 }, { "epoch": 22.85336856010568, "grad_norm": 1.1933690309524536, "learning_rate": 8.871530785794356e-06, "loss": 0.092, "step": 17300 }, { "epoch": 22.98546895640687, "grad_norm": 1.399186611175537, "learning_rate": 8.85818979355093e-06, "loss": 0.0837, "step": 17400 }, { "epoch": 23.11756935270806, "grad_norm": 1.2939783334732056, "learning_rate": 8.844780560919194e-06, "loss": 0.0871, "step": 17500 }, { "epoch": 23.11756935270806, "eval_bleu": 70.43896205976631, "eval_char_accuracy": 61.825032900148045, "eval_loss": 0.0947960913181305, "eval_runtime": 324.1925, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.586, "step": 17500 }, { "epoch": 23.249669749009247, "grad_norm": 1.2098222970962524, "learning_rate": 8.831303325070279e-06, "loss": 0.0827, "step": 17600 }, { "epoch": 23.381770145310437, "grad_norm": 1.5045851469039917, "learning_rate": 8.8177583243781e-06, "loss": 0.0838, "step": 17700 }, { "epoch": 23.513870541611624, "grad_norm": 1.5295897722244263, "learning_rate": 8.80414579841514e-06, "loss": 0.0858, "step": 17800 }, { "epoch": 23.645970937912814, "grad_norm": 1.4860461950302124, "learning_rate": 8.790465987948212e-06, "loss": 0.0875, "step": 17900 }, { "epoch": 23.778071334214, "grad_norm": 1.4711731672286987, "learning_rate": 8.776719134934199e-06, "loss": 0.0828, "step": 18000 }, { "epoch": 23.778071334214, "eval_bleu": 70.5623767627479, "eval_char_accuracy": 62.505140648132915, "eval_loss": 0.0924154594540596, "eval_runtime": 342.2412, "eval_samples_per_second": 4.424, "eval_steps_per_second": 0.555, "step": 18000 }, { "epoch": 23.91017173051519, "grad_norm": 1.4447731971740723, "learning_rate": 8.762905482515775e-06, "loss": 0.0814, "step": 18100 }, { "epoch": 24.042272126816382, "grad_norm": 1.350907802581787, "learning_rate": 8.749025275017107e-06, "loss": 0.0806, "step": 18200 }, { "epoch": 24.17437252311757, "grad_norm": 1.7207551002502441, "learning_rate": 8.735078757939532e-06, "loss": 0.08, "step": 18300 }, { "epoch": 24.30647291941876, "grad_norm": 1.0851505994796753, "learning_rate": 8.721066177957213e-06, "loss": 0.0779, "step": 18400 }, { "epoch": 24.438573315719946, "grad_norm": 1.2182328701019287, "learning_rate": 8.70698778291278e-06, "loss": 0.0814, "step": 18500 }, { "epoch": 24.438573315719946, "eval_bleu": 70.99257164437572, "eval_char_accuracy": 62.22189093600922, "eval_loss": 0.09152651578187943, "eval_runtime": 323.7174, "eval_samples_per_second": 4.677, "eval_steps_per_second": 0.587, "step": 18500 }, { "epoch": 24.570673712021136, "grad_norm": 2.0363285541534424, "learning_rate": 8.69284382181294e-06, "loss": 0.0821, "step": 18600 }, { "epoch": 24.702774108322323, "grad_norm": 1.3864432573318481, "learning_rate": 8.67863454482408e-06, "loss": 0.0809, "step": 18700 }, { "epoch": 24.834874504623514, "grad_norm": 2.032351493835449, "learning_rate": 8.664360203267838e-06, "loss": 0.0819, "step": 18800 }, { "epoch": 24.966974900924704, "grad_norm": 1.2007182836532593, "learning_rate": 8.65002104961666e-06, "loss": 0.0819, "step": 18900 }, { "epoch": 25.09907529722589, "grad_norm": 1.167693853378296, "learning_rate": 8.635617337489331e-06, "loss": 0.0778, "step": 19000 }, { "epoch": 25.09907529722589, "eval_bleu": 70.99070971451881, "eval_char_accuracy": 63.15440450732028, "eval_loss": 0.09109245985746384, "eval_runtime": 331.1976, "eval_samples_per_second": 4.571, "eval_steps_per_second": 0.574, "step": 19000 }, { "epoch": 25.23117569352708, "grad_norm": 1.9939295053482056, "learning_rate": 8.621149321646495e-06, "loss": 0.076, "step": 19100 }, { "epoch": 25.36327608982827, "grad_norm": 1.148555874824524, "learning_rate": 8.60661725798614e-06, "loss": 0.078, "step": 19200 }, { "epoch": 25.49537648612946, "grad_norm": 1.159621238708496, "learning_rate": 8.592167677001219e-06, "loss": 0.0823, "step": 19300 }, { "epoch": 25.627476882430646, "grad_norm": 1.7136179208755493, "learning_rate": 8.57750892397125e-06, "loss": 0.0755, "step": 19400 }, { "epoch": 25.759577278731836, "grad_norm": 1.105714201927185, "learning_rate": 8.5627868949981e-06, "loss": 0.0756, "step": 19500 }, { "epoch": 25.759577278731836, "eval_bleu": 71.15813151741806, "eval_char_accuracy": 63.43405576575094, "eval_loss": 0.09030098468065262, "eval_runtime": 316.3492, "eval_samples_per_second": 4.786, "eval_steps_per_second": 0.601, "step": 19500 }, { "epoch": 25.891677675033026, "grad_norm": 1.6091099977493286, "learning_rate": 8.548001850472529e-06, "loss": 0.0778, "step": 19600 }, { "epoch": 26.023778071334213, "grad_norm": 1.1730881929397583, "learning_rate": 8.533154051899864e-06, "loss": 0.0787, "step": 19700 }, { "epoch": 26.155878467635404, "grad_norm": 1.2703460454940796, "learning_rate": 8.518243761895369e-06, "loss": 0.0711, "step": 19800 }, { "epoch": 26.28797886393659, "grad_norm": 1.3379662036895752, "learning_rate": 8.503271244179608e-06, "loss": 0.075, "step": 19900 }, { "epoch": 26.42007926023778, "grad_norm": 1.370871901512146, "learning_rate": 8.488236763573772e-06, "loss": 0.0717, "step": 20000 }, { "epoch": 26.42007926023778, "eval_bleu": 72.10017178272712, "eval_char_accuracy": 63.853018588583645, "eval_loss": 0.08871379494667053, "eval_runtime": 316.1038, "eval_samples_per_second": 4.79, "eval_steps_per_second": 0.601, "step": 20000 }, { "epoch": 26.552179656538968, "grad_norm": 1.3396387100219727, "learning_rate": 8.473140585995004e-06, "loss": 0.0726, "step": 20100 }, { "epoch": 26.68428005284016, "grad_norm": 1.1219794750213623, "learning_rate": 8.457982978451683e-06, "loss": 0.0754, "step": 20200 }, { "epoch": 26.81638044914135, "grad_norm": 1.0815324783325195, "learning_rate": 8.442764209038717e-06, "loss": 0.0745, "step": 20300 }, { "epoch": 26.948480845442536, "grad_norm": 1.4396206140518188, "learning_rate": 8.427484546932789e-06, "loss": 0.0749, "step": 20400 }, { "epoch": 27.080581241743726, "grad_norm": 1.2644987106323242, "learning_rate": 8.4121442623876e-06, "loss": 0.0731, "step": 20500 }, { "epoch": 27.080581241743726, "eval_bleu": 71.61514434619416, "eval_char_accuracy": 63.06290097055437, "eval_loss": 0.08903466165065765, "eval_runtime": 316.4287, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.6, "step": 20500 }, { "epoch": 27.212681638044913, "grad_norm": 1.3456913232803345, "learning_rate": 8.396743626729093e-06, "loss": 0.0728, "step": 20600 }, { "epoch": 27.344782034346103, "grad_norm": 1.1268248558044434, "learning_rate": 8.381282912350646e-06, "loss": 0.072, "step": 20700 }, { "epoch": 27.476882430647294, "grad_norm": 0.964856743812561, "learning_rate": 8.365762392708259e-06, "loss": 0.0711, "step": 20800 }, { "epoch": 27.60898282694848, "grad_norm": 1.2877197265625, "learning_rate": 8.350182342315719e-06, "loss": 0.0681, "step": 20900 }, { "epoch": 27.74108322324967, "grad_norm": 3.0796854496002197, "learning_rate": 8.334543036739743e-06, "loss": 0.0681, "step": 21000 }, { "epoch": 27.74108322324967, "eval_bleu": 72.44362762449254, "eval_char_accuracy": 64.56139990129955, "eval_loss": 0.08809462934732437, "eval_runtime": 319.538, "eval_samples_per_second": 4.738, "eval_steps_per_second": 0.595, "step": 21000 }, { "epoch": 27.873183619550858, "grad_norm": 1.445993185043335, "learning_rate": 8.3188447525951e-06, "loss": 0.0701, "step": 21100 }, { "epoch": 28.005284015852048, "grad_norm": 0.9414767622947693, "learning_rate": 8.303087767539723e-06, "loss": 0.0698, "step": 21200 }, { "epoch": 28.137384412153235, "grad_norm": 1.0644490718841553, "learning_rate": 8.28758923914531e-06, "loss": 0.0674, "step": 21300 }, { "epoch": 28.269484808454425, "grad_norm": 1.2708711624145508, "learning_rate": 8.27171684949204e-06, "loss": 0.0689, "step": 21400 }, { "epoch": 28.401585204755616, "grad_norm": 1.2177033424377441, "learning_rate": 8.25578659248641e-06, "loss": 0.0677, "step": 21500 }, { "epoch": 28.401585204755616, "eval_bleu": 72.7907604984095, "eval_char_accuracy": 64.78141964138838, "eval_loss": 0.0869230180978775, "eval_runtime": 315.3901, "eval_samples_per_second": 4.8, "eval_steps_per_second": 0.602, "step": 21500 }, { "epoch": 28.533685601056803, "grad_norm": 1.1179392337799072, "learning_rate": 8.239798749889293e-06, "loss": 0.0673, "step": 21600 }, { "epoch": 28.665785997357993, "grad_norm": 1.2472251653671265, "learning_rate": 8.223753604480086e-06, "loss": 0.0682, "step": 21700 }, { "epoch": 28.79788639365918, "grad_norm": 0.8336161375045776, "learning_rate": 8.207651440051714e-06, "loss": 0.0689, "step": 21800 }, { "epoch": 28.92998678996037, "grad_norm": 1.2652006149291992, "learning_rate": 8.1914925414056e-06, "loss": 0.0688, "step": 21900 }, { "epoch": 29.062087186261557, "grad_norm": 1.1424204111099243, "learning_rate": 8.175277194346636e-06, "loss": 0.0677, "step": 22000 }, { "epoch": 29.062087186261557, "eval_bleu": 72.84923297683062, "eval_char_accuracy": 64.63285491034709, "eval_loss": 0.08696427941322327, "eval_runtime": 316.7435, "eval_samples_per_second": 4.78, "eval_steps_per_second": 0.6, "step": 22000 }, { "epoch": 29.194187582562748, "grad_norm": 1.617885708808899, "learning_rate": 8.159005685678126e-06, "loss": 0.0638, "step": 22100 }, { "epoch": 29.326287978863938, "grad_norm": 1.2440978288650513, "learning_rate": 8.142678303196715e-06, "loss": 0.0606, "step": 22200 }, { "epoch": 29.458388375165125, "grad_norm": 1.1825751066207886, "learning_rate": 8.12629533568729e-06, "loss": 0.0661, "step": 22300 }, { "epoch": 29.590488771466315, "grad_norm": 1.5328364372253418, "learning_rate": 8.109857072917887e-06, "loss": 0.0647, "step": 22400 }, { "epoch": 29.722589167767502, "grad_norm": 1.1308438777923584, "learning_rate": 8.093363805634556e-06, "loss": 0.0666, "step": 22500 }, { "epoch": 29.722589167767502, "eval_bleu": 73.15661772633777, "eval_char_accuracy": 63.81086527389373, "eval_loss": 0.08648520708084106, "eval_runtime": 315.6708, "eval_samples_per_second": 4.796, "eval_steps_per_second": 0.602, "step": 22500 }, { "epoch": 29.854689564068693, "grad_norm": 1.0442135334014893, "learning_rate": 8.076815825556213e-06, "loss": 0.0648, "step": 22600 }, { "epoch": 29.98678996036988, "grad_norm": 1.363897681236267, "learning_rate": 8.060213425369492e-06, "loss": 0.0654, "step": 22700 }, { "epoch": 30.11889035667107, "grad_norm": 0.7751464247703552, "learning_rate": 8.043556898723568e-06, "loss": 0.0628, "step": 22800 }, { "epoch": 30.25099075297226, "grad_norm": 0.8685150742530823, "learning_rate": 8.026846540224956e-06, "loss": 0.0584, "step": 22900 }, { "epoch": 30.383091149273447, "grad_norm": 1.1484973430633545, "learning_rate": 8.0100826454323e-06, "loss": 0.0604, "step": 23000 }, { "epoch": 30.383091149273447, "eval_bleu": 73.25107285034876, "eval_char_accuracy": 65.01943164994243, "eval_loss": 0.08666232973337173, "eval_runtime": 323.7755, "eval_samples_per_second": 4.676, "eval_steps_per_second": 0.587, "step": 23000 }, { "epoch": 30.515191545574638, "grad_norm": 0.986289918422699, "learning_rate": 7.993265510851148e-06, "loss": 0.0688, "step": 23100 }, { "epoch": 30.647291941875825, "grad_norm": 1.0403423309326172, "learning_rate": 7.97639543392872e-06, "loss": 0.0638, "step": 23200 }, { "epoch": 30.779392338177015, "grad_norm": 1.517040729522705, "learning_rate": 7.959472713048617e-06, "loss": 0.0653, "step": 23300 }, { "epoch": 30.911492734478202, "grad_norm": 1.1347965002059937, "learning_rate": 7.942497647525576e-06, "loss": 0.0642, "step": 23400 }, { "epoch": 31.043593130779392, "grad_norm": 1.0789778232574463, "learning_rate": 7.925470537600155e-06, "loss": 0.0614, "step": 23500 }, { "epoch": 31.043593130779392, "eval_bleu": 73.23277401857816, "eval_char_accuracy": 65.0646693535121, "eval_loss": 0.08618722856044769, "eval_runtime": 313.4951, "eval_samples_per_second": 4.829, "eval_steps_per_second": 0.606, "step": 23500 }, { "epoch": 31.175693527080583, "grad_norm": 1.4853187799453735, "learning_rate": 7.908391684433432e-06, "loss": 0.0585, "step": 23600 }, { "epoch": 31.30779392338177, "grad_norm": 0.9656835794448853, "learning_rate": 7.891261390101675e-06, "loss": 0.0578, "step": 23700 }, { "epoch": 31.43989431968296, "grad_norm": 1.1521549224853516, "learning_rate": 7.874079957590997e-06, "loss": 0.0622, "step": 23800 }, { "epoch": 31.571994715984147, "grad_norm": 1.0636780261993408, "learning_rate": 7.856847690792002e-06, "loss": 0.0604, "step": 23900 }, { "epoch": 31.704095112285337, "grad_norm": 1.0833789110183716, "learning_rate": 7.839564894494409e-06, "loss": 0.0633, "step": 24000 }, { "epoch": 31.704095112285337, "eval_bleu": 73.62536575895216, "eval_char_accuracy": 65.15565882546471, "eval_loss": 0.08546082675457001, "eval_runtime": 318.0064, "eval_samples_per_second": 4.761, "eval_steps_per_second": 0.597, "step": 24000 }, { "epoch": 31.836195508586528, "grad_norm": 1.1620845794677734, "learning_rate": 7.822231874381658e-06, "loss": 0.0604, "step": 24100 }, { "epoch": 31.968295904887714, "grad_norm": 1.315012812614441, "learning_rate": 7.804848937025507e-06, "loss": 0.0593, "step": 24200 }, { "epoch": 32.1003963011889, "grad_norm": 0.8739562034606934, "learning_rate": 7.787416389880605e-06, "loss": 0.0608, "step": 24300 }, { "epoch": 32.23249669749009, "grad_norm": 0.9168538451194763, "learning_rate": 7.769934541279059e-06, "loss": 0.0577, "step": 24400 }, { "epoch": 32.36459709379128, "grad_norm": 0.9820032715797424, "learning_rate": 7.752403700424978e-06, "loss": 0.0569, "step": 24500 }, { "epoch": 32.36459709379128, "eval_bleu": 73.72149088930817, "eval_char_accuracy": 65.55457312057904, "eval_loss": 0.08627723157405853, "eval_runtime": 314.5801, "eval_samples_per_second": 4.813, "eval_steps_per_second": 0.604, "step": 24500 }, { "epoch": 32.49669749009247, "grad_norm": 1.0042686462402344, "learning_rate": 7.734824177389006e-06, "loss": 0.0582, "step": 24600 }, { "epoch": 32.628797886393656, "grad_norm": 1.251654863357544, "learning_rate": 7.71719628310283e-06, "loss": 0.0589, "step": 24700 }, { "epoch": 32.760898282694846, "grad_norm": 1.3150684833526611, "learning_rate": 7.699520329353694e-06, "loss": 0.0585, "step": 24800 }, { "epoch": 32.89299867899604, "grad_norm": 1.318556547164917, "learning_rate": 7.681796628778876e-06, "loss": 0.0588, "step": 24900 }, { "epoch": 33.02509907529723, "grad_norm": 1.2693874835968018, "learning_rate": 7.664025494860155e-06, "loss": 0.0605, "step": 25000 }, { "epoch": 33.02509907529723, "eval_bleu": 73.95631775899457, "eval_char_accuracy": 65.17056670505016, "eval_loss": 0.08447689563035965, "eval_runtime": 317.601, "eval_samples_per_second": 4.767, "eval_steps_per_second": 0.598, "step": 25000 }, { "epoch": 33.15719947159842, "grad_norm": 0.7866926193237305, "learning_rate": 7.646207241918272e-06, "loss": 0.055, "step": 25100 }, { "epoch": 33.2892998678996, "grad_norm": 1.0340533256530762, "learning_rate": 7.628342185107373e-06, "loss": 0.0563, "step": 25200 }, { "epoch": 33.42140026420079, "grad_norm": 1.6704190969467163, "learning_rate": 7.610430640409427e-06, "loss": 0.0568, "step": 25300 }, { "epoch": 33.55350066050198, "grad_norm": 1.4271676540374756, "learning_rate": 7.592472924628642e-06, "loss": 0.056, "step": 25400 }, { "epoch": 33.68560105680317, "grad_norm": 1.3886315822601318, "learning_rate": 7.574469355385865e-06, "loss": 0.0552, "step": 25500 }, { "epoch": 33.68560105680317, "eval_bleu": 73.67062952266826, "eval_char_accuracy": 65.04410676098043, "eval_loss": 0.08498267084360123, "eval_runtime": 314.7186, "eval_samples_per_second": 4.811, "eval_steps_per_second": 0.604, "step": 25500 }, { "epoch": 33.81770145310436, "grad_norm": 1.1037873029708862, "learning_rate": 7.556420251112956e-06, "loss": 0.0551, "step": 25600 }, { "epoch": 33.949801849405546, "grad_norm": 2.125624418258667, "learning_rate": 7.538325931047159e-06, "loss": 0.0591, "step": 25700 }, { "epoch": 34.081902245706736, "grad_norm": 1.674501895904541, "learning_rate": 7.52018671522546e-06, "loss": 0.0561, "step": 25800 }, { "epoch": 34.21400264200793, "grad_norm": 1.386206030845642, "learning_rate": 7.502002924478924e-06, "loss": 0.0509, "step": 25900 }, { "epoch": 34.34610303830912, "grad_norm": 1.0778214931488037, "learning_rate": 7.48377488042701e-06, "loss": 0.0544, "step": 26000 }, { "epoch": 34.34610303830912, "eval_bleu": 74.62046528623556, "eval_char_accuracy": 66.02442835992763, "eval_loss": 0.08464069664478302, "eval_runtime": 316.1374, "eval_samples_per_second": 4.789, "eval_steps_per_second": 0.601, "step": 26000 }, { "epoch": 34.4782034346103, "grad_norm": 0.8076276779174805, "learning_rate": 7.465502905471907e-06, "loss": 0.055, "step": 26100 }, { "epoch": 34.61030383091149, "grad_norm": 1.1508395671844482, "learning_rate": 7.447187322792806e-06, "loss": 0.057, "step": 26200 }, { "epoch": 34.74240422721268, "grad_norm": 1.2695698738098145, "learning_rate": 7.4288284563401945e-06, "loss": 0.055, "step": 26300 }, { "epoch": 34.87450462351387, "grad_norm": 1.166051983833313, "learning_rate": 7.410426630830131e-06, "loss": 0.0552, "step": 26400 }, { "epoch": 35.00660501981506, "grad_norm": 0.9517413973808289, "learning_rate": 7.391982171738496e-06, "loss": 0.0555, "step": 26500 }, { "epoch": 35.00660501981506, "eval_bleu": 74.18793581426324, "eval_char_accuracy": 65.63373910182597, "eval_loss": 0.08454510569572449, "eval_runtime": 313.1895, "eval_samples_per_second": 4.834, "eval_steps_per_second": 0.607, "step": 26500 }, { "epoch": 35.138705416116245, "grad_norm": 1.1265789270401, "learning_rate": 7.373495405295236e-06, "loss": 0.0529, "step": 26600 }, { "epoch": 35.270805812417436, "grad_norm": 1.0067466497421265, "learning_rate": 7.354966658478594e-06, "loss": 0.0502, "step": 26700 }, { "epoch": 35.402906208718626, "grad_norm": 0.9871610999107361, "learning_rate": 7.336396259009325e-06, "loss": 0.0508, "step": 26800 }, { "epoch": 35.53500660501982, "grad_norm": 1.4898390769958496, "learning_rate": 7.317784535344905e-06, "loss": 0.0544, "step": 26900 }, { "epoch": 35.66710700132101, "grad_norm": 1.1168763637542725, "learning_rate": 7.2991318166737126e-06, "loss": 0.0535, "step": 27000 }, { "epoch": 35.66710700132101, "eval_bleu": 74.4767569683976, "eval_char_accuracy": 65.44353512090805, "eval_loss": 0.08464961498975754, "eval_runtime": 313.4254, "eval_samples_per_second": 4.83, "eval_steps_per_second": 0.606, "step": 27000 }, { "epoch": 35.79920739762219, "grad_norm": 1.197704553604126, "learning_rate": 7.280625566954032e-06, "loss": 0.0547, "step": 27100 }, { "epoch": 35.93130779392338, "grad_norm": 1.6144860982894897, "learning_rate": 7.261892250434568e-06, "loss": 0.0516, "step": 27200 }, { "epoch": 36.06340819022457, "grad_norm": 1.1599304676055908, "learning_rate": 7.243118927483657e-06, "loss": 0.0502, "step": 27300 }, { "epoch": 36.19550858652576, "grad_norm": 1.043449878692627, "learning_rate": 7.22430593014791e-06, "loss": 0.0472, "step": 27400 }, { "epoch": 36.32760898282695, "grad_norm": 1.1489434242248535, "learning_rate": 7.205453591175666e-06, "loss": 0.0558, "step": 27500 }, { "epoch": 36.32760898282695, "eval_bleu": 74.20946250489693, "eval_char_accuracy": 65.89436996216483, "eval_loss": 0.08468983322381973, "eval_runtime": 314.7938, "eval_samples_per_second": 4.809, "eval_steps_per_second": 0.604, "step": 27500 }, { "epoch": 36.32760898282695, "step": 27500, "total_flos": 7035660725649408.0, "train_loss": 0.282735899699818, "train_runtime": 21727.9767, "train_samples_per_second": 55.735, "train_steps_per_second": 3.484 } ], "logging_steps": 100, "max_steps": 75700, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7035660725649408.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }