{ "best_global_step": 8775, "best_metric": 2.2569968700408936, "best_model_checkpoint": "./output/bert-base-mti881/checkpoint-8775", "epoch": 15.0, "eval_steps": 500, "global_step": 43875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17094017094017094, "grad_norm": 1.6328529119491577, "learning_rate": 4.943019943019943e-05, "loss": 2.469, "step": 500 }, { "epoch": 0.3418803418803419, "grad_norm": 1.7880568504333496, "learning_rate": 4.886039886039887e-05, "loss": 2.3525, "step": 1000 }, { "epoch": 0.5128205128205128, "grad_norm": 2.440093994140625, "learning_rate": 4.829059829059829e-05, "loss": 2.331, "step": 1500 }, { "epoch": 0.6837606837606838, "grad_norm": 2.336617946624756, "learning_rate": 4.772079772079772e-05, "loss": 2.3127, "step": 2000 }, { "epoch": 0.8547008547008547, "grad_norm": 2.22334885597229, "learning_rate": 4.7150997150997157e-05, "loss": 2.2999, "step": 2500 }, { "epoch": 1.0, "eval_accuracy": 0.8566329784624442, "eval_f1": 0.5589353612167302, "eval_loss": 2.286189317703247, "eval_precision": 0.5451906796742724, "eval_recall": 0.5733909946578479, "eval_runtime": 6.4563, "eval_samples_per_second": 452.889, "eval_steps_per_second": 56.689, "step": 2925 }, { "epoch": 1.0256410256410255, "grad_norm": 1.5501340627670288, "learning_rate": 4.6581196581196586e-05, "loss": 2.2831, "step": 3000 }, { "epoch": 1.1965811965811965, "grad_norm": 1.7197738885879517, "learning_rate": 4.6011396011396016e-05, "loss": 2.2335, "step": 3500 }, { "epoch": 1.3675213675213675, "grad_norm": 2.045734405517578, "learning_rate": 4.544159544159544e-05, "loss": 2.2371, "step": 4000 }, { "epoch": 1.5384615384615383, "grad_norm": 1.6353585720062256, "learning_rate": 4.4871794871794874e-05, "loss": 2.2339, "step": 4500 }, { "epoch": 1.7094017094017095, "grad_norm": 2.460322141647339, "learning_rate": 4.4301994301994304e-05, "loss": 2.233, "step": 5000 }, { "epoch": 1.8803418803418803, "grad_norm": 1.5123356580734253, "learning_rate": 4.3732193732193733e-05, "loss": 2.2263, "step": 5500 }, { "epoch": 2.0, "eval_accuracy": 0.8603247804543002, "eval_f1": 0.5899178255372945, "eval_loss": 2.275588274002075, "eval_precision": 0.5522597825282936, "eval_recall": 0.6330874247434919, "eval_runtime": 5.6077, "eval_samples_per_second": 521.421, "eval_steps_per_second": 65.267, "step": 5850 }, { "epoch": 2.051282051282051, "grad_norm": 1.0685631036758423, "learning_rate": 4.316239316239317e-05, "loss": 2.2079, "step": 6000 }, { "epoch": 2.2222222222222223, "grad_norm": 1.3912978172302246, "learning_rate": 4.259259259259259e-05, "loss": 2.1651, "step": 6500 }, { "epoch": 2.393162393162393, "grad_norm": 1.7378512620925903, "learning_rate": 4.202279202279202e-05, "loss": 2.1688, "step": 7000 }, { "epoch": 2.564102564102564, "grad_norm": 2.20090913772583, "learning_rate": 4.145299145299146e-05, "loss": 2.1664, "step": 7500 }, { "epoch": 2.735042735042735, "grad_norm": 1.6386638879776, "learning_rate": 4.088319088319089e-05, "loss": 2.1683, "step": 8000 }, { "epoch": 2.905982905982906, "grad_norm": 0.7773854732513428, "learning_rate": 4.031339031339032e-05, "loss": 2.1624, "step": 8500 }, { "epoch": 3.0, "eval_accuracy": 0.8788941765196487, "eval_f1": 0.6219396959024139, "eval_loss": 2.2569968700408936, "eval_precision": 0.6301679867699539, "eval_recall": 0.6139235139489527, "eval_runtime": 5.624, "eval_samples_per_second": 519.914, "eval_steps_per_second": 65.078, "step": 8775 }, { "epoch": 3.076923076923077, "grad_norm": 2.1296703815460205, "learning_rate": 3.974358974358974e-05, "loss": 2.1407, "step": 9000 }, { "epoch": 3.247863247863248, "grad_norm": 3.029876708984375, "learning_rate": 3.9173789173789176e-05, "loss": 2.1139, "step": 9500 }, { "epoch": 3.4188034188034186, "grad_norm": 2.393371820449829, "learning_rate": 3.8603988603988605e-05, "loss": 2.117, "step": 10000 }, { "epoch": 3.58974358974359, "grad_norm": 3.3726866245269775, "learning_rate": 3.8034188034188035e-05, "loss": 2.1141, "step": 10500 }, { "epoch": 3.7606837606837606, "grad_norm": 1.123772382736206, "learning_rate": 3.746438746438747e-05, "loss": 2.1151, "step": 11000 }, { "epoch": 3.931623931623932, "grad_norm": 2.8514039516448975, "learning_rate": 3.6894586894586894e-05, "loss": 2.1192, "step": 11500 }, { "epoch": 4.0, "eval_accuracy": 0.8776431339842026, "eval_f1": 0.6255963151834184, "eval_loss": 2.269813299179077, "eval_precision": 0.6073624530863212, "eval_recall": 0.6449588739082507, "eval_runtime": 5.588, "eval_samples_per_second": 523.26, "eval_steps_per_second": 65.497, "step": 11700 }, { "epoch": 4.102564102564102, "grad_norm": 0.6382957100868225, "learning_rate": 3.6324786324786323e-05, "loss": 2.0942, "step": 12000 }, { "epoch": 4.273504273504273, "grad_norm": 2.4572439193725586, "learning_rate": 3.575498575498576e-05, "loss": 2.079, "step": 12500 }, { "epoch": 4.444444444444445, "grad_norm": 4.030599117279053, "learning_rate": 3.518518518518519e-05, "loss": 2.0824, "step": 13000 }, { "epoch": 4.615384615384615, "grad_norm": 1.3176660537719727, "learning_rate": 3.461538461538462e-05, "loss": 2.0844, "step": 13500 }, { "epoch": 4.786324786324786, "grad_norm": 2.164088010787964, "learning_rate": 3.404558404558404e-05, "loss": 2.0804, "step": 14000 }, { "epoch": 4.957264957264957, "grad_norm": 6.9171552658081055, "learning_rate": 3.347578347578348e-05, "loss": 2.0896, "step": 14500 }, { "epoch": 5.0, "eval_accuracy": 0.8791149487317863, "eval_f1": 0.6324646008618922, "eval_loss": 2.2901737689971924, "eval_precision": 0.612869869551384, "eval_recall": 0.6533536843890444, "eval_runtime": 5.621, "eval_samples_per_second": 520.196, "eval_steps_per_second": 65.113, "step": 14625 }, { "epoch": 5.128205128205128, "grad_norm": 2.0550243854522705, "learning_rate": 3.290598290598291e-05, "loss": 2.0631, "step": 15000 }, { "epoch": 5.299145299145299, "grad_norm": 1.2494322061538696, "learning_rate": 3.2336182336182337e-05, "loss": 2.0561, "step": 15500 }, { "epoch": 5.47008547008547, "grad_norm": 2.4397966861724854, "learning_rate": 3.176638176638177e-05, "loss": 2.058, "step": 16000 }, { "epoch": 5.641025641025641, "grad_norm": 2.813675880432129, "learning_rate": 3.1196581196581195e-05, "loss": 2.0611, "step": 16500 }, { "epoch": 5.811965811965812, "grad_norm": 1.493696928024292, "learning_rate": 3.0626780626780625e-05, "loss": 2.0609, "step": 17000 }, { "epoch": 5.982905982905983, "grad_norm": 2.580273389816284, "learning_rate": 3.005698005698006e-05, "loss": 2.0621, "step": 17500 }, { "epoch": 6.0, "eval_accuracy": 0.8811141637639209, "eval_f1": 0.6368790156637131, "eval_loss": 2.2965099811553955, "eval_precision": 0.630865224625624, "eval_recall": 0.6430085644026117, "eval_runtime": 5.5652, "eval_samples_per_second": 525.412, "eval_steps_per_second": 65.766, "step": 17550 }, { "epoch": 6.153846153846154, "grad_norm": 3.136852741241455, "learning_rate": 2.948717948717949e-05, "loss": 2.0441, "step": 18000 }, { "epoch": 6.3247863247863245, "grad_norm": 1.3632102012634277, "learning_rate": 2.8917378917378917e-05, "loss": 2.0433, "step": 18500 }, { "epoch": 6.495726495726496, "grad_norm": 4.941199779510498, "learning_rate": 2.8347578347578346e-05, "loss": 2.0427, "step": 19000 }, { "epoch": 6.666666666666667, "grad_norm": 2.8133013248443604, "learning_rate": 2.777777777777778e-05, "loss": 2.0436, "step": 19500 }, { "epoch": 6.837606837606837, "grad_norm": 1.1807732582092285, "learning_rate": 2.720797720797721e-05, "loss": 2.0442, "step": 20000 }, { "epoch": 7.0, "eval_accuracy": 0.8829661973212971, "eval_f1": 0.6478284496091627, "eval_loss": 2.306105852127075, "eval_precision": 0.6388293487221764, "eval_recall": 0.657084711269397, "eval_runtime": 5.5992, "eval_samples_per_second": 522.219, "eval_steps_per_second": 65.367, "step": 20475 }, { "epoch": 7.0085470085470085, "grad_norm": 1.7212845087051392, "learning_rate": 2.6638176638176638e-05, "loss": 2.0458, "step": 20500 }, { "epoch": 7.17948717948718, "grad_norm": 2.134288787841797, "learning_rate": 2.606837606837607e-05, "loss": 2.0322, "step": 21000 }, { "epoch": 7.35042735042735, "grad_norm": 2.6075599193573, "learning_rate": 2.54985754985755e-05, "loss": 2.033, "step": 21500 }, { "epoch": 7.521367521367521, "grad_norm": 0.940613329410553, "learning_rate": 2.492877492877493e-05, "loss": 2.0315, "step": 22000 }, { "epoch": 7.6923076923076925, "grad_norm": 5.997873783111572, "learning_rate": 2.435897435897436e-05, "loss": 2.0317, "step": 22500 }, { "epoch": 7.863247863247864, "grad_norm": 1.9498519897460938, "learning_rate": 2.3789173789173792e-05, "loss": 2.0301, "step": 23000 }, { "epoch": 8.0, "eval_accuracy": 0.8818132757690232, "eval_f1": 0.6476386036960986, "eval_loss": 2.3260273933410645, "eval_precision": 0.6279366090626742, "eval_recall": 0.6686169761723056, "eval_runtime": 5.5616, "eval_samples_per_second": 525.751, "eval_steps_per_second": 65.809, "step": 23400 }, { "epoch": 8.034188034188034, "grad_norm": 3.1696274280548096, "learning_rate": 2.321937321937322e-05, "loss": 2.0325, "step": 23500 }, { "epoch": 8.205128205128204, "grad_norm": 0.9211856126785278, "learning_rate": 2.264957264957265e-05, "loss": 2.0222, "step": 24000 }, { "epoch": 8.376068376068377, "grad_norm": 2.332916259765625, "learning_rate": 2.207977207977208e-05, "loss": 2.0244, "step": 24500 }, { "epoch": 8.547008547008547, "grad_norm": 1.2731038331985474, "learning_rate": 2.150997150997151e-05, "loss": 2.0242, "step": 25000 }, { "epoch": 8.717948717948717, "grad_norm": 0.8299376964569092, "learning_rate": 2.0940170940170943e-05, "loss": 2.0238, "step": 25500 }, { "epoch": 8.88888888888889, "grad_norm": 1.503308892250061, "learning_rate": 2.037037037037037e-05, "loss": 2.0242, "step": 26000 }, { "epoch": 9.0, "eval_accuracy": 0.8830275229357798, "eval_f1": 0.6493926454127109, "eval_loss": 2.3398172855377197, "eval_precision": 0.6353017521090201, "eval_recall": 0.6641227847027897, "eval_runtime": 5.6249, "eval_samples_per_second": 519.833, "eval_steps_per_second": 65.068, "step": 26325 }, { "epoch": 9.05982905982906, "grad_norm": 1.7587120532989502, "learning_rate": 1.9800569800569802e-05, "loss": 2.0226, "step": 26500 }, { "epoch": 9.23076923076923, "grad_norm": 0.7542155385017395, "learning_rate": 1.923076923076923e-05, "loss": 2.0177, "step": 27000 }, { "epoch": 9.401709401709402, "grad_norm": 0.33988329768180847, "learning_rate": 1.866096866096866e-05, "loss": 2.0203, "step": 27500 }, { "epoch": 9.572649572649572, "grad_norm": 1.8626066446304321, "learning_rate": 1.8091168091168094e-05, "loss": 2.0175, "step": 28000 }, { "epoch": 9.743589743589745, "grad_norm": 2.40765118598938, "learning_rate": 1.752136752136752e-05, "loss": 2.0183, "step": 28500 }, { "epoch": 9.914529914529915, "grad_norm": 2.155571222305298, "learning_rate": 1.6951566951566953e-05, "loss": 2.0173, "step": 29000 }, { "epoch": 10.0, "eval_accuracy": 0.8841559142422607, "eval_f1": 0.652157598499062, "eval_loss": 2.3391082286834717, "eval_precision": 0.641486220472441, "eval_recall": 0.6631900279827017, "eval_runtime": 5.5617, "eval_samples_per_second": 525.742, "eval_steps_per_second": 65.808, "step": 29250 }, { "epoch": 10.085470085470085, "grad_norm": 1.149816870689392, "learning_rate": 1.6381766381766382e-05, "loss": 2.0171, "step": 29500 }, { "epoch": 10.256410256410255, "grad_norm": 0.5041487812995911, "learning_rate": 1.581196581196581e-05, "loss": 2.0133, "step": 30000 }, { "epoch": 10.427350427350428, "grad_norm": 6.211667537689209, "learning_rate": 1.5242165242165243e-05, "loss": 2.0144, "step": 30500 }, { "epoch": 10.598290598290598, "grad_norm": 0.1538165956735611, "learning_rate": 1.4672364672364672e-05, "loss": 2.0135, "step": 31000 }, { "epoch": 10.76923076923077, "grad_norm": 1.0518053770065308, "learning_rate": 1.4102564102564104e-05, "loss": 2.0128, "step": 31500 }, { "epoch": 10.94017094017094, "grad_norm": 1.116525650024414, "learning_rate": 1.3532763532763535e-05, "loss": 2.0132, "step": 32000 }, { "epoch": 11.0, "eval_accuracy": 0.8832973556395035, "eval_f1": 0.6500785318674052, "eval_loss": 2.3498170375823975, "eval_precision": 0.634142407870333, "eval_recall": 0.6668362587975918, "eval_runtime": 5.7697, "eval_samples_per_second": 506.782, "eval_steps_per_second": 63.434, "step": 32175 }, { "epoch": 11.11111111111111, "grad_norm": 0.1830213963985443, "learning_rate": 1.2962962962962962e-05, "loss": 2.0121, "step": 32500 }, { "epoch": 11.282051282051283, "grad_norm": 2.5111734867095947, "learning_rate": 1.2393162393162394e-05, "loss": 2.0103, "step": 33000 }, { "epoch": 11.452991452991453, "grad_norm": 3.7082180976867676, "learning_rate": 1.1823361823361825e-05, "loss": 2.0103, "step": 33500 }, { "epoch": 11.623931623931623, "grad_norm": 1.1296755075454712, "learning_rate": 1.1253561253561254e-05, "loss": 2.011, "step": 34000 }, { "epoch": 11.794871794871796, "grad_norm": 2.4463248252868652, "learning_rate": 1.0683760683760684e-05, "loss": 2.0093, "step": 34500 }, { "epoch": 11.965811965811966, "grad_norm": 0.03058500401675701, "learning_rate": 1.0113960113960115e-05, "loss": 2.0097, "step": 35000 }, { "epoch": 12.0, "eval_accuracy": 0.8845851935436393, "eval_f1": 0.6505743299483937, "eval_loss": 2.355226993560791, "eval_precision": 0.6388230486309767, "eval_recall": 0.6627660476553888, "eval_runtime": 5.5805, "eval_samples_per_second": 523.964, "eval_steps_per_second": 65.585, "step": 35100 }, { "epoch": 12.136752136752136, "grad_norm": 1.262992024421692, "learning_rate": 9.544159544159544e-06, "loss": 2.0083, "step": 35500 }, { "epoch": 12.307692307692308, "grad_norm": 0.350888192653656, "learning_rate": 8.974358974358976e-06, "loss": 2.0082, "step": 36000 }, { "epoch": 12.478632478632479, "grad_norm": 0.7504994869232178, "learning_rate": 8.404558404558405e-06, "loss": 2.0089, "step": 36500 }, { "epoch": 12.649572649572649, "grad_norm": 2.052617311477661, "learning_rate": 7.834757834757835e-06, "loss": 2.0072, "step": 37000 }, { "epoch": 12.820512820512821, "grad_norm": 0.4613409638404846, "learning_rate": 7.264957264957266e-06, "loss": 2.0073, "step": 37500 }, { "epoch": 12.991452991452991, "grad_norm": 4.136294364929199, "learning_rate": 6.695156695156696e-06, "loss": 2.007, "step": 38000 }, { "epoch": 13.0, "eval_accuracy": 0.8839228769072266, "eval_f1": 0.6545124566903151, "eval_loss": 2.3634226322174072, "eval_precision": 0.6372178941450486, "eval_recall": 0.6727719833799711, "eval_runtime": 5.622, "eval_samples_per_second": 520.097, "eval_steps_per_second": 65.101, "step": 38025 }, { "epoch": 13.162393162393162, "grad_norm": 0.16694723069667816, "learning_rate": 6.1253561253561255e-06, "loss": 2.0057, "step": 38500 }, { "epoch": 13.333333333333334, "grad_norm": 0.8811143636703491, "learning_rate": 5.555555555555556e-06, "loss": 2.0065, "step": 39000 }, { "epoch": 13.504273504273504, "grad_norm": 0.4992905855178833, "learning_rate": 4.985754985754986e-06, "loss": 2.0068, "step": 39500 }, { "epoch": 13.675213675213675, "grad_norm": 0.6530119180679321, "learning_rate": 4.415954415954416e-06, "loss": 2.0052, "step": 40000 }, { "epoch": 13.846153846153847, "grad_norm": 2.222022771835327, "learning_rate": 3.846153846153847e-06, "loss": 2.0062, "step": 40500 }, { "epoch": 14.0, "eval_accuracy": 0.884462542314674, "eval_f1": 0.6561026065370293, "eval_loss": 2.3629047870635986, "eval_precision": 0.6406237375777653, "eval_recall": 0.6723480030526584, "eval_runtime": 5.6036, "eval_samples_per_second": 521.81, "eval_steps_per_second": 65.316, "step": 40950 }, { "epoch": 14.017094017094017, "grad_norm": 0.11298029124736786, "learning_rate": 3.2763532763532763e-06, "loss": 2.0064, "step": 41000 }, { "epoch": 14.188034188034187, "grad_norm": 0.11808889359235764, "learning_rate": 2.7065527065527066e-06, "loss": 2.0048, "step": 41500 }, { "epoch": 14.35897435897436, "grad_norm": 0.051862556487321854, "learning_rate": 2.136752136752137e-06, "loss": 2.0052, "step": 42000 }, { "epoch": 14.52991452991453, "grad_norm": 0.021300671622157097, "learning_rate": 1.566951566951567e-06, "loss": 2.0053, "step": 42500 }, { "epoch": 14.7008547008547, "grad_norm": 0.11307813972234726, "learning_rate": 9.971509971509971e-07, "loss": 2.005, "step": 43000 }, { "epoch": 14.871794871794872, "grad_norm": 1.3423974514007568, "learning_rate": 4.273504273504274e-07, "loss": 2.0041, "step": 43500 }, { "epoch": 15.0, "eval_accuracy": 0.8847446401412942, "eval_f1": 0.6565610672834661, "eval_loss": 2.365044116973877, "eval_precision": 0.6400386535674022, "eval_recall": 0.673959128296447, "eval_runtime": 5.6195, "eval_samples_per_second": 520.335, "eval_steps_per_second": 65.131, "step": 43875 }, { "epoch": 15.0, "step": 43875, "total_flos": 1.39563382170006e+16, "train_loss": 2.07735239021323, "train_runtime": 2398.7609, "train_samples_per_second": 146.319, "train_steps_per_second": 18.291 } ], "logging_steps": 500, "max_steps": 43875, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.39563382170006e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }