|
{ |
|
"best_global_step": 36872, |
|
"best_metric": 0.995850622406639, |
|
"best_model_checkpoint": "./rubert-good-bad/checkpoint-36872", |
|
"epoch": 12.0, |
|
"eval_steps": 500, |
|
"global_step": 40224, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14916467780429593, |
|
"grad_norm": 7.852961540222168, |
|
"learning_rate": 1.975188941925219e-05, |
|
"loss": 0.4052, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.29832935560859186, |
|
"grad_norm": 3.1220028400421143, |
|
"learning_rate": 1.9503281622911697e-05, |
|
"loss": 0.2525, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.44749403341288785, |
|
"grad_norm": 4.411465644836426, |
|
"learning_rate": 1.9254673826571203e-05, |
|
"loss": 0.2181, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5966587112171837, |
|
"grad_norm": 4.210575580596924, |
|
"learning_rate": 1.900606603023071e-05, |
|
"loss": 0.1906, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7458233890214797, |
|
"grad_norm": 7.871537685394287, |
|
"learning_rate": 1.8757458233890215e-05, |
|
"loss": 0.1713, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8949880668257757, |
|
"grad_norm": 4.302709102630615, |
|
"learning_rate": 1.8508850437549724e-05, |
|
"loss": 0.1618, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9545433353536296, |
|
"eval_loss": 0.11475327610969543, |
|
"eval_runtime": 5.422, |
|
"eval_samples_per_second": 3955.922, |
|
"eval_steps_per_second": 123.755, |
|
"step": 3352 |
|
}, |
|
{ |
|
"epoch": 1.0441527446300716, |
|
"grad_norm": 6.954808235168457, |
|
"learning_rate": 1.826024264120923e-05, |
|
"loss": 0.1452, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.1933174224343674, |
|
"grad_norm": 6.441762924194336, |
|
"learning_rate": 1.8011634844868736e-05, |
|
"loss": 0.1349, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3424821002386635, |
|
"grad_norm": 2.3034775257110596, |
|
"learning_rate": 1.7763027048528242e-05, |
|
"loss": 0.1261, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.4916467780429594, |
|
"grad_norm": 1.0179331302642822, |
|
"learning_rate": 1.751441925218775e-05, |
|
"loss": 0.116, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.6408114558472553, |
|
"grad_norm": 8.965306282043457, |
|
"learning_rate": 1.7265811455847257e-05, |
|
"loss": 0.1095, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.7899761336515514, |
|
"grad_norm": 2.703307867050171, |
|
"learning_rate": 1.7017203659506763e-05, |
|
"loss": 0.114, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.9391408114558473, |
|
"grad_norm": 8.306221961975098, |
|
"learning_rate": 1.676859586316627e-05, |
|
"loss": 0.1088, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9744510233577323, |
|
"eval_loss": 0.06880993396043777, |
|
"eval_runtime": 5.2887, |
|
"eval_samples_per_second": 4055.651, |
|
"eval_steps_per_second": 126.875, |
|
"step": 6704 |
|
}, |
|
{ |
|
"epoch": 2.088305489260143, |
|
"grad_norm": 5.413713455200195, |
|
"learning_rate": 1.6519988066825775e-05, |
|
"loss": 0.096, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.2374701670644392, |
|
"grad_norm": 9.367850303649902, |
|
"learning_rate": 1.6271380270485284e-05, |
|
"loss": 0.0894, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.386634844868735, |
|
"grad_norm": 0.4695867896080017, |
|
"learning_rate": 1.602277247414479e-05, |
|
"loss": 0.0882, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.535799522673031, |
|
"grad_norm": 8.3032808303833, |
|
"learning_rate": 1.57741646778043e-05, |
|
"loss": 0.0885, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.684964200477327, |
|
"grad_norm": 3.0729820728302, |
|
"learning_rate": 1.5525556881463802e-05, |
|
"loss": 0.0844, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.834128878281623, |
|
"grad_norm": 6.244034767150879, |
|
"learning_rate": 1.527694908512331e-05, |
|
"loss": 0.0854, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.983293556085919, |
|
"grad_norm": 14.806967735290527, |
|
"learning_rate": 1.5028341288782817e-05, |
|
"loss": 0.0871, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9813044897198004, |
|
"eval_loss": 0.04988732188940048, |
|
"eval_runtime": 5.3443, |
|
"eval_samples_per_second": 4013.427, |
|
"eval_steps_per_second": 125.554, |
|
"step": 10056 |
|
}, |
|
{ |
|
"epoch": 3.132458233890215, |
|
"grad_norm": 3.9732472896575928, |
|
"learning_rate": 1.4779733492442325e-05, |
|
"loss": 0.0745, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.2816229116945106, |
|
"grad_norm": 3.8117780685424805, |
|
"learning_rate": 1.4531125696101831e-05, |
|
"loss": 0.0697, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.4307875894988067, |
|
"grad_norm": 1.4716224670410156, |
|
"learning_rate": 1.4282517899761337e-05, |
|
"loss": 0.0707, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.579952267303103, |
|
"grad_norm": 1.516440749168396, |
|
"learning_rate": 1.4033910103420844e-05, |
|
"loss": 0.0687, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.7291169451073984, |
|
"grad_norm": 9.91383171081543, |
|
"learning_rate": 1.378530230708035e-05, |
|
"loss": 0.074, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.8782816229116945, |
|
"grad_norm": 0.14348161220550537, |
|
"learning_rate": 1.3536694510739858e-05, |
|
"loss": 0.0743, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.986386311716164, |
|
"eval_loss": 0.037734854966402054, |
|
"eval_runtime": 5.3579, |
|
"eval_samples_per_second": 4003.226, |
|
"eval_steps_per_second": 125.235, |
|
"step": 13408 |
|
}, |
|
{ |
|
"epoch": 4.02744630071599, |
|
"grad_norm": 3.358393430709839, |
|
"learning_rate": 1.3288086714399364e-05, |
|
"loss": 0.0687, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.176610978520286, |
|
"grad_norm": 11.118879318237305, |
|
"learning_rate": 1.3039478918058872e-05, |
|
"loss": 0.0572, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.325775656324582, |
|
"grad_norm": 3.1460227966308594, |
|
"learning_rate": 1.2790871121718378e-05, |
|
"loss": 0.0604, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.4749403341288785, |
|
"grad_norm": 5.454690933227539, |
|
"learning_rate": 1.2542263325377885e-05, |
|
"loss": 0.0576, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.624105011933175, |
|
"grad_norm": 5.29376220703125, |
|
"learning_rate": 1.2293655529037391e-05, |
|
"loss": 0.0619, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 4.77326968973747, |
|
"grad_norm": 0.4051016867160797, |
|
"learning_rate": 1.2045047732696897e-05, |
|
"loss": 0.0591, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.922434367541766, |
|
"grad_norm": 3.9757938385009766, |
|
"learning_rate": 1.1796439936356405e-05, |
|
"loss": 0.0566, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9901627115483239, |
|
"eval_loss": 0.02832028456032276, |
|
"eval_runtime": 5.3872, |
|
"eval_samples_per_second": 3981.457, |
|
"eval_steps_per_second": 124.554, |
|
"step": 16760 |
|
}, |
|
{ |
|
"epoch": 5.071599045346062, |
|
"grad_norm": 5.767542839050293, |
|
"learning_rate": 1.154783214001591e-05, |
|
"loss": 0.0554, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.220763723150358, |
|
"grad_norm": 0.15083061158657074, |
|
"learning_rate": 1.1299224343675418e-05, |
|
"loss": 0.0507, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.369928400954654, |
|
"grad_norm": 3.9027342796325684, |
|
"learning_rate": 1.1050616547334924e-05, |
|
"loss": 0.0526, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.519093078758949, |
|
"grad_norm": 0.34978094696998596, |
|
"learning_rate": 1.0802008750994433e-05, |
|
"loss": 0.0489, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 5.6682577565632455, |
|
"grad_norm": 24.885211944580078, |
|
"learning_rate": 1.0553400954653938e-05, |
|
"loss": 0.0496, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 5.817422434367542, |
|
"grad_norm": 3.8451194763183594, |
|
"learning_rate": 1.0304793158313447e-05, |
|
"loss": 0.0462, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 5.966587112171838, |
|
"grad_norm": 2.7605342864990234, |
|
"learning_rate": 1.0056185361972953e-05, |
|
"loss": 0.0524, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9920276003543289, |
|
"eval_loss": 0.023812290281057358, |
|
"eval_runtime": 5.7441, |
|
"eval_samples_per_second": 3734.067, |
|
"eval_steps_per_second": 116.815, |
|
"step": 20112 |
|
}, |
|
{ |
|
"epoch": 6.115751789976134, |
|
"grad_norm": 12.299605369567871, |
|
"learning_rate": 9.807577565632459e-06, |
|
"loss": 0.0451, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 6.26491646778043, |
|
"grad_norm": 0.014037308283150196, |
|
"learning_rate": 9.558969769291967e-06, |
|
"loss": 0.0439, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.414081145584726, |
|
"grad_norm": 4.366552829742432, |
|
"learning_rate": 9.310361972951472e-06, |
|
"loss": 0.0447, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 6.563245823389021, |
|
"grad_norm": 0.3234240412712097, |
|
"learning_rate": 9.06175417661098e-06, |
|
"loss": 0.0474, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 6.712410501193317, |
|
"grad_norm": 5.625737190246582, |
|
"learning_rate": 8.813146380270486e-06, |
|
"loss": 0.0401, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 6.861575178997613, |
|
"grad_norm": 1.9961190223693848, |
|
"learning_rate": 8.564538583929994e-06, |
|
"loss": 0.0442, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9929134225371812, |
|
"eval_loss": 0.02055698074400425, |
|
"eval_runtime": 5.2686, |
|
"eval_samples_per_second": 4071.116, |
|
"eval_steps_per_second": 127.359, |
|
"step": 23464 |
|
}, |
|
{ |
|
"epoch": 7.0107398568019095, |
|
"grad_norm": 0.7006446123123169, |
|
"learning_rate": 8.3159307875895e-06, |
|
"loss": 0.0446, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 7.159904534606206, |
|
"grad_norm": 0.009974448010325432, |
|
"learning_rate": 8.067322991249006e-06, |
|
"loss": 0.0353, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 7.309069212410501, |
|
"grad_norm": 13.073637962341309, |
|
"learning_rate": 7.818715194908513e-06, |
|
"loss": 0.0327, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 7.458233890214797, |
|
"grad_norm": 0.47934669256210327, |
|
"learning_rate": 7.57010739856802e-06, |
|
"loss": 0.0415, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 7.607398568019093, |
|
"grad_norm": 9.82204532623291, |
|
"learning_rate": 7.321499602227527e-06, |
|
"loss": 0.0396, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 7.756563245823389, |
|
"grad_norm": 3.7507293224334717, |
|
"learning_rate": 7.0728918058870335e-06, |
|
"loss": 0.0419, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 7.905727923627685, |
|
"grad_norm": 12.792013168334961, |
|
"learning_rate": 6.82428400954654e-06, |
|
"loss": 0.0437, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9944053335819851, |
|
"eval_loss": 0.017598649486899376, |
|
"eval_runtime": 5.3397, |
|
"eval_samples_per_second": 4016.869, |
|
"eval_steps_per_second": 125.662, |
|
"step": 26816 |
|
}, |
|
{ |
|
"epoch": 8.05489260143198, |
|
"grad_norm": 2.062351703643799, |
|
"learning_rate": 6.575676213206047e-06, |
|
"loss": 0.037, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 8.204057279236277, |
|
"grad_norm": 7.230820178985596, |
|
"learning_rate": 6.327068416865554e-06, |
|
"loss": 0.0363, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 8.353221957040573, |
|
"grad_norm": 9.108207702636719, |
|
"learning_rate": 6.078460620525061e-06, |
|
"loss": 0.0343, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 8.50238663484487, |
|
"grad_norm": 3.1893274784088135, |
|
"learning_rate": 5.829852824184567e-06, |
|
"loss": 0.0374, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 8.651551312649165, |
|
"grad_norm": 7.297391414642334, |
|
"learning_rate": 5.581245027844073e-06, |
|
"loss": 0.0371, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 8.80071599045346, |
|
"grad_norm": 0.8091713190078735, |
|
"learning_rate": 5.33263723150358e-06, |
|
"loss": 0.0322, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 8.949880668257757, |
|
"grad_norm": 9.286097526550293, |
|
"learning_rate": 5.084029435163087e-06, |
|
"loss": 0.0366, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.995104666884237, |
|
"eval_loss": 0.015091103501617908, |
|
"eval_runtime": 5.5227, |
|
"eval_samples_per_second": 3883.811, |
|
"eval_steps_per_second": 121.499, |
|
"step": 30168 |
|
}, |
|
{ |
|
"epoch": 9.099045346062052, |
|
"grad_norm": 2.9286885261535645, |
|
"learning_rate": 4.835421638822594e-06, |
|
"loss": 0.0295, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 9.24821002386635, |
|
"grad_norm": 0.06805476546287537, |
|
"learning_rate": 4.5868138424821005e-06, |
|
"loss": 0.0337, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 9.397374701670644, |
|
"grad_norm": 1.571494698524475, |
|
"learning_rate": 4.338206046141607e-06, |
|
"loss": 0.0331, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 9.54653937947494, |
|
"grad_norm": 0.042112987488508224, |
|
"learning_rate": 4.089598249801114e-06, |
|
"loss": 0.0298, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 9.695704057279237, |
|
"grad_norm": 0.060358405113220215, |
|
"learning_rate": 3.840990453460621e-06, |
|
"loss": 0.0354, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 9.844868735083532, |
|
"grad_norm": 17.142175674438477, |
|
"learning_rate": 3.5923826571201277e-06, |
|
"loss": 0.0315, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 9.994033412887829, |
|
"grad_norm": 0.03519833832979202, |
|
"learning_rate": 3.3437748607796345e-06, |
|
"loss": 0.0341, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9953844002051377, |
|
"eval_loss": 0.014557760208845139, |
|
"eval_runtime": 5.4843, |
|
"eval_samples_per_second": 3911.0, |
|
"eval_steps_per_second": 122.35, |
|
"step": 33520 |
|
}, |
|
{ |
|
"epoch": 10.143198090692124, |
|
"grad_norm": 8.01278018951416, |
|
"learning_rate": 3.0951670644391413e-06, |
|
"loss": 0.0291, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 10.292362768496421, |
|
"grad_norm": 1.2233223915100098, |
|
"learning_rate": 2.8465592680986476e-06, |
|
"loss": 0.0295, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 10.441527446300716, |
|
"grad_norm": 0.13490265607833862, |
|
"learning_rate": 2.5979514717581544e-06, |
|
"loss": 0.0314, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 10.590692124105011, |
|
"grad_norm": 22.723793029785156, |
|
"learning_rate": 2.349343675417661e-06, |
|
"loss": 0.0323, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 10.739856801909308, |
|
"grad_norm": 0.0038608636241406202, |
|
"learning_rate": 2.100735879077168e-06, |
|
"loss": 0.0304, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 10.889021479713604, |
|
"grad_norm": 0.08787856996059418, |
|
"learning_rate": 1.8521280827366748e-06, |
|
"loss": 0.031, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.995850622406639, |
|
"eval_loss": 0.013228594325482845, |
|
"eval_runtime": 5.4141, |
|
"eval_samples_per_second": 3961.682, |
|
"eval_steps_per_second": 123.935, |
|
"step": 36872 |
|
}, |
|
{ |
|
"epoch": 11.0381861575179, |
|
"grad_norm": 0.11552688479423523, |
|
"learning_rate": 1.6035202863961815e-06, |
|
"loss": 0.0318, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 11.187350835322196, |
|
"grad_norm": 0.19442684948444366, |
|
"learning_rate": 1.3549124900556881e-06, |
|
"loss": 0.027, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 11.336515513126491, |
|
"grad_norm": 10.4265718460083, |
|
"learning_rate": 1.106304693715195e-06, |
|
"loss": 0.027, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 11.485680190930788, |
|
"grad_norm": 0.03579527512192726, |
|
"learning_rate": 8.576968973747017e-07, |
|
"loss": 0.0325, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 11.634844868735083, |
|
"grad_norm": 3.7483468055725098, |
|
"learning_rate": 6.090891010342085e-07, |
|
"loss": 0.0267, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 11.78400954653938, |
|
"grad_norm": 0.031889185309410095, |
|
"learning_rate": 3.604813046937152e-07, |
|
"loss": 0.0268, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 11.933174224343675, |
|
"grad_norm": 0.04797011986374855, |
|
"learning_rate": 1.1187350835322197e-07, |
|
"loss": 0.0273, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9957573779663388, |
|
"eval_loss": 0.013085125014185905, |
|
"eval_runtime": 5.2528, |
|
"eval_samples_per_second": 4083.366, |
|
"eval_steps_per_second": 127.742, |
|
"step": 40224 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 40224, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 12, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1159020529550640.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|