{ "best_global_step": 36872, "best_metric": 0.995850622406639, "best_model_checkpoint": "./rubert-good-bad/checkpoint-36872", "epoch": 12.0, "eval_steps": 500, "global_step": 40224, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14916467780429593, "grad_norm": 7.852961540222168, "learning_rate": 1.975188941925219e-05, "loss": 0.4052, "step": 500 }, { "epoch": 0.29832935560859186, "grad_norm": 3.1220028400421143, "learning_rate": 1.9503281622911697e-05, "loss": 0.2525, "step": 1000 }, { "epoch": 0.44749403341288785, "grad_norm": 4.411465644836426, "learning_rate": 1.9254673826571203e-05, "loss": 0.2181, "step": 1500 }, { "epoch": 0.5966587112171837, "grad_norm": 4.210575580596924, "learning_rate": 1.900606603023071e-05, "loss": 0.1906, "step": 2000 }, { "epoch": 0.7458233890214797, "grad_norm": 7.871537685394287, "learning_rate": 1.8757458233890215e-05, "loss": 0.1713, "step": 2500 }, { "epoch": 0.8949880668257757, "grad_norm": 4.302709102630615, "learning_rate": 1.8508850437549724e-05, "loss": 0.1618, "step": 3000 }, { "epoch": 1.0, "eval_accuracy": 0.9545433353536296, "eval_loss": 0.11475327610969543, "eval_runtime": 5.422, "eval_samples_per_second": 3955.922, "eval_steps_per_second": 123.755, "step": 3352 }, { "epoch": 1.0441527446300716, "grad_norm": 6.954808235168457, "learning_rate": 1.826024264120923e-05, "loss": 0.1452, "step": 3500 }, { "epoch": 1.1933174224343674, "grad_norm": 6.441762924194336, "learning_rate": 1.8011634844868736e-05, "loss": 0.1349, "step": 4000 }, { "epoch": 1.3424821002386635, "grad_norm": 2.3034775257110596, "learning_rate": 1.7763027048528242e-05, "loss": 0.1261, "step": 4500 }, { "epoch": 1.4916467780429594, "grad_norm": 1.0179331302642822, "learning_rate": 1.751441925218775e-05, "loss": 0.116, "step": 5000 }, { "epoch": 1.6408114558472553, "grad_norm": 8.965306282043457, "learning_rate": 1.7265811455847257e-05, "loss": 0.1095, "step": 5500 }, { "epoch": 1.7899761336515514, "grad_norm": 2.703307867050171, "learning_rate": 1.7017203659506763e-05, "loss": 0.114, "step": 6000 }, { "epoch": 1.9391408114558473, "grad_norm": 8.306221961975098, "learning_rate": 1.676859586316627e-05, "loss": 0.1088, "step": 6500 }, { "epoch": 2.0, "eval_accuracy": 0.9744510233577323, "eval_loss": 0.06880993396043777, "eval_runtime": 5.2887, "eval_samples_per_second": 4055.651, "eval_steps_per_second": 126.875, "step": 6704 }, { "epoch": 2.088305489260143, "grad_norm": 5.413713455200195, "learning_rate": 1.6519988066825775e-05, "loss": 0.096, "step": 7000 }, { "epoch": 2.2374701670644392, "grad_norm": 9.367850303649902, "learning_rate": 1.6271380270485284e-05, "loss": 0.0894, "step": 7500 }, { "epoch": 2.386634844868735, "grad_norm": 0.4695867896080017, "learning_rate": 1.602277247414479e-05, "loss": 0.0882, "step": 8000 }, { "epoch": 2.535799522673031, "grad_norm": 8.3032808303833, "learning_rate": 1.57741646778043e-05, "loss": 0.0885, "step": 8500 }, { "epoch": 2.684964200477327, "grad_norm": 3.0729820728302, "learning_rate": 1.5525556881463802e-05, "loss": 0.0844, "step": 9000 }, { "epoch": 2.834128878281623, "grad_norm": 6.244034767150879, "learning_rate": 1.527694908512331e-05, "loss": 0.0854, "step": 9500 }, { "epoch": 2.983293556085919, "grad_norm": 14.806967735290527, "learning_rate": 1.5028341288782817e-05, "loss": 0.0871, "step": 10000 }, { "epoch": 3.0, "eval_accuracy": 0.9813044897198004, "eval_loss": 0.04988732188940048, "eval_runtime": 5.3443, "eval_samples_per_second": 4013.427, "eval_steps_per_second": 125.554, "step": 10056 }, { "epoch": 3.132458233890215, "grad_norm": 3.9732472896575928, "learning_rate": 1.4779733492442325e-05, "loss": 0.0745, "step": 10500 }, { "epoch": 3.2816229116945106, "grad_norm": 3.8117780685424805, "learning_rate": 1.4531125696101831e-05, "loss": 0.0697, "step": 11000 }, { "epoch": 3.4307875894988067, "grad_norm": 1.4716224670410156, "learning_rate": 1.4282517899761337e-05, "loss": 0.0707, "step": 11500 }, { "epoch": 3.579952267303103, "grad_norm": 1.516440749168396, "learning_rate": 1.4033910103420844e-05, "loss": 0.0687, "step": 12000 }, { "epoch": 3.7291169451073984, "grad_norm": 9.91383171081543, "learning_rate": 1.378530230708035e-05, "loss": 0.074, "step": 12500 }, { "epoch": 3.8782816229116945, "grad_norm": 0.14348161220550537, "learning_rate": 1.3536694510739858e-05, "loss": 0.0743, "step": 13000 }, { "epoch": 4.0, "eval_accuracy": 0.986386311716164, "eval_loss": 0.037734854966402054, "eval_runtime": 5.3579, "eval_samples_per_second": 4003.226, "eval_steps_per_second": 125.235, "step": 13408 }, { "epoch": 4.02744630071599, "grad_norm": 3.358393430709839, "learning_rate": 1.3288086714399364e-05, "loss": 0.0687, "step": 13500 }, { "epoch": 4.176610978520286, "grad_norm": 11.118879318237305, "learning_rate": 1.3039478918058872e-05, "loss": 0.0572, "step": 14000 }, { "epoch": 4.325775656324582, "grad_norm": 3.1460227966308594, "learning_rate": 1.2790871121718378e-05, "loss": 0.0604, "step": 14500 }, { "epoch": 4.4749403341288785, "grad_norm": 5.454690933227539, "learning_rate": 1.2542263325377885e-05, "loss": 0.0576, "step": 15000 }, { "epoch": 4.624105011933175, "grad_norm": 5.29376220703125, "learning_rate": 1.2293655529037391e-05, "loss": 0.0619, "step": 15500 }, { "epoch": 4.77326968973747, "grad_norm": 0.4051016867160797, "learning_rate": 1.2045047732696897e-05, "loss": 0.0591, "step": 16000 }, { "epoch": 4.922434367541766, "grad_norm": 3.9757938385009766, "learning_rate": 1.1796439936356405e-05, "loss": 0.0566, "step": 16500 }, { "epoch": 5.0, "eval_accuracy": 0.9901627115483239, "eval_loss": 0.02832028456032276, "eval_runtime": 5.3872, "eval_samples_per_second": 3981.457, "eval_steps_per_second": 124.554, "step": 16760 }, { "epoch": 5.071599045346062, "grad_norm": 5.767542839050293, "learning_rate": 1.154783214001591e-05, "loss": 0.0554, "step": 17000 }, { "epoch": 5.220763723150358, "grad_norm": 0.15083061158657074, "learning_rate": 1.1299224343675418e-05, "loss": 0.0507, "step": 17500 }, { "epoch": 5.369928400954654, "grad_norm": 3.9027342796325684, "learning_rate": 1.1050616547334924e-05, "loss": 0.0526, "step": 18000 }, { "epoch": 5.519093078758949, "grad_norm": 0.34978094696998596, "learning_rate": 1.0802008750994433e-05, "loss": 0.0489, "step": 18500 }, { "epoch": 5.6682577565632455, "grad_norm": 24.885211944580078, "learning_rate": 1.0553400954653938e-05, "loss": 0.0496, "step": 19000 }, { "epoch": 5.817422434367542, "grad_norm": 3.8451194763183594, "learning_rate": 1.0304793158313447e-05, "loss": 0.0462, "step": 19500 }, { "epoch": 5.966587112171838, "grad_norm": 2.7605342864990234, "learning_rate": 1.0056185361972953e-05, "loss": 0.0524, "step": 20000 }, { "epoch": 6.0, "eval_accuracy": 0.9920276003543289, "eval_loss": 0.023812290281057358, "eval_runtime": 5.7441, "eval_samples_per_second": 3734.067, "eval_steps_per_second": 116.815, "step": 20112 }, { "epoch": 6.115751789976134, "grad_norm": 12.299605369567871, "learning_rate": 9.807577565632459e-06, "loss": 0.0451, "step": 20500 }, { "epoch": 6.26491646778043, "grad_norm": 0.014037308283150196, "learning_rate": 9.558969769291967e-06, "loss": 0.0439, "step": 21000 }, { "epoch": 6.414081145584726, "grad_norm": 4.366552829742432, "learning_rate": 9.310361972951472e-06, "loss": 0.0447, "step": 21500 }, { "epoch": 6.563245823389021, "grad_norm": 0.3234240412712097, "learning_rate": 9.06175417661098e-06, "loss": 0.0474, "step": 22000 }, { "epoch": 6.712410501193317, "grad_norm": 5.625737190246582, "learning_rate": 8.813146380270486e-06, "loss": 0.0401, "step": 22500 }, { "epoch": 6.861575178997613, "grad_norm": 1.9961190223693848, "learning_rate": 8.564538583929994e-06, "loss": 0.0442, "step": 23000 }, { "epoch": 7.0, "eval_accuracy": 0.9929134225371812, "eval_loss": 0.02055698074400425, "eval_runtime": 5.2686, "eval_samples_per_second": 4071.116, "eval_steps_per_second": 127.359, "step": 23464 }, { "epoch": 7.0107398568019095, "grad_norm": 0.7006446123123169, "learning_rate": 8.3159307875895e-06, "loss": 0.0446, "step": 23500 }, { "epoch": 7.159904534606206, "grad_norm": 0.009974448010325432, "learning_rate": 8.067322991249006e-06, "loss": 0.0353, "step": 24000 }, { "epoch": 7.309069212410501, "grad_norm": 13.073637962341309, "learning_rate": 7.818715194908513e-06, "loss": 0.0327, "step": 24500 }, { "epoch": 7.458233890214797, "grad_norm": 0.47934669256210327, "learning_rate": 7.57010739856802e-06, "loss": 0.0415, "step": 25000 }, { "epoch": 7.607398568019093, "grad_norm": 9.82204532623291, "learning_rate": 7.321499602227527e-06, "loss": 0.0396, "step": 25500 }, { "epoch": 7.756563245823389, "grad_norm": 3.7507293224334717, "learning_rate": 7.0728918058870335e-06, "loss": 0.0419, "step": 26000 }, { "epoch": 7.905727923627685, "grad_norm": 12.792013168334961, "learning_rate": 6.82428400954654e-06, "loss": 0.0437, "step": 26500 }, { "epoch": 8.0, "eval_accuracy": 0.9944053335819851, "eval_loss": 0.017598649486899376, "eval_runtime": 5.3397, "eval_samples_per_second": 4016.869, "eval_steps_per_second": 125.662, "step": 26816 }, { "epoch": 8.05489260143198, "grad_norm": 2.062351703643799, "learning_rate": 6.575676213206047e-06, "loss": 0.037, "step": 27000 }, { "epoch": 8.204057279236277, "grad_norm": 7.230820178985596, "learning_rate": 6.327068416865554e-06, "loss": 0.0363, "step": 27500 }, { "epoch": 8.353221957040573, "grad_norm": 9.108207702636719, "learning_rate": 6.078460620525061e-06, "loss": 0.0343, "step": 28000 }, { "epoch": 8.50238663484487, "grad_norm": 3.1893274784088135, "learning_rate": 5.829852824184567e-06, "loss": 0.0374, "step": 28500 }, { "epoch": 8.651551312649165, "grad_norm": 7.297391414642334, "learning_rate": 5.581245027844073e-06, "loss": 0.0371, "step": 29000 }, { "epoch": 8.80071599045346, "grad_norm": 0.8091713190078735, "learning_rate": 5.33263723150358e-06, "loss": 0.0322, "step": 29500 }, { "epoch": 8.949880668257757, "grad_norm": 9.286097526550293, "learning_rate": 5.084029435163087e-06, "loss": 0.0366, "step": 30000 }, { "epoch": 9.0, "eval_accuracy": 0.995104666884237, "eval_loss": 0.015091103501617908, "eval_runtime": 5.5227, "eval_samples_per_second": 3883.811, "eval_steps_per_second": 121.499, "step": 30168 }, { "epoch": 9.099045346062052, "grad_norm": 2.9286885261535645, "learning_rate": 4.835421638822594e-06, "loss": 0.0295, "step": 30500 }, { "epoch": 9.24821002386635, "grad_norm": 0.06805476546287537, "learning_rate": 4.5868138424821005e-06, "loss": 0.0337, "step": 31000 }, { "epoch": 9.397374701670644, "grad_norm": 1.571494698524475, "learning_rate": 4.338206046141607e-06, "loss": 0.0331, "step": 31500 }, { "epoch": 9.54653937947494, "grad_norm": 0.042112987488508224, "learning_rate": 4.089598249801114e-06, "loss": 0.0298, "step": 32000 }, { "epoch": 9.695704057279237, "grad_norm": 0.060358405113220215, "learning_rate": 3.840990453460621e-06, "loss": 0.0354, "step": 32500 }, { "epoch": 9.844868735083532, "grad_norm": 17.142175674438477, "learning_rate": 3.5923826571201277e-06, "loss": 0.0315, "step": 33000 }, { "epoch": 9.994033412887829, "grad_norm": 0.03519833832979202, "learning_rate": 3.3437748607796345e-06, "loss": 0.0341, "step": 33500 }, { "epoch": 10.0, "eval_accuracy": 0.9953844002051377, "eval_loss": 0.014557760208845139, "eval_runtime": 5.4843, "eval_samples_per_second": 3911.0, "eval_steps_per_second": 122.35, "step": 33520 }, { "epoch": 10.143198090692124, "grad_norm": 8.01278018951416, "learning_rate": 3.0951670644391413e-06, "loss": 0.0291, "step": 34000 }, { "epoch": 10.292362768496421, "grad_norm": 1.2233223915100098, "learning_rate": 2.8465592680986476e-06, "loss": 0.0295, "step": 34500 }, { "epoch": 10.441527446300716, "grad_norm": 0.13490265607833862, "learning_rate": 2.5979514717581544e-06, "loss": 0.0314, "step": 35000 }, { "epoch": 10.590692124105011, "grad_norm": 22.723793029785156, "learning_rate": 2.349343675417661e-06, "loss": 0.0323, "step": 35500 }, { "epoch": 10.739856801909308, "grad_norm": 0.0038608636241406202, "learning_rate": 2.100735879077168e-06, "loss": 0.0304, "step": 36000 }, { "epoch": 10.889021479713604, "grad_norm": 0.08787856996059418, "learning_rate": 1.8521280827366748e-06, "loss": 0.031, "step": 36500 }, { "epoch": 11.0, "eval_accuracy": 0.995850622406639, "eval_loss": 0.013228594325482845, "eval_runtime": 5.4141, "eval_samples_per_second": 3961.682, "eval_steps_per_second": 123.935, "step": 36872 }, { "epoch": 11.0381861575179, "grad_norm": 0.11552688479423523, "learning_rate": 1.6035202863961815e-06, "loss": 0.0318, "step": 37000 }, { "epoch": 11.187350835322196, "grad_norm": 0.19442684948444366, "learning_rate": 1.3549124900556881e-06, "loss": 0.027, "step": 37500 }, { "epoch": 11.336515513126491, "grad_norm": 10.4265718460083, "learning_rate": 1.106304693715195e-06, "loss": 0.027, "step": 38000 }, { "epoch": 11.485680190930788, "grad_norm": 0.03579527512192726, "learning_rate": 8.576968973747017e-07, "loss": 0.0325, "step": 38500 }, { "epoch": 11.634844868735083, "grad_norm": 3.7483468055725098, "learning_rate": 6.090891010342085e-07, "loss": 0.0267, "step": 39000 }, { "epoch": 11.78400954653938, "grad_norm": 0.031889185309410095, "learning_rate": 3.604813046937152e-07, "loss": 0.0268, "step": 39500 }, { "epoch": 11.933174224343675, "grad_norm": 0.04797011986374855, "learning_rate": 1.1187350835322197e-07, "loss": 0.0273, "step": 40000 }, { "epoch": 12.0, "eval_accuracy": 0.9957573779663388, "eval_loss": 0.013085125014185905, "eval_runtime": 5.2528, "eval_samples_per_second": 4083.366, "eval_steps_per_second": 127.742, "step": 40224 } ], "logging_steps": 500, "max_steps": 40224, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1159020529550640.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }