{ "best_metric": 0.8415273271774395, "best_model_checkpoint": "results_retain/facebook/hubert-base-ls960/42/checkpoint-30000", "epoch": 69.20415224913495, "eval_steps": 1000, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.1534025374855825, "grad_norm": 2.9732823371887207, "learning_rate": 8.333333333333333e-05, "loss": 3.9827, "step": 500 }, { "epoch": 2.306805074971165, "grad_norm": 3.6868040561676025, "learning_rate": 0.00016666666666666666, "loss": 3.3559, "step": 1000 }, { "epoch": 2.306805074971165, "eval_accuracy": 0.33953082106313953, "eval_f1_macro": 0.12928496278744922, "eval_loss": 2.5730652809143066, "eval_runtime": 35.334, "eval_samples_per_second": 226.807, "eval_steps_per_second": 7.104, "step": 1000 }, { "epoch": 3.4602076124567476, "grad_norm": 7.014188766479492, "learning_rate": 0.00025, "loss": 2.1949, "step": 1500 }, { "epoch": 4.61361014994233, "grad_norm": 6.496627330780029, "learning_rate": 0.0003333333333333333, "loss": 1.6389, "step": 2000 }, { "epoch": 4.61361014994233, "eval_accuracy": 0.6416271524831545, "eval_f1_macro": 0.4299797469182877, "eval_loss": 1.4779495000839233, "eval_runtime": 35.3694, "eval_samples_per_second": 226.58, "eval_steps_per_second": 7.097, "step": 2000 }, { "epoch": 5.767012687427912, "grad_norm": 7.616945266723633, "learning_rate": 0.0004166666666666667, "loss": 1.4162, "step": 2500 }, { "epoch": 6.920415224913495, "grad_norm": 8.488947868347168, "learning_rate": 0.0005, "loss": 1.3587, "step": 3000 }, { "epoch": 6.920415224913495, "eval_accuracy": 0.6574744197654105, "eval_f1_macro": 0.4595491108495356, "eval_loss": 1.427338719367981, "eval_runtime": 72.7464, "eval_samples_per_second": 110.164, "eval_steps_per_second": 3.45, "step": 3000 }, { "epoch": 8.073817762399077, "grad_norm": 8.156586647033691, "learning_rate": 0.0004907407407407408, "loss": 1.3027, "step": 3500 }, { "epoch": 9.22722029988466, "grad_norm": 6.934875965118408, "learning_rate": 0.00048148148148148144, "loss": 1.1695, "step": 4000 }, { "epoch": 9.22722029988466, "eval_accuracy": 0.6961567257299726, "eval_f1_macro": 0.5353451961331095, "eval_loss": 1.2435524463653564, "eval_runtime": 70.2906, "eval_samples_per_second": 114.012, "eval_steps_per_second": 3.571, "step": 4000 }, { "epoch": 10.380622837370241, "grad_norm": 7.151013374328613, "learning_rate": 0.00047222222222222224, "loss": 1.0695, "step": 4500 }, { "epoch": 11.534025374855824, "grad_norm": 7.771185874938965, "learning_rate": 0.000462962962962963, "loss": 0.9787, "step": 5000 }, { "epoch": 11.534025374855824, "eval_accuracy": 0.6901672073870726, "eval_f1_macro": 0.5134333803516367, "eval_loss": 1.3313419818878174, "eval_runtime": 71.0472, "eval_samples_per_second": 112.798, "eval_steps_per_second": 3.533, "step": 5000 }, { "epoch": 12.687427912341407, "grad_norm": 6.754736423492432, "learning_rate": 0.0004537037037037037, "loss": 0.9056, "step": 5500 }, { "epoch": 13.84083044982699, "grad_norm": 6.322958946228027, "learning_rate": 0.0004444444444444444, "loss": 0.836, "step": 6000 }, { "epoch": 13.84083044982699, "eval_accuracy": 0.7138757174943848, "eval_f1_macro": 0.5729793747297807, "eval_loss": 1.2415224313735962, "eval_runtime": 73.4693, "eval_samples_per_second": 109.08, "eval_steps_per_second": 3.416, "step": 6000 }, { "epoch": 14.994232987312571, "grad_norm": 7.2826619148254395, "learning_rate": 0.0004351851851851852, "loss": 0.7867, "step": 6500 }, { "epoch": 16.147635524798154, "grad_norm": 5.9969482421875, "learning_rate": 0.00042592592592592595, "loss": 0.7135, "step": 7000 }, { "epoch": 16.147635524798154, "eval_accuracy": 0.7389568255552783, "eval_f1_macro": 0.5793519253003285, "eval_loss": 1.1902633905410767, "eval_runtime": 73.1754, "eval_samples_per_second": 109.518, "eval_steps_per_second": 3.43, "step": 7000 }, { "epoch": 17.301038062283737, "grad_norm": 6.838934421539307, "learning_rate": 0.0004166666666666667, "loss": 0.6719, "step": 7500 }, { "epoch": 18.45444059976932, "grad_norm": 7.784801006317139, "learning_rate": 0.0004074074074074074, "loss": 0.6009, "step": 8000 }, { "epoch": 18.45444059976932, "eval_accuracy": 0.7414524581981532, "eval_f1_macro": 0.6081990369390977, "eval_loss": 1.2159614562988281, "eval_runtime": 55.5881, "eval_samples_per_second": 144.168, "eval_steps_per_second": 4.515, "step": 8000 }, { "epoch": 19.607843137254903, "grad_norm": 6.171660423278809, "learning_rate": 0.0003981481481481481, "loss": 0.5756, "step": 8500 }, { "epoch": 20.761245674740483, "grad_norm": 7.959474563598633, "learning_rate": 0.0003888888888888889, "loss": 0.5355, "step": 9000 }, { "epoch": 20.761245674740483, "eval_accuracy": 0.7543049663089593, "eval_f1_macro": 0.5947199785680519, "eval_loss": 1.1460059881210327, "eval_runtime": 55.0321, "eval_samples_per_second": 145.624, "eval_steps_per_second": 4.561, "step": 9000 }, { "epoch": 21.914648212226066, "grad_norm": 7.287164211273193, "learning_rate": 0.00037962962962962966, "loss": 0.5046, "step": 9500 }, { "epoch": 23.06805074971165, "grad_norm": 6.368403434753418, "learning_rate": 0.00037037037037037035, "loss": 0.4737, "step": 10000 }, { "epoch": 23.06805074971165, "eval_accuracy": 0.759920139755428, "eval_f1_macro": 0.6214937740706044, "eval_loss": 1.1644535064697266, "eval_runtime": 54.1187, "eval_samples_per_second": 148.082, "eval_steps_per_second": 4.638, "step": 10000 }, { "epoch": 24.22145328719723, "grad_norm": 9.294144630432129, "learning_rate": 0.0003611111111111111, "loss": 0.4349, "step": 10500 }, { "epoch": 25.374855824682815, "grad_norm": 7.4235310554504395, "learning_rate": 0.0003518518518518519, "loss": 0.4352, "step": 11000 }, { "epoch": 25.374855824682815, "eval_accuracy": 0.7545545295732469, "eval_f1_macro": 0.5917892398903293, "eval_loss": 1.213472843170166, "eval_runtime": 54.2317, "eval_samples_per_second": 147.773, "eval_steps_per_second": 4.628, "step": 11000 }, { "epoch": 26.528258362168398, "grad_norm": 5.937560558319092, "learning_rate": 0.00034259259259259263, "loss": 0.4017, "step": 11500 }, { "epoch": 27.68166089965398, "grad_norm": 6.036593914031982, "learning_rate": 0.0003333333333333333, "loss": 0.3652, "step": 12000 }, { "epoch": 27.68166089965398, "eval_accuracy": 0.7732717743948091, "eval_f1_macro": 0.6375373960767734, "eval_loss": 1.1644330024719238, "eval_runtime": 75.4416, "eval_samples_per_second": 106.228, "eval_steps_per_second": 3.327, "step": 12000 }, { "epoch": 28.83506343713956, "grad_norm": 6.821892738342285, "learning_rate": 0.00032407407407407406, "loss": 0.3443, "step": 12500 }, { "epoch": 29.988465974625143, "grad_norm": 4.1507463455200195, "learning_rate": 0.0003148148148148148, "loss": 0.3246, "step": 13000 }, { "epoch": 29.988465974625143, "eval_accuracy": 0.7776391315198403, "eval_f1_macro": 0.6181477901694947, "eval_loss": 1.143282175064087, "eval_runtime": 75.3981, "eval_samples_per_second": 106.289, "eval_steps_per_second": 3.329, "step": 13000 }, { "epoch": 31.141868512110726, "grad_norm": 7.311563491821289, "learning_rate": 0.0003055555555555556, "loss": 0.3082, "step": 13500 }, { "epoch": 32.29527104959631, "grad_norm": 2.214399576187134, "learning_rate": 0.0002962962962962963, "loss": 0.2876, "step": 14000 }, { "epoch": 32.29527104959631, "eval_accuracy": 0.7700274519590716, "eval_f1_macro": 0.6278465966595438, "eval_loss": 1.2212963104248047, "eval_runtime": 73.2438, "eval_samples_per_second": 109.415, "eval_steps_per_second": 3.427, "step": 14000 }, { "epoch": 33.44867358708189, "grad_norm": 5.876758575439453, "learning_rate": 0.00028703703703703703, "loss": 0.2722, "step": 14500 }, { "epoch": 34.602076124567475, "grad_norm": 3.34192156791687, "learning_rate": 0.0002777777777777778, "loss": 0.2539, "step": 15000 }, { "epoch": 34.602076124567475, "eval_accuracy": 0.7858747192413277, "eval_f1_macro": 0.6310309906248334, "eval_loss": 1.1600251197814941, "eval_runtime": 55.43, "eval_samples_per_second": 144.579, "eval_steps_per_second": 4.528, "step": 15000 }, { "epoch": 35.75547866205306, "grad_norm": 4.611924648284912, "learning_rate": 0.0002685185185185186, "loss": 0.2428, "step": 15500 }, { "epoch": 36.90888119953864, "grad_norm": 3.3283474445343018, "learning_rate": 0.00025925925925925926, "loss": 0.2322, "step": 16000 }, { "epoch": 36.90888119953864, "eval_accuracy": 0.7816321437484403, "eval_f1_macro": 0.6319203799590871, "eval_loss": 1.135780930519104, "eval_runtime": 63.9861, "eval_samples_per_second": 125.246, "eval_steps_per_second": 3.923, "step": 16000 }, { "epoch": 38.062283737024224, "grad_norm": 6.641352653503418, "learning_rate": 0.00025, "loss": 0.2146, "step": 16500 }, { "epoch": 39.21568627450981, "grad_norm": 6.247890949249268, "learning_rate": 0.00024074074074074072, "loss": 0.2003, "step": 17000 }, { "epoch": 39.21568627450981, "eval_accuracy": 0.7962315947092587, "eval_f1_macro": 0.6542244286445125, "eval_loss": 1.150564432144165, "eval_runtime": 69.2204, "eval_samples_per_second": 115.775, "eval_steps_per_second": 3.626, "step": 17000 }, { "epoch": 40.36908881199539, "grad_norm": 4.363713264465332, "learning_rate": 0.0002314814814814815, "loss": 0.1947, "step": 17500 }, { "epoch": 41.522491349480966, "grad_norm": 3.4260287284851074, "learning_rate": 0.0002222222222222222, "loss": 0.1794, "step": 18000 }, { "epoch": 41.522491349480966, "eval_accuracy": 0.7979785375592713, "eval_f1_macro": 0.6796792585107833, "eval_loss": 1.1864490509033203, "eval_runtime": 71.1001, "eval_samples_per_second": 112.714, "eval_steps_per_second": 3.53, "step": 18000 }, { "epoch": 42.67589388696655, "grad_norm": 2.606008291244507, "learning_rate": 0.00021296296296296298, "loss": 0.1689, "step": 18500 }, { "epoch": 43.82929642445213, "grad_norm": 4.665687084197998, "learning_rate": 0.0002037037037037037, "loss": 0.1645, "step": 19000 }, { "epoch": 43.82929642445213, "eval_accuracy": 0.80059895183429, "eval_f1_macro": 0.6667890419585701, "eval_loss": 1.2014110088348389, "eval_runtime": 74.7112, "eval_samples_per_second": 107.266, "eval_steps_per_second": 3.36, "step": 19000 }, { "epoch": 44.982698961937714, "grad_norm": 3.0378100872039795, "learning_rate": 0.00019444444444444446, "loss": 0.1602, "step": 19500 }, { "epoch": 46.1361014994233, "grad_norm": 5.274627685546875, "learning_rate": 0.00018518518518518518, "loss": 0.144, "step": 20000 }, { "epoch": 46.1361014994233, "eval_accuracy": 0.7989767906164212, "eval_f1_macro": 0.6582157335341974, "eval_loss": 1.1411352157592773, "eval_runtime": 73.2065, "eval_samples_per_second": 109.471, "eval_steps_per_second": 3.429, "step": 20000 }, { "epoch": 47.28950403690888, "grad_norm": 2.336925745010376, "learning_rate": 0.00017592592592592595, "loss": 0.1368, "step": 20500 }, { "epoch": 48.44290657439446, "grad_norm": 2.7309417724609375, "learning_rate": 0.00016666666666666666, "loss": 0.1298, "step": 21000 }, { "epoch": 48.44290657439446, "eval_accuracy": 0.8064636885450461, "eval_f1_macro": 0.6782237618476237, "eval_loss": 1.1389836072921753, "eval_runtime": 73.3032, "eval_samples_per_second": 109.327, "eval_steps_per_second": 3.424, "step": 21000 }, { "epoch": 49.596309111880046, "grad_norm": 2.79067325592041, "learning_rate": 0.0001574074074074074, "loss": 0.1206, "step": 21500 }, { "epoch": 50.74971164936563, "grad_norm": 4.826747417449951, "learning_rate": 0.00014814814814814815, "loss": 0.1175, "step": 22000 }, { "epoch": 50.74971164936563, "eval_accuracy": 0.8068380334414774, "eval_f1_macro": 0.6700916407139905, "eval_loss": 1.2090946435928345, "eval_runtime": 73.8226, "eval_samples_per_second": 108.557, "eval_steps_per_second": 3.4, "step": 22000 }, { "epoch": 51.90311418685121, "grad_norm": 3.403858184814453, "learning_rate": 0.0001388888888888889, "loss": 0.1021, "step": 22500 }, { "epoch": 53.056516724336795, "grad_norm": 5.1802496910095215, "learning_rate": 0.00012962962962962963, "loss": 0.0977, "step": 23000 }, { "epoch": 53.056516724336795, "eval_accuracy": 0.8149488395308211, "eval_f1_macro": 0.682806558028361, "eval_loss": 1.1759377717971802, "eval_runtime": 73.8391, "eval_samples_per_second": 108.533, "eval_steps_per_second": 3.399, "step": 23000 }, { "epoch": 54.20991926182238, "grad_norm": 1.122316837310791, "learning_rate": 0.00012037037037037036, "loss": 0.0912, "step": 23500 }, { "epoch": 55.36332179930796, "grad_norm": 1.1100833415985107, "learning_rate": 0.0001111111111111111, "loss": 0.0823, "step": 24000 }, { "epoch": 55.36332179930796, "eval_accuracy": 0.8166957823808335, "eval_f1_macro": 0.7045678569443168, "eval_loss": 1.2304565906524658, "eval_runtime": 74.7299, "eval_samples_per_second": 107.24, "eval_steps_per_second": 3.359, "step": 24000 }, { "epoch": 56.516724336793544, "grad_norm": 4.60992956161499, "learning_rate": 0.00010185185185185185, "loss": 0.0873, "step": 24500 }, { "epoch": 57.67012687427912, "grad_norm": 5.945472240447998, "learning_rate": 9.259259259259259e-05, "loss": 0.0767, "step": 25000 }, { "epoch": 57.67012687427912, "eval_accuracy": 0.8238083354130272, "eval_f1_macro": 0.6889311414034964, "eval_loss": 1.231188416481018, "eval_runtime": 72.9471, "eval_samples_per_second": 109.86, "eval_steps_per_second": 3.441, "step": 25000 }, { "epoch": 58.8235294117647, "grad_norm": 6.51999044418335, "learning_rate": 8.333333333333333e-05, "loss": 0.0667, "step": 25500 }, { "epoch": 59.976931949250286, "grad_norm": 3.6006715297698975, "learning_rate": 7.407407407407407e-05, "loss": 0.066, "step": 26000 }, { "epoch": 59.976931949250286, "eval_accuracy": 0.8235587721487397, "eval_f1_macro": 0.7127282615425515, "eval_loss": 1.212782621383667, "eval_runtime": 70.4845, "eval_samples_per_second": 113.699, "eval_steps_per_second": 3.561, "step": 26000 }, { "epoch": 61.13033448673587, "grad_norm": 1.102469563484192, "learning_rate": 6.481481481481482e-05, "loss": 0.0601, "step": 26500 }, { "epoch": 62.28373702422145, "grad_norm": 3.476552724838257, "learning_rate": 5.555555555555555e-05, "loss": 0.0493, "step": 27000 }, { "epoch": 62.28373702422145, "eval_accuracy": 0.8310456700773646, "eval_f1_macro": 0.7115209260308665, "eval_loss": 1.15741765499115, "eval_runtime": 74.8395, "eval_samples_per_second": 107.083, "eval_steps_per_second": 3.354, "step": 27000 }, { "epoch": 63.437139561707035, "grad_norm": 0.26378124952316284, "learning_rate": 4.6296296296296294e-05, "loss": 0.0527, "step": 27500 }, { "epoch": 64.59054209919262, "grad_norm": 1.6174193620681763, "learning_rate": 3.7037037037037037e-05, "loss": 0.0479, "step": 28000 }, { "epoch": 64.59054209919262, "eval_accuracy": 0.836785625155977, "eval_f1_macro": 0.7171903480510493, "eval_loss": 1.1416091918945312, "eval_runtime": 72.9274, "eval_samples_per_second": 109.89, "eval_steps_per_second": 3.442, "step": 28000 }, { "epoch": 65.7439446366782, "grad_norm": 1.4499250650405884, "learning_rate": 2.7777777777777776e-05, "loss": 0.0453, "step": 28500 }, { "epoch": 66.89734717416378, "grad_norm": 3.988093614578247, "learning_rate": 1.8518518518518518e-05, "loss": 0.0389, "step": 29000 }, { "epoch": 66.89734717416378, "eval_accuracy": 0.8370351884202646, "eval_f1_macro": 0.7212408780468642, "eval_loss": 1.1253269910812378, "eval_runtime": 53.9632, "eval_samples_per_second": 148.509, "eval_steps_per_second": 4.651, "step": 29000 }, { "epoch": 68.05074971164936, "grad_norm": 2.909609317779541, "learning_rate": 9.259259259259259e-06, "loss": 0.0433, "step": 29500 }, { "epoch": 69.20415224913495, "grad_norm": 2.1668026447296143, "learning_rate": 0.0, "loss": 0.0343, "step": 30000 }, { "epoch": 69.20415224913495, "eval_accuracy": 0.8415273271774395, "eval_f1_macro": 0.7162084790077747, "eval_loss": 1.1328068971633911, "eval_runtime": 73.3281, "eval_samples_per_second": 109.29, "eval_steps_per_second": 3.423, "step": 30000 } ], "logging_steps": 500, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 70, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7427529644770302e+20, "train_batch_size": 32, "trial_name": null, "trial_params": null }