|
{ |
|
"best_metric": 0.8419870138168335, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/amh-Ethi/checkpoint-94500", |
|
"epoch": 121.46529562982005, |
|
"eval_steps": 500, |
|
"global_step": 94500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.6426735218508998, |
|
"grad_norm": 4.40508508682251, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.6805, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6426735218508998, |
|
"eval_accuracy": 0.6904924931069853, |
|
"eval_loss": 1.4787547588348389, |
|
"eval_runtime": 34.6674, |
|
"eval_samples_per_second": 179.506, |
|
"eval_steps_per_second": 5.625, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2853470437017995, |
|
"grad_norm": 3.9189295768737793, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.5356, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2853470437017995, |
|
"eval_accuracy": 0.7092839911070465, |
|
"eval_loss": 1.3684136867523193, |
|
"eval_runtime": 35.1037, |
|
"eval_samples_per_second": 177.275, |
|
"eval_steps_per_second": 5.555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9280205655526992, |
|
"grad_norm": 4.308630466461182, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.4583, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.9280205655526992, |
|
"eval_accuracy": 0.7174822425497339, |
|
"eval_loss": 1.3316432237625122, |
|
"eval_runtime": 32.7687, |
|
"eval_samples_per_second": 189.907, |
|
"eval_steps_per_second": 5.951, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.570694087403599, |
|
"grad_norm": 3.4791815280914307, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.4029, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.570694087403599, |
|
"eval_accuracy": 0.7272827658735376, |
|
"eval_loss": 1.280930519104004, |
|
"eval_runtime": 35.7018, |
|
"eval_samples_per_second": 174.305, |
|
"eval_steps_per_second": 5.462, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.2133676092544987, |
|
"grad_norm": 3.705988645553589, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.3485, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.2133676092544987, |
|
"eval_accuracy": 0.7312191932269085, |
|
"eval_loss": 1.2635687589645386, |
|
"eval_runtime": 34.9687, |
|
"eval_samples_per_second": 177.959, |
|
"eval_steps_per_second": 5.576, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.8560411311053984, |
|
"grad_norm": 3.63777232170105, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.318, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.8560411311053984, |
|
"eval_accuracy": 0.7356960949650779, |
|
"eval_loss": 1.2287126779556274, |
|
"eval_runtime": 35.1013, |
|
"eval_samples_per_second": 177.287, |
|
"eval_steps_per_second": 5.555, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.4987146529562985, |
|
"grad_norm": 3.065805673599243, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.2853, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 4.4987146529562985, |
|
"eval_accuracy": 0.7411218708879311, |
|
"eval_loss": 1.2096679210662842, |
|
"eval_runtime": 35.3152, |
|
"eval_samples_per_second": 176.213, |
|
"eval_steps_per_second": 5.522, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.141388174807198, |
|
"grad_norm": 3.083160161972046, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.2579, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.141388174807198, |
|
"eval_accuracy": 0.743514192750392, |
|
"eval_loss": 1.1946351528167725, |
|
"eval_runtime": 35.3519, |
|
"eval_samples_per_second": 176.03, |
|
"eval_steps_per_second": 5.516, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.784061696658098, |
|
"grad_norm": 3.1521048545837402, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.225, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.784061696658098, |
|
"eval_accuracy": 0.7478054754473986, |
|
"eval_loss": 1.1656205654144287, |
|
"eval_runtime": 35.2967, |
|
"eval_samples_per_second": 176.305, |
|
"eval_steps_per_second": 5.525, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 6.426735218508997, |
|
"grad_norm": 2.947038412094116, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.2156, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 6.426735218508997, |
|
"eval_accuracy": 0.7494688799077729, |
|
"eval_loss": 1.1547260284423828, |
|
"eval_runtime": 35.244, |
|
"eval_samples_per_second": 176.569, |
|
"eval_steps_per_second": 5.533, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.069408740359897, |
|
"grad_norm": 3.245880603790283, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.1878, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 7.069408740359897, |
|
"eval_accuracy": 0.7533389855345517, |
|
"eval_loss": 1.1394718885421753, |
|
"eval_runtime": 34.1955, |
|
"eval_samples_per_second": 181.983, |
|
"eval_steps_per_second": 5.703, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 7.712082262210797, |
|
"grad_norm": 3.02457857131958, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.1713, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 7.712082262210797, |
|
"eval_accuracy": 0.7548033682748899, |
|
"eval_loss": 1.136972188949585, |
|
"eval_runtime": 33.9827, |
|
"eval_samples_per_second": 183.123, |
|
"eval_steps_per_second": 5.738, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.354755784061696, |
|
"grad_norm": 2.8089816570281982, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.1587, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 8.354755784061696, |
|
"eval_accuracy": 0.7558409318256807, |
|
"eval_loss": 1.1281076669692993, |
|
"eval_runtime": 34.9074, |
|
"eval_samples_per_second": 178.272, |
|
"eval_steps_per_second": 5.586, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 8.997429305912597, |
|
"grad_norm": 2.808964729309082, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 1.1416, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 8.997429305912597, |
|
"eval_accuracy": 0.7600179626094105, |
|
"eval_loss": 1.107399821281433, |
|
"eval_runtime": 34.0824, |
|
"eval_samples_per_second": 182.587, |
|
"eval_steps_per_second": 5.721, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 9.640102827763496, |
|
"grad_norm": 2.8558380603790283, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 1.128, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 9.640102827763496, |
|
"eval_accuracy": 0.7628805020537848, |
|
"eval_loss": 1.0930116176605225, |
|
"eval_runtime": 33.4368, |
|
"eval_samples_per_second": 186.112, |
|
"eval_steps_per_second": 5.832, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 10.282776349614396, |
|
"grad_norm": 2.9800262451171875, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.1213, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.282776349614396, |
|
"eval_accuracy": 0.7627036601926251, |
|
"eval_loss": 1.0944606065750122, |
|
"eval_runtime": 34.4461, |
|
"eval_samples_per_second": 180.659, |
|
"eval_steps_per_second": 5.661, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 10.925449871465295, |
|
"grad_norm": 3.040515422821045, |
|
"learning_rate": 9.15e-05, |
|
"loss": 1.1075, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 10.925449871465295, |
|
"eval_accuracy": 0.764068771058977, |
|
"eval_loss": 1.0873337984085083, |
|
"eval_runtime": 32.7071, |
|
"eval_samples_per_second": 190.265, |
|
"eval_steps_per_second": 5.962, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 11.568123393316196, |
|
"grad_norm": 2.973078966140747, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.0912, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 11.568123393316196, |
|
"eval_accuracy": 0.7658058887449325, |
|
"eval_loss": 1.0768815279006958, |
|
"eval_runtime": 32.6454, |
|
"eval_samples_per_second": 190.624, |
|
"eval_steps_per_second": 5.973, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 12.210796915167094, |
|
"grad_norm": 3.5373220443725586, |
|
"learning_rate": 9.05e-05, |
|
"loss": 1.085, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 12.210796915167094, |
|
"eval_accuracy": 0.7658247272593731, |
|
"eval_loss": 1.086098074913025, |
|
"eval_runtime": 32.662, |
|
"eval_samples_per_second": 190.527, |
|
"eval_steps_per_second": 5.97, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 12.853470437017995, |
|
"grad_norm": 5.5106964111328125, |
|
"learning_rate": 9e-05, |
|
"loss": 1.0746, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 12.853470437017995, |
|
"eval_accuracy": 0.7682209938962093, |
|
"eval_loss": 1.0635614395141602, |
|
"eval_runtime": 32.6593, |
|
"eval_samples_per_second": 190.543, |
|
"eval_steps_per_second": 5.971, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 13.496143958868895, |
|
"grad_norm": 3.238605499267578, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 1.0531, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 13.496143958868895, |
|
"eval_accuracy": 0.7686876543500863, |
|
"eval_loss": 1.0645390748977661, |
|
"eval_runtime": 32.6258, |
|
"eval_samples_per_second": 190.739, |
|
"eval_steps_per_second": 5.977, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 14.138817480719794, |
|
"grad_norm": 2.7707929611206055, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.0545, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 14.138817480719794, |
|
"eval_accuracy": 0.7713913141190644, |
|
"eval_loss": 1.0523449182510376, |
|
"eval_runtime": 33.0121, |
|
"eval_samples_per_second": 188.507, |
|
"eval_steps_per_second": 5.907, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 14.781491002570695, |
|
"grad_norm": 3.1346287727355957, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 1.042, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 14.781491002570695, |
|
"eval_accuracy": 0.7711833853943258, |
|
"eval_loss": 1.0591264963150024, |
|
"eval_runtime": 32.5788, |
|
"eval_samples_per_second": 191.014, |
|
"eval_steps_per_second": 5.985, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 15.424164524421593, |
|
"grad_norm": 2.98738169670105, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.0327, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 15.424164524421593, |
|
"eval_accuracy": 0.7727139790786457, |
|
"eval_loss": 1.0433602333068848, |
|
"eval_runtime": 32.6912, |
|
"eval_samples_per_second": 190.357, |
|
"eval_steps_per_second": 5.965, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 16.066838046272494, |
|
"grad_norm": 2.7987964153289795, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.0226, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 16.066838046272494, |
|
"eval_accuracy": 0.7744916530689092, |
|
"eval_loss": 1.0427190065383911, |
|
"eval_runtime": 32.7105, |
|
"eval_samples_per_second": 190.245, |
|
"eval_steps_per_second": 5.961, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 16.709511568123393, |
|
"grad_norm": 2.927727699279785, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.0164, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 16.709511568123393, |
|
"eval_accuracy": 0.7755782307589745, |
|
"eval_loss": 1.033629059791565, |
|
"eval_runtime": 32.6997, |
|
"eval_samples_per_second": 190.307, |
|
"eval_steps_per_second": 5.963, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 17.35218508997429, |
|
"grad_norm": 2.902130603790283, |
|
"learning_rate": 8.65e-05, |
|
"loss": 1.012, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 17.35218508997429, |
|
"eval_accuracy": 0.7768737795368466, |
|
"eval_loss": 1.024613380432129, |
|
"eval_runtime": 32.6543, |
|
"eval_samples_per_second": 190.572, |
|
"eval_steps_per_second": 5.972, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 17.994858611825194, |
|
"grad_norm": 2.6647133827209473, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.9992, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 17.994858611825194, |
|
"eval_accuracy": 0.7779743325654593, |
|
"eval_loss": 1.0130186080932617, |
|
"eval_runtime": 32.7375, |
|
"eval_samples_per_second": 190.088, |
|
"eval_steps_per_second": 5.956, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 18.637532133676093, |
|
"grad_norm": 2.590914249420166, |
|
"learning_rate": 8.55e-05, |
|
"loss": 0.9928, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 18.637532133676093, |
|
"eval_accuracy": 0.7779691811484551, |
|
"eval_loss": 1.0204987525939941, |
|
"eval_runtime": 33.4597, |
|
"eval_samples_per_second": 185.985, |
|
"eval_steps_per_second": 5.828, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 19.28020565552699, |
|
"grad_norm": 2.6548004150390625, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.985, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 19.28020565552699, |
|
"eval_accuracy": 0.7788039344262295, |
|
"eval_loss": 1.0153322219848633, |
|
"eval_runtime": 34.4688, |
|
"eval_samples_per_second": 180.54, |
|
"eval_steps_per_second": 5.657, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 19.922879177377894, |
|
"grad_norm": 2.6247718334198, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 0.9822, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 19.922879177377894, |
|
"eval_accuracy": 0.7795580110497238, |
|
"eval_loss": 1.0121690034866333, |
|
"eval_runtime": 34.3855, |
|
"eval_samples_per_second": 180.977, |
|
"eval_steps_per_second": 5.671, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 20.565552699228792, |
|
"grad_norm": 2.7304604053497314, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.9735, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 20.565552699228792, |
|
"eval_accuracy": 0.7793682688302468, |
|
"eval_loss": 1.018441081047058, |
|
"eval_runtime": 34.5499, |
|
"eval_samples_per_second": 180.116, |
|
"eval_steps_per_second": 5.644, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 21.20822622107969, |
|
"grad_norm": 2.7982735633850098, |
|
"learning_rate": 8.35e-05, |
|
"loss": 0.971, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 21.20822622107969, |
|
"eval_accuracy": 0.7816012690588251, |
|
"eval_loss": 1.0020060539245605, |
|
"eval_runtime": 34.6648, |
|
"eval_samples_per_second": 179.519, |
|
"eval_steps_per_second": 5.625, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 21.85089974293059, |
|
"grad_norm": 2.6124517917633057, |
|
"learning_rate": 8.3e-05, |
|
"loss": 0.965, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 21.85089974293059, |
|
"eval_accuracy": 0.7828390025055424, |
|
"eval_loss": 0.9919770359992981, |
|
"eval_runtime": 34.2625, |
|
"eval_samples_per_second": 181.627, |
|
"eval_steps_per_second": 5.691, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 22.493573264781492, |
|
"grad_norm": 2.7260468006134033, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.9536, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 22.493573264781492, |
|
"eval_accuracy": 0.7826561078737454, |
|
"eval_loss": 0.9964642524719238, |
|
"eval_runtime": 35.1121, |
|
"eval_samples_per_second": 177.232, |
|
"eval_steps_per_second": 5.554, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 23.13624678663239, |
|
"grad_norm": 2.5959136486053467, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.9515, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 23.13624678663239, |
|
"eval_accuracy": 0.7842808178702281, |
|
"eval_loss": 0.9843628406524658, |
|
"eval_runtime": 35.1683, |
|
"eval_samples_per_second": 176.949, |
|
"eval_steps_per_second": 5.545, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 23.77892030848329, |
|
"grad_norm": 2.8720030784606934, |
|
"learning_rate": 8.15e-05, |
|
"loss": 0.9459, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 23.77892030848329, |
|
"eval_accuracy": 0.7822331893892659, |
|
"eval_loss": 0.9950876235961914, |
|
"eval_runtime": 35.4899, |
|
"eval_samples_per_second": 175.345, |
|
"eval_steps_per_second": 5.495, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 24.42159383033419, |
|
"grad_norm": 3.0184552669525146, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.9374, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 24.42159383033419, |
|
"eval_accuracy": 0.7852158669501615, |
|
"eval_loss": 0.9859277009963989, |
|
"eval_runtime": 34.116, |
|
"eval_samples_per_second": 182.407, |
|
"eval_steps_per_second": 5.716, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 25.06426735218509, |
|
"grad_norm": 2.516874074935913, |
|
"learning_rate": 8.05e-05, |
|
"loss": 0.9318, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 25.06426735218509, |
|
"eval_accuracy": 0.7829394214826243, |
|
"eval_loss": 0.9957559108734131, |
|
"eval_runtime": 35.0753, |
|
"eval_samples_per_second": 177.419, |
|
"eval_steps_per_second": 5.559, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 25.70694087403599, |
|
"grad_norm": 2.5600101947784424, |
|
"learning_rate": 8e-05, |
|
"loss": 0.9311, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 25.70694087403599, |
|
"eval_accuracy": 0.786506985609304, |
|
"eval_loss": 0.9760250449180603, |
|
"eval_runtime": 34.6952, |
|
"eval_samples_per_second": 179.362, |
|
"eval_steps_per_second": 5.62, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 26.34961439588689, |
|
"grad_norm": 2.7094664573669434, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 0.913, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 26.34961439588689, |
|
"eval_accuracy": 0.78658190904128, |
|
"eval_loss": 0.9806557297706604, |
|
"eval_runtime": 34.3785, |
|
"eval_samples_per_second": 181.014, |
|
"eval_steps_per_second": 5.672, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 26.99228791773779, |
|
"grad_norm": 2.3693931102752686, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.9171, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 26.99228791773779, |
|
"eval_accuracy": 0.7854132867260752, |
|
"eval_loss": 0.9825689196586609, |
|
"eval_runtime": 34.4341, |
|
"eval_samples_per_second": 180.722, |
|
"eval_steps_per_second": 5.663, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 27.63496143958869, |
|
"grad_norm": 3.0254580974578857, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 0.9062, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 27.63496143958869, |
|
"eval_accuracy": 0.7879520726883571, |
|
"eval_loss": 0.9732692241668701, |
|
"eval_runtime": 34.5783, |
|
"eval_samples_per_second": 179.968, |
|
"eval_steps_per_second": 5.639, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 28.27763496143959, |
|
"grad_norm": 2.6540822982788086, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.9095, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 28.27763496143959, |
|
"eval_accuracy": 0.7879547877788341, |
|
"eval_loss": 0.9751275777816772, |
|
"eval_runtime": 33.0181, |
|
"eval_samples_per_second": 188.473, |
|
"eval_steps_per_second": 5.906, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 28.920308483290487, |
|
"grad_norm": 2.7978150844573975, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.9003, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 28.920308483290487, |
|
"eval_accuracy": 0.789157797413295, |
|
"eval_loss": 0.9676902890205383, |
|
"eval_runtime": 34.176, |
|
"eval_samples_per_second": 182.087, |
|
"eval_steps_per_second": 5.706, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 29.56298200514139, |
|
"grad_norm": 2.6367812156677246, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.8994, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 29.56298200514139, |
|
"eval_accuracy": 0.78875261302136, |
|
"eval_loss": 0.9720832109451294, |
|
"eval_runtime": 34.6435, |
|
"eval_samples_per_second": 179.63, |
|
"eval_steps_per_second": 5.629, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 30.205655526992288, |
|
"grad_norm": 2.8410837650299072, |
|
"learning_rate": 7.65e-05, |
|
"loss": 0.8937, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 30.205655526992288, |
|
"eval_accuracy": 0.7898726325583352, |
|
"eval_loss": 0.9666265249252319, |
|
"eval_runtime": 34.382, |
|
"eval_samples_per_second": 180.996, |
|
"eval_steps_per_second": 5.672, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 30.848329048843187, |
|
"grad_norm": 2.6249313354492188, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8853, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 30.848329048843187, |
|
"eval_accuracy": 0.7914164731021732, |
|
"eval_loss": 0.9592417478561401, |
|
"eval_runtime": 34.7355, |
|
"eval_samples_per_second": 179.154, |
|
"eval_steps_per_second": 5.614, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 31.491002570694086, |
|
"grad_norm": 2.563669443130493, |
|
"learning_rate": 7.55e-05, |
|
"loss": 0.8838, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 31.491002570694086, |
|
"eval_accuracy": 0.7892845518211694, |
|
"eval_loss": 0.969780683517456, |
|
"eval_runtime": 34.7542, |
|
"eval_samples_per_second": 179.057, |
|
"eval_steps_per_second": 5.611, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 32.13367609254499, |
|
"grad_norm": 2.7462027072906494, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.8809, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 32.13367609254499, |
|
"eval_accuracy": 0.7920457676713335, |
|
"eval_loss": 0.9529361128807068, |
|
"eval_runtime": 34.3098, |
|
"eval_samples_per_second": 181.377, |
|
"eval_steps_per_second": 5.684, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 32.77634961439589, |
|
"grad_norm": 2.533267021179199, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.874, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 32.77634961439589, |
|
"eval_accuracy": 0.7919883512638669, |
|
"eval_loss": 0.9449800252914429, |
|
"eval_runtime": 34.7988, |
|
"eval_samples_per_second": 178.828, |
|
"eval_steps_per_second": 5.604, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 33.419023136246786, |
|
"grad_norm": 2.718332529067993, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.8647, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 33.419023136246786, |
|
"eval_accuracy": 0.7927094725330193, |
|
"eval_loss": 0.9489922523498535, |
|
"eval_runtime": 34.773, |
|
"eval_samples_per_second": 178.961, |
|
"eval_steps_per_second": 5.608, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 34.06169665809769, |
|
"grad_norm": 2.664407253265381, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.8638, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 34.06169665809769, |
|
"eval_accuracy": 0.7909423101119826, |
|
"eval_loss": 0.9638357758522034, |
|
"eval_runtime": 34.4764, |
|
"eval_samples_per_second": 180.501, |
|
"eval_steps_per_second": 5.656, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 34.70437017994858, |
|
"grad_norm": 2.9981689453125, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.8604, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 34.70437017994858, |
|
"eval_accuracy": 0.792258094990671, |
|
"eval_loss": 0.9567773938179016, |
|
"eval_runtime": 34.4124, |
|
"eval_samples_per_second": 180.836, |
|
"eval_steps_per_second": 5.667, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 35.347043701799485, |
|
"grad_norm": 2.6397457122802734, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.8516, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 35.347043701799485, |
|
"eval_accuracy": 0.7944613334662306, |
|
"eval_loss": 0.9480540156364441, |
|
"eval_runtime": 34.3734, |
|
"eval_samples_per_second": 181.041, |
|
"eval_steps_per_second": 5.673, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 35.98971722365039, |
|
"grad_norm": 2.9263813495635986, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.8529, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 35.98971722365039, |
|
"eval_accuracy": 0.7949814058148135, |
|
"eval_loss": 0.9380202293395996, |
|
"eval_runtime": 34.1807, |
|
"eval_samples_per_second": 182.062, |
|
"eval_steps_per_second": 5.705, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 36.63239074550128, |
|
"grad_norm": 2.446913719177246, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.8454, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 36.63239074550128, |
|
"eval_accuracy": 0.7940900415285876, |
|
"eval_loss": 0.9452695250511169, |
|
"eval_runtime": 33.1944, |
|
"eval_samples_per_second": 187.472, |
|
"eval_steps_per_second": 5.874, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 37.275064267352185, |
|
"grad_norm": 2.429306983947754, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.8432, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 37.275064267352185, |
|
"eval_accuracy": 0.7944350218702906, |
|
"eval_loss": 0.9466774463653564, |
|
"eval_runtime": 33.291, |
|
"eval_samples_per_second": 186.927, |
|
"eval_steps_per_second": 5.857, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 37.91773778920309, |
|
"grad_norm": 2.648399591445923, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.8468, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 37.91773778920309, |
|
"eval_accuracy": 0.7957481551099058, |
|
"eval_loss": 0.9400731325149536, |
|
"eval_runtime": 33.2405, |
|
"eval_samples_per_second": 187.211, |
|
"eval_steps_per_second": 5.866, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 38.56041131105398, |
|
"grad_norm": 2.644608974456787, |
|
"learning_rate": 7e-05, |
|
"loss": 0.8328, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 38.56041131105398, |
|
"eval_accuracy": 0.795926071569013, |
|
"eval_loss": 0.9422577619552612, |
|
"eval_runtime": 33.3493, |
|
"eval_samples_per_second": 186.601, |
|
"eval_steps_per_second": 5.847, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 39.203084832904885, |
|
"grad_norm": 2.6563777923583984, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.8323, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 39.203084832904885, |
|
"eval_accuracy": 0.7951947535355016, |
|
"eval_loss": 0.9368035793304443, |
|
"eval_runtime": 32.9628, |
|
"eval_samples_per_second": 188.788, |
|
"eval_steps_per_second": 5.916, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 39.84575835475579, |
|
"grad_norm": 2.5932886600494385, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.8268, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 39.84575835475579, |
|
"eval_accuracy": 0.7967557476448579, |
|
"eval_loss": 0.9303238391876221, |
|
"eval_runtime": 32.4851, |
|
"eval_samples_per_second": 191.565, |
|
"eval_steps_per_second": 6.003, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 40.48843187660668, |
|
"grad_norm": 2.7462244033813477, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.8246, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 40.48843187660668, |
|
"eval_accuracy": 0.7966305277950054, |
|
"eval_loss": 0.9323325157165527, |
|
"eval_runtime": 33.1165, |
|
"eval_samples_per_second": 187.912, |
|
"eval_steps_per_second": 5.888, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 41.131105398457585, |
|
"grad_norm": 2.647944211959839, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.8216, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 41.131105398457585, |
|
"eval_accuracy": 0.7962699125415105, |
|
"eval_loss": 0.9350172281265259, |
|
"eval_runtime": 32.873, |
|
"eval_samples_per_second": 189.305, |
|
"eval_steps_per_second": 5.932, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 41.77377892030848, |
|
"grad_norm": 2.661036729812622, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.8218, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 41.77377892030848, |
|
"eval_accuracy": 0.7972857023271388, |
|
"eval_loss": 0.9303329586982727, |
|
"eval_runtime": 32.6506, |
|
"eval_samples_per_second": 190.594, |
|
"eval_steps_per_second": 5.972, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 42.41645244215938, |
|
"grad_norm": 2.7694153785705566, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.8132, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 42.41645244215938, |
|
"eval_accuracy": 0.7977203787112787, |
|
"eval_loss": 0.9363131523132324, |
|
"eval_runtime": 33.0889, |
|
"eval_samples_per_second": 188.069, |
|
"eval_steps_per_second": 5.893, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 43.059125964010285, |
|
"grad_norm": 2.8761026859283447, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.8082, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 43.059125964010285, |
|
"eval_accuracy": 0.7981091852326369, |
|
"eval_loss": 0.9364966154098511, |
|
"eval_runtime": 32.3923, |
|
"eval_samples_per_second": 192.113, |
|
"eval_steps_per_second": 6.02, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 43.70179948586118, |
|
"grad_norm": 2.6908044815063477, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.806, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 43.70179948586118, |
|
"eval_accuracy": 0.7986442847736496, |
|
"eval_loss": 0.9224198460578918, |
|
"eval_runtime": 33.388, |
|
"eval_samples_per_second": 186.384, |
|
"eval_steps_per_second": 5.84, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 44.34447300771208, |
|
"grad_norm": 2.6623470783233643, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.8024, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 44.34447300771208, |
|
"eval_accuracy": 0.7992224783044672, |
|
"eval_loss": 0.9192689061164856, |
|
"eval_runtime": 33.3722, |
|
"eval_samples_per_second": 186.473, |
|
"eval_steps_per_second": 5.843, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 44.987146529562985, |
|
"grad_norm": 2.6698567867279053, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.8016, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 44.987146529562985, |
|
"eval_accuracy": 0.7984851112126992, |
|
"eval_loss": 0.9178963303565979, |
|
"eval_runtime": 33.2663, |
|
"eval_samples_per_second": 187.066, |
|
"eval_steps_per_second": 5.862, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 45.62982005141388, |
|
"grad_norm": 2.980325222015381, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.796, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 45.62982005141388, |
|
"eval_accuracy": 0.7979037090039865, |
|
"eval_loss": 0.9273726344108582, |
|
"eval_runtime": 33.0665, |
|
"eval_samples_per_second": 188.196, |
|
"eval_steps_per_second": 5.897, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 46.27249357326478, |
|
"grad_norm": 2.536480188369751, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.7976, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 46.27249357326478, |
|
"eval_accuracy": 0.8009755918651776, |
|
"eval_loss": 0.9136722087860107, |
|
"eval_runtime": 33.7816, |
|
"eval_samples_per_second": 184.213, |
|
"eval_steps_per_second": 5.772, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 46.915167095115685, |
|
"grad_norm": 2.369147777557373, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.7888, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 46.915167095115685, |
|
"eval_accuracy": 0.8006721188810271, |
|
"eval_loss": 0.9139747619628906, |
|
"eval_runtime": 33.0443, |
|
"eval_samples_per_second": 188.323, |
|
"eval_steps_per_second": 5.901, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 47.55784061696658, |
|
"grad_norm": 2.606424570083618, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.7826, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 47.55784061696658, |
|
"eval_accuracy": 0.8006546893234877, |
|
"eval_loss": 0.9165197610855103, |
|
"eval_runtime": 32.9373, |
|
"eval_samples_per_second": 188.935, |
|
"eval_steps_per_second": 5.92, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 48.20051413881748, |
|
"grad_norm": 2.869199514389038, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.789, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 48.20051413881748, |
|
"eval_accuracy": 0.8009030033534118, |
|
"eval_loss": 0.9167375564575195, |
|
"eval_runtime": 33.2315, |
|
"eval_samples_per_second": 187.262, |
|
"eval_steps_per_second": 5.868, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 48.84318766066838, |
|
"grad_norm": 2.5320894718170166, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.7827, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 48.84318766066838, |
|
"eval_accuracy": 0.7997841239160237, |
|
"eval_loss": 0.9282008409500122, |
|
"eval_runtime": 33.3582, |
|
"eval_samples_per_second": 186.551, |
|
"eval_steps_per_second": 5.846, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 49.48586118251928, |
|
"grad_norm": 2.741992712020874, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.7739, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 49.48586118251928, |
|
"eval_accuracy": 0.8020669565628894, |
|
"eval_loss": 0.9078426361083984, |
|
"eval_runtime": 33.2299, |
|
"eval_samples_per_second": 187.271, |
|
"eval_steps_per_second": 5.868, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 50.12853470437018, |
|
"grad_norm": 2.563089370727539, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.7766, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 50.12853470437018, |
|
"eval_accuracy": 0.8028247073903633, |
|
"eval_loss": 0.9107823967933655, |
|
"eval_runtime": 33.5047, |
|
"eval_samples_per_second": 185.735, |
|
"eval_steps_per_second": 5.82, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 50.77120822622108, |
|
"grad_norm": 2.5732595920562744, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.7728, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 50.77120822622108, |
|
"eval_accuracy": 0.8025567474818919, |
|
"eval_loss": 0.9065195918083191, |
|
"eval_runtime": 33.1402, |
|
"eval_samples_per_second": 187.778, |
|
"eval_steps_per_second": 5.884, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 51.41388174807198, |
|
"grad_norm": 2.4717209339141846, |
|
"learning_rate": 6e-05, |
|
"loss": 0.7676, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 51.41388174807198, |
|
"eval_accuracy": 0.8028714592601541, |
|
"eval_loss": 0.9060749411582947, |
|
"eval_runtime": 32.3856, |
|
"eval_samples_per_second": 192.153, |
|
"eval_steps_per_second": 6.021, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 52.05655526992288, |
|
"grad_norm": 2.5366320610046387, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.7653, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 52.05655526992288, |
|
"eval_accuracy": 0.8030310980056771, |
|
"eval_loss": 0.9148956537246704, |
|
"eval_runtime": 33.3546, |
|
"eval_samples_per_second": 186.571, |
|
"eval_steps_per_second": 5.846, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 52.69922879177378, |
|
"grad_norm": 2.9074325561523438, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.7607, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 52.69922879177378, |
|
"eval_accuracy": 0.8039304797381431, |
|
"eval_loss": 0.8974832892417908, |
|
"eval_runtime": 33.4483, |
|
"eval_samples_per_second": 186.049, |
|
"eval_steps_per_second": 5.83, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 53.34190231362468, |
|
"grad_norm": 3.1211259365081787, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.7604, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 53.34190231362468, |
|
"eval_accuracy": 0.8036355728594897, |
|
"eval_loss": 0.90870201587677, |
|
"eval_runtime": 33.4646, |
|
"eval_samples_per_second": 185.957, |
|
"eval_steps_per_second": 5.827, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 53.98457583547558, |
|
"grad_norm": 2.762848377227783, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.7593, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 53.98457583547558, |
|
"eval_accuracy": 0.802461854621613, |
|
"eval_loss": 0.9101512432098389, |
|
"eval_runtime": 32.3612, |
|
"eval_samples_per_second": 192.298, |
|
"eval_steps_per_second": 6.026, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 54.62724935732648, |
|
"grad_norm": 2.6548171043395996, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.7564, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 54.62724935732648, |
|
"eval_accuracy": 0.804102518442478, |
|
"eval_loss": 0.9050089120864868, |
|
"eval_runtime": 33.1101, |
|
"eval_samples_per_second": 187.949, |
|
"eval_steps_per_second": 5.889, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 55.26992287917738, |
|
"grad_norm": 2.6228950023651123, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.7452, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 55.26992287917738, |
|
"eval_accuracy": 0.8040075271997299, |
|
"eval_loss": 0.9055400490760803, |
|
"eval_runtime": 33.014, |
|
"eval_samples_per_second": 188.496, |
|
"eval_steps_per_second": 5.907, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 55.912596401028274, |
|
"grad_norm": 2.6490426063537598, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.7488, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 55.912596401028274, |
|
"eval_accuracy": 0.8048906062428912, |
|
"eval_loss": 0.9053667187690735, |
|
"eval_runtime": 32.3856, |
|
"eval_samples_per_second": 192.153, |
|
"eval_steps_per_second": 6.021, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 56.55526992287918, |
|
"grad_norm": 2.494063377380371, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.7472, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 56.55526992287918, |
|
"eval_accuracy": 0.8050882383051106, |
|
"eval_loss": 0.8990674614906311, |
|
"eval_runtime": 34.0904, |
|
"eval_samples_per_second": 182.544, |
|
"eval_steps_per_second": 5.72, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 57.19794344473008, |
|
"grad_norm": 2.8385369777679443, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.7469, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 57.19794344473008, |
|
"eval_accuracy": 0.8047514040498369, |
|
"eval_loss": 0.9029610753059387, |
|
"eval_runtime": 32.596, |
|
"eval_samples_per_second": 190.913, |
|
"eval_steps_per_second": 5.982, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 57.840616966580974, |
|
"grad_norm": 2.4786102771759033, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.7386, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 57.840616966580974, |
|
"eval_accuracy": 0.8045387474256496, |
|
"eval_loss": 0.8992837071418762, |
|
"eval_runtime": 33.7728, |
|
"eval_samples_per_second": 184.261, |
|
"eval_steps_per_second": 5.774, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 58.48329048843188, |
|
"grad_norm": 2.57100248336792, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.7383, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 58.48329048843188, |
|
"eval_accuracy": 0.8045938243522368, |
|
"eval_loss": 0.9126896858215332, |
|
"eval_runtime": 33.0001, |
|
"eval_samples_per_second": 188.575, |
|
"eval_steps_per_second": 5.909, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 59.12596401028278, |
|
"grad_norm": 2.6267364025115967, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.7372, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 59.12596401028278, |
|
"eval_accuracy": 0.8047429982319032, |
|
"eval_loss": 0.9037800431251526, |
|
"eval_runtime": 33.4956, |
|
"eval_samples_per_second": 185.786, |
|
"eval_steps_per_second": 5.822, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 59.768637532133674, |
|
"grad_norm": 2.6067042350769043, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.7293, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 59.768637532133674, |
|
"eval_accuracy": 0.8063738822187712, |
|
"eval_loss": 0.891742467880249, |
|
"eval_runtime": 33.3465, |
|
"eval_samples_per_second": 186.616, |
|
"eval_steps_per_second": 5.848, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 60.411311053984576, |
|
"grad_norm": 2.748093843460083, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.7266, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 60.411311053984576, |
|
"eval_accuracy": 0.8060851226391891, |
|
"eval_loss": 0.8950145840644836, |
|
"eval_runtime": 33.417, |
|
"eval_samples_per_second": 186.223, |
|
"eval_steps_per_second": 5.835, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 61.05398457583548, |
|
"grad_norm": 2.7752370834350586, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.7278, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 61.05398457583548, |
|
"eval_accuracy": 0.8052570807938081, |
|
"eval_loss": 0.9001266956329346, |
|
"eval_runtime": 33.2575, |
|
"eval_samples_per_second": 187.116, |
|
"eval_steps_per_second": 5.863, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 61.696658097686374, |
|
"grad_norm": 2.4993159770965576, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.7224, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 61.696658097686374, |
|
"eval_accuracy": 0.8063545774342619, |
|
"eval_loss": 0.9000985622406006, |
|
"eval_runtime": 32.5061, |
|
"eval_samples_per_second": 191.441, |
|
"eval_steps_per_second": 5.999, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 62.339331619537276, |
|
"grad_norm": 2.795344829559326, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.7156, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 62.339331619537276, |
|
"eval_accuracy": 0.8078125860121053, |
|
"eval_loss": 0.8850185871124268, |
|
"eval_runtime": 33.3065, |
|
"eval_samples_per_second": 186.84, |
|
"eval_steps_per_second": 5.855, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 62.98200514138817, |
|
"grad_norm": 2.4736196994781494, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.7179, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 62.98200514138817, |
|
"eval_accuracy": 0.8069766099061457, |
|
"eval_loss": 0.8914857506752014, |
|
"eval_runtime": 33.2699, |
|
"eval_samples_per_second": 187.046, |
|
"eval_steps_per_second": 5.861, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 63.624678663239074, |
|
"grad_norm": 2.7739675045013428, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.7158, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 63.624678663239074, |
|
"eval_accuracy": 0.806712037699331, |
|
"eval_loss": 0.8928523063659668, |
|
"eval_runtime": 33.5382, |
|
"eval_samples_per_second": 185.55, |
|
"eval_steps_per_second": 5.814, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 64.26735218508998, |
|
"grad_norm": 2.6932947635650635, |
|
"learning_rate": 5e-05, |
|
"loss": 0.711, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 64.26735218508998, |
|
"eval_accuracy": 0.8060159735552406, |
|
"eval_loss": 0.9013872742652893, |
|
"eval_runtime": 33.9301, |
|
"eval_samples_per_second": 183.407, |
|
"eval_steps_per_second": 5.747, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 64.91002570694087, |
|
"grad_norm": 2.7735486030578613, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.7097, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 64.91002570694087, |
|
"eval_accuracy": 0.8067475980060699, |
|
"eval_loss": 0.8957981467247009, |
|
"eval_runtime": 32.2026, |
|
"eval_samples_per_second": 193.245, |
|
"eval_steps_per_second": 6.055, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 65.55269922879178, |
|
"grad_norm": 2.9156525135040283, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.7079, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 65.55269922879178, |
|
"eval_accuracy": 0.8081258363288117, |
|
"eval_loss": 0.894716739654541, |
|
"eval_runtime": 32.7636, |
|
"eval_samples_per_second": 189.936, |
|
"eval_steps_per_second": 5.952, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 66.19537275064268, |
|
"grad_norm": 2.6359803676605225, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.7091, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 66.19537275064268, |
|
"eval_accuracy": 0.8079719372684523, |
|
"eval_loss": 0.8968275189399719, |
|
"eval_runtime": 32.9482, |
|
"eval_samples_per_second": 188.872, |
|
"eval_steps_per_second": 5.918, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 66.83804627249357, |
|
"grad_norm": 2.481441020965576, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.7023, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 66.83804627249357, |
|
"eval_accuracy": 0.8089100382369007, |
|
"eval_loss": 0.8855522274971008, |
|
"eval_runtime": 33.2751, |
|
"eval_samples_per_second": 187.017, |
|
"eval_steps_per_second": 5.86, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 67.48071979434447, |
|
"grad_norm": 2.7268731594085693, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.701, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 67.48071979434447, |
|
"eval_accuracy": 0.8084944735097863, |
|
"eval_loss": 0.8864369988441467, |
|
"eval_runtime": 33.1906, |
|
"eval_samples_per_second": 187.493, |
|
"eval_steps_per_second": 5.875, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 68.12339331619538, |
|
"grad_norm": 2.5931150913238525, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.6988, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 68.12339331619538, |
|
"eval_accuracy": 0.8089927462608983, |
|
"eval_loss": 0.8866479992866516, |
|
"eval_runtime": 33.2058, |
|
"eval_samples_per_second": 187.407, |
|
"eval_steps_per_second": 5.872, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 68.76606683804627, |
|
"grad_norm": 2.736328601837158, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.6994, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 68.76606683804627, |
|
"eval_accuracy": 0.8084927024019897, |
|
"eval_loss": 0.8820142149925232, |
|
"eval_runtime": 32.9508, |
|
"eval_samples_per_second": 188.857, |
|
"eval_steps_per_second": 5.918, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 69.40874035989717, |
|
"grad_norm": 2.8003087043762207, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.6831, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 69.40874035989717, |
|
"eval_accuracy": 0.8091836520937638, |
|
"eval_loss": 0.8856648206710815, |
|
"eval_runtime": 33.3733, |
|
"eval_samples_per_second": 186.466, |
|
"eval_steps_per_second": 5.843, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 70.05141388174808, |
|
"grad_norm": 2.7809290885925293, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.6948, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 70.05141388174808, |
|
"eval_accuracy": 0.8089959519265054, |
|
"eval_loss": 0.89335036277771, |
|
"eval_runtime": 33.324, |
|
"eval_samples_per_second": 186.743, |
|
"eval_steps_per_second": 5.852, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 70.69408740359897, |
|
"grad_norm": 2.5319223403930664, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.688, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 70.69408740359897, |
|
"eval_accuracy": 0.8102897851773414, |
|
"eval_loss": 0.8859269022941589, |
|
"eval_runtime": 33.0875, |
|
"eval_samples_per_second": 188.077, |
|
"eval_steps_per_second": 5.893, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 71.33676092544987, |
|
"grad_norm": 3.0281503200531006, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.6894, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 71.33676092544987, |
|
"eval_accuracy": 0.8097594530479655, |
|
"eval_loss": 0.8875166773796082, |
|
"eval_runtime": 32.7248, |
|
"eval_samples_per_second": 190.161, |
|
"eval_steps_per_second": 5.959, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 71.97943444730078, |
|
"grad_norm": 2.476452589035034, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.6855, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 71.97943444730078, |
|
"eval_accuracy": 0.8097316597458161, |
|
"eval_loss": 0.8884576559066772, |
|
"eval_runtime": 33.1154, |
|
"eval_samples_per_second": 187.919, |
|
"eval_steps_per_second": 5.889, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 72.62210796915167, |
|
"grad_norm": 2.662523031234741, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.6811, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 72.62210796915167, |
|
"eval_accuracy": 0.809297297864581, |
|
"eval_loss": 0.8833766579627991, |
|
"eval_runtime": 33.1424, |
|
"eval_samples_per_second": 187.766, |
|
"eval_steps_per_second": 5.884, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 73.26478149100257, |
|
"grad_norm": 2.4902071952819824, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.6782, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 73.26478149100257, |
|
"eval_accuracy": 0.8089903844893186, |
|
"eval_loss": 0.8910095691680908, |
|
"eval_runtime": 33.3324, |
|
"eval_samples_per_second": 186.695, |
|
"eval_steps_per_second": 5.85, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 73.90745501285348, |
|
"grad_norm": 2.897806167602539, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.6761, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 73.90745501285348, |
|
"eval_accuracy": 0.8115941649425137, |
|
"eval_loss": 0.8758607506752014, |
|
"eval_runtime": 33.305, |
|
"eval_samples_per_second": 186.849, |
|
"eval_steps_per_second": 5.855, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 74.55012853470437, |
|
"grad_norm": 2.6833012104034424, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.6713, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 74.55012853470437, |
|
"eval_accuracy": 0.8105924384594244, |
|
"eval_loss": 0.8842668533325195, |
|
"eval_runtime": 32.104, |
|
"eval_samples_per_second": 193.839, |
|
"eval_steps_per_second": 6.074, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 75.19280205655527, |
|
"grad_norm": 3.058183193206787, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.6751, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 75.19280205655527, |
|
"eval_accuracy": 0.8100959331379869, |
|
"eval_loss": 0.8922275304794312, |
|
"eval_runtime": 32.0158, |
|
"eval_samples_per_second": 194.373, |
|
"eval_steps_per_second": 6.091, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 75.83547557840618, |
|
"grad_norm": 2.5572476387023926, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.667, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 75.83547557840618, |
|
"eval_accuracy": 0.8115087833220307, |
|
"eval_loss": 0.8827089071273804, |
|
"eval_runtime": 32.068, |
|
"eval_samples_per_second": 194.056, |
|
"eval_steps_per_second": 6.081, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 76.47814910025707, |
|
"grad_norm": 2.6060643196105957, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.6628, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 76.47814910025707, |
|
"eval_accuracy": 0.8114595604343708, |
|
"eval_loss": 0.8741211891174316, |
|
"eval_runtime": 32.0679, |
|
"eval_samples_per_second": 194.057, |
|
"eval_steps_per_second": 6.081, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 77.12082262210797, |
|
"grad_norm": 2.643742322921753, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6685, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 77.12082262210797, |
|
"eval_accuracy": 0.8120777359560294, |
|
"eval_loss": 0.8767244219779968, |
|
"eval_runtime": 32.012, |
|
"eval_samples_per_second": 194.396, |
|
"eval_steps_per_second": 6.091, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 77.76349614395887, |
|
"grad_norm": 3.0320351123809814, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.6618, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 77.76349614395887, |
|
"eval_accuracy": 0.8108977044235925, |
|
"eval_loss": 0.882882833480835, |
|
"eval_runtime": 32.0023, |
|
"eval_samples_per_second": 194.455, |
|
"eval_steps_per_second": 6.093, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 78.40616966580977, |
|
"grad_norm": 2.645836114883423, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.6626, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 78.40616966580977, |
|
"eval_accuracy": 0.8128617042506485, |
|
"eval_loss": 0.8741999268531799, |
|
"eval_runtime": 32.0443, |
|
"eval_samples_per_second": 194.2, |
|
"eval_steps_per_second": 6.085, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 79.04884318766067, |
|
"grad_norm": 2.858215570449829, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.6617, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 79.04884318766067, |
|
"eval_accuracy": 0.8136581954693509, |
|
"eval_loss": 0.8712067604064941, |
|
"eval_runtime": 32.061, |
|
"eval_samples_per_second": 194.099, |
|
"eval_steps_per_second": 6.082, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 79.69151670951157, |
|
"grad_norm": 2.5988807678222656, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.655, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 79.69151670951157, |
|
"eval_accuracy": 0.8128823572620334, |
|
"eval_loss": 0.8742080926895142, |
|
"eval_runtime": 32.0565, |
|
"eval_samples_per_second": 194.126, |
|
"eval_steps_per_second": 6.083, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 80.33419023136247, |
|
"grad_norm": 2.6401941776275635, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.6545, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 80.33419023136247, |
|
"eval_accuracy": 0.812843042566724, |
|
"eval_loss": 0.8718605041503906, |
|
"eval_runtime": 31.9943, |
|
"eval_samples_per_second": 194.503, |
|
"eval_steps_per_second": 6.095, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 80.97686375321337, |
|
"grad_norm": 2.8062636852264404, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.655, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 80.97686375321337, |
|
"eval_accuracy": 0.811865863240631, |
|
"eval_loss": 0.8775948882102966, |
|
"eval_runtime": 32.0581, |
|
"eval_samples_per_second": 194.116, |
|
"eval_steps_per_second": 6.083, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 81.61953727506426, |
|
"grad_norm": 2.548220157623291, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.6468, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 81.61953727506426, |
|
"eval_accuracy": 0.8119920345795418, |
|
"eval_loss": 0.883579432964325, |
|
"eval_runtime": 32.0753, |
|
"eval_samples_per_second": 194.012, |
|
"eval_steps_per_second": 6.079, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 82.26221079691517, |
|
"grad_norm": 2.5861518383026123, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.6487, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 82.26221079691517, |
|
"eval_accuracy": 0.8131239363417762, |
|
"eval_loss": 0.8730902075767517, |
|
"eval_runtime": 32.1045, |
|
"eval_samples_per_second": 193.835, |
|
"eval_steps_per_second": 6.074, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 82.90488431876607, |
|
"grad_norm": 2.780217409133911, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.6495, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 82.90488431876607, |
|
"eval_accuracy": 0.8124547391400174, |
|
"eval_loss": 0.8819155097007751, |
|
"eval_runtime": 32.0183, |
|
"eval_samples_per_second": 194.357, |
|
"eval_steps_per_second": 6.09, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 83.54755784061696, |
|
"grad_norm": 2.5522782802581787, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.6425, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 83.54755784061696, |
|
"eval_accuracy": 0.8124381288595807, |
|
"eval_loss": 0.885295569896698, |
|
"eval_runtime": 32.0293, |
|
"eval_samples_per_second": 194.291, |
|
"eval_steps_per_second": 6.088, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 84.19023136246787, |
|
"grad_norm": 2.6785833835601807, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.6423, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 84.19023136246787, |
|
"eval_accuracy": 0.8132641437128263, |
|
"eval_loss": 0.8791692852973938, |
|
"eval_runtime": 32.0427, |
|
"eval_samples_per_second": 194.209, |
|
"eval_steps_per_second": 6.086, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 84.83290488431876, |
|
"grad_norm": 2.6489408016204834, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.6377, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 84.83290488431876, |
|
"eval_accuracy": 0.8154822401853656, |
|
"eval_loss": 0.8657127618789673, |
|
"eval_runtime": 32.102, |
|
"eval_samples_per_second": 193.851, |
|
"eval_steps_per_second": 6.074, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 85.47557840616966, |
|
"grad_norm": 2.556199312210083, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.6334, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 85.47557840616966, |
|
"eval_accuracy": 0.8141076970581924, |
|
"eval_loss": 0.8747490644454956, |
|
"eval_runtime": 32.0636, |
|
"eval_samples_per_second": 194.083, |
|
"eval_steps_per_second": 6.082, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 86.11825192802057, |
|
"grad_norm": 2.477529764175415, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.6421, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 86.11825192802057, |
|
"eval_accuracy": 0.8136067662347515, |
|
"eval_loss": 0.8816725015640259, |
|
"eval_runtime": 32.0966, |
|
"eval_samples_per_second": 193.883, |
|
"eval_steps_per_second": 6.075, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 86.76092544987146, |
|
"grad_norm": 2.6774518489837646, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.6366, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 86.76092544987146, |
|
"eval_accuracy": 0.814412672148727, |
|
"eval_loss": 0.8695808053016663, |
|
"eval_runtime": 32.0613, |
|
"eval_samples_per_second": 194.097, |
|
"eval_steps_per_second": 6.082, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 87.40359897172236, |
|
"grad_norm": 2.6455330848693848, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.6305, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 87.40359897172236, |
|
"eval_accuracy": 0.8148675345064433, |
|
"eval_loss": 0.8716031312942505, |
|
"eval_runtime": 32.0969, |
|
"eval_samples_per_second": 193.882, |
|
"eval_steps_per_second": 6.075, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 88.04627249357327, |
|
"grad_norm": 2.7725086212158203, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.6325, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 88.04627249357327, |
|
"eval_accuracy": 0.813131776644422, |
|
"eval_loss": 0.8786645531654358, |
|
"eval_runtime": 32.024, |
|
"eval_samples_per_second": 194.323, |
|
"eval_steps_per_second": 6.089, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 88.68894601542416, |
|
"grad_norm": 2.6008856296539307, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.6282, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 88.68894601542416, |
|
"eval_accuracy": 0.8146181108967369, |
|
"eval_loss": 0.8678516149520874, |
|
"eval_runtime": 32.0037, |
|
"eval_samples_per_second": 194.447, |
|
"eval_steps_per_second": 6.093, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 89.33161953727506, |
|
"grad_norm": 2.7128331661224365, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.6261, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 89.33161953727506, |
|
"eval_accuracy": 0.8144260367762641, |
|
"eval_loss": 0.871059000492096, |
|
"eval_runtime": 32.0173, |
|
"eval_samples_per_second": 194.364, |
|
"eval_steps_per_second": 6.09, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 89.97429305912597, |
|
"grad_norm": 2.324878692626953, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6263, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 89.97429305912597, |
|
"eval_accuracy": 0.8156161510225485, |
|
"eval_loss": 0.8654680252075195, |
|
"eval_runtime": 32.0295, |
|
"eval_samples_per_second": 194.29, |
|
"eval_steps_per_second": 6.088, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 90.61696658097686, |
|
"grad_norm": 2.7399654388427734, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.6233, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 90.61696658097686, |
|
"eval_accuracy": 0.8162172670337523, |
|
"eval_loss": 0.8661888241767883, |
|
"eval_runtime": 32.0203, |
|
"eval_samples_per_second": 194.346, |
|
"eval_steps_per_second": 6.09, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 91.25964010282776, |
|
"grad_norm": 2.8856687545776367, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.6176, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 91.25964010282776, |
|
"eval_accuracy": 0.8161184520753214, |
|
"eval_loss": 0.8683423399925232, |
|
"eval_runtime": 32.0579, |
|
"eval_samples_per_second": 194.117, |
|
"eval_steps_per_second": 6.083, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 91.90231362467867, |
|
"grad_norm": 2.4138548374176025, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.6239, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 91.90231362467867, |
|
"eval_accuracy": 0.8153887598602306, |
|
"eval_loss": 0.8599680066108704, |
|
"eval_runtime": 32.0297, |
|
"eval_samples_per_second": 194.289, |
|
"eval_steps_per_second": 6.088, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 92.54498714652956, |
|
"grad_norm": 2.6343512535095215, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.6189, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 92.54498714652956, |
|
"eval_accuracy": 0.8154344802246543, |
|
"eval_loss": 0.8673732876777649, |
|
"eval_runtime": 32.0317, |
|
"eval_samples_per_second": 194.276, |
|
"eval_steps_per_second": 6.088, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 93.18766066838046, |
|
"grad_norm": 2.6886157989501953, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.6117, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 93.18766066838046, |
|
"eval_accuracy": 0.8162594874166894, |
|
"eval_loss": 0.8670706748962402, |
|
"eval_runtime": 32.0796, |
|
"eval_samples_per_second": 193.986, |
|
"eval_steps_per_second": 6.079, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 93.83033419023137, |
|
"grad_norm": 2.6037755012512207, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.615, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 93.83033419023137, |
|
"eval_accuracy": 0.8150874471307802, |
|
"eval_loss": 0.8663885593414307, |
|
"eval_runtime": 32.0301, |
|
"eval_samples_per_second": 194.286, |
|
"eval_steps_per_second": 6.088, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 94.47300771208226, |
|
"grad_norm": 2.5215036869049072, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.6138, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 94.47300771208226, |
|
"eval_accuracy": 0.8168731185953577, |
|
"eval_loss": 0.8630892634391785, |
|
"eval_runtime": 32.066, |
|
"eval_samples_per_second": 194.069, |
|
"eval_steps_per_second": 6.081, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 95.11568123393316, |
|
"grad_norm": 2.797325611114502, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.6107, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 95.11568123393316, |
|
"eval_accuracy": 0.8156241335474724, |
|
"eval_loss": 0.8718482851982117, |
|
"eval_runtime": 32.0559, |
|
"eval_samples_per_second": 194.13, |
|
"eval_steps_per_second": 6.083, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 95.75835475578407, |
|
"grad_norm": 3.0883982181549072, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.6099, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 95.75835475578407, |
|
"eval_accuracy": 0.8160560285491784, |
|
"eval_loss": 0.8676818013191223, |
|
"eval_runtime": 32.0504, |
|
"eval_samples_per_second": 194.163, |
|
"eval_steps_per_second": 6.084, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 96.40102827763496, |
|
"grad_norm": 2.922588348388672, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.6026, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 96.40102827763496, |
|
"eval_accuracy": 0.8162867029734353, |
|
"eval_loss": 0.8721068501472473, |
|
"eval_runtime": 32.019, |
|
"eval_samples_per_second": 194.353, |
|
"eval_steps_per_second": 6.09, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 97.04370179948586, |
|
"grad_norm": 2.663724660873413, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.6084, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 97.04370179948586, |
|
"eval_accuracy": 0.8182687960636296, |
|
"eval_loss": 0.858277440071106, |
|
"eval_runtime": 32.0344, |
|
"eval_samples_per_second": 194.26, |
|
"eval_steps_per_second": 6.087, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 97.68637532133675, |
|
"grad_norm": 2.8198440074920654, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.6033, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 97.68637532133675, |
|
"eval_accuracy": 0.8163839511643823, |
|
"eval_loss": 0.8688974380493164, |
|
"eval_runtime": 32.0629, |
|
"eval_samples_per_second": 194.087, |
|
"eval_steps_per_second": 6.082, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 98.32904884318766, |
|
"grad_norm": 2.3979711532592773, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.603, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 98.32904884318766, |
|
"eval_accuracy": 0.8169446192573946, |
|
"eval_loss": 0.8678939342498779, |
|
"eval_runtime": 32.0422, |
|
"eval_samples_per_second": 194.213, |
|
"eval_steps_per_second": 6.086, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 98.97172236503856, |
|
"grad_norm": 2.643324613571167, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.6009, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 98.97172236503856, |
|
"eval_accuracy": 0.8176424081803114, |
|
"eval_loss": 0.8609100580215454, |
|
"eval_runtime": 32.0681, |
|
"eval_samples_per_second": 194.056, |
|
"eval_steps_per_second": 6.081, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 99.61439588688945, |
|
"grad_norm": 2.791355848312378, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.601, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 99.61439588688945, |
|
"eval_accuracy": 0.816467387038539, |
|
"eval_loss": 0.867489755153656, |
|
"eval_runtime": 32.1192, |
|
"eval_samples_per_second": 193.747, |
|
"eval_steps_per_second": 6.071, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 100.25706940874036, |
|
"grad_norm": 2.787290096282959, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.5966, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 100.25706940874036, |
|
"eval_accuracy": 0.8171603328039044, |
|
"eval_loss": 0.8652510046958923, |
|
"eval_runtime": 32.0539, |
|
"eval_samples_per_second": 194.142, |
|
"eval_steps_per_second": 6.083, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 100.89974293059126, |
|
"grad_norm": 2.791646957397461, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.5935, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 100.89974293059126, |
|
"eval_accuracy": 0.8178409455191369, |
|
"eval_loss": 0.8600234985351562, |
|
"eval_runtime": 32.0149, |
|
"eval_samples_per_second": 194.378, |
|
"eval_steps_per_second": 6.091, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 101.54241645244215, |
|
"grad_norm": 2.4990010261535645, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.5912, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 101.54241645244215, |
|
"eval_accuracy": 0.8169466975666281, |
|
"eval_loss": 0.8653005957603455, |
|
"eval_runtime": 31.9972, |
|
"eval_samples_per_second": 194.486, |
|
"eval_steps_per_second": 6.094, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 102.18508997429306, |
|
"grad_norm": 2.3002467155456543, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.5873, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 102.18508997429306, |
|
"eval_accuracy": 0.81857837810817, |
|
"eval_loss": 0.8576123118400574, |
|
"eval_runtime": 31.9904, |
|
"eval_samples_per_second": 194.527, |
|
"eval_steps_per_second": 6.096, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 102.82776349614396, |
|
"grad_norm": 2.712336540222168, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5913, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 102.82776349614396, |
|
"eval_accuracy": 0.8185828450767534, |
|
"eval_loss": 0.855664074420929, |
|
"eval_runtime": 32.0488, |
|
"eval_samples_per_second": 194.173, |
|
"eval_steps_per_second": 6.084, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 103.47043701799485, |
|
"grad_norm": 2.769562005996704, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.5908, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 103.47043701799485, |
|
"eval_accuracy": 0.8184711461830915, |
|
"eval_loss": 0.8605988621711731, |
|
"eval_runtime": 32.0286, |
|
"eval_samples_per_second": 194.295, |
|
"eval_steps_per_second": 6.088, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 104.11311053984576, |
|
"grad_norm": 2.6929941177368164, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.5898, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 104.11311053984576, |
|
"eval_accuracy": 0.8183153166169729, |
|
"eval_loss": 0.8647379875183105, |
|
"eval_runtime": 32.0673, |
|
"eval_samples_per_second": 194.06, |
|
"eval_steps_per_second": 6.081, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 104.75578406169666, |
|
"grad_norm": 2.8258962631225586, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.5841, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 104.75578406169666, |
|
"eval_accuracy": 0.817774798787225, |
|
"eval_loss": 0.8651922941207886, |
|
"eval_runtime": 32.011, |
|
"eval_samples_per_second": 194.402, |
|
"eval_steps_per_second": 6.092, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 105.39845758354755, |
|
"grad_norm": 2.46081805229187, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.582, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 105.39845758354755, |
|
"eval_accuracy": 0.817711216701606, |
|
"eval_loss": 0.8619120717048645, |
|
"eval_runtime": 32.0215, |
|
"eval_samples_per_second": 194.338, |
|
"eval_steps_per_second": 6.09, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 106.04113110539846, |
|
"grad_norm": 2.960406541824341, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.5833, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 106.04113110539846, |
|
"eval_accuracy": 0.8198030559915176, |
|
"eval_loss": 0.8566803932189941, |
|
"eval_runtime": 31.9884, |
|
"eval_samples_per_second": 194.539, |
|
"eval_steps_per_second": 6.096, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 106.68380462724936, |
|
"grad_norm": 2.3195433616638184, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.5804, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 106.68380462724936, |
|
"eval_accuracy": 0.8193473285144606, |
|
"eval_loss": 0.8546783328056335, |
|
"eval_runtime": 32.0522, |
|
"eval_samples_per_second": 194.152, |
|
"eval_steps_per_second": 6.084, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 107.32647814910025, |
|
"grad_norm": 2.7828962802886963, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.5839, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 107.32647814910025, |
|
"eval_accuracy": 0.817869892847695, |
|
"eval_loss": 0.859340488910675, |
|
"eval_runtime": 32.2415, |
|
"eval_samples_per_second": 193.012, |
|
"eval_steps_per_second": 6.048, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 107.96915167095116, |
|
"grad_norm": 2.8258416652679443, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.5774, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 107.96915167095116, |
|
"eval_accuracy": 0.8181858662056681, |
|
"eval_loss": 0.8593936562538147, |
|
"eval_runtime": 32.0552, |
|
"eval_samples_per_second": 194.134, |
|
"eval_steps_per_second": 6.083, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 108.61182519280206, |
|
"grad_norm": 2.5473225116729736, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.5791, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 108.61182519280206, |
|
"eval_accuracy": 0.8201209689038143, |
|
"eval_loss": 0.8578335046768188, |
|
"eval_runtime": 32.0789, |
|
"eval_samples_per_second": 193.99, |
|
"eval_steps_per_second": 6.079, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 109.25449871465295, |
|
"grad_norm": 2.6014022827148438, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.5724, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 109.25449871465295, |
|
"eval_accuracy": 0.8179463406457561, |
|
"eval_loss": 0.8615684509277344, |
|
"eval_runtime": 32.0619, |
|
"eval_samples_per_second": 194.093, |
|
"eval_steps_per_second": 6.082, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 109.89717223650386, |
|
"grad_norm": 2.6044790744781494, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.5782, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 109.89717223650386, |
|
"eval_accuracy": 0.818407341792689, |
|
"eval_loss": 0.8595022559165955, |
|
"eval_runtime": 32.0145, |
|
"eval_samples_per_second": 194.381, |
|
"eval_steps_per_second": 6.091, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 110.53984575835476, |
|
"grad_norm": 2.570466995239258, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.5735, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 110.53984575835476, |
|
"eval_accuracy": 0.8203102481461101, |
|
"eval_loss": 0.8511986136436462, |
|
"eval_runtime": 32.0544, |
|
"eval_samples_per_second": 194.139, |
|
"eval_steps_per_second": 6.083, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 111.18251928020565, |
|
"grad_norm": 2.4701082706451416, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.5664, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 111.18251928020565, |
|
"eval_accuracy": 0.8206346877390821, |
|
"eval_loss": 0.8540939688682556, |
|
"eval_runtime": 32.046, |
|
"eval_samples_per_second": 194.19, |
|
"eval_steps_per_second": 6.085, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 111.82519280205655, |
|
"grad_norm": 2.9233691692352295, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.5719, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 111.82519280205655, |
|
"eval_accuracy": 0.8200771187996747, |
|
"eval_loss": 0.8584414124488831, |
|
"eval_runtime": 32.0359, |
|
"eval_samples_per_second": 194.251, |
|
"eval_steps_per_second": 6.087, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 112.46786632390746, |
|
"grad_norm": 2.3986382484436035, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.5682, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 112.46786632390746, |
|
"eval_accuracy": 0.8193907912255869, |
|
"eval_loss": 0.8565849661827087, |
|
"eval_runtime": 32.0525, |
|
"eval_samples_per_second": 194.15, |
|
"eval_steps_per_second": 6.084, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 113.11053984575835, |
|
"grad_norm": 2.5226054191589355, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.5719, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 113.11053984575835, |
|
"eval_accuracy": 0.8199272051177008, |
|
"eval_loss": 0.8500083684921265, |
|
"eval_runtime": 32.0889, |
|
"eval_samples_per_second": 193.93, |
|
"eval_steps_per_second": 6.077, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 113.75321336760925, |
|
"grad_norm": 2.609858274459839, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.5674, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 113.75321336760925, |
|
"eval_accuracy": 0.8205847363486674, |
|
"eval_loss": 0.8524363040924072, |
|
"eval_runtime": 32.0335, |
|
"eval_samples_per_second": 194.265, |
|
"eval_steps_per_second": 6.087, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 114.39588688946016, |
|
"grad_norm": 2.2834887504577637, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.5643, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 114.39588688946016, |
|
"eval_accuracy": 0.821479995279122, |
|
"eval_loss": 0.8454416990280151, |
|
"eval_runtime": 32.0112, |
|
"eval_samples_per_second": 194.401, |
|
"eval_steps_per_second": 6.092, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 115.03856041131105, |
|
"grad_norm": 2.776373863220215, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.5681, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 115.03856041131105, |
|
"eval_accuracy": 0.8196546091505466, |
|
"eval_loss": 0.8588145971298218, |
|
"eval_runtime": 31.2312, |
|
"eval_samples_per_second": 199.256, |
|
"eval_steps_per_second": 6.244, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 115.68123393316195, |
|
"grad_norm": 2.7792084217071533, |
|
"learning_rate": 1e-05, |
|
"loss": 0.563, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 115.68123393316195, |
|
"eval_accuracy": 0.8215367161011942, |
|
"eval_loss": 0.8472273945808411, |
|
"eval_runtime": 32.0865, |
|
"eval_samples_per_second": 193.944, |
|
"eval_steps_per_second": 6.077, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 116.32390745501286, |
|
"grad_norm": 2.5021698474884033, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.5563, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 116.32390745501286, |
|
"eval_accuracy": 0.8198419059293134, |
|
"eval_loss": 0.8576837778091431, |
|
"eval_runtime": 32.0421, |
|
"eval_samples_per_second": 194.214, |
|
"eval_steps_per_second": 6.086, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 116.96658097686375, |
|
"grad_norm": 2.7690136432647705, |
|
"learning_rate": 9e-06, |
|
"loss": 0.5667, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 116.96658097686375, |
|
"eval_accuracy": 0.8201571685631533, |
|
"eval_loss": 0.8592654466629028, |
|
"eval_runtime": 32.0402, |
|
"eval_samples_per_second": 194.225, |
|
"eval_steps_per_second": 6.086, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 117.60925449871465, |
|
"grad_norm": 2.7082719802856445, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.5598, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 117.60925449871465, |
|
"eval_accuracy": 0.8194785991809854, |
|
"eval_loss": 0.8624646067619324, |
|
"eval_runtime": 31.9751, |
|
"eval_samples_per_second": 194.62, |
|
"eval_steps_per_second": 6.098, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 118.25192802056556, |
|
"grad_norm": 2.6081254482269287, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5588, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 118.25192802056556, |
|
"eval_accuracy": 0.8200990472992823, |
|
"eval_loss": 0.8558657169342041, |
|
"eval_runtime": 32.0709, |
|
"eval_samples_per_second": 194.039, |
|
"eval_steps_per_second": 6.08, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 118.89460154241645, |
|
"grad_norm": 2.39003849029541, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.5573, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 118.89460154241645, |
|
"eval_accuracy": 0.821544695178437, |
|
"eval_loss": 0.8529016375541687, |
|
"eval_runtime": 32.0688, |
|
"eval_samples_per_second": 194.052, |
|
"eval_steps_per_second": 6.081, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 119.53727506426735, |
|
"grad_norm": 2.671393871307373, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.5583, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 119.53727506426735, |
|
"eval_accuracy": 0.8202954907050746, |
|
"eval_loss": 0.8513576984405518, |
|
"eval_runtime": 32.0535, |
|
"eval_samples_per_second": 194.144, |
|
"eval_steps_per_second": 6.084, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 120.17994858611826, |
|
"grad_norm": 2.8136119842529297, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.5599, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 120.17994858611826, |
|
"eval_accuracy": 0.8210522728285633, |
|
"eval_loss": 0.8533715605735779, |
|
"eval_runtime": 32.0108, |
|
"eval_samples_per_second": 194.403, |
|
"eval_steps_per_second": 6.092, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 120.82262210796915, |
|
"grad_norm": 2.5622072219848633, |
|
"learning_rate": 6e-06, |
|
"loss": 0.5538, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 120.82262210796915, |
|
"eval_accuracy": 0.8210836727544613, |
|
"eval_loss": 0.8567976951599121, |
|
"eval_runtime": 32.0891, |
|
"eval_samples_per_second": 193.929, |
|
"eval_steps_per_second": 6.077, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 121.46529562982005, |
|
"grad_norm": 2.870851516723633, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.5544, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 121.46529562982005, |
|
"eval_accuracy": 0.8227272131920523, |
|
"eval_loss": 0.8419870138168335, |
|
"eval_runtime": 32.0445, |
|
"eval_samples_per_second": 194.199, |
|
"eval_steps_per_second": 6.085, |
|
"step": 94500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 129, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.979699566574305e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|